5 instantiations of TiktokenTokenizer
Microsoft.ML.Tokenizers (5)
Model\TiktokenTokenizer.cs (5)
1251return new TiktokenTokenizer(
1279=> new TiktokenTokenizer(vocabFilePath, preTokenizer, specialTokens, normalizer, cacheSize);
1299=> new TiktokenTokenizer(vocabStream, preTokenizer, specialTokens, normalizer, cacheSize);
1330return new TiktokenTokenizer(encoder, decoder, vocab, preTokenizer, specialTokens, normalizer, cacheSize);
1394return new TiktokenTokenizer(vocabStream,
91 references to TiktokenTokenizer
Microsoft.Extensions.AI.Evaluation.Integration.Tests (1)
Microsoft.Extensions.AI.Integration.Tests (1)
Microsoft.ML.GenAI.LLaMA (3)
Microsoft.ML.GenAI.LLaMA.Tests (1)
Microsoft.ML.GenAI.Samples (7)
Microsoft.ML.Tokenizers (17)
Model\TiktokenTokenizer.cs (13)
1218private static TiktokenTokenizer CreateForModel(
1273public static TiktokenTokenizer Create(
1293public static TiktokenTokenizer Create(
1314public static async Task<TiktokenTokenizer> CreateAsync(
1346public static async Task<TiktokenTokenizer> CreateAsync(
1372public static TiktokenTokenizer CreateForModel(
1411public static async Task<TiktokenTokenizer> CreateForModelAsync(
1448public static TiktokenTokenizer CreateForModel(string modelName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1458public static TiktokenTokenizer CreateForEncoding(string encodingName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1488throw new ArgumentException($"The encoding name '{encodingName}' is not supported. The only supported encoding names are: {TiktokenTokenizer.Cl100kBaseEncodingName}, {TiktokenTokenizer.P50kBaseEncodingName}, {TiktokenTokenizer.P50kEditEncodingName}, and {TiktokenTokenizer.R50kBaseEncodingName}.", nameof(encodingName));
Microsoft.ML.Tokenizers.Data.Tests (3)
Microsoft.ML.Tokenizers.Tests (58)
TiktokenTests.cs (58)
32public static Tokenizer GPT4 { get; } = TiktokenTokenizer.CreateForModel("gpt-4", _specialTokens);
33public static Tokenizer GPT2 { get; } = TiktokenTokenizer.CreateForModel("gpt2");
34public static Tokenizer P50kBase { get; } = TiktokenTokenizer.CreateForModel("text-davinci-003");
35public static Tokenizer R50kBase { get; } = TiktokenTokenizer.CreateForModel("ada");
36public static Tokenizer P50kEdit { get; } = TiktokenTokenizer.CreateForModel("text-davinci-edit-001");
37public static Tokenizer GPT4o { get; } = TiktokenTokenizer.CreateForModel("gpt-4o");
38public static Tokenizer Phi4 { get; } = TiktokenTokenizer.CreateForModel("phi-4");
46Assert.True(GPT4 is TiktokenTokenizer);
47IReadOnlyDictionary<string, int>? specialTokens = (GPT4 as TiktokenTokenizer)!.SpecialTokens;
51string assemblyName = typeof(TiktokenTokenizer).Assembly.FullName!;
62Tokenizer tokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokens);
67tokenizer = TiktokenTokenizer.Create(stream, GPT4.PreTokenizer, null, specialTokens);
71tokenizer = await TiktokenTokenizer.CreateAsync(tokenizerDataFileName, GPT4.PreTokenizer, normalizer: null, specialTokens);
76tokenizer = await TiktokenTokenizer.CreateAsync(stream, GPT4.PreTokenizer, normalizer: null, specialTokens);
82tokenizer = TiktokenTokenizer.CreateForModel("gpt-4", stream);
88tokenizer = await TiktokenTokenizer.CreateForModelAsync("gpt-3.5-turbo", stream);
92tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
119TiktokenTokenizer tiktoken = (tokenizer as TiktokenTokenizer)!;
120TiktokenTokenizer externalTokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, tokenizer.PreTokenizer, null, tiktoken.SpecialTokens);
144TestDecodingWithSpan((tokenizer as TiktokenTokenizer)!, encoded.ToArray(), text);
161private void TestDecodingWithSpan(TiktokenTokenizer tokenizer, int[] ids, string expectedDecoded)
197TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
228TestDecodingWithSpan((gpt4Tokenizer as TiktokenTokenizer)!, encoded.ToArray(), text);
240TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
275TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
301TestDecodingWithSpan((GPT4o as TiktokenTokenizer)!, encoded.ToArray(), text);
309TestDecodingWithSpan((GPT4o as TiktokenTokenizer)!, encoded.ToArray(), text);
337TestDecodingWithSpan((GPT2 as TiktokenTokenizer)!, encoded.ToArray(), text);
356TestDecodingWithSpan((P50kBase as TiktokenTokenizer)!, encoded.ToArray(), text);
375TestDecodingWithSpan((P50kEdit as TiktokenTokenizer)!, encoded.ToArray(), text);
394TestDecodingWithSpan((R50kBase as TiktokenTokenizer)!, encoded.ToArray(), text);
452Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(modelName);
453Assert.True(tokenizer is TiktokenTokenizer);
465Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding(encodingName);
466Assert.True(tokenizer is TiktokenTokenizer);
479Tokenizer tokenizer1 = TiktokenTokenizer.CreateForModel(modelName);
481Assert.True(tokenizer is TiktokenTokenizer);
482Assert.True(tokenizer1 is TiktokenTokenizer);
484TiktokenTokenizer tiktoken = (tokenizer as TiktokenTokenizer)!;
485TiktokenTokenizer tiktoken1 = (tokenizer1 as TiktokenTokenizer)!;
496Assert.Throws<ArgumentNullException>(() => TiktokenTokenizer.CreateForEncoding(null!));
497Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("r50k_base_"));
498Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("p50k_base_"));
499Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("p50k_edit_"));
500Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("cl100k_base_"));
501Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("o200k_base_"));
521Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(name);
522Assert.True(tokenizer is TiktokenTokenizer);
756private static IReadOnlyDictionary<ReadOnlyMemory<byte>, int>? GetEncoder(TiktokenTokenizer tiktoken)
757=> typeof(TiktokenTokenizer).GetProperty("Encoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<ReadOnlyMemory<byte>, int>;
759private static IReadOnlyDictionary<int, ReadOnlyMemory<byte>>? GetDecoder(TiktokenTokenizer tiktoken)
760=> typeof(TiktokenTokenizer).GetProperty("Decoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<int, ReadOnlyMemory<byte>>;
762private static IReadOnlyDictionary<string, int>? GetVocabulary(TiktokenTokenizer tiktoken)
763=> typeof(TiktokenTokenizer).GetProperty("Vocabulary", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<string, int>;