5 instantiations of TiktokenTokenizer
Microsoft.ML.Tokenizers (5)
Model\TiktokenTokenizer.cs (5)
1254return new TiktokenTokenizer(
1282=> new TiktokenTokenizer(vocabFilePath, preTokenizer, specialTokens, normalizer, cacheSize);
1302=> new TiktokenTokenizer(vocabStream, preTokenizer, specialTokens, normalizer, cacheSize);
1333return new TiktokenTokenizer(encoder, decoder, vocab, preTokenizer, specialTokens, normalizer, cacheSize);
1397return new TiktokenTokenizer(vocabStream,
90 references to TiktokenTokenizer
Microsoft.Extensions.AI.Integration.Tests (1)
Microsoft.ML.GenAI.LLaMA (3)
Microsoft.ML.GenAI.LLaMA.Tests (1)
Microsoft.ML.GenAI.Samples (7)
Microsoft.ML.Tokenizers (17)
Model\TiktokenTokenizer.cs (13)
1221private static TiktokenTokenizer CreateForModel(
1276public static TiktokenTokenizer Create(
1296public static TiktokenTokenizer Create(
1317public static async Task<TiktokenTokenizer> CreateAsync(
1349public static async Task<TiktokenTokenizer> CreateAsync(
1375public static TiktokenTokenizer CreateForModel(
1414public static async Task<TiktokenTokenizer> CreateForModelAsync(
1451public static TiktokenTokenizer CreateForModel(string modelName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1461public static TiktokenTokenizer CreateForEncoding(string encodingName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1491throw new ArgumentException($"The encoding name '{encodingName}' is not supported. The only supported encoding names are: {TiktokenTokenizer.Cl100kBaseEncodingName}, {TiktokenTokenizer.P50kBaseEncodingName}, {TiktokenTokenizer.P50kEditEncodingName}, and {TiktokenTokenizer.R50kBaseEncodingName}.", nameof(encodingName));
Microsoft.ML.Tokenizers.Data.Tests (3)
Microsoft.ML.Tokenizers.Tests (58)
TiktokenTests.cs (58)
32public static Tokenizer GPT4 { get; } = TiktokenTokenizer.CreateForModel("gpt-4", _specialTokens);
33public static Tokenizer GPT2 { get; } = TiktokenTokenizer.CreateForModel("gpt2");
34public static Tokenizer P50kBase { get; } = TiktokenTokenizer.CreateForModel("text-davinci-003");
35public static Tokenizer R50kBase { get; } = TiktokenTokenizer.CreateForModel("ada");
36public static Tokenizer P50kEdit { get; } = TiktokenTokenizer.CreateForModel("text-davinci-edit-001");
37public static Tokenizer GPT4o { get; } = TiktokenTokenizer.CreateForModel("gpt-4o");
38public static Tokenizer Phi4 { get; } = TiktokenTokenizer.CreateForModel("phi-4");
46Assert.True(GPT4 is TiktokenTokenizer);
47IReadOnlyDictionary<string, int>? specialTokens = (GPT4 as TiktokenTokenizer)!.SpecialTokens;
51string assemblyName = typeof(TiktokenTokenizer).Assembly.FullName!;
62Tokenizer tokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokens);
67tokenizer = TiktokenTokenizer.Create(stream, GPT4.PreTokenizer, null, specialTokens);
71tokenizer = await TiktokenTokenizer.CreateAsync(tokenizerDataFileName, GPT4.PreTokenizer, normalizer: null, specialTokens);
76tokenizer = await TiktokenTokenizer.CreateAsync(stream, GPT4.PreTokenizer, normalizer: null, specialTokens);
82tokenizer = TiktokenTokenizer.CreateForModel("gpt-4", stream);
88tokenizer = await TiktokenTokenizer.CreateForModelAsync("gpt-3.5-turbo", stream);
92tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
119TiktokenTokenizer tiktoken = (tokenizer as TiktokenTokenizer)!;
120TiktokenTokenizer externalTokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, tokenizer.PreTokenizer, null, tiktoken.SpecialTokens);
144TestDecodingWithSpan((tokenizer as TiktokenTokenizer)!, encoded.ToArray(), text);
161private void TestDecodingWithSpan(TiktokenTokenizer tokenizer, int[] ids, string expectedDecoded)
197TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
228TestDecodingWithSpan((gpt4Tokenizer as TiktokenTokenizer)!, encoded.ToArray(), text);
240TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
275TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
301TestDecodingWithSpan((GPT4o as TiktokenTokenizer)!, encoded.ToArray(), text);
309TestDecodingWithSpan((GPT4o as TiktokenTokenizer)!, encoded.ToArray(), text);
337TestDecodingWithSpan((GPT2 as TiktokenTokenizer)!, encoded.ToArray(), text);
356TestDecodingWithSpan((P50kBase as TiktokenTokenizer)!, encoded.ToArray(), text);
375TestDecodingWithSpan((P50kEdit as TiktokenTokenizer)!, encoded.ToArray(), text);
394TestDecodingWithSpan((R50kBase as TiktokenTokenizer)!, encoded.ToArray(), text);
455Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(modelName);
456Assert.True(tokenizer is TiktokenTokenizer);
468Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding(encodingName);
469Assert.True(tokenizer is TiktokenTokenizer);
482Tokenizer tokenizer1 = TiktokenTokenizer.CreateForModel(modelName);
484Assert.True(tokenizer is TiktokenTokenizer);
485Assert.True(tokenizer1 is TiktokenTokenizer);
487TiktokenTokenizer tiktoken = (tokenizer as TiktokenTokenizer)!;
488TiktokenTokenizer tiktoken1 = (tokenizer1 as TiktokenTokenizer)!;
499Assert.Throws<ArgumentNullException>(() => TiktokenTokenizer.CreateForEncoding(null!));
500Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("r50k_base_"));
501Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("p50k_base_"));
502Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("p50k_edit_"));
503Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("cl100k_base_"));
504Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("o200k_base_"));
526Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(name);
527Assert.True(tokenizer is TiktokenTokenizer);
761private static IReadOnlyDictionary<ReadOnlyMemory<byte>, int>? GetEncoder(TiktokenTokenizer tiktoken)
762=> typeof(TiktokenTokenizer).GetProperty("Encoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<ReadOnlyMemory<byte>, int>;
764private static IReadOnlyDictionary<int, ReadOnlyMemory<byte>>? GetDecoder(TiktokenTokenizer tiktoken)
765=> typeof(TiktokenTokenizer).GetProperty("Decoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<int, ReadOnlyMemory<byte>>;
767private static IReadOnlyDictionary<string, int>? GetVocabulary(TiktokenTokenizer tiktoken)
768=> typeof(TiktokenTokenizer).GetProperty("Vocabulary", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<string, int>;