6 types derived from Tokenizer
Microsoft.ML.Tokenizers (5)
Model\BPETokenizer.cs (1)
23public sealed class BpeTokenizer : Tokenizer
Model\CodeGenTokenizer.cs (1)
23public class CodeGenTokenizer : Tokenizer
Model\EnglishRobertaTokenizer.cs (1)
20public sealed class EnglishRobertaTokenizer : Tokenizer
Model\SentencePieceBpeTokenizer.cs (1)
25public class SentencePieceBpeTokenizer : Tokenizer
Model\TiktokenTokenizer.cs (1)
24public sealed partial class TiktokenTokenizer : Tokenizer
Microsoft.ML.Tokenizers.Tests (1)
TokenizerTests.cs (1)
79private sealed class EnglishAlphabetTokenizer : Tokenizer
99 references to Tokenizer
Microsoft.ML.GenAI.Core (4)
Pipeline\CausalLMPipeline.cs (4)
18where TTokenizer : Tokenizer 65where TTokenizer : Tokenizer 93Tokenizer tokenizer, 112public Tokenizer Tokenizer { get; }
Microsoft.ML.GenAI.LLaMA (6)
LlamaCausalLMAgent.cs (2)
15private readonly ICausalLMPipeline<Tokenizer, LlamaForCausalLM> _pipeline; 27ICausalLMPipeline<Tokenizer, LlamaForCausalLM> pipeline,
LlamaChatCompletionService.cs (2)
15private readonly ICausalLMPipeline<Tokenizer, LlamaForCausalLM> _pipeline; 24public LlamaChatCompletionService(ICausalLMPipeline<Tokenizer, LlamaForCausalLM> pipeline, ISemanticKernelChatTemplateBuilder? templateBuilder = null)
LlamaTextCompletionService.cs (2)
20private readonly ICausalLMPipeline<Tokenizer, LlamaForCausalLM> _pipeline; 22public LlamaTextCompletionService(ICausalLMPipeline<Tokenizer, LlamaForCausalLM> pipeline)
Microsoft.ML.GenAI.Phi (8)
Extension\SemanticKernelExtension.cs (2)
18ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> pipeline) 27ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> pipeline)
Phi3\Phi3CausalLMAgent.cs (2)
20private readonly ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> _pipeline; 24ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> pipeline,
Phi3\Phi3CausalLMChatCompletionService.cs (2)
17private readonly ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> _pipeline; 21public Phi3CausalLMChatCompletionService(ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> pipeline)
Phi3\Phi3CausalLMTextGenerationService.cs (2)
15private readonly ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> _pipeline; 17public Phi3CausalLMTextGenerationService(ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> pipeline)
Microsoft.ML.GenAI.Phi.Tests (3)
AutoGenTests.cs (1)
19var pipeline = Mock.Of<ICausalLMPipeline<Tokenizer, Phi3ForCasualLM>>();
SemanticKernelTests.cs (2)
22var pipeline = Mock.Of<ICausalLMPipeline<Tokenizer, Phi3ForCasualLM>>(); 58var pipeline = Mock.Of<ICausalLMPipeline<Tokenizer, Phi3ForCasualLM>>();
Microsoft.ML.Tokenizers (6)
Model\TiktokenTokenizer.cs (1)
1212using Stream compressedStream = typeof(Tokenizer).Assembly.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!;
Tokenizer.cs (5)
17/// Initializes a new instance of the <see cref="Tokenizer"/> class. 39/// Types derived from <see cref="Tokenizer"/> may override this implementation to provide a more efficient implementation. 180/// Types derived from <see cref="Tokenizer"/> may override this implementation to provide a more efficient implementation. 223/// Types derived from <see cref="Tokenizer"/> may override this implementation to provide a more efficient implementation. 361/// Types derived from <see cref="Tokenizer"/> may override this implementation to provide a more efficient implementation.
Microsoft.ML.Tokenizers.Tests (55)
BpeTests.cs (6)
256Tokenizer tokenizer = bpe; 323private static Tokenizer? _gpt2Tokenizer = null; 325private static Tokenizer GetGpt2Tokenizer() 368Tokenizer tokenizer = GetGpt2Tokenizer(); 372private void ValidateTokenizer(Tokenizer tokenizer) 427Tokenizer tokenizer = GetGpt2Tokenizer();
CodeGenTests.cs (10)
17private static Tokenizer _codegen350MMonoTokenizer = CreateCodegen350MMonoTokenizer(); 18private static Tokenizer _codegen350MMonoTokenizerWithSpace = CreateCodegen350MMonoTokenizer(addPrefixSpace: true); 19private static Tokenizer _codegen350MMonoTokenizerWithBeginningOfSentence = CreateCodegen350MMonoTokenizer(bos: true); 20private static Tokenizer _codegen350MMonoTokenizerWithEndOfSentence = CreateCodegen350MMonoTokenizer(eos: true); 21private static Tokenizer _codegen350MMonoTokenizerWithBeginningAndEndOfSentence = CreateCodegen350MMonoTokenizer(bos: true, eos: true); 23private static Tokenizer CreateCodegen350MMonoTokenizer(bool addPrefixSpace = false, bool bos = false, bool eos = false) 34private static Tokenizer CreateCodegenPhi2Tokenizer() 223Tokenizer phi2Tokenizer = CreateCodegenPhi2Tokenizer(); 248private void TestDecoding(Tokenizer tokenizer, string text) 332Tokenizer tokenizer,
EnglishRobertaTests.cs (5)
79private static Tokenizer? _robertaTokenizer = null; 80private static Tokenizer GetRobertaTokenizer() 113Tokenizer tokenizer = EnglishRobertaTokenizer.Create(vocabFile, mergeFile, translationFile, RobertaPreTokenizer.Instance); 179Tokenizer tokenizer = GetRobertaTokenizer(); 236private void TestTokenizer(Tokenizer tokenizer, CallingOrder callingOrder = CallingOrder.Encode)
LlamaTests.cs (15)
22private static Tokenizer _llamaTokenizer = CreateLlamaTokenizer(); 23private static Tokenizer _llamaMistralTokenizer = CreateLMistralTokenizer(); 24private static Tokenizer _llamaPhi3Tokenizer = CreateLPhi3Tokenizer(); 25private static Tokenizer _llamaPhi3TokenizerWithTreatSpaceSuffix = CreateLPhi3Tokenizer(treatWhitespaceAsSuffix: true); 28private static Tokenizer CreateLlamaTokenizer() 36private static Tokenizer CreateLMistralTokenizer() 43private static Tokenizer CreateLPhi3Tokenizer(bool treatWhitespaceAsSuffix = false) 234public void TestLlamaTokenizer(Tokenizer tokenizer, string input, int[] ids, string[] tokens, (int Index, int Length)[] offsets) 237Tokenizer[] tokenizers = tokenizer == _llamaTokenizer ? new[] { tokenizer, _llamaPhi3Tokenizer } : new[] { tokenizer }; 239foreach (Tokenizer llamaTokenizer in tokenizers) 336public void TestLlamaTokenizerWithEmptyInput(Tokenizer llamaTokenizer) 352public void TestLlamaTokenizerProperties(Tokenizer llamaTokenizer) 495Tokenizer tokenizer = _llamaTokenizer; 551Tokenizer tokenizer = _llamaTokenizer; 628Tokenizer tokenizer = _llamaTokenizer;
NormalizerTests.cs (1)
64Tokenizer tokenizer = BpeTests.CreateEmptyBpe(preTokenizer: null, normalizer);
PreTokenizerTests.cs (1)
57Tokenizer tokenizer = BpeTests.CreateEmptyBpe(normalizer: null, preTokenizer: preTokenizer);
TitokenTests.cs (16)
31public static Tokenizer GPT4 { get; } = TiktokenTokenizer.CreateForModel("gpt-4", _specialTokens); 32public static Tokenizer GPT2 { get; } = TiktokenTokenizer.CreateForModel("gpt2"); 33public static Tokenizer P50kBase { get; } = TiktokenTokenizer.CreateForModel("text-davinci-003"); 34public static Tokenizer R50kBase { get; } = TiktokenTokenizer.CreateForModel("ada"); 35public static Tokenizer P50kEdit { get; } = TiktokenTokenizer.CreateForModel("text-davinci-edit-001"); 36public static Tokenizer GPT4o { get; } = TiktokenTokenizer.CreateForModel("gpt-4o"); 48using Stream compressedStream = typeof(Tokenizer).Assembly.GetManifestResourceStream("cl100k_base.tiktoken.deflate")!; 58Tokenizer tokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokensEncoder); 108public async Task TestTokenizerUsingExternalVocab(Tokenizer tokenizer, string url) 134private void TestGPT4TokenizationEncoding(Tokenizer tokenizer) 209private void TestGPT4Tokenizer(Tokenizer gpt4Tokenizer) 441Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(modelName); 454Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding(encodingName); 468Tokenizer tokenizer1 = TiktokenTokenizer.CreateForModel(modelName); 507Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(name); 562Tokenizer tokenizer = GPT4;
TokenizerTests.cs (1)
123internal static void TestTokenLimits(Tokenizer tokenizer)
Microsoft.ML.TorchSharp (17)
Extensions\TokenizerExtensions.cs (4)
17private static Tokenizer _instance; 19internal static Tokenizer GetInstance(IChannel ch) 41internal static EnglishRobertaTokenizer RobertaModel(this Tokenizer tokenizer) 52internal static IReadOnlyList<int> EncodeToConverted(this Tokenizer tokenizer, string sentence)
NasBert\NasBertTrainer.cs (3)
178public Tokenizer Tokenizer; 582private IList<int> PrepInputTokens(ref ReadOnlyMemory<char> sentence1, ref ReadOnlyMemory<char> sentence2, ref ValueGetter<ReadOnlyMemory<char>> getSentence1, ref ValueGetter<ReadOnlyMemory<char>> getSentence2, Tokenizer tokenizer) 612private protected void UpdateCacheIfNeeded(long position, TensorCacher outputCache, ref ReadOnlyMemory<char> sentence1, ref ReadOnlyMemory<char> sentence2, ref ValueGetter<ReadOnlyMemory<char>> getSentence1, ref ValueGetter<ReadOnlyMemory<char>> getSentence2, Tokenizer tokenizer)
NasBert\NerTrainer.cs (3)
329var tokenizer = TokenizerExtensions.GetInstance(ch); 377private void CondenseOutput(ref VBuffer<UInt32> dst, string sentence, Tokenizer tokenizer, TensorCacher outputCacher) 417Tokenizer tokenizer = TokenizerExtensions.GetInstance(ch);
NasBert\SentenceSimilarityTrainer.cs (2)
241var tokenizer = TokenizerExtensions.GetInstance(ch); 274Tokenizer tokenizer = TokenizerExtensions.GetInstance(ch);
NasBert\TextClassificationTrainer.cs (3)
265var tokenizer = TokenizerExtensions.GetInstance(ch); 320Tokenizer tokenizer = TokenizerExtensions.GetInstance(ch); 351Tokenizer tokenizer = TokenizerExtensions.GetInstance(ch);
Roberta\QATrainer.cs (2)
190public Tokenizer Tokenizer; 569public Tokenizer Tokenizer;