7 types derived from Tokenizer
Microsoft.ML.Tokenizers (6)
Model\BPETokenizer.cs (1)
23public sealed class BpeTokenizer : Tokenizer
Model\CodeGenTokenizer.cs (1)
23public class CodeGenTokenizer : Tokenizer
Model\EnglishRobertaTokenizer.cs (1)
20public sealed class EnglishRobertaTokenizer : Tokenizer
Model\SentencePieceTokenizer.cs (1)
25public class SentencePieceTokenizer : Tokenizer
Model\TiktokenTokenizer.cs (1)
25public sealed partial class TiktokenTokenizer : Tokenizer
Model\WordPieceTokenizer.cs (1)
25public partial class WordPieceTokenizer : Tokenizer
Microsoft.ML.Tokenizers.Tests (1)
TokenizerTests.cs (1)
79private sealed class EnglishAlphabetTokenizer : Tokenizer
114 references to Tokenizer
Microsoft.Extensions.AI.Integration.Tests (3)
ReducingChatClientTests.cs (3)
23private static readonly Tokenizer _gpt4oTokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); 133private readonly Tokenizer _tokenizer; 136public TokenCountingChatReducer(Tokenizer tokenizer, int tokenLimit)
Microsoft.ML.GenAI.Core (8)
CausalLMPipelineChatClient.cs (1)
18where TTokenizer : Tokenizer
Pipeline\CausalLMPipeline.cs (5)
18where TTokenizer : Tokenizer 28Tokenizer Tokenizer { get; } 69where TTokenizer : Tokenizer 97Tokenizer tokenizer, 116public Tokenizer Tokenizer { get; }
Trainer\CausalLMDataset.cs (2)
29Tokenizer tokenizer) 50public static CausalLMDataset Create(IEnumerable<string> inputs, IEnumerable<string> outputs, Tokenizer tokenizer)
Microsoft.ML.GenAI.Core.Tests (2)
CasualLMDatasetTest.cs (2)
21private static Tokenizer CreateLlamaTokenizer() 96var tokenizer = CreateLlamaTokenizer();
Microsoft.ML.GenAI.LLaMA (8)
Llama3CausalLMChatClient.cs (2)
12public class Llama3CausalLMChatClient : CausalLMPipelineChatClient<Tokenizer, LlamaForCausalLM> 17ICausalLMPipeline<Tokenizer, LlamaForCausalLM> pipeline,
LlamaCausalLMAgent.cs (2)
15private readonly ICausalLMPipeline<Tokenizer, LlamaForCausalLM> _pipeline; 27ICausalLMPipeline<Tokenizer, LlamaForCausalLM> pipeline,
LlamaChatCompletionService.cs (2)
15private readonly ICausalLMPipeline<Tokenizer, LlamaForCausalLM> _pipeline; 24public LlamaChatCompletionService(ICausalLMPipeline<Tokenizer, LlamaForCausalLM> pipeline, ISemanticKernelChatTemplateBuilder? templateBuilder = null)
LlamaTextCompletionService.cs (2)
20private readonly ICausalLMPipeline<Tokenizer, LlamaForCausalLM> _pipeline; 22public LlamaTextCompletionService(ICausalLMPipeline<Tokenizer, LlamaForCausalLM> pipeline)
Microsoft.ML.GenAI.Mistral (2)
MistralCausalLMAgent.cs (2)
18private readonly ICausalLMPipeline<Tokenizer, MistralForCausalLM> _pipeline; 31ICausalLMPipeline<Tokenizer, MistralForCausalLM> pipeline,
Microsoft.ML.GenAI.Phi (10)
Extension\SemanticKernelExtension.cs (2)
18ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> pipeline) 27ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> pipeline)
Phi3\Phi3CausalLMAgent.cs (2)
20private readonly ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> _pipeline; 25ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> pipeline,
Phi3\Phi3CausalLMChatClient.cs (2)
17public class Phi3CausalLMChatClient : CausalLMPipelineChatClient<Tokenizer, Phi3ForCasualLM> 22ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> pipeline,
Phi3\Phi3CausalLMChatCompletionService.cs (2)
17private readonly ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> _pipeline; 22ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> pipeline,
Phi3\Phi3CausalLMTextGenerationService.cs (2)
15private readonly ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> _pipeline; 18ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> pipeline)
Microsoft.ML.GenAI.Phi.Tests (3)
AutoGenTests.cs (1)
19var pipeline = Mock.Of<ICausalLMPipeline<Tokenizer, Phi3ForCasualLM>>();
SemanticKernelTests.cs (2)
22var pipeline = Mock.Of<ICausalLMPipeline<Tokenizer, Phi3ForCasualLM>>(); 58var pipeline = Mock.Of<ICausalLMPipeline<Tokenizer, Phi3ForCasualLM>>();
Microsoft.ML.GenAI.Samples (2)
Llama\SFT_Llama_3_2_1B.cs (2)
63if (p is not ICausalLMPipeline<Tokenizer, LlamaForCausalLM> llamaPipeline) 101public static CausalLMDataset CreateDataset(IEnumerable<Data> dataset, Tokenizer tokenizer, IMEAIChatTemplateBuilder templateBuilder)
Microsoft.ML.Tokenizers (5)
Tokenizer.cs (5)
17/// Initializes a new instance of the <see cref="Tokenizer"/> class. 39/// Types derived from <see cref="Tokenizer"/> may override this implementation to provide a more efficient implementation. 180/// Types derived from <see cref="Tokenizer"/> may override this implementation to provide a more efficient implementation. 223/// Types derived from <see cref="Tokenizer"/> may override this implementation to provide a more efficient implementation. 361/// Types derived from <see cref="Tokenizer"/> may override this implementation to provide a more efficient implementation.
Microsoft.ML.Tokenizers.Tests (54)
BpeTests.cs (6)
256Tokenizer tokenizer = bpe; 323private static Tokenizer? _gpt2Tokenizer = null; 325private static Tokenizer GetGpt2Tokenizer() 368Tokenizer tokenizer = GetGpt2Tokenizer(); 372private void ValidateTokenizer(Tokenizer tokenizer) 427Tokenizer tokenizer = GetGpt2Tokenizer();
CodeGenTests.cs (10)
17private static Tokenizer _codegen350MMonoTokenizer = CreateCodegen350MMonoTokenizer(); 18private static Tokenizer _codegen350MMonoTokenizerWithSpace = CreateCodegen350MMonoTokenizer(addPrefixSpace: true); 19private static Tokenizer _codegen350MMonoTokenizerWithBeginningOfSentence = CreateCodegen350MMonoTokenizer(bos: true); 20private static Tokenizer _codegen350MMonoTokenizerWithEndOfSentence = CreateCodegen350MMonoTokenizer(eos: true); 21private static Tokenizer _codegen350MMonoTokenizerWithBeginningAndEndOfSentence = CreateCodegen350MMonoTokenizer(bos: true, eos: true); 23private static Tokenizer CreateCodegen350MMonoTokenizer(bool addPrefixSpace = false, bool bos = false, bool eos = false) 34private static Tokenizer CreateCodegenPhi2Tokenizer() 223Tokenizer phi2Tokenizer = CreateCodegenPhi2Tokenizer(); 248private void TestDecoding(Tokenizer tokenizer, string text) 332Tokenizer tokenizer,
EnglishRobertaTests.cs (5)
79private static Tokenizer? _robertaTokenizer = null; 80private static Tokenizer GetRobertaTokenizer() 113Tokenizer tokenizer = EnglishRobertaTokenizer.Create(vocabFile, mergeFile, translationFile, RobertaPreTokenizer.Instance); 179Tokenizer tokenizer = GetRobertaTokenizer(); 236private void TestTokenizer(Tokenizer tokenizer, CallingOrder callingOrder = CallingOrder.Encode)
LlamaTests.cs (15)
22private static Tokenizer _llamaTokenizer = CreateLlamaTokenizer(); 23private static Tokenizer _llamaMistralTokenizer = CreateLMistralTokenizer(); 24private static Tokenizer _llamaPhi3Tokenizer = CreateLPhi3Tokenizer(); 25private static Tokenizer _llamaPhi3TokenizerWithTreatSpaceSuffix = CreateLPhi3Tokenizer(treatWhitespaceAsSuffix: true); 28private static Tokenizer CreateLlamaTokenizer() 36private static Tokenizer CreateLMistralTokenizer() 43private static Tokenizer CreateLPhi3Tokenizer(bool treatWhitespaceAsSuffix = false) 234public void TestLlamaTokenizer(Tokenizer tokenizer, string input, int[] ids, string[] tokens, (int Index, int Length)[] offsets) 237Tokenizer[] tokenizers = tokenizer == _llamaTokenizer ? new[] { tokenizer, _llamaPhi3Tokenizer } : new[] { tokenizer }; 239foreach (Tokenizer llamaTokenizer in tokenizers) 336public void TestLlamaTokenizerWithEmptyInput(Tokenizer llamaTokenizer) 352public void TestLlamaTokenizerProperties(Tokenizer llamaTokenizer) 495Tokenizer tokenizer = _llamaTokenizer; 551Tokenizer tokenizer = _llamaTokenizer; 628Tokenizer tokenizer = _llamaTokenizer;
NormalizerTests.cs (1)
64Tokenizer tokenizer = BpeTests.CreateEmptyBpe(preTokenizer: null, normalizer);
PreTokenizerTests.cs (1)
64Tokenizer tokenizer = BpeTests.CreateEmptyBpe(normalizer: null, preTokenizer: preTokenizer);
TiktokenTests.cs (15)
31public static Tokenizer GPT4 { get; } = TiktokenTokenizer.CreateForModel("gpt-4", _specialTokens); 32public static Tokenizer GPT2 { get; } = TiktokenTokenizer.CreateForModel("gpt2"); 33public static Tokenizer P50kBase { get; } = TiktokenTokenizer.CreateForModel("text-davinci-003"); 34public static Tokenizer R50kBase { get; } = TiktokenTokenizer.CreateForModel("ada"); 35public static Tokenizer P50kEdit { get; } = TiktokenTokenizer.CreateForModel("text-davinci-edit-001"); 36public static Tokenizer GPT4o { get; } = TiktokenTokenizer.CreateForModel("gpt-4o"); 59Tokenizer tokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokens); 109public async Task TestTokenizerUsingExternalVocab(Tokenizer tokenizer, string url) 135private void TestGPT4TokenizationEncoding(Tokenizer tokenizer) 210private void TestGPT4Tokenizer(Tokenizer gpt4Tokenizer) 444Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(modelName); 457Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding(encodingName); 471Tokenizer tokenizer1 = TiktokenTokenizer.CreateForModel(modelName); 511Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(name); 566Tokenizer tokenizer = GPT4;
TokenizerTests.cs (1)
123internal static void TestTokenLimits(Tokenizer tokenizer)
Microsoft.ML.TorchSharp (17)
Extensions\TokenizerExtensions.cs (4)
17private static Tokenizer _instance; 19internal static Tokenizer GetInstance(IChannel ch) 41internal static EnglishRobertaTokenizer RobertaModel(this Tokenizer tokenizer) 52internal static IReadOnlyList<int> EncodeToConverted(this Tokenizer tokenizer, string sentence)
NasBert\NasBertTrainer.cs (3)
178public Tokenizer Tokenizer; 582private IList<int> PrepInputTokens(ref ReadOnlyMemory<char> sentence1, ref ReadOnlyMemory<char> sentence2, ref ValueGetter<ReadOnlyMemory<char>> getSentence1, ref ValueGetter<ReadOnlyMemory<char>> getSentence2, Tokenizer tokenizer) 612private protected void UpdateCacheIfNeeded(long position, TensorCacher outputCache, ref ReadOnlyMemory<char> sentence1, ref ReadOnlyMemory<char> sentence2, ref ValueGetter<ReadOnlyMemory<char>> getSentence1, ref ValueGetter<ReadOnlyMemory<char>> getSentence2, Tokenizer tokenizer)
NasBert\NerTrainer.cs (3)
329var tokenizer = TokenizerExtensions.GetInstance(ch); 377private void CondenseOutput(ref VBuffer<UInt32> dst, string sentence, Tokenizer tokenizer, TensorCacher outputCacher) 417Tokenizer tokenizer = TokenizerExtensions.GetInstance(ch);
NasBert\SentenceSimilarityTrainer.cs (2)
242var tokenizer = TokenizerExtensions.GetInstance(ch); 275Tokenizer tokenizer = TokenizerExtensions.GetInstance(ch);
NasBert\TextClassificationTrainer.cs (3)
266var tokenizer = TokenizerExtensions.GetInstance(ch); 321Tokenizer tokenizer = TokenizerExtensions.GetInstance(ch); 352Tokenizer tokenizer = TokenizerExtensions.GetInstance(ch);
Roberta\QATrainer.cs (2)
190public Tokenizer Tokenizer; 569public Tokenizer Tokenizer;