6 types derived from Tokenizer
Microsoft.ML.Tokenizers (5)
Model\BPETokenizer.cs (1)
23
public sealed class BpeTokenizer :
Tokenizer
Model\CodeGenTokenizer.cs (1)
23
public class CodeGenTokenizer :
Tokenizer
Model\EnglishRobertaTokenizer.cs (1)
20
public sealed class EnglishRobertaTokenizer :
Tokenizer
Model\SentencePieceBpeTokenizer.cs (1)
25
public class SentencePieceBpeTokenizer :
Tokenizer
Model\TiktokenTokenizer.cs (1)
24
public sealed partial class TiktokenTokenizer :
Tokenizer
Microsoft.ML.Tokenizers.Tests (1)
TokenizerTests.cs (1)
79
private sealed class EnglishAlphabetTokenizer :
Tokenizer
99 references to Tokenizer
Microsoft.ML.GenAI.Core (4)
Pipeline\CausalLMPipeline.cs (4)
18
where TTokenizer :
Tokenizer
65
where TTokenizer :
Tokenizer
93
Tokenizer
tokenizer,
112
public
Tokenizer
Tokenizer { get; }
Microsoft.ML.GenAI.LLaMA (6)
LlamaCausalLMAgent.cs (2)
15
private readonly ICausalLMPipeline<
Tokenizer
, LlamaForCausalLM> _pipeline;
27
ICausalLMPipeline<
Tokenizer
, LlamaForCausalLM> pipeline,
LlamaChatCompletionService.cs (2)
15
private readonly ICausalLMPipeline<
Tokenizer
, LlamaForCausalLM> _pipeline;
24
public LlamaChatCompletionService(ICausalLMPipeline<
Tokenizer
, LlamaForCausalLM> pipeline, ISemanticKernelChatTemplateBuilder? templateBuilder = null)
LlamaTextCompletionService.cs (2)
20
private readonly ICausalLMPipeline<
Tokenizer
, LlamaForCausalLM> _pipeline;
22
public LlamaTextCompletionService(ICausalLMPipeline<
Tokenizer
, LlamaForCausalLM> pipeline)
Microsoft.ML.GenAI.Phi (8)
Extension\SemanticKernelExtension.cs (2)
18
ICausalLMPipeline<
Tokenizer
, Phi3ForCasualLM> pipeline)
27
ICausalLMPipeline<
Tokenizer
, Phi3ForCasualLM> pipeline)
Phi3\Phi3CausalLMAgent.cs (2)
20
private readonly ICausalLMPipeline<
Tokenizer
, Phi3ForCasualLM> _pipeline;
24
ICausalLMPipeline<
Tokenizer
, Phi3ForCasualLM> pipeline,
Phi3\Phi3CausalLMChatCompletionService.cs (2)
17
private readonly ICausalLMPipeline<
Tokenizer
, Phi3ForCasualLM> _pipeline;
21
public Phi3CausalLMChatCompletionService(ICausalLMPipeline<
Tokenizer
, Phi3ForCasualLM> pipeline)
Phi3\Phi3CausalLMTextGenerationService.cs (2)
15
private readonly ICausalLMPipeline<
Tokenizer
, Phi3ForCasualLM> _pipeline;
17
public Phi3CausalLMTextGenerationService(ICausalLMPipeline<
Tokenizer
, Phi3ForCasualLM> pipeline)
Microsoft.ML.GenAI.Phi.Tests (3)
AutoGenTests.cs (1)
19
var pipeline = Mock.Of<ICausalLMPipeline<
Tokenizer
, Phi3ForCasualLM>>();
SemanticKernelTests.cs (2)
22
var pipeline = Mock.Of<ICausalLMPipeline<
Tokenizer
, Phi3ForCasualLM>>();
58
var pipeline = Mock.Of<ICausalLMPipeline<
Tokenizer
, Phi3ForCasualLM>>();
Microsoft.ML.Tokenizers (6)
Model\TiktokenTokenizer.cs (1)
1212
using Stream compressedStream = typeof(
Tokenizer
).Assembly.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!;
Tokenizer.cs (5)
17
/// Initializes a new instance of the <see cref="
Tokenizer
"/> class.
39
/// Types derived from <see cref="
Tokenizer
"/> may override this implementation to provide a more efficient implementation.
180
/// Types derived from <see cref="
Tokenizer
"/> may override this implementation to provide a more efficient implementation.
223
/// Types derived from <see cref="
Tokenizer
"/> may override this implementation to provide a more efficient implementation.
361
/// Types derived from <see cref="
Tokenizer
"/> may override this implementation to provide a more efficient implementation.
Microsoft.ML.Tokenizers.Tests (55)
BpeTests.cs (6)
256
Tokenizer
tokenizer = bpe;
323
private static
Tokenizer
? _gpt2Tokenizer = null;
325
private static
Tokenizer
GetGpt2Tokenizer()
368
Tokenizer
tokenizer = GetGpt2Tokenizer();
372
private void ValidateTokenizer(
Tokenizer
tokenizer)
427
Tokenizer
tokenizer = GetGpt2Tokenizer();
CodeGenTests.cs (10)
17
private static
Tokenizer
_codegen350MMonoTokenizer = CreateCodegen350MMonoTokenizer();
18
private static
Tokenizer
_codegen350MMonoTokenizerWithSpace = CreateCodegen350MMonoTokenizer(addPrefixSpace: true);
19
private static
Tokenizer
_codegen350MMonoTokenizerWithBeginningOfSentence = CreateCodegen350MMonoTokenizer(bos: true);
20
private static
Tokenizer
_codegen350MMonoTokenizerWithEndOfSentence = CreateCodegen350MMonoTokenizer(eos: true);
21
private static
Tokenizer
_codegen350MMonoTokenizerWithBeginningAndEndOfSentence = CreateCodegen350MMonoTokenizer(bos: true, eos: true);
23
private static
Tokenizer
CreateCodegen350MMonoTokenizer(bool addPrefixSpace = false, bool bos = false, bool eos = false)
34
private static
Tokenizer
CreateCodegenPhi2Tokenizer()
223
Tokenizer
phi2Tokenizer = CreateCodegenPhi2Tokenizer();
248
private void TestDecoding(
Tokenizer
tokenizer, string text)
332
Tokenizer
tokenizer,
EnglishRobertaTests.cs (5)
79
private static
Tokenizer
? _robertaTokenizer = null;
80
private static
Tokenizer
GetRobertaTokenizer()
113
Tokenizer
tokenizer = EnglishRobertaTokenizer.Create(vocabFile, mergeFile, translationFile, RobertaPreTokenizer.Instance);
179
Tokenizer
tokenizer = GetRobertaTokenizer();
236
private void TestTokenizer(
Tokenizer
tokenizer, CallingOrder callingOrder = CallingOrder.Encode)
LlamaTests.cs (15)
22
private static
Tokenizer
_llamaTokenizer = CreateLlamaTokenizer();
23
private static
Tokenizer
_llamaMistralTokenizer = CreateLMistralTokenizer();
24
private static
Tokenizer
_llamaPhi3Tokenizer = CreateLPhi3Tokenizer();
25
private static
Tokenizer
_llamaPhi3TokenizerWithTreatSpaceSuffix = CreateLPhi3Tokenizer(treatWhitespaceAsSuffix: true);
28
private static
Tokenizer
CreateLlamaTokenizer()
36
private static
Tokenizer
CreateLMistralTokenizer()
43
private static
Tokenizer
CreateLPhi3Tokenizer(bool treatWhitespaceAsSuffix = false)
234
public void TestLlamaTokenizer(
Tokenizer
tokenizer, string input, int[] ids, string[] tokens, (int Index, int Length)[] offsets)
237
Tokenizer
[] tokenizers = tokenizer == _llamaTokenizer ? new[] { tokenizer, _llamaPhi3Tokenizer } : new[] { tokenizer };
239
foreach (
Tokenizer
llamaTokenizer in tokenizers)
336
public void TestLlamaTokenizerWithEmptyInput(
Tokenizer
llamaTokenizer)
352
public void TestLlamaTokenizerProperties(
Tokenizer
llamaTokenizer)
495
Tokenizer
tokenizer = _llamaTokenizer;
551
Tokenizer
tokenizer = _llamaTokenizer;
628
Tokenizer
tokenizer = _llamaTokenizer;
NormalizerTests.cs (1)
64
Tokenizer
tokenizer = BpeTests.CreateEmptyBpe(preTokenizer: null, normalizer);
PreTokenizerTests.cs (1)
57
Tokenizer
tokenizer = BpeTests.CreateEmptyBpe(normalizer: null, preTokenizer: preTokenizer);
TitokenTests.cs (16)
31
public static
Tokenizer
GPT4 { get; } = TiktokenTokenizer.CreateForModel("gpt-4", _specialTokens);
32
public static
Tokenizer
GPT2 { get; } = TiktokenTokenizer.CreateForModel("gpt2");
33
public static
Tokenizer
P50kBase { get; } = TiktokenTokenizer.CreateForModel("text-davinci-003");
34
public static
Tokenizer
R50kBase { get; } = TiktokenTokenizer.CreateForModel("ada");
35
public static
Tokenizer
P50kEdit { get; } = TiktokenTokenizer.CreateForModel("text-davinci-edit-001");
36
public static
Tokenizer
GPT4o { get; } = TiktokenTokenizer.CreateForModel("gpt-4o");
48
using Stream compressedStream = typeof(
Tokenizer
).Assembly.GetManifestResourceStream("cl100k_base.tiktoken.deflate")!;
58
Tokenizer
tokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokensEncoder);
108
public async Task TestTokenizerUsingExternalVocab(
Tokenizer
tokenizer, string url)
134
private void TestGPT4TokenizationEncoding(
Tokenizer
tokenizer)
209
private void TestGPT4Tokenizer(
Tokenizer
gpt4Tokenizer)
441
Tokenizer
tokenizer = TiktokenTokenizer.CreateForModel(modelName);
454
Tokenizer
tokenizer = TiktokenTokenizer.CreateForEncoding(encodingName);
468
Tokenizer
tokenizer1 = TiktokenTokenizer.CreateForModel(modelName);
507
Tokenizer
tokenizer = TiktokenTokenizer.CreateForModel(name);
562
Tokenizer
tokenizer = GPT4;
TokenizerTests.cs (1)
123
internal static void TestTokenLimits(
Tokenizer
tokenizer)
Microsoft.ML.TorchSharp (17)
Extensions\TokenizerExtensions.cs (4)
17
private static
Tokenizer
_instance;
19
internal static
Tokenizer
GetInstance(IChannel ch)
41
internal static EnglishRobertaTokenizer RobertaModel(this
Tokenizer
tokenizer)
52
internal static IReadOnlyList<int> EncodeToConverted(this
Tokenizer
tokenizer, string sentence)
NasBert\NasBertTrainer.cs (3)
178
public
Tokenizer
Tokenizer;
582
private IList<int> PrepInputTokens(ref ReadOnlyMemory<char> sentence1, ref ReadOnlyMemory<char> sentence2, ref ValueGetter<ReadOnlyMemory<char>> getSentence1, ref ValueGetter<ReadOnlyMemory<char>> getSentence2,
Tokenizer
tokenizer)
612
private protected void UpdateCacheIfNeeded(long position, TensorCacher outputCache, ref ReadOnlyMemory<char> sentence1, ref ReadOnlyMemory<char> sentence2, ref ValueGetter<ReadOnlyMemory<char>> getSentence1, ref ValueGetter<ReadOnlyMemory<char>> getSentence2,
Tokenizer
tokenizer)
NasBert\NerTrainer.cs (3)
329
var
tokenizer = TokenizerExtensions.GetInstance(ch);
377
private void CondenseOutput(ref VBuffer<UInt32> dst, string sentence,
Tokenizer
tokenizer, TensorCacher outputCacher)
417
Tokenizer
tokenizer = TokenizerExtensions.GetInstance(ch);
NasBert\SentenceSimilarityTrainer.cs (2)
241
var
tokenizer = TokenizerExtensions.GetInstance(ch);
274
Tokenizer
tokenizer = TokenizerExtensions.GetInstance(ch);
NasBert\TextClassificationTrainer.cs (3)
265
var
tokenizer = TokenizerExtensions.GetInstance(ch);
320
Tokenizer
tokenizer = TokenizerExtensions.GetInstance(ch);
351
Tokenizer
tokenizer = TokenizerExtensions.GetInstance(ch);
Roberta\QATrainer.cs (2)
190
public
Tokenizer
Tokenizer;
569
public
Tokenizer
Tokenizer;