24 instantiations of EncodedToken
Microsoft.ML.Tokenizers (23)
Model\CodeGenTokenizer.cs (6)
379tokens.Add(new EncodedToken(BeginningOfSentenceId.Value, BeginningOfSentenceToken!, (0, 0))); 1593tokens.Add(new EncodedToken(tokensToAdd[0].Id, tokensToAdd[0].Value, (offset == 0 ? tokensToAdd[0].Offset.Index : tokensToAdd[0].Offset.Index + offset - 1, offset == 0 ? tokensToAdd[0].Offset.Length - 1 : tokensToAdd[0].Offset.Length))); 1597tokens.Add(new EncodedToken(tokensToAdd[i].Id, tokensToAdd[i].Value, (tokensToAdd[i].Offset.Index + offset - 1, tokensToAdd[i].Offset.Length))); 1605tokens.Add(new EncodedToken(t.Id, t.Value, (t.Offset.Index + offset, t.Offset.Length))); 1625return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)].Id, tokenValue, (mapping[0], 1)) }; 1699return new EncodedToken(id, token, (tokenStartIndex, tokenLength));
Model\EnglishRobertaTokenizer.cs (4)
328tokens.Add(new EncodedToken(t.Id, t.Value, (split.Offset + t.Offset.Index, t.Offset.Length))); 918list.Add(new EncodedToken(tokens[j].Id, tokens[j].Value, (indexMapping[index], tokens[j].Value.Length))); 950return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)], tokenValue, (indexMapping[0], 1)) }; 1039tokens.Add(new EncodedToken(_vocab[new StringSpanOrdinalKey(w)], w, (indexMapping[index], w.Length)));
Model\SentencePieceBpeTokenizer.cs (9)
275tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, (0, 0))); 289tokens.Add(new EncodedToken(id, _specialTokensReverse![id], (Offset, Length))); 302tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, (text.Length, 0))); 322tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, (0, 0))); 352tokens.Add(new EncodedToken( 367tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, (text.Length, 0))); 384tokens.Add(new EncodedToken(id, token, (index + i, 1))); 408tokens.Add(new EncodedToken(id, token, (index + i, length))); 436tokens.Add(new EncodedToken(id.Id, text.Slice(pieceSpan.Index, pieceSpan.Length).ToString(), (pieceSpan.Index, pieceSpan.Length)));
Model\TiktokenTokenizer.cs (3)
306tokens.Add(new EncodedToken( 318tokens.Add(new EncodedToken(mappedId.Id, mappedId.Token, (offset, mappedId.Token.Length))); 347tokens.Add(new EncodedToken(
Model\Word.cs (1)
299tokens.Add(new EncodedToken(_symbols[i].C, vocabReverse[_symbols[i].C], (index + offset, _symbols[i].Len)));
Microsoft.ML.Tokenizers.Tests (1)
TokenizerTests.cs (1)
115tokens.Add(new EncodedToken(c - 'a', c.ToString(), (count, 1)));
131 references to EncodedToken
Microsoft.ML.Tokenizers (99)
Model\BPETokenizer.cs (7)
263/// Encodes input text to a list of <see cref="EncodedToken" />s. 268protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings) 272return new EncodeResults<EncodedToken> { Tokens = [], NormalizedText = null, CharsConsumed = 0 }; 286List<EncodedToken> tokens = new(); 301return new EncodeResults<EncodedToken> { Tokens = tokens, NormalizedText = normalizedString, CharsConsumed = charsConsumed }; 935internal void WordToTokens(ref Word word, List<EncodedToken> tokens, int offset) => word.ToTokens(VocabReverse, tokens, offset); 937internal void EncodeWithCache(ReadOnlySpan<char> text, List<EncodedToken> tokens, int offset, ref PriorityQueue<Merge>? priorityQueue)
Model\CodeGenTokenizer.cs (31)
31private readonly StringSpanOrdinalKeyCache<List<EncodedToken>> _cache; 127_cache = new StringSpanOrdinalKeyCache<List<EncodedToken>>(); 280/// Encodes input text to a list of <see cref="EncodedToken" />s. 285protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings) 299public IReadOnlyList<EncodedToken> EncodeToTokens(string text, bool addPrefixSpace, bool addBeginningOfSentence, bool addEndOfSentence, out string? normalizedString, bool considerPreTokenization = true, bool considerNormalization = true) 301EncodeResults<EncodedToken> result = EncodeToTokens(text, ReadOnlySpan<char>.Empty, addPrefixSpace, addBeginningOfSentence, addEndOfSentence, considerPreTokenization, considerNormalization); 317public IReadOnlyList<EncodedToken> EncodeToTokens(ReadOnlySpan<char> text, bool addPrefixSpace, bool addBeginningOfSentence, bool addEndOfSentence, out string? normalizedString, bool considerPreTokenization = true, bool considerNormalization = true) 319EncodeResults<EncodedToken> result = EncodeToTokens(null, text, addPrefixSpace, addBeginningOfSentence, addEndOfSentence, considerPreTokenization, considerNormalization); 324private EncodeResults<EncodedToken> EncodeToTokens(string? text, scoped ReadOnlySpan<char> textSpan, bool addPrefixSpace, bool addBos, bool addEos, bool considerPreTokenization, bool considerNormalization) 328return new EncodeResults<EncodedToken> { Tokens = [], NormalizedText = null, CharsConsumed = 0 }; 376List<EncodedToken> tokens = new(); 401return new EncodeResults<EncodedToken> { Tokens = tokens, NormalizedText = normalizedString, CharsConsumed = textSpanToEncode.Length }; 421private void EncodeInternal(string? text, scoped ReadOnlySpan<char> textSpan, List<EncodedToken> tokens, bool addPrefixSpace, int offset, PriorityQueue<SymbolPair> agenda) 434if (_cache.TryGetValue(textSpan, out List<EncodedToken>? hit)) 459List<EncodedToken> result = EncodeToTokens(token.Slice(0, encodedLength), mapping.Slice(0, encodedLength), textSpan, agenda); 1008private int EncodeToIdsResult(List<EncodedToken> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int charsConsumed) 1016foreach (var t in tokens) 1064private int EncodeToIdsFromEndResult(List<EncodedToken> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int textIndex) 1072foreach (var t in tokens) 1118if (_cache.TryGetValue(textSpan, out List<EncodedToken>? hit)) 1142List<EncodedToken> result = EncodeToTokens(token.Slice(0, encodedLength), mapping.Slice(0, encodedLength), textSpan, agenda); 1182if (_cache.TryGetValue(textSpan, out List<EncodedToken>? hit)) 1206List<EncodedToken> result = EncodeToTokens(token.Slice(0, encodedLength), mapping.Slice(0, encodedLength), textSpan, agenda); 1587private static void AppendTokenWithOffsetAdjusting(IReadOnlyList<EncodedToken> tokensToAdd, List<EncodedToken> tokens, int offset, bool addPrefixSpace) 1603foreach (EncodedToken t in tokensToAdd) 1613private List<EncodedToken> EncodeToTokens(Span<char> text, Span<int> mapping, ReadOnlySpan<char> originalText, PriorityQueue<SymbolPair> agenda) 1625return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)].Id, tokenValue, (mapping[0], 1)) }; 1674List<EncodedToken> result = new List<EncodedToken>(text.Length); 1695static EncodedToken GetToken(int id, string token, int index, int length, ReadOnlySpan<char> originalText, Span<int> mapping)
Model\EnglishRobertaTokenizer.cs (27)
27private readonly StringSpanOrdinalKeyCache<List<EncodedToken>> _cache; 157_cache = new StringSpanOrdinalKeyCache<List<EncodedToken>>(); 298/// Encodes input text to a list of <see cref="EncodedToken" />s. 303protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings) 307return new EncodeResults<EncodedToken> { Tokens = [], NormalizedText = null, CharsConsumed = 0 }; 323List<EncodedToken> tokens = new(); 326foreach (EncodedToken t in EncodeInternal(textSpanToEncode.Slice(split.Offset, split.Length))) 332return new EncodeResults<EncodedToken> { Tokens = tokens, NormalizedText = normalizedString, CharsConsumed = charsConsumed }; 336return new EncodeResults<EncodedToken> { Tokens = EncodeInternal(textSpanToEncode), NormalizedText = normalizedString, CharsConsumed = charsConsumed }; 345private IReadOnlyList<EncodedToken> EncodeInternal(ReadOnlySpan<char> text) 375if (_cache.TryGetValue(text, out List<EncodedToken>? hit)) 382List<EncodedToken> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping); 577private int EncodeToIdsResult(List<EncodedToken> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int charsConsumed) 585foreach (var t in tokens) 614private int EncodeToIdsFromEndResult(List<EncodedToken> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int textIndex) 622foreach (var t in tokens) 659if (_cache.TryGetValue(text, out List<EncodedToken>? hit)) 688List<EncodedToken> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping); 704if (_cache.TryGetValue(text, out List<EncodedToken>? hit)) 733List<EncodedToken> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping); 900private IReadOnlyList<EncodedToken> ModifyTokenListOffsets(IReadOnlyList<EncodedToken> tokens, Span<int> indexMapping) 910List<EncodedToken> list = new List<EncodedToken>(tokens.Count); 937private List<EncodedToken> EncodeToTokens(Span<char> token, Span<int> indexMapping) 950return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)], tokenValue, (indexMapping[0], 1)) }; 1034var tokens = new List<EncodedToken>(word.Count);
Model\SentencePieceBpeTokenizer.cs (13)
190/// Encodes input text to a list of <see cref="EncodedToken" />s. 195protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings) 197return new EncodeResults<EncodedToken> 206/// Encodes input text a list of <see cref="EncodedToken" />s with string value of the token, id, and offset. 214/// <returns>The tokenization result includes a list of <see cref="EncodedToken" />s with string value of the token, id, and offset.</returns> 215public IReadOnlyList<EncodedToken> EncodeToTokens(string text, out string? normalizedString, bool addBeginningOfSentence, bool addEndOfSentence, bool considerPreTokenization = true, bool considerNormalization = true) 219/// Encodes input text a list of <see cref="EncodedToken" />s with string value of the token, id, and offset. 227/// <returns>The tokenization result includes a list of <see cref="EncodedToken" />s with string value of the token, id, and offset.</returns> 228public IReadOnlyList<EncodedToken> EncodeToTokens(ReadOnlySpan<char> text, out string? normalizedString, bool addBeginningOfSentence, bool addEndOfSentence, bool considerPreTokenization = true, bool considerNormalization = true) 231private IReadOnlyList<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, out string? normalizedString, bool addBeginningOfSentence, bool addEndOfSentence, bool considerPreTokenization, bool considerNormalization) 255List<EncodedToken>? tokens = new(); 269private void EncodeWithSpecialTokens(ReadOnlySpan<char> text, bool addBeginOfSentence, bool addEndOfSentence, List<EncodedToken> tokens) 314private void EncodeInternal(ReadOnlySpan<char> text, bool addBeginOfSentence, bool addEndOfSentence, List<EncodedToken> tokens)
Model\TiktokenTokenizer.cs (6)
252/// Encodes input text to a list of <see cref="EncodedToken" />s. 257protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings) 261return new EncodeResults<EncodedToken> { NormalizedText = null, Tokens = [], CharsConsumed = 0 }; 275List<EncodedToken> tokens = new(); 289return new EncodeResults<EncodedToken> { NormalizedText = normalizedString, Tokens = tokens, CharsConsumed = charsConsumed }; 298private void EncodeToTokens(ReadOnlySpan<char> text, List<EncodedToken> tokens, int offset)
Model\Word.cs (1)
292public void ToTokens(SortedDictionary<int, string> vocabReverse, List<EncodedToken> tokens, int offset)
Tokenizer.cs (14)
44EncodeResults<EncodedToken> results = EncodeToTokens(text, textSpan, settings); 133/// Encodes input text to a list of <see cref="EncodedToken" />s. 138protected abstract EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings); 141/// Encodes input text to a list of <see cref="EncodedToken" />s. 147/// <returns>The list of encoded <see cref="EncodedToken" />s.</returns> 148public IReadOnlyList<EncodedToken> EncodeToTokens(string text, out string? normalizedString, bool considerPreTokenization = true, bool considerNormalization = true) 150EncodeResults<EncodedToken> result = EncodeToTokens(text, text.AsSpan(), new EncodeSettings { ConsiderPreTokenization = considerPreTokenization, ConsiderNormalization = considerNormalization }); 157/// Encodes input text to a list of <see cref="EncodedToken" />s. 163/// <returns>The list of encoded <see cref="EncodedToken" />s.</returns> 164public IReadOnlyList<EncodedToken> EncodeToTokens(ReadOnlySpan<char> text, out string? normalizedString, bool considerPreTokenization = true, bool considerNormalization = true) 166EncodeResults<EncodedToken> result = EncodeToTokens(null, text, new EncodeSettings { ConsiderPreTokenization = considerPreTokenization, ConsiderNormalization = considerNormalization }); 235EncodeResults<EncodedToken> tokens = EncodeToTokens(text, textSpan, settings); 243var token = tokens.Tokens[tokenCount - 1]; 253var token = tokens.Tokens[tokens.Tokens.Count - tokenCount];
Microsoft.ML.Tokenizers.Tests (30)
BpeTests.cs (4)
257IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(sentence, out _); 376IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _); 429IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _); 430IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
CodeGenTests.cs (4)
231private void ValidateEncoding(IReadOnlyList<EncodedToken> encoding, bool addPrefixSpace, string[] expectedTokens, (int Index, int Length)[] expectedOffsets, int[] expectedIds, 252IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _); 347IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _); 549IReadOnlyList<EncodedToken> encoding = codeGenTokenizer.EncodeToTokens(text, out _);
EnglishRobertaTests.cs (3)
181IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _); 182IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _); 244IReadOnlyList<EncodedToken> encoding;
LlamaTests.cs (5)
244IReadOnlyList<EncodedToken> result = llamaTokenizer.EncodeToTokens(input, out _); 266IReadOnlyList<EncodedToken> bpeTokens = bpe.EncodeToTokens(normalizedInput.AsSpan(), out _, addBeginningOfSentence: false, addEndOfSentence: false, considerNormalization: false); 500IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _); 501IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _); 657IReadOnlyList<EncodedToken> encodedTokens;
NormalizerTests.cs (1)
65IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out string? normalizedString);
PreTokenizerTests.cs (1)
59IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
TitokenTests.cs (9)
142IReadOnlyList<EncodedToken> result = tokenizer.EncodeToTokens(text, out string? normalizedString); 195IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedString); 238IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedString); 257IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedString); 273IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedString); 307IReadOnlyList<EncodedToken> result = GPT4o.EncodeToTokens(text, out string? normalizedString); 564IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _); 565IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _); 679IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out _);
TokenizerTests.cs (3)
105protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings) 107var tokens = new List<EncodedToken>(); 119return new EncodeResults<EncodedToken> { Tokens = tokens, CharsConsumed = count };
Microsoft.ML.TorchSharp (2)
NasBert\NerTrainer.cs (2)
170IReadOnlyList<EncodedToken> encoding = Tokenizer.EncodeToTokens(sentence, out string normalizedString); 380IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(sentence, out string normalizedString);