136 instantiations of EncodedToken
Microsoft.ML.Tokenizers (39)
Model\BPETokenizer.cs (1)
985: tokens.Add(new EncodedToken(value.specialTokenId, value.specialToken, new Range(offset, offset + text.Length)));
Model\CodeGenTokenizer.cs (8)
382: tokens.Add(new EncodedToken(BeginningOfSentenceId.Value, BeginningOfSentenceToken!, new Range(0, 0)));
402: tokens.Add(new EncodedToken(EndOfSentenceId.Value, EndOfSentenceToken!, new Range(index, index)));
435: tokens.Add(new EncodedToken(value.specialTokenId, value.specialToken, new Range(index, index + ((addPrefixSpace && offset == 0) ? textSpan.Length - 1 : textSpan.Length))));
1599: tokens.Add(new EncodedToken(tokensToAdd[0].Id, tokensToAdd[0].Value, new Range(r.s, r.e)));
1603: tokens.Add(new EncodedToken(tokensToAdd[i].Id, tokensToAdd[i].Value, new Range(tokensToAdd[i].Offset.Start.Value + offset - 1, tokensToAdd[i].Offset.End.Value + offset - 1)));
1611: tokens.Add(new EncodedToken(t.Id, t.Value, new Range(t.Offset.Start.Value + offset, t.Offset.End.Value + offset)));
1631: return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)].Id, tokenValue, new Range(mapping[0], mapping[0] + 1)) };
1704: return new EncodedToken(id, token, new Range(mapping[index], endIndex));
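Lines 1599-1611 above do the offset bookkeeping for the addPrefixSpace option: the tokenizer encodes " " + text, then shifts the resulting ranges back so they index the caller's original string, special-casing the first token, which absorbed the injected space. A hedged sketch of that shift, with a helper name of our own choosing:

    // Sketch only: remap a range produced against " " + text back onto text.
    // `offset` is where this split starts in the original string; when a prefix
    // space was injected, every position after the first token is one too far.
    static Range ShiftBack(Range r, int offset, bool addedPrefixSpace)
    {
        int delta = addedPrefixSpace ? offset - 1 : offset;
        return new Range(r.Start.Value + delta, r.End.Value + delta);
    }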
Model\EnglishRobertaTokenizer.cs (4)
339: tokens.Add(new EncodedToken(t.Id, t.Value, new Range(split.Offset + t.Offset.Start.Value, split.Offset + t.Offset.End.Value)));
929: list.Add(new EncodedToken(tokens[j].Id, tokens[j].Value, new Range(indexMapping[index], indexMapping[index] + tokens[j].Value.Length)));
961: return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)], tokenValue, new Range(indexMapping[0], indexMapping[0] + 1)) };
1050: tokens.Add(new EncodedToken(_vocab[new StringSpanOrdinalKey(w)], w, new Range(indexMapping[index], indexMapping[index] + w.Length)));
Model\SentencePieceBpeModel.cs (9)
157: tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0)));
171: tokens.Add(new EncodedToken(id, SpecialTokensReverse![id], new Range(Offset, Offset + Length)));
184: tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(text.Length, text.Length)));
204: tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0)));
234: tokens.Add(new EncodedToken(
249: tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(text.Length, text.Length)));
266: tokens.Add(new EncodedToken(id, token, new Range(index + i, index + i + 1)));
290: tokens.Add(new EncodedToken(id, token, new Range(index + i, index + i + length)));
318: tokens.Add(new EncodedToken(id.Id, text.Slice(pieceSpan.Index, pieceSpan.Length).ToString(), new Range(pieceSpan.Index, pieceSpan.Index + pieceSpan.Length)));
Model\SentencePieceUnigramModel.cs (10)
271: tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0)));
287: tokens.Add(new EncodedToken(id, SpecialTokensReverse![id], new Range(progressOffset, progressOffset + Length)));
303: tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(progressOffset, progressOffset)));
321: tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0)));
331: tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(progressOffset, progressOffset)));
415: tokens.Add(new EncodedToken(node.Id, stringToken, new Range(0, tokenLength))); // we will update the range later.
432: tokens[start] = new EncodedToken(tokens[start].Id, tokens[start].Value, new Range(tokensOffset, tokensOffset + tokenLength));
443: tokens[start] = new EncodedToken(tokens[start].Id, tokens[start].Value, new Range(tokensOffset, tokensOffset + tokenLength));
484: tokens.Insert(insertionStartPosition++, new EncodedToken(id, _vocabReverse[id].Piece, new Range(offsetStart, offsetStart + charLength)));
489: tokens.Insert(insertionStartPosition++, new EncodedToken(id, _vocabReverse[id].Piece, new Range(offsetStart + charLength, offsetStart + charLength)));
Model\TiktokenTokenizer.cs (3)
307: tokens.Add(new EncodedToken(
319: tokens.Add(new EncodedToken(mappedId.Id, mappedId.Token, new Range(offset, offset + mappedId.Token.Length)));
348: tokens.Add(new EncodedToken(
Model\Word.cs (1)
299: tokens.Add(new EncodedToken(_symbols[i].C, vocabReverse[_symbols[i].C], new Range(index + offset, index + offset + _symbols[i].Len)));
Model\WordPieceTokenizer.cs (3)
319: tokens.Add(new EncodedToken(UnknownTokenId, UnknownToken, new Range(offset, offset + text.Length)));
352: curToken = new EncodedToken(id, _vocabReverse[id], new Range(offset + start, offset + end));
373: tokens.Add(new EncodedToken(UnknownTokenId, UnknownToken, new Range(offset, offset + textLength)));
Microsoft.ML.Tokenizers.Tests (97)
BertTokenizerTests.cs (66)
60: new EncodedToken(8, "hello", new Range(0, 5)),
61: new EncodedToken(6, ",", new Range(5, 6)),
62: new EncodedToken(10, "how", new Range(7, 10)),
63: new EncodedToken(11, "are", new Range(11, 14)),
64: new EncodedToken(12, "you", new Range(15, 18)),
65: new EncodedToken(13, "[SPECIAL]", new Range(19, 28)),
66: new EncodedToken(7, "?", new Range(28, 29))
80: new EncodedToken(2, "[CLS]", new Range(0, 5)),
81: new EncodedToken(8, "hello", new Range(6, 11)),
82: new EncodedToken(6, ",", new Range(11, 12)),
83: new EncodedToken(10, "how", new Range(13, 16)),
84: new EncodedToken(11, "are", new Range(17, 20)),
85: new EncodedToken(12, "you", new Range(21, 24)),
86: new EncodedToken(13, "[SPECIAL]", new Range(25, 34)),
87: new EncodedToken(7, "?", new Range(34, 35)),
88: new EncodedToken(3, "[SEP]", new Range(36, 41))
133: new EncodedToken(8, "hello", new Range(0, 5)),
134: new EncodedToken(6, ",", new Range(5, 6)),
135: new EncodedToken(10, "how", new Range(7, 10)),
136: new EncodedToken(11, "are", new Range(11, 14)),
137: new EncodedToken(12, "you", new Range(15, 18)),
138: new EncodedToken(7, "?", new Range(18, 19))
152: new EncodedToken(2, "[CLS]", new Range(0, 5)),
153: new EncodedToken(8, "hello", new Range(6, 11)),
154: new EncodedToken(6, ",", new Range(11, 12)),
155: new EncodedToken(10, "how", new Range(13, 16)),
156: new EncodedToken(11, "are", new Range(17, 20)),
157: new EncodedToken(12, "you", new Range(21, 24)),
158: new EncodedToken(7, "?", new Range(24, 25)),
159: new EncodedToken(3, "[SEP]", new Range(26, 31))
201: new EncodedToken(1, "[UNK]", new Range(0, 5)),
202: new EncodedToken(6, ",", new Range(5, 6)),
203: new EncodedToken(1, "[UNK]", new Range(7, 10)),
204: new EncodedToken(11, "are", new Range(11, 14)),
205: new EncodedToken(12, "you", new Range(15, 18)),
206: new EncodedToken(7, "?", new Range(18, 19))
241: new EncodedToken(10, "café", new Range(0, 4)),
242: new EncodedToken(12, "über", new Range(5, 9)),
243: new EncodedToken(15, "ångström", new Range(10, 18)),
244: new EncodedToken(18, "résumé", new Range(19, 25)),
245: new EncodedToken(5, "!", new Range(25, 26)),
256: new EncodedToken(8, "Café", new Range(0, 4)),
257: new EncodedToken(11, "Über", new Range(5, 9)),
258: new EncodedToken(14, "Ångström", new Range(10, 18)),
259: new EncodedToken(17, "Résumé", new Range(19, 25)),
260: new EncodedToken(5, "!", new Range(25, 26)),
272: new EncodedToken(9, "cafe", new Range(0, 4)),
273: new EncodedToken(13, "uber", new Range(5, 9)),
274: new EncodedToken(16, "angstrom", new Range(10, 18)),
275: new EncodedToken(19, "resume", new Range(19, 25)),
276: new EncodedToken(5, "!", new Range(25, 26)),
286: new EncodedToken(20, "Cafe", new Range(0, 4)),
287: new EncodedToken(21, "Uber", new Range(5, 9)),
288: new EncodedToken(22, "Angstrom", new Range(10, 18)),
289: new EncodedToken(23, "Resume", new Range(19, 25)),
290: new EncodedToken(5, "!", new Range(25, 26)),
317: new EncodedToken(9, "叟", new Range(1, 2)),
318: new EncodedToken(11, "驷", new Range(4, 5)),
319: new EncodedToken(10, "叢", new Range(8, 9)),
320: new EncodedToken(12, "驸", new Range(11, 12)),
321: new EncodedToken(5, "!", new Range(13, 14))
335: new EncodedToken(9, "叟", new Range(0, 1)),
336: new EncodedToken(6, "##驷", new Range(1, 2)),
337: new EncodedToken(10, "叢", new Range(3, 4)),
338: new EncodedToken(7, "##驸", new Range(4, 5)),
339: new EncodedToken(5, "!", new Range(5, 6))
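The expected-token tables above pair each vocabulary id and surface form with a Range over the encoded text. A minimal sketch of how such a table can be asserted (the input literal and tokenizer construction are assumptions, not taken from the test file; the sequence comparison works because EncodedToken implements IEquatable, as shown further down the listing):

    using System;
    using System.Collections.Generic;
    using Microsoft.ML.Tokenizers;
    using Xunit;

    // `tokenizer` is assumed to be the Bert tokenizer the test builds,
    // with the vocabulary ids used in the first table above.
    static void AssertFirstTable(Tokenizer tokenizer)
    {
        string text = "hello, how are you [SPECIAL]?"; // assumed input

        EncodedToken[] expected =
        {
            new EncodedToken(8, "hello", new Range(0, 5)),
            new EncodedToken(6, ",", new Range(5, 6)),
            new EncodedToken(10, "how", new Range(7, 10)),
            new EncodedToken(11, "are", new Range(11, 14)),
            new EncodedToken(12, "you", new Range(15, 18)),
            new EncodedToken(13, "[SPECIAL]", new Range(19, 28)),
            new EncodedToken(7, "?", new Range(28, 29)),
        };

        IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out _);
        Assert.Equal(expected, tokens);

        // Each Offset slices the encoded text back to the token's surface form:
        Assert.Equal("hello", text[expected[0].Offset]);
    }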
BpeTests.cs (13)
510: new EncodedToken(15496, "Hello", new Range(0, 5)),
511: new EncodedToken(11, ",", new Range(5, 6)),
512: new EncodedToken(88, "y", new Range(7, 8)),
513: new EncodedToken(6, "'", new Range(8, 9)),
514: new EncodedToken(439, "all", new Range(9, 12)),
515: new EncodedToken(0, "!", new Range(12, 13)),
516: new EncodedToken(9, "<issue_comment>", new Range(14, 29)),
517: new EncodedToken(2437, "How", new Range(29, 32)),
518: new EncodedToken(533, "are", new Range(33, 36)),
519: new EncodedToken(5832, "you", new Range(37, 40)),
520: new EncodedToken(50256, "<|endoftext|>", new Range(41, 43)),
521: new EncodedToken(30, "?", new Range(44, 45)),
522: new EncodedToken(0, "<|endoftext|>", new Range(45, 58))
TokenizerTests.cs (1)
115: tokens.Add(new EncodedToken(c - 'a', c.ToString(), new Range(count, count + 1)));
WordPieceTests.cs (17)
78: new EncodedToken(7, "un", new Range(0, 2)),
79: new EncodedToken(4, "##want", new Range(2, 6)),
80: new EncodedToken(5, "##ed", new Range(6, 8)),
81: new EncodedToken(8, "runn", new Range(9, 13)),
82: new EncodedToken(9, "##ing", new Range(13, 16))
159: new EncodedToken(0, "[UNK]", new Range(0, 9)),
160: new EncodedToken(8, "runn", new Range(10, 14)),
161: new EncodedToken(9, "##ing", new Range(14, 17))
197: new EncodedToken(0, "[UNK]", new Range(0, 5)),
198: new EncodedToken(7, "un", new Range(6, 8)),
199: new EncodedToken(4, "##want", new Range(8, 12)),
200: new EncodedToken(5, "##ed", new Range(12, 14)),
201: new EncodedToken(2, "[SEP]", new Range(15, 20)),
202: new EncodedToken(1, "[CLS]", new Range(20, 25)),
203: new EncodedToken(8, "runn", new Range(26, 30)),
204: new EncodedToken(9, "##ing", new Range(30, 33)),
205: new EncodedToken(1, "[CLS]", new Range(34, 39)),
157 references to EncodedToken
Microsoft.ML.Tokenizers (117)
EncodedToken.cs (2)
13: public readonly struct EncodedToken : IEquatable<EncodedToken>
44: public bool Equals(EncodedToken other) => Id == other.Id && Value == other.Value && Offset.Equals(other.Offset);
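From the two lines above and the constructor calls throughout the instantiation list, the shape of the type can be read off. A hedged reconstruction (the real type ships in Microsoft.ML.Tokenizers; parameter names and anything beyond Id, Value, Offset, and Equals are assumptions):

    using System;

    // Sketch of the public surface visible in this listing: an (Id, Value,
    // Offset) triple, where Offset is a System.Range over the encoded text.
    public readonly struct EncodedToken : IEquatable<EncodedToken>
    {
        public int Id { get; }        // vocabulary id
        public string Value { get; }  // surface form of the token
        public Range Offset { get; }  // character range in the encoded text

        public EncodedToken(int id, string value, Range offset)
        {
            Id = id;
            Value = value;
            Offset = offset;
        }

        // Line 44 above: value equality over all three members.
        public bool Equals(EncodedToken other)
            => Id == other.Id && Value == other.Value && Offset.Equals(other.Offset);
    }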
Model\BPETokenizer.cs (7)
308: /// Encodes input text to a list of <see cref="EncodedToken" />s.
313: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
317: return new EncodeResults<EncodedToken> { Tokens = [], NormalizedText = null, CharsConsumed = 0 };
331: List<EncodedToken> tokens = new();
346: return new EncodeResults<EncodedToken> { Tokens = tokens, NormalizedText = normalizedText, CharsConsumed = charsConsumed };
979: internal void WordToTokens(ref Word word, List<EncodedToken> tokens, int offset) => word.ToTokens(VocabReverse, tokens, offset);
981: internal void EncodeWithCache(ReadOnlySpan<char> text, List<EncodedToken> tokens, int offset, ref PriorityQueue<Merge>? priorityQueue)
Model\CodeGenTokenizer.cs (31)
34: private readonly StringSpanOrdinalKeyCache<List<EncodedToken>> _cache;
130: _cache = new StringSpanOrdinalKeyCache<List<EncodedToken>>();
283: /// Encodes input text to a list of <see cref="EncodedToken" />s.
288: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
302: public IReadOnlyList<EncodedToken> EncodeToTokens(string text, bool addPrefixSpace, bool addBeginningOfSentence, bool addEndOfSentence, out string? normalizedText, bool considerPreTokenization = true, bool considerNormalization = true)
304: EncodeResults<EncodedToken> result = EncodeToTokens(text, ReadOnlySpan<char>.Empty, addPrefixSpace, addBeginningOfSentence, addEndOfSentence, considerPreTokenization, considerNormalization);
320: public IReadOnlyList<EncodedToken> EncodeToTokens(ReadOnlySpan<char> text, bool addPrefixSpace, bool addBeginningOfSentence, bool addEndOfSentence, out string? normalizedText, bool considerPreTokenization = true, bool considerNormalization = true)
322: EncodeResults<EncodedToken> result = EncodeToTokens(null, text, addPrefixSpace, addBeginningOfSentence, addEndOfSentence, considerPreTokenization, considerNormalization);
327: private EncodeResults<EncodedToken> EncodeToTokens(string? text, scoped ReadOnlySpan<char> textSpan, bool addPrefixSpace, bool addBos, bool addEos, bool considerPreTokenization, bool considerNormalization)
331: return new EncodeResults<EncodedToken> { Tokens = [], NormalizedText = null, CharsConsumed = 0 };
379: List<EncodedToken> tokens = new();
405: return new EncodeResults<EncodedToken> { Tokens = tokens, NormalizedText = normalizedText, CharsConsumed = textSpanToEncode.Length };
425: private void EncodeInternal(string? text, scoped ReadOnlySpan<char> textSpan, List<EncodedToken> tokens, bool addPrefixSpace, int offset, PriorityQueue<SymbolPair> agenda)
439: if (_cache.TryGetValue(textSpan, out List<EncodedToken>? hit))
464: List<EncodedToken> result = EncodeToTokens(token.Slice(0, encodedLength), mapping.Slice(0, encodedLength), textSpan, agenda);
1013: private int EncodeToIdsResult(List<EncodedToken> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int charsConsumed)
1021: foreach (var t in tokens)
1069: private int EncodeToIdsFromEndResult(List<EncodedToken> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int textIndex)
1077: foreach (var t in tokens)
1123: if (_cache.TryGetValue(textSpan, out List<EncodedToken>? hit))
1147: List<EncodedToken> result = EncodeToTokens(token.Slice(0, encodedLength), mapping.Slice(0, encodedLength), textSpan, agenda);
1187: if (_cache.TryGetValue(textSpan, out List<EncodedToken>? hit))
1211: List<EncodedToken> result = EncodeToTokens(token.Slice(0, encodedLength), mapping.Slice(0, encodedLength), textSpan, agenda);
1592: private static void AppendTokenWithOffsetAdjusting(IReadOnlyList<EncodedToken> tokensToAdd, List<EncodedToken> tokens, int offset, bool addPrefixSpace)
1609: foreach (EncodedToken t in tokensToAdd)
1619: private List<EncodedToken> EncodeToTokens(Span<char> text, Span<int> mapping, ReadOnlySpan<char> originalText, PriorityQueue<SymbolPair> agenda)
1631: return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)].Id, tokenValue, new Range(mapping[0], mapping[0] + 1)) };
1680: List<EncodedToken> result = new List<EncodedToken>(text.Length);
1701: static EncodedToken GetToken(int id, string token, int index, int length, ReadOnlySpan<char> originalText, Span<int> mapping)
Model\EnglishRobertaTokenizer.cs (27)
27: private readonly StringSpanOrdinalKeyCache<List<EncodedToken>> _cache;
169: _cache = new StringSpanOrdinalKeyCache<List<EncodedToken>>();
309: /// Encodes input text to a list of <see cref="EncodedToken" />s.
314: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
318: return new EncodeResults<EncodedToken> { Tokens = [], NormalizedText = null, CharsConsumed = 0 };
334: List<EncodedToken> tokens = new();
337: foreach (EncodedToken t in EncodeInternal(textSpanToEncode.Slice(split.Offset, split.Length)))
343: return new EncodeResults<EncodedToken> { Tokens = tokens, NormalizedText = normalizedText, CharsConsumed = charsConsumed };
347: return new EncodeResults<EncodedToken> { Tokens = EncodeInternal(textSpanToEncode), NormalizedText = normalizedText, CharsConsumed = charsConsumed };
356: private IReadOnlyList<EncodedToken> EncodeInternal(ReadOnlySpan<char> text)
386: if (_cache.TryGetValue(text, out List<EncodedToken>? hit))
393: List<EncodedToken> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping);
588: private int EncodeToIdsResult(List<EncodedToken> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int charsConsumed)
596: foreach (var t in tokens)
625: private int EncodeToIdsFromEndResult(List<EncodedToken> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int textIndex)
633: foreach (var t in tokens)
670: if (_cache.TryGetValue(text, out List<EncodedToken>? hit))
699: List<EncodedToken> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping);
715: if (_cache.TryGetValue(text, out List<EncodedToken>? hit))
744: List<EncodedToken> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping);
911: private IReadOnlyList<EncodedToken> ModifyTokenListOffsets(IReadOnlyList<EncodedToken> tokens, Span<int> indexMapping)
921: List<EncodedToken> list = new List<EncodedToken>(tokens.Count);
948: private List<EncodedToken> EncodeToTokens(Span<char> token, Span<int> indexMapping)
961: return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)], tokenValue, new Range(indexMapping[0], indexMapping[0] + 1)) };
1045: var tokens = new List<EncodedToken>(word.Count);
Model\SentencePieceBaseModel.cs (1)
165: public abstract IReadOnlyList<EncodedToken> EncodeToTokens(
Model\SentencePieceBpeModel.cs (4)
113: public override IReadOnlyList<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, out string? normalizedText, bool addBeginningOfSentence, bool addEndOfSentence, bool considerNormalization)
137: List<EncodedToken> tokens = new();
151: private void EncodeWithSpecialTokens(ReadOnlySpan<char> text, bool addBeginOfSentence, bool addEndOfSentence, List<EncodedToken> tokens)
196: private void EncodeInternal(ReadOnlySpan<char> text, bool addBeginOfSentence, bool addEndOfSentence, List<EncodedToken> tokens)
Model\SentencePieceTokenizer.cs (9)
124: /// Encodes input text to a list of <see cref="EncodedToken" />s.
129: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
131: return new EncodeResults<EncodedToken>
140: /// Encodes input text a list of <see cref="EncodedToken" />s with string value of the token, id, and offset.
148: /// <returns>The tokenization result includes a list of <see cref="EncodedToken" />s with string value of the token, id, and offset.</returns>
149: public IReadOnlyList<EncodedToken> EncodeToTokens(string text, out string? normalizedText, bool addBeginningOfSentence, bool addEndOfSentence, bool considerPreTokenization = true, bool considerNormalization = true)
153: /// Encodes input text a list of <see cref="EncodedToken" />s with string value of the token, id, and offset.
161: /// <returns>The tokenization result includes a list of <see cref="EncodedToken" />s with string value of the token, id, and offset.</returns>
162: public IReadOnlyList<EncodedToken> EncodeToTokens(ReadOnlySpan<char> text, out string? normalizedText, bool addBeginningOfSentence, bool addEndOfSentence, bool considerPreTokenization = true, bool considerNormalization = true)
Model\SentencePieceUnigramModel.cs (8)
150: public override IReadOnlyList<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, out string? normalizedText, bool addBeginningOfSentence, bool addEndOfSentence, bool considerNormalization)
156: return Array.Empty<EncodedToken>();
159: List<EncodedToken> tokens = new();
262: List<EncodedToken> tokens,
314: List<EncodedToken> tokens,
380: List<EncodedToken> tokens,
426: EncodedToken temp = tokens[start];
462: private void FallbackToByteEncoding(ReadOnlySpan<char> normalizationSpan, List<EncodedToken> tokens, int insertionStartPosition)
Model\TiktokenTokenizer.cs (6)
253: /// Encodes input text to a list of <see cref="EncodedToken" />s.
258: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
262: return new EncodeResults<EncodedToken> { NormalizedText = null, Tokens = [], CharsConsumed = 0 };
276: List<EncodedToken> tokens = new();
290: return new EncodeResults<EncodedToken> { NormalizedText = normalizedText, Tokens = tokens, CharsConsumed = charsConsumed };
299: private void EncodeToTokens(ReadOnlySpan<char> text, List<EncodedToken> tokens, int offset)
Model\Word.cs (1)
292: public void ToTokens(SortedDictionary<int, string> vocabReverse, List<EncodedToken> tokens, int offset)
Model\WordPieceTokenizer.cs (7)
267: /// Encodes input text to a list of <see cref="EncodedToken" />s.
272: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
276: return new EncodeResults<EncodedToken> { NormalizedText = null, Tokens = [], CharsConsumed = 0 };
290: List<EncodedToken> tokens = new();
304: return new EncodeResults<EncodedToken> { NormalizedText = normalizedText, Tokens = tokens, CharsConsumed = charsConsumed };
313: private void EncodeToTokens(ReadOnlySpan<char> text, List<EncodedToken> tokens, int offset)
337: EncodedToken curToken = default;
Tokenizer.cs (14)
44: EncodeResults<EncodedToken> results = EncodeToTokens(text, textSpan, settings);
133: /// Encodes input text to a list of <see cref="EncodedToken" />s.
138: protected abstract EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings);
141: /// Encodes input text to a list of <see cref="EncodedToken" />s.
147: /// <returns>The list of encoded <see cref="EncodedToken" />s.</returns>
148: public IReadOnlyList<EncodedToken> EncodeToTokens(string text, out string? normalizedText, bool considerPreTokenization = true, bool considerNormalization = true)
150: EncodeResults<EncodedToken> result = EncodeToTokens(text, text.AsSpan(), new EncodeSettings { ConsiderPreTokenization = considerPreTokenization, ConsiderNormalization = considerNormalization });
157: /// Encodes input text to a list of <see cref="EncodedToken" />s.
163: /// <returns>The list of encoded <see cref="EncodedToken" />s.</returns>
164: public IReadOnlyList<EncodedToken> EncodeToTokens(ReadOnlySpan<char> text, out string? normalizedText, bool considerPreTokenization = true, bool considerNormalization = true)
166: EncodeResults<EncodedToken> result = EncodeToTokens(null, text, new EncodeSettings { ConsiderPreTokenization = considerPreTokenization, ConsiderNormalization = considerNormalization });
235: EncodeResults<EncodedToken> tokens = EncodeToTokens(text, textSpan, settings);
243: var token = tokens.Tokens[tokenCount - 1];
253: var token = tokens.Tokens[tokens.Tokens.Count - tokenCount];
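Lines 148 and 164 above are the public entry points most of the call sites below go through. A brief usage sketch (the Dump helper is ours, not library code; any concrete tokenizer from the files listed above works):

    using System;
    using System.Collections.Generic;
    using Microsoft.ML.Tokenizers;

    static class EncodedTokenDemo
    {
        // Encodes `text` and prints each token's id, value, and the slice its
        // Offset selects. Uses the string overload at Tokenizer.cs line 148;
        // when the tokenizer applied a normalizer, normalizedText is non-null
        // and the offsets index into that normalized string.
        public static void Dump(Tokenizer tokenizer, string text)
        {
            IReadOnlyList<EncodedToken> tokens =
                tokenizer.EncodeToTokens(text, out string? normalizedText);

            string encoded = normalizedText ?? text;
            foreach (EncodedToken t in tokens)
            {
                Console.WriteLine($"{t.Id}\t{t.Value}\t'{encoded[t.Offset]}'");
            }
        }
    }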
Microsoft.ML.Tokenizers.Tests (38)
BpeTests.cs (6)
257: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(sentence, out _);
376: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
429: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
430: IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
507: IReadOnlyList<EncodedToken> tokens = bpeTokenizer.EncodeToTokens(input, out _);
509: EncodedToken[] expectedTokens = [
CodeGenTests.cs (4)
231: private void ValidateEncoding(IReadOnlyList<EncodedToken> encoding, bool addPrefixSpace, string[] expectedTokens, (int Index, int Length)[] expectedOffsets, int[] expectedIds,
252: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
347: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
549: IReadOnlyList<EncodedToken> encoding = codeGenTokenizer.EncodeToTokens(text, out _);
EnglishRobertaTests.cs (3)
181: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
182: IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
244: IReadOnlyList<EncodedToken> encoding;
LlamaTests.cs (5)
244: IReadOnlyList<EncodedToken> result = llamaTokenizer.EncodeToTokens(input, out _);
266: IReadOnlyList<EncodedToken> bpeTokens = bpe.EncodeToTokens(normalizedInput.AsSpan(), out _, addBeginningOfSentence: false, addEndOfSentence: false, considerNormalization: false);
500: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
501: IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
657: IReadOnlyList<EncodedToken> encodedTokens;
NormalizerTests.cs (1)
65: IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out normalizedText);
PreTokenizerTests.cs (1)
66: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
TiktokenTests.cs (9)
146: IReadOnlyList<EncodedToken> result = tokenizer.EncodeToTokens(text, out string? normalizedText);
199: IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
242: IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
261: IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
277: IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
311: IReadOnlyList<EncodedToken> result = GPT4o.EncodeToTokens(text, out string? normalizedText);
582: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
583: IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
697: IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out _);
TokenizerTests.cs (3)
105: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
107: var tokens = new List<EncodedToken>();
119: return new EncodeResults<EncodedToken> { Tokens = tokens, CharsConsumed = count };
UnigramTests.cs (3)
326: IReadOnlyList<EncodedToken> tokens,
331: List<EncodedToken> writableTokens = tokens.ToList();
404: IReadOnlyList<EncodedToken> result = _unigramTokenizer.EncodeToTokens(inputText, out string? normalized);
WordPieceTests.cs (3)
62: IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens("", out _);
156: IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out _);
194: IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out _);
Microsoft.ML.TorchSharp (2)
NasBert\NerTrainer.cs (2)
170: IReadOnlyList<EncodedToken> encoding = Tokenizer.EncodeToTokens(sentence, out string normalizedText);
380: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(sentence, out string normalizedText);