126 instantiations of EncodedToken
Microsoft.ML.Tokenizers (29)
Model\BPETokenizer.cs (1)
985: tokens.Add(new EncodedToken(value.specialTokenId, value.specialToken, new Range(offset, offset + text.Length)));
Model\CodeGenTokenizer.cs (8)
379: tokens.Add(new EncodedToken(BeginningOfSentenceId.Value, BeginningOfSentenceToken!, new Range(0, 0)));
399: tokens.Add(new EncodedToken(EndOfSentenceId.Value, EndOfSentenceToken!, new Range(index, index)));
432: tokens.Add(new EncodedToken(value.specialTokenId, value.specialToken, new Range(index, index + ((addPrefixSpace && offset == 0) ? textSpan.Length - 1 : textSpan.Length))));
1596: tokens.Add(new EncodedToken(tokensToAdd[0].Id, tokensToAdd[0].Value, new Range(r.s, r.e)));
1600: tokens.Add(new EncodedToken(tokensToAdd[i].Id, tokensToAdd[i].Value, new Range(tokensToAdd[i].Offset.Start.Value + offset - 1, tokensToAdd[i].Offset.End.Value + offset - 1)));
1608: tokens.Add(new EncodedToken(t.Id, t.Value, new Range(t.Offset.Start.Value + offset, t.Offset.End.Value + offset)));
1628: return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)].Id, tokenValue, new Range(mapping[0], mapping[0] + 1)) };
1701: return new EncodedToken(id, token, new Range(mapping[index], endIndex));
Model\EnglishRobertaTokenizer.cs (4)
339: tokens.Add(new EncodedToken(t.Id, t.Value, new Range(split.Offset + t.Offset.Start.Value, split.Offset + t.Offset.End.Value)));
929: list.Add(new EncodedToken(tokens[j].Id, tokens[j].Value, new Range(indexMapping[index], indexMapping[index] + tokens[j].Value.Length)));
961: return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)], tokenValue, new Range(indexMapping[0], indexMapping[0] + 1)) };
1050: tokens.Add(new EncodedToken(_vocab[new StringSpanOrdinalKey(w)], w, new Range(indexMapping[index], indexMapping[index] + w.Length)));
Model\SentencePieceTokenizer.cs (9)
276: tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0)));
290: tokens.Add(new EncodedToken(id, _specialTokensReverse![id], new Range(Offset, Offset + Length)));
303: tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(text.Length, text.Length)));
323: tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0)));
353: tokens.Add(new EncodedToken(
368: tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(text.Length, text.Length)));
385: tokens.Add(new EncodedToken(id, token, new Range(index + i, index + i + 1)));
409: tokens.Add(new EncodedToken(id, token, new Range(index + i, index + i + length)));
437: tokens.Add(new EncodedToken(id.Id, text.Slice(pieceSpan.Index, pieceSpan.Length).ToString(), new Range(pieceSpan.Index, pieceSpan.Index + pieceSpan.Length)));
Model\TiktokenTokenizer.cs (3)
307: tokens.Add(new EncodedToken(
319: tokens.Add(new EncodedToken(mappedId.Id, mappedId.Token, new Range(offset, offset + mappedId.Token.Length)));
348: tokens.Add(new EncodedToken(
Model\Word.cs (1)
299: tokens.Add(new EncodedToken(_symbols[i].C, vocabReverse[_symbols[i].C], new Range(index + offset, index + offset + _symbols[i].Len)));
Model\WordPieceTokenizer.cs (3)
319: tokens.Add(new EncodedToken(UnknownTokenId, UnknownToken, new Range(offset, offset + text.Length)));
352: curToken = new EncodedToken(id, _vocabReverse[id], new Range(offset + start, offset + end));
373: tokens.Add(new EncodedToken(UnknownTokenId, UnknownToken, new Range(offset, offset + textLength)));
Microsoft.ML.Tokenizers.Tests (97)
BertTokenizerTests.cs (66)
60: new EncodedToken(8, "hello", new Range(0, 5)),
61: new EncodedToken(6, ",", new Range(5, 6)),
62: new EncodedToken(10, "how", new Range(7, 10)),
63: new EncodedToken(11, "are", new Range(11, 14)),
64: new EncodedToken(12, "you", new Range(15, 18)),
65: new EncodedToken(13, "[SPECIAL]", new Range(19, 28)),
66: new EncodedToken(7, "?", new Range(28, 29))
80: new EncodedToken(2, "[CLS]", new Range(0, 5)),
81: new EncodedToken(8, "hello", new Range(6, 11)),
82: new EncodedToken(6, ",", new Range(11, 12)),
83: new EncodedToken(10, "how", new Range(13, 16)),
84: new EncodedToken(11, "are", new Range(17, 20)),
85: new EncodedToken(12, "you", new Range(21, 24)),
86: new EncodedToken(13, "[SPECIAL]", new Range(25, 34)),
87: new EncodedToken(7, "?", new Range(34, 35)),
88: new EncodedToken(3, "[SEP]", new Range(36, 41))
133: new EncodedToken(8, "hello", new Range(0, 5)),
134: new EncodedToken(6, ",", new Range(5, 6)),
135: new EncodedToken(10, "how", new Range(7, 10)),
136: new EncodedToken(11, "are", new Range(11, 14)),
137: new EncodedToken(12, "you", new Range(15, 18)),
138: new EncodedToken(7, "?", new Range(18, 19))
152: new EncodedToken(2, "[CLS]", new Range(0, 5)),
153: new EncodedToken(8, "hello", new Range(6, 11)),
154: new EncodedToken(6, ",", new Range(11, 12)),
155: new EncodedToken(10, "how", new Range(13, 16)),
156: new EncodedToken(11, "are", new Range(17, 20)),
157: new EncodedToken(12, "you", new Range(21, 24)),
158: new EncodedToken(7, "?", new Range(24, 25)),
159: new EncodedToken(3, "[SEP]", new Range(26, 31))
201: new EncodedToken(1, "[UNK]", new Range(0, 5)),
202: new EncodedToken(6, ",", new Range(5, 6)),
203: new EncodedToken(1, "[UNK]", new Range(7, 10)),
204: new EncodedToken(11, "are", new Range(11, 14)),
205: new EncodedToken(12, "you", new Range(15, 18)),
206: new EncodedToken(7, "?", new Range(18, 19))
241: new EncodedToken(10, "café", new Range(0, 4)),
242: new EncodedToken(12, "über", new Range(5, 9)),
243: new EncodedToken(15, "ångström", new Range(10, 18)),
244: new EncodedToken(18, "résumé", new Range(19, 25)),
245: new EncodedToken(5, "!", new Range(25, 26)),
256: new EncodedToken(8, "Café", new Range(0, 4)),
257: new EncodedToken(11, "Über", new Range(5, 9)),
258: new EncodedToken(14, "Ångström", new Range(10, 18)),
259: new EncodedToken(17, "Résumé", new Range(19, 25)),
260: new EncodedToken(5, "!", new Range(25, 26)),
272: new EncodedToken(9, "cafe", new Range(0, 4)),
273: new EncodedToken(13, "uber", new Range(5, 9)),
274: new EncodedToken(16, "angstrom", new Range(10, 18)),
275: new EncodedToken(19, "resume", new Range(19, 25)),
276: new EncodedToken(5, "!", new Range(25, 26)),
286: new EncodedToken(20, "Cafe", new Range(0, 4)),
287: new EncodedToken(21, "Uber", new Range(5, 9)),
288: new EncodedToken(22, "Angstrom", new Range(10, 18)),
289: new EncodedToken(23, "Resume", new Range(19, 25)),
290: new EncodedToken(5, "!", new Range(25, 26)),
317: new EncodedToken(9, "叟", new Range(1, 2)),
318: new EncodedToken(11, "驷", new Range(4, 5)),
319: new EncodedToken(10, "叢", new Range(8, 9)),
320: new EncodedToken(12, "驸", new Range(11, 12)),
321: new EncodedToken(5, "!", new Range(13, 14))
335: new EncodedToken(9, "叟", new Range(0, 1)),
336: new EncodedToken(6, "##驷", new Range(1, 2)),
337: new EncodedToken(10, "叢", new Range(3, 4)),
338: new EncodedToken(7, "##驸", new Range(4, 5)),
339: new EncodedToken(5, "!", new Range(5, 6))
BpeTests.cs (13)
510: new EncodedToken(15496, "Hello", new Range(0, 5)),
511: new EncodedToken(11, ",", new Range(5, 6)),
512: new EncodedToken(88, "y", new Range(7, 8)),
513: new EncodedToken(6, "'", new Range(8, 9)),
514: new EncodedToken(439, "all", new Range(9, 12)),
515: new EncodedToken(0, "!", new Range(12, 13)),
516: new EncodedToken(9, "<issue_comment>", new Range(14, 29)),
517: new EncodedToken(2437, "How", new Range(29, 32)),
518: new EncodedToken(533, "are", new Range(33, 36)),
519: new EncodedToken(5832, "you", new Range(37, 40)),
520: new EncodedToken(50256, "<|endoftext|>", new Range(41, 43)),
521: new EncodedToken(30, "?", new Range(44, 45)),
522: new EncodedToken(0, "<|endoftext|>", new Range(45, 58))
TokenizerTests.cs (1)
115: tokens.Add(new EncodedToken(c - 'a', c.ToString(), new Range(count, count + 1)));
WordPieceTests.cs (17)
78: new EncodedToken(7, "un", new Range(0, 2)),
79: new EncodedToken(4, "##want", new Range(2, 6)),
80: new EncodedToken(5, "##ed", new Range(6, 8)),
81: new EncodedToken(8, "runn", new Range(9, 13)),
82: new EncodedToken(9, "##ing", new Range(13, 16))
159: new EncodedToken(0, "[UNK]", new Range(0, 9)),
160: new EncodedToken(8, "runn", new Range(10, 14)),
161: new EncodedToken(9, "##ing", new Range(14, 17))
197: new EncodedToken(0, "[UNK]", new Range(0, 5)),
198: new EncodedToken(7, "un", new Range(6, 8)),
199: new EncodedToken(4, "##want", new Range(8, 12)),
200: new EncodedToken(5, "##ed", new Range(12, 14)),
201: new EncodedToken(2, "[SEP]", new Range(15, 20)),
202: new EncodedToken(1, "[CLS]", new Range(20, 25)),
203: new EncodedToken(8, "runn", new Range(26, 30)),
204: new EncodedToken(9, "##ing", new Range(30, 33)),
205: new EncodedToken(1, "[CLS]", new Range(34, 39)),
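All of the instantiation sites above use the same three-argument constructor: a vocabulary id, the token's string value, and a Range giving the token's character span in the (possibly normalized) input. A minimal sketch of that pattern (the id and offsets here are hypothetical, for illustration only):

    using System;
    using Microsoft.ML.Tokenizers;

    // Hypothetical values; at the call sites above the id comes from the
    // tokenizer's vocabulary and the Range spans the token's characters.
    EncodedToken token = new EncodedToken(8, "hello", new Range(0, 5));
    Console.WriteLine($"{token.Id} '{token.Value}' {token.Offset}"); // prints: 8 'hello' 0..5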
145 references to EncodedToken
Microsoft.ML.Tokenizers (108)
EncodedToken.cs (2)
13: public readonly struct EncodedToken : IEquatable<EncodedToken>
44: public bool Equals(EncodedToken other) => Id == other.Id && Value == other.Value && Offset.Equals(other.Offset);
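Taken together, the declaration at line 13, the Equals at line 44, and the three-argument construction used at every instantiation site suggest the struct has roughly this shape (a reconstruction for orientation, not the verbatim source; the real type also carries the usual object overrides):

    using System;

    public readonly struct EncodedToken : IEquatable<EncodedToken>
    {
        public int Id { get; }       // vocabulary id
        public string Value { get; } // token text
        public Range Offset { get; } // character span in the input

        public EncodedToken(int id, string value, Range offset)
        {
            Id = id;
            Value = value;
            Offset = offset;
        }

        public bool Equals(EncodedToken other) =>
            Id == other.Id && Value == other.Value && Offset.Equals(other.Offset);
    }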
Model\BPETokenizer.cs (7)
308: /// Encodes input text to a list of <see cref="EncodedToken" />s.
313: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
317: return new EncodeResults<EncodedToken> { Tokens = [], NormalizedText = null, CharsConsumed = 0 };
331: List<EncodedToken> tokens = new();
346: return new EncodeResults<EncodedToken> { Tokens = tokens, NormalizedText = normalizedText, CharsConsumed = charsConsumed };
979: internal void WordToTokens(ref Word word, List<EncodedToken> tokens, int offset) => word.ToTokens(VocabReverse, tokens, offset);
981: internal void EncodeWithCache(ReadOnlySpan<char> text, List<EncodedToken> tokens, int offset, ref PriorityQueue<Merge>? priorityQueue)
Model\CodeGenTokenizer.cs (31)
31: private readonly StringSpanOrdinalKeyCache<List<EncodedToken>> _cache;
127: _cache = new StringSpanOrdinalKeyCache<List<EncodedToken>>();
280: /// Encodes input text to a list of <see cref="EncodedToken" />s.
285: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
299: public IReadOnlyList<EncodedToken> EncodeToTokens(string text, bool addPrefixSpace, bool addBeginningOfSentence, bool addEndOfSentence, out string? normalizedText, bool considerPreTokenization = true, bool considerNormalization = true)
301: EncodeResults<EncodedToken> result = EncodeToTokens(text, ReadOnlySpan<char>.Empty, addPrefixSpace, addBeginningOfSentence, addEndOfSentence, considerPreTokenization, considerNormalization);
317: public IReadOnlyList<EncodedToken> EncodeToTokens(ReadOnlySpan<char> text, bool addPrefixSpace, bool addBeginningOfSentence, bool addEndOfSentence, out string? normalizedText, bool considerPreTokenization = true, bool considerNormalization = true)
319: EncodeResults<EncodedToken> result = EncodeToTokens(null, text, addPrefixSpace, addBeginningOfSentence, addEndOfSentence, considerPreTokenization, considerNormalization);
324: private EncodeResults<EncodedToken> EncodeToTokens(string? text, scoped ReadOnlySpan<char> textSpan, bool addPrefixSpace, bool addBos, bool addEos, bool considerPreTokenization, bool considerNormalization)
328: return new EncodeResults<EncodedToken> { Tokens = [], NormalizedText = null, CharsConsumed = 0 };
376: List<EncodedToken> tokens = new();
402: return new EncodeResults<EncodedToken> { Tokens = tokens, NormalizedText = normalizedText, CharsConsumed = textSpanToEncode.Length };
422: private void EncodeInternal(string? text, scoped ReadOnlySpan<char> textSpan, List<EncodedToken> tokens, bool addPrefixSpace, int offset, PriorityQueue<SymbolPair> agenda)
436: if (_cache.TryGetValue(textSpan, out List<EncodedToken>? hit))
461: List<EncodedToken> result = EncodeToTokens(token.Slice(0, encodedLength), mapping.Slice(0, encodedLength), textSpan, agenda);
1010: private int EncodeToIdsResult(List<EncodedToken> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int charsConsumed)
1018: foreach (var t in tokens)
1066: private int EncodeToIdsFromEndResult(List<EncodedToken> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int textIndex)
1074: foreach (var t in tokens)
1120: if (_cache.TryGetValue(textSpan, out List<EncodedToken>? hit))
1144: List<EncodedToken> result = EncodeToTokens(token.Slice(0, encodedLength), mapping.Slice(0, encodedLength), textSpan, agenda);
1184: if (_cache.TryGetValue(textSpan, out List<EncodedToken>? hit))
1208: List<EncodedToken> result = EncodeToTokens(token.Slice(0, encodedLength), mapping.Slice(0, encodedLength), textSpan, agenda);
1589: private static void AppendTokenWithOffsetAdjusting(IReadOnlyList<EncodedToken> tokensToAdd, List<EncodedToken> tokens, int offset, bool addPrefixSpace)
1606: foreach (EncodedToken t in tokensToAdd)
1616: private List<EncodedToken> EncodeToTokens(Span<char> text, Span<int> mapping, ReadOnlySpan<char> originalText, PriorityQueue<SymbolPair> agenda)
1628: return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)].Id, tokenValue, new Range(mapping[0], mapping[0] + 1)) };
1677: List<EncodedToken> result = new List<EncodedToken>(text.Length);
1698: static EncodedToken GetToken(int id, string token, int index, int length, ReadOnlySpan<char> originalText, Span<int> mapping)
Model\EnglishRobertaTokenizer.cs (27)
27: private readonly StringSpanOrdinalKeyCache<List<EncodedToken>> _cache;
169: _cache = new StringSpanOrdinalKeyCache<List<EncodedToken>>();
309: /// Encodes input text to a list of <see cref="EncodedToken" />s.
314: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
318: return new EncodeResults<EncodedToken> { Tokens = [], NormalizedText = null, CharsConsumed = 0 };
334: List<EncodedToken> tokens = new();
337: foreach (EncodedToken t in EncodeInternal(textSpanToEncode.Slice(split.Offset, split.Length)))
343: return new EncodeResults<EncodedToken> { Tokens = tokens, NormalizedText = normalizedText, CharsConsumed = charsConsumed };
347: return new EncodeResults<EncodedToken> { Tokens = EncodeInternal(textSpanToEncode), NormalizedText = normalizedText, CharsConsumed = charsConsumed };
356: private IReadOnlyList<EncodedToken> EncodeInternal(ReadOnlySpan<char> text)
386: if (_cache.TryGetValue(text, out List<EncodedToken>? hit))
393: List<EncodedToken> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping);
588: private int EncodeToIdsResult(List<EncodedToken> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int charsConsumed)
596: foreach (var t in tokens)
625: private int EncodeToIdsFromEndResult(List<EncodedToken> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int textIndex)
633: foreach (var t in tokens)
670: if (_cache.TryGetValue(text, out List<EncodedToken>? hit))
699: List<EncodedToken> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping);
715: if (_cache.TryGetValue(text, out List<EncodedToken>? hit))
744: List<EncodedToken> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping);
911: private IReadOnlyList<EncodedToken> ModifyTokenListOffsets(IReadOnlyList<EncodedToken> tokens, Span<int> indexMapping)
921: List<EncodedToken> list = new List<EncodedToken>(tokens.Count);
948: private List<EncodedToken> EncodeToTokens(Span<char> token, Span<int> indexMapping)
961: return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)], tokenValue, new Range(indexMapping[0], indexMapping[0] + 1)) };
1045: var tokens = new List<EncodedToken>(word.Count);
Model\SentencePieceTokenizer.cs (13)
191: /// Encodes input text to a list of <see cref="EncodedToken" />s.
196: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
198: return new EncodeResults<EncodedToken>
207: /// Encodes input text a list of <see cref="EncodedToken" />s with string value of the token, id, and offset.
215: /// <returns>The tokenization result includes a list of <see cref="EncodedToken" />s with string value of the token, id, and offset.</returns>
216: public IReadOnlyList<EncodedToken> EncodeToTokens(string text, out string? normalizedText, bool addBeginningOfSentence, bool addEndOfSentence, bool considerPreTokenization = true, bool considerNormalization = true)
220: /// Encodes input text a list of <see cref="EncodedToken" />s with string value of the token, id, and offset.
228: /// <returns>The tokenization result includes a list of <see cref="EncodedToken" />s with string value of the token, id, and offset.</returns>
229: public IReadOnlyList<EncodedToken> EncodeToTokens(ReadOnlySpan<char> text, out string? normalizedText, bool addBeginningOfSentence, bool addEndOfSentence, bool considerPreTokenization = true, bool considerNormalization = true)
232: private IReadOnlyList<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, out string? normalizedText, bool addBeginningOfSentence, bool addEndOfSentence, bool considerPreTokenization, bool considerNormalization)
256: List<EncodedToken>? tokens = new();
270: private void EncodeWithSpecialTokens(ReadOnlySpan<char> text, bool addBeginOfSentence, bool addEndOfSentence, List<EncodedToken> tokens)
315: private void EncodeInternal(ReadOnlySpan<char> text, bool addBeginOfSentence, bool addEndOfSentence, List<EncodedToken> tokens)
Model\TiktokenTokenizer.cs (6)
253: /// Encodes input text to a list of <see cref="EncodedToken" />s.
258: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
262: return new EncodeResults<EncodedToken> { NormalizedText = null, Tokens = [], CharsConsumed = 0 };
276: List<EncodedToken> tokens = new();
290: return new EncodeResults<EncodedToken> { NormalizedText = normalizedText, Tokens = tokens, CharsConsumed = charsConsumed };
299: private void EncodeToTokens(ReadOnlySpan<char> text, List<EncodedToken> tokens, int offset)
Model\Word.cs (1)
292: public void ToTokens(SortedDictionary<int, string> vocabReverse, List<EncodedToken> tokens, int offset)
Model\WordPieceTokenizer.cs (7)
267: /// Encodes input text to a list of <see cref="EncodedToken" />s.
272: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
276: return new EncodeResults<EncodedToken> { NormalizedText = null, Tokens = [], CharsConsumed = 0 };
290: List<EncodedToken> tokens = new();
304: return new EncodeResults<EncodedToken> { NormalizedText = normalizedText, Tokens = tokens, CharsConsumed = charsConsumed };
313: private void EncodeToTokens(ReadOnlySpan<char> text, List<EncodedToken> tokens, int offset)
337: EncodedToken curToken = default;
Tokenizer.cs (14)
44: EncodeResults<EncodedToken> results = EncodeToTokens(text, textSpan, settings);
133: /// Encodes input text to a list of <see cref="EncodedToken" />s.
138: protected abstract EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings);
141: /// Encodes input text to a list of <see cref="EncodedToken" />s.
147: /// <returns>The list of encoded <see cref="EncodedToken" />s.</returns>
148: public IReadOnlyList<EncodedToken> EncodeToTokens(string text, out string? normalizedText, bool considerPreTokenization = true, bool considerNormalization = true)
150: EncodeResults<EncodedToken> result = EncodeToTokens(text, text.AsSpan(), new EncodeSettings { ConsiderPreTokenization = considerPreTokenization, ConsiderNormalization = considerNormalization });
157: /// Encodes input text to a list of <see cref="EncodedToken" />s.
163: /// <returns>The list of encoded <see cref="EncodedToken" />s.</returns>
164: public IReadOnlyList<EncodedToken> EncodeToTokens(ReadOnlySpan<char> text, out string? normalizedText, bool considerPreTokenization = true, bool considerNormalization = true)
166: EncodeResults<EncodedToken> result = EncodeToTokens(null, text, new EncodeSettings { ConsiderPreTokenization = considerPreTokenization, ConsiderNormalization = considerNormalization });
235: EncodeResults<EncodedToken> tokens = EncodeToTokens(text, textSpan, settings);
243: var token = tokens.Tokens[tokenCount - 1];
253: var token = tokens.Tokens[tokens.Tokens.Count - tokenCount];
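The overloads at lines 148 and 164 are the public entry point that all of the test references below go through. A usage sketch (the TiktokenTokenizer.CreateForModel factory is assumed here to obtain a concrete instance; any Tokenizer subclass works the same way):

    using System;
    using System.Collections.Generic;
    using Microsoft.ML.Tokenizers;

    // Assumption: CreateForModel is available to build a concrete tokenizer.
    Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");

    IReadOnlyList<EncodedToken> tokens =
        tokenizer.EncodeToTokens("Hello, how are you?", out string? normalizedText);

    foreach (EncodedToken t in tokens)
        Console.WriteLine($"{t.Id}\t{t.Value}\t{t.Offset}");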
Microsoft.ML.Tokenizers.Tests (35)
BpeTests.cs (6)
257: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(sentence, out _);
376: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
429: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
430: IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
507: IReadOnlyList<EncodedToken> tokens = bpeTokenizer.EncodeToTokens(input, out _);
509: EncodedToken[] expectedTokens = [
CodeGenTests.cs (4)
231: private void ValidateEncoding(IReadOnlyList<EncodedToken> encoding, bool addPrefixSpace, string[] expectedTokens, (int Index, int Length)[] expectedOffsets, int[] expectedIds,
252: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
347: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
549: IReadOnlyList<EncodedToken> encoding = codeGenTokenizer.EncodeToTokens(text, out _);
EnglishRobertaTests.cs (3)
181: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
182: IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
244: IReadOnlyList<EncodedToken> encoding;
LlamaTests.cs (5)
244: IReadOnlyList<EncodedToken> result = llamaTokenizer.EncodeToTokens(input, out _);
266: IReadOnlyList<EncodedToken> bpeTokens = bpe.EncodeToTokens(normalizedInput.AsSpan(), out _, addBeginningOfSentence: false, addEndOfSentence: false, considerNormalization: false);
500: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
501: IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
657: IReadOnlyList<EncodedToken> encodedTokens;
NormalizerTests.cs (1)
65: IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out normalizedText);
PreTokenizerTests.cs (1)
66: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
TiktokenTests.cs (9)
143: IReadOnlyList<EncodedToken> result = tokenizer.EncodeToTokens(text, out string? normalizedText);
196: IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
239: IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
258: IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
274: IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
308: IReadOnlyList<EncodedToken> result = GPT4o.EncodeToTokens(text, out string? normalizedText);
568: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
569: IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
683: IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out _);
TokenizerTests.cs (3)
105: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
107: var tokens = new List<EncodedToken>();
119: return new EncodeResults<EncodedToken> { Tokens = tokens, CharsConsumed = count };
WordPieceTests.cs (3)
62: IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens("", out _);
156: IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out _);
194: IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out _);
Microsoft.ML.TorchSharp (2)
NasBert\NerTrainer.cs (2)
170: IReadOnlyList<EncodedToken> encoding = Tokenizer.EncodeToTokens(sentence, out string normalizedText);
380: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(sentence, out string normalizedText);