24 instantiations of EncodedToken
Microsoft.ML.Tokenizers (23)
Model\CodeGenTokenizer.cs (6)
379
tokens.Add(new
EncodedToken
(BeginningOfSentenceId.Value, BeginningOfSentenceToken!, (0, 0)));
1593
tokens.Add(new
EncodedToken
(tokensToAdd[0].Id, tokensToAdd[0].Value, (offset == 0 ? tokensToAdd[0].Offset.Index : tokensToAdd[0].Offset.Index + offset - 1, offset == 0 ? tokensToAdd[0].Offset.Length - 1 : tokensToAdd[0].Offset.Length)));
1597
tokens.Add(new
EncodedToken
(tokensToAdd[i].Id, tokensToAdd[i].Value, (tokensToAdd[i].Offset.Index + offset - 1, tokensToAdd[i].Offset.Length)));
1605
tokens.Add(new
EncodedToken
(t.Id, t.Value, (t.Offset.Index + offset, t.Offset.Length)));
1625
return new List<EncodedToken> { new
EncodedToken
(_vocab[new StringSpanOrdinalKey(tokenValue)].Id, tokenValue, (mapping[0], 1)) };
1699
return new
EncodedToken
(id, token, (tokenStartIndex, tokenLength));
Model\EnglishRobertaTokenizer.cs (4)
328
tokens.Add(new
EncodedToken
(t.Id, t.Value, (split.Offset + t.Offset.Index, t.Offset.Length)));
918
list.Add(new
EncodedToken
(tokens[j].Id, tokens[j].Value, (indexMapping[index], tokens[j].Value.Length)));
950
return new List<EncodedToken> { new
EncodedToken
(_vocab[new StringSpanOrdinalKey(tokenValue)], tokenValue, (indexMapping[0], 1)) };
1039
tokens.Add(new
EncodedToken
(_vocab[new StringSpanOrdinalKey(w)], w, (indexMapping[index], w.Length)));
Model\SentencePieceBpeTokenizer.cs (9)
275
tokens.Add(new
EncodedToken
(BeginningOfSentenceId, BeginningOfSentenceToken, (0, 0)));
289
tokens.Add(new
EncodedToken
(id, _specialTokensReverse![id], (Offset, Length)));
302
tokens.Add(new
EncodedToken
(EndOfSentenceId, EndOfSentenceToken, (text.Length, 0)));
322
tokens.Add(new
EncodedToken
(BeginningOfSentenceId, BeginningOfSentenceToken, (0, 0)));
352
tokens.Add(new
EncodedToken
(
367
tokens.Add(new
EncodedToken
(EndOfSentenceId, EndOfSentenceToken, (text.Length, 0)));
384
tokens.Add(new
EncodedToken
(id, token, (index + i, 1)));
408
tokens.Add(new
EncodedToken
(id, token, (index + i, length)));
436
tokens.Add(new
EncodedToken
(id.Id, text.Slice(pieceSpan.Index, pieceSpan.Length).ToString(), (pieceSpan.Index, pieceSpan.Length)));
Model\TiktokenTokenizer.cs (3)
306
tokens.Add(new
EncodedToken
(
318
tokens.Add(new
EncodedToken
(mappedId.Id, mappedId.Token, (offset, mappedId.Token.Length)));
347
tokens.Add(new
EncodedToken
(
Model\Word.cs (1)
299
tokens.Add(new
EncodedToken
(_symbols[i].C, vocabReverse[_symbols[i].C], (index + offset, _symbols[i].Len)));
Microsoft.ML.Tokenizers.Tests (1)
TokenizerTests.cs (1)
115
tokens.Add(new
EncodedToken
(c - 'a', c.ToString(), (count, 1)));
131 references to EncodedToken
Microsoft.ML.Tokenizers (99)
Model\BPETokenizer.cs (7)
263
/// Encodes input text to a list of <see cref="
EncodedToken
" />s.
268
protected override EncodeResults<
EncodedToken
> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
272
return new EncodeResults<
EncodedToken
> { Tokens = [], NormalizedText = null, CharsConsumed = 0 };
286
List<
EncodedToken
> tokens = new();
301
return new EncodeResults<
EncodedToken
> { Tokens = tokens, NormalizedText = normalizedString, CharsConsumed = charsConsumed };
935
internal void WordToTokens(ref Word word, List<
EncodedToken
> tokens, int offset) => word.ToTokens(VocabReverse, tokens, offset);
937
internal void EncodeWithCache(ReadOnlySpan<char> text, List<
EncodedToken
> tokens, int offset, ref PriorityQueue<Merge>? priorityQueue)
Model\CodeGenTokenizer.cs (31)
31
private readonly StringSpanOrdinalKeyCache<List<
EncodedToken
>> _cache;
127
_cache = new StringSpanOrdinalKeyCache<List<
EncodedToken
>>();
280
/// Encodes input text to a list of <see cref="
EncodedToken
" />s.
285
protected override EncodeResults<
EncodedToken
> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
299
public IReadOnlyList<
EncodedToken
> EncodeToTokens(string text, bool addPrefixSpace, bool addBeginningOfSentence, bool addEndOfSentence, out string? normalizedString, bool considerPreTokenization = true, bool considerNormalization = true)
301
EncodeResults<
EncodedToken
> result = EncodeToTokens(text, ReadOnlySpan<char>.Empty, addPrefixSpace, addBeginningOfSentence, addEndOfSentence, considerPreTokenization, considerNormalization);
317
public IReadOnlyList<
EncodedToken
> EncodeToTokens(ReadOnlySpan<char> text, bool addPrefixSpace, bool addBeginningOfSentence, bool addEndOfSentence, out string? normalizedString, bool considerPreTokenization = true, bool considerNormalization = true)
319
EncodeResults<
EncodedToken
> result = EncodeToTokens(null, text, addPrefixSpace, addBeginningOfSentence, addEndOfSentence, considerPreTokenization, considerNormalization);
324
private EncodeResults<
EncodedToken
> EncodeToTokens(string? text, scoped ReadOnlySpan<char> textSpan, bool addPrefixSpace, bool addBos, bool addEos, bool considerPreTokenization, bool considerNormalization)
328
return new EncodeResults<
EncodedToken
> { Tokens = [], NormalizedText = null, CharsConsumed = 0 };
376
List<
EncodedToken
> tokens = new();
401
return new EncodeResults<
EncodedToken
> { Tokens = tokens, NormalizedText = normalizedString, CharsConsumed = textSpanToEncode.Length };
421
private void EncodeInternal(string? text, scoped ReadOnlySpan<char> textSpan, List<
EncodedToken
> tokens, bool addPrefixSpace, int offset, PriorityQueue<SymbolPair> agenda)
434
if (_cache.TryGetValue(textSpan, out List<
EncodedToken
>? hit))
459
List<
EncodedToken
> result = EncodeToTokens(token.Slice(0, encodedLength), mapping.Slice(0, encodedLength), textSpan, agenda);
1008
private int EncodeToIdsResult(List<
EncodedToken
> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int charsConsumed)
1016
foreach (
var
t in tokens)
1064
private int EncodeToIdsFromEndResult(List<
EncodedToken
> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int textIndex)
1072
foreach (
var
t in tokens)
1118
if (_cache.TryGetValue(textSpan, out List<
EncodedToken
>? hit))
1142
List<
EncodedToken
> result = EncodeToTokens(token.Slice(0, encodedLength), mapping.Slice(0, encodedLength), textSpan, agenda);
1182
if (_cache.TryGetValue(textSpan, out List<
EncodedToken
>? hit))
1206
List<
EncodedToken
> result = EncodeToTokens(token.Slice(0, encodedLength), mapping.Slice(0, encodedLength), textSpan, agenda);
1587
private static void AppendTokenWithOffsetAdjusting(IReadOnlyList<
EncodedToken
> tokensToAdd, List<
EncodedToken
> tokens, int offset, bool addPrefixSpace)
1603
foreach (
EncodedToken
t in tokensToAdd)
1613
private List<
EncodedToken
> EncodeToTokens(Span<char> text, Span<int> mapping, ReadOnlySpan<char> originalText, PriorityQueue<SymbolPair> agenda)
1625
return new List<
EncodedToken
> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)].Id, tokenValue, (mapping[0], 1)) };
1674
List<
EncodedToken
> result = new List<
EncodedToken
>(text.Length);
1695
static
EncodedToken
GetToken(int id, string token, int index, int length, ReadOnlySpan<char> originalText, Span<int> mapping)
Model\EnglishRobertaTokenizer.cs (27)
27
private readonly StringSpanOrdinalKeyCache<List<
EncodedToken
>> _cache;
157
_cache = new StringSpanOrdinalKeyCache<List<
EncodedToken
>>();
298
/// Encodes input text to a list of <see cref="
EncodedToken
" />s.
303
protected override EncodeResults<
EncodedToken
> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
307
return new EncodeResults<
EncodedToken
> { Tokens = [], NormalizedText = null, CharsConsumed = 0 };
323
List<
EncodedToken
> tokens = new();
326
foreach (
EncodedToken
t in EncodeInternal(textSpanToEncode.Slice(split.Offset, split.Length)))
332
return new EncodeResults<
EncodedToken
> { Tokens = tokens, NormalizedText = normalizedString, CharsConsumed = charsConsumed };
336
return new EncodeResults<
EncodedToken
> { Tokens = EncodeInternal(textSpanToEncode), NormalizedText = normalizedString, CharsConsumed = charsConsumed };
345
private IReadOnlyList<
EncodedToken
> EncodeInternal(ReadOnlySpan<char> text)
375
if (_cache.TryGetValue(text, out List<
EncodedToken
>? hit))
382
List<
EncodedToken
> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping);
577
private int EncodeToIdsResult(List<
EncodedToken
> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int charsConsumed)
585
foreach (
var
t in tokens)
614
private int EncodeToIdsFromEndResult(List<
EncodedToken
> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int textIndex)
622
foreach (
var
t in tokens)
659
if (_cache.TryGetValue(text, out List<
EncodedToken
>? hit))
688
List<
EncodedToken
> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping);
704
if (_cache.TryGetValue(text, out List<
EncodedToken
>? hit))
733
List<
EncodedToken
> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping);
900
private IReadOnlyList<
EncodedToken
> ModifyTokenListOffsets(IReadOnlyList<
EncodedToken
> tokens, Span<int> indexMapping)
910
List<
EncodedToken
> list = new List<
EncodedToken
>(tokens.Count);
937
private List<
EncodedToken
> EncodeToTokens(Span<char> token, Span<int> indexMapping)
950
return new List<
EncodedToken
> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)], tokenValue, (indexMapping[0], 1)) };
1034
var tokens = new List<
EncodedToken
>(word.Count);
Model\SentencePieceBpeTokenizer.cs (13)
190
/// Encodes input text to a list of <see cref="
EncodedToken
" />s.
195
protected override EncodeResults<
EncodedToken
> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
197
return new EncodeResults<
EncodedToken
>
206
/// Encodes input text a list of <see cref="
EncodedToken
" />s with string value of the token, id, and offset.
214
/// <returns>The tokenization result includes a list of <see cref="
EncodedToken
" />s with string value of the token, id, and offset.</returns>
215
public IReadOnlyList<
EncodedToken
> EncodeToTokens(string text, out string? normalizedString, bool addBeginningOfSentence, bool addEndOfSentence, bool considerPreTokenization = true, bool considerNormalization = true)
219
/// Encodes input text a list of <see cref="
EncodedToken
" />s with string value of the token, id, and offset.
227
/// <returns>The tokenization result includes a list of <see cref="
EncodedToken
" />s with string value of the token, id, and offset.</returns>
228
public IReadOnlyList<
EncodedToken
> EncodeToTokens(ReadOnlySpan<char> text, out string? normalizedString, bool addBeginningOfSentence, bool addEndOfSentence, bool considerPreTokenization = true, bool considerNormalization = true)
231
private IReadOnlyList<
EncodedToken
> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, out string? normalizedString, bool addBeginningOfSentence, bool addEndOfSentence, bool considerPreTokenization, bool considerNormalization)
255
List<
EncodedToken
>? tokens = new();
269
private void EncodeWithSpecialTokens(ReadOnlySpan<char> text, bool addBeginOfSentence, bool addEndOfSentence, List<
EncodedToken
> tokens)
314
private void EncodeInternal(ReadOnlySpan<char> text, bool addBeginOfSentence, bool addEndOfSentence, List<
EncodedToken
> tokens)
Model\TiktokenTokenizer.cs (6)
252
/// Encodes input text to a list of <see cref="
EncodedToken
" />s.
257
protected override EncodeResults<
EncodedToken
> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
261
return new EncodeResults<
EncodedToken
> { NormalizedText = null, Tokens = [], CharsConsumed = 0 };
275
List<
EncodedToken
> tokens = new();
289
return new EncodeResults<
EncodedToken
> { NormalizedText = normalizedString, Tokens = tokens, CharsConsumed = charsConsumed };
298
private void EncodeToTokens(ReadOnlySpan<char> text, List<
EncodedToken
> tokens, int offset)
Model\Word.cs (1)
292
public void ToTokens(SortedDictionary<int, string> vocabReverse, List<
EncodedToken
> tokens, int offset)
Tokenizer.cs (14)
44
EncodeResults<
EncodedToken
> results = EncodeToTokens(text, textSpan, settings);
133
/// Encodes input text to a list of <see cref="
EncodedToken
" />s.
138
protected abstract EncodeResults<
EncodedToken
> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings);
141
/// Encodes input text to a list of <see cref="
EncodedToken
" />s.
147
/// <returns>The list of encoded <see cref="
EncodedToken
" />s.</returns>
148
public IReadOnlyList<
EncodedToken
> EncodeToTokens(string text, out string? normalizedString, bool considerPreTokenization = true, bool considerNormalization = true)
150
EncodeResults<
EncodedToken
> result = EncodeToTokens(text, text.AsSpan(), new EncodeSettings { ConsiderPreTokenization = considerPreTokenization, ConsiderNormalization = considerNormalization });
157
/// Encodes input text to a list of <see cref="
EncodedToken
" />s.
163
/// <returns>The list of encoded <see cref="
EncodedToken
" />s.</returns>
164
public IReadOnlyList<
EncodedToken
> EncodeToTokens(ReadOnlySpan<char> text, out string? normalizedString, bool considerPreTokenization = true, bool considerNormalization = true)
166
EncodeResults<
EncodedToken
> result = EncodeToTokens(null, text, new EncodeSettings { ConsiderPreTokenization = considerPreTokenization, ConsiderNormalization = considerNormalization });
235
EncodeResults<
EncodedToken
> tokens = EncodeToTokens(text, textSpan, settings);
243
var
token = tokens.Tokens[tokenCount - 1];
253
var
token = tokens.Tokens[tokens.Tokens.Count - tokenCount];
Microsoft.ML.Tokenizers.Tests (30)
BpeTests.cs (4)
257
IReadOnlyList<
EncodedToken
> encoding = tokenizer.EncodeToTokens(sentence, out _);
376
IReadOnlyList<
EncodedToken
> encoding = tokenizer.EncodeToTokens(text, out _);
429
IReadOnlyList<
EncodedToken
> encoding = tokenizer.EncodeToTokens(text, out _);
430
IReadOnlyList<
EncodedToken
> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
CodeGenTests.cs (4)
231
private void ValidateEncoding(IReadOnlyList<
EncodedToken
> encoding, bool addPrefixSpace, string[] expectedTokens, (int Index, int Length)[] expectedOffsets, int[] expectedIds,
252
IReadOnlyList<
EncodedToken
> encoding = tokenizer.EncodeToTokens(text, out _);
347
IReadOnlyList<
EncodedToken
> encoding = tokenizer.EncodeToTokens(text, out _);
549
IReadOnlyList<
EncodedToken
> encoding = codeGenTokenizer.EncodeToTokens(text, out _);
EnglishRobertaTests.cs (3)
181
IReadOnlyList<
EncodedToken
> encoding = tokenizer.EncodeToTokens(text, out _);
182
IReadOnlyList<
EncodedToken
> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
244
IReadOnlyList<
EncodedToken
> encoding;
LlamaTests.cs (5)
244
IReadOnlyList<
EncodedToken
> result = llamaTokenizer.EncodeToTokens(input, out _);
266
IReadOnlyList<
EncodedToken
> bpeTokens = bpe.EncodeToTokens(normalizedInput.AsSpan(), out _, addBeginningOfSentence: false, addEndOfSentence: false, considerNormalization: false);
500
IReadOnlyList<
EncodedToken
> encoding = tokenizer.EncodeToTokens(text, out _);
501
IReadOnlyList<
EncodedToken
> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
657
IReadOnlyList<
EncodedToken
> encodedTokens;
NormalizerTests.cs (1)
65
IReadOnlyList<
EncodedToken
> tokens = tokenizer.EncodeToTokens(text, out string? normalizedString);
PreTokenizerTests.cs (1)
59
IReadOnlyList<
EncodedToken
> encoding = tokenizer.EncodeToTokens(text, out _);
TitokenTests.cs (9)
142
IReadOnlyList<
EncodedToken
> result = tokenizer.EncodeToTokens(text, out string? normalizedString);
195
IReadOnlyList<
EncodedToken
> result = GPT4.EncodeToTokens(text, out string? normalizedString);
238
IReadOnlyList<
EncodedToken
> result = GPT4.EncodeToTokens(text, out string? normalizedString);
257
IReadOnlyList<
EncodedToken
> result = GPT4.EncodeToTokens(text, out string? normalizedString);
273
IReadOnlyList<
EncodedToken
> result = GPT4.EncodeToTokens(text, out string? normalizedString);
307
IReadOnlyList<
EncodedToken
> result = GPT4o.EncodeToTokens(text, out string? normalizedString);
564
IReadOnlyList<
EncodedToken
> encoding = tokenizer.EncodeToTokens(text, out _);
565
IReadOnlyList<
EncodedToken
> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
679
IReadOnlyList<
EncodedToken
> result = GPT4.EncodeToTokens(text, out _);
TokenizerTests.cs (3)
105
protected override EncodeResults<
EncodedToken
> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
107
var tokens = new List<
EncodedToken
>();
119
return new EncodeResults<
EncodedToken
> { Tokens = tokens, CharsConsumed = count };
Microsoft.ML.TorchSharp (2)
NasBert\NerTrainer.cs (2)
170
IReadOnlyList<
EncodedToken
> encoding = Tokenizer.EncodeToTokens(sentence, out string normalizedString);
380
IReadOnlyList<
EncodedToken
> encoding = tokenizer.EncodeToTokens(sentence, out string normalizedString);