136 instantiations of EncodedToken
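Nearly every call site below follows the same three-argument shape: a vocabulary id, the token's string value, and a System.Range locating the token in the (possibly normalized) input. A minimal sketch of that pattern, assuming only the constructor shape visible in these call sites:

using System;
using System.Collections.Generic;
using Microsoft.ML.Tokenizers;

List<EncodedToken> tokens = new();

// id 8, token text "hello", covering characters [0, 5) of the input.
tokens.Add(new EncodedToken(8, "hello", new Range(0, 5)));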
Microsoft.ML.Tokenizers (39)
Model\BPETokenizer.cs (1)
985: tokens.Add(new EncodedToken(value.specialTokenId, value.specialToken, new Range(offset, offset + text.Length)));
Model\CodeGenTokenizer.cs (8)
382: tokens.Add(new EncodedToken(BeginningOfSentenceId.Value, BeginningOfSentenceToken!, new Range(0, 0)));
402: tokens.Add(new EncodedToken(EndOfSentenceId.Value, EndOfSentenceToken!, new Range(index, index)));
435: tokens.Add(new EncodedToken(value.specialTokenId, value.specialToken, new Range(index, index + ((addPrefixSpace && offset == 0) ? textSpan.Length - 1 : textSpan.Length))));
1599: tokens.Add(new EncodedToken(tokensToAdd[0].Id, tokensToAdd[0].Value, new Range(r.s, r.e)));
1603: tokens.Add(new EncodedToken(tokensToAdd[i].Id, tokensToAdd[i].Value, new Range(tokensToAdd[i].Offset.Start.Value + offset - 1, tokensToAdd[i].Offset.End.Value + offset - 1)));
1611: tokens.Add(new EncodedToken(t.Id, t.Value, new Range(t.Offset.Start.Value + offset, t.Offset.End.Value + offset)));
1631: return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)].Id, tokenValue, new Range(mapping[0], mapping[0] + 1)) };
1704: return new EncodedToken(id, token, new Range(mapping[index], endIndex));
Model\EnglishRobertaTokenizer.cs (4)
339: tokens.Add(new EncodedToken(t.Id, t.Value, new Range(split.Offset + t.Offset.Start.Value, split.Offset + t.Offset.End.Value)));
929: list.Add(new EncodedToken(tokens[j].Id, tokens[j].Value, new Range(indexMapping[index], indexMapping[index] + tokens[j].Value.Length)));
961: return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)], tokenValue, new Range(indexMapping[0], indexMapping[0] + 1)) };
1050: tokens.Add(new EncodedToken(_vocab[new StringSpanOrdinalKey(w)], w, new Range(indexMapping[index], indexMapping[index] + w.Length)));
Model\SentencePieceBpeModel.cs (9)
157: tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0)));
171: tokens.Add(new EncodedToken(id, SpecialTokensReverse![id], new Range(Offset, Offset + Length)));
184: tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(text.Length, text.Length)));
204: tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0)));
234: tokens.Add(new EncodedToken(
249: tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(text.Length, text.Length)));
266: tokens.Add(new EncodedToken(id, token, new Range(index + i, index + i + 1)));
290: tokens.Add(new EncodedToken(id, token, new Range(index + i, index + i + length)));
318: tokens.Add(new EncodedToken(id.Id, text.Slice(pieceSpan.Index, pieceSpan.Length).ToString(), new Range(pieceSpan.Index, pieceSpan.Index + pieceSpan.Length)));
Model\SentencePieceUnigramModel.cs (10)
271: tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0)));
287: tokens.Add(new EncodedToken(id, SpecialTokensReverse![id], new Range(progressOffset, progressOffset + Length)));
303: tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(progressOffset, progressOffset)));
321: tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0)));
331: tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(progressOffset, progressOffset)));
415: tokens.Add(new EncodedToken(node.Id, stringToken, new Range(0, tokenLength))); // we will update the range later.
432: tokens[start] = new EncodedToken(tokens[start].Id, tokens[start].Value, new Range(tokensOffset, tokensOffset + tokenLength));
443: tokens[start] = new EncodedToken(tokens[start].Id, tokens[start].Value, new Range(tokensOffset, tokensOffset + tokenLength));
484: tokens.Insert(insertionStartPosition++, new EncodedToken(id, _vocabReverse[id].Piece, new Range(offsetStart, offsetStart + charLength)));
489: tokens.Insert(insertionStartPosition++, new EncodedToken(id, _vocabReverse[id].Piece, new Range(offsetStart + charLength, offsetStart + charLength)));
Model\TiktokenTokenizer.cs (3)
307: tokens.Add(new EncodedToken(
319: tokens.Add(new EncodedToken(mappedId.Id, mappedId.Token, new Range(offset, offset + mappedId.Token.Length)));
348: tokens.Add(new EncodedToken(
Model\Word.cs (1)
299: tokens.Add(new EncodedToken(_symbols[i].C, vocabReverse[_symbols[i].C], new Range(index + offset, index + offset + _symbols[i].Len)));
Model\WordPieceTokenizer.cs (3)
319: tokens.Add(new EncodedToken(UnknownTokenId, UnknownToken, new Range(offset, offset + text.Length)));
352: curToken = new EncodedToken(id, _vocabReverse[id], new Range(offset + start, offset + end));
373: tokens.Add(new EncodedToken(UnknownTokenId, UnknownToken, new Range(offset, offset + textLength)));
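The Range stored at each of these call sites is what lets a caller slice the matching substring back out of the encoded text. A hedged round-trip sketch, assuming token.Offset is the System.Range these constructors populate:

string text = "hello, how are you?";
EncodedToken token = new(8, "hello", new Range(0, 5));

// C# range indexing on string; yields "hello".
string slice = text[token.Offset];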
Microsoft.ML.Tokenizers.Tests (97)
BertTokenizerTests.cs (66)
60: new EncodedToken(8, "hello", new Range(0, 5)),
61: new EncodedToken(6, ",", new Range(5, 6)),
62: new EncodedToken(10, "how", new Range(7, 10)),
63: new EncodedToken(11, "are", new Range(11, 14)),
64: new EncodedToken(12, "you", new Range(15, 18)),
65: new EncodedToken(13, "[SPECIAL]", new Range(19, 28)),
66: new EncodedToken(7, "?", new Range(28, 29))
80: new EncodedToken(2, "[CLS]", new Range(0, 5)),
81: new EncodedToken(8, "hello", new Range(6, 11)),
82: new EncodedToken(6, ",", new Range(11, 12)),
83: new EncodedToken(10, "how", new Range(13, 16)),
84: new EncodedToken(11, "are", new Range(17, 20)),
85: new EncodedToken(12, "you", new Range(21, 24)),
86: new EncodedToken(13, "[SPECIAL]", new Range(25, 34)),
87: new EncodedToken(7, "?", new Range(34, 35)),
88: new EncodedToken(3, "[SEP]", new Range(36, 41))
133: new EncodedToken(8, "hello", new Range(0, 5)),
134: new EncodedToken(6, ",", new Range(5, 6)),
135: new EncodedToken(10, "how", new Range(7, 10)),
136: new EncodedToken(11, "are", new Range(11, 14)),
137: new EncodedToken(12, "you", new Range(15, 18)),
138: new EncodedToken(7, "?", new Range(18, 19))
152: new EncodedToken(2, "[CLS]", new Range(0, 5)),
153: new EncodedToken(8, "hello", new Range(6, 11)),
154: new EncodedToken(6, ",", new Range(11, 12)),
155: new EncodedToken(10, "how", new Range(13, 16)),
156: new EncodedToken(11, "are", new Range(17, 20)),
157: new EncodedToken(12, "you", new Range(21, 24)),
158: new EncodedToken(7, "?", new Range(24, 25)),
159: new EncodedToken(3, "[SEP]", new Range(26, 31))
201: new EncodedToken(1, "[UNK]", new Range(0, 5)),
202: new EncodedToken(6, ",", new Range(5, 6)),
203: new EncodedToken(1, "[UNK]", new Range(7, 10)),
204: new EncodedToken(11, "are", new Range(11, 14)),
205: new EncodedToken(12, "you", new Range(15, 18)),
206: new EncodedToken(7, "?", new Range(18, 19))
241: new EncodedToken(10, "café", new Range(0, 4)),
242: new EncodedToken(12, "über", new Range(5, 9)),
243: new EncodedToken(15, "ångström", new Range(10, 18)),
244: new EncodedToken(18, "résumé", new Range(19, 25)),
245: new EncodedToken(5, "!", new Range(25, 26)),
256: new EncodedToken(8, "Café", new Range(0, 4)),
257: new EncodedToken(11, "Über", new Range(5, 9)),
258: new EncodedToken(14, "Ångström", new Range(10, 18)),
259: new EncodedToken(17, "Résumé", new Range(19, 25)),
260: new EncodedToken(5, "!", new Range(25, 26)),
272: new EncodedToken(9, "cafe", new Range(0, 4)),
273: new EncodedToken(13, "uber", new Range(5, 9)),
274: new EncodedToken(16, "angstrom", new Range(10, 18)),
275: new EncodedToken(19, "resume", new Range(19, 25)),
276: new EncodedToken(5, "!", new Range(25, 26)),
286: new EncodedToken(20, "Cafe", new Range(0, 4)),
287: new EncodedToken(21, "Uber", new Range(5, 9)),
288: new EncodedToken(22, "Angstrom", new Range(10, 18)),
289: new EncodedToken(23, "Resume", new Range(19, 25)),
290: new EncodedToken(5, "!", new Range(25, 26)),
317: new EncodedToken(9, "叟", new Range(1, 2)),
318: new EncodedToken(11, "驷", new Range(4, 5)),
319: new EncodedToken(10, "叢", new Range(8, 9)),
320: new EncodedToken(12, "驸", new Range(11, 12)),
321: new EncodedToken(5, "!", new Range(13, 14))
335: new EncodedToken(9, "叟", new Range(0, 1)),
336: new EncodedToken(6, "##驷", new Range(1, 2)),
337: new EncodedToken(10, "叢", new Range(3, 4)),
338: new EncodedToken(7, "##驸", new Range(4, 5)),
339: new EncodedToken(5, "!", new Range(5, 6))
BpeTests.cs (13)
510: new EncodedToken(15496, "Hello", new Range(0, 5)),
511: new EncodedToken(11, ",", new Range(5, 6)),
512: new EncodedToken(88, "y", new Range(7, 8)),
513: new EncodedToken(6, "'", new Range(8, 9)),
514: new EncodedToken(439, "all", new Range(9, 12)),
515: new EncodedToken(0, "!", new Range(12, 13)),
516: new EncodedToken(9, "<issue_comment>", new Range(14, 29)),
517: new EncodedToken(2437, "How", new Range(29, 32)),
518: new EncodedToken(533, "are", new Range(33, 36)),
519: new EncodedToken(5832, "you", new Range(37, 40)),
520: new EncodedToken(50256, "<|endoftext|>", new Range(41, 43)),
521: new EncodedToken(30, "?", new Range(44, 45)),
522: new EncodedToken(0, "<|endoftext|>", new Range(45, 58))
TokenizerTests.cs (1)
115: tokens.Add(new EncodedToken(c - 'a', c.ToString(), new Range(count, count + 1)));
WordPieceTests.cs (17)
78: new EncodedToken(7, "un", new Range(0, 2)),
79: new EncodedToken(4, "##want", new Range(2, 6)),
80: new EncodedToken(5, "##ed", new Range(6, 8)),
81: new EncodedToken(8, "runn", new Range(9, 13)),
82: new EncodedToken(9, "##ing", new Range(13, 16))
159: new EncodedToken(0, "[UNK]", new Range(0, 9)),
160: new EncodedToken(8, "runn", new Range(10, 14)),
161: new EncodedToken(9, "##ing", new Range(14, 17))
197: new EncodedToken(0, "[UNK]", new Range(0, 5)),
198: new EncodedToken(7, "un", new Range(6, 8)),
199: new EncodedToken(4, "##want", new Range(8, 12)),
200: new EncodedToken(5, "##ed", new Range(12, 14)),
201: new EncodedToken(2, "[SEP]", new Range(15, 20)),
202: new EncodedToken(1, "[CLS]", new Range(20, 25)),
203: new EncodedToken(8, "runn", new Range(26, 30)),
204: new EncodedToken(9, "##ing", new Range(30, 33)),
205: new EncodedToken(1, "[CLS]", new Range(34, 39)),
157 references to EncodedToken
Microsoft.ML.Tokenizers (117)
EncodedToken.cs (2)
13: public readonly struct EncodedToken : IEquatable<EncodedToken>
44: public bool Equals(EncodedToken other) => Id == other.Id && Value == other.Value && Offset.Equals(other.Offset);
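Since the struct is readonly and implements IEquatable<EncodedToken> over Id, Value, and Offset (line 44 above), two independently constructed tokens compare equal exactly when all three members match. A small illustration using only the members shown in this section:

var a = new EncodedToken(8, "hello", new Range(0, 5));
var b = new EncodedToken(8, "hello", new Range(0, 5));
var c = new EncodedToken(8, "hello", new Range(6, 11));

Console.WriteLine(a.Equals(b)); // True: same Id, Value, and Offset.
Console.WriteLine(a.Equals(c)); // False: the Offset ranges differ.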
Model\BPETokenizer.cs (7)
308: /// Encodes input text to a list of <see cref="EncodedToken" />s.
313: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
317: return new EncodeResults<EncodedToken> { Tokens = [], NormalizedText = null, CharsConsumed = 0 };
331: List<EncodedToken> tokens = new();
346: return new EncodeResults<EncodedToken> { Tokens = tokens, NormalizedText = normalizedText, CharsConsumed = charsConsumed };
979: internal void WordToTokens(ref Word word, List<EncodedToken> tokens, int offset) => word.ToTokens(VocabReverse, tokens, offset);
981: internal void EncodeWithCache(ReadOnlySpan<char> text, List<EncodedToken> tokens, int offset, ref PriorityQueue<Merge>? priorityQueue)
Model\CodeGenTokenizer.cs (31)
34: private readonly StringSpanOrdinalKeyCache<List<EncodedToken>> _cache;
130: _cache = new StringSpanOrdinalKeyCache<List<EncodedToken>>();
283: /// Encodes input text to a list of <see cref="EncodedToken" />s.
288: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
302: public IReadOnlyList<EncodedToken> EncodeToTokens(string text, bool addPrefixSpace, bool addBeginningOfSentence, bool addEndOfSentence, out string? normalizedText, bool considerPreTokenization = true, bool considerNormalization = true)
304: EncodeResults<EncodedToken> result = EncodeToTokens(text, ReadOnlySpan<char>.Empty, addPrefixSpace, addBeginningOfSentence, addEndOfSentence, considerPreTokenization, considerNormalization);
320: public IReadOnlyList<EncodedToken> EncodeToTokens(ReadOnlySpan<char> text, bool addPrefixSpace, bool addBeginningOfSentence, bool addEndOfSentence, out string? normalizedText, bool considerPreTokenization = true, bool considerNormalization = true)
322: EncodeResults<EncodedToken> result = EncodeToTokens(null, text, addPrefixSpace, addBeginningOfSentence, addEndOfSentence, considerPreTokenization, considerNormalization);
327: private EncodeResults<EncodedToken> EncodeToTokens(string? text, scoped ReadOnlySpan<char> textSpan, bool addPrefixSpace, bool addBos, bool addEos, bool considerPreTokenization, bool considerNormalization)
331: return new EncodeResults<EncodedToken> { Tokens = [], NormalizedText = null, CharsConsumed = 0 };
379: List<EncodedToken> tokens = new();
405: return new EncodeResults<EncodedToken> { Tokens = tokens, NormalizedText = normalizedText, CharsConsumed = textSpanToEncode.Length };
425: private void EncodeInternal(string? text, scoped ReadOnlySpan<char> textSpan, List<EncodedToken> tokens, bool addPrefixSpace, int offset, PriorityQueue<SymbolPair> agenda)
439: if (_cache.TryGetValue(textSpan, out List<EncodedToken>? hit))
464: List<EncodedToken> result = EncodeToTokens(token.Slice(0, encodedLength), mapping.Slice(0, encodedLength), textSpan, agenda);
1013: private int EncodeToIdsResult(List<EncodedToken> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int charsConsumed)
1021: foreach (var t in tokens)
1069: private int EncodeToIdsFromEndResult(List<EncodedToken> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int textIndex)
1077: foreach (var t in tokens)
1123: if (_cache.TryGetValue(textSpan, out List<EncodedToken>? hit))
1147: List<EncodedToken> result = EncodeToTokens(token.Slice(0, encodedLength), mapping.Slice(0, encodedLength), textSpan, agenda);
1187: if (_cache.TryGetValue(textSpan, out List<EncodedToken>? hit))
1211: List<EncodedToken> result = EncodeToTokens(token.Slice(0, encodedLength), mapping.Slice(0, encodedLength), textSpan, agenda);
1592: private static void AppendTokenWithOffsetAdjusting(IReadOnlyList<EncodedToken> tokensToAdd, List<EncodedToken> tokens, int offset, bool addPrefixSpace)
1609: foreach (EncodedToken t in tokensToAdd)
1619: private List<EncodedToken> EncodeToTokens(Span<char> text, Span<int> mapping, ReadOnlySpan<char> originalText, PriorityQueue<SymbolPair> agenda)
1631: return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)].Id, tokenValue, new Range(mapping[0], mapping[0] + 1)) };
1680: List<EncodedToken> result = new List<EncodedToken>(text.Length);
1701: static EncodedToken GetToken(int id, string token, int index, int length, ReadOnlySpan<char> originalText, Span<int> mapping)
Model\EnglishRobertaTokenizer.cs (27)
27: private readonly StringSpanOrdinalKeyCache<List<EncodedToken>> _cache;
169: _cache = new StringSpanOrdinalKeyCache<List<EncodedToken>>();
309: /// Encodes input text to a list of <see cref="EncodedToken" />s.
314: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
318: return new EncodeResults<EncodedToken> { Tokens = [], NormalizedText = null, CharsConsumed = 0 };
334: List<EncodedToken> tokens = new();
337: foreach (EncodedToken t in EncodeInternal(textSpanToEncode.Slice(split.Offset, split.Length)))
343: return new EncodeResults<EncodedToken> { Tokens = tokens, NormalizedText = normalizedText, CharsConsumed = charsConsumed };
347: return new EncodeResults<EncodedToken> { Tokens = EncodeInternal(textSpanToEncode), NormalizedText = normalizedText, CharsConsumed = charsConsumed };
356: private IReadOnlyList<EncodedToken> EncodeInternal(ReadOnlySpan<char> text)
386: if (_cache.TryGetValue(text, out List<EncodedToken>? hit))
393: List<EncodedToken> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping);
588: private int EncodeToIdsResult(List<EncodedToken> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int charsConsumed)
596: foreach (var t in tokens)
625: private int EncodeToIdsFromEndResult(List<EncodedToken> tokens, IList<int>? accumulatedIds, int maxTokens, int fullTextLength, out int textIndex)
633: foreach (var t in tokens)
670: if (_cache.TryGetValue(text, out List<EncodedToken>? hit))
699: List<EncodedToken> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping);
715: if (_cache.TryGetValue(text, out List<EncodedToken>? hit))
744: List<EncodedToken> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping);
911: private IReadOnlyList<EncodedToken> ModifyTokenListOffsets(IReadOnlyList<EncodedToken> tokens, Span<int> indexMapping)
921: List<EncodedToken> list = new List<EncodedToken>(tokens.Count);
948: private List<EncodedToken> EncodeToTokens(Span<char> token, Span<int> indexMapping)
961: return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)], tokenValue, new Range(indexMapping[0], indexMapping[0] + 1)) };
1045: var tokens = new List<EncodedToken>(word.Count);
Model\SentencePieceBaseModel.cs (1)
165: public abstract IReadOnlyList<EncodedToken> EncodeToTokens(
Model\SentencePieceBpeModel.cs (4)
113: public override IReadOnlyList<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, out string? normalizedText, bool addBeginningOfSentence, bool addEndOfSentence, bool considerNormalization)
137: List<EncodedToken> tokens = new();
151: private void EncodeWithSpecialTokens(ReadOnlySpan<char> text, bool addBeginOfSentence, bool addEndOfSentence, List<EncodedToken> tokens)
196: private void EncodeInternal(ReadOnlySpan<char> text, bool addBeginOfSentence, bool addEndOfSentence, List<EncodedToken> tokens)
Model\SentencePieceTokenizer.cs (9)
124: /// Encodes input text to a list of <see cref="EncodedToken" />s.
129: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
131: return new EncodeResults<EncodedToken>
140: /// Encodes input text a list of <see cref="EncodedToken" />s with string value of the token, id, and offset.
148: /// <returns>The tokenization result includes a list of <see cref="EncodedToken" />s with string value of the token, id, and offset.</returns>
149: public IReadOnlyList<EncodedToken> EncodeToTokens(string text, out string? normalizedText, bool addBeginningOfSentence, bool addEndOfSentence, bool considerPreTokenization = true, bool considerNormalization = true)
153: /// Encodes input text a list of <see cref="EncodedToken" />s with string value of the token, id, and offset.
161: /// <returns>The tokenization result includes a list of <see cref="EncodedToken" />s with string value of the token, id, and offset.</returns>
162: public IReadOnlyList<EncodedToken> EncodeToTokens(ReadOnlySpan<char> text, out string? normalizedText, bool addBeginningOfSentence, bool addEndOfSentence, bool considerPreTokenization = true, bool considerNormalization = true)
Model\SentencePieceUnigramModel.cs (8)
150: public override IReadOnlyList<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, out string? normalizedText, bool addBeginningOfSentence, bool addEndOfSentence, bool considerNormalization)
156: return Array.Empty<EncodedToken>();
159: List<EncodedToken> tokens = new();
262: List<EncodedToken> tokens,
314: List<EncodedToken> tokens,
380: List<EncodedToken> tokens,
426: EncodedToken temp = tokens[start];
462: private void FallbackToByteEncoding(ReadOnlySpan<char> normalizationSpan, List<EncodedToken> tokens, int insertionStartPosition)
Model\TiktokenTokenizer.cs (6)
253: /// Encodes input text to a list of <see cref="EncodedToken" />s.
258: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
262: return new EncodeResults<EncodedToken> { NormalizedText = null, Tokens = [], CharsConsumed = 0 };
276: List<EncodedToken> tokens = new();
290: return new EncodeResults<EncodedToken> { NormalizedText = normalizedText, Tokens = tokens, CharsConsumed = charsConsumed };
299: private void EncodeToTokens(ReadOnlySpan<char> text, List<EncodedToken> tokens, int offset)
Model\Word.cs (1)
292: public void ToTokens(SortedDictionary<int, string> vocabReverse, List<EncodedToken> tokens, int offset)
Model\WordPieceTokenizer.cs (7)
267: /// Encodes input text to a list of <see cref="EncodedToken" />s.
272: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
276: return new EncodeResults<EncodedToken> { NormalizedText = null, Tokens = [], CharsConsumed = 0 };
290: List<EncodedToken> tokens = new();
304: return new EncodeResults<EncodedToken> { NormalizedText = normalizedText, Tokens = tokens, CharsConsumed = charsConsumed };
313: private void EncodeToTokens(ReadOnlySpan<char> text, List<EncodedToken> tokens, int offset)
337: EncodedToken curToken = default;
Tokenizer.cs (14)
44: EncodeResults<EncodedToken> results = EncodeToTokens(text, textSpan, settings);
133: /// Encodes input text to a list of <see cref="EncodedToken" />s.
138: protected abstract EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings);
141: /// Encodes input text to a list of <see cref="EncodedToken" />s.
147: /// <returns>The list of encoded <see cref="EncodedToken" />s.</returns>
148: public IReadOnlyList<EncodedToken> EncodeToTokens(string text, out string? normalizedText, bool considerPreTokenization = true, bool considerNormalization = true)
150: EncodeResults<EncodedToken> result = EncodeToTokens(text, text.AsSpan(), new EncodeSettings { ConsiderPreTokenization = considerPreTokenization, ConsiderNormalization = considerNormalization });
157: /// Encodes input text to a list of <see cref="EncodedToken" />s.
163: /// <returns>The list of encoded <see cref="EncodedToken" />s.</returns>
164: public IReadOnlyList<EncodedToken> EncodeToTokens(ReadOnlySpan<char> text, out string? normalizedText, bool considerPreTokenization = true, bool considerNormalization = true)
166: EncodeResults<EncodedToken> result = EncodeToTokens(null, text, new EncodeSettings { ConsiderPreTokenization = considerPreTokenization, ConsiderNormalization = considerNormalization });
235: EncodeResults<EncodedToken> tokens = EncodeToTokens(text, textSpan, settings);
243: var token = tokens.Tokens[tokenCount - 1];
253: var token = tokens.Tokens[tokens.Tokens.Count - tokenCount];
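The public overloads at lines 148 and 164 above are the entry points most of the test call sites below go through. A hedged usage sketch against any concrete Tokenizer instance; the Inspect helper is illustrative, not part of the library:

static void Inspect(Tokenizer tokenizer, string text)
{
    // Returns the encoded tokens and, via the out parameter, the normalized
    // input (null when no normalizer ran); offsets index into that form.
    IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out string? normalizedText);

    string source = normalizedText ?? text;
    foreach (EncodedToken token in tokens)
    {
        Console.WriteLine($"{token.Id,6}  {token.Value,-12}  '{source[token.Offset]}'");
    }
}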
Microsoft.ML.Tokenizers.Tests (38)
BpeTests.cs (6)
257: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(sentence, out _);
376: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
429: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
430: IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
507: IReadOnlyList<EncodedToken> tokens = bpeTokenizer.EncodeToTokens(input, out _);
509: EncodedToken[] expectedTokens = [
CodeGenTests.cs (4)
231: private void ValidateEncoding(IReadOnlyList<EncodedToken> encoding, bool addPrefixSpace, string[] expectedTokens, (int Index, int Length)[] expectedOffsets, int[] expectedIds,
252: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
347: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
549: IReadOnlyList<EncodedToken> encoding = codeGenTokenizer.EncodeToTokens(text, out _);
EnglishRobertaTests.cs (3)
181: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
182: IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
244: IReadOnlyList<EncodedToken> encoding;
LlamaTests.cs (5)
244: IReadOnlyList<EncodedToken> result = llamaTokenizer.EncodeToTokens(input, out _);
266: IReadOnlyList<EncodedToken> bpeTokens = bpe.EncodeToTokens(normalizedInput.AsSpan(), out _, addBeginningOfSentence: false, addEndOfSentence: false, considerNormalization: false);
500: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
501: IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
657: IReadOnlyList<EncodedToken> encodedTokens;
NormalizerTests.cs (1)
65: IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out normalizedText);
PreTokenizerTests.cs (1)
66: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
TiktokenTests.cs (9)
146: IReadOnlyList<EncodedToken> result = tokenizer.EncodeToTokens(text, out string? normalizedText);
199: IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
242: IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
261: IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
277: IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
311: IReadOnlyList<EncodedToken> result = GPT4o.EncodeToTokens(text, out string? normalizedText);
582: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
583: IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);
697: IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out _);
TokenizerTests.cs (3)
105: protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings)
107: var tokens = new List<EncodedToken>();
119: return new EncodeResults<EncodedToken> { Tokens = tokens, CharsConsumed = count };
UnigramTests.cs (3)
326: IReadOnlyList<EncodedToken> tokens,
331: List<EncodedToken> writableTokens = tokens.ToList();
404: IReadOnlyList<EncodedToken> result = _unigramTokenizer.EncodeToTokens(inputText, out string? normalized);
WordPieceTests.cs (3)
62: IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens("", out _);
156: IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out _);
194: IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out _);
Microsoft.ML.TorchSharp (2)
NasBert\NerTrainer.cs (2)
170: IReadOnlyList<EncodedToken> encoding = Tokenizer.EncodeToTokens(sentence, out string normalizedText);
380: IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(sentence, out string normalizedText);