136 references to EncodedToken
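Every call site in this list constructs the token the same way, from a vocabulary id, the token's string value, and the character span it covers. For orientation, here is a minimal sketch of the shape those references imply; it is inferred from the call sites below, not copied from the library's source:

using System;

// Shape implied by the references below (an inference, not the library's
// actual definition): a vocabulary id, the token's surface string, and the
// character Range it covers in the (possibly normalized) input text.
public readonly struct EncodedToken
{
    public EncodedToken(int id, string value, Range offset)
    {
        Id = id;
        Value = value;
        Offset = offset;
    }

    public int Id { get; }        // e.g. 15496 for "Hello" in the BPE tests
    public string Value { get; }  // e.g. "##ing", "[CLS]", "<|endoftext|>"
    public Range Offset { get; }  // e.g. new Range(0, 5) for "hello" at the start
}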
Microsoft.ML.Tokenizers (39)
Model\BPETokenizer.cs (1)
985: tokens.Add(new EncodedToken(value.specialTokenId, value.specialToken, new Range(offset, offset + text.Length)));
Model\CodeGenTokenizer.cs (8)
382: tokens.Add(new EncodedToken(BeginningOfSentenceId.Value, BeginningOfSentenceToken!, new Range(0, 0)));
402: tokens.Add(new EncodedToken(EndOfSentenceId.Value, EndOfSentenceToken!, new Range(index, index)));
435: tokens.Add(new EncodedToken(value.specialTokenId, value.specialToken, new Range(index, index + ((addPrefixSpace && offset == 0) ? textSpan.Length - 1 : textSpan.Length))));
1599: tokens.Add(new EncodedToken(tokensToAdd[0].Id, tokensToAdd[0].Value, new Range(r.s, r.e)));
1603: tokens.Add(new EncodedToken(tokensToAdd[i].Id, tokensToAdd[i].Value, new Range(tokensToAdd[i].Offset.Start.Value + offset - 1, tokensToAdd[i].Offset.End.Value + offset - 1)));
1611: tokens.Add(new EncodedToken(t.Id, t.Value, new Range(t.Offset.Start.Value + offset, t.Offset.End.Value + offset)));
1631: return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)].Id, tokenValue, new Range(mapping[0], mapping[0] + 1)) };
1704: return new EncodedToken(id, token, new Range(mapping[index], endIndex));
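The CodeGenTokenizer references at 1599, 1603, and 1611 all follow one pattern: tokens encoded against a split of the input are re-based into the coordinates of the full text by shifting both ends of the Range. A hedged sketch of that shift, using a hypothetical helper name and assuming the EncodedToken shape sketched above:

// Hypothetical helper (not in the library) illustrating the re-basing seen at
// 1599/1603/1611: shift a token's split-local Range by the split's offset.
// The `offset - 1` variant at 1603 suggests the delta is reduced by one when
// an artificial prefix space was inserted before encoding.
static EncodedToken Rebase(EncodedToken t, int delta) =>
    new EncodedToken(
        t.Id,
        t.Value,
        new Range(t.Offset.Start.Value + delta, t.Offset.End.Value + delta));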
Model\EnglishRobertaTokenizer.cs (4)
339: tokens.Add(new EncodedToken(t.Id, t.Value, new Range(split.Offset + t.Offset.Start.Value, split.Offset + t.Offset.End.Value)));
929: list.Add(new EncodedToken(tokens[j].Id, tokens[j].Value, new Range(indexMapping[index], indexMapping[index] + tokens[j].Value.Length)));
961: return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)], tokenValue, new Range(indexMapping[0], indexMapping[0] + 1)) };
1050: tokens.Add(new EncodedToken(_vocab[new StringSpanOrdinalKey(w)], w, new Range(indexMapping[index], indexMapping[index] + w.Length)));
Model\SentencePieceBpeModel.cs (9)
157: tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0)));
171: tokens.Add(new EncodedToken(id, SpecialTokensReverse![id], new Range(Offset, Offset + Length)));
184: tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(text.Length, text.Length)));
204: tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0)));
234: tokens.Add(new EncodedToken(
249: tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(text.Length, text.Length)));
266: tokens.Add(new EncodedToken(id, token, new Range(index + i, index + i + 1)));
290: tokens.Add(new EncodedToken(id, token, new Range(index + i, index + i + length)));
318: tokens.Add(new EncodedToken(id.Id, text.Slice(pieceSpan.Index, pieceSpan.Length).ToString(), new Range(pieceSpan.Index, pieceSpan.Index + pieceSpan.Length)));
Model\SentencePieceUnigramModel.cs (10)
271: tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0)));
287: tokens.Add(new EncodedToken(id, SpecialTokensReverse![id], new Range(progressOffset, progressOffset + Length)));
303: tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(progressOffset, progressOffset)));
321: tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0)));
331: tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(progressOffset, progressOffset)));
415: tokens.Add(new EncodedToken(node.Id, stringToken, new Range(0, tokenLength))); // we will update the range later.
432: tokens[start] = new EncodedToken(tokens[start].Id, tokens[start].Value, new Range(tokensOffset, tokensOffset + tokenLength));
443: tokens[start] = new EncodedToken(tokens[start].Id, tokens[start].Value, new Range(tokensOffset, tokensOffset + tokenLength));
484: tokens.Insert(insertionStartPosition++, new EncodedToken(id, _vocabReverse[id].Piece, new Range(offsetStart, offsetStart + charLength)));
489: tokens.Insert(insertionStartPosition++, new EncodedToken(id, _vocabReverse[id].Piece, new Range(offsetStart + charLength, offsetStart + charLength)));
Model\TiktokenTokenizer.cs (3)
307: tokens.Add(new EncodedToken(
319: tokens.Add(new EncodedToken(mappedId.Id, mappedId.Token, new Range(offset, offset + mappedId.Token.Length)));
348: tokens.Add(new EncodedToken(
Model\Word.cs (1)
299: tokens.Add(new EncodedToken(_symbols[i].C, vocabReverse[_symbols[i].C], new Range(index + offset, index + offset + _symbols[i].Len)));
Model\WordPieceTokenizer.cs (3)
319: tokens.Add(new EncodedToken(UnknownTokenId, UnknownToken, new Range(offset, offset + text.Length)));
352: curToken = new EncodedToken(id, _vocabReverse[id], new Range(offset + start, offset + end));
373: tokens.Add(new EncodedToken(UnknownTokenId, UnknownToken, new Range(offset, offset + textLength)));
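Across all the model classes above, the stored Range indexes the input text, so slicing recovers the surface form for ordinary tokens; inserted special tokens carry empty or synthetic spans instead (the Range(0, 0) and Range(index, index) call sites for BOS/EOS). A small sketch of that relationship, assuming only the EncodedToken shape inferred earlier:

// Recover a token's surface text from the input via its stored Range.
// Holds for ordinary tokens; BOS/EOS entries above use empty spans like
// Range(0, 0), for which this returns "".
static string Surface(string text, EncodedToken token) => text[token.Offset];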
Microsoft.ML.Tokenizers.Tests (97)
BertTokenizerTests.cs (66)
60: new EncodedToken(8, "hello", new Range(0, 5)),
61: new EncodedToken(6, ",", new Range(5, 6)),
62: new EncodedToken(10, "how", new Range(7, 10)),
63: new EncodedToken(11, "are", new Range(11, 14)),
64: new EncodedToken(12, "you", new Range(15, 18)),
65: new EncodedToken(13, "[SPECIAL]", new Range(19, 28)),
66: new EncodedToken(7, "?", new Range(28, 29))
80: new EncodedToken(2, "[CLS]", new Range(0, 5)),
81: new EncodedToken(8, "hello", new Range(6, 11)),
82: new EncodedToken(6, ",", new Range(11, 12)),
83: new EncodedToken(10, "how", new Range(13, 16)),
84: new EncodedToken(11, "are", new Range(17, 20)),
85: new EncodedToken(12, "you", new Range(21, 24)),
86: new EncodedToken(13, "[SPECIAL]", new Range(25, 34)),
87: new EncodedToken(7, "?", new Range(34, 35)),
88: new EncodedToken(3, "[SEP]", new Range(36, 41))
133: new EncodedToken(8, "hello", new Range(0, 5)),
134: new EncodedToken(6, ",", new Range(5, 6)),
135: new EncodedToken(10, "how", new Range(7, 10)),
136: new EncodedToken(11, "are", new Range(11, 14)),
137: new EncodedToken(12, "you", new Range(15, 18)),
138: new EncodedToken(7, "?", new Range(18, 19))
152: new EncodedToken(2, "[CLS]", new Range(0, 5)),
153: new EncodedToken(8, "hello", new Range(6, 11)),
154: new EncodedToken(6, ",", new Range(11, 12)),
155: new EncodedToken(10, "how", new Range(13, 16)),
156: new EncodedToken(11, "are", new Range(17, 20)),
157: new EncodedToken(12, "you", new Range(21, 24)),
158: new EncodedToken(7, "?", new Range(24, 25)),
159: new EncodedToken(3, "[SEP]", new Range(26, 31))
201: new EncodedToken(1, "[UNK]", new Range(0, 5)),
202: new EncodedToken(6, ",", new Range(5, 6)),
203: new EncodedToken(1, "[UNK]", new Range(7, 10)),
204: new EncodedToken(11, "are", new Range(11, 14)),
205: new EncodedToken(12, "you", new Range(15, 18)),
206: new EncodedToken(7, "?", new Range(18, 19))
241: new EncodedToken(10, "café", new Range(0, 4)),
242: new EncodedToken(12, "über", new Range(5, 9)),
243: new EncodedToken(15, "ångström", new Range(10, 18)),
244: new EncodedToken(18, "résumé", new Range(19, 25)),
245: new EncodedToken(5, "!", new Range(25, 26)),
256: new EncodedToken(8, "Café", new Range(0, 4)),
257: new EncodedToken(11, "Über", new Range(5, 9)),
258: new EncodedToken(14, "Ångström", new Range(10, 18)),
259: new EncodedToken(17, "Résumé", new Range(19, 25)),
260: new EncodedToken(5, "!", new Range(25, 26)),
272: new EncodedToken(9, "cafe", new Range(0, 4)),
273: new EncodedToken(13, "uber", new Range(5, 9)),
274: new EncodedToken(16, "angstrom", new Range(10, 18)),
275: new EncodedToken(19, "resume", new Range(19, 25)),
276: new EncodedToken(5, "!", new Range(25, 26)),
286: new EncodedToken(20, "Cafe", new Range(0, 4)),
287: new EncodedToken(21, "Uber", new Range(5, 9)),
288: new EncodedToken(22, "Angstrom", new Range(10, 18)),
289: new EncodedToken(23, "Resume", new Range(19, 25)),
290: new EncodedToken(5, "!", new Range(25, 26)),
317: new EncodedToken(9, "叟", new Range(1, 2)),
318: new EncodedToken(11, "驷", new Range(4, 5)),
319: new EncodedToken(10, "叢", new Range(8, 9)),
320: new EncodedToken(12, "驸", new Range(11, 12)),
321: new EncodedToken(5, "!", new Range(13, 14))
335: new EncodedToken(9, "叟", new Range(0, 1)),
336: new EncodedToken(6, "##驷", new Range(1, 2)),
337: new EncodedToken(10, "叢", new Range(3, 4)),
338: new EncodedToken(7, "##驸", new Range(4, 5)),
339: new EncodedToken(5, "!", new Range(5, 6))
BpeTests.cs (13)
510: new EncodedToken(15496, "Hello", new Range(0, 5)),
511: new EncodedToken(11, ",", new Range(5, 6)),
512: new EncodedToken(88, "y", new Range(7, 8)),
513: new EncodedToken(6, "'", new Range(8, 9)),
514: new EncodedToken(439, "all", new Range(9, 12)),
515: new EncodedToken(0, "!", new Range(12, 13)),
516: new EncodedToken(9, "<issue_comment>", new Range(14, 29)),
517: new EncodedToken(2437, "How", new Range(29, 32)),
518: new EncodedToken(533, "are", new Range(33, 36)),
519: new EncodedToken(5832, "you", new Range(37, 40)),
520: new EncodedToken(50256, "<|endoftext|>", new Range(41, 43)),
521: new EncodedToken(30, "?", new Range(44, 45)),
522: new EncodedToken(0, "<|endoftext|>", new Range(45, 58))
TokenizerTests.cs (1)
115: tokens.Add(new EncodedToken(c - 'a', c.ToString(), new Range(count, count + 1)));
WordPieceTests.cs (17)
78: new EncodedToken(7, "un", new Range(0, 2)),
79: new EncodedToken(4, "##want", new Range(2, 6)),
80: new EncodedToken(5, "##ed", new Range(6, 8)),
81: new EncodedToken(8, "runn", new Range(9, 13)),
82: new EncodedToken(9, "##ing", new Range(13, 16))
159: new EncodedToken(0, "[UNK]", new Range(0, 9)),
160: new EncodedToken(8, "runn", new Range(10, 14)),
161: new EncodedToken(9, "##ing", new Range(14, 17))
197: new EncodedToken(0, "[UNK]", new Range(0, 5)),
198: new EncodedToken(7, "un", new Range(6, 8)),
199: new EncodedToken(4, "##want", new Range(8, 12)),
200: new EncodedToken(5, "##ed", new Range(12, 14)),
201: new EncodedToken(2, "[SEP]", new Range(15, 20)),
202: new EncodedToken(1, "[CLS]", new Range(20, 25)),
203: new EncodedToken(8, "runn", new Range(26, 30)),
204: new EncodedToken(9, "##ing", new Range(30, 33)),
205: new EncodedToken(1, "[CLS]", new Range(34, 39)),