126 references to EncodedToken
Microsoft.ML.Tokenizers (29)
Model\BPETokenizer.cs (1)
985tokens.Add(new EncodedToken(value.specialTokenId, value.specialToken, new Range(offset, offset + text.Length)));
Model\CodeGenTokenizer.cs (8)
379tokens.Add(new EncodedToken(BeginningOfSentenceId.Value, BeginningOfSentenceToken!, new Range(0, 0))); 399tokens.Add(new EncodedToken(EndOfSentenceId.Value, EndOfSentenceToken!, new Range(index, index))); 432tokens.Add(new EncodedToken(value.specialTokenId, value.specialToken, new Range(index, index + ((addPrefixSpace && offset == 0) ? textSpan.Length - 1 : textSpan.Length)))); 1596tokens.Add(new EncodedToken(tokensToAdd[0].Id, tokensToAdd[0].Value, new Range(r.s, r.e))); 1600tokens.Add(new EncodedToken(tokensToAdd[i].Id, tokensToAdd[i].Value, new Range(tokensToAdd[i].Offset.Start.Value + offset - 1, tokensToAdd[i].Offset.End.Value + offset - 1))); 1608tokens.Add(new EncodedToken(t.Id, t.Value, new Range(t.Offset.Start.Value + offset, t.Offset.End.Value + offset))); 1628return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)].Id, tokenValue, new Range(mapping[0], mapping[0] + 1)) }; 1701return new EncodedToken(id, token, new Range(mapping[index], endIndex));
Model\EnglishRobertaTokenizer.cs (4)
339tokens.Add(new EncodedToken(t.Id, t.Value, new Range(split.Offset + t.Offset.Start.Value, split.Offset + t.Offset.End.Value))); 929list.Add(new EncodedToken(tokens[j].Id, tokens[j].Value, new Range(indexMapping[index], indexMapping[index] + tokens[j].Value.Length))); 961return new List<EncodedToken> { new EncodedToken(_vocab[new StringSpanOrdinalKey(tokenValue)], tokenValue, new Range(indexMapping[0], indexMapping[0] + 1)) }; 1050tokens.Add(new EncodedToken(_vocab[new StringSpanOrdinalKey(w)], w, new Range(indexMapping[index], indexMapping[index] + w.Length)));
Model\SentencePieceTokenizer.cs (9)
276tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0))); 290tokens.Add(new EncodedToken(id, _specialTokensReverse![id], new Range(Offset, Offset + Length))); 303tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(text.Length, text.Length))); 323tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0))); 353tokens.Add(new EncodedToken( 368tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(text.Length, text.Length))); 385tokens.Add(new EncodedToken(id, token, new Range(index + i, index + i + 1))); 409tokens.Add(new EncodedToken(id, token, new Range(index + i, index + i + length))); 437tokens.Add(new EncodedToken(id.Id, text.Slice(pieceSpan.Index, pieceSpan.Length).ToString(), new Range(pieceSpan.Index, pieceSpan.Index + pieceSpan.Length)));
Model\TiktokenTokenizer.cs (3)
307tokens.Add(new EncodedToken( 319tokens.Add(new EncodedToken(mappedId.Id, mappedId.Token, new Range(offset, offset + mappedId.Token.Length))); 348tokens.Add(new EncodedToken(
Model\Word.cs (1)
299tokens.Add(new EncodedToken(_symbols[i].C, vocabReverse[_symbols[i].C], new Range(index + offset, index + offset + _symbols[i].Len)));
Model\WordPieceTokenizer.cs (3)
319tokens.Add(new EncodedToken(UnknownTokenId, UnknownToken, new Range(offset, offset + text.Length))); 352curToken = new EncodedToken(id, _vocabReverse[id], new Range(offset + start, offset + end)); 373tokens.Add(new EncodedToken(UnknownTokenId, UnknownToken, new Range(offset, offset + textLength)));
Microsoft.ML.Tokenizers.Tests (97)
BertTokenizerTests.cs (66)
60new EncodedToken(8, "hello", new Range(0, 5)), 61new EncodedToken(6, ",", new Range(5, 6)), 62new EncodedToken(10, "how", new Range(7, 10)), 63new EncodedToken(11, "are", new Range(11, 14)), 64new EncodedToken(12, "you", new Range(15, 18)), 65new EncodedToken(13, "[SPECIAL]", new Range(19, 28)), 66new EncodedToken(7, "?", new Range(28, 29)) 80new EncodedToken(2, "[CLS]", new Range(0, 5)), 81new EncodedToken(8, "hello", new Range(6, 11)), 82new EncodedToken(6, ",", new Range(11, 12)), 83new EncodedToken(10, "how", new Range(13, 16)), 84new EncodedToken(11, "are", new Range(17, 20)), 85new EncodedToken(12, "you", new Range(21, 24)), 86new EncodedToken(13, "[SPECIAL]", new Range(25, 34)), 87new EncodedToken(7, "?", new Range(34, 35)), 88new EncodedToken(3, "[SEP]", new Range(36, 41)) 133new EncodedToken(8, "hello", new Range(0, 5)), 134new EncodedToken(6, ",", new Range(5, 6)), 135new EncodedToken(10, "how", new Range(7, 10)), 136new EncodedToken(11, "are", new Range(11, 14)), 137new EncodedToken(12, "you", new Range(15, 18)), 138new EncodedToken(7, "?", new Range(18, 19)) 152new EncodedToken(2, "[CLS]", new Range(0, 5)), 153new EncodedToken(8, "hello", new Range(6, 11)), 154new EncodedToken(6, ",", new Range(11, 12)), 155new EncodedToken(10, "how", new Range(13, 16)), 156new EncodedToken(11, "are", new Range(17, 20)), 157new EncodedToken(12, "you", new Range(21, 24)), 158new EncodedToken(7, "?", new Range(24, 25)), 159new EncodedToken(3, "[SEP]", new Range(26, 31)) 201new EncodedToken(1, "[UNK]", new Range(0, 5)), 202new EncodedToken(6, ",", new Range(5, 6)), 203new EncodedToken(1, "[UNK]", new Range(7, 10)), 204new EncodedToken(11, "are", new Range(11, 14)), 205new EncodedToken(12, "you", new Range(15, 18)), 206new EncodedToken(7, "?", new Range(18, 19)) 241new EncodedToken(10, "café", new Range(0, 4)), 242new EncodedToken(12, "über", new Range(5, 9)), 243new EncodedToken(15, "ångström", new Range(10, 18)), 244new EncodedToken(18, "résumé", new Range(19, 25)), 245new EncodedToken(5, "!", new Range(25, 26)), 256new EncodedToken(8, "Café", new Range(0, 4)), 257new EncodedToken(11, "Über", new Range(5, 9)), 258new EncodedToken(14, "Ångström", new Range(10, 18)), 259new EncodedToken(17, "Résumé", new Range(19, 25)), 260new EncodedToken(5, "!", new Range(25, 26)), 272new EncodedToken(9, "cafe", new Range(0, 4)), 273new EncodedToken(13, "uber", new Range(5, 9)), 274new EncodedToken(16, "angstrom", new Range(10, 18)), 275new EncodedToken(19, "resume", new Range(19, 25)), 276new EncodedToken(5, "!", new Range(25, 26)), 286new EncodedToken(20, "Cafe", new Range(0, 4)), 287new EncodedToken(21, "Uber", new Range(5, 9)), 288new EncodedToken(22, "Angstrom", new Range(10, 18)), 289new EncodedToken(23, "Resume", new Range(19, 25)), 290new EncodedToken(5, "!", new Range(25, 26)), 317new EncodedToken(9, "叟", new Range(1, 2)), 318new EncodedToken(11, "驷", new Range(4, 5)), 319new EncodedToken(10, "叢", new Range(8, 9)), 320new EncodedToken(12, "驸", new Range(11, 12)), 321new EncodedToken(5, "!", new Range(13, 14)) 335new EncodedToken(9, "叟", new Range(0, 1)), 336new EncodedToken(6, "##驷", new Range(1, 2)), 337new EncodedToken(10, "叢", new Range(3, 4)), 338new EncodedToken(7, "##驸", new Range(4, 5)), 339new EncodedToken(5, "!", new Range(5, 6))
BpeTests.cs (13)
510new EncodedToken(15496, "Hello", new Range(0, 5)), 511new EncodedToken(11, ",", new Range(5, 6)), 512new EncodedToken(88, "y", new Range(7, 8)), 513new EncodedToken(6, "'", new Range(8, 9)), 514new EncodedToken(439, "all", new Range(9, 12)), 515new EncodedToken(0, "!", new Range(12, 13)), 516new EncodedToken(9, "<issue_comment>", new Range(14, 29)), 517new EncodedToken(2437, "How", new Range(29, 32)), 518new EncodedToken(533, "are", new Range(33, 36)), 519new EncodedToken(5832, "you", new Range(37, 40)), 520new EncodedToken(50256, "<|endoftext|>", new Range(41, 43)), 521new EncodedToken(30, "?", new Range(44, 45)), 522new EncodedToken(0, "<|endoftext|>", new Range(45, 58))
TokenizerTests.cs (1)
115tokens.Add(new EncodedToken(c - 'a', c.ToString(), new Range(count, count + 1)));
WordPieceTests.cs (17)
78new EncodedToken(7, "un", new Range(0, 2)), 79new EncodedToken(4, "##want", new Range(2, 6)), 80new EncodedToken(5, "##ed", new Range(6, 8)), 81new EncodedToken(8, "runn", new Range(9, 13)), 82new EncodedToken(9, "##ing", new Range(13, 16)) 159new EncodedToken(0, "[UNK]", new Range(0, 9)), 160new EncodedToken(8, "runn", new Range(10, 14)), 161new EncodedToken(9, "##ing", new Range(14, 17)) 197new EncodedToken(0, "[UNK]", new Range(0, 5)), 198new EncodedToken(7, "un", new Range(6, 8)), 199new EncodedToken(4, "##want", new Range(8, 12)), 200new EncodedToken(5, "##ed", new Range(12, 14)), 201new EncodedToken(2, "[SEP]", new Range(15, 20)), 202new EncodedToken(1, "[CLS]", new Range(20, 25)), 203new EncodedToken(8, "runn", new Range(26, 30)), 204new EncodedToken(9, "##ing", new Range(30, 33)), 205new EncodedToken(1, "[CLS]", new Range(34, 39)),