155 references to Length
Microsoft.ML.Tokenizers (123)
Model\BPETokenizer.cs (8)
466: EncodeWithCache(textSpanToEncode.Slice(split.Offset, split.Length), tokens, split.Offset, ref priorityQueue);
527: EncodeToIdsWithCache(textSpanToEncode.Slice(split.Offset, split.Length), ids, maxTokenCount - ids.Count, out int length, ref priorityQueue);
530: if (length < split.Length || ids.Count >= maxTokenCount)
588: count += EncodeToIdsWithCache(textSpanToEncode.Slice(split.Offset, split.Length), null, maxTokenCount - count, out int length, ref priorityQueue);
591: if (length < split.Length || count >= maxTokenCount)
668: count += EncodeToIdsWithCache(textSpanToEncode.Slice(split.Offset, split.Length), null, maxTokenCount - count, out int length, ref priorityQueue);
671: if (length < split.Length || count >= maxTokenCount)
722: tokenCount += EncodeToIdsFromEndWithCache(textSpanToEncode.Slice(split.Offset, split.Length), null, maxTokenCount - tokenCount, out int textIndex, ref priorityQueue);
Model\CodeGenTokenizer.cs (19)
391: EncodeInternal(null, textSpanToEncode.Slice(split.Offset, split.Length), tokens, addPrefixSpace, split.Offset, agenda);
632: EncodeToIdsInternal(null, textSpanToEncode.Slice(split.Offset, split.Length), ids, agenda, out int length, maxTokenCount - ids.Count);
635: if (length < split.Length || ids.Count >= maxTokenCount)
844: count += EncodeToIdsInternal(null, textSpanToEncode.Slice(split.Offset, split.Length), null, agenda, out int length, maxTokenCount - count);
847: if (length < split.Length || count >= maxTokenCount)
983: tokenCount += EncodeToIdsFromEndInternal(null, textSpanToEncode.Slice(split.Offset, split.Length), null, agenda, out int textIndex, maxTokenCount - tokenCount);
1636: if (symbols[top.Left].pieceSpan.Length == 0 || symbols[top.Right].pieceSpan.Length == 0 ||
1637: symbols[top.Left].pieceSpan.Length + symbols[top.Right].pieceSpan.Length != top.Length)
1643: symbols[top.Left].pieceSpan = (symbols[top.Left].pieceSpan.Index, symbols[top.Left].pieceSpan.Length + symbols[top.Right].pieceSpan.Length);
1663: if (_vocab.TryGetValue(text.Slice(symbols[index].pieceSpan.Index, symbols[index].pieceSpan.Length), out (int Id, string Token) value))
1665: result.Add(GetToken(value.Id, value.Token, symbols[index].pieceSpan.Index, symbols[index].pieceSpan.Length, originalText, mapping));
1669: result.Add(GetToken(UnknownTokenId.Value, UnknownToken!, symbols[index].pieceSpan.Index, symbols[index].pieceSpan.Length, originalText, mapping));
1693: if (!_mergeRanks.TryGetValue(textSpan.Slice(symbols[left].pieceSpan.Index, symbols[left].pieceSpan.Length), textSpan.Slice(symbols[right].pieceSpan.Index, symbols[right].pieceSpan.Length), out int rank))
1698: SymbolPair pair = new(left, right, rank, symbols[left].pieceSpan.Length + symbols[right].pieceSpan.Length);
Model\EnglishRobertaTokenizer.cs (6)
337: foreach (EncodedToken t in EncodeInternal(textSpanToEncode.Slice(split.Offset, split.Length)))
441: EncodeToIdsInternal(textSpanToEncode.Slice(split.Offset, split.Length), ids, out int length, maxTokenCount - ids.Count);
444: if (length < split.Length || ids.Count >= maxTokenCount)
525: count += EncodeToIdsInternal(textSpanToEncode.Slice(split.Offset, split.Length), null, out int length, maxTokenCount - count);
528: if (length < split.Length || count >= maxTokenCount)
572: tokenCount += EncodeToIdsFromEndInternal(textSpanToEncode.Slice(split.Offset, split.Length), null, out int textIndex, maxTokenCount - tokenCount);
Model\SentencePieceBpeModel.cs (45)
168: if (_vocab.TryGetValue(text.Slice(symbols[index].pieceSpan.Index, symbols[index].pieceSpan.Length), out (int Id, float Score, byte Type) tokenInfo))
184: EncodeAsBytes(text.Slice(symbols[index].pieceSpan.Index, symbols[index].pieceSpan.Length), symbols[index].pieceSpan.Index);
190: GetTokenString(id, symbols[index].pieceSpan.Index, symbols[index].pieceSpan.Length, text),
191: new Range(symbols[index].pieceSpan.Index, symbols[index].pieceSpan.Index + symbols[index].pieceSpan.Length)));
262: if (!_vocab.TryGetValue(text.Slice(pieceSpan.Index, pieceSpan.Length), out (int Id, float Score, byte Type) id))
264: EncodeAsBytes(text.Slice(pieceSpan.Index, pieceSpan.Length), pieceSpan.Index);
270: !revMerge.TryGetValue((pieceSpan.Index, pieceSpan.Length), out (int LeftIndex, int LeftLen, int RightIndex, int RightLen) merge))
272: tokens.Add(new EncodedToken(id.Id, text.Slice(pieceSpan.Index, pieceSpan.Length).ToString(), new Range(pieceSpan.Index, pieceSpan.Index + pieceSpan.Length)));
446: if (_vocab.TryGetValue(text.Slice(symbols[index].pieceSpan.Index, symbols[index].pieceSpan.Length), out (int Id, float Score, byte Type) tokenInfo))
462: if (!EncodeAsBytes(text.Slice(symbols[index].pieceSpan.Index, symbols[index].pieceSpan.Length), symbols[index].pieceSpan.Index, ref charsConsumed))
473: charsConsumed += symbols[index].pieceSpan.Length;
568: if (!_vocab.TryGetValue(text.Slice(pieceSpan.Index, pieceSpan.Length), out (int Id, float Score, byte Type) id))
570: return EncodeAsBytes(text.Slice(pieceSpan.Index, pieceSpan.Length), pieceSpan.Index, ref charsConsumed);
575: !revMerge.TryGetValue((pieceSpan.Index, pieceSpan.Length), out (int LeftIndex, int LeftLen, int RightIndex, int RightLen) merge))
580: charsConsumed += pieceSpan.Length;
714: if (_vocab.TryGetValue(text.Slice(symbols[index].pieceSpan.Index, symbols[index].pieceSpan.Length), out (int Id, float Score, byte Type) tokenInfo))
730: if (!EncodeAsBytes(text.Slice(symbols[index].pieceSpan.Index, symbols[index].pieceSpan.Length), symbols[index].pieceSpan.Index, ref charsConsumed))
740: charsConsumed += symbols[index].pieceSpan.Length;
827: if (!_vocab.TryGetValue(text.Slice(pieceSpan.Index, pieceSpan.Length), out (int Id, float Score, byte Type) id))
829: return EncodeAsBytes(text.Slice(pieceSpan.Index, pieceSpan.Length), pieceSpan.Index, ref charsConsumed);
834: !revMerge.TryGetValue((pieceSpan.Index, pieceSpan.Length), out (int LeftIndex, int LeftLen, int RightIndex, int RightLen) merge))
839: charsConsumed += pieceSpan.Length;
919: if (current.Offset + current.Length < text.Length)
921: splitText = text.Slice(current.Offset + current.Length);
930: if (InternalSpecialTokens!.TryGetValue(text.Slice(current.Offset, current.Length), out int id))
934: textIndex -= current.Length;
938: int start = i > 0 ? splits[i - 1].Offset + splits[i - 1].Length : 0;
990: if (_vocab.TryGetValue(text.Slice(symbols[index].pieceSpan.Index, symbols[index].pieceSpan.Length), out (int Id, float Score, byte Type) tokenInfo))
1006: if (!EncodeAsBytesFromEnd(text.Slice(symbols[index].pieceSpan.Index, symbols[index].pieceSpan.Length), symbols[index].pieceSpan.Index, ref textIndex))
1016: textIndex -= symbols[index].pieceSpan.Length;
1103: if (!_vocab.TryGetValue(text.Slice(pieceSpan.Index, pieceSpan.Length), out (int Id, float Score, byte Type) id))
1105: return EncodeAsBytesFromEnd(text.Slice(pieceSpan.Index, pieceSpan.Length), pieceSpan.Index, ref textIndex);
1110: !revMerge.TryGetValue((pieceSpan.Index, pieceSpan.Length), out (int LeftIndex, int LeftLen, int RightIndex, int RightLen) merge))
1115: textIndex -= pieceSpan.Length;
1164: if (symbols[top.Left].pieceSpan.Length == 0 || symbols[top.Right].pieceSpan.Length == 0 ||
1165: symbols[top.Left].pieceSpan.Length + symbols[top.Right].pieceSpan.Length != top.Length)
1171: symbols[top.Left].pieceSpan = (symbols[top.Left].pieceSpan.Index, symbols[top.Left].pieceSpan.Length + symbols[top.Right].pieceSpan.Length);
1197: int pieceLength = symbols[left].pieceSpan.Length + symbols[right].pieceSpan.Length;
1211: revMerge.Add((symbols[left].pieceSpan.Index, pieceLength), (symbols[left].pieceSpan.Index, symbols[left].pieceSpan.Length, symbols[right].pieceSpan.Index, symbols[right].pieceSpan.Length));
Model\SentencePieceUnigramModel.cs (6)
1295: if (current.Offset + current.Length < text.Length)
1297: GetIndexByTokenCountFromEndInternal(text.Slice(current.Offset + current.Length), considerNormalization, ref tokenCount, buffer, ref normalizedString, ref normalizedStringCountFromEnd, ref charConsumedFromEnd, maxTokenCount);
1306: if (InternalSpecialTokens!.TryGetValue(text.Slice(current.Offset, current.Length), out int id))
1311: charConsumedFromEnd += current.Length;
1316: StoreNormalizedTextFromEnd(text.Slice(current.Offset, current.Length), ref normalizedString, ref normalizedStringCountFromEnd);
1321: int start = i > 0 ? splits[i - 1].Offset + splits[i - 1].Length : 0;
Model\TiktokenTokenizer.cs (6)
282: EncodeToTokens(textSpanToEncode.Slice(split.Offset, split.Length), tokens, split.Offset);
393: EncodeToIds(textSpanToEncode.Slice(split.Offset, split.Length), ids, out int length, maxTokenCount - ids.Count);
396: if (length < split.Length || ids.Count >= maxTokenCount)
560: count += CountTokens(textSpanToEncode.Slice(split.Offset, split.Length), out int length, maxTokenCount - count);
563: if (length < split.Length || count >= maxTokenCount)
687: tokenCount += CountTokensFromEnd(textSpanToEncode.Slice(split.Offset, split.Length), out int textIndex, maxTokenCount - tokenCount);
Model\WordPieceTokenizer.cs (9)
296: EncodeToTokens(textSpanToEncode.Slice(split.Offset, split.Length), tokens, split.Offset);
420: EncodeToIds(textSpanToEncode.Slice(split.Offset, split.Length), ids, out int length, maxTokenCount - ids.Count);
422: if (length < split.Length || ids.Count >= maxTokenCount)
570: count += EncodeToIds(textSpanToEncode.Slice(split.Offset, split.Length), accumulatedIds: null, out int length, maxTokenCount - count);
572: if (length < split.Length || count >= maxTokenCount)
649: int count = EncodeToIds(textSpanToEncode.Slice(split.Offset, split.Length), accumulatedIds: null, out charsConsumed, settings.MaxTokenCount - tokenCount);
650: if (charsConsumed != split.Length)
652: return fromEnd ? split.Offset + split.Length : split.Offset;
659: return fromEnd ? split.Offset : split.Offset + split.Length;
PreTokenizer\CompositePreTokenizer.cs (8)
88: yield return (range.Offset, range.Length);
89: beginning += range.Length;
116: beginning = range.Offset + range.Length;
118: yield return (offset + range.Offset, range.Length);
166: yield return (range.Offset, range.Length);
167: beginning += range.Length;
194: beginning = range.Offset + range.Length;
196: yield return (offset + range.Offset, range.Length);
PreTokenizer\PreTokenizer.cs (4)
38: yield return (match.Offset, match.Length);
39: beginning = match.Offset + match.Length;
146: yield return (match.Offset, match.Length);
147: beginning = match.Offset + match.Length;
PreTokenizer\RegexPreTokenizer.cs (12)
74: yield return (match.Offset, match.Length);
75: beginning = match.Offset + match.Length;
78: yield return (specialMatch.Offset, specialMatch.Length);
79: beginning = specialMatch.Offset + specialMatch.Length;
85: yield return (match.Offset, match.Length);
86: beginning = match.Length + match.Offset;
125: yield return (match.Offset, match.Length);
126: beginning = match.Offset + match.Length;
129: yield return (specialMatch.Offset, specialMatch.Length);
130: beginning = specialMatch.Offset + specialMatch.Length;
136: yield return (match.Offset, match.Length);
137: beginning = match.Length + match.Offset;
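
Note: the Microsoft.ML.Tokenizers references above all follow one shape: a pre-tokenizer yields (Offset, Length) splits, each split is sliced back out of the input text, and encoding stops once the token budget is exhausted or a slice could only be partially consumed. The sketch below only illustrates that pattern; CountWithBudget and EncodeSlice are hypothetical names, not the library's internal API.

using System;
using System.Collections.Generic;

static class SplitBudgetSketch
{
    // Mirrors the `textSpanToEncode.Slice(split.Offset, split.Length)` calls and the
    // `length < split.Length || count >= maxTokenCount` early-exit checks listed above.
    public static int CountWithBudget(ReadOnlySpan<char> text,
                                      IReadOnlyList<(int Offset, int Length)> splits,
                                      int maxTokenCount)
    {
        int count = 0;
        foreach ((int Offset, int Length) split in splits)
        {
            count += EncodeSlice(text.Slice(split.Offset, split.Length),
                                 maxTokenCount - count, out int charsConsumed);

            if (charsConsumed < split.Length || count >= maxTokenCount)
            {
                break; // budget exhausted, or the slice could only be partially encoded
            }
        }
        return count;
    }

    // Hypothetical stand-in for the per-model EncodeToIds*/CountTokens helpers:
    // here every character simply counts as one token.
    private static int EncodeSlice(ReadOnlySpan<char> slice, int budget, out int charsConsumed)
    {
        charsConsumed = Math.Min(slice.Length, Math.Max(budget, 0));
        return charsConsumed;
    }
}
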
Microsoft.ML.Tokenizers.Tests (23)
BpeTests.cs (3)
497: int expectedLength = expectedOffsets[expectedOffsets.Length - 3].Index + expectedOffsets[expectedOffsets.Length - 3].Length;
506: Assert.Equal(expectedOffsets[expectedOffsets.Length - 4].Index + expectedOffsets[expectedOffsets.Length - 4].Length, tokenizer.GetIndexByTokenCount(text, expectedIds.Length - 3, out normalizedText, out int tokenCount));
509: Assert.Equal(expectedOffsets[expectedOffsets.Length - 4].Index + expectedOffsets[expectedOffsets.Length - 4].Length, tokenizer.GetIndexByTokenCount(text.AsSpan(), expectedIds.Length - 3, out normalizedText, out tokenCount));
CodeGenTests.cs (8)
404: int expectedLength = offsets.Length > expectedTokensToExclude ? offsets[offsets.Length - expectedTokensToExclude - 1].Index + offsets[offsets.Length - expectedTokensToExclude - 1].Length : 0;
443: Assert.Equal(offsets[offsets.Length - 1].Index + offsets[offsets.Length - 1].Length, codeGenTokenizer.GetIndexByTokenCount(text, ids.Length, out normalizedText, out int tokenCount));
446: Assert.Equal(offsets[offsets.Length - 1].Index + offsets[offsets.Length - 1].Length, codeGenTokenizer.GetIndexByTokenCount(text.AsSpan(), ids.Length, out normalizedText, out tokenCount));
450: Assert.Equal(expectedOffsets[expectedOffsets.Length - 1].Index + expectedOffsets[expectedOffsets.Length - 1].Length, codeGenTokenizer.GetIndexByTokenCount(text, expectedIds.Length, addPrefixSpace: false, addBeginningOfSentence: false, addEndOfSentence: false, out normalizedText, out tokenCount));
453: Assert.Equal(expectedOffsets[expectedOffsets.Length - 1].Index + expectedOffsets[expectedOffsets.Length - 1].Length, codeGenTokenizer.GetIndexByTokenCount(text.AsSpan(), expectedIds.Length, addPrefixSpace: false, addBeginningOfSentence: false, addEndOfSentence: false, out normalizedText, out tokenCount));
457: Assert.Equal(expectedOffsetsWithSpace[expectedOffsetsWithSpace.Length - 1].Index + expectedOffsetsWithSpace[expectedOffsetsWithSpace.Length - 1].Length, codeGenTokenizer.GetIndexByTokenCount(text, expectedIdsWithSpace.Length, addPrefixSpace: true, addBeginningOfSentence: false, addEndOfSentence: false, out normalizedText, out tokenCount));
460: Assert.Equal(expectedOffsetsWithSpace[expectedOffsetsWithSpace.Length - 1].Index + expectedOffsetsWithSpace[expectedOffsetsWithSpace.Length - 1].Length, codeGenTokenizer.GetIndexByTokenCount(text.AsSpan(), expectedIdsWithSpace.Length, addPrefixSpace: true, addBeginningOfSentence: false, addEndOfSentence: false, out normalizedText, out tokenCount));
995: int calculatedLengthUsingOffsets = expectedTokenCount > 0 ? offsets[expectedTokenCount - 1].Index + offsets[expectedTokenCount - 1].Length : 0;
EnglishRobertaTests.cs (3)
203: int expectedLength = expectedOffsets[expectedOffsets.Length - 3].Index + expectedOffsets[expectedOffsets.Length - 3].Length;
212: Assert.Equal(expectedOffsets[expectedOffsets.Length - 4].Index + expectedOffsets[expectedOffsets.Length - 4].Length, tokenizer.GetIndexByTokenCount(text, expectedIds.Length - 3, out normalizedText, out int tokenCount));
215: Assert.Equal(expectedOffsets[expectedOffsets.Length - 4].Index + expectedOffsets[expectedOffsets.Length - 4].Length, tokenizer.GetIndexByTokenCount(text.AsSpan(), expectedIds.Length - 3, out normalizedText, out tokenCount));
LlamaTests.cs (3)
608: int expectedLength = expectedOffsets1[expectedOffsets1.Length - 1].Index + expectedOffsets1[expectedOffsets1.Length - 1].Length;
635: Assert.Equal(expectedOffsets[expectedOffsets.Length - 7].Index + expectedOffsets[expectedOffsets.Length - 7].Length, tokenizer.GetIndexByTokenCount(text, expectedIds.Length - 6, out string? normalizedString, out int tokenCount));
638: Assert.Equal(expectedOffsets[expectedOffsets.Length - 7].Index + expectedOffsets[expectedOffsets.Length - 7].Length, tokenizer.GetIndexByTokenCount(text.AsSpan(), expectedIds.Length - 6, out normalizedString, out tokenCount));
TiktokenTests.cs (6)
652: int expectedLength = expectedOffsets[expectedOffsets.Length - 5].Index + expectedOffsets[expectedOffsets.Length - 5].Length;
661: Assert.Equal(expectedOffsets[expectedOffsets.Length - 4].Index + expectedOffsets[expectedOffsets.Length - 4].Length, tokenizer.GetIndexByTokenCount(text, expectedIds.Length - 3, out normalizedText, out int tokenCount));
664: Assert.Equal(expectedOffsets[expectedOffsets.Length - 4].Index + expectedOffsets[expectedOffsets.Length - 4].Length, tokenizer.GetIndexByTokenCount(text.AsSpan(), expectedIds.Length - 3, out normalizedText, out tokenCount));
762: Assert.True(offsets[count + 1].Index < offsets[count].Index + offsets[count].Length);
767: Assert.Equal(offsets[count - 1].Index + offsets[count - 1].Length, length);
780: Assert.True(offsets[offsets.Length - tokenCount].Index < offsets[offsets.Length - tokenCount - 1].Index + offsets[offsets.Length - tokenCount - 1].Length);
System.Formats.Cbor (4)
System\Formats\Cbor\Reader\CborReader.Map.cs (3)
139: ReadOnlySpan<byte> previousKeyEncoding = buffer.Slice(previousKeyEncodingRange.Offset, previousKeyEncodingRange.Length);
140: ReadOnlySpan<byte> currentKeyEncoding = buffer.Slice(currentKeyEncodingRange.Offset, currentKeyEncodingRange.Length);
210: return _reader._data.Span.Slice(range.Offset, range.Length);
System\Formats\Cbor\Writer\CborWriter.Map.cs (1)
272: return _writer._buffer.AsSpan(range.Offset, range.Length);
System.Text.Json (2)
System\Text\Json\Document\JsonDocument.PropertyNameSet.cs (2)
89: ReadOnlySpan<byte> previousPropertyName = utf8Json.Span.Slice(range.Start, range.Length);
124: ReadOnlyMemory<byte> propertyName = utf8Json.Slice(range.Start, range.Length);
System.Text.RegularExpressions (3)
System\Text\RegularExpressions\Regex.EnumerateSplits.cs (3)
224: _currentSplit = !_regex.RightToLeft ? (_lastMatch.Index + _lastMatch.Length).._input.Length : 0.._lastMatch.Index;
229: (bool Success, int Index, int Length, int TextPosition) match = _regex.RunSingleMatch(RegexRunnerMode.BoundsRequired, _lastMatch.Length, _input, _startAt);
235: int start = _lastMatch.Index + Math.Max(_lastMatch.Length, 0);
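
Note: the System.Formats.Cbor, System.Text.Json, and System.Text.RegularExpressions references reduce to the same idea on the byte and match side: a stored (start, length) pair is later re-materialized with Slice/AsSpan, and the text following a regex match begins at match.Index + match.Length. A minimal sketch, assuming a hypothetical KeyRange struct in place of the libraries' internal range types:

using System;

// Hypothetical range type standing in for the libraries' internal (offset, length) structs.
readonly record struct KeyRange(int Offset, int Length);

static class RangeSliceSketch
{
    // Re-materialize a remembered region, as in `buffer.Slice(range.Offset, range.Length)`.
    public static ReadOnlySpan<byte> GetRegion(ReadOnlySpan<byte> buffer, KeyRange range)
        => buffer.Slice(range.Offset, range.Length);

    // The segment following a match starts at match.Index + match.Length,
    // as in the Regex.EnumerateSplits lines above.
    public static Range SegmentAfter(int matchIndex, int matchLength, int inputLength)
        => (matchIndex + matchLength)..inputLength;
}
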