EncodedToken

EncodedToken.cs (2)

13public readonly struct EncodedToken : IEquatable<EncodedToken> 44public bool Equals(EncodedToken other) => Id == other.Id && Value == other.Value && Offset.Equals(other.Offset);

Model\BPETokenizer.cs (7)

431/// Encodes input text to a list of <see cref="EncodedToken" />s. 436protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings) 440return new EncodeResults<EncodedToken> { Tokens = [], NormalizedText = null, CharsConsumed = 0 }; 454List<EncodedToken> tokens = new(); 479return new EncodeResults<EncodedToken> { Tokens = tokens, NormalizedText = normalizedText, CharsConsumed = charsConsumed }; 1305internal void WordToTokens(ref Word word, List<EncodedToken> tokens, int offset, ReadOnlySpan<int> mapping) => word.ToTokens(VocabReverse, tokens, offset, mapping); 1307internal void EncodeWithCache(ReadOnlySpan<char> text, List<EncodedToken> tokens, int offset, ref PriorityQueue<Merge>? priorityQueue)

Model\CodeGenTokenizer.cs (31)

Model\EnglishRobertaTokenizer.cs (27)

Model\SentencePieceBaseModel.cs (1)

104public abstract IReadOnlyList<EncodedToken> EncodeToTokens(

Model\SentencePieceBpeModel.cs (4)

67public override IReadOnlyList<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, out string? normalizedText, bool addBeginningOfSentence, bool addEndOfSentence, bool considerNormalization) 91List<EncodedToken> tokens = new(); 105private void EncodeWithSpecialTokens(ReadOnlySpan<char> text, bool addBeginOfSentence, bool addEndOfSentence, List<EncodedToken> tokens) 150private void EncodeInternal(ReadOnlySpan<char> text, bool addBeginOfSentence, bool addEndOfSentence, List<EncodedToken> tokens)

Model\SentencePieceTokenizer.cs (9)

114/// Encodes input text to a list of <see cref="EncodedToken" />s. 119protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings) 121return new EncodeResults<EncodedToken> 130/// Encodes input text a list of <see cref="EncodedToken" />s with string value of the token, id, and offset. 138/// <returns>The tokenization result includes a list of <see cref="EncodedToken" />s with string value of the token, id, and offset.</returns> 139public IReadOnlyList<EncodedToken> EncodeToTokens(string text, out string? normalizedText, bool addBeginningOfSentence, bool addEndOfSentence, bool considerPreTokenization = true, bool considerNormalization = true) 143/// Encodes input text a list of <see cref="EncodedToken" />s with string value of the token, id, and offset. 151/// <returns>The tokenization result includes a list of <see cref="EncodedToken" />s with string value of the token, id, and offset.</returns> 152public IReadOnlyList<EncodedToken> EncodeToTokens(ReadOnlySpan<char> text, out string? normalizedText, bool addBeginningOfSentence, bool addEndOfSentence, bool considerPreTokenization = true, bool considerNormalization = true)

Model\SentencePieceUnigramModel.cs (8)

100public override IReadOnlyList<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, out string? normalizedText, bool addBeginningOfSentence, bool addEndOfSentence, bool considerNormalization) 106return Array.Empty<EncodedToken>(); 109List<EncodedToken> tokens = new(); 212List<EncodedToken> tokens, 264List<EncodedToken> tokens, 330List<EncodedToken> tokens, 376EncodedToken temp = tokens[start]; 412private void FallbackToByteEncoding(ReadOnlySpan<char> normalizationSpan, List<EncodedToken> tokens, int insertionStartPosition)

Model\TiktokenTokenizer.cs (6)

253/// Encodes input text to a list of <see cref="EncodedToken" />s. 258protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings) 262return new EncodeResults<EncodedToken> { NormalizedText = null, Tokens = [], CharsConsumed = 0 }; 276List<EncodedToken> tokens = new(); 290return new EncodeResults<EncodedToken> { NormalizedText = normalizedText, Tokens = tokens, CharsConsumed = charsConsumed }; 299private void EncodeToTokens(ReadOnlySpan<char> text, List<EncodedToken> tokens, int offset)

Model\Word.cs (1)

292public void ToTokens(SortedDictionary<int, string> vocabReverse, List<EncodedToken> tokens, int offset, ReadOnlySpan<int> mapping)

Model\WordPieceTokenizer.cs (7)

267/// Encodes input text to a list of <see cref="EncodedToken" />s. 272protected override EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings) 276return new EncodeResults<EncodedToken> { NormalizedText = null, Tokens = [], CharsConsumed = 0 }; 290List<EncodedToken> tokens = new(); 304return new EncodeResults<EncodedToken> { NormalizedText = normalizedText, Tokens = tokens, CharsConsumed = charsConsumed }; 313private void EncodeToTokens(ReadOnlySpan<char> text, List<EncodedToken> tokens, int offset) 337EncodedToken curToken = default;

Tokenizer.cs (14)

44EncodeResults<EncodedToken> results = EncodeToTokens(text, textSpan, settings); 133/// Encodes input text to a list of <see cref="EncodedToken" />s. 138protected abstract EncodeResults<EncodedToken> EncodeToTokens(string? text, ReadOnlySpan<char> textSpan, EncodeSettings settings); 141/// Encodes input text to a list of <see cref="EncodedToken" />s. 147/// <returns>The list of encoded <see cref="EncodedToken" />s.</returns> 148public IReadOnlyList<EncodedToken> EncodeToTokens(string text, out string? normalizedText, bool considerPreTokenization = true, bool considerNormalization = true) 150EncodeResults<EncodedToken> result = EncodeToTokens(text, text.AsSpan(), new EncodeSettings { ConsiderPreTokenization = considerPreTokenization, ConsiderNormalization = considerNormalization }); 157/// Encodes input text to a list of <see cref="EncodedToken" />s. 163/// <returns>The list of encoded <see cref="EncodedToken" />s.</returns> 164public IReadOnlyList<EncodedToken> EncodeToTokens(ReadOnlySpan<char> text, out string? normalizedText, bool considerPreTokenization = true, bool considerNormalization = true) 166EncodeResults<EncodedToken> result = EncodeToTokens(null, text, new EncodeSettings { ConsiderPreTokenization = considerPreTokenization, ConsiderNormalization = considerNormalization }); 235EncodeResults<EncodedToken> tokens = EncodeToTokens(text, textSpan, settings); 243var token = tokens.Tokens[tokenCount - 1]; 253var token = tokens.Tokens[tokens.Tokens.Count - tokenCount];