3 types derived from PreTokenizer
Microsoft.ML.Tokenizers (2)
PreTokenizer\RegexPreTokenizer.cs (1)
16: public sealed partial class RegexPreTokenizer : PreTokenizer
PreTokenizer\RobertaPreTokenizer.cs (1)
13: public sealed partial class RobertaPreTokenizer : PreTokenizer
Microsoft.ML.Tokenizers.Tests (1)
PreTokenizerTests.cs (1)
76: public class SpacePreTokenizer : PreTokenizer
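
As context for the listing above: a derived type implements the PreTokenize overloads that map text to (Offset, Length) ranges. Below is a minimal sketch modeled on the SpacePreTokenizer test class, assuming the abstract surface is PreTokenize(string), as exercised in the tests further down, plus a ReadOnlySpan<char> counterpart; the class name and splitting rule are illustrative, not the library's.

using System;
using System.Collections.Generic;
using Microsoft.ML.Tokenizers;

// Sketch of a custom derived type: yield one (Offset, Length) range per
// whitespace-delimited chunk of the input.
public sealed class WhitespaceChunkPreTokenizer : PreTokenizer
{
    public override IEnumerable<(int Offset, int Length)> PreTokenize(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            yield break; // matches the tests' expectation that null input yields no splits
        }

        int start = -1;
        for (int i = 0; i < text.Length; i++)
        {
            if (char.IsWhiteSpace(text[i]))
            {
                if (start >= 0)
                {
                    yield return (start, i - start);
                    start = -1;
                }
            }
            else if (start < 0)
            {
                start = i;
            }
        }

        if (start >= 0)
        {
            yield return (start, text.Length - start);
        }
    }

    // Assumed span-based counterpart; delegating via ToString keeps the sketch simple.
    public override IEnumerable<(int Offset, int Length)> PreTokenize(ReadOnlySpan<char> text)
        => PreTokenize(text.ToString());
}
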
67 references to PreTokenizer
Microsoft.ML.Tokenizers (58)
Model\BertTokenizer.cs (2)
801: options.PreTokenizer ??= options.ApplyBasicTokenization ? PreTokenizer.CreateWordOrPunctuation(options.SplitOnSpecialTokens ? options.SpecialTokens : null) : PreTokenizer.CreateWhiteSpace();
Model\BPETokenizer.cs (9)
30: private readonly PreTokenizer? _preTokenizer;
93: => Create(vocabFile, mergesFile, preTokenizer: PreTokenizer.CreateWordOrNonWord(), normalizer: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false);
113: PreTokenizer? preTokenizer = null,
143: => Create(vocabStream, mergesStream, preTokenizer: PreTokenizer.CreateWordOrNonWord(), normalizer: null, specialTokens: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false);
163: PreTokenizer? preTokenizer = null,
199: PreTokenizer? preTokenizer = null,
232: PreTokenizer? preTokenizer,
243: _preTokenizer = preTokenizer ?? PreTokenizer.CreateWordOrNonWord(); // Default to WordOrNonWord pre-tokenizer
300: public override PreTokenizer? PreTokenizer => _preTokenizer;
Model\CodeGenTokenizer.cs (5)
32: private readonly PreTokenizer? _preTokenizer;
55: PreTokenizer? preTokenizer = null,
87: PreTokenizer? preTokenizer = null,
100: private CodeGenTokenizer(Stream vocabularyStream, Stream mergeStream, PreTokenizer? preTokenizer, Normalizer? normalizer, IReadOnlyDictionary<string, int>? specialTokens, bool addPrefixSpace,
249: public override PreTokenizer? PreTokenizer => _preTokenizer;
Model\EnglishRobertaTokenizer.cs (7)
28: private readonly PreTokenizer? _preTokenizer;
67: PreTokenizer? preTokenizer = null,
104: PreTokenizer? preTokenizer = null,
118: internal EnglishRobertaTokenizer(string vocabularyPath, string mergePath, string highestOccurrenceMappingPath, PreTokenizer? preTokenizer = null, Normalizer? normalizer = null, bool filterUnsupportedChars = true) :
135: internal EnglishRobertaTokenizer(Stream vocabularyStream, Stream mergeStream, Stream highestOccurrenceMappingStream, PreTokenizer? preTokenizer = null, Normalizer? normalizer = null, bool filterUnsupportedChars = true) :
140: private EnglishRobertaTokenizer(Stream vocabularyStream, Stream mergeStream, Stream highestOccurrenceMappingStream, PreTokenizer? preTokenizer, Normalizer? normalizer, bool filterUnsupportedChars, bool disposeStream)
255: public override PreTokenizer? PreTokenizer => _preTokenizer;
Model\Phi2Tokenizer.cs (2)
37: PreTokenizer? preTokenizer = null,
68: PreTokenizer? preTokenizer = null,
Model\SentencePieceTokenizer.cs (5)
159: public override PreTokenizer? PreTokenizer => null;
281: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, _specialTokensRegex!))
611: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, _specialTokensRegex!))
944: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, _specialTokensRegex!))
1302: (int Offset, int Length)[] splits = PreTokenizer.SplitText(text, _specialTokensRegex!).ToArray();
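
Note that SentencePieceTokenizer exposes no pre-tokenizer of its own (line 159) but still calls the static PreTokenizer.SplitText helper to carve text into segments around special tokens. The sketch below is an illustrative re-implementation of that partitioning, not the library's code; it assumes SplitText yields the spans between matches as well as the matches themselves.

using System.Collections.Generic;
using System.Text.RegularExpressions;

static class SpecialTokenSplitSketch
{
    // Illustrative stand-in for PreTokenizer.SplitText(text, specialTokensRegex):
    // emit the span before each regex match, then the match itself, so every
    // special token comes out as its own segment and nothing is dropped.
    public static IEnumerable<(int Offset, int Length)> Split(string text, Regex specialTokensRegex)
    {
        int current = 0;
        foreach (Match m in specialTokensRegex.Matches(text))
        {
            if (m.Index > current)
            {
                yield return (current, m.Index - current); // ordinary text before the token
            }
            yield return (m.Index, m.Length);              // the special token itself
            current = m.Index + m.Length;
        }

        if (current < text.Length)
        {
            yield return (current, text.Length - current); // trailing ordinary text
        }
    }
}
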
Model\TiktokenTokenizer.cs (13)
33: private readonly PreTokenizer? _preTokenizer;
46: internal TiktokenTokenizer(string vocabFilePath, PreTokenizer? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens = null, Normalizer? normalizer = null, int cacheSize = LruCache<int[]>.DefaultCacheSize) :
61: internal TiktokenTokenizer(Stream vocabStream, PreTokenizer? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens = null, Normalizer? normalizer = null, int cacheSize = LruCache<int[]>.DefaultCacheSize) :
80: PreTokenizer? preTokenizer,
102: private TiktokenTokenizer(Stream vocabStream, PreTokenizer? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens, Normalizer? normalizer, int cacheSize, bool disposeStream)
127: public override PreTokenizer? PreTokenizer => _preTokenizer;
1178: [GeneratedRegex(Cl100kBaseRegexPattern, RegexOptions.None, PreTokenizer.DefaultTimeOutInMilliseconds)]
1181: [GeneratedRegex(P50kBaseRegexPattern, RegexOptions.None, PreTokenizer.DefaultTimeOutInMilliseconds)]
1184: [GeneratedRegex(O200kBaseRegexPattern, RegexOptions.None, PreTokenizer.DefaultTimeOutInMilliseconds)]
1260: PreTokenizer? preTokenizer,
1280: PreTokenizer? preTokenizer,
1301: PreTokenizer? preTokenizer,
1333: PreTokenizer? preTokenizer,
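
The declarations at lines 1178-1184 pair source-generated regexes with PreTokenizer.DefaultTimeOutInMilliseconds so a pathological input cannot stall matching indefinitely. A minimal sketch of the same shape; the pattern and the 30-second timeout below are placeholders, not the library's values:

using System.Text.RegularExpressions;

internal static partial class PreTokenizerPatterns
{
    // Placeholder pattern; the real Cl100kBase/P50kBase/O200kBase patterns are far
    // more involved and are not reproduced in this listing.
    private const string WordOrNonWordPattern = @"\w+|[^\w\s]+";

    // Same shape as the TiktokenTokenizer declarations above: a source-generated
    // regex with an explicit match timeout in milliseconds (the library passes
    // PreTokenizer.DefaultTimeOutInMilliseconds; 30_000 here is a placeholder).
    [GeneratedRegex(WordOrNonWordPattern, RegexOptions.None, 30_000)]
    internal static partial Regex WordOrNonWordRegex();
}
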
Model\WordPieceOptions.cs (1)
21: public PreTokenizer? PreTokenizer { get; set; }
Model\WordPieceTokenizer.cs (3)
27: private readonly PreTokenizer? _preTokenizer;
72: _preTokenizer = options.PreTokenizer ?? PreTokenizer.CreateWhiteSpace(options.SpecialTokens);
243: public override PreTokenizer? PreTokenizer => _preTokenizer;
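
Line 72 passes options.SpecialTokens into the whitespace factory, which presumably makes registered special tokens come through as whole segments rather than being split. A hypothetical sketch; the token map and expected output are illustrative:

using System;
using System.Collections.Generic;
using Microsoft.ML.Tokenizers;

class SpecialTokenDemo
{
    static void Main()
    {
        // Hypothetical special-token map; the ids are illustrative only.
        var special = new Dictionary<string, int> { ["[CLS]"] = 101, ["[SEP]"] = 102 };

        // Mirrors WordPieceTokenizer line 72: whitespace splitting that also keeps
        // registered special tokens intact as whole segments.
        PreTokenizer preTok = PreTokenizer.CreateWhiteSpace(special);

        string text = "[CLS] hello world [SEP]";
        foreach ((int offset, int length) in preTok.PreTokenize(text))
        {
            Console.WriteLine(text.Substring(offset, length));
        }
        // Expected (presumably): [CLS], hello, world, [SEP]
    }
}
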
PreTokenizer\PreTokenizer.cs (9)
47: private static PreTokenizer? _whiteSpaceOrPunctuationPreTokenizer;
56: /// Create a new instance of the <see cref="PreTokenizer"/> class which split the text at the whitespace or punctuation characters.
63: public static PreTokenizer CreateWordOrPunctuation(IReadOnlyDictionary<string, int>? specialTokens = null)
75: private static PreTokenizer? _wordOrNonWordPreTokenizer;
85: /// Create a new instance of the <see cref="PreTokenizer"/> class which split the text at the word or non-word boundary.
93: public static PreTokenizer CreateWordOrNonWord(IReadOnlyDictionary<string, int>? specialTokens = null)
105: private static PreTokenizer? _whiteSpacePreTokenizer;
115: /// Create a new instance of the <see cref="PreTokenizer"/> class which split the text at the white spaces.
122: public static PreTokenizer CreateWhiteSpace(IReadOnlyDictionary<string, int>? specialTokens = null)
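
A short usage sketch of the three cached factories declared above (lines 63, 93, 122); the output shown in the trailing comment is plausible for the word/non-word split but is not taken from the library's documentation:

using System;
using Microsoft.ML.Tokenizers;

class FactoryDemo
{
    static void Main()
    {
        // Each factory optionally takes a special-tokens map, as seen in the
        // WordPieceTokenizer usage above.
        PreTokenizer wordOrNonWord = PreTokenizer.CreateWordOrNonWord();
        PreTokenizer whiteSpace = PreTokenizer.CreateWhiteSpace();
        PreTokenizer wordOrPunct = PreTokenizer.CreateWordOrPunctuation();

        string text = "Hello, world!";
        foreach ((int offset, int length) in wordOrNonWord.PreTokenize(text))
        {
            Console.WriteLine($"({offset}, {length}) -> '{text.Substring(offset, length)}'");
        }
        // Plausible output for the word/non-word split: 'Hello', ',', 'world', '!'
    }
}
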
Tokenizer.cs (2)
24: public virtual PreTokenizer? PreTokenizer => null;
433: PreTokenizer? preTokenizer,
Microsoft.ML.Tokenizers.Tests (9)
BpeTests.cs (4)
254: BpeTokenizer bpe = BpeTokenizer.Create(vocabFile: vocabFile, mergesFile: mergesFile, preTokenizer: PreTokenizer.CreateWordOrNonWord(), normalizer: null, unknownToken: unknownToken,
503: var bpeTokenizer = BpeTokenizer.Create(vocabStream, mergesStream, PreTokenizer.CreateWordOrNonWord(specialTokens), normalizer: null, specialTokens: specialTokens, unknownToken: "<|endoftext|>");
550: internal static BpeTokenizer CreateEmptyBpe(PreTokenizer? preTokenizer = null, Normalizer? normalizer = null)
559: vocabStream: emptyVocabStream, mergesStream: null, preTokenizer: preTokenizer ?? PreTokenizer.CreateWordOrNonWord(), normalizer: normalizer, unknownToken: "Ukn");
PreTokenizerTests.cs (5)
21: PreTokenizer.CreateWordOrNonWord(),
28: PreTokenizer.CreateWordOrNonWord(),
35: PreTokenizer.CreateWhiteSpace(),
58: public void TestPreTokenizer(PreTokenizer preTokenizer, string text, (int Offset, int Length)[] splits)
73: Assert.Empty(PreTokenizer.CreateWordOrNonWord().PreTokenize((string)null!));
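
For completeness, a condensed sketch of the data-driven test pattern at line 58, with hypothetical expected splits that assume the word/non-word pattern:

using Xunit;
using Microsoft.ML.Tokenizers;

public class PreTokenizerSmokeTests
{
    // Condensed version of the TestPreTokenizer theory: pair a pre-tokenizer with
    // the (Offset, Length) splits it should produce for a given input.
    [Fact]
    public void WordOrNonWord_SplitsWordsAndPunctuation()
    {
        var splits = PreTokenizer.CreateWordOrNonWord().PreTokenize("Hi!");

        // Expected splits assume the word/non-word pattern: "Hi" then "!".
        Assert.Equal(new[] { (0, 2), (2, 1) }, splits);
    }
}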