3 types derived from PreTokenizer
Microsoft.ML.Tokenizers (2)
PreTokenizer\RegexPreTokenizer.cs (1)
16public sealed partial class RegexPreTokenizer : PreTokenizer
PreTokenizer\RobertaPreTokenizer.cs (1)
13public sealed partial class RobertaPreTokenizer : PreTokenizer
Microsoft.ML.Tokenizers.Tests (1)
PreTokenizerTests.cs (1)
76public class SpacePreTokenizer : PreTokenizer
75 references to PreTokenizer
Microsoft.ML.Tokenizers (66)
Model\BertTokenizer.cs (4)
28PreTokenizer? preTokenizer, 808PreTokenizer? preTokenizer = doBasicTokenization ? 809PreTokenizer.CreateWhiteSpaceOrPunctuationPreTokenizer(splitOnSpecialTokens ? specialTokens : null) : 810PreTokenizer.CreateWhiteSpacePreTokenizer();
Model\BPETokenizer.cs (9)
30private readonly PreTokenizer? _preTokenizer; 90=> Create(vocabFile, mergesFile, preTokenizer: PreTokenizer.CreateWordOrNonWordPreTokenizer(), normalizer: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false); 107PreTokenizer? preTokenizer = null, 134=> Create(vocabStream, mergesStream, preTokenizer: PreTokenizer.CreateWordOrNonWordPreTokenizer(), normalizer: null, addedTokens: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false); 151PreTokenizer? preTokenizer = null, 184PreTokenizer? preTokenizer = null, 217PreTokenizer? preTokenizer, 228_preTokenizer = preTokenizer ?? PreTokenizer.CreateWordOrNonWordPreTokenizer(); // Default to WordOrNonWord pre-tokenizer 285public override PreTokenizer? PreTokenizer => _preTokenizer;
Model\CodeGenTokenizer.cs (5)
32private readonly PreTokenizer? _preTokenizer; 55PreTokenizer? preTokenizer = null, 87PreTokenizer? preTokenizer = null, 100private CodeGenTokenizer(Stream vocabularyStream, Stream mergeStream, PreTokenizer? preTokenizer, Normalizer? normalizer, IReadOnlyDictionary<string, int>? addedTokens, bool addPrefixSpace, 249public override PreTokenizer? PreTokenizer => _preTokenizer;
Model\EnglishRobertaTokenizer.cs (7)
28private readonly PreTokenizer? _preTokenizer; 61PreTokenizer? preTokenizer = null, 92PreTokenizer? preTokenizer = null, 106internal EnglishRobertaTokenizer(string vocabularyPath, string mergePath, string highestOccurrenceMappingPath, PreTokenizer? preTokenizer = null, Normalizer? normalizer = null, bool filterUnsupportedChars = true) : 123internal EnglishRobertaTokenizer(Stream vocabularyStream, Stream mergeStream, Stream highestOccurrenceMappingStream, PreTokenizer? preTokenizer = null, Normalizer? normalizer = null, bool filterUnsupportedChars = true) : 128private EnglishRobertaTokenizer(Stream vocabularyStream, Stream mergeStream, Stream highestOccurrenceMappingStream, PreTokenizer? preTokenizer, Normalizer? normalizer, bool filterUnsupportedChars, bool disposeStream) 243public override PreTokenizer? PreTokenizer => _preTokenizer;
Model\Phi2Tokenizer.cs (2)
37PreTokenizer? preTokenizer = null, 68PreTokenizer? preTokenizer = null,
Model\SentencePieceTokenizer.cs (5)
159public override PreTokenizer? PreTokenizer => null; 281foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, _specialTokensRegex!)) 611foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, _specialTokensRegex!)) 944foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, _specialTokensRegex!)) 1302(int Offset, int Length)[] splits = PreTokenizer.SplitText(text, _specialTokensRegex!).ToArray();
Model\TiktokenTokenizer.cs (13)
33private readonly PreTokenizer? _preTokenizer; 46internal TiktokenTokenizer(string vocabFilePath, PreTokenizer? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens = null, Normalizer? normalizer = null, int cacheSize = LruCache<int[]>.DefaultCacheSize) : 61internal TiktokenTokenizer(Stream vocabStream, PreTokenizer? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens = null, Normalizer? normalizer = null, int cacheSize = LruCache<int[]>.DefaultCacheSize) : 80PreTokenizer? preTokenizer, 102private TiktokenTokenizer(Stream vocabStream, PreTokenizer? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens, Normalizer? normalizer, int cacheSize, bool disposeStream) 127public override PreTokenizer? PreTokenizer => _preTokenizer; 1188private static Regex Cl100kBaseRegex() => _cl100kBaseRegex ??= new Regex(Cl100kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(PreTokenizer.DefaultTimeOutInMilliseconds)); 1191internal static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(PreTokenizer.DefaultTimeOutInMilliseconds)); 1194internal static Regex O200kBaseRegex() => _o200kBaseRegex ??= new Regex(O200kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(PreTokenizer.DefaultTimeOutInMilliseconds)); 1257PreTokenizer? preTokenizer, 1274PreTokenizer? preTokenizer, 1292PreTokenizer? preTokenizer, 1321PreTokenizer? preTokenizer,
Model\WordPieceTokenizer.cs (10)
27private readonly PreTokenizer? _preTokenizer; 38PreTokenizer? preTokenizer, 77_preTokenizer = preTokenizer ?? PreTokenizer.CreateWhiteSpacePreTokenizer(specialTokens); 142PreTokenizer? preTokenizer = null, 166PreTokenizer? preTokenizer = null, 175PreTokenizer? preTokenizer, 220PreTokenizer? preTokenizer = null, 255PreTokenizer? preTokenizer = null, 266PreTokenizer? preTokenizer, 298public override PreTokenizer? PreTokenizer => _preTokenizer;
PreTokenizer\PreTokenizer.cs (9)
47private static PreTokenizer? _whiteSpaceOrPunctuationPreTokenizer; 56/// Create a new instance of the <see cref="PreTokenizer"/> class which split the text at the whitespace or punctuation characters. 60public static PreTokenizer CreateWhiteSpaceOrPunctuationPreTokenizer(IReadOnlyDictionary<string, int>? specialTokensEncoder = null) 72private static PreTokenizer? _wordOrNonWordPreTokenizer; 82/// Create a new instance of the <see cref="PreTokenizer"/> class which split the text at the word or non-word boundary. 87public static PreTokenizer CreateWordOrNonWordPreTokenizer(IReadOnlyDictionary<string, int>? specialTokensEncoder = null) 99private static PreTokenizer? _whiteSpacePreTokenizer; 109/// Create a new instance of the <see cref="PreTokenizer"/> class which split the text at the white spaces. 113public static PreTokenizer CreateWhiteSpacePreTokenizer(IReadOnlyDictionary<string, int>? specialTokensEncoder = null)
Tokenizer.cs (2)
24public virtual PreTokenizer? PreTokenizer => null; 433PreTokenizer? preTokenizer,
Microsoft.ML.Tokenizers.Tests (9)
BpeTests.cs (4)
254BpeTokenizer bpe = BpeTokenizer.Create(vocabFile: vocabFile, mergesFile: mergesFile, preTokenizer: PreTokenizer.CreateWordOrNonWordPreTokenizer(), normalizer: null, unknownToken: unknownToken, 503var bpeTokenizer = BpeTokenizer.Create(vocabStream, mergesStream, PreTokenizer.CreateWordOrNonWordPreTokenizer(addedTokens), normalizer: null, addedTokens: addedTokens, unknownToken: "<|endoftext|>"); 550internal static BpeTokenizer CreateEmptyBpe(PreTokenizer? preTokenizer = null, Normalizer? normalizer = null) 559vocabStream: emptyVocabStream, mergesStream: null, preTokenizer: preTokenizer ?? PreTokenizer.CreateWordOrNonWordPreTokenizer(), normalizer: normalizer, unknownToken: "Ukn");
PreTokenizerTests.cs (5)
21PreTokenizer.CreateWordOrNonWordPreTokenizer(), 28PreTokenizer.CreateWordOrNonWordPreTokenizer(), 35PreTokenizer.CreateWhiteSpacePreTokenizer(), 58public void TestPreTokenizer(PreTokenizer preTokenizer, string text, (int Offset, int Length)[] splits) 73Assert.Empty(PreTokenizer.CreateWordOrNonWordPreTokenizer().PreTokenize((string)null!));