4 types derived from PreTokenizer
Microsoft.ML.Tokenizers (3)
PreTokenizer\CompositePreTokenizer.cs (1)
16: public class CompositePreTokenizer : PreTokenizer
PreTokenizer\RegexPreTokenizer.cs (1)
16: public sealed partial class RegexPreTokenizer : PreTokenizer
PreTokenizer\RobertaPreTokenizer.cs (1)
13: public sealed partial class RobertaPreTokenizer : PreTokenizer
Microsoft.ML.Tokenizers.Tests (1)
PreTokenizerTests.cs (1)
76: public class SpacePreTokenizer : PreTokenizer
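Note: a minimal custom pre-tokenizer in the shape of the test-only SpacePreTokenizer above. This is an illustrative re-creation, not the test's actual code, and it assumes PreTokenizer declares two abstract PreTokenize overloads (string and ReadOnlySpan<char>); adjust if your version of the API differs.

```csharp
using System;
using System.Collections.Generic;
using Microsoft.ML.Tokenizers;

// Splits on ' ' and reports each run of non-space characters as an
// (Offset, Length) span into the original text.
public class SpacePreTokenizer : PreTokenizer
{
    public override IEnumerable<(int Offset, int Length)> PreTokenize(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            yield break; // null/empty input yields no splits
        }

        int start = -1;
        for (int i = 0; i < text.Length; i++)
        {
            if (text[i] == ' ')
            {
                if (start >= 0)
                {
                    yield return (start, i - start);
                    start = -1;
                }
            }
            else if (start < 0)
            {
                start = i;
            }
        }

        if (start >= 0)
        {
            yield return (start, text.Length - start);
        }
    }

    // The span overload cannot be an iterator, so it defers to the string one.
    public override IEnumerable<(int Offset, int Length)> PreTokenize(ReadOnlySpan<char> text)
        => PreTokenize(text.ToString());
}
```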
87 references to PreTokenizer
Microsoft.ML.Tokenizers (73)
Model\BertTokenizer.cs (2)
809: options.PreTokenizer ??= options.ApplyBasicTokenization ? PreTokenizer.CreateWordOrPunctuation(options.SplitOnSpecialTokens ? specialTokensDict : null) : PreTokenizer.CreateWhiteSpace();
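Note: the defaulting on line 809 picks a word-or-punctuation splitter when basic tokenization is enabled and a plain whitespace splitter otherwise. A hedged restatement (hypothetical helper; the real logic lives inside BertTokenizer):

```csharp
using System.Collections.Generic;
using Microsoft.ML.Tokenizers;

// Hypothetical local function mirroring BertTokenizer.cs:809.
PreTokenizer DefaultBertPreTokenizer(bool applyBasicTokenization, IReadOnlyDictionary<string, int>? specialTokens)
    => applyBasicTokenization
        ? PreTokenizer.CreateWordOrPunctuation(specialTokens) // split into word/punctuation runs, carving out special tokens
        : PreTokenizer.CreateWhiteSpace();                    // split on whitespace only
```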
Model\BpeOptions.cs (1)
51: public PreTokenizer? PreTokenizer { get; set; }
Model\BPETokenizer.cs (9)
30: private readonly PreTokenizer? _preTokenizer;
94: => Create(vocabFile, mergesFile, preTokenizer: PreTokenizer.CreateWordOrNonWord(), normalizer: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false);
114: PreTokenizer? preTokenizer = null,
204: => Create(vocabStream, mergesStream, preTokenizer: PreTokenizer.CreateWordOrNonWord(), normalizer: null, specialTokens: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false);
224: PreTokenizer? preTokenizer = null,
260: PreTokenizer? preTokenizer = null,
296: PreTokenizer? preTokenizer,
311: _preTokenizer = preTokenizer ?? PreTokenizer.CreateWordOrNonWord(); // Default to WordOrNonWord pre-tokenizer
418: public override PreTokenizer? PreTokenizer => _preTokenizer;
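Note: a usage sketch grounded in the Create overloads and the defaulting at line 311; the file paths and unknown token are placeholders. Passing preTokenizer: null yields the same WordOrNonWord default.

```csharp
using Microsoft.ML.Tokenizers;

// Create a BPE tokenizer with an explicit pre-tokenizer (placeholder paths).
BpeTokenizer bpe = BpeTokenizer.Create(
    vocabFile: "vocab.json",
    mergesFile: "merges.txt",
    preTokenizer: PreTokenizer.CreateWordOrNonWord(),
    normalizer: null,
    unknownToken: "<unk>");
```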
Model\CodeGenTokenizer.cs (5)
35: private readonly PreTokenizer? _preTokenizer;
58: PreTokenizer? preTokenizer = null,
90: PreTokenizer? preTokenizer = null,
103: private CodeGenTokenizer(Stream vocabularyStream, Stream mergeStream, PreTokenizer? preTokenizer, Normalizer? normalizer, IReadOnlyDictionary<string, int>? specialTokens, bool addPrefixSpace,
252: public override PreTokenizer? PreTokenizer => _preTokenizer;
Model\EnglishRobertaTokenizer.cs (7)
28: private readonly PreTokenizer? _preTokenizer;
67: PreTokenizer? preTokenizer = null,
104: PreTokenizer? preTokenizer = null,
118: internal EnglishRobertaTokenizer(string vocabularyPath, string mergePath, string highestOccurrenceMappingPath, PreTokenizer? preTokenizer = null, Normalizer? normalizer = null, bool filterUnsupportedChars = true) :
135: internal EnglishRobertaTokenizer(Stream vocabularyStream, Stream mergeStream, Stream highestOccurrenceMappingStream, PreTokenizer? preTokenizer = null, Normalizer? normalizer = null, bool filterUnsupportedChars = true) :
140: private EnglishRobertaTokenizer(Stream vocabularyStream, Stream mergeStream, Stream highestOccurrenceMappingStream, PreTokenizer? preTokenizer, Normalizer? normalizer, bool filterUnsupportedChars, bool disposeStream)
255: public override PreTokenizer? PreTokenizer => _preTokenizer;
Model\Phi2Tokenizer.cs (2)
37: PreTokenizer? preTokenizer = null,
68: PreTokenizer? preTokenizer = null,
Model\SentencePieceBpeModel.cs (4)
116: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
375: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
652: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
907: (int Offset, int Length)[] splits = PreTokenizer.SplitText(text, SpecialTokensRegex!).ToArray();
Model\SentencePieceTokenizer.cs (1)
101: public override PreTokenizer? PreTokenizer => null;
Model\SentencePieceUnigramModel.cs (4)
228: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
651: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
1022: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
1283: (int Offset, int Length)[] splits = PreTokenizer.SplitText(text, SpecialTokensRegex!).ToArray();
Model\TiktokenTokenizer.cs (13)
33: private readonly PreTokenizer? _preTokenizer;
46: internal TiktokenTokenizer(string vocabFilePath, PreTokenizer? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens = null, Normalizer? normalizer = null, int cacheSize = LruCache<int[]>.DefaultCacheSize) :
61: internal TiktokenTokenizer(Stream vocabStream, PreTokenizer? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens = null, Normalizer? normalizer = null, int cacheSize = LruCache<int[]>.DefaultCacheSize) :
80: PreTokenizer? preTokenizer,
102: private TiktokenTokenizer(Stream vocabStream, PreTokenizer? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens, Normalizer? normalizer, int cacheSize, bool disposeStream)
127: public override PreTokenizer? PreTokenizer => _preTokenizer;
1203: private static Regex Cl100kBaseRegex() => _cl100kBaseRegex ??= new Regex(Cl100kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(PreTokenizer.DefaultTimeOutInMilliseconds));
1206: internal static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(PreTokenizer.DefaultTimeOutInMilliseconds));
1209: internal static Regex O200kBaseRegex() => _o200kBaseRegex ??= new Regex(O200kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(PreTokenizer.DefaultTimeOutInMilliseconds));
1275: PreTokenizer? preTokenizer,
1295: PreTokenizer? preTokenizer,
1316: PreTokenizer? preTokenizer,
1348: PreTokenizer? preTokenizer,
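Note: the Tiktoken constructors above are internal; the usual public entry points are the CreateForModel/CreateForEncoding factories, which wire in the per-encoding regex pre-tokenizers (cl100k_base, p50k_base, o200k_base above). Sketch, with an illustrative model name; depending on the package version you may also need the matching Microsoft.ML.Tokenizers.Data.* vocabulary package:

```csharp
using Microsoft.ML.Tokenizers;

TiktokenTokenizer tiktoken = TiktokenTokenizer.CreateForModel("gpt-4o");
PreTokenizer? pre = tiktoken.PreTokenizer; // the override at line 127
```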
Model\WordPieceOptions.cs (1)
21: public PreTokenizer? PreTokenizer { get; set; }
Model\WordPieceTokenizer.cs (3)
27: private readonly PreTokenizer? _preTokenizer;
72: _preTokenizer = options.PreTokenizer ?? PreTokenizer.CreateWhiteSpace(options.SpecialTokens);
243: public override PreTokenizer? PreTokenizer => _preTokenizer;
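Note: per line 72, WordPiece falls back to a whitespace pre-tokenizer unless one is supplied via the options. The Create overload taking a vocab path plus options is assumed here; check the overloads available in your version.

```csharp
using Microsoft.ML.Tokenizers;

var options = new WordPieceOptions
{
    PreTokenizer = PreTokenizer.CreateWordOrPunctuation(),
};
// Assumed overload; "vocab.txt" is a placeholder path.
WordPieceTokenizer wordPiece = WordPieceTokenizer.Create("vocab.txt", options);
```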
PreTokenizer\CompositePreTokenizer.cs (10)
19: private readonly IReadOnlyList<PreTokenizer> _preTokenizers;
31: public CompositePreTokenizer(IReadOnlyList<PreTokenizer> preTokenizers, IReadOnlyDictionary<string, int>? specialTokens = null)
44: foreach (var preTokenizer in preTokenizers)
54: var list = new List<PreTokenizer>(specialTokens.Count + 1);
58: foreach (var preTokenizer in preTokenizers)
74: public IReadOnlyList<PreTokenizer> PreTokenizers => _preTokenizers;
90: static IEnumerable<(int Offset, int Length)> SplitText(string text, IReadOnlyList<PreTokenizer> preTokenizers, int preTokenizerIndex, int offset, int length)
93: var preTokenizer = preTokenizers[preTokenizerIndex];
168: static IEnumerable<(int Offset, int Length)> SplitText(char[] text, IReadOnlyList<PreTokenizer> preTokenizers, int preTokenizerIndex, int offset, int length)
171: var preTokenizer = preTokenizers[preTokenizerIndex];
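Note: a construction sketch grounded in the constructor at line 31. Each pre-tokenizer in the list re-splits the segments produced by the previous one (the recursive SplitText above); special tokens, if provided, are carved out first. The token id is a placeholder.

```csharp
using System.Collections.Generic;
using Microsoft.ML.Tokenizers;

var specialTokens = new Dictionary<string, int> { ["<|endoftext|>"] = 0 };
var composite = new CompositePreTokenizer(
    new List<PreTokenizer>
    {
        PreTokenizer.CreateWhiteSpace(),
        PreTokenizer.CreateWordOrPunctuation(),
    },
    specialTokens);
```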
PreTokenizer\PreTokenizer.cs (9)
47: private static PreTokenizer? _whiteSpaceOrPunctuationPreTokenizer;
56: /// Create a new instance of the <see cref="PreTokenizer"/> class which split the text at the whitespace or punctuation characters.
63: public static PreTokenizer CreateWordOrPunctuation(IReadOnlyDictionary<string, int>? specialTokens = null)
75: private static PreTokenizer? _wordOrNonWordPreTokenizer;
85: /// Create a new instance of the <see cref="PreTokenizer"/> class which split the text at the word or non-word boundary.
93: public static PreTokenizer CreateWordOrNonWord(IReadOnlyDictionary<string, int>? specialTokens = null)
105: private static PreTokenizer? _whiteSpacePreTokenizer;
115: /// Create a new instance of the <see cref="PreTokenizer"/> class which split the text at the white spaces.
122: public static PreTokenizer CreateWhiteSpace(IReadOnlyDictionary<string, int>? specialTokens = null)
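Note: the static fields above suggest each factory caches its default (no special tokens) instance. A minimal usage sketch; PreTokenize yields (Offset, Length) spans into the original text rather than substrings:

```csharp
using System;
using Microsoft.ML.Tokenizers;

string text = "Hello, world!";
PreTokenizer pre = PreTokenizer.CreateWordOrNonWord();
foreach ((int Offset, int Length) in pre.PreTokenize(text))
{
    Console.WriteLine($"{Offset},{Length}: '{text.Substring(Offset, Length)}'");
}
// CreateWordOrPunctuation and CreateWhiteSpace are used the same way and
// differ only in where they place the boundaries.
```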
Tokenizer.cs (2)
24: public virtual PreTokenizer? PreTokenizer => null;
433: PreTokenizer? preTokenizer,
Microsoft.ML.Tokenizers.Tests (14)
BpeTests.cs (9)
256: BpeTokenizer bpe = BpeTokenizer.Create(vocabFile: vocabFile, mergesFile: mergesFile, preTokenizer: PreTokenizer.CreateWordOrNonWord(), normalizer: null, unknownToken: unknownToken,
273: PreTokenizer = PreTokenizer.CreateWordOrNonWord(),
536: var bpeTokenizer = BpeTokenizer.Create(vocabStream, mergesStream, PreTokenizer.CreateWordOrNonWord(specialTokens), normalizer: null, specialTokens: specialTokens, unknownToken: "<|endoftext|>");
583: internal static BpeTokenizer CreateEmptyBpe(PreTokenizer? preTokenizer = null, Normalizer? normalizer = null)
592: vocabStream: emptyVocabStream, mergesStream: null, preTokenizer: preTokenizer ?? PreTokenizer.CreateWordOrNonWord(), normalizer: normalizer, unknownToken: "Ukn");
918: IReadOnlyList<PreTokenizer>? preTokenizers = GetPreTokenizer(root, out bool byteLevel);
955: private static IReadOnlyList<PreTokenizer>? GetPreTokenizer(JsonElement root, out bool byteLevel)
958: List<PreTokenizer> preTokenizers = new List<PreTokenizer>();
PreTokenizerTests.cs (5)
21: PreTokenizer.CreateWordOrNonWord(),
28: PreTokenizer.CreateWordOrNonWord(),
35: PreTokenizer.CreateWhiteSpace(),
58: public void TestPreTokenizer(PreTokenizer preTokenizer, string text, (int Offset, int Length)[] splits)
73: Assert.Empty(PreTokenizer.CreateWordOrNonWord().PreTokenize((string)null!));
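Note: as the last test above shows, PreTokenize on a null string yields an empty enumeration rather than throwing. A quick check:

```csharp
using System;
using System.Linq;
using Microsoft.ML.Tokenizers;

bool noSplits = !PreTokenizer.CreateWordOrNonWord().PreTokenize((string)null!).Any();
Console.WriteLine(noSplits); // True
```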