12 references to RegexPreTokenizer
Microsoft.ML.GenAI.LLaMA (1)
LlamaTokenizerHelper.cs (1)
52: var preTokenizer = new RegexPreTokenizer(new Regex(_re), _specialTokens);
Microsoft.ML.Tokenizers (11)
Model\CodeGenTokenizer.cs (1)
1897: new RegexPreTokenizer(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens),
Model\Phi2Tokenizer.cs (1)
117: vocabStream, mergesStream, new RegexPreTokenizer(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens), normalizer: null,
Model\TiktokenTokenizer.cs (3)
1240: new RegexPreTokenizer(tiktokenConfiguration.Regex, tiktokenConfiguration.SpecialTokens),
1380: new RegexPreTokenizer(tiktokenConfiguration.Regex, tiktokenConfiguration.SpecialTokens),
1420: new RegexPreTokenizer(tiktokenConfiguration.Regex, tiktokenConfiguration.SpecialTokens),
PreTokenizer\PreTokenizer.cs (6)
68: return _whiteSpaceOrPunctuationPreTokenizer ??= new RegexPreTokenizer(WhiteSpaceOrPunctuationRegex(), null);
71: return new RegexPreTokenizer(WhiteSpaceOrPunctuationRegex(), specialTokens);
98: return _wordOrNonWordPreTokenizer ??= new RegexPreTokenizer(WordOrNonWordRegex(), null);
101: return new RegexPreTokenizer(WordOrNonWordRegex(), specialTokens);
127: return _whiteSpacePreTokenizer ??= new RegexPreTokenizer(WhiteSpaceRegex(), null);
130: return new RegexPreTokenizer(WhiteSpaceRegex(), specialTokens);