12 references to RegexPreTokenizer
Microsoft.ML.GenAI.LLaMA (1)
LlamaTokenizerHelper.cs (1)
52
var preTokenizer = new
RegexPreTokenizer
(new Regex(_re), _specialTokens);
Microsoft.ML.Tokenizers (11)
Model\CodeGenTokenizer.cs (1)
1897
new
RegexPreTokenizer
(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens),
Model\Phi2Tokenizer.cs (1)
117
vocabStream, mergesStream, new
RegexPreTokenizer
(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens), normalizer: null,
Model\TiktokenTokenizer.cs (3)
1240
new
RegexPreTokenizer
(tiktokenConfiguration.Regex, tiktokenConfiguration.SpecialTokens),
1380
new
RegexPreTokenizer
(tiktokenConfiguration.Regex, tiktokenConfiguration.SpecialTokens),
1420
new
RegexPreTokenizer
(tiktokenConfiguration.Regex, tiktokenConfiguration.SpecialTokens),
PreTokenizer\PreTokenizer.cs (6)
68
return _whiteSpaceOrPunctuationPreTokenizer ??= new
RegexPreTokenizer
(WhiteSpaceOrPunctuationRegex(), null);
71
return new
RegexPreTokenizer
(WhiteSpaceOrPunctuationRegex(), specialTokens);
98
return _wordOrNonWordPreTokenizer ??= new
RegexPreTokenizer
(WordOrNonWordRegex(), null);
101
return new
RegexPreTokenizer
(WordOrNonWordRegex(), specialTokens);
127
return _whiteSpacePreTokenizer ??= new
RegexPreTokenizer
(WhiteSpaceRegex(), null);
130
return new
RegexPreTokenizer
(WhiteSpaceRegex(), specialTokens);