14 references to RegexPreTokenizer
Microsoft.ML.GenAI.LLaMA (1)
LlamaTokenizerHelper.cs (1)
52var preTokenizer = new RegexPreTokenizer(new Regex(_re), _specialTokens);
Microsoft.ML.Tokenizers (12)
Model\CodeGenTokenizer.cs (1)
1879new RegexPreTokenizer(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens),
Model\Phi2Tokenizer.cs (1)
117vocabStream, mergesStream, new RegexPreTokenizer(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens), normalizer: null,
Model\TiktokenTokenizer.cs (3)
1255new RegexPreTokenizer(tiktokenConfiguration.Regex, tiktokenConfiguration.SpecialTokens), 1395new RegexPreTokenizer(tiktokenConfiguration.Regex, tiktokenConfiguration.SpecialTokens), 1435new RegexPreTokenizer(tiktokenConfiguration.Regex, tiktokenConfiguration.SpecialTokens),
PreTokenizer\CompositePreTokenizer.cs (1)
56list.Add(new RegexPreTokenizer(new Regex(string.Join("|", specialTokens.Keys.Select(s => Regex.Escape(s))), RegexOptions.Compiled), null));
PreTokenizer\PreTokenizer.cs (6)
68return _whiteSpaceOrPunctuationPreTokenizer ??= new RegexPreTokenizer(WhiteSpaceOrPunctuationRegex(), null); 71return new RegexPreTokenizer(WhiteSpaceOrPunctuationRegex(), specialTokens); 98return _wordOrNonWordPreTokenizer ??= new RegexPreTokenizer(WordOrNonWordRegex(), null); 101return new RegexPreTokenizer(WordOrNonWordRegex(), specialTokens); 127return _whiteSpacePreTokenizer ??= new RegexPreTokenizer(WhiteSpaceRegex(), null); 130return new RegexPreTokenizer(WhiteSpaceRegex(), specialTokens);
Microsoft.ML.Tokenizers.Tests (1)
BpeTests.cs (1)
984preTokenizers.Add(new RegexPreTokenizer(new Regex(pattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(DefaultTimeOutInMilliseconds)), null));