8 references to P50kBaseRegex
Microsoft.ML.Tokenizers (8)
Model\CodeGenTokenizer.cs (1)
1897new RegexPreTokenizer(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens),
Model\Phi2Tokenizer.cs (1)
117vocabStream, mergesStream, new RegexPreTokenizer(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens), normalizer: null,
Model\TiktokenTokenizer.cs (4)
1127return (new Dictionary<string, int> { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File, Type.GetType(Gpt2TypeName), Gpt2PackageName); 1133return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName); 1137{ { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName); 1140return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile, Type.GetType(R50kBaseTypeName), R50kBasePackageName);
PreTokenizer\RobertaPreTokenizer.cs (2)
32return SplitText(text, TiktokenTokenizer.P50kBaseRegex()); 47return SplitText(text, TiktokenTokenizer.P50kBaseRegex());