8 references to P50kBaseRegex
Microsoft.ML.Tokenizers (8)
Model\CodeGenTokenizer.cs (1)
1895: new TiktokenPreTokenizer(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenAddedTokens),
Model\Phi2Tokenizer.cs (1)
116: vocabStream, mergesStream, new TiktokenPreTokenizer(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenAddedTokens), normalizer: null,
Model\TiktokenTokenizer.cs (4)
1128: return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile);
1132: { { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile);
1135: return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile);
1138: return (new Dictionary<string, int> { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File);
PreTokenizer\RobertaPreTokenizer.cs (2)
32: return SplitText(text, TiktokenTokenizer.P50kBaseRegex());
47: return SplitText(text, TiktokenTokenizer.P50kBaseRegex());