8 references to P50kBaseRegex
Microsoft.ML.Tokenizers (8)
Model\CodeGenTokenizer.cs (1)
1895
new TiktokenPreTokenizer(TiktokenTokenizer.
P50kBaseRegex
(), CodeGenTokenizer.CodeGenAddedTokens),
Model\Phi2Tokenizer.cs (1)
116
vocabStream, mergesStream, new TiktokenPreTokenizer(TiktokenTokenizer.
P50kBaseRegex
(), CodeGenTokenizer.CodeGenAddedTokens), normalizer: null,
Model\TiktokenTokenizer.cs (4)
1128
return (new Dictionary<string, int> { { EndOfText, 50256 } },
P50kBaseRegex
(), P50RanksFile);
1132
{ { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } },
P50kBaseRegex
(), P50RanksFile);
1135
return (new Dictionary<string, int> { { EndOfText, 50256 } },
P50kBaseRegex
(), R50RanksFile);
1138
return (new Dictionary<string, int> { { EndOfText, 50256 }, },
P50kBaseRegex
(), GPT2File);
PreTokenizer\RobertaPreTokenizer.cs (2)
32
return SplitText(text, TiktokenTokenizer.
P50kBaseRegex
());
47
return SplitText(text, TiktokenTokenizer.
P50kBaseRegex
());