8 references to P50kBaseRegex
Microsoft.ML.Tokenizers (8)
Model\CodeGenTokenizer.cs (1)
1897
new RegexPreTokenizer(TiktokenTokenizer.
P50kBaseRegex
(), CodeGenTokenizer.CodeGenSpecialTokens),
Model\Phi2Tokenizer.cs (1)
117
vocabStream, mergesStream, new RegexPreTokenizer(TiktokenTokenizer.
P50kBaseRegex
(), CodeGenTokenizer.CodeGenSpecialTokens), normalizer: null,
Model\TiktokenTokenizer.cs (4)
1127
return (new Dictionary<string, int> { { EndOfText, 50256 }, },
P50kBaseRegex
(), GPT2File, Type.GetType(Gpt2TypeName), Gpt2PackageName);
1133
return (new Dictionary<string, int> { { EndOfText, 50256 } },
P50kBaseRegex
(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName);
1137
{ { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } },
P50kBaseRegex
(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName);
1140
return (new Dictionary<string, int> { { EndOfText, 50256 } },
P50kBaseRegex
(), R50RanksFile, Type.GetType(R50kBaseTypeName), R50kBasePackageName);
PreTokenizer\RobertaPreTokenizer.cs (2)
32
return SplitText(text, TiktokenTokenizer.
P50kBaseRegex
());
47
return SplitText(text, TiktokenTokenizer.
P50kBaseRegex
());