3 types derived from PreTokenizer
Microsoft.ML.Tokenizers (2)
PreTokenizer\RegexPreTokenizer.cs (1)
16
public sealed partial class RegexPreTokenizer :
PreTokenizer
PreTokenizer\RobertaPreTokenizer.cs (1)
13
public sealed partial class RobertaPreTokenizer :
PreTokenizer
Microsoft.ML.Tokenizers.Tests (1)
PreTokenizerTests.cs (1)
76
public class SpacePreTokenizer :
PreTokenizer
75 references to PreTokenizer
Microsoft.ML.Tokenizers (66)
Model\BertTokenizer.cs (4)
28
PreTokenizer
? preTokenizer,
808
PreTokenizer
? preTokenizer = doBasicTokenization ?
809
PreTokenizer
.CreateWhiteSpaceOrPunctuationPreTokenizer(splitOnSpecialTokens ? specialTokens : null) :
810
PreTokenizer
.CreateWhiteSpacePreTokenizer();
Model\BPETokenizer.cs (9)
30
private readonly
PreTokenizer
? _preTokenizer;
90
=> Create(vocabFile, mergesFile, preTokenizer:
PreTokenizer
.CreateWordOrNonWordPreTokenizer(), normalizer: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false);
107
PreTokenizer
? preTokenizer = null,
134
=> Create(vocabStream, mergesStream, preTokenizer:
PreTokenizer
.CreateWordOrNonWordPreTokenizer(), normalizer: null, addedTokens: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false);
151
PreTokenizer
? preTokenizer = null,
184
PreTokenizer
? preTokenizer = null,
217
PreTokenizer
? preTokenizer,
228
_preTokenizer = preTokenizer ??
PreTokenizer
.CreateWordOrNonWordPreTokenizer(); // Default to WordOrNonWord pre-tokenizer
285
public override
PreTokenizer
? PreTokenizer => _preTokenizer;
Model\CodeGenTokenizer.cs (5)
32
private readonly
PreTokenizer
? _preTokenizer;
55
PreTokenizer
? preTokenizer = null,
87
PreTokenizer
? preTokenizer = null,
100
private CodeGenTokenizer(Stream vocabularyStream, Stream mergeStream,
PreTokenizer
? preTokenizer, Normalizer? normalizer, IReadOnlyDictionary<string, int>? addedTokens, bool addPrefixSpace,
249
public override
PreTokenizer
? PreTokenizer => _preTokenizer;
Model\EnglishRobertaTokenizer.cs (7)
28
private readonly
PreTokenizer
? _preTokenizer;
61
PreTokenizer
? preTokenizer = null,
92
PreTokenizer
? preTokenizer = null,
106
internal EnglishRobertaTokenizer(string vocabularyPath, string mergePath, string highestOccurrenceMappingPath,
PreTokenizer
? preTokenizer = null, Normalizer? normalizer = null, bool filterUnsupportedChars = true) :
123
internal EnglishRobertaTokenizer(Stream vocabularyStream, Stream mergeStream, Stream highestOccurrenceMappingStream,
PreTokenizer
? preTokenizer = null, Normalizer? normalizer = null, bool filterUnsupportedChars = true) :
128
private EnglishRobertaTokenizer(Stream vocabularyStream, Stream mergeStream, Stream highestOccurrenceMappingStream,
PreTokenizer
? preTokenizer, Normalizer? normalizer, bool filterUnsupportedChars, bool disposeStream)
243
public override
PreTokenizer
? PreTokenizer => _preTokenizer;
Model\Phi2Tokenizer.cs (2)
37
PreTokenizer
? preTokenizer = null,
68
PreTokenizer
? preTokenizer = null,
Model\SentencePieceTokenizer.cs (5)
159
public override
PreTokenizer
? PreTokenizer => null;
281
foreach ((int Offset, int Length) in
PreTokenizer
.SplitText(text, _specialTokensRegex!))
611
foreach ((int Offset, int Length) in
PreTokenizer
.SplitText(text, _specialTokensRegex!))
944
foreach ((int Offset, int Length) in
PreTokenizer
.SplitText(text, _specialTokensRegex!))
1302
(int Offset, int Length)[] splits =
PreTokenizer
.SplitText(text, _specialTokensRegex!).ToArray();
Model\TiktokenTokenizer.cs (13)
33
private readonly
PreTokenizer
? _preTokenizer;
46
internal TiktokenTokenizer(string vocabFilePath,
PreTokenizer
? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens = null, Normalizer? normalizer = null, int cacheSize = LruCache<int[]>.DefaultCacheSize) :
61
internal TiktokenTokenizer(Stream vocabStream,
PreTokenizer
? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens = null, Normalizer? normalizer = null, int cacheSize = LruCache<int[]>.DefaultCacheSize) :
80
PreTokenizer
? preTokenizer,
102
private TiktokenTokenizer(Stream vocabStream,
PreTokenizer
? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens, Normalizer? normalizer, int cacheSize, bool disposeStream)
127
public override
PreTokenizer
? PreTokenizer => _preTokenizer;
1188
private static Regex Cl100kBaseRegex() => _cl100kBaseRegex ??= new Regex(Cl100kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(
PreTokenizer
.DefaultTimeOutInMilliseconds));
1191
internal static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(
PreTokenizer
.DefaultTimeOutInMilliseconds));
1194
internal static Regex O200kBaseRegex() => _o200kBaseRegex ??= new Regex(O200kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(
PreTokenizer
.DefaultTimeOutInMilliseconds));
1257
PreTokenizer
? preTokenizer,
1274
PreTokenizer
? preTokenizer,
1292
PreTokenizer
? preTokenizer,
1321
PreTokenizer
? preTokenizer,
Model\WordPieceTokenizer.cs (10)
27
private readonly
PreTokenizer
? _preTokenizer;
38
PreTokenizer
? preTokenizer,
77
_preTokenizer = preTokenizer ??
PreTokenizer
.CreateWhiteSpacePreTokenizer(specialTokens);
142
PreTokenizer
? preTokenizer = null,
166
PreTokenizer
? preTokenizer = null,
175
PreTokenizer
? preTokenizer,
220
PreTokenizer
? preTokenizer = null,
255
PreTokenizer
? preTokenizer = null,
266
PreTokenizer
? preTokenizer,
298
public override
PreTokenizer
? PreTokenizer => _preTokenizer;
PreTokenizer\PreTokenizer.cs (9)
47
private static
PreTokenizer
? _whiteSpaceOrPunctuationPreTokenizer;
56
/// Create a new instance of the <see cref="
PreTokenizer
"/> class which split the text at the whitespace or punctuation characters.
60
public static
PreTokenizer
CreateWhiteSpaceOrPunctuationPreTokenizer(IReadOnlyDictionary<string, int>? specialTokensEncoder = null)
72
private static
PreTokenizer
? _wordOrNonWordPreTokenizer;
82
/// Create a new instance of the <see cref="
PreTokenizer
"/> class which split the text at the word or non-word boundary.
87
public static
PreTokenizer
CreateWordOrNonWordPreTokenizer(IReadOnlyDictionary<string, int>? specialTokensEncoder = null)
99
private static
PreTokenizer
? _whiteSpacePreTokenizer;
109
/// Create a new instance of the <see cref="
PreTokenizer
"/> class which split the text at the white spaces.
113
public static
PreTokenizer
CreateWhiteSpacePreTokenizer(IReadOnlyDictionary<string, int>? specialTokensEncoder = null)
Tokenizer.cs (2)
24
public virtual
PreTokenizer
? PreTokenizer => null;
433
PreTokenizer
? preTokenizer,
Microsoft.ML.Tokenizers.Tests (9)
BpeTests.cs (4)
254
BpeTokenizer bpe = BpeTokenizer.Create(vocabFile: vocabFile, mergesFile: mergesFile, preTokenizer:
PreTokenizer
.CreateWordOrNonWordPreTokenizer(), normalizer: null, unknownToken: unknownToken,
503
var bpeTokenizer = BpeTokenizer.Create(vocabStream, mergesStream,
PreTokenizer
.CreateWordOrNonWordPreTokenizer(addedTokens), normalizer: null, addedTokens: addedTokens, unknownToken: "<|endoftext|>");
550
internal static BpeTokenizer CreateEmptyBpe(
PreTokenizer
? preTokenizer = null, Normalizer? normalizer = null)
559
vocabStream: emptyVocabStream, mergesStream: null, preTokenizer: preTokenizer ??
PreTokenizer
.CreateWordOrNonWordPreTokenizer(), normalizer: normalizer, unknownToken: "Ukn");
PreTokenizerTests.cs (5)
21
PreTokenizer
.CreateWordOrNonWordPreTokenizer(),
28
PreTokenizer
.CreateWordOrNonWordPreTokenizer(),
35
PreTokenizer
.CreateWhiteSpacePreTokenizer(),
58
public void TestPreTokenizer(
PreTokenizer
preTokenizer, string text, (int Offset, int Length)[] splits)
73
Assert.Empty(
PreTokenizer
.CreateWordOrNonWordPreTokenizer().PreTokenize((string)null!));