3 types derived from PreTokenizer
Microsoft.ML.Tokenizers (2)
PreTokenizer\RegexPreTokenizer.cs (1)
16
public sealed partial class RegexPreTokenizer :
PreTokenizer
PreTokenizer\RobertaPreTokenizer.cs (1)
13
public sealed partial class RobertaPreTokenizer :
PreTokenizer
Microsoft.ML.Tokenizers.Tests (1)
PreTokenizerTests.cs (1)
76
public class SpacePreTokenizer :
PreTokenizer
67 references to PreTokenizer
Microsoft.ML.Tokenizers (58)
Model\BertTokenizer.cs (2)
801
options.PreTokenizer ??= options.ApplyBasicTokenization ?
PreTokenizer
.CreateWordOrPunctuation(options.SplitOnSpecialTokens ? options.SpecialTokens : null) :
PreTokenizer
.CreateWhiteSpace();
Model\BPETokenizer.cs (9)
30
private readonly
PreTokenizer
? _preTokenizer;
93
=> Create(vocabFile, mergesFile, preTokenizer:
PreTokenizer
.CreateWordOrNonWord(), normalizer: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false);
113
PreTokenizer
? preTokenizer = null,
143
=> Create(vocabStream, mergesStream, preTokenizer:
PreTokenizer
.CreateWordOrNonWord(), normalizer: null, specialTokens: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false);
163
PreTokenizer
? preTokenizer = null,
199
PreTokenizer
? preTokenizer = null,
232
PreTokenizer
? preTokenizer,
243
_preTokenizer = preTokenizer ??
PreTokenizer
.CreateWordOrNonWord(); // Default to WordOrNonWord pre-tokenizer
300
public override
PreTokenizer
? PreTokenizer => _preTokenizer;
Model\CodeGenTokenizer.cs (5)
32
private readonly
PreTokenizer
? _preTokenizer;
55
PreTokenizer
? preTokenizer = null,
87
PreTokenizer
? preTokenizer = null,
100
private CodeGenTokenizer(Stream vocabularyStream, Stream mergeStream,
PreTokenizer
? preTokenizer, Normalizer? normalizer, IReadOnlyDictionary<string, int>? specialTokens, bool addPrefixSpace,
249
public override
PreTokenizer
? PreTokenizer => _preTokenizer;
Model\EnglishRobertaTokenizer.cs (7)
28
private readonly
PreTokenizer
? _preTokenizer;
67
PreTokenizer
? preTokenizer = null,
104
PreTokenizer
? preTokenizer = null,
118
internal EnglishRobertaTokenizer(string vocabularyPath, string mergePath, string highestOccurrenceMappingPath,
PreTokenizer
? preTokenizer = null, Normalizer? normalizer = null, bool filterUnsupportedChars = true) :
135
internal EnglishRobertaTokenizer(Stream vocabularyStream, Stream mergeStream, Stream highestOccurrenceMappingStream,
PreTokenizer
? preTokenizer = null, Normalizer? normalizer = null, bool filterUnsupportedChars = true) :
140
private EnglishRobertaTokenizer(Stream vocabularyStream, Stream mergeStream, Stream highestOccurrenceMappingStream,
PreTokenizer
? preTokenizer, Normalizer? normalizer, bool filterUnsupportedChars, bool disposeStream)
255
public override
PreTokenizer
? PreTokenizer => _preTokenizer;
Model\Phi2Tokenizer.cs (2)
37
PreTokenizer
? preTokenizer = null,
68
PreTokenizer
? preTokenizer = null,
Model\SentencePieceTokenizer.cs (5)
159
public override
PreTokenizer
? PreTokenizer => null;
281
foreach ((int Offset, int Length) in
PreTokenizer
.SplitText(text, _specialTokensRegex!))
611
foreach ((int Offset, int Length) in
PreTokenizer
.SplitText(text, _specialTokensRegex!))
944
foreach ((int Offset, int Length) in
PreTokenizer
.SplitText(text, _specialTokensRegex!))
1302
(int Offset, int Length)[] splits =
PreTokenizer
.SplitText(text, _specialTokensRegex!).ToArray();
Model\TiktokenTokenizer.cs (13)
33
private readonly
PreTokenizer
? _preTokenizer;
46
internal TiktokenTokenizer(string vocabFilePath,
PreTokenizer
? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens = null, Normalizer? normalizer = null, int cacheSize = LruCache<int[]>.DefaultCacheSize) :
61
internal TiktokenTokenizer(Stream vocabStream,
PreTokenizer
? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens = null, Normalizer? normalizer = null, int cacheSize = LruCache<int[]>.DefaultCacheSize) :
80
PreTokenizer
? preTokenizer,
102
private TiktokenTokenizer(Stream vocabStream,
PreTokenizer
? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens, Normalizer? normalizer, int cacheSize, bool disposeStream)
127
public override
PreTokenizer
? PreTokenizer => _preTokenizer;
1178
[GeneratedRegex(Cl100kBaseRegexPattern, RegexOptions.None,
PreTokenizer
.DefaultTimeOutInMilliseconds)]
1181
[GeneratedRegex(P50kBaseRegexPattern, RegexOptions.None,
PreTokenizer
.DefaultTimeOutInMilliseconds)]
1184
[GeneratedRegex(O200kBaseRegexPattern, RegexOptions.None,
PreTokenizer
.DefaultTimeOutInMilliseconds)]
1260
PreTokenizer
? preTokenizer,
1280
PreTokenizer
? preTokenizer,
1301
PreTokenizer
? preTokenizer,
1333
PreTokenizer
? preTokenizer,
Model\WordPieceOptions.cs (1)
21
public
PreTokenizer
? PreTokenizer { get; set; }
Model\WordPieceTokenizer.cs (3)
27
private readonly
PreTokenizer
? _preTokenizer;
72
_preTokenizer = options.PreTokenizer ??
PreTokenizer
.CreateWhiteSpace(options.SpecialTokens);
243
public override
PreTokenizer
? PreTokenizer => _preTokenizer;
PreTokenizer\PreTokenizer.cs (9)
47
private static
PreTokenizer
? _whiteSpaceOrPunctuationPreTokenizer;
56
/// Create a new instance of the <see cref="
PreTokenizer
"/> class which split the text at the whitespace or punctuation characters.
63
public static
PreTokenizer
CreateWordOrPunctuation(IReadOnlyDictionary<string, int>? specialTokens = null)
75
private static
PreTokenizer
? _wordOrNonWordPreTokenizer;
85
/// Create a new instance of the <see cref="
PreTokenizer
"/> class which split the text at the word or non-word boundary.
93
public static
PreTokenizer
CreateWordOrNonWord(IReadOnlyDictionary<string, int>? specialTokens = null)
105
private static
PreTokenizer
? _whiteSpacePreTokenizer;
115
/// Create a new instance of the <see cref="
PreTokenizer
"/> class which split the text at the white spaces.
122
public static
PreTokenizer
CreateWhiteSpace(IReadOnlyDictionary<string, int>? specialTokens = null)
Tokenizer.cs (2)
24
public virtual
PreTokenizer
? PreTokenizer => null;
433
PreTokenizer
? preTokenizer,
Microsoft.ML.Tokenizers.Tests (9)
BpeTests.cs (4)
254
BpeTokenizer bpe = BpeTokenizer.Create(vocabFile: vocabFile, mergesFile: mergesFile, preTokenizer:
PreTokenizer
.CreateWordOrNonWord(), normalizer: null, unknownToken: unknownToken,
503
var bpeTokenizer = BpeTokenizer.Create(vocabStream, mergesStream,
PreTokenizer
.CreateWordOrNonWord(specialTokens), normalizer: null, specialTokens: specialTokens, unknownToken: "<|endoftext|>");
550
internal static BpeTokenizer CreateEmptyBpe(
PreTokenizer
? preTokenizer = null, Normalizer? normalizer = null)
559
vocabStream: emptyVocabStream, mergesStream: null, preTokenizer: preTokenizer ??
PreTokenizer
.CreateWordOrNonWord(), normalizer: normalizer, unknownToken: "Ukn");
PreTokenizerTests.cs (5)
21
PreTokenizer
.CreateWordOrNonWord(),
28
PreTokenizer
.CreateWordOrNonWord(),
35
PreTokenizer
.CreateWhiteSpace(),
58
public void TestPreTokenizer(
PreTokenizer
preTokenizer, string text, (int Offset, int Length)[] splits)
73
Assert.Empty(
PreTokenizer
.CreateWordOrNonWord().PreTokenize((string)null!));