4 types derived from PreTokenizer

Microsoft.ML.Tokenizers (3)
  PreTokenizer\CompositePreTokenizer.cs (1)
    18: public class CompositePreTokenizer : PreTokenizer
  PreTokenizer\RegexPreTokenizer.cs (1)
    16: public sealed partial class RegexPreTokenizer : PreTokenizer
  PreTokenizer\RobertaPreTokenizer.cs (1)
    13: public sealed partial class RobertaPreTokenizer : PreTokenizer

Microsoft.ML.Tokenizers.Tests (1)
  PreTokenizerTests.cs (1)
    76: public class SpacePreTokenizer : PreTokenizer
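
These four types are the concrete specializations of the abstract PreTokenizer base class (three in the library, one test helper). For orientation, here is a hypothetical subclass in the same spirit as the SpacePreTokenizer test type; it assumes the abstract surface is a pair of PreTokenize overloads yielding (Offset, Length) splits, which matches the test signatures shown at the end of this listing.

    using System;
    using System.Collections.Generic;
    using Microsoft.ML.Tokenizers;

    // Hypothetical type for illustration; not part of the library.
    public class WhitespaceSplitPreTokenizer : PreTokenizer
    {
        // Yield one (Offset, Length) pair per maximal run of non-whitespace.
        public override IEnumerable<(int Offset, int Length)> PreTokenize(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                yield break;
            }

            int start = -1;
            for (int i = 0; i < text.Length; i++)
            {
                if (char.IsWhiteSpace(text[i]))
                {
                    if (start >= 0)
                    {
                        yield return (start, i - start); // word before the whitespace
                        start = -1;
                    }
                }
                else if (start < 0)
                {
                    start = i; // first character of a new word
                }
            }

            if (start >= 0)
            {
                yield return (start, text.Length - start); // trailing word
            }
        }

        // The span overload is assumed to mirror the string overload; iterators
        // cannot capture spans, so this sketch materializes the span first.
        public override IEnumerable<(int Offset, int Length)> PreTokenize(ReadOnlySpan<char> text)
            => PreTokenize(text.ToString());
    }
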
88 references to PreTokenizer

Microsoft.ML.Tokenizers (73)
  Model\BertTokenizer.cs (2)
    809: options.PreTokenizer ??= options.ApplyBasicTokenization ? PreTokenizer.CreateWordOrPunctuation(options.SplitOnSpecialTokens ? specialTokensDict : null) : PreTokenizer.CreateWhiteSpace();
  Model\BpeOptions.cs (1)
    119: public PreTokenizer? PreTokenizer { get; set; }
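
The BpeTests.cs entries later in this listing (lines 263 and 286) show this nullable property being assigned through an object initializer to override the tokenizer's default. A minimal sketch; the remaining BpeOptions members and constructor arguments are omitted because they do not appear in this listing.

    // `options` is assumed to be an already-constructed BpeOptions instance;
    // assigning the property replaces the default pre-tokenizer.
    options.PreTokenizer = PreTokenizer.CreateWordOrNonWord();
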
  Model\BPETokenizer.cs (9)
    30: private readonly PreTokenizer? _preTokenizer;
    94: => Create(vocabFile, mergesFile, preTokenizer: PreTokenizer.CreateWordOrNonWord(), normalizer: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false);
    114: PreTokenizer? preTokenizer = null,
    209: => Create(vocabStream, mergesStream, preTokenizer: PreTokenizer.CreateWordOrNonWord(), normalizer: null, specialTokens: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false);
    229: PreTokenizer? preTokenizer = null,
    265: PreTokenizer? preTokenizer = null,
    301: PreTokenizer? preTokenizer,
    316: _preTokenizer = preTokenizer ?? PreTokenizer.CreateWordOrNonWord(); // Default to WordOrNonWord pre-tokenizer
    423: public override PreTokenizer? PreTokenizer => _preTokenizer;
  Model\CodeGenTokenizer.cs (5)
    35: private readonly PreTokenizer? _preTokenizer;
    58: PreTokenizer? preTokenizer = null,
    90: PreTokenizer? preTokenizer = null,
    103: private CodeGenTokenizer(Stream vocabularyStream, Stream mergeStream, PreTokenizer? preTokenizer, Normalizer? normalizer, IReadOnlyDictionary<string, int>? specialTokens, bool addPrefixSpace,
    252: public override PreTokenizer? PreTokenizer => _preTokenizer;
  Model\EnglishRobertaTokenizer.cs (7)
    28: private readonly PreTokenizer? _preTokenizer;
    67: PreTokenizer? preTokenizer = null,
    104: PreTokenizer? preTokenizer = null,
    118: internal EnglishRobertaTokenizer(string vocabularyPath, string mergePath, string highestOccurrenceMappingPath, PreTokenizer? preTokenizer = null, Normalizer? normalizer = null, bool filterUnsupportedChars = true) :
    135: internal EnglishRobertaTokenizer(Stream vocabularyStream, Stream mergeStream, Stream highestOccurrenceMappingStream, PreTokenizer? preTokenizer = null, Normalizer? normalizer = null, bool filterUnsupportedChars = true) :
    140: private EnglishRobertaTokenizer(Stream vocabularyStream, Stream mergeStream, Stream highestOccurrenceMappingStream, PreTokenizer? preTokenizer, Normalizer? normalizer, bool filterUnsupportedChars, bool disposeStream)
    255: public override PreTokenizer? PreTokenizer => _preTokenizer;
  Model\Phi2Tokenizer.cs (2)
    37: PreTokenizer? preTokenizer = null,
    68: PreTokenizer? preTokenizer = null,
  Model\SentencePieceBpeModel.cs (4)
    116: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
    375: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
    652: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
    907: (int Offset, int Length)[] splits = PreTokenizer.SplitText(text, SpecialTokensRegex!).ToArray();
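
These SentencePiece call sites all use the static PreTokenizer.SplitText helper to carve the input around special-token matches before model-level tokenization runs. A sketch of the call shape; the regex here is illustrative (the real SpecialTokensRegex is built from the model's special-token vocabulary), and this listing does not establish whether SplitText is publicly accessible.

    using System;
    using System.Text.RegularExpressions;
    using Microsoft.ML.Tokenizers;

    Regex specialTokensRegex = new Regex(Regex.Escape("<|endoftext|>"));

    string text = "hello<|endoftext|>world";
    foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, specialTokensRegex))
    {
        // Each pair addresses either a special-token match or the run of
        // ordinary text between matches.
        Console.WriteLine(text.Substring(Offset, Length));
    }
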
  Model\SentencePieceTokenizer.cs (1)
    101: public override PreTokenizer? PreTokenizer => null;
  Model\SentencePieceUnigramModel.cs (4)
    228: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
    651: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
    1022: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
    1283: (int Offset, int Length)[] splits = PreTokenizer.SplitText(text, SpecialTokensRegex!).ToArray();
  Model\TiktokenTokenizer.cs (13)
    33: private readonly PreTokenizer? _preTokenizer;
    46: internal TiktokenTokenizer(string vocabFilePath, PreTokenizer? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens = null, Normalizer? normalizer = null, int cacheSize = LruCache<int[]>.DefaultCacheSize) :
    61: internal TiktokenTokenizer(Stream vocabStream, PreTokenizer? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens = null, Normalizer? normalizer = null, int cacheSize = LruCache<int[]>.DefaultCacheSize) :
    80: PreTokenizer? preTokenizer,
    102: private TiktokenTokenizer(Stream vocabStream, PreTokenizer? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens, Normalizer? normalizer, int cacheSize, bool disposeStream)
    127: public override PreTokenizer? PreTokenizer => _preTokenizer;
    1260: [GeneratedRegex(Cl100kBaseRegexPattern, RegexOptions.None, PreTokenizer.DefaultTimeOutInMilliseconds)]
    1263: [GeneratedRegex(P50kBaseRegexPattern, RegexOptions.None, PreTokenizer.DefaultTimeOutInMilliseconds)]
    1266: [GeneratedRegex(O200kBaseRegexPattern, RegexOptions.None, PreTokenizer.DefaultTimeOutInMilliseconds)]
    1342: PreTokenizer? preTokenizer,
    1362: PreTokenizer? preTokenizer,
    1383: PreTokenizer? preTokenizer,
    1415: PreTokenizer? preTokenizer,
  Model\WordPieceOptions.cs (1)
    21: public PreTokenizer? PreTokenizer { get; set; }
  Model\WordPieceTokenizer.cs (3)
    27: private readonly PreTokenizer? _preTokenizer;
    72: _preTokenizer = options.PreTokenizer ?? PreTokenizer.CreateWhiteSpace(options.SpecialTokens);
    243: public override PreTokenizer? PreTokenizer => _preTokenizer;
  PreTokenizer\CompositePreTokenizer.cs (10)
    21: private readonly IReadOnlyList<PreTokenizer> _preTokenizers;
    33: public CompositePreTokenizer(IReadOnlyList<PreTokenizer> preTokenizers, IReadOnlyDictionary<string, int>? specialTokens = null)
    46: foreach (var preTokenizer in preTokenizers)
    56: var list = new List<PreTokenizer>(specialTokens.Count + 1);
    60: foreach (var preTokenizer in preTokenizers)
    76: public IReadOnlyList<PreTokenizer> PreTokenizers => _preTokenizers;
    92: static IEnumerable<(int Offset, int Length)> SplitText(string text, IReadOnlyList<PreTokenizer> preTokenizers, int preTokenizerIndex, int offset, int length)
    95: var preTokenizer = preTokenizers[preTokenizerIndex];
    170: static IEnumerable<(int Offset, int Length)> SplitText(char[] text, IReadOnlyList<PreTokenizer> preTokenizers, int preTokenizerIndex, int offset, int length)
    173: var preTokenizer = preTokenizers[preTokenizerIndex];
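
The constructor at line 33 takes an ordered list of pre-tokenizers, and the recursive SplitText helpers at lines 92 and 170 apply them in sequence, feeding each split produced by one pre-tokenizer into the next. A minimal construction sketch using the factory methods from PreTokenizer.cs below; the particular combination is illustrative only.

    using System;
    using Microsoft.ML.Tokenizers;

    // Split on whitespace first, then split each fragment again at
    // word/punctuation boundaries.
    var composite = new CompositePreTokenizer(new PreTokenizer[]
    {
        PreTokenizer.CreateWhiteSpace(),
        PreTokenizer.CreateWordOrPunctuation(),
    });

    foreach ((int Offset, int Length) in composite.PreTokenize("Hello, world!"))
    {
        Console.WriteLine($"({Offset}, {Length})");
    }
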
  PreTokenizer\PreTokenizer.cs (9)
    47: private static PreTokenizer? _whiteSpaceOrPunctuationPreTokenizer;
    56: /// Create a new instance of the <see cref="PreTokenizer"/> class which splits the text at whitespace or punctuation characters.
    63: public static PreTokenizer CreateWordOrPunctuation(IReadOnlyDictionary<string, int>? specialTokens = null)
    75: private static PreTokenizer? _wordOrNonWordPreTokenizer;
    85: /// Create a new instance of the <see cref="PreTokenizer"/> class which splits the text at word or non-word boundaries.
    93: public static PreTokenizer CreateWordOrNonWord(IReadOnlyDictionary<string, int>? specialTokens = null)
    105: private static PreTokenizer? _whiteSpacePreTokenizer;
    115: /// Create a new instance of the <see cref="PreTokenizer"/> class which splits the text at white spaces.
    122: public static PreTokenizer CreateWhiteSpace(IReadOnlyDictionary<string, int>? specialTokens = null)
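
Each factory optionally accepts a specialTokens map so that special tokens survive splitting intact; the private static fields at lines 47, 75, and 105 suggest a shared instance is cached for the common no-special-tokens case. A minimal usage sketch; the exact split boundaries depend on the regex each factory installs.

    using System;
    using Microsoft.ML.Tokenizers;

    string text = "Hello, world!";
    PreTokenizer preTokenizer = PreTokenizer.CreateWordOrNonWord();

    // PreTokenize yields (Offset, Length) pairs into the original string,
    // the same shape consumed by the call sites throughout this listing.
    foreach ((int Offset, int Length) in preTokenizer.PreTokenize(text))
    {
        Console.WriteLine($"{Offset,3} {Length,3} '{text.Substring(Offset, Length)}'");
    }
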
  Tokenizer.cs (2)
    24: public virtual PreTokenizer? PreTokenizer => null;
    433: PreTokenizer? preTokenizer,

Microsoft.ML.Tokenizers.Tests (15)
  BpeTests.cs (10)
    256: BpeTokenizer bpe = BpeTokenizer.Create(vocabFile: vocabFile, mergesFile: mergesFile, preTokenizer: PreTokenizer.CreateWordOrNonWord(), normalizer: null, unknownToken: unknownToken,
    263: PreTokenizer = PreTokenizer.CreateWordOrNonWord(),
    286: PreTokenizer = PreTokenizer.CreateWordOrNonWord(),
    549: var bpeTokenizer = BpeTokenizer.Create(vocabStream, mergesStream, PreTokenizer.CreateWordOrNonWord(specialTokens), normalizer: null, specialTokens: specialTokens, unknownToken: "<|endoftext|>");
    596: internal static BpeTokenizer CreateEmptyBpe(PreTokenizer? preTokenizer = null, Normalizer? normalizer = null)
    605: vocabStream: emptyVocabStream, mergesStream: null, preTokenizer: preTokenizer ?? PreTokenizer.CreateWordOrNonWord(), normalizer: normalizer, unknownToken: "Ukn");
    991: IReadOnlyList<PreTokenizer>? preTokenizers = GetPreTokenizer(root, out bool byteLevel);
    1028: private static IReadOnlyList<PreTokenizer>? GetPreTokenizer(JsonElement root, out bool byteLevel)
    1031: List<PreTokenizer> preTokenizers = new List<PreTokenizer>();
  PreTokenizerTests.cs (5)
    21: PreTokenizer.CreateWordOrNonWord(),
    28: PreTokenizer.CreateWordOrNonWord(),
    35: PreTokenizer.CreateWhiteSpace(),
    58: public void TestPreTokenizer(PreTokenizer preTokenizer, string text, (int Offset, int Length)[] splits)
    73: Assert.Empty(PreTokenizer.CreateWordOrNonWord().PreTokenize((string)null!));