4 types derived from PreTokenizer
Microsoft.ML.Tokenizers (3)
PreTokenizer\CompositePreTokenizer.cs (1)
16: public class CompositePreTokenizer : PreTokenizer
PreTokenizer\RegexPreTokenizer.cs (1)
16: public sealed partial class RegexPreTokenizer : PreTokenizer
PreTokenizer\RobertaPreTokenizer.cs (1)
13: public sealed partial class RobertaPreTokenizer : PreTokenizer
Microsoft.ML.Tokenizers.Tests (1)
PreTokenizerTests.cs (1)
76: public class SpacePreTokenizer : PreTokenizer
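
The SpacePreTokenizer test type (PreTokenizerTests.cs, line 76) shows the extension point: derive from PreTokenizer and return (Offset, Length) spans over the input. Below is a minimal sketch of such a subclass, assuming the two PreTokenize overloads implied by the test signatures on lines 58 and 73 of PreTokenizerTests.cs; it is a hypothetical reimplementation, not the library's test type.

    using System;
    using System.Collections.Generic;
    using Microsoft.ML.Tokenizers;

    // Hypothetical subclass: splits on Unicode whitespace and yields
    // (Offset, Length) spans over the original input.
    public sealed class WhitespaceSplitPreTokenizer : PreTokenizer
    {
        public override IEnumerable<(int Offset, int Length)> PreTokenize(string text)
            => string.IsNullOrEmpty(text)
                ? Array.Empty<(int Offset, int Length)>() // matches the Assert.Empty expectation on null input (line 73)
                : PreTokenize(text.AsSpan());

        public override IEnumerable<(int Offset, int Length)> PreTokenize(ReadOnlySpan<char> text)
        {
            var splits = new List<(int Offset, int Length)>();
            int start = -1;
            for (int i = 0; i < text.Length; i++)
            {
                if (char.IsWhiteSpace(text[i]))
                {
                    if (start >= 0) { splits.Add((start, i - start)); start = -1; }
                }
                else if (start < 0)
                {
                    start = i; // first character of a new split
                }
            }
            if (start >= 0) splits.Add((start, text.Length - start));
            return splits;
        }
    }
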
87 references to PreTokenizer
Microsoft.ML.Tokenizers (73)
Model\BertTokenizer.cs (2)
809: options.PreTokenizer ??= options.ApplyBasicTokenization ? PreTokenizer.CreateWordOrPunctuation(options.SplitOnSpecialTokens ? specialTokensDict : null) : PreTokenizer.CreateWhiteSpace();
Model\BpeOptions.cs (1)
51: public PreTokenizer? PreTokenizer { get; set; }
Model\BPETokenizer.cs (9)
30: private readonly PreTokenizer? _preTokenizer;
94: => Create(vocabFile, mergesFile, preTokenizer: PreTokenizer.CreateWordOrNonWord(), normalizer: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false);
114: PreTokenizer? preTokenizer = null,
204: => Create(vocabStream, mergesStream, preTokenizer: PreTokenizer.CreateWordOrNonWord(), normalizer: null, specialTokens: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false);
224: PreTokenizer? preTokenizer = null,
260: PreTokenizer? preTokenizer = null,
296: PreTokenizer? preTokenizer,
311: _preTokenizer = preTokenizer ?? PreTokenizer.CreateWordOrNonWord(); // Default to WordOrNonWord pre-tokenizer
418: public override PreTokenizer? PreTokenizer => _preTokenizer;
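
The fallback on line 311 means passing preTokenizer: null is equivalent to passing PreTokenizer.CreateWordOrNonWord(). A usage sketch mirroring the Create call on line 94; the file paths are placeholders, and the parameters are assumed optional on the public overload:

    using Microsoft.ML.Tokenizers;

    // Placeholder paths; the argument list mirrors the chained Create call on line 94.
    BpeTokenizer bpe = BpeTokenizer.Create(
        vocabFile: "vocab.json",
        mergesFile: "merges.txt",
        preTokenizer: PreTokenizer.CreateWordOrNonWord(), // same effect as passing null (line 311)
        normalizer: null,
        unknownToken: "<unk>",
        continuingSubwordPrefix: null,
        endOfWordSuffix: null,
        fuseUnknownTokens: false);
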
Model\CodeGenTokenizer.cs (5)
35: private readonly PreTokenizer? _preTokenizer;
58: PreTokenizer? preTokenizer = null,
90: PreTokenizer? preTokenizer = null,
103: private CodeGenTokenizer(Stream vocabularyStream, Stream mergeStream, PreTokenizer? preTokenizer, Normalizer? normalizer, IReadOnlyDictionary<string, int>? specialTokens, bool addPrefixSpace,
252: public override PreTokenizer? PreTokenizer => _preTokenizer;
Model\EnglishRobertaTokenizer.cs (7)
28: private readonly PreTokenizer? _preTokenizer;
67: PreTokenizer? preTokenizer = null,
104: PreTokenizer? preTokenizer = null,
118: internal EnglishRobertaTokenizer(string vocabularyPath, string mergePath, string highestOccurrenceMappingPath, PreTokenizer? preTokenizer = null, Normalizer? normalizer = null, bool filterUnsupportedChars = true) :
135: internal EnglishRobertaTokenizer(Stream vocabularyStream, Stream mergeStream, Stream highestOccurrenceMappingStream, PreTokenizer? preTokenizer = null, Normalizer? normalizer = null, bool filterUnsupportedChars = true) :
140: private EnglishRobertaTokenizer(Stream vocabularyStream, Stream mergeStream, Stream highestOccurrenceMappingStream, PreTokenizer? preTokenizer, Normalizer? normalizer, bool filterUnsupportedChars, bool disposeStream)
255: public override PreTokenizer? PreTokenizer => _preTokenizer;
Model\Phi2Tokenizer.cs (2)
37: PreTokenizer? preTokenizer = null,
68: PreTokenizer? preTokenizer = null,
Model\SentencePieceBpeModel.cs (4)
116: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
375: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
652: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
907: (int Offset, int Length)[] splits = PreTokenizer.SplitText(text, SpecialTokensRegex!).ToArray();
Model\SentencePieceTokenizer.cs (1)
101: public override PreTokenizer? PreTokenizer => null;
Model\SentencePieceUnigramModel.cs (4)
228: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
651: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
1022: foreach ((int Offset, int Length) in PreTokenizer.SplitText(text, SpecialTokensRegex!))
1283: (int Offset, int Length)[] splits = PreTokenizer.SplitText(text, SpecialTokensRegex!).ToArray();
Model\TiktokenTokenizer.cs (13)
33: private readonly PreTokenizer? _preTokenizer;
46: internal TiktokenTokenizer(string vocabFilePath, PreTokenizer? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens = null, Normalizer? normalizer = null, int cacheSize = LruCache<int[]>.DefaultCacheSize) :
61: internal TiktokenTokenizer(Stream vocabStream, PreTokenizer? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens = null, Normalizer? normalizer = null, int cacheSize = LruCache<int[]>.DefaultCacheSize) :
80: PreTokenizer? preTokenizer,
102: private TiktokenTokenizer(Stream vocabStream, PreTokenizer? preTokenizer, IReadOnlyDictionary<string, int>? specialTokens, Normalizer? normalizer, int cacheSize, bool disposeStream)
127: public override PreTokenizer? PreTokenizer => _preTokenizer;
1203: private static Regex Cl100kBaseRegex() => _cl100kBaseRegex ??= new Regex(Cl100kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(PreTokenizer.DefaultTimeOutInMilliseconds));
1206: internal static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(PreTokenizer.DefaultTimeOutInMilliseconds));
1209: internal static Regex O200kBaseRegex() => _o200kBaseRegex ??= new Regex(O200kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(PreTokenizer.DefaultTimeOutInMilliseconds));
1275: PreTokenizer? preTokenizer,
1295: PreTokenizer? preTokenizer,
1316: PreTokenizer? preTokenizer,
1348: PreTokenizer? preTokenizer,
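
Lines 1203-1209 show that all three Tiktoken split regexes reuse PreTokenizer.DefaultTimeOutInMilliseconds, so the exposed pre-tokenizer (line 127) is regex-backed. A usage sketch, assuming the CreateForModel factory on the public surface (not shown in this listing); the model name is illustrative:

    using Microsoft.ML.Tokenizers;

    // Assumes TiktokenTokenizer.CreateForModel is available; "gpt-4" is illustrative.
    TiktokenTokenizer tiktoken = TiktokenTokenizer.CreateForModel("gpt-4");

    // Exposed via the override on line 127; backed by the compiled regexes
    // built on lines 1203-1209 with PreTokenizer.DefaultTimeOutInMilliseconds.
    PreTokenizer? pre = tiktoken.PreTokenizer;
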
Model\WordPieceOptions.cs (1)
21: public PreTokenizer? PreTokenizer { get; set; }
Model\WordPieceTokenizer.cs (3)
27: private readonly PreTokenizer? _preTokenizer;
72: _preTokenizer = options.PreTokenizer ?? PreTokenizer.CreateWhiteSpace(options.SpecialTokens);
243: public override PreTokenizer? PreTokenizer => _preTokenizer;
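
WordPieceOptions.PreTokenizer (line 21 above) feeds the null-coalescing assignment on line 72: left null, the tokenizer falls back to PreTokenizer.CreateWhiteSpace(options.SpecialTokens). A sketch of overriding that default, assuming WordPieceOptions has a public parameterless constructor:

    using Microsoft.ML.Tokenizers;

    var options = new WordPieceOptions
    {
        // Overrides the CreateWhiteSpace fallback on line 72 above.
        PreTokenizer = PreTokenizer.CreateWordOrPunctuation(),
    };
    // Pass `options` to whichever WordPieceTokenizer factory consumes it.
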
PreTokenizer\CompositePreTokenizer.cs (10)
19: private readonly IReadOnlyList<PreTokenizer> _preTokenizers;
31: public CompositePreTokenizer(IReadOnlyList<PreTokenizer> preTokenizers, IReadOnlyDictionary<string, int>? specialTokens = null)
44: foreach (var preTokenizer in preTokenizers)
54: var list = new List<PreTokenizer>(specialTokens.Count + 1);
58: foreach (var preTokenizer in preTokenizers)
74: public IReadOnlyList<PreTokenizer> PreTokenizers => _preTokenizers;
90: static IEnumerable<(int Offset, int Length)> SplitText(string text, IReadOnlyList<PreTokenizer> preTokenizers, int preTokenizerIndex, int offset, int length)
93: var preTokenizer = preTokenizers[preTokenizerIndex];
168: static IEnumerable<(int Offset, int Length)> SplitText(char[] text, IReadOnlyList<PreTokenizer> preTokenizers, int preTokenizerIndex, int offset, int length)
171: var preTokenizer = preTokenizers[preTokenizerIndex];
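
The constructor on line 31 takes an ordered list, and the recursive SplitText overloads on lines 90 and 168 apply preTokenizers[preTokenizerIndex] to each fragment produced by the previous pass. A usage sketch (input text is illustrative):

    using System;
    using System.Collections.Generic;
    using Microsoft.ML.Tokenizers;

    var composite = new CompositePreTokenizer(new List<PreTokenizer>
    {
        PreTokenizer.CreateWhiteSpace(),        // pass 1: split on whitespace
        PreTokenizer.CreateWordOrPunctuation(), // pass 2: re-split each fragment
    });

    foreach ((int Offset, int Length) in composite.PreTokenize("don't panic"))
    {
        Console.WriteLine($"({Offset}, {Length})");
    }
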
PreTokenizer\PreTokenizer.cs (9)
47: private static PreTokenizer? _whiteSpaceOrPunctuationPreTokenizer;
56: /// Create a new instance of the <see cref="PreTokenizer"/> class which split the text at the whitespace or punctuation characters.
63: public static PreTokenizer CreateWordOrPunctuation(IReadOnlyDictionary<string, int>? specialTokens = null)
75: private static PreTokenizer? _wordOrNonWordPreTokenizer;
85: /// Create a new instance of the <see cref="PreTokenizer"/> class which split the text at the word or non-word boundary.
93: public static PreTokenizer CreateWordOrNonWord(IReadOnlyDictionary<string, int>? specialTokens = null)
105: private static PreTokenizer? _whiteSpacePreTokenizer;
115: /// Create a new instance of the <see cref="PreTokenizer"/> class which split the text at the white spaces.
122: public static PreTokenizer CreateWhiteSpace(IReadOnlyDictionary<string, int>? specialTokens = null)
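
Each factory caches its instance in the static field above it (lines 47, 75, 105), so repeated calls return the same pre-tokenizer. A comparison sketch of two factories; the expected spans in the comments are assumptions based on the factory names, not captured library output:

    using System;
    using System.Linq;
    using Microsoft.ML.Tokenizers;

    string text = "Hello, world!";

    // Word or non-word: runs of word characters vs. runs of other non-space
    // characters; whitespace itself is dropped.
    // Expected (assumption): (0,5) "Hello", (5,1) ",", (7,5) "world", (12,1) "!"
    var wordSplits = PreTokenizer.CreateWordOrNonWord().PreTokenize(text).ToArray();

    // Whitespace: runs of non-whitespace are kept whole.
    // Expected (assumption): (0,6) "Hello,", (7,6) "world!"
    var spaceSplits = PreTokenizer.CreateWhiteSpace().PreTokenize(text).ToArray();

    foreach ((int Offset, int Length) in wordSplits.Concat(spaceSplits))
    {
        Console.WriteLine($"({Offset}, {Length})");
    }
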
Tokenizer.cs (2)
24: public virtual PreTokenizer? PreTokenizer => null;
433: PreTokenizer? preTokenizer,
Microsoft.ML.Tokenizers.Tests (14)
BpeTests.cs (9)
256: BpeTokenizer bpe = BpeTokenizer.Create(vocabFile: vocabFile, mergesFile: mergesFile, preTokenizer: PreTokenizer.CreateWordOrNonWord(), normalizer: null, unknownToken: unknownToken,
273: PreTokenizer = PreTokenizer.CreateWordOrNonWord(),
536: var bpeTokenizer = BpeTokenizer.Create(vocabStream, mergesStream, PreTokenizer.CreateWordOrNonWord(specialTokens), normalizer: null, specialTokens: specialTokens, unknownToken: "<|endoftext|>");
583: internal static BpeTokenizer CreateEmptyBpe(PreTokenizer? preTokenizer = null, Normalizer? normalizer = null)
592: vocabStream: emptyVocabStream, mergesStream: null, preTokenizer: preTokenizer ?? PreTokenizer.CreateWordOrNonWord(), normalizer: normalizer, unknownToken: "Ukn");
918: IReadOnlyList<PreTokenizer>? preTokenizers = GetPreTokenizer(root, out bool byteLevel);
955: private static IReadOnlyList<PreTokenizer>? GetPreTokenizer(JsonElement root, out bool byteLevel)
958: List<PreTokenizer> preTokenizers = new List<PreTokenizer>();
PreTokenizerTests.cs (5)
21: PreTokenizer.CreateWordOrNonWord(),
28: PreTokenizer.CreateWordOrNonWord(),
35: PreTokenizer.CreateWhiteSpace(),
58: public void TestPreTokenizer(PreTokenizer preTokenizer, string text, (int Offset, int Length)[] splits)
73: Assert.Empty(PreTokenizer.CreateWordOrNonWord().PreTokenize((string)null!));
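
Line 58 defines the theory that drives each pre-tokenizer over sample text and compares (Offset, Length) spans, and line 73 pins down null handling. A test in the same shape; the expected spans are an assumption:

    using System.Linq;
    using Microsoft.ML.Tokenizers;
    using Xunit;

    public class WhiteSpacePreTokenizerTests
    {
        [Fact]
        public void SplitsOnWhitespaceOnly()
        {
            PreTokenizer pre = PreTokenizer.CreateWhiteSpace();

            // Expected spans are an assumption: "a" at (0,1), "bc" at (2,2),
            // with the separating space dropped.
            (int Offset, int Length)[] splits = pre.PreTokenize("a bc").ToArray();

            Assert.Equal(new[] { (0, 1), (2, 2) }, splits);
        }
    }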