47 references to GPT4
Microsoft.ML.Tokenizers.Tests (47)
TiktokenTests.cs (47)
49TestGPT4TokenizationEncoding(GPT4);
52Assert.True(GPT4 is TiktokenTokenizer);
53IReadOnlyDictionary<string, int>? specialTokens = (GPT4 as TiktokenTokenizer)!.SpecialTokens;
68Tokenizer tokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokens);
73tokenizer = TiktokenTokenizer.Create(stream, GPT4.PreTokenizer, null, specialTokens);
77tokenizer = await TiktokenTokenizer.CreateAsync(tokenizerDataFileName, GPT4.PreTokenizer, normalizer: null, specialTokens);
82tokenizer = await TiktokenTokenizer.CreateAsync(stream, GPT4.PreTokenizer, normalizer: null, specialTokens);
109yield return new object[] { GPT4, @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
200IReadOnlyList<int> encoded = GPT4.EncodeToIds(text);
202Assert.Equal(text, GPT4.Decode(encoded));
203TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
205IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
206int idsCount = GPT4.CountTokens(text);
243IReadOnlyList<int> encoded = GPT4.EncodeToIds(text);
245Assert.Equal(text, GPT4.Decode(encoded));
246TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
248IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
253int idsCount = GPT4.CountTokens(text);
264IReadOnlyList<int> encoded = GPT4.EncodeToIds(text);
267IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
268int idsCount = GPT4.CountTokens(text);
277IReadOnlyList<int> encoded = GPT4.EncodeToIds(text);
278int idsCount = GPT4.CountTokens(text);
280Assert.Equal(text, GPT4.Decode(encoded));
281TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
283IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
624TestTokenizerEncodingForTokenizer(GPT4, text, expectedTokens, expectedOffsets, expectedIds);
745IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out _);
751Assert.Equal(expectedIds, GPT4.EncodeToIds(text));
752Assert.Equal(expectedIds.Length, GPT4.CountTokens(text));
756int length = GPT4.GetIndexByTokenCount(text, tokenCount, out _, out int count);
774int index = GPT4.GetIndexByTokenCountFromEnd(text, tokenCount, out _, out count);
868IReadOnlyList<int> ids = GPT4.EncodeToIds(largeRepeatedInput);
869string decoded = GPT4.Decode(ids);
874IReadOnlyList<int> mixedIds = GPT4.EncodeToIds(largeMixedInput);
875string mixedDecoded = GPT4.Decode(mixedIds);
880IReadOnlyList<int> boundaryIds = GPT4.EncodeToIds(boundaryInput);
881string boundaryDecoded = GPT4.Decode(boundaryIds);
886IReadOnlyList<int> belowIds = GPT4.EncodeToIds(belowThresholdInput);
887string belowDecoded = GPT4.Decode(belowIds);
892IReadOnlyList<int> aboveIds = GPT4.EncodeToIds(aboveThresholdInput);
893string aboveDecoded = GPT4.Decode(aboveIds);
909IReadOnlyList<int> idsRepeated = GPT4.EncodeToIds(inputRepeated);
912string decodedRepeated = GPT4.Decode(idsRepeated);
917IReadOnlyList<int> idsMixed = GPT4.EncodeToIds(inputMixed);
918string decodedMixed = GPT4.Decode(idsMixed);
922IReadOnlyList<EncodedToken> tokens = GPT4.EncodeToTokens(inputRepeated, out string? normalizedText);