47 references to GPT4
Microsoft.ML.Tokenizers.Tests (47)
TiktokenTests.cs (47)
49TestGPT4TokenizationEncoding(GPT4); 52Assert.True(GPT4 is TiktokenTokenizer); 53IReadOnlyDictionary<string, int>? specialTokens = (GPT4 as TiktokenTokenizer)!.SpecialTokens; 68Tokenizer tokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokens); 73tokenizer = TiktokenTokenizer.Create(stream, GPT4.PreTokenizer, null, specialTokens); 77tokenizer = await TiktokenTokenizer.CreateAsync(tokenizerDataFileName, GPT4.PreTokenizer, normalizer: null, specialTokens); 82tokenizer = await TiktokenTokenizer.CreateAsync(stream, GPT4.PreTokenizer, normalizer: null, specialTokens); 109yield return new object[] { GPT4, @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" }; 200IReadOnlyList<int> encoded = GPT4.EncodeToIds(text); 202Assert.Equal(text, GPT4.Decode(encoded)); 203TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text); 205IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText); 206int idsCount = GPT4.CountTokens(text); 243IReadOnlyList<int> encoded = GPT4.EncodeToIds(text); 245Assert.Equal(text, GPT4.Decode(encoded)); 246TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text); 248IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText); 253int idsCount = GPT4.CountTokens(text); 264IReadOnlyList<int> encoded = GPT4.EncodeToIds(text); 267IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText); 268int idsCount = GPT4.CountTokens(text); 277IReadOnlyList<int> encoded = GPT4.EncodeToIds(text); 278int idsCount = GPT4.CountTokens(text); 280Assert.Equal(text, GPT4.Decode(encoded)); 281TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text); 283IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText); 624TestTokenizerEncodingForTokenizer(GPT4, text, expectedTokens, expectedOffsets, expectedIds); 745IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out _); 751Assert.Equal(expectedIds, GPT4.EncodeToIds(text)); 752Assert.Equal(expectedIds.Length, GPT4.CountTokens(text)); 756int length = GPT4.GetIndexByTokenCount(text, tokenCount, out _, out int count); 774int index = GPT4.GetIndexByTokenCountFromEnd(text, tokenCount, out _, out count); 868IReadOnlyList<int> ids = GPT4.EncodeToIds(largeRepeatedInput); 869string decoded = GPT4.Decode(ids); 874IReadOnlyList<int> mixedIds = GPT4.EncodeToIds(largeMixedInput); 875string mixedDecoded = GPT4.Decode(mixedIds); 880IReadOnlyList<int> boundaryIds = GPT4.EncodeToIds(boundaryInput); 881string boundaryDecoded = GPT4.Decode(boundaryIds); 886IReadOnlyList<int> belowIds = GPT4.EncodeToIds(belowThresholdInput); 887string belowDecoded = GPT4.Decode(belowIds); 892IReadOnlyList<int> aboveIds = GPT4.EncodeToIds(aboveThresholdInput); 893string aboveDecoded = GPT4.Decode(aboveIds); 909IReadOnlyList<int> idsRepeated = GPT4.EncodeToIds(inputRepeated); 912string decodedRepeated = GPT4.Decode(idsRepeated); 917IReadOnlyList<int> idsMixed = GPT4.EncodeToIds(inputMixed); 918string decodedMixed = GPT4.Decode(idsMixed); 922IReadOnlyList<EncodedToken> tokens = GPT4.EncodeToTokens(inputRepeated, out string? normalizedText);