47 references to GPT4
Microsoft.ML.Tokenizers.Tests (47)
TiktokenTests.cs (47)
48TestGPT4TokenizationEncoding(GPT4); 51Assert.True(GPT4 is TiktokenTokenizer); 52IReadOnlyDictionary<string, int>? specialTokens = (GPT4 as TiktokenTokenizer)!.SpecialTokens; 67Tokenizer tokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokens); 72tokenizer = TiktokenTokenizer.Create(stream, GPT4.PreTokenizer, null, specialTokens); 76tokenizer = await TiktokenTokenizer.CreateAsync(tokenizerDataFileName, GPT4.PreTokenizer, normalizer: null, specialTokens); 81tokenizer = await TiktokenTokenizer.CreateAsync(stream, GPT4.PreTokenizer, normalizer: null, specialTokens); 108yield return new object[] { GPT4, @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" }; 199IReadOnlyList<int> encoded = GPT4.EncodeToIds(text); 201Assert.Equal(text, GPT4.Decode(encoded)); 202TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text); 204IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText); 205int idsCount = GPT4.CountTokens(text); 242IReadOnlyList<int> encoded = GPT4.EncodeToIds(text); 244Assert.Equal(text, GPT4.Decode(encoded)); 245TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text); 247IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText); 252int idsCount = GPT4.CountTokens(text); 263IReadOnlyList<int> encoded = GPT4.EncodeToIds(text); 266IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText); 267int idsCount = GPT4.CountTokens(text); 276IReadOnlyList<int> encoded = GPT4.EncodeToIds(text); 277int idsCount = GPT4.CountTokens(text); 279Assert.Equal(text, GPT4.Decode(encoded)); 280TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text); 282IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText); 620TestTokenizerEncodingForTokenizer(GPT4, text, expectedTokens, expectedOffsets, expectedIds); 741IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out _); 747Assert.Equal(expectedIds, GPT4.EncodeToIds(text)); 748Assert.Equal(expectedIds.Length, GPT4.CountTokens(text)); 752int length = GPT4.GetIndexByTokenCount(text, tokenCount, out _, out int count); 770int index = GPT4.GetIndexByTokenCountFromEnd(text, tokenCount, out _, out count); 864IReadOnlyList<int> ids = GPT4.EncodeToIds(largeRepeatedInput); 865string decoded = GPT4.Decode(ids); 870IReadOnlyList<int> mixedIds = GPT4.EncodeToIds(largeMixedInput); 871string mixedDecoded = GPT4.Decode(mixedIds); 876IReadOnlyList<int> boundaryIds = GPT4.EncodeToIds(boundaryInput); 877string boundaryDecoded = GPT4.Decode(boundaryIds); 882IReadOnlyList<int> belowIds = GPT4.EncodeToIds(belowThresholdInput); 883string belowDecoded = GPT4.Decode(belowIds); 888IReadOnlyList<int> aboveIds = GPT4.EncodeToIds(aboveThresholdInput); 889string aboveDecoded = GPT4.Decode(aboveIds); 905IReadOnlyList<int> idsRepeated = GPT4.EncodeToIds(inputRepeated); 908string decodedRepeated = GPT4.Decode(idsRepeated); 913IReadOnlyList<int> idsMixed = GPT4.EncodeToIds(inputMixed); 914string decodedMixed = GPT4.Decode(idsMixed); 918IReadOnlyList<EncodedToken> tokens = GPT4.EncodeToTokens(inputRepeated, out string? normalizedText);