47 references to GPT4
Microsoft.ML.Tokenizers.Tests (47)
TiktokenTests.cs (47)
48TestGPT4TokenizationEncoding(GPT4);
51Assert.True(GPT4 is TiktokenTokenizer);
52IReadOnlyDictionary<string, int>? specialTokens = (GPT4 as TiktokenTokenizer)!.SpecialTokens;
67Tokenizer tokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokens);
72tokenizer = TiktokenTokenizer.Create(stream, GPT4.PreTokenizer, null, specialTokens);
76tokenizer = await TiktokenTokenizer.CreateAsync(tokenizerDataFileName, GPT4.PreTokenizer, normalizer: null, specialTokens);
81tokenizer = await TiktokenTokenizer.CreateAsync(stream, GPT4.PreTokenizer, normalizer: null, specialTokens);
108yield return new object[] { GPT4, @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
199IReadOnlyList<int> encoded = GPT4.EncodeToIds(text);
201Assert.Equal(text, GPT4.Decode(encoded));
202TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
204IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
205int idsCount = GPT4.CountTokens(text);
242IReadOnlyList<int> encoded = GPT4.EncodeToIds(text);
244Assert.Equal(text, GPT4.Decode(encoded));
245TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
247IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
252int idsCount = GPT4.CountTokens(text);
263IReadOnlyList<int> encoded = GPT4.EncodeToIds(text);
266IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
267int idsCount = GPT4.CountTokens(text);
276IReadOnlyList<int> encoded = GPT4.EncodeToIds(text);
277int idsCount = GPT4.CountTokens(text);
279Assert.Equal(text, GPT4.Decode(encoded));
280TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
282IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out string? normalizedText);
620TestTokenizerEncodingForTokenizer(GPT4, text, expectedTokens, expectedOffsets, expectedIds);
741IReadOnlyList<EncodedToken> result = GPT4.EncodeToTokens(text, out _);
747Assert.Equal(expectedIds, GPT4.EncodeToIds(text));
748Assert.Equal(expectedIds.Length, GPT4.CountTokens(text));
752int length = GPT4.GetIndexByTokenCount(text, tokenCount, out _, out int count);
770int index = GPT4.GetIndexByTokenCountFromEnd(text, tokenCount, out _, out count);
864IReadOnlyList<int> ids = GPT4.EncodeToIds(largeRepeatedInput);
865string decoded = GPT4.Decode(ids);
870IReadOnlyList<int> mixedIds = GPT4.EncodeToIds(largeMixedInput);
871string mixedDecoded = GPT4.Decode(mixedIds);
876IReadOnlyList<int> boundaryIds = GPT4.EncodeToIds(boundaryInput);
877string boundaryDecoded = GPT4.Decode(boundaryIds);
882IReadOnlyList<int> belowIds = GPT4.EncodeToIds(belowThresholdInput);
883string belowDecoded = GPT4.Decode(belowIds);
888IReadOnlyList<int> aboveIds = GPT4.EncodeToIds(aboveThresholdInput);
889string aboveDecoded = GPT4.Decode(aboveIds);
905IReadOnlyList<int> idsRepeated = GPT4.EncodeToIds(inputRepeated);
908string decodedRepeated = GPT4.Decode(idsRepeated);
913IReadOnlyList<int> idsMixed = GPT4.EncodeToIds(inputMixed);
914string decodedMixed = GPT4.Decode(idsMixed);
918IReadOnlyList<EncodedToken> tokens = GPT4.EncodeToTokens(inputRepeated, out string? normalizedText);