70 references to _unigramTokenizerFromJson
Microsoft.ML.Tokenizers.Tests (70)
UnigramTests.cs (70)
378: shiftedIds[i] = _unigramTokenizerFromJson.UnknownId;
382: shiftedIds[i] = _unigramTokenizerFromJson.BeginningOfSentenceId;
386: shiftedIds[i] = _unigramTokenizerFromJson.EndOfSentenceId;
408: result = _unigramTokenizerFromJson.EncodeToTokens(inputText, out normalized);
409: extracted = ExtractedIds(_unigramTokenizerFromJson, result, normalizedText, _unigramTokenizerFromJson.AddBeginningOfSentence, _unigramTokenizerFromJson.AddEndOfSentence);
416: result = _unigramTokenizerFromJson.EncodeToTokens(inputText.AsSpan(), out normalized);
417: extracted = ExtractedIds(_unigramTokenizerFromJson, result, normalizedText, _unigramTokenizerFromJson.AddBeginningOfSentence, _unigramTokenizerFromJson.AddEndOfSentence);
424: result = _unigramTokenizerFromJson.EncodeToTokens(inputText, out normalized, addBeginningOfSentence: true, addEndOfSentence: false);
425: extracted = ExtractedIds(_unigramTokenizerFromJson, result, normalizedText, true, false);
432: result = _unigramTokenizerFromJson.EncodeToTokens(inputText.AsSpan(), out normalized, addBeginningOfSentence: true, addEndOfSentence: false);
433: extracted = ExtractedIds(_unigramTokenizerFromJson, result, normalizedText, true, false);
440: result = _unigramTokenizerFromJson.EncodeToTokens(inputText, out normalized, addBeginningOfSentence: true, addEndOfSentence: true);
441: extracted = ExtractedIds(_unigramTokenizerFromJson, result, normalizedText, true, true);
448: result = _unigramTokenizerFromJson.EncodeToTokens(inputText.AsSpan(), out normalized, addBeginningOfSentence: true, addEndOfSentence: true);
449: extracted = ExtractedIds(_unigramTokenizerFromJson, result, normalizedText, true, true);
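The EncodeToTokens references above (source lines 408-449) all share one overload family: a string or ReadOnlySpan<char> input, an out parameter receiving the normalized text, and optional flags overriding the tokenizer's AddBeginningOfSentence / AddEndOfSentence defaults. A minimal sketch of the call shapes, assuming the field is a SentencePieceTokenizer (the listing shows only the member accesses, not the field's declared type or construction):

    using System;
    using System.Collections.Generic;
    using Microsoft.ML.Tokenizers;

    internal static class EncodeToTokensSketch
    {
        // `tokenizer` stands in for the test field _unigramTokenizerFromJson.
        internal static void Run(SentencePieceTokenizer tokenizer, string inputText)
        {
            // Default overload: the instance's AddBeginningOfSentence /
            // AddEndOfSentence settings decide whether <s> / </s> appear.
            IReadOnlyList<EncodedToken> tokens =
                tokenizer.EncodeToTokens(inputText, out string? normalized);

            // Span overload, same behavior.
            tokens = tokenizer.EncodeToTokens(inputText.AsSpan(), out normalized);

            // Explicit flags override the instance defaults for this call only.
            tokens = tokenizer.EncodeToTokens(inputText, out normalized,
                addBeginningOfSentence: true, addEndOfSentence: false);
            tokens = tokenizer.EncodeToTokens(inputText, out normalized,
                addBeginningOfSentence: true, addEndOfSentence: true);
        }
    }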
472: newString = $"{_unigramTokenizerFromJson.BeginningOfSentenceToken}{inputText}<pad>{inputText}{_unigramTokenizerFromJson.EndOfSentenceToken}";
473: result = _unigramTokenizerFromJson.EncodeToTokens(newString, out normalized, addBeginningOfSentence: false, addEndOfSentence: false);
474: extracted = ExtractedIds(_unigramTokenizerFromJson, result, normalizedText, false, false);
477: expectedIds[0] = _unigramTokenizerFromJson.BeginningOfSentenceId;
479: expectedIds[shiftedIds.Length + 1] = _unigramTokenizerFromJson.SpecialTokens!["<pad>"];
481: expectedIds[shiftedIds.Length * 2 + 2] = _unigramTokenizerFromJson.EndOfSentenceId;
485: expectedTokens[0] = _unigramTokenizerFromJson.BeginningOfSentenceToken;
489: expectedTokens[tokens.Length * 2 + 2] = _unigramTokenizerFromJson.EndOfSentenceToken;
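Source lines 472-489 splice the special tokens into the raw text rather than asking the tokenizer to add them, then encode with both flags off, so any <s>/<pad>/</s> ids in the output must come from the text itself via the SpecialTokens map. A sketch under the same SentencePieceTokenizer assumption as above:

    using Microsoft.ML.Tokenizers;

    internal static class EmbeddedSpecialTokensSketch
    {
        internal static void Run(SentencePieceTokenizer tokenizer, string inputText)
        {
            // Wrap the input with literal special tokens, including an
            // explicit <pad> between two copies of the text.
            string wrapped =
                $"{tokenizer.BeginningOfSentenceToken}{inputText}<pad>{inputText}{tokenizer.EndOfSentenceToken}";

            // Both flags off: nothing is prepended or appended automatically.
            var tokens = tokenizer.EncodeToTokens(wrapped, out string? normalized,
                addBeginningOfSentence: false, addEndOfSentence: false);

            // The literal "<pad>" resolves through the special-token map
            // (id 1 in this test's vocabulary, per source line 832).
            int padId = tokenizer.SpecialTokens!["<pad>"];
        }
    }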
508: result = _unigramTokenizerFromJson.EncodeToIds(inputText, addBeginningOfSentence: false, addEndOfSentence: false);
510: result = _unigramTokenizerFromJson.EncodeToIds(inputText.AsSpan(), addBeginningOfSentence: false, addEndOfSentence: false);
521: result = _unigramTokenizerFromJson.EncodeToIds(inputText, addBeginningOfSentence: true, addEndOfSentence: false);
537: result = _unigramTokenizerFromJson.EncodeToIds(inputText.AsSpan(), addBeginningOfSentence: true, addEndOfSentence: false);
554: result = _unigramTokenizerFromJson.EncodeToIds(inputText, addBeginningOfSentence: true, addEndOfSentence: true);
572: result = _unigramTokenizerFromJson.EncodeToIds(inputText.AsSpan(), addBeginningOfSentence: true, addEndOfSentence: true);
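Source lines 508-572 run EncodeToIds through every flag combination, for both string and span inputs. A sketch of the call shapes, same assumed type:

    using System;
    using Microsoft.ML.Tokenizers;

    internal static class EncodeToIdsSketch
    {
        internal static void Run(SentencePieceTokenizer tokenizer, string inputText)
        {
            // No sentence markers.
            var plain = tokenizer.EncodeToIds(inputText,
                addBeginningOfSentence: false, addEndOfSentence: false);

            // BOS only: the first id should be BeginningOfSentenceId.
            var withBos = tokenizer.EncodeToIds(inputText.AsSpan(),
                addBeginningOfSentence: true, addEndOfSentence: false);

            // BOS and EOS: the last id should be EndOfSentenceId.
            var withBoth = tokenizer.EncodeToIds(inputText,
                addBeginningOfSentence: true, addEndOfSentence: true);
        }
    }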
587: result = _unigramTokenizerFromJson.EncodeToIds(inputText, addBeginningOfSentence: false, addEndOfSentence: false, maxTokenCount: i, out normalized, out charConsumed);
595: result = _unigramTokenizerFromJson.EncodeToIds(inputText.AsSpan(), addBeginningOfSentence: false, addEndOfSentence: false, maxTokenCount: i, out normalized, out charConsumed);
615: result = _unigramTokenizerFromJson.EncodeToIds(inputText, addBeginningOfSentence: true, addEndOfSentence: true, maxTokenCount: i, out normalized, out charConsumed);
647: result = _unigramTokenizerFromJson.EncodeToIds(inputText.AsSpan(), addBeginningOfSentence: true, addEndOfSentence: true, maxTokenCount: i, out normalized, out charConsumed);
685: expectedIds[0] = _unigramTokenizerFromJson.BeginningOfSentenceId;
687: expectedIds[shiftedIds.Length + 1] = _unigramTokenizerFromJson.SpecialTokens!["<pad>"];
689: expectedIds[shiftedIds.Length * 2 + 2] = _unigramTokenizerFromJson.EndOfSentenceId;
690: expectedNormalized = $"{_unigramTokenizerFromJson.BeginningOfSentenceToken}{normalizedText}<pad>{normalizedText}{_unigramTokenizerFromJson.EndOfSentenceToken}";
694: result = _unigramTokenizerFromJson.EncodeToIds(inputText, addBeginningOfSentence: false, addEndOfSentence: false, maxTokenCount: i, out string? normalized, out int charConsumed);
698: result = _unigramTokenizerFromJson.EncodeToIds(inputText.AsSpan(), addBeginningOfSentence: false, addEndOfSentence: false, maxTokenCount: i, out normalized, out charConsumed);
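Source lines 587-698 add a token budget via maxTokenCount; that overload also reports the normalized text and how many of its characters were consumed before the budget ran out. A sketch, where a fixed 1..10 range stands in for the tests' loop variable i:

    using System;
    using Microsoft.ML.Tokenizers;

    internal static class BudgetedEncodeToIdsSketch
    {
        internal static void Run(SentencePieceTokenizer tokenizer, string inputText)
        {
            for (int budget = 1; budget <= 10; budget++)
            {
                // At most `budget` ids come back; `charConsumed` reports how
                // far into `normalized` the encoder got before stopping.
                var ids = tokenizer.EncodeToIds(inputText,
                    addBeginningOfSentence: false, addEndOfSentence: false,
                    maxTokenCount: budget,
                    out string? normalized, out int charConsumed);

                // With BOS/EOS enabled, the markers count against the budget.
                ids = tokenizer.EncodeToIds(inputText.AsSpan(),
                    addBeginningOfSentence: true, addEndOfSentence: true,
                    maxTokenCount: budget,
                    out normalized, out charConsumed);
            }
        }
    }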
723: index = _unigramTokenizerFromJson.GetIndexByTokenCount(inputText, addBeginningOfSentence: false, addEndOfSentence: false, maxTokenCount: 1, out normalized, out charConsumed);
725: ids1 = _unigramTokenizerFromJson.EncodeToIds(normalized!.Substring(0, index), addBeginningOfSentence: false, addEndOfSentence: false, considerNormalization: false);
726: ids2 = index < normalized.Length ? _unigramTokenizerFromJson.EncodeToIds(normalized!.Substring(index), addBeginningOfSentence: false, addEndOfSentence: false, considerNormalization: false) : new List<int>();
735: index = _unigramTokenizerFromJson.GetIndexByTokenCount(inputText.AsSpan(), addBeginningOfSentence: false, addEndOfSentence: false, maxTokenCount: 1, out normalized, out charConsumed);
737: ids1 = _unigramTokenizerFromJson.EncodeToIds(normalized!.Substring(0, index).AsSpan(), addBeginningOfSentence: false, addEndOfSentence: false, considerNormalization: false);
738: ids2 = index < normalized.Length ? _unigramTokenizerFromJson.EncodeToIds(normalized!.Substring(index).AsSpan(), addBeginningOfSentence: false, addEndOfSentence: false, considerNormalization: false) : new List<int>();
747: index = _unigramTokenizerFromJson.GetIndexByTokenCountFromEnd(inputText, addBeginningOfSentence: false, addEndOfSentence: false, maxTokenCount: 1, considerNormalization: true, out normalized, out charConsumed);
749: ids1 = _unigramTokenizerFromJson.EncodeToIds(normalized!.Substring(0, index), addBeginningOfSentence: false, addEndOfSentence: false, considerNormalization: false);
750: ids2 = index < normalized.Length ? _unigramTokenizerFromJson.EncodeToIds(normalized!.Substring(index), addBeginningOfSentence: false, addEndOfSentence: false, considerNormalization: false) : new List<int>();
759: index = _unigramTokenizerFromJson.GetIndexByTokenCountFromEnd(inputText.AsSpan(), addBeginningOfSentence: false, addEndOfSentence: false, maxTokenCount: 1, considerNormalization: true, out normalized, out charConsumed);
761: ids1 = _unigramTokenizerFromJson.EncodeToIds(normalized!.Substring(0, index).AsSpan(), addBeginningOfSentence: false, addEndOfSentence: false, considerNormalization: false);
762: ids2 = index < normalized.Length ? _unigramTokenizerFromJson.EncodeToIds(normalized!.Substring(index).AsSpan(), addBeginningOfSentence: false, addEndOfSentence: false, considerNormalization: false) : new List<int>();
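Source lines 723-762 verify the splitting contract of GetIndexByTokenCount and GetIndexByTokenCountFromEnd: the returned index partitions the normalized text so that re-encoding each half with considerNormalization: false reproduces the full id sequence. A sketch, same assumed type:

    using System;
    using System.Collections.Generic;
    using Microsoft.ML.Tokenizers;

    internal static class SplitContractSketch
    {
        internal static void Run(SentencePieceTokenizer tokenizer, string inputText)
        {
            // Index into the normalized text after at most one token,
            // counting from the start.
            int index = tokenizer.GetIndexByTokenCount(inputText,
                addBeginningOfSentence: false, addEndOfSentence: false,
                maxTokenCount: 1, out string? normalized, out int charConsumed);

            // Re-encode both halves without re-normalizing; concatenated,
            // they must equal the ids of the whole normalized text.
            var head = tokenizer.EncodeToIds(normalized!.Substring(0, index),
                addBeginningOfSentence: false, addEndOfSentence: false,
                considerNormalization: false);
            var tail = index < normalized.Length
                ? tokenizer.EncodeToIds(normalized.Substring(index),
                    addBeginningOfSentence: false, addEndOfSentence: false,
                    considerNormalization: false)
                : new List<int>();

            // The FromEnd variant counts the token budget from the end.
            index = tokenizer.GetIndexByTokenCountFromEnd(inputText,
                addBeginningOfSentence: false, addEndOfSentence: false,
                maxTokenCount: 1, considerNormalization: true,
                out normalized, out charConsumed);
        }
    }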
777: DecodeWithTokenizerTest(_unigramTokenizerFromJson, decodedString, GetShiftedIds(ids));
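Source line 777 goes through the test helpers DecodeWithTokenizerTest and GetShiftedIds, which are not shown in this listing; the public surface underneath is the plain Decode call. A hedged sketch using the base Tokenizer.Decode API in place of the helper:

    using Microsoft.ML.Tokenizers;

    internal static class DecodeSketch
    {
        internal static void Run(SentencePieceTokenizer tokenizer, string inputText)
        {
            var ids = tokenizer.EncodeToIds(inputText,
                addBeginningOfSentence: false, addEndOfSentence: false);

            // Round-trip: for a Unigram/SentencePiece model the decoded
            // string is the normalized form of the original input.
            var decoded = tokenizer.Decode(ids);
        }
    }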
814: Assert.Equal("<unk>", _unigramTokenizerFromJson.UnknownToken);
815: Assert.Equal(3, _unigramTokenizerFromJson.UnknownId);
816: Assert.Equal("<s>", _unigramTokenizerFromJson.BeginningOfSentenceToken);
817: Assert.Equal(0, _unigramTokenizerFromJson.BeginningOfSentenceId);
818: Assert.Equal("</s>", _unigramTokenizerFromJson.EndOfSentenceToken);
819: Assert.Equal(2, _unigramTokenizerFromJson.EndOfSentenceId);
830: Assert.Equal(specialTokens, _unigramTokenizerFromJson.SpecialTokens);
831: Assert.Equal(0, _unigramTokenizerFromJson.Vocabulary["<s>"]);
832: Assert.Equal(1, _unigramTokenizerFromJson.Vocabulary["<pad>"]);
833: Assert.Equal(2, _unigramTokenizerFromJson.Vocabulary["</s>"]);
834: Assert.Equal(3, _unigramTokenizerFromJson.Vocabulary["<unk>"]);
835: Assert.Equal(250001, _unigramTokenizerFromJson.Vocabulary["<mask>"]);
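The assertions at source lines 814-835 pin down the special-token layout of the model under test: <s> = 0, <pad> = 1, </s> = 2, <unk> = 3, <mask> = 250001. Reading the same values back through the public properties, under the same SentencePieceTokenizer assumption:

    using Microsoft.ML.Tokenizers;

    internal static class SpecialTokenLayoutSketch
    {
        internal static void Run(SentencePieceTokenizer tokenizer)
        {
            var unk = tokenizer.UnknownToken;             // "<unk>"
            var unkId = tokenizer.UnknownId;              // 3
            var bos = tokenizer.BeginningOfSentenceToken; // "<s>"
            var bosId = tokenizer.BeginningOfSentenceId;  // 0
            var eos = tokenizer.EndOfSentenceToken;       // "</s>"
            var eosId = tokenizer.EndOfSentenceId;        // 2

            // The full vocabulary map is exposed directly.
            var maskId = tokenizer.Vocabulary["<mask>"];  // 250001
        }
    }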