File: Model\Phi2Tokenizer.cs
Project: src\src\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj (Microsoft.ML.Tokenizers)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections.Generic;
using System.IO;
 
namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// Represents the Byte Pair Encoding model.
    /// Implements the Phi2 tokenizer described at https://huggingface.co/microsoft/phi-2.
    /// </summary>
    public sealed class Phi2Tokenizer : CodeGenTokenizer
    {
        /// <summary>
        /// Initializes a new instance of the <see cref="Phi2Tokenizer"/> class from the vocabulary and merges file paths.
        /// </summary>
        /// <param name="vocabularyPath">The JSON file path containing the dictionary of string keys and their ids.</param>
        /// <param name="mergePath">The file path containing the tokens's pairs list.</param>
        /// <param name="preTokenizer">The pre-tokenizer to use.</param>
        /// <param name="normalizer">The normalizer to use.</param>
        /// <param name="specialTokens">The dictionary mapping special tokens to Ids.</param>
        /// <param name="addPrefixSpace">Indicate whether to include a leading space before encoding the text.</param>
        /// <param name="addBeginningOfSentence">Indicate whether to include the beginning of sentence token in the encoding.</param>
        /// <param name="addEndOfSentence">Indicate whether to include the end of sentence token in the encoding.</param>
        /// <param name="unknownToken">The unknown token.</param>
        /// <param name="beginningOfSentenceToken">The beginning of sentence token.</param>
        /// <param name="endOfSentenceToken">The end of sentence token.</param>
        internal Phi2Tokenizer(
                string vocabularyPath,
                string mergePath,
                PreTokenizer? preTokenizer = null,
                Normalizer? normalizer = null,
                IReadOnlyDictionary<string, int>? specialTokens = null,
                bool addPrefixSpace = false,
                bool addBeginningOfSentence = false,
                bool addEndOfSentence = false,
                string? unknownToken = DefaultSpecialToken,
                string? beginningOfSentenceToken = DefaultSpecialToken,
                string? endOfSentenceToken = DefaultSpecialToken) :
                    base(vocabularyPath, mergePath, preTokenizer, normalizer, specialTokens, addPrefixSpace, addBeginningOfSentence,
                    addEndOfSentence, unknownToken, beginningOfSentenceToken, endOfSentenceToken)
        {
        }
 
        /// <summary>
        /// Initializes a new instance of the <see cref="Phi2Tokenizer"/> class from the vocabulary and merges streams.
        /// </summary>
        /// <param name="vocabularyStream">The stream of a JSON file containing the dictionary of string keys and their ids.</param>
        /// <param name="mergeStream">The stream of a file containing the tokens's pairs list.</param>
        /// <param name="preTokenizer">The pre-tokenizer to use.</param>
        /// <param name="normalizer">The normalizer to use.</param>
        /// <param name="specialTokens">The additional tokens to add to the vocabulary.</param>
        /// <param name="addPrefixSpace">Indicate whether to include a leading space before encoding the text.</param>
        /// <param name="addBeginningOfSentence">Indicate whether to include the beginning of sentence token in the encoding.</param>
        /// <param name="addEndOfSentence">Indicate whether to include the end of sentence token in the encoding.</param>
        /// <param name="unknownToken">The unknown token.</param>
        /// <param name="beginningOfSentenceToken">The beginning of sentence token.</param>
        /// <param name="endOfSentenceToken">The end of sentence token.</param>
        internal Phi2Tokenizer(
                Stream vocabularyStream,
                Stream mergeStream,
                PreTokenizer? preTokenizer = null,
                Normalizer? normalizer = null,
                IReadOnlyDictionary<string, int>? specialTokens = null,
                bool addPrefixSpace = false,
                bool addBeginningOfSentence = false,
                bool addEndOfSentence = false,
                string? unknownToken = DefaultSpecialToken,
                string? beginningOfSentenceToken = DefaultSpecialToken,
                string? endOfSentenceToken = DefaultSpecialToken) :
                base(vocabularyStream, mergeStream, preTokenizer, normalizer, specialTokens, addPrefixSpace, addBeginningOfSentence,
                    addEndOfSentence, unknownToken, beginningOfSentenceToken, endOfSentenceToken)
        {
        }
 
        /// <summary>
        /// Creates a Phi2 tokenizer, a CodeGen-style BPE tokenizer, from the given vocab and merges streams.
        /// </summary>
        /// <param name="vocabStream">The stream containing the vocab file.</param>
        /// <param name="mergesStream">The stream containing the merges file.</param>
        /// <param name="addPrefixSpace">Indicate whether to add a space before the token.</param>
        /// <param name="addBeginOfSentence">Indicate emitting the beginning of sentence token during the encoding.</param>
        /// <param name="addEndOfSentence">Indicate emitting the end of sentence token during the encoding.</param>
        /// <returns>The Phi2 tokenizer object.</returns>
        /// <remarks>
        /// The tokenizer will be created according to the configuration specified in https://huggingface.co/microsoft/phi-2/raw/main/tokenizer.json.
        /// It is important to provide vocab and merges files similar to the ones used when training the model.
        /// The vocab and merges files can be downloaded from the following links:
        ///     https://huggingface.co/microsoft/phi-2/resolve/main/vocab.json?download=true
        ///     https://huggingface.co/microsoft/phi-2/resolve/main/merges.txt?download=true
        /// When creating the tokenizer, ensure that the vocabulary stream is sourced from a trusted provider.
        /// </remarks>
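        /// <example>
        /// A minimal usage sketch, assuming the vocab.json and merges.txt files linked
        /// above have been downloaded to the working directory:
        /// <code>
        /// using Stream vocabStream = File.OpenRead("vocab.json");
        /// using Stream mergesStream = File.OpenRead("merges.txt");
        /// Phi2Tokenizer tokenizer = Phi2Tokenizer.Create(vocabStream, mergesStream);
        /// var ids = tokenizer.EncodeToIds("Hello, Phi-2!");
        /// </code>
        /// </example>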
        public static new Phi2Tokenizer Create(
            Stream vocabStream,
            Stream mergesStream,
            bool addPrefixSpace = false,
            bool addBeginOfSentence = false,
            bool addEndOfSentence = false)
        {
            if (vocabStream is null)
            {
                throw new ArgumentNullException(nameof(vocabStream));
            }
 
            if (mergesStream is null)
            {
                throw new ArgumentNullException(nameof(mergesStream));
            }
 
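            // Per the Phi-2 tokenizer.json configuration referenced above: the p50k_base
            // regex performs the GPT-2 style pre-tokenization split, and the CodeGen
            // special tokens (such as <|endoftext|>) are registered so they are matched
            // as single tokens rather than being split by the BPE merges.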
            return new Phi2Tokenizer(
                        vocabStream, mergesStream, new RegexPreTokenizer(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens), normalizer: null,
                        CodeGenTokenizer.CodeGenSpecialTokens, addPrefixSpace: addPrefixSpace, addBeginningOfSentence: addBeginOfSentence, addEndOfSentence: addEndOfSentence);
        }
    }
}