File: Phi3\Phi3TokenizerHelper.cs
Web Access
Project: src\src\Microsoft.ML.GenAI.Phi\Microsoft.ML.GenAI.Phi.csproj (Microsoft.ML.GenAI.Phi)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.ML.Tokenizers;
 
/// <summary>
/// The utility class to create tokenizer for phi-3 model.
/// </summary>
public class Phi3TokenizerHelper
{
    // Default chat-template marker strings registered as special tokens.
    private const string SystemSymbol = "<|system|>";
    private const string UserSymbol = "<|user|>";
    private const string AssistantSymbol = "<|assistant|>";
    private const string EndSymbol = "<|end|>";

    // Default token ids paired with the marker strings above.
    private const int SystemSymbolId = 32006;
    private const int UserSymbolId = 32010;
    private const int AssistantSymbolId = 32001;
    private const int EndSymbolId = 32007;

    /// <summary>
    /// Creates a <see cref="LlamaTokenizer"/> from the tokenizer model file at
    /// <paramref name="modelPath"/>, registering the system, user, assistant and end
    /// markers as special tokens.
    /// </summary>
    /// <param name="modelPath">Path of the tokenizer model file; it is opened for reading.</param>
    /// <param name="systemSymbol">Text of the system marker token.</param>
    /// <param name="userSymbol">Text of the user marker token.</param>
    /// <param name="assistantSymbol">Text of the assistant marker token.</param>
    /// <param name="endSymbol">Text of the end marker token.</param>
    /// <param name="systemSymbolId">Token id mapped to <paramref name="systemSymbol"/>.</param>
    /// <param name="userSymbolId">Token id mapped to <paramref name="userSymbol"/>.</param>
    /// <param name="assistantSymbolId">Token id mapped to <paramref name="assistantSymbol"/>.</param>
    /// <param name="endSymbolId">Token id mapped to <paramref name="endSymbol"/>.</param>
    /// <param name="addPrecedingSpace">
    /// Forwarded as the second positional argument of <c>LlamaTokenizer.Create</c>.
    /// NOTE(review): confirm that this positional slot really has the meaning this
    /// parameter name suggests in the referenced Microsoft.ML.Tokenizers version.
    /// </param>
    /// <returns>The tokenizer produced by <c>LlamaTokenizer.Create</c>.</returns>
    /// <remarks>
    /// The four symbols are used as dictionary keys, so they must be distinct from each
    /// other; a duplicate makes the dictionary initializer throw <see cref="ArgumentException"/>.
    /// </remarks>
    public static LlamaTokenizer FromPretrained(
        string modelPath,
        string systemSymbol = SystemSymbol,
        string userSymbol = UserSymbol,
        string assistantSymbol = AssistantSymbol,
        string endSymbol = EndSymbol,
        int systemSymbolId = SystemSymbolId,
        int userSymbolId = UserSymbolId,
        int assistantSymbolId = AssistantSymbolId,
        int endSymbolId = EndSymbolId,
        bool addPrecedingSpace = true)
    {
        // Dispose the file stream when this method exits (normally or via an exception)
        // so the file handle is not leaked. This relies on the tokenizer having finished
        // reading the model by the time Create returns.
        using var modelStream = File.OpenRead(modelPath);

        var specialTokens = new Dictionary<string, int>
        {
            { systemSymbol, systemSymbolId },
            { userSymbol, userSymbolId },
            { assistantSymbol, assistantSymbolId },
            { endSymbol, endSymbolId }
        };

        return LlamaTokenizer.Create(
            modelStream,
            addPrecedingSpace,
            specialTokens: specialTokens);
    }
}