File: Model\WordPieceOptions.cs
Web Access
Project: src\src\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj (Microsoft.ML.Tokenizers)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System.Collections.Generic;
 
namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// Configuration settings for the WordPiece tokenizer.
    /// </summary>
    /// <remarks>
    /// A newly created instance uses <c>"[UNK]"</c> as the unknown token, <c>"##"</c> as the continuing
    /// sub-word prefix, and 100 as the per-word character limit. The remaining properties start out unset.
    /// </remarks>
    public class WordPieceOptions
    {
        /// <summary>Default value of <see cref="MaxInputCharsPerWord"/>.</summary>
        internal const int DefaultMaxInputCharsPerWord = 100;

        /// <summary>Default value of <see cref="ContinuingSubwordPrefix"/>.</summary>
        internal const string DefaultContinuingSubwordPrefix = "##";

        /// <summary>
        /// Initializes a new instance of the <see cref="WordPieceOptions"/> class with the default settings.
        /// </summary>
        public WordPieceOptions()
        {
            // Defaults are assigned here instead of through property initializers; callers can still
            // override any of them with an object initializer or a plain property assignment.
            UnknownToken = "[UNK]";
            ContinuingSubwordPrefix = DefaultContinuingSubwordPrefix;
            MaxInputCharsPerWord = DefaultMaxInputCharsPerWord;
        }

        /// <summary>
        /// Gets or sets the <see cref="PreTokenizer"/> to use in place of the default pre-tokenizer, if desired.
        /// </summary>
        public PreTokenizer? PreTokenizer { get; set; }

        /// <summary>
        /// Gets or sets the <see cref="Normalizer"/> to use in place of the default normalizer, if desired.
        /// </summary>
        public Normalizer? Normalizer { get; set; }

        /// <summary>
        /// Gets or sets the special tokens to use, as a map from token text to an integer value.
        /// </summary>
        public IReadOnlyDictionary<string, int>? SpecialTokens { get; set; }

        /// <summary>
        /// Gets or sets the unknown token to use. The default is <c>"[UNK]"</c>.
        /// </summary>
        public string UnknownToken { get; set; }

        /// <summary>
        /// Gets or sets the prefix to use for sub-words that are not the first part of a word.
        /// The default is <see cref="DefaultContinuingSubwordPrefix"/> (<c>"##"</c>).
        /// </summary>
        public string ContinuingSubwordPrefix { get; set; }

        /// <summary>
        /// Gets or sets the maximum number of characters to consider for a single word.
        /// The default is <see cref="DefaultMaxInputCharsPerWord"/> (100).
        /// </summary>
        public int MaxInputCharsPerWord { get; set; }
    }
}