File: Utils\ByteToUnicodeEncoding.cs
Web Access
Project: src\src\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj (Microsoft.ML.Tokenizers)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System.Collections.Generic;
using System.Linq;
 
namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// Reversible mapping between byte values (0-255) and Unicode characters, built so that bytes
    /// are not represented by whitespace/control characters.
    /// </summary>
    /// <remarks>
    /// Bytes in the ranges '!'..'~', '¡'..'¬' and '®'..'ÿ' map to the character with the same code point.
    /// Every other byte is assigned, in ascending byte order, the next unused code point starting at 256.
    /// </remarks>
    internal sealed class ByteToUnicodeEncoding
    {
        /// <summary>Shared instance, created by the static property initializer.</summary>
        public static ByteToUnicodeEncoding Instance { get; } = new ByteToUnicodeEncoding();

        /// <summary>Maps a byte value (held in a <see cref="char"/>) to its Unicode stand-in.</summary>
        public IReadOnlyDictionary<char, char> ByteToUnicode { get; }

        /// <summary>Inverse of <see cref="ByteToUnicode"/>: maps a stand-in character back to its byte value.</summary>
        public IReadOnlyDictionary<char, char> UnicodeToByte { get; }

        /// <summary>
        /// Single-character strings indexed by code point, from 0 up to and including the highest
        /// code point produced by <see cref="ByteToUnicode"/>.
        /// </summary>
        public string[] CharToString { get; }

        /// <summary>
        /// Builds the forward and reverse byte/character tables and the per-character string table.
        /// </summary>
        public ByteToUnicodeEncoding()
        {
            const int byteValueCount = 256;

            // Code points that stand for themselves, in the order they are inserted into the map.
            IEnumerable<int> selfMappedCodePoints =
                Enumerable.Range('!', '~' - '!' + 1)
                    .Concat(Enumerable.Range('¡', '¬' - '¡' + 1))
                    .Concat(Enumerable.Range('®', 'ÿ' - '®' + 1));

            var forward = new Dictionary<char, char>();
            foreach (int codePoint in selfMappedCodePoints)
            {
                char symbol = (char)codePoint;
                forward.Add(symbol, symbol);
            }

            // Remaining bytes are shifted past the byte range: the first one becomes 256, the next 257, ...
            int shiftedCount = 0;
            for (int value = 0; value < byteValueCount; value++)
            {
                char key = (char)value;
                if (forward.ContainsKey(key))
                {
                    continue;
                }

                forward.Add(key, (char)(byteValueCount + shiftedCount));
                shiftedCount++;
            }

            // The reverse table is filled in the same order the forward table enumerates.
            var backward = new Dictionary<char, char>();
            foreach (KeyValuePair<char, char> pair in forward)
            {
                backward.Add(pair.Value, pair.Key);
            }

            ByteToUnicode = forward;
            UnicodeToByte = backward;

            // One cached string per code point below (256 + number of shifted bytes).
            CharToString = Enumerable.Range(0, byteValueCount + shiftedCount)
                .Select(codePoint => ((char)codePoint).ToString())
                .ToArray();
        }
    }
}