File: src\Microsoft.ML.Tokenizers\Utils\ByteToUnicodeEncoding.cs
Web Access
Project: src\test\Microsoft.ML.Tokenizers.Tests\Microsoft.ML.Tokenizers.Tests.csproj (Microsoft.ML.Tokenizers.Tests)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System.Collections.Generic;
using System.Linq;
namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// Bidirectional map between the 256 possible byte values and Unicode characters,
    /// built so that no byte maps to a whitespace or control character.
    /// </summary>
    /// <remarks>
    /// Bytes in the ranges '!'..'~', '¡'..'¬' and '®'..'ÿ' map to the character with the same
    /// code. Every other byte is assigned, in ascending byte order, the next unused character
    /// starting at U+0100.
    /// </remarks>
    internal sealed class ByteToUnicodeEncoding
    {
        /// <summary>Shared instance of the mapping tables.</summary>
        public static ByteToUnicodeEncoding Instance { get; } = new ByteToUnicodeEncoding();

        /// <summary>
        /// Builds the byte-to-unicode table, its inverse, and the char-to-string cache.
        /// </summary>
        public ByteToUnicodeEncoding()
        {
            // Bytes that are kept as-is: each one maps to the character with the same code.
            var byteToUnicodeMapping = Enumerable.Range('!', '~' - '!' + 1)
                .Concat(Enumerable.Range('¡', '¬' - '¡' + 1))
                .Concat(Enumerable.Range('®', 'ÿ' - '®' + 1))
                .ToDictionary(b => (char)b, b => (char)b);

            const int numChars = 256;

            // Number of bytes that were remapped above the byte range so far.
            var n = 0;

            for (int b = 0; b < numChars; b++)
            {
                if (!byteToUnicodeMapping.ContainsKey((char)b))
                {
                    byteToUnicodeMapping.Add((char)b, (char)(numChars + n));

                    // Advance so that every remapped byte gets a distinct character;
                    // without this the inverse dictionary below would hit duplicate keys.
                    n++;
                }
            }

            ByteToUnicode = byteToUnicodeMapping;
            UnicodeToByte = ByteToUnicode.ToDictionary(kv => kv.Value, kv => kv.Key);

            // Highest mapped character is (numChars + n - 1), so this covers every mapped value.
            int count = numChars + n;

            CharToString = new string[count];
            for (int i = 0; i < count; i++)
            {
                CharToString[i] = ((char)i).ToString();
            }
        }

        /// <summary>Maps a byte value (stored as a <see cref="char"/> in 0..255) to its Unicode character.</summary>
        public IReadOnlyDictionary<char, char> ByteToUnicode { get; }

        /// <summary>Inverse of <see cref="ByteToUnicode"/>: maps a mapped Unicode character back to its byte value.</summary>
        public IReadOnlyDictionary<char, char> UnicodeToByte { get; }

        /// <summary>
        /// Single-character strings indexed by character code; entry <c>i</c> is <c>((char)i).ToString()</c>.
        /// The array covers every character code from 0 up to the highest mapped character.
        /// </summary>
        public string[] CharToString { get; }
    }
}