File: Text\StringTextDecodingTests.cs
Web Access
Project: src\src\Compilers\Core\CodeAnalysisTest\Microsoft.CodeAnalysis.UnitTests.csproj (Microsoft.CodeAnalysis.UnitTests)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
#nullable disable
 
using System;
using System.IO;
using System.Text;
using System.Threading.Tasks;
using Microsoft.CodeAnalysis.Text;
using Roslyn.Test.Utilities;
using Xunit;
 
namespace Microsoft.CodeAnalysis.UnitTests
{
    public sealed class StringTextDecodingTests : TestBase
    {
        /// <summary>
        /// Turns <paramref name="text"/> into bytes via <c>GetBytesWithPreamble</c> on
        /// <paramref name="writeEncoding"/> and forwards them to the byte-array overload.
        /// </summary>
        /// <param name="text">The text to encode.</param>
        /// <param name="writeEncoding">Encoding used to produce the bytes.</param>
        /// <param name="readEncodingOpt">Encoding handed to the reader; may be null.</param>
        /// <param name="algorithm">Checksum algorithm passed through to the created text.</param>
        /// <returns>The <see cref="SourceText"/> produced by the byte-array overload.</returns>
        private static SourceText CreateMemoryStreamBasedEncodedText(string text, Encoding writeEncoding, Encoding readEncodingOpt, SourceHashAlgorithm algorithm = SourceHashAlgorithm.Sha1)
            => CreateMemoryStreamBasedEncodedText(writeEncoding.GetBytesWithPreamble(text), readEncodingOpt, algorithm);
 
        /// <summary>
        /// Creates a <see cref="SourceText"/> from <paramref name="bytes"/> through a
        /// <see cref="MemoryStream"/> that exposes only a prefix of a larger backing array.
        /// </summary>
        /// <param name="bytes">The encoded content to read.</param>
        /// <param name="readEncodingOpt">Encoding passed to <c>EncodedStringText.Create</c>; may be null.</param>
        /// <param name="algorithm">Checksum algorithm passed to <c>EncodedStringText.Create</c>.</param>
        /// <returns>The text returned by <c>EncodedStringText.Create</c>.</returns>
        private static SourceText CreateMemoryStreamBasedEncodedText(byte[] bytes, Encoding readEncodingOpt, SourceHashAlgorithm algorithm = SourceHashAlgorithm.Sha1)
        {
            // The backing array is intentionally longer than the payload so the test verifies that
            // the implementation honors the stream's bounds instead of consuming the whole array.
            const int slack = 10;
            var padded = new byte[bytes.Length + slack];
            Array.Copy(bytes, padded, bytes.Length);

            using var window = new MemoryStream(padded, index: 0, count: bytes.Length, writable: true, publiclyVisible: true);
            return EncodedStringText.Create(window, readEncodingOpt, algorithm);
        }
 
        /// <summary>
        /// Creates a <see cref="SourceText"/> from <paramref name="bytes"/> through
        /// <c>EncodedStringText.TestAccessor.Create</c>, supplying the encoding factory wrapped in a
        /// <see cref="Lazy{T}"/>. The stream exposes only a prefix of a larger backing array.
        /// </summary>
        /// <param name="bytes">The encoded content to read.</param>
        /// <param name="getEncoding">Factory wrapped in the <see cref="Lazy{T}"/> given to the test accessor.</param>
        /// <param name="readEncodingOpt">Encoding passed to the test accessor; defaults to null.</param>
        /// <param name="algorithm">Checksum algorithm passed to the test accessor.</param>
        /// <returns>The text returned by the test accessor.</returns>
        private static SourceText CreateMemoryStreamBasedEncodedText(byte[] bytes,
            Func<Encoding> getEncoding,
            Encoding readEncodingOpt = null,
            SourceHashAlgorithm algorithm = SourceHashAlgorithm.Sha1)
        {
            // The backing array is intentionally longer than the payload so the test verifies that
            // the implementation honors the stream's bounds instead of consuming the whole array.
            const int slack = 10;
            var padded = new byte[bytes.Length + slack];
            Array.Copy(bytes, padded, bytes.Length);

            using var window = new MemoryStream(padded, index: 0, count: bytes.Length, writable: true, publiclyVisible: true);
            var lazyEncoding = new Lazy<Encoding>(getEncoding);
            return EncodedStringText.TestAccessor.Create(window, lazyEncoding, readEncodingOpt, algorithm, canBeEmbedded: false);
        }
 
        [ConditionalFact(typeof(DesktopOnly))]
        public void ShiftJisGetEncoding()
        {
            // Supplies code page 932 through the encoding-factory overload and checks that the
            // resulting text reports that code page and matches a direct decode of the resource.
            const int shiftJisCodePage = 932;
            var shiftJis = Encoding.GetEncoding(shiftJisCodePage);

            var decoded = CreateMemoryStreamBasedEncodedText(TestResources.General.ShiftJisSource, () => shiftJis);

            Assert.Equal(shiftJisCodePage, decoded.Encoding?.WindowsCodePage);

            var expected = shiftJis.GetString(TestResources.General.ShiftJisSource);
            Assert.Equal(expected, decoded.ToString());
        }
 
        [ConditionalFact(typeof(DesktopOnly))]
        public void ShiftJisFile()
        {
            // Passes code page 932 directly as the read encoding and checks that the resulting
            // text reports that code page and matches a direct decode of the resource.
            const int shiftJisCodePage = 932;
            var shiftJis = Encoding.GetEncoding(shiftJisCodePage);

            var decoded = CreateMemoryStreamBasedEncodedText(TestResources.General.ShiftJisSource, shiftJis);

            Assert.Equal(shiftJisCodePage, decoded.Encoding?.WindowsCodePage);

            var expected = shiftJis.GetString(TestResources.General.ShiftJisSource);
            Assert.Equal(expected, decoded.ToString());
        }
 
        [Fact]
        public void CheckSum002()
        {
            const string input = "The quick brown fox jumps over the lazy dog";

            // Known SHA-1 digest for the sentence above, see http://en.wikipedia.org/wiki/SHA-1
            const string expectedHex = "2fd4e1c6 7a2d28fc ed849ee1 bb76e739 1b93eb12";

            var sourceText = CreateMemoryStreamBasedEncodedText(input, Encoding.ASCII, readEncodingOpt: null);

            Assert.Equal(expectedHex, StringTextTest.ChecksumToHexQuads(sourceText.GetChecksum()));
        }
 
        [Fact]
        public void CheckSum003()
        {
            // Pins the default (SHA-1) checksum of the sentence when written with Encoding.Unicode.
            const string input = "The quick brown fox jumps over the lazy dog";
            const string expectedHex = "9d0047c0 8c84a7ef a55a955e aa3b4aae f62c9c39";

            var sourceText = CreateMemoryStreamBasedEncodedText(input, Encoding.Unicode, readEncodingOpt: null);

            Assert.Equal(expectedHex, StringTextTest.ChecksumToHexQuads(sourceText.GetChecksum()));
        }
 
        [Fact]
        public void CheckSum004()
        {
            // Pins the default (SHA-1) checksum of the sentence when written with Encoding.BigEndianUnicode.
            const string input = "The quick brown fox jumps over the lazy dog";
            const string expectedHex = "72b2beae c76188ac 5b38c16c 4f9d518a 2be0a34c";

            var sourceText = CreateMemoryStreamBasedEncodedText(input, Encoding.BigEndianUnicode, readEncodingOpt: null);

            Assert.Equal(expectedHex, StringTextTest.ChecksumToHexQuads(sourceText.GetChecksum()));
        }
 
        [Fact]
        public void CheckSum006()
        {
            // Known SHA-1 digest of empty input, see http://en.wikipedia.org/wiki/SHA-1
            const string expectedHex = "da39a3ee 5e6b4b0d 3255bfef 95601890 afd80709";

            var sourceText = CreateMemoryStreamBasedEncodedText("", Encoding.ASCII, readEncodingOpt: null);

            Assert.Equal(expectedHex, StringTextTest.ChecksumToHexQuads(sourceText.GetChecksum()));
        }
 
        [Fact]
        public void CheckSum007()
        {
            // Pins the default (SHA-1) checksum of the empty string written with Encoding.Unicode.
            const string expectedHex = "d62636d8 caec13f0 4e28442a 0a6fa1af eb024bbb";

            var sourceText = CreateMemoryStreamBasedEncodedText("", Encoding.Unicode, readEncodingOpt: null);

            Assert.Equal(expectedHex, StringTextTest.ChecksumToHexQuads(sourceText.GetChecksum()));
        }
 
        [Fact]
        public void CheckSum008()
        {
            // Pins the default (SHA-1) checksum of the empty string written with Encoding.BigEndianUnicode.
            const string expectedHex = "26237800 2c95ae7e 29535cb9 f438db21 9adf98f5";

            var sourceText = CreateMemoryStreamBasedEncodedText("", Encoding.BigEndianUnicode, readEncodingOpt: null);

            Assert.Equal(expectedHex, StringTextTest.ChecksumToHexQuads(sourceText.GetChecksum()));
        }
 
        [Fact]
        public void CheckSum_SHA256()
        {
            // Pins the checksum of the empty string written with Encoding.UTF8 when SHA-256 is requested.
            const string expectedHex = "f1945cd6 c19e56b3 c1c78943 ef5ec181 16907a4c a1efc40a 57d48ab1 db7adfc5";

            var sourceText = CreateMemoryStreamBasedEncodedText("", Encoding.UTF8, readEncodingOpt: null, algorithm: SourceHashAlgorithm.Sha256);

            Assert.Equal(expectedHex, StringTextTest.ChecksumToHexQuads(sourceText.GetChecksum()));
        }
 
        [ConditionalFact(typeof(HasEnglishDefaultEncoding))]
        [WorkItem(5663, "https://github.com/dotnet/roslyn/issues/5663")]
        public void Decode_NonUtf8()
        {
            // Expected decoded text: ASCII words followed by extended characters that map to
            // interesting code points in CodePage 1252.
            const string expected = "abc def baz aeiouy \u20ac\u2019\u00a4\u00b6\u00c9\u00db\u00ed\u00ff";

            // The same text encoded in CodePage 1252; this sequence is illegal when decoded as UTF-8.
            byte[] cp1252Bytes =
            {
                0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66, 0x20, 0x62, 0x61, 0x7a, 0x20, 0x61, 0x65, 0x69, 0x6f, 0x75, 0x79, 0x20,
                0x80, 0x92, 0xA4, 0xB6, 0xC9, 0xDB, 0xED, 0xFF
            };

            // UTF-8 without an identifier that throws on invalid bytes instead of substituting.
            var strictUtf8 = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);

            // Step 1: decoding the bytes as strict UTF-8 must fail.
            using (var input = new MemoryStream(cp1252Bytes))
            {
                Action decodeAsUtf8 = () =>
                {
                    EncodedStringText.TestAccessor.Decode(input, strictUtf8, SourceHashAlgorithm.Sha1, throwIfBinaryDetected: false, canBeEmbedded: false);
                };

                Assert.Throws<DecoderFallbackException>(decodeAsUtf8);

                // The failed decode must leave the stream readable.
                Assert.True(input.CanRead);
            }

            // Step 2: encoding detection should correctly pick CodePage 1252.
            using (var input = new MemoryStream(cp1252Bytes))
            {
                var sourceText = EncodedStringText.Create(input);
                Assert.Equal(expected, sourceText.ToString());

                // Check for a complete Encoding implementation.
                var detected = sourceText.Encoding;
                Assert.Equal(1252, detected.CodePage);
                Assert.NotNull(detected.GetEncoder());
                Assert.NotNull(detected.GetDecoder());
                Assert.Equal(2, detected.GetMaxByteCount(1));
                Assert.Equal(1, detected.GetMaxCharCount(1));
                Assert.Equal(expected, detected.GetString(cp1252Bytes));

                Assert.True(input.CanRead);
            }
        }
 
        [Fact]
        public void Decode_Utf8()
        {
            const string expected = "abc def baz aeiouy \u00E4\u00EB\u00EF\u00F6\u00FC\u00FB";

            var strictUtf8 = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
            var encoded = strictUtf8.GetBytesWithPreamble(expected);

            // Encoding detection should correctly pick UTF-8 and leave the stream readable.
            using var input = new MemoryStream(encoded);
            var sourceText = EncodedStringText.Create(input);

            Assert.Equal(expected, sourceText.ToString());
            Assert.Equal(Encoding.UTF8.EncodingName, sourceText.Encoding.EncodingName);
            Assert.True(input.CanRead);
        }
 
        [WorkItem(611805, "http://vstfdevdiv:8080/DevDiv2/DevDiv/_workitems/edit/611805")]
        [Fact]
        public void TestMultithreadedDecoding()
        {
            const string expectedText =
                "\r\n" +
                "class Program\r\n" +
                "{\r\n" +
                "    static void Main()\r\n" +
                "    {\r\n" +
                "        string s = \"class C { \u0410\u0411\u0412 x; }\";\r\n" +
                "        foreach (char ch in s) System.Console.WriteLine(\"{0:x2}\", (int)ch);\r\n" +
                "    }\r\n" +
                "}\r\n";

            const int iterations = 500;

            // Write the text once as UTF-8 (encoder created with the identifier flag off).
            var utf8NoIdentifier = new UTF8Encoding(false);
            byte[] fileBytes = utf8NoIdentifier.GetBytes(expectedText);
            var tempFile = Temp.CreateFile().WriteAllBytes(fileBytes);
            string path = tempFile.Path;

            // Decode the same file from many concurrent readers; every reader must see the expected text.
            var options = new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount * 2 };
            Parallel.For(0, iterations, options, iteration =>
            {
                using var reader = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read);
                var decoded = EncodedStringText.Create(reader);
                Assert.Equal(expectedText, decoded.ToString());
            });
        }
 
        [Fact]
        public void MemoryStreamBasedEncodedText1()
        {
            // UTF-8 with and without the identifier; every variant is used both for writing and reading.
            Encoding[] utf8Variants =
            {
                new UTF8Encoding(encoderShouldEmitUTF8Identifier: false),
                new UTF8Encoding(encoderShouldEmitUTF8Identifier: true),
            };

            for (int w = 0; w < utf8Variants.Length; w++)
            {
                for (int r = 0; r < utf8Variants.Length; r++)
                {
                    var decoded = CreateMemoryStreamBasedEncodedText("goo", utf8Variants[w], utf8Variants[r]);

                    // Each combination must yield exactly one line that is three characters long.
                    Assert.Equal(1, decoded.Lines.Count);
                    Assert.Equal(3, decoded.Lines[0].Span.Length);
                }
            }
        }
 
        [Fact]
        public void MemoryStreamBasedEncodedText2()
        {
            // Encodings used to produce the bytes; all are constructed with their byte order mark / identifier enabled.
            Encoding[] writers =
            {
                new UnicodeEncoding(bigEndian: true, byteOrderMark: true),
                new UnicodeEncoding(bigEndian: false, byteOrderMark: true),
                new UTF8Encoding(encoderShouldEmitUTF8Identifier: true),
            };

            // Encodings handed to the reader, including "none" (null).
            Encoding[] readers =
            {
                new UnicodeEncoding(bigEndian: true, byteOrderMark: true),
                new UnicodeEncoding(bigEndian: false, byteOrderMark: true),
                new UnicodeEncoding(bigEndian: true, byteOrderMark: false),
                new UnicodeEncoding(bigEndian: false, byteOrderMark: false),
                null,
            };

            for (int w = 0; w < writers.Length; w++)
            {
                for (int r = 0; r < readers.Length; r++)
                {
                    var decoded = CreateMemoryStreamBasedEncodedText("goo", writers[w], readers[r]);

                    // Each combination must yield exactly one line that is three characters long.
                    Assert.Equal(1, decoded.Lines.Count);
                    Assert.Equal(3, decoded.Lines[0].Span.Length);
                }
            }
        }
 
        [Fact]
        public void FileStreamEncodedText()
        {
            const string expectedText =
                "\r\n" +
                "class Program\r\n" +
                "{\r\n" +
                "    static void Main()\r\n" +
                "    {\r\n" +
                "        string s = \"class C { \u0410\u0411\u0412 x; }\";\r\n" +
                "        foreach (char ch in s) System.Console.WriteLine(\"{0:x2}\", (int)ch);\r\n" +
                "    }\r\n" +
                "}\r\n";

            Encoding[] fileEncodings =
            {
                new UnicodeEncoding(bigEndian: true, byteOrderMark: true),
                new UnicodeEncoding(bigEndian: false, byteOrderMark: true),
                new UTF8Encoding(encoderShouldEmitUTF8Identifier: true),
            };

            for (int i = 0; i < fileEncodings.Length; i++)
            {
                var fileEncoding = fileEncodings[i];

                // Write the text to a fresh temp file in this encoding, then read it back with no encoding given.
                var tempFile = Temp.CreateFile();
                File.WriteAllText(tempFile.Path, expectedText, fileEncoding);

                using var reader = new FileStream(tempFile.Path, FileMode.Open, FileAccess.Read);
                var decoded = EncodedStringText.Create(reader);

                // The reported code page and the content must match what was written.
                Assert.Equal(fileEncoding.CodePage, decoded.Encoding.CodePage);
                Assert.Equal(expectedText, decoded.ToString());
            }
        }
 
        [Fact]
        public void FileStreamEncodedTextEmpty()
        {
            // A freshly created temp file, with nothing written to it, must produce zero-length text.
            var emptyFile = Temp.CreateFile();

            using var reader = new FileStream(emptyFile.Path, FileMode.Open, FileAccess.Read);
            var decoded = EncodedStringText.Create(reader);

            Assert.Equal(0, decoded.Length);
        }
 
        [ConditionalFact(typeof(HasEnglishDefaultEncoding))]
        [WorkItem(2081, "https://github.com/dotnet/roslyn/issues/2081")]
        [WorkItem(5663, "https://github.com/dotnet/roslyn/issues/5663")]
        public void HorizontalEllipsis()
        {
            // Byte 0x85 is a horizontal ellipsis in CodePage 1252. Decoding it as Latin-1
            // would instead give \u0085 ('NEXT LINE'), which is a line break, so the
            // decoded character must be \u2026.
            var singleByte = new byte[] { 0x85 };

            using var input = new MemoryStream(singleByte);
            var decoded = EncodedStringText.Create(input);

            Assert.Equal('\u2026', decoded[0]);
        }
    }
}