File: src\Workspaces\SharedUtilitiesAndExtensions\Compiler\Core\EmbeddedLanguages\VirtualChars\VirtualChar.cs
Web Access
Project: src\src\CodeStyle\Core\Analyzers\Microsoft.CodeAnalysis.CodeStyle.csproj (Microsoft.CodeAnalysis.CodeStyle)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Text;
using Microsoft.CodeAnalysis.Text;
 
namespace Microsoft.CodeAnalysis.EmbeddedLanguages.VirtualChars;
 
/// <summary>
/// <see cref="VirtualCharGreen"/> provides a uniform view of a language's string token characters regardless if they
/// were written raw in source, or are the production of a language escape sequence.  For example, in C#, in a normal
/// <c>""</c> string a <c>Tab</c> character can be written either as the raw tab character (value <c>9</c> in ASCII),
/// or as <c>\t</c>.  The format is a single character in the source, while the latter is two characters (<c>\</c> and
/// <c>t</c>).  <see cref="VirtualCharGreen"/> will represent both, providing the raw <see cref="char"/> value of
/// <c>9</c> as well as what offset and width within original <see cref="SyntaxToken"/> the character was found at.
/// </summary>
internal readonly record struct VirtualCharGreen
{
    private const int MaxWidth = 12;
    private const int WidthMask = 0b1111; // 4 bits for width (max 10)
    private const int OffsetShift = 4;    // remaining bits for offset
 
    public readonly char Char;
 
    /// <summary>
    /// The offset and width combined into a single integer.  Because the width of a VirtualChar can't be more than
    /// 10 (for <c>\UXXXXXXX</c>), we can store the width in the lower 4 bits, and the offset in the upper 28.
    /// </summary>
    private readonly int _offsetAndWidth;
 
    /// <summary>
    /// Offset in the original token that this character was found at.
    /// </summary>
    public int Offset => _offsetAndWidth >> OffsetShift;
 
    /// <summary>
    /// The width of characters in the original <see cref="SourceText"/> that represent this <see cref="VirtualCharGreen"/>.
    /// This can be as low as 1 (for normal characters) or up to 12 (for escape sequences like <c>\u1234\uABCD</c>).
    /// </summary>
    public int Width => _offsetAndWidth & WidthMask;
 
    public VirtualCharGreen(char ch, int offset, int width)
    {
        Contract.ThrowIfTrue(width > MaxWidth);
 
        if (offset < 0)
            throw new ArgumentException("Offset cannot be negative", nameof(offset));
 
        if (width <= 0)
            throw new ArgumentException("Width must be greater than zero.", nameof(width));
 
        Char = ch;
        _offsetAndWidth = (offset << OffsetShift) | width;
    }
 
    public VirtualCharGreen WithOffset(int offset)
        => new(this.Char, offset, this.Width);
}
 
/// <summary>
/// <see cref="VirtualChar"/> provides a uniform view of a language's string token characters regardless if they
/// were written raw in source, or are the production of a language escape sequence.  For example, in C#, in a
/// normal <c>""</c> string a <c>Tab</c> character can be written either as the raw tab character (value <c>9</c> in
/// ASCII),  or as <c>\t</c>.  The format is a single character in the source, while the latter is two characters
/// (<c>\</c> and <c>t</c>).  <see cref="VirtualChar"/> will represent both, providing the raw <see cref="char"/>
/// value of <c>9</c> as well as what <see cref="TextSpan"/> in the original <see cref="SourceText"/> they occupied.
/// </summary>
/// <remarks>
/// A core consumer of this system is the Regex parser.  That parser wants to work over an array of characters,
/// however this array of characters is not the same as the array of characters a user types into a string in C# or
/// VB. For example In C# someone may write: @"\z".  This should appear to the user the same as if they wrote "\\z"
/// and the same as "\\\u007a".  However, as these all have wildly different presentations for the user, there needs
/// to be a way to map back the characters it sees ( '\' and 'z' ) back to the  ranges of characters the user wrote.
/// </remarks>
internal readonly record struct VirtualChar
{
    public VirtualChar(VirtualCharGreen green, int tokenStart)
    {
        if (tokenStart < 0)
            throw new ArgumentException("Token start must be non-negative", nameof(tokenStart));
        Green = green;
        TokenStart = tokenStart;
    }
 
    internal VirtualCharGreen Green { get; }
    internal int TokenStart { get; }
 
    public static implicit operator char(VirtualChar ch)
        => ch.Value;
 
    /// <inheritdoc cref="VirtualCharGreen.Char"/>
    public char Value => Green.Char;
 
    public TextSpan Span => new(TokenStart + Green.Offset, Green.Width);
 
    /// <inheritdoc/>
    public override string ToString()
        => Value.ToString();
 
    #region equality
 
    public static bool operator ==(VirtualChar ch1, char ch2)
        => ch1.Green.Char == ch2;
 
    public static bool operator !=(VirtualChar ch1, char ch2)
        => !(ch1 == ch2);
 
    #endregion
}