// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Immutable;
using System.Threading;
using Microsoft.CodeAnalysis.Collections;
using Microsoft.CodeAnalysis.EmbeddedLanguages.VirtualChars;
using Microsoft.CodeAnalysis.PooledObjects;
using Microsoft.CodeAnalysis.Text;

/// <summary>
/// Helpers for working with <see cref="VirtualChar"/>s that come from a string containing test markup.
/// </summary>
internal static class VirtualCharUtilities
{
    /// <summary>
    /// Creates the <see cref="TextSpan"/> that starts where <paramref name="vc1"/> starts and ends where
    /// <paramref name="vc2"/> ends.
    /// </summary>
    /// <param name="vc1">The virtual char whose span start becomes the start of the result.</param>
    /// <param name="vc2">The virtual char whose span end becomes the end of the result.</param>
    /// <returns>The span covering <paramref name="vc1"/> through <paramref name="vc2"/>.</returns>
    public static TextSpan FromBounds(VirtualChar vc1, VirtualChar vc2)
        => TextSpan.FromBounds(vc1.Span.Start, vc2.Span.End);

    /// <summary>
    /// Takes a list of <see cref="VirtualChar"/>s and returns the same characters from it, without any characters
    /// corresponding to test markup (e.g. <c>$$</c> and the like). Because the virtual chars contain their
    /// original text span, these final virtual chars can be used both as the underlying source of a <see
    /// cref="SourceText"/> (which only cares about their <see cref="char"/> value), as well as the way to then map
    /// positions/spans within that <see cref="SourceText"/> to actual full virtual char spans in the original
    /// document for classification.
    /// </summary>
    /// <remarks>
    /// The markup sequences recognized are <c>$$</c>, <c>[|</c>, <c>|]</c>, <c>|}</c> and <c>{|</c> followed
    /// (anywhere later in the input) by a <c>:</c>.
    /// </remarks>
    /// <param name="virtualChars">The characters to scan. The builder is only read, never modified.</param>
    /// <param name="cancellationToken">Token observed on every iteration of the scan and once more before
    /// returning.</param>
    /// <returns>
    /// <c>sourceCode</c>: the input characters with all recognized markup removed, in their original order.
    /// <c>markdownSpans</c>: the spans of the removed markup sequences, in the order they were encountered.
    /// </returns>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was
    /// cancelled.</exception>
    public static (ImmutableSegmentedList<VirtualChar> sourceCode, ImmutableArray<TextSpan> markdownSpans) StripMarkupCharacters(
        ArrayBuilder<VirtualChar> virtualChars, CancellationToken cancellationToken)
    {
        using var _ = ArrayBuilder<TextSpan>.GetInstance(out var markdownSpans);
        var builder = ImmutableSegmentedList.CreateBuilder<VirtualChar>();

        // Number of currently open `[|` and `{|name:` spans.  Only used to disambiguate `[|]` and `[|}`.
        var nestedAnonymousSpanCount = 0;
        var nestedNamedSpanCount = 0;

        for (int i = 0, n = virtualChars.Count; i < n;)
        {
            // Poll for cancellation while scanning, rather than only after all of the work has been done.
            cancellationToken.ThrowIfCancellationRequested();

            // Look at the current character and the one after it.  Past the end of the input the lookahead is
            // `default`, which matches none of the markup patterns below.
            var vc1 = virtualChars[i];
            var vc2 = i + 1 < n ? virtualChars[i + 1] : default;

            // NOTE(review): the caller (RegisterClassifications) is said to disallow virtual chars whose Value
            // doesn't fit in a char, so comparing Value against char constants is sufficient here -- confirm.
            //
            // TODO: this algorithm is not actually the one used in roslyn or the roslyn-sdk for parsing a
            // markup file.  For example it will get `[|]` wrong (as that depends on knowing if we're starting
            // or ending an existing span).  Fix this up to follow the actual algorithm we use.
            switch ((vc1.Value, vc2.Value))
            {
                case ('$', '$'):
                    markdownSpans.Add(FromBounds(vc1, vc2));
                    i += 2;
                    continue;

                case ('|', ']'):
                    // Clamp at zero so an unbalanced close doesn't make the count negative.
                    nestedAnonymousSpanCount = Math.Max(0, nestedAnonymousSpanCount - 1);
                    markdownSpans.Add(FromBounds(vc1, vc2));
                    i += 2;
                    continue;

                case ('|', '}'):
                    markdownSpans.Add(FromBounds(vc1, vc2));
                    nestedNamedSpanCount = Math.Max(0, nestedNamedSpanCount - 1);
                    i += 2;
                    continue;

                // We have a slight ambiguity with cases like these:
                //
                //      [|]    [|}
                //
                // Is it starting a new match, or ending an existing match?  As a workaround, we special case
                // these and consider it ending a match if we have something on the stack already.
                case ('[', '|'):
                    var vc3 = i + 2 < n ? virtualChars[i + 2] : default;
                    if ((vc3 == ']' && nestedAnonymousSpanCount > 0) ||
                        (vc3 == '}' && nestedNamedSpanCount > 0))
                    {
                        // Not the start of a span, don't classify this '[' specially.  It is emitted as a
                        // normal character, and the following `|]` / `|}` is handled on the next iteration.
                        break;
                    }

                    nestedAnonymousSpanCount++;
                    markdownSpans.Add(FromBounds(vc1, vc2));
                    i += 2;
                    continue;

                case ('{', '|'):
                    if (TryConsumeNamedSpanStart(ref i, n))
                    {
                        continue;
                    }

                    // Didn't find the colon.  Don't classify these specially.
                    break;
            }

            // Nothing special, add character as is.
            builder.Add(vc1);
            i++;
        }

        // Retained from the original so that an already-cancelled token is still observed when the input is
        // empty (the loop body never runs in that case).
        cancellationToken.ThrowIfCancellationRequested();
        return (builder.ToImmutable(), markdownSpans.ToImmutableAndClear());

        // Called with `i` positioned on the `{` of a `{|`.  Scans forward for the first ':'.  If one is found,
        // everything from the `{` through that ':' is recorded as a single markup span, the named-span count
        // is bumped, `i` is moved just past the ':' and true is returned.  Otherwise nothing is changed and
        // false is returned.
        bool TryConsumeNamedSpanStart(ref int i, int n)
        {
            var start = i;
            var seekPoint = i;
            while (seekPoint < n)
            {
                var colonChar = virtualChars[seekPoint];
                if (colonChar == ':')
                {
                    markdownSpans.Add(FromBounds(virtualChars[start], colonChar));
                    nestedNamedSpanCount++;
                    i = seekPoint + 1;
                    return true;
                }

                seekPoint++;
            }

            return false;
        }
    }
}