File: BuildCheck\Infrastructure\EditorConfig\EditorConfigGlobsMatcher.cs
Web Access
Project: ..\..\..\src\Build\Microsoft.Build.csproj (Microsoft.Build)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
// Note:
// Copied from https://github.com/dotnet/roslyn/blob/06d3f153ed6af6f2b78028a1e1e6ecc55c8ff101/src/Compilers/Core/Portable/CommandLine/AnalyzerConfig.SectionNameMatching.cs
// with some changes to make it quicker to integrate into the MSBuild.
// Changes:
//  1. ArrayBuilder was replaced with List.
//  2. Exceptions. Wrap in try/catch blocks for proper reporting
 
 
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
 
namespace Microsoft.Build.Experimental.BuildCheck.Infrastructure.EditorConfig;
 
internal class EditorConfigGlobsMatcher
{
    internal readonly struct SectionNameMatcher
    {
        private readonly ImmutableArray<(int minValue, int maxValue)> _numberRangePairs;
 
        internal Regex Regex { get; }
 
        internal SectionNameMatcher(
            Regex regex,
            ImmutableArray<(int minValue, int maxValue)> numberRangePairs)
        {
            Debug.Assert(regex.GetGroupNumbers().Length - 1 == numberRangePairs.Length);
            Regex = regex;
            _numberRangePairs = numberRangePairs;
        }
 
        internal bool IsMatch(string s)
        {
            if (_numberRangePairs.IsEmpty)
            {
                return Regex.IsMatch(s);
            }
 
            var match = Regex.Match(s);
            if (!match.Success)
            {
                return false;
            }
 
            Debug.Assert(match.Groups.Count - 1 == _numberRangePairs.Length);
            for (int i = 0; i < _numberRangePairs.Length; i++)
            {
                var (minValue, maxValue) = _numberRangePairs[i];
                // Index 0 is the whole regex
                if (!int.TryParse(match.Groups[i + 1].Value, out int matchedNum) ||
                    matchedNum < minValue ||
                    matchedNum > maxValue)
                {
                    return false;
                }
            }
            return true;
        }
    }
 
    /// <summary>
    /// Takes a <see cref="EditorConfigFile.Section.Name"/> and creates a matcher that
    /// matches the given language. Returns null if the section name is
    /// invalid.
    /// </summary>
    internal static SectionNameMatcher? TryCreateSectionNameMatcher(string sectionName)
    {
        // An editorconfig section name is a language for recognizing file paths
        // defined by the following grammar:
        //
        // <path> ::= <path-list>
        // <path-list> ::= <path-item> | <path-item> <path-list>
        // <path-item> ::= "*"  | "**" | "?" | <char> | <choice> | <range>
        // <char> ::= any unicode character
        // <choice> ::= "{" <choice-list> "}"
        // <choice-list> ::= <path-list> | <path-list> "," <choice-list>
        // <range> ::= "{" <integer> ".." <integer> "}"
        // <integer> ::= "-" <digit-list> | <digit-list>
        // <digit-list> ::= <digit> | <digit> <digit-list>
        // <digit> ::= 0-9
 
        var sb = new StringBuilder();
        sb.Append('^');
 
        // EditorConfig matching depends on the whether or not there are
        // directory separators and where they are located in the section
        // name. Specifically, the editorconfig core parser says:
        // https://github.com/editorconfig/editorconfig-core-c/blob/5d3996811e962a717a7d7fdd0a941192382241a7/src/lib/editorconfig.c#L231
        //
        //     Pattern would be:
        //     /dir/of/editorconfig/file[double_star]/[section] if section does not contain '/',
        //     /dir/of/editorconfig/file[section] if section starts with a '/', or
        //     /dir/of/editorconfig/file/[section] if section contains '/' but does not start with '/'.
 
        if (!sectionName.Contains("/"))
        {
            sb.Append(".*/");
        }
        else if (sectionName[0] != '/')
        {
            sb.Append('/');
        }
 
        var lexer = new SectionNameLexer(sectionName);
        var numberRangePairs = new List<(int minValue, int maxValue)>();
        if (!TryCompilePathList(ref lexer, sb, parsingChoice: false, numberRangePairs))
        {
            numberRangePairs.Clear();
            return null;
        }
        sb.Append('$');
 
 
        var imArray = ImmutableArray.CreateBuilder<(int, int)>(numberRangePairs is null ? 0 : numberRangePairs.Count);
        if (numberRangePairs?.Count > 0)
        {
            imArray.AddRange(numberRangePairs);
        }
 
        return new SectionNameMatcher(
            new Regex(sb.ToString(), RegexOptions.Compiled),
            imArray.ToImmutableArray());
    }
 
    internal static string UnescapeSectionName(string sectionName)
    {
        var sb = new StringBuilder();
        SectionNameLexer lexer = new SectionNameLexer(sectionName);
        while (!lexer.IsDone)
        {
            var tokenKind = lexer.Lex();
            if (tokenKind == TokenKind.SimpleCharacter)
            {
                sb.Append(lexer.EatCurrentCharacter());
            }
            else
            {
                // We only call this on strings that were already passed through IsAbsoluteEditorConfigPath, so
                // we shouldn't have any other token kinds here.
                throw new BuildCheckConfigurationException($"UnexpectedToken: {tokenKind}", BuildCheckConfigurationErrorScope.EditorConfigParser);
            }
        }
        return sb.ToString();
    }
 
    internal static bool IsAbsoluteEditorConfigPath(string sectionName)
    {
        // NOTE: editorconfig paths must use '/' as a directory separator character on all OS.
 
        // on all unix systems this is thus a simple test: does the path start with '/'
        // and contain no special chars?
 
        // on windows, a path can be either drive rooted or not (e.g. start with 'c:' or just '')
        // in addition to being absolute or relative.
        // for example c:myfile.cs is a relative path, but rooted on drive c:
        // /myfile2.cs is an absolute path but rooted to the current drive.
 
        // in addition there are UNC paths and volume guids (see https://docs.microsoft.com/en-us/dotnet/standard/io/file-path-formats)
        // but these start with \\ (and thus '/' in editor config terminology)
 
        // in this implementation we choose to ignore the drive root for the purposes of
        // determining validity. On windows c:/file.cs and /file.cs are both assumed to be
        // valid absolute paths, even though the second one is technically relative to
        // the current drive of the compiler working directory. 
 
        // Note that this check has no impact on config correctness. Files on windows
        // will still be compared using their full path (including drive root) so it's
        // not possible to target the wrong file. It's just possible that the user won't
        // receive a warning that this section is ignored on windows in this edge case.
 
        SectionNameLexer nameLexer = new SectionNameLexer(sectionName);
        bool sawStartChar = false;
        int logicalIndex = 0;
        while (!nameLexer.IsDone)
        {
            if (nameLexer.Lex() != TokenKind.SimpleCharacter)
            {
                return false;
            }
            var simpleChar = nameLexer.EatCurrentCharacter();
 
            // check the path starts with '/'
            if (logicalIndex == 0)
            {
                if (simpleChar == '/')
                {
                    sawStartChar = true;
                }
                else if (Path.DirectorySeparatorChar == '/')
                {
                    return false;
                }
            }
            // on windows we get a second chance to find the start char
            else if (!sawStartChar && Path.DirectorySeparatorChar == '\\')
            {
                if (logicalIndex == 1 && simpleChar != ':')
                {
                    return false;
                }
                else if (logicalIndex == 2)
                {
                    if (simpleChar != '/')
                    {
                        return false;
                    }
                    else
                    {
                        sawStartChar = true;
                    }
                }
            }
            logicalIndex++;
        }
        return sawStartChar;
    }
 
 
    /// <summary>
    /// <![CDATA[
    /// <path-list> ::= <path-item> | <path-item> <path-list>
    /// <path-item> ::= "*"  | "**" | "?" | <char> | <choice> | <range>
    /// <char> ::= any unicode character
    /// <choice> ::= "{" <choice-list> "}"
    /// <choice-list> ::= <path-list> | <path-list> "," <choice-list>
    /// ]]>
    /// </summary>
    private static bool TryCompilePathList(
        ref SectionNameLexer lexer,
        StringBuilder sb,
        bool parsingChoice,
        List<(int minValue, int maxValue)> numberRangePairs)
    {
        while (!lexer.IsDone)
        {
            var tokenKind = lexer.Lex();
            switch (tokenKind)
            {
                case TokenKind.BadToken:
                    // Parsing failure
                    return false;
                case TokenKind.SimpleCharacter:
                    // Matches just this character
                    sb.Append(Regex.Escape(lexer.EatCurrentCharacter().ToString()));
                    break;
                case TokenKind.Question:
                    // '?' matches any single character
                    sb.Append('.');
                    break;
                case TokenKind.Star:
                    // Matches any string of characters except directory separator
                    // Directory separator is defined in editorconfig spec as '/'
                    sb.Append("[^/]*");
                    break;
                case TokenKind.StarStar:
                    // Matches any string of characters
                    sb.Append(".*");
                    break;
                case TokenKind.OpenCurly:
                    // Back up token stream. The following helpers all expect a '{'
                    lexer.Position--;
                    // This is ambiguous between {num..num} and {item1,item2}
                    // We need to look ahead to disambiguate. Looking for {num..num}
                    // is easier because it can't be recursive.
                    (string numStart, string numEnd)? rangeOpt = TryParseNumberRange(ref lexer);
                    if (rangeOpt is null)
                    {
                        // Not a number range. Try a choice expression
                        if (!TryCompileChoice(ref lexer, sb, numberRangePairs))
                        {
                            return false;
                        }
                        // Keep looping. There may be more after the '}'.
                        break;
                    }
                    else
                    {
                        (string numStart, string numEnd) = rangeOpt.GetValueOrDefault();
                        if (int.TryParse(numStart, out var intStart) && int.TryParse(numEnd, out var intEnd))
                        {
                            var pair = intStart < intEnd ? (intStart, intEnd) : (intEnd, intStart);
                            numberRangePairs.Add(pair);
                            // Group allowing any digit sequence. The validity will be checked outside of the regex
                            sb.Append("(-?[0-9]+)");
                            // Keep looping
                            break;
                        }
                        return false;
                    }
                case TokenKind.CloseCurly:
                    // Either the end of a choice, or a failed parse
                    return parsingChoice;
                case TokenKind.Comma:
                    // The end of a choice section, or a failed parse
                    return parsingChoice;
                case TokenKind.OpenBracket:
                    sb.Append('[');
                    if (!TryCompileCharacterClass(ref lexer, sb))
                    {
                        return false;
                    }
                    break;
                default:
                    throw new BuildCheckConfigurationException($"UnexpectedToken: {tokenKind}", BuildCheckConfigurationErrorScope.EditorConfigParser);
            }
        }
        // If we're parsing a choice we should not exit without a closing '}'
        return !parsingChoice;
    }
 
    /// <summary>
    /// Compile a globbing character class of the form [...]. Returns true if
    /// the character class was successfully compiled. False if there was a syntax
    /// error. The starting character is expected to be directly after the '['.
    /// </summary>
    private static bool TryCompileCharacterClass(ref SectionNameLexer lexer, StringBuilder sb)
    {
        // [...] should match any of the characters in the brackets, with special
        // behavior for four characters: '!' immediately after the opening bracket
        // implies the negation of the character class, '-' implies matching
        // between the locale-dependent range of the previous and next characters,
        // '\' escapes the following character, and ']' ends the range
        if (!lexer.IsDone && lexer.CurrentCharacter == '!')
        {
            sb.Append('^');
            lexer.Position++;
        }
        while (!lexer.IsDone)
        {
            var currentChar = lexer.EatCurrentCharacter();
            switch (currentChar)
            {
                case '-':
                    // '-' means the same thing in regex as it does in the glob, so
                    // put it in verbatim
                    sb.Append(currentChar);
                    break;
 
                case '\\':
                    // Escape the next char
                    if (lexer.IsDone)
                    {
                        return false;
                    }
                    sb.Append('\\');
                    sb.Append(lexer.EatCurrentCharacter());
                    break;
 
                case ']':
                    sb.Append(currentChar);
                    return true;
 
                default:
                    sb.Append(Regex.Escape(currentChar.ToString()));
                    break;
            }
        }
        // Stream ended without a closing bracket
        return false;
    }
 
    /// <summary>
    /// Parses choice defined by the following grammar:
    /// <![CDATA[
    /// <choice> ::= "{" <choice-list> "}"
    /// <choice-list> ::= <path-list> | <path-list> "," <choice-list>
    /// ]]>
    /// </summary>
    private static bool TryCompileChoice(
        ref SectionNameLexer lexer,
        StringBuilder sb,
        List<(int, int)> numberRangePairs)
    {
        if (lexer.Lex() != TokenKind.OpenCurly)
        {
            return false;
        }
 
        // Start a non-capturing group for the choice
        sb.Append("(?:");
 
        // We start immediately after a '{'
        // Try to compile the nested <path-list>
        while (TryCompilePathList(ref lexer, sb, parsingChoice: true, numberRangePairs))
        {
            // If we've successfully compiled a <path-list> the last token should
            // have been a ',' or a '}'
            char lastChar = lexer[lexer.Position - 1];
            if (lastChar == ',')
            {
                // Another option
                sb.Append('|');
            }
            else if (lastChar == '}')
            {
                // Close out the capture group
                sb.Append(')');
                return true;
            }
            else
            {
                throw new BuildCheckConfigurationException($"UnexpectedValue: {lastChar}", BuildCheckConfigurationErrorScope.EditorConfigParser);
            }
        }
 
        // Propagate failure
        return false;
    }
 
    /// <summary>
    /// Parses range defined by the following grammar.
    /// <![CDATA[
    /// <range> ::= "{" <integer> ".." <integer> "}"
    /// <integer> ::= "-" <digit-list> | <digit-list>
    /// <digit-list> ::= <digit> | <digit> <digit-list>
    /// <digit> ::= 0-9
    /// ]]>
    /// </summary>
    private static (string numStart, string numEnd)? TryParseNumberRange(ref SectionNameLexer lexer)
    {
        var saved = lexer.Position;
        if (lexer.Lex() != TokenKind.OpenCurly)
        {
            lexer.Position = saved;
            return null;
        }
 
        var numStart = lexer.TryLexNumber();
        if (numStart is null)
        {
            // Not a number
            lexer.Position = saved;
            return null;
        }
 
        // The next two characters must be ".."
        if (!lexer.TryEatCurrentCharacter(out char c) || c != '.' ||
            !lexer.TryEatCurrentCharacter(out c) || c != '.')
        {
            lexer.Position = saved;
            return null;
        }
 
        // Now another number
        var numEnd = lexer.TryLexNumber();
        if (numEnd is null || lexer.IsDone || lexer.Lex() != TokenKind.CloseCurly)
        {
            // Not a number or no '}'
            lexer.Position = saved;
            return null;
        }
 
        return (numStart, numEnd);
    }
 
    private struct SectionNameLexer
    {
        private readonly string _sectionName;
 
        internal int Position { get; set; }
 
        internal SectionNameLexer(string sectionName)
        {
            _sectionName = sectionName;
            Position = 0;
        }
 
        internal bool IsDone => Position >= _sectionName.Length;
 
        internal TokenKind Lex()
        {
            int lexemeStart = Position;
            switch (_sectionName[Position])
            {
                case '*':
                    {
                        int nextPos = Position + 1;
                        if (nextPos < _sectionName.Length &&
                            _sectionName[nextPos] == '*')
                        {
                            Position += 2;
                            return TokenKind.StarStar;
                        }
                        else
                        {
                            Position++;
                            return TokenKind.Star;
                        }
                    }
 
                case '?':
                    Position++;
                    return TokenKind.Question;
 
                case '{':
                    Position++;
                    return TokenKind.OpenCurly;
 
                case ',':
                    Position++;
                    return TokenKind.Comma;
 
                case '}':
                    Position++;
                    return TokenKind.CloseCurly;
 
                case '[':
                    Position++;
                    return TokenKind.OpenBracket;
 
                case '\\':
                    {
                        // Backslash escapes the next character
                        Position++;
                        if (IsDone)
                        {
                            return TokenKind.BadToken;
                        }
 
                        return TokenKind.SimpleCharacter;
                    }
 
                default:
                    // Don't increment position, since caller needs to fetch the character
                    return TokenKind.SimpleCharacter;
            }
        }
 
        internal char CurrentCharacter => _sectionName[Position];
 
        /// <summary>
        /// Call after getting <see cref="TokenKind.SimpleCharacter" /> from <see cref="Lex()" />
        /// </summary>
        internal char EatCurrentCharacter() => _sectionName[Position++];
 
        /// <summary>
        /// Returns false if there are no more characters in the lex stream.
        /// Otherwise, produces the next character in the stream and returns true.
        /// </summary>
        internal bool TryEatCurrentCharacter(out char nextChar)
        {
            if (IsDone)
            {
                nextChar = default;
                return false;
            }
            else
            {
                nextChar = EatCurrentCharacter();
                return true;
            }
        }
 
        internal char this[int position] => _sectionName[position];
 
        /// <summary>
        /// Returns the string representation of a decimal integer, or null if
        /// the current lexeme is not an integer.
        /// </summary>
        internal string? TryLexNumber()
        {
            bool start = true;
            var sb = new StringBuilder();
 
            while (!IsDone)
            {
                char currentChar = CurrentCharacter;
                if (start && currentChar == '-')
                {
                    Position++;
                    sb.Append('-');
                }
                else if (char.IsDigit(currentChar))
                {
                    Position++;
                    sb.Append(currentChar);
                }
                else
                {
                    break;
                }
                start = false;
            }
 
            var str = sb.ToString();
            return str.Length == 0 || str == "-"
                ? null
                : str;
        }
    }
 
    private enum TokenKind
    {
        BadToken,
        SimpleCharacter,
        Star,
        StarStar,
        Question,
        OpenCurly,
        CloseCurly,
        Comma,
        DoubleDot,
        OpenBracket,
    }
}