File: PatternMatching\RegexQuery.cs
Web Access
Project: src\src\Workspaces\Core\Portable\Microsoft.CodeAnalysis.Workspaces.csproj (Microsoft.CodeAnalysis.Workspaces)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System.Collections.Immutable;
using System.Diagnostics;
using System.Linq;
 
namespace Microsoft.CodeAnalysis.PatternMatching;
 
/// <summary>
/// A boolean query tree compiled from a regex AST, used to pre-filter documents before running
/// the full (expensive) regex match. Each node evaluates against a document's indexed
/// bigrams/trigrams to quickly reject documents that cannot possibly contain a match.
/// <para/>
/// The tree mirrors the boolean structure of the regex: concatenation becomes <see cref="All"/>
/// (AND), alternation becomes <see cref="Any"/> (OR), and opaque constructs (wildcards, character
/// classes) become <see cref="None"/> (passthrough). A pattern like <c>(Read|Write)Line</c> compiles
/// to <c>All(Any(Literal("Read"), Literal("Write")), Literal("Line"))</c>, requiring "Line"'s
/// bigrams to be present and at least one of "Read" or "Write"'s bigrams.
/// <para/>
/// When the entire tree reduces to <see cref="None"/> (e.g. for <c>.*</c> which has no extractable
/// literals), <see cref="HasLiterals"/> is <see langword="false"/> and callers skip pre-filtering —
/// every document becomes a candidate and must be checked with the full regex.
/// </summary>
internal abstract class RegexQuery
{
    private RegexQuery() { }
 
    /// <summary>
    /// Whether this tree contains any <see cref="Literal"/> nodes whose bigrams/trigrams
    /// can be checked against the document index. When <see langword="false"/>, the query
    /// cannot reject any documents and pre-filtering should be skipped entirely.
    /// </summary>
    public abstract bool HasLiterals { get; }
 
    /// <summary>
    /// Conjunction: all children must be satisfied. Produced from regex concatenation
    /// (<c>AB</c> = A followed by B).
    /// </summary>
    public sealed class All(ImmutableArray<RegexQuery> children) : RegexQuery
    {
        public readonly ImmutableArray<RegexQuery> Children = children;
        public override bool HasLiterals => Children.Any(static c => c.HasLiterals);
    }
 
    /// <summary>
    /// Disjunction: at least one child must be satisfied. Produced from regex alternation
    /// (<c>A|B</c>).
    /// </summary>
    public sealed class Any(ImmutableArray<RegexQuery> children) : RegexQuery
    {
        public readonly ImmutableArray<RegexQuery> Children = children;
        public override bool HasLiterals => Children.Any(static c => c.HasLiterals);
    }
 
    /// <summary>
    /// A literal string that must appear somewhere in the document's symbol names.
    /// At pre-filter time, the literal's lowercased bigrams and trigrams are checked
    /// against the document's indexed bitset and Bloom filter. The text is always
    /// lowercase and at least two characters long, guaranteeing that every literal
    /// contributes at least one bigram to the pre-filter check.
    /// </summary>
    public sealed class Literal : RegexQuery
    {
        public readonly string Text;
        public override bool HasLiterals => true;
 
        public Literal(string text)
        {
            Debug.Assert(text.Length >= 2);
            Debug.Assert(text == text.ToLowerInvariant());
            Text = text;
        }
    }
 
    /// <summary>
    /// An opaque node that cannot contribute to pre-filtering (e.g. <c>.</c>, <c>\d</c>,
    /// character classes). Always evaluates to <see langword="true"/> in the pre-filter,
    /// meaning "I can't tell — don't reject on my account."
    /// </summary>
    public sealed class None : RegexQuery
    {
        public static readonly None Instance = new();
        private None() { }
        public override bool HasLiterals => false;
    }
 
    /// <summary>
    /// Simplifies the query tree by flattening nested <see cref="All"/>/<see cref="Any"/> nodes,
    /// removing <see cref="None"/> where safe, and collapsing single-child wrappers.
    /// <para/>
    /// The key asymmetry: <see cref="None"/> means "anything could match here." In an
    /// <see cref="All"/> (AND), that's vacuously true and can be dropped — the remaining children
    /// still constrain the match. In an <see cref="Any"/> (OR), it poisons the whole disjunction —
    /// if one branch can match anything, the entire <see cref="Any"/> can match anything, so it
    /// collapses to <see cref="None"/>.
    /// <para/>
    /// <b>Post-condition:</b> The returned tree contains <see cref="None"/> only as the top-level
    /// result (meaning the entire regex is opaque). If the result is an <see cref="All"/>,
    /// <see cref="Any"/>, or <see cref="Literal"/>, no <see cref="None"/> nodes exist anywhere
    /// in the subtree. This allows consumers to assume only those three types appear when
    /// traversing a non-None result.
    /// </summary>
    public static RegexQuery Optimize(RegexQuery query)
    {
        return query switch
        {
            All all => OptimizeAll(all),
            Any any => OptimizeAny(any),
            _ => query,
        };
 
        static RegexQuery OptimizeAll(All all)
        {
            using var _ = Microsoft.CodeAnalysis.PooledObjects.ArrayBuilder<RegexQuery>.GetInstance(out var children);
 
            foreach (var child in all.Children)
            {
                var optimized = Optimize(child);
 
                // None in an All is vacuously true — drop it. For example, in the regex `Goo.*Bar`,
                // `.*` compiles to None (matches anything). The All becomes All(Literal("Goo"), None,
                // Literal("Bar")). Since we AND all children, None (= "anything could be here") doesn't
                // restrict the match, so dropping it yields All(Literal("Goo"), Literal("Bar")), which
                // correctly requires both "Goo" and "Bar" bigrams to be present.
                if (optimized is None)
                    continue;
 
                // Flatten nested All: All(All(a, b), c) -> All(a, b, c).
                if (optimized is All inner)
                    children.AddRange(inner.Children);
                else
                    children.Add(optimized);
            }
 
            return children.Count switch
            {
                0 => None.Instance,
                1 => children[0],
                _ => new All(children.ToImmutable()),
            };
        }
 
        static RegexQuery OptimizeAny(Any any)
        {
            using var _ = Microsoft.CodeAnalysis.PooledObjects.ArrayBuilder<RegexQuery>.GetInstance(out var children);
 
            foreach (var child in any.Children)
            {
                var optimized = Optimize(child);
 
                // None in an Any means "anything could match" — the whole Any is opaque.
                if (optimized is None)
                    return None.Instance;
 
                // Flatten nested Any: Any(Any(a, b), c) -> Any(a, b, c).
                if (optimized is Any inner)
                    children.AddRange(inner.Children);
                else
                    children.Add(optimized);
            }
 
            return children.Count switch
            {
                0 => None.Instance,
                1 => children[0],
                _ => new Any(children.ToImmutable()),
            };
        }
    }
}