DocsIndexService.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Diagnostics;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
 
namespace Aspire.Cli.Mcp.Docs;
 
/// <summary>
/// Service for indexing and searching aspire.dev documentation using lexical search.
/// </summary>
internal interface IDocsIndexService
{
    /// <summary>
    /// Gets a value indicating whether the documentation has been indexed.
    /// </summary>
    bool IsIndexed { get; }
 
    /// <summary>
    /// Ensures documentation is loaded and indexed.
    /// </summary>
    /// <param name="cancellationToken">The cancellation token.</param>
    ValueTask EnsureIndexedAsync(CancellationToken cancellationToken = default);
 
    /// <summary>
    /// Lists all available documents.
    /// </summary>
    /// <param name="cancellationToken">The cancellation token.</param>
    /// <returns>The list of available documents.</returns>
    ValueTask<IReadOnlyList<DocsListItem>> ListDocumentsAsync(CancellationToken cancellationToken = default);
 
    /// <summary>
    /// Searches documents using weighted lexical matching.
    /// </summary>
    /// <param name="query">The search query.</param>
    /// <param name="topK">Maximum number of results to return.</param>
    /// <param name="cancellationToken">The cancellation token.</param>
    /// <returns>Ranked search results with matched sections.</returns>
    ValueTask<IReadOnlyList<DocsSearchResult>> SearchAsync(string query, int topK = 10, CancellationToken cancellationToken = default);
 
    /// <summary>
    /// Gets a document by slug, optionally returning only a specific section.
    /// </summary>
    /// <param name="slug">The document slug.</param>
    /// <param name="section">Optional section heading to return only that section.</param>
    /// <param name="cancellationToken">The cancellation token.</param>
    /// <returns>The document content, or null if not found.</returns>
    ValueTask<DocsContent?> GetDocumentAsync(string slug, string? section = null, CancellationToken cancellationToken = default);
}
 
/// <summary>
/// Represents a document in the list.
/// </summary>
internal sealed class DocsListItem
{
    public required string Title { get; init; }
    public required string Slug { get; init; }
    public string? Summary { get; init; }
}
 
/// <summary>
/// Represents a search result with matched section.
/// </summary>
internal sealed class DocsSearchResult
{
    public required string Title { get; init; }
    public required string Slug { get; init; }
    public string? Summary { get; init; }
    public string? MatchedSection { get; init; }
    public required float Score { get; init; }
}
 
/// <summary>
/// Represents document content with available sections.
/// </summary>
internal sealed class DocsContent
{
    public required string Title { get; init; }
    public required string Slug { get; init; }
    public string? Summary { get; init; }
    public required string Content { get; init; }
    public required IReadOnlyList<string> Sections { get; init; }
}
 
/// <summary>
/// Lexical search implementation using weighted field matching.
/// </summary>
/// <remarks>
/// For technical documentation, lexical search outperforms embeddings because queries are:
/// - Term-driven ("connection string", "workload identity")
/// - Section-oriented ("configuration", "examples")
/// - Name-exact ("Redis resource", "AddServiceDefaults")
/// </remarks>
internal sealed partial class DocsIndexService(IDocsFetcher docsFetcher, IDocsCache docsCache, ILogger<DocsIndexService> logger) : IDocsIndexService
{
    // Field weights for relevance scoring
    private const float TitleWeight = 10.0f;      // H1 (page title)
    private const float SummaryWeight = 8.0f;     // Blockquote summary
    private const float HeadingWeight = 6.0f;     // H2/H3 headings
    private const float CodeWeight = 5.0f;        // Code identifiers
    private const float BodyWeight = 1.0f;        // Body text
 
    // Scoring constants
    private const float BaseMatchScore = 1.0f;
    private const float WordBoundaryBonus = 0.5f;
    private const float MultipleOccurrenceBonus = 0.25f;
    private const int MaxOccurrenceBonus = 3;
    private const float CodeIdentifierBonus = 0.5f;
    private const int MinTokenLength = 2;
 
    // Slug matching bonuses - helps dedicated docs rank higher than incidental mentions
    private const float ExactSlugMatchBonus = 50.0f;        // Query exactly matches slug (e.g., "service-discovery" matches service-discovery)
    private const float FullPhraseInSlugBonus = 30.0f;      // All query words in slug (e.g., "service discovery" -> service-discovery)
    private const float PartialSlugMatchBonus = 10.0f;      // Some query words in slug
 
    // Changelog/What's New penalty - these pages mention many terms and shouldn't outrank dedicated docs
    private const float WhatsNewPenaltyMultiplier = 0.3f;   // Apply 0.3x to whats-new pages
 
    private readonly IDocsFetcher _docsFetcher = docsFetcher;
    private readonly IDocsCache _docsCache = docsCache;
    private readonly ILogger<DocsIndexService> _logger = logger;
 
    // Volatile ensures the double-checked locking pattern works correctly by preventing
    // instruction reordering that could expose a partially-constructed list to other threads.
    private volatile List<IndexedDocument>? _indexedDocuments;
    private readonly SemaphoreSlim _indexLock = new(1, 1);
 
    /// <inheritdoc />
    public bool IsIndexed => _indexedDocuments is not null;
 
    public async ValueTask EnsureIndexedAsync(CancellationToken cancellationToken = default)
    {
        if (_indexedDocuments is not null)
        {
            return;
        }
 
        await _indexLock.WaitAsync(cancellationToken).ConfigureAwait(false);
 
        try
        {
            if (_indexedDocuments is not null)
            {
                return;
            }
 
            var startTimestamp = Stopwatch.GetTimestamp();
 
            _logger.LogDebug("Loading aspire.dev documentation");
 
            // Try to load from disk cache first
            var cachedDocuments = await _docsCache.GetIndexAsync(cancellationToken).ConfigureAwait(false);
            if (cachedDocuments is not null)
            {
                _indexedDocuments = [.. cachedDocuments.Select(static d => new IndexedDocument(d))];
 
                var cacheElapsedTime = Stopwatch.GetElapsedTime(startTimestamp);
                _logger.LogInformation("Loaded {Count} documents from cache in {ElapsedTime:ss\\.fff} seconds.", _indexedDocuments.Count, cacheElapsedTime);
                return;
            }
 
            // Fetch and parse from network
            var content = await _docsFetcher.FetchDocsAsync(cancellationToken).ConfigureAwait(false);
            if (content is null)
            {
                _logger.LogWarning("Failed to fetch documentation");
 
                return;
            }
 
            var documents = await LlmsTxtParser.ParseAsync(content, cancellationToken).ConfigureAwait(false);
 
            // Pre-compute lowercase versions for faster searching
            _indexedDocuments = [.. documents.Select(static d => new IndexedDocument(d))];
 
            // Cache the parsed documents for next time
            await _docsCache.SetIndexAsync([.. documents], cancellationToken).ConfigureAwait(false);
 
            var elapsedTime = Stopwatch.GetElapsedTime(startTimestamp);
 
            _logger.LogInformation("Indexed {Count} documents from aspire.dev in {ElapsedTime:ss\\.fff} seconds.", _indexedDocuments.Count, elapsedTime);
        }
        finally
        {
            _indexLock.Release();
        }
    }
 
    public async ValueTask<IReadOnlyList<DocsListItem>> ListDocumentsAsync(CancellationToken cancellationToken = default)
    {
        await EnsureIndexedAsync(cancellationToken).ConfigureAwait(false);
 
        if (_indexedDocuments is null or { Count: 0 })
        {
            return [];
        }
 
        return
        [
            .. _indexedDocuments.Select(static d => new DocsListItem
            {
                Title = d.Source.Title,
                Slug = d.Source.Slug,
                Summary = d.Source.Summary
            })
        ];
    }
 
    public async ValueTask<IReadOnlyList<DocsSearchResult>> SearchAsync(string query, int topK = 10, CancellationToken cancellationToken = default)
    {
        await EnsureIndexedAsync(cancellationToken).ConfigureAwait(false);
 
        if (_indexedDocuments is null or { Count: 0 } || string.IsNullOrWhiteSpace(query))
        {
            return [];
        }
 
        var queryTokens = Tokenize(query);
        if (queryTokens.Length is 0)
        {
            return [];
        }
 
        // Pre-compute queryAsSlug once to avoid repeated allocation in hot path
        var queryAsSlug = string.Join("-", queryTokens);
 
        var results = new List<DocsSearchResult>();
 
        foreach (var doc in _indexedDocuments)
        {
            var (score, matchedSection) = ScoreDocument(doc, queryTokens, queryAsSlug);
 
            if (score > 0)
            {
                results.Add(new DocsSearchResult
                {
                    Title = doc.Source.Title,
                    Slug = doc.Source.Slug,
                    Summary = doc.Source.Summary,
                    MatchedSection = matchedSection,
                    Score = score
                });
            }
        }
 
        return
        [
            .. results
                .OrderByDescending(static r => r.Score)
                .Take(topK)
        ];
    }
 
    public async ValueTask<DocsContent?> GetDocumentAsync(string slug, string? section = null, CancellationToken cancellationToken = default)
    {
        await EnsureIndexedAsync(cancellationToken).ConfigureAwait(false);
 
        if (_indexedDocuments is null or { Count: 0 })
        {
            return null;
        }
 
        var doc = _indexedDocuments.FirstOrDefault(d =>
            d.Source.Slug.Equals(slug, StringComparison.OrdinalIgnoreCase));
 
        if (doc is null)
        {
            return null;
        }
 
        var content = doc.Source.Content;
 
        // If a section is specified, return only that section
        if (!string.IsNullOrEmpty(section))
        {
            var matchedSection = doc.Source.Sections.FirstOrDefault(s =>
                s.Heading.Equals(section, StringComparison.OrdinalIgnoreCase) ||
                s.Heading.Contains(section, StringComparison.OrdinalIgnoreCase));
 
            if (matchedSection is not null)
            {
                content = matchedSection.Content;
            }
        }
 
        return new DocsContent
        {
            Title = doc.Source.Title,
            Slug = doc.Source.Slug,
            Summary = doc.Source.Summary,
            Content = content,
            Sections = [.. doc.Source.Sections.Select(static s => s.Heading)]
        };
    }
 
    private static (float Score, string? MatchedSection) ScoreDocument(IndexedDocument doc, string[] queryTokens, string queryAsSlug)
    {
        var score = 0.0f;
        string? matchedSection = null;
        var bestSectionScore = 0.0f;
 
        // Score slug matching - this is key for finding dedicated docs
        // e.g., query "service discovery" should match slug "service-discovery" with high score
        score += ScoreSlugMatch(doc.SlugLower, doc.SlugSegments, queryTokens, queryAsSlug);
 
        // Score H1 title
        score += ScoreField(doc.TitleLower, queryTokens) * TitleWeight;
 
        // Score blockquote summary
        if (doc.SummaryLower is not null)
        {
            score += ScoreField(doc.SummaryLower, queryTokens) * SummaryWeight;
        }
 
        // Score each section (H2/H3 headings + content)
        for (var i = 0; i < doc.Sections.Count; i++)
        {
            var section = doc.Sections[i];
            var headingScore = ScoreField(section.HeadingLower, queryTokens) * HeadingWeight;
            var codeScore = ScoreCodeIdentifiers(section.CodeSpans, section.Identifiers, queryTokens) * CodeWeight;
            var bodyScore = ScoreField(section.ContentLower, queryTokens) * BodyWeight;
 
            var sectionScore = headingScore + codeScore + bodyScore;
 
            if (sectionScore > bestSectionScore)
            {
                bestSectionScore = sectionScore;
                matchedSection = doc.Source.Sections[i].Heading;
            }
        }
 
        score += bestSectionScore;
 
        // Apply penalty for "What's New" / changelog pages
        // These pages mention many features and shouldn't outrank dedicated documentation
        // BUT: Skip penalty when user is explicitly searching for changelog content
        // Note: "what's" tokenizes to "what" due to apostrophe splitting, so we check for both "what" and "new" together
        var hasChangelogToken = queryTokens.Any(static t => t is "changelog" or "whats-new");
        var hasWhatsNewTokens = queryTokens.Contains("what") && queryTokens.Contains("new");
        var queryIsAboutChangelog = hasChangelogToken || hasWhatsNewTokens;
        if (!queryIsAboutChangelog && (doc.SlugLower.Contains("whats-new") || doc.SlugLower.Contains("changelog")))
        {
            score *= WhatsNewPenaltyMultiplier;
        }
 
        return (score, matchedSection);
    }
 
    /// <summary>
    /// Scores how well the query matches the document slug.
    /// Helps dedicated docs rank higher than docs with incidental mentions.
    /// </summary>
    private static float ScoreSlugMatch(string slugLower, string[] slugSegments, string[] queryTokens, string queryAsSlug)
    {
        if (slugLower.Length is 0 || queryTokens.Length is 0)
        {
            return 0;
        }
 
        // queryAsSlug is pre-computed before the scoring loop to avoid repeated allocation
        // e.g., ["service", "discovery"] -> "service-discovery"
 
        // Exact match: query "service-discovery" matches slug "service-discovery"
        if (slugLower == queryAsSlug)
        {
            return ExactSlugMatchBonus;
        }
 
        // Check if slug contains the full query phrase
        // This handles both multi-word queries and hyphenated single-token queries
        // e.g., slug "azure-service-discovery" contains "service-discovery"
        // e.g., single token "service-bus" matches slug "azure-service-bus"
        var isMultiWordQuery = queryTokens.Length > 1;
        var hasHyphenatedToken = queryTokens.Any(static t => t.Contains('-'));
 
        if ((isMultiWordQuery || hasHyphenatedToken) && slugLower.Contains(queryAsSlug))
        {
            return FullPhraseInSlugBonus;
        }
 
        // Count how many query tokens appear as distinct slug segments
        // This prevents "service discovery" from boosting "azure-service-bus"
        // because "discovery" must be a segment, not just "service"
        // Note: slugSegments is pre-computed to avoid allocation in hot path
        var matchingSegments = 0;
 
        foreach (var token in queryTokens)
        {
            // For hyphenated tokens, check if all parts match consecutive segments in order
            if (token.Contains('-'))
            {
                var tokenParts = token.Split('-');
 
                // Look for a contiguous sequence of slug segments that matches all token parts
                var foundContiguousMatch = false;
                var maxStartIndex = slugSegments.Length - tokenParts.Length;
 
                for (var startIndex = 0; startIndex <= maxStartIndex; startIndex++)
                {
                    var allPartsMatch = true;
 
                    for (var partIndex = 0; partIndex < tokenParts.Length; partIndex++)
                    {
                        if (slugSegments[startIndex + partIndex] != tokenParts[partIndex])
                        {
                            allPartsMatch = false;
                            break;
                        }
                    }
 
                    if (allPartsMatch)
                    {
                        foundContiguousMatch = true;
                        break;
                    }
                }
 
                if (foundContiguousMatch)
                {
                    matchingSegments++;
                }
            }
            else
            {
                foreach (var segment in slugSegments)
                {
                    if (segment == token)
                    {
                        matchingSegments++;
                        break;
                    }
                }
            }
        }
 
        // All tokens match as individual segments (but not necessarily as a contiguous phrase)
        // e.g., query "azure cosmos" matches slug "azure-cosmos-db" segment-by-segment
        // This gets PartialSlugMatchBonus because the full phrase isn't in the slug
        if (matchingSegments == queryTokens.Length)
        {
            return PartialSlugMatchBonus;
        }
 
        // Some tokens match slug segments - give proportional bonus
        if (matchingSegments > 0)
        {
            // Give proportional bonus based on how many tokens matched
            return PartialSlugMatchBonus * matchingSegments / (float)queryTokens.Length;
        }
 
        return 0;
    }
 
    /// <summary>
    /// Tokenizes a query string, preserving symbols like --flag, AddRedis, aspire.json.
    /// </summary>
    private static string[] Tokenize(string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return [];
        }
 
        // Split on whitespace/punctuation, then lowercase and dedupe
        return
        [
            .. TokenSplitRegex().Split(text)
                .Where(static t => t.Length >= MinTokenLength)
                .Select(static t => t.ToLowerInvariant())
                .Distinct()
        ];
    }
 
    /// <summary>
    /// Scores how well a pre-lowercased field matches the query tokens.
    /// </summary>
    private static float ScoreField(string lowerText, string[] queryTokens)
    {
        if (lowerText.Length is 0)
        {
            return 0;
        }
 
        var score = 0.0f;
        var textSpan = lowerText.AsSpan();
 
        foreach (var token in queryTokens)
        {
            var index = textSpan.IndexOf(token, StringComparison.Ordinal);
            if (index >= 0)
            {
                score += BaseMatchScore;
 
                // Bonus for exact word boundary match
                if (IsWordBoundaryMatch(textSpan, token, index))
                {
                    score += WordBoundaryBonus;
                }
 
                // Bonus for multiple occurrences (capped)
                var count = CountOccurrences(textSpan, token);
                if (count > 1)
                {
                    score += Math.Min(count - 1, MaxOccurrenceBonus) * MultipleOccurrenceBonus;
                }
            }
        }
 
        return score;
    }
 
    /// <summary>
    /// Scores pre-extracted code identifiers against query tokens.
    /// </summary>
    private static float ScoreCodeIdentifiers(IReadOnlyList<string> codeSpans, IReadOnlyList<string> identifiers, string[] queryTokens)
    {
        var score = 0.0f;
 
        // Score backticked code spans
        foreach (var code in codeSpans)
        {
            foreach (var token in queryTokens)
            {
                if (code.Contains(token, StringComparison.Ordinal))
                {
                    score += BaseMatchScore;
                }
            }
        }
 
        // Score PascalCase identifiers
        foreach (var identifier in identifiers)
        {
            foreach (var token in queryTokens)
            {
                if (identifier.Contains(token, StringComparison.Ordinal))
                {
                    score += CodeIdentifierBonus;
                }
            }
        }
 
        return score;
    }
 
    private static bool IsWordBoundaryMatch(ReadOnlySpan<char> text, string token, int index)
    {
        var startOk = index == 0 || !char.IsLetterOrDigit(text[index - 1]);
        var endIndex = index + token.Length;
        var endOk = endIndex >= text.Length || !char.IsLetterOrDigit(text[endIndex]);
 
        return startOk && endOk;
    }
 
    private static int CountOccurrences(ReadOnlySpan<char> text, string token)
    {
        var count = 0;
        var remaining = text;
 
        while (true)
        {
            var index = remaining.IndexOf(token, StringComparison.Ordinal);
            if (index < 0)
            {
                break;
            }
 
            count++;
            remaining = remaining[(index + token.Length)..];
        }
 
        return count;
    }
 
    // Split on whitespace and punctuation, keeping dotted/hyphenated tokens together
    [GeneratedRegex(@"[\s,;:!?\(\)\[\]{}""']+")]
    private static partial Regex TokenSplitRegex();
 
    // Match backticked code spans
    [GeneratedRegex(@"`([^`]+)`")]
    private static partial Regex CodeBlockRegex();
 
    // Match PascalCase/camelCase identifiers
    [GeneratedRegex(@"\b[A-Z][a-zA-Z0-9]+\b")]
    private static partial Regex IdentifierRegex();
 
    /// <summary>
    /// Pre-indexed document with lowercase text for faster searching.
    /// </summary>
    private sealed class IndexedDocument
    {
        private readonly string _slugLower;
 
        public IndexedDocument(LlmsDocument source)
        {
            Source = source;
            TitleLower = source.Title.ToLowerInvariant();
            _slugLower = source.Slug.ToLowerInvariant();
            SlugSegments = _slugLower.Split('-');
            SummaryLower = source.Summary?.ToLowerInvariant();
            Sections = [.. source.Sections.Select(static s => new IndexedSection(s))];
        }
 
        public LlmsDocument Source { get; }
 
        public string TitleLower { get; }
 
        public string SlugLower => _slugLower;
 
        /// <summary>
        /// Pre-computed slug segments to avoid allocation in hot path during scoring.
        /// </summary>
        public string[] SlugSegments { get; }
 
        public string? SummaryLower { get; }
 
        public IReadOnlyList<IndexedSection> Sections { get; }
    }
 
    /// <summary>
    /// Pre-indexed section with extracted code identifiers.
    /// </summary>
    private sealed class IndexedSection(LlmsSection source)
    {
        public string HeadingLower { get; } = source.Heading.ToLowerInvariant();
 
        public string ContentLower { get; } = source.Content.ToLowerInvariant();
 
        public IReadOnlyList<string> CodeSpans { get; } =
        [
            .. CodeBlockRegex()
                .Matches(source.Content)
                .Select(static m => m.Groups[1].Value.ToLowerInvariant())
        ];
 
        public IReadOnlyList<string> Identifiers { get; } =
        [
            .. IdentifierRegex()
                .Matches(source.Content)
                .Select(static m => m.Value.ToLowerInvariant())
        ];
    }
}
File: Mcp\Docs\DocsIndexService.cs	Web Access
Project: src\src\Aspire.Cli\Aspire.Cli.Tool.csproj (aspire)