File: System\Text\RegularExpressions\Symbolic\SymbolicRegexMatcher.Sample.cs
Web Access
Project: src\src\libraries\System.Text.RegularExpressions\src\System.Text.RegularExpressions.csproj (System.Text.RegularExpressions)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
#if DEBUG
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
 
namespace System.Text.RegularExpressions.Symbolic
{
    internal sealed partial class SymbolicRegexMatcher<TSet>
    {
        /// <summary>
        /// The probability of stopping match sampling when a candidate is found. This influences the expected length
        /// of the sampled matches. For a pattern .* that accepts anything the expected length is:
        /// 1/p - 1, where the -1 comes from the fact that the first coin is tossed with the empty input.
        /// </summary>
        private const double SampleMatchesStoppingProbability = 0.2;
 
        /// <summary>
        /// Maximum length to try to sample input to.
        /// </summary>
        /// <remarks>
        /// This is required for cases where the state space has a loop that is not detected as a deadend,
        /// which would otherwise cause the sampling to hang.
        /// </remarks>
        private const int SampleMatchesMaxInputLength = 100;
 
        /// <inheritdoc cref="Regex.SampleMatches(int, int)"/>
        [ExcludeFromCodeCoverage(Justification = "Currently only used for testing")]
        public override IEnumerable<string> SampleMatches(int k, int randomseed)
        {
            var results = new List<string>();
 
            lock (this)
            {
                // Zero is treated as no seed, instead using a system provided one
                Random random = randomseed != 0 ? new Random(randomseed) : new Random();
                CharSetSolver charSetSolver = _builder._charSetSolver;
 
                // Create helper BDDs for handling anchors and preferentially generating ASCII inputs
                BDD asciiWordCharacters = charSetSolver.Or(
                [
                    charSetSolver.CreateBDDFromRange('A', 'Z'),
                    charSetSolver.CreateBDDFromRange('a', 'z'),
                    charSetSolver.CreateBDDFromChar('_'),
                    charSetSolver.CreateBDDFromRange('0', '9')
                ]);
                // Visible ASCII range for input character generation
                BDD ascii = charSetSolver.CreateBDDFromRange('\x20', '\x7E');
                BDD asciiNonWordCharacters = charSetSolver.And(ascii, charSetSolver.Not(asciiWordCharacters));
 
                // Set up two sets of minterms, one with the additional special minterm for the last end-of-line
                Debug.Assert(_minterms is not null);
                int[] mintermIdsWithoutZ = new int[_minterms.Length];
                int[] mintermIdsWithZ = new int[_minterms.Length + 1];
                for (int i = 0; i < _minterms.Length; ++i)
                {
                    mintermIdsWithoutZ[i] = i;
                    mintermIdsWithZ[i] = i;
                }
                mintermIdsWithZ[_minterms.Length] = _minterms.Length;
 
                for (int i = 0; i < k; i++)
                {
                    // Holds the generated input so far
                    StringBuilder inputSoFar = new();
                    StringBuilder? latestCandidate = null;
 
                    // Current set of states reached initially contains just the root
                    NfaMatchingState states = new();
                    // Here one could also consider previous characters for example for \b, \B, and ^ anchors
                    // and initialize inputSoFar accordingly
                    states.InitializeFrom(this, _initialStates[GetCharKind<FullInputReader>([], -1)]);
                    CurrentState statesWrapper = new(states);
 
                    // Used for end suffixes
                    List<string> possibleEndings = new();
 
                    while (true)
                    {
                        Debug.Assert(states.NfaStateSet.Count > 0);
 
                        // Gather the possible endings for satisfying nullability
                        possibleEndings.Clear();
                        StateFlags flags = SymbolicRegexMatcher<TSet>.NfaStateHandler.GetStateFlags(this, in statesWrapper);
                        if (flags.CanBeNullable())
                        {
                            // Unconditionally final state or end of the input due to \Z anchor for example
                            if (flags.IsNullable() || SymbolicRegexMatcher<TSet>.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.BeginningEnd))
                            {
                                possibleEndings.Add("");
                            }
 
                            // End of line due to end-of-line anchor
                            if (SymbolicRegexMatcher<TSet>.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.Newline))
                            {
                                possibleEndings.Add("\n");
                            }
 
                            // Related to wordborder due to \b or \B
                            if (SymbolicRegexMatcher<TSet>.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.WordLetter))
                            {
                                possibleEndings.Add(ChooseChar(random, asciiWordCharacters, ascii, charSetSolver).ToString());
                            }
 
                            // Related to wordborder due to \b or \B
                            if (SymbolicRegexMatcher<TSet>.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.General))
                            {
                                possibleEndings.Add(ChooseChar(random, asciiNonWordCharacters, ascii, charSetSolver).ToString());
                            }
                        }
 
                        // If we have a possible ending, then store a candidate input
                        if (possibleEndings.Count > 0)
                        {
                            latestCandidate ??= new();
                            latestCandidate.Clear();
                            latestCandidate.Append(inputSoFar);
                            //Choose some suffix that allows some anchor (if any) to be nullable
                            latestCandidate.Append(Choose(random, possibleEndings));
 
                            // Choose to stop here based on a coin-toss
                            if (FlipBiasedCoin(random, SampleMatchesStoppingProbability))
                            {
                                results.Add(latestCandidate.ToString());
                                break;
                            }
                        }
 
                        // Shuffle the minterms, including the last end-of-line marker if appropriate
                        int[] mintermIds = SymbolicRegexMatcher<TSet>.NfaStateHandler.StartsWithLineAnchor(this, in statesWrapper) ?
                            Shuffle(random, mintermIdsWithZ) :
                            Shuffle(random, mintermIdsWithoutZ);
                        foreach (int mintermId in mintermIds)
                        {
                            bool success = SymbolicRegexMatcher<TSet>.NfaStateHandler.TryTakeTransition(this, ref statesWrapper, mintermId);
                            Debug.Assert(success);
                            if (states.NfaStateSet.Count > 0)
                            {
                                TSet minterm = GetMintermFromId(mintermId);
                                // Append a random member of the minterm
                                inputSoFar.Append(ChooseChar(random, ToBDD(minterm, Solver, charSetSolver), ascii, charSetSolver));
                                break;
                            }
                            else
                            {
                                // The transition was a dead end, undo and continue to try another minterm
                                NfaStateHandler.UndoTransition(ref statesWrapper);
                            }
                        }
 
                        // In the case that there are no next states or input has become too large: stop here
                        if (states.NfaStateSet.Count == 0 || inputSoFar.Length > SampleMatchesMaxInputLength)
                        {
                            // Ending up here without an ending is unlikely but possible for example for infeasible patterns
                            // such as @"no\bway" or due to poor choice of c -- no anchor is enabled -- so this is a deadend.
                            if (latestCandidate != null)
                            {
                                results.Add(latestCandidate.ToString());
                            }
                            break;
                        }
                    }
                }
 
                return results;
            }
 
            static BDD ToBDD(TSet set, ISolver<TSet> solver, CharSetSolver charSetSolver) => solver.ConvertToBDD(set, charSetSolver);
 
            static T Choose<T>(Random random, IList<T> elems) => elems[random.Next(elems.Count)];
 
            static char ChooseChar(Random random, BDD bdd, BDD ascii, CharSetSolver charSetSolver)
            {
                Debug.Assert(!bdd.IsEmpty);
                // Select characters from the visible ASCII range whenever possible
                BDD bdd1 = charSetSolver.And(bdd, ascii);
                (uint, uint) range = Choose(random, BDDRangeConverter.ToRanges(bdd1.IsEmpty ? bdd : bdd1));
                return (char)random.Next((int)range.Item1, (int)range.Item2 + 1);
            }
 
            static bool FlipBiasedCoin(Random random, double probTrue) => random.NextDouble() < probTrue;
 
            static T[] Shuffle<T>(Random random, T[] array)
            {
                random.Shuffle(array);
                return array;
            }
        }
    }
}
#endif