File: System\Collections\Frozen\String\KeyAnalyzer.cs
Web Access
Project: src\src\libraries\System.Collections.Immutable\src\System.Collections.Immutable.csproj (System.Collections.Immutable)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Buffers;
using System.Collections.Generic;
using System.Diagnostics;
#if !NET8_0_OR_GREATER
using System.Runtime.CompilerServices;
#endif
 
namespace System.Collections.Frozen
{
    internal static class KeyAnalyzer
    {
        /// <summary>
        /// Look for well-known patterns we can optimize for in a set of dictionary or set keys.
        /// </summary>
        /// <remarks>
        /// The idea here is to find the shortest substring slice across all the input strings which yields a set of
        /// strings which are maximally unique. The optimal slice is then applied to incoming strings being hashed to
        /// perform dictionary/set lookups. Keeping the slices as small as possible minimizes the number of characters
        /// involved in hashing, speeding up the whole process.
        ///
        /// What we do here is pretty simple. We loop over the input strings, looking for the shortest slice with a good
        /// enough uniqueness factor. We look at all the strings both left-justified and right-justified as this maximizes
        /// the opportunities to find unique slices, especially in the case of many strings with the same prefix or suffix.
        ///
        /// In whatever slice we end up with, if all the characters involved in the slice are ASCII and we're doing case-insensitive
        /// operations, then we can select an ASCII-specific case-insensitive comparer which yields faster overall performance.
        /// </remarks>
        public static AnalysisResults Analyze(
            ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength)
        {
            Debug.Assert(!uniqueStrings.IsEmpty);
            bool allUniqueStringsAreConfirmedAscii = ignoreCase && AreAllAscii(uniqueStrings);
 
            // Try to pick a substring comparer. If we can't find a good substring comparer, fallback to a full string comparer.
            AnalysisResults results;
            if (minLength == 0 || !TryUseSubstring(uniqueStrings, allUniqueStringsAreConfirmedAscii, ignoreCase, minLength, maxLength, out results))
            {
                results = CreateAnalysisResults(uniqueStrings, allUniqueStringsAreConfirmedAscii, ignoreCase, minLength, maxLength, 0, 0, static (s, _, _) => s.AsSpan());
            }
 
            return results;
        }
 
        /// <summary>Try to find the minimal unique substring index/length to use for comparisons.</summary>
        private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool allUniqueStringsAreConfirmedAscii, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results)
        {
            const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... it's not worth the increase in algorithmic complexity to analyze longer substrings
 
            // Sufficient uniqueness factor of 95% is good enough.
            // Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
            int acceptableNonUniqueCount = uniqueStrings.Length / 20;
 
            // If we're case-sensitive, we can just use a case-sensitive substring comparer, which is cheap.
            // For case-insensitive / ignoreCase, we don't know which portion of the input strings we're going to care about.
            // However, if all of the strings are entirely ASCII, we know that no portion of any will be non-ASCII and thus can
            // use a comparer that only knows how to do ASCII-based ordinal casing, which is cheaper than full Unicode casing,
            // in particular for GetHashCode.
            SubstringComparer comparer =
                !ignoreCase ? new JustifiedSubstringComparer() :
                allUniqueStringsAreConfirmedAscii ? new JustifiedCaseInsensitiveAsciiSubstringComparer() :
                new JustifiedCaseInsensitiveSubstringComparer();
 
            HashSet<string> set = new HashSet<string>(
#if NET
                uniqueStrings.Length,
#endif
                comparer);
 
            // For each substring length...preferring the shortest length that provides
            // enough uniqueness
            int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit);
            for (int count = 1; count <= maxSubstringLength; count++)
            {
                comparer.IsLeft = true;
                comparer.Count = count;
 
                // For each index from, get a uniqueness factor for the left-justified substrings.
                // If any is above our threshold, we're done.
                for (int index = 0; index <= minLength - count; index++)
                {
                    comparer.Index = index;
 
                    if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount))
                    {
                        results = CreateAnalysisResults(
                            uniqueStrings, allUniqueStringsAreConfirmedAscii, ignoreCase, minLength, maxLength, index, count,
                            static (string s, int index, int count) => s.AsSpan(index, count));
                        return true;
                    }
                }
 
                // There were no left-justified substrings of this length available.
                // If all of the strings are of the same length, then just checking left-justification is sufficient.
                // But if any strings are of different lengths, then we'll get different alignments for left- vs
                // right-justified substrings, and so we also check right-justification.
                if (minLength != maxLength)
                {
                    // toggle the direction and re-use the comparer and hashset (HasSufficientUniquenessFactor clears it)
                    comparer.IsLeft = false;
 
                    // For each index, get a uniqueness factor for the right-justified substrings.
                    // If any is above our threshold, we're done.
                    for (int index = 0; index <= minLength - count; index++)
                    {
                        comparer.Index = -index - count;
 
                        if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount))
                        {
                            results = CreateAnalysisResults(
                                uniqueStrings, allUniqueStringsAreConfirmedAscii, ignoreCase, minLength, maxLength, comparer.Index, count,
                                static (string s, int index, int count) => s.AsSpan(s.Length + index, count));
                            return true;
                        }
                    }
                }
            }
 
            // Could not find a substring index/length that was good enough.
            results = default;
            return false;
        }
 
        private static AnalysisResults CreateAnalysisResults(
            ReadOnlySpan<string> uniqueStrings, bool allUniqueStringsAreConfirmedAscii, bool ignoreCase, int minLength, int maxLength, int index, int count, GetSpan getHashString)
        {
            // Start off by assuming all strings are ASCII
            bool allAsciiIfIgnoreCase = true;
 
            // If we're case-sensitive, it doesn't matter if the strings are ASCII or not.
            // But if we're case-insensitive, we can switch to a faster comparer if all the
            // substrings are ASCII, so we check each.
            if (ignoreCase)
            {
                // Further, if the ASCII keys (in their entirety) don't contain any letters,
                // then we can actually perform the comparison as case-sensitive even if
                // case-insensitive was requested, as there's nothing that would compare
                // equally to the key other than the key itself.
                bool canSwitchIgnoreCaseHashToCaseSensitive = true;
 
                foreach (string uniqueString in uniqueStrings)
                {
                    // Get a span representing the slice of the uniqueString which will be hashed.
                    // If the slice isn't ASCII, bail out to return the results.
                    if (!allUniqueStringsAreConfirmedAscii &&
                        !IsAllAscii(getHashString(uniqueString, index, count)))
                    {
                        allAsciiIfIgnoreCase = false;
                        canSwitchIgnoreCaseHashToCaseSensitive = false;
                        break;
                    }
 
                    // The hash string is ASCII only.  We disable the switch to
                    // case sensitive if by examining the entire uniqueString we
                    // find that it is not ASCII, or that it contains ASCII letters.
                    if (canSwitchIgnoreCaseHashToCaseSensitive)
                    {
                        // If count is 0 then uniqueString equals hashString,
                        // and as we have just checked that IsAllAscii(hashString) is true
                        // then we know IsAllAscii(uniqueString) must be true,
                        // so we can skip the check.
                        if ((count > 0 && !allUniqueStringsAreConfirmedAscii && !IsAllAscii(uniqueString.AsSpan())) ||
                            ContainsAnyAsciiLetters(uniqueString.AsSpan()))
                        {
                            canSwitchIgnoreCaseHashToCaseSensitive = false;
                            if (allUniqueStringsAreConfirmedAscii)
                            {
                                break;
                            }
                        }
                    }
                }
 
                // If we can switch to case-sensitive, do so.
                if (canSwitchIgnoreCaseHashToCaseSensitive)
                {
                    ignoreCase = false;
                }
            }
 
            // Return the analysis results.
            return new AnalysisResults(ignoreCase, allAsciiIfIgnoreCase, index, count, minLength, maxLength);
        }
 
        private delegate ReadOnlySpan<char> GetSpan(string s, int index, int count);
 
        private static bool AreAllAscii(ReadOnlySpan<string> strings)
        {
            foreach (string s in strings)
            {
                if (!IsAllAscii(s.AsSpan()))
                {
                    return false;
                }
            }
 
            return true;
        }
 
        internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
        {
#if NET8_0_OR_GREATER
            return System.Text.Ascii.IsValid(s);
#else
            fixed (char* src = s)
            {
                uint* ptrUInt32 = (uint*)src;
                int length = s.Length;
 
                while (length >= 4)
                {
                    if (!AllCharsInUInt32AreAscii(ptrUInt32[0] | ptrUInt32[1]))
                    {
                        return false;
                    }
 
                    ptrUInt32 += 2;
                    length -= 4;
                }
 
                char* ptrChar = (char*)ptrUInt32;
                while (length-- > 0)
                {
                    char ch = *ptrChar++;
                    if (ch >= 0x80)
                    {
                        return false;
                    }
                }
            }
 
            return true;
 
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            static bool AllCharsInUInt32AreAscii(uint value) => (value & ~0x007F_007Fu) == 0;
#endif
        }
 
#if NET8_0_OR_GREATER
        private static readonly SearchValues<char> s_asciiLetters = SearchValues.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
#endif
        internal static bool ContainsAnyAsciiLetters(ReadOnlySpan<char> s)
        {
            Debug.Assert(IsAllAscii(s));
 
#if NET8_0_OR_GREATER
            return s.ContainsAny(s_asciiLetters);
#else
            foreach (char c in s)
            {
                Debug.Assert(c <= 0x7f);
                if ((uint)((c | 0x20) - 'a') <= (uint)('z' - 'a'))
                {
                    return true;
                }
            }
            return false;
#endif
        }
 
        internal static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings, int acceptableNonUniqueCount)
        {
            set.Clear();
 
            foreach (string s in uniqueStrings)
            {
                if (!set.Add(s) && --acceptableNonUniqueCount < 0)
                {
                    return false;
                }
            }
 
            return true;
        }
 
        internal readonly struct AnalysisResults
        {
            public AnalysisResults(bool ignoreCase, bool allAsciiIfIgnoreCase, int hashIndex, int hashCount, int minLength, int maxLength)
            {
                IgnoreCase = ignoreCase;
                AllAsciiIfIgnoreCase = allAsciiIfIgnoreCase;
                HashIndex = hashIndex;
                HashCount = hashCount;
                MinimumLength = minLength;
                MaximumLengthDiff = maxLength - minLength;
            }
 
            public bool IgnoreCase { get; }
            public bool AllAsciiIfIgnoreCase { get; }
            public int HashIndex { get; }
            public int HashCount { get; }
            public int MinimumLength { get; }
            public int MaximumLengthDiff { get; }
 
            public bool SubstringHashing => HashCount != 0;
            public bool RightJustifiedSubstring => HashIndex < 0;
        }
 
        private abstract class SubstringComparer : IEqualityComparer<string>
        {
            public int Index;
            public int Count;
            public bool IsLeft;
            public abstract bool Equals(string? x, string? y);
            public abstract int GetHashCode(string s);
        }
 
        private sealed class JustifiedSubstringComparer : SubstringComparer
        {
            public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).SequenceEqual(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count));
            public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
        }
 
        private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringComparer
        {
            public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase);
            public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
        }
 
        private sealed class JustifiedCaseInsensitiveAsciiSubstringComparer : SubstringComparer
        {
            public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase);
            public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCaseAscii(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
        }
    }
}