// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
// =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
//
// DistinctQueryOperator.cs
//
// =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Threading;
 
namespace System.Linq.Parallel
{
    /// <summary>
    /// This operator yields all of the distinct elements in a single data set. It works much
    /// like the other set operations (Union, Intersect, and Except), with the obvious difference
    /// being that it accepts only a single data source as input.
    /// </summary>
    /// <typeparam name="TInputOutput"></typeparam>
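    /// <remarks>
    /// This is the operator behind ParallelEnumerable.Distinct: roughly speaking, a query such as
    /// <c>source.AsParallel().Distinct(comparer)</c> is ultimately executed by an instance of this
    /// class (a sketch of the call path, not an exhaustive description).
    /// </remarks>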
    internal sealed class DistinctQueryOperator<TInputOutput> : UnaryQueryOperator<TInputOutput, TInputOutput>
    {
        private readonly IEqualityComparer<TInputOutput>? _comparer; // An (optional) equality comparer.
 
        //---------------------------------------------------------------------------------------
        // Constructs a new distinct operator.
        //
 
        internal DistinctQueryOperator(IEnumerable<TInputOutput> source, IEqualityComparer<TInputOutput>? comparer)
            : base(source)
        {
            Debug.Assert(source != null, "child data source cannot be null");
            _comparer = comparer;
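            // Hash repartitioning scrambles the relative order of the surviving elements, so the
            // ordinal index state of the output is Shuffled.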
            SetOrdinalIndexState(OrdinalIndexState.Shuffled);
        }
 
        //---------------------------------------------------------------------------------------
        // Just opens the current operator, including opening the child and wrapping it with
        // partitions as needed.
        //
 
        internal override QueryResults<TInputOutput> Open(QuerySettings settings, bool preferStriping)
        {
            // We just open our child operator.  Do not propagate the preferStriping value, but
            // instead explicitly set it to false. Regardless of whether the parent prefers striping or range
            // partitioning, the output will be hash-partitioned.
            QueryResults<TInputOutput> childResults = Child.Open(settings, false);
            return new UnaryQueryOperatorResults(childResults, this, settings, false);
        }
 
        internal override void WrapPartitionedStream<TKey>(
            PartitionedStream<TInputOutput, TKey> inputStream, IPartitionedStreamRecipient<TInputOutput> recipient, bool preferStriping, QuerySettings settings)
        {
            // Hash-repartition the source stream
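            // Repartitioning on the elements' hash codes sends all occurrences of a given value to
            // the same partition, so each partition can compute its share of the distinct set
            // independently, with no cross-partition coordination.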
            if (OutputOrdered)
            {
                WrapPartitionedStreamHelper<TKey>(
                    ExchangeUtilities.HashRepartitionOrdered<TInputOutput, NoKeyMemoizationRequired, TKey>(
                        inputStream, null, null, _comparer, settings.CancellationState.MergedCancellationToken),
                    recipient, settings.CancellationState.MergedCancellationToken);
            }
            else
            {
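                // The unordered repartitioning discards the incoming order keys and keys the
                // resulting stream by int, so the helper is instantiated with TKey == int here.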
                WrapPartitionedStreamHelper<int>(
                    ExchangeUtilities.HashRepartition<TInputOutput, NoKeyMemoizationRequired, TKey>(
                        inputStream, null, null, _comparer, settings.CancellationState.MergedCancellationToken),
                    recipient, settings.CancellationState.MergedCancellationToken);
            }
        }
 
        //---------------------------------------------------------------------------------------
        // This is a helper method. WrapPartitionedStream decides what type TKey is going
        // to be, and then calls this method with that key type as a generic parameter.
        //
 
        private void WrapPartitionedStreamHelper<TKey>(
            PartitionedStream<Pair<TInputOutput, NoKeyMemoizationRequired>, TKey> hashStream,
            IPartitionedStreamRecipient<TInputOutput> recipient, CancellationToken cancellationToken)
        {
            int partitionCount = hashStream.PartitionCount;
            PartitionedStream<TInputOutput, TKey> outputStream =
                new PartitionedStream<TInputOutput, TKey>(partitionCount, hashStream.KeyComparer, OrdinalIndexState.Shuffled);
 
            for (int i = 0; i < partitionCount; i++)
            {
                if (OutputOrdered)
                {
                    outputStream[i] =
                        new OrderedDistinctQueryOperatorEnumerator<TKey>(hashStream[i], _comparer, hashStream.KeyComparer, cancellationToken);
                }
                else
                {
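                    // This branch only runs when the helper was instantiated with TKey == int (the
                    // unordered path), so the cast through object succeeds at runtime even though
                    // the compiler cannot verify it for an arbitrary TKey.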
                    outputStream[i] = (QueryOperatorEnumerator<TInputOutput, TKey>)(object)
                        new DistinctQueryOperatorEnumerator<TKey>(hashStream[i], _comparer, cancellationToken);
                }
            }
 
            recipient.Receive(outputStream);
        }
 
        //---------------------------------------------------------------------------------------
        // Whether this operator performs a premature merge that would not be performed in
        // a similar sequential operation (i.e., in LINQ to Objects).
        //
 
        internal override bool LimitsParallelism
        {
            get { return false; }
        }
 
        //---------------------------------------------------------------------------------------
        // This enumerator performs the distinct operation incrementally. It does this by
        // maintaining a history -- in the form of a hash set -- of all elements already seen,
        // and simply skipping any element it has seen before.
        //
 
        private sealed class DistinctQueryOperatorEnumerator<TKey> : QueryOperatorEnumerator<TInputOutput, int>
        {
            private readonly QueryOperatorEnumerator<Pair<TInputOutput, NoKeyMemoizationRequired>, TKey> _source; // The data source.
            private readonly HashSet<TInputOutput> _hashLookup; // The hash lookup, used to produce the distinct set.
            private readonly CancellationToken _cancellationToken;
            private Shared<int>? _outputLoopCount; // Allocated in MoveNext to avoid false sharing.
 
            //---------------------------------------------------------------------------------------
            // Instantiates a new distinct operator enumerator.
            //
 
            internal DistinctQueryOperatorEnumerator(
                QueryOperatorEnumerator<Pair<TInputOutput, NoKeyMemoizationRequired>, TKey> source, IEqualityComparer<TInputOutput>? comparer,
                CancellationToken cancellationToken)
            {
                Debug.Assert(source != null);
                _source = source;
                _hashLookup = new HashSet<TInputOutput>(comparer);
                _cancellationToken = cancellationToken;
            }
 
            //---------------------------------------------------------------------------------------
            // Walks the single data source, skipping elements it has already seen.
            //
 
            internal override bool MoveNext([MaybeNullWhen(false), AllowNull] ref TInputOutput currentElement, ref int currentKey)
            {
                Debug.Assert(_source != null);
                Debug.Assert(_hashLookup != null);
 
                // Iterate over this set's elements until we find a unique element.
                TKey keyUnused = default!;
                Pair<TInputOutput, NoKeyMemoizationRequired> current = default(Pair<TInputOutput, NoKeyMemoizationRequired>);
 
                _outputLoopCount ??= new Shared<int>(0);
 
                while (_source.MoveNext(ref current, ref keyUnused))
                {
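                    // The mask check throttles how often the token is observed, so the
                    // cancellation poll costs almost nothing on this hot path.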
                    if ((_outputLoopCount.Value++ & CancellationState.POLL_INTERVAL) == 0)
                        _cancellationToken.ThrowIfCancellationRequested();
 
                    // We ensure we never return duplicates by tracking them in our set.
                    if (_hashLookup.Add(current.First))
                    {
#if DEBUG
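                        // The output key carries no meaning on the unordered path; poison it in
                        // debug builds so any accidental downstream use is easy to spot.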
                        currentKey = unchecked((int)0xdeadbeef);
#endif
                        currentElement = current.First;
                        return true;
                    }
                }
 
                return false;
            }
 
            protected override void Dispose(bool disposing)
            {
                Debug.Assert(_source != null);
                _source.Dispose();
            }
        }
 
        //---------------------------------------------------------------------------------------
        // Returns an enumerable that represents the query executing sequentially.
        //
 
        internal override IEnumerable<TInputOutput> AsSequentialQuery(CancellationToken token)
        {
            IEnumerable<TInputOutput> wrappedChild = CancellableEnumerable.Wrap(Child.AsSequentialQuery(token), token);
            return wrappedChild.Distinct(_comparer);
        }
 
        private sealed class OrderedDistinctQueryOperatorEnumerator<TKey> : QueryOperatorEnumerator<TInputOutput, TKey>
        {
            private readonly QueryOperatorEnumerator<Pair<TInputOutput, NoKeyMemoizationRequired>, TKey> _source; // The data source.
            private readonly Dictionary<Wrapper<TInputOutput>, TKey> _hashLookup; // The hash lookup, used to produce the distinct set.
            private readonly IComparer<TKey> _keyComparer; // Comparer to decide the key order.
#pragma warning disable CA1859
            private IEnumerator<KeyValuePair<Wrapper<TInputOutput>, TKey>>? _hashLookupEnumerator; // Enumerates over _hashLookup.
#pragma warning restore CA1859
            private readonly CancellationToken _cancellationToken;
 
            //---------------------------------------------------------------------------------------
            // Instantiates a new ordered distinct operator enumerator.
            //
 
            internal OrderedDistinctQueryOperatorEnumerator(
                QueryOperatorEnumerator<Pair<TInputOutput, NoKeyMemoizationRequired>, TKey> source,
                IEqualityComparer<TInputOutput>? comparer, IComparer<TKey> keyComparer,
                CancellationToken cancellationToken)
            {
                Debug.Assert(source != null);
                _source = source;
                _keyComparer = keyComparer;
 
                _hashLookup = new Dictionary<Wrapper<TInputOutput>, TKey>(
                    new WrapperEqualityComparer<TInputOutput>(comparer));
 
                _cancellationToken = cancellationToken;
            }
 
            //---------------------------------------------------------------------------------------
            // Walks the single data source, skipping elements it has already seen.
            //
 
            internal override bool MoveNext([MaybeNullWhen(false), AllowNull] ref TInputOutput currentElement, [AllowNull] ref TKey currentKey)
            {
                Debug.Assert(_source != null);
                Debug.Assert(_hashLookup != null);
 
                if (_hashLookupEnumerator == null)
                {
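                    // First call: drain the source completely, recording for each distinct element
                    // the smallest order key observed. Subsequent calls simply enumerate the
                    // resulting dictionary.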
                    Pair<TInputOutput, NoKeyMemoizationRequired> elem = default(Pair<TInputOutput, NoKeyMemoizationRequired>);
                    TKey orderKey = default!;
 
                    int i = 0;
                    while (_source.MoveNext(ref elem, ref orderKey))
                    {
                        if ((i++ & CancellationState.POLL_INTERVAL) == 0)
                            _cancellationToken.ThrowIfCancellationRequested();
 
                        // For each element, we track the smallest order key seen so far for that element.
                        TKey oldEntry;
 
                        Wrapper<TInputOutput> wrappedElem = new Wrapper<TInputOutput>(elem.First);
 
                        // If this is the first occurrence of this element, or the order key is lower than all keys we saw previously,
                        // update the order key for this element.
                        if (!_hashLookup.TryGetValue(wrappedElem, out oldEntry!) || _keyComparer.Compare(orderKey, oldEntry) < 0)
                        {
                            // For each "elem" value, we store the smallest key, and the element value that had that key.
                            // Note that even though two element values are "equal" according to the EqualityComparer,
                            // we still cannot choose arbitrarily which of the two to yield.
                            _hashLookup[wrappedElem] = orderKey;
                        }
                    }
 
                    _hashLookupEnumerator = _hashLookup.GetEnumerator();
                }
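                // Second phase: yield each surviving (element, smallest order key) pair.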
 
                if (_hashLookupEnumerator.MoveNext())
                {
                    KeyValuePair<Wrapper<TInputOutput>, TKey> currentPair = _hashLookupEnumerator.Current;
                    currentElement = currentPair.Key.Value;
                    currentKey = currentPair.Value;
                    return true;
                }
 
                return false;
            }
 
            protected override void Dispose(bool disposing)
            {
                Debug.Assert(_source != null);
                _source.Dispose();
 
                _hashLookupEnumerator?.Dispose();
            }
        }
    }
}