File: System\Numerics\Tensors\netcore\Common\TensorPrimitives.IBooleanUnaryOperator.cs
Web Access
Project: src\src\libraries\System.Numerics.Tensors\src\System.Numerics.Tensors.csproj (System.Numerics.Tensors)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using static System.Numerics.Tensors.TensorPrimitives;
 
namespace System.Numerics.Tensors
{
    public static unsafe partial class TensorPrimitives
    {
        /// <summary>Unary operator that produces a Boolean result for each element.</summary>
        /// <remarks>For vector-based methods, the Boolean result is either all-bits-set or zero.</remarks>
        private interface IBooleanUnaryOperator<T>
        {
            /// <summary>Gets whether the vector overloads of <c>Invoke</c> may be used; the vectorized code paths in this file check this before calling them.</summary>
            static abstract bool Vectorizable { get; }
            /// <summary>Applies the operator to a single element.</summary>
            /// <param name="x">The element to evaluate.</param>
            /// <returns>The Boolean result for <paramref name="x"/>.</returns>
            static abstract bool Invoke(T x);
            /// <summary>Applies the operator to every element of a 128-bit vector.</summary>
            /// <param name="x">The vector of elements to evaluate.</param>
            /// <returns>A vector whose elements are each all-bits-set or zero (see the type-level remarks).</returns>
            static abstract Vector128<T> Invoke(Vector128<T> x);
            /// <summary>Applies the operator to every element of a 256-bit vector.</summary>
            /// <param name="x">The vector of elements to evaluate.</param>
            /// <returns>A vector whose elements are each all-bits-set or zero (see the type-level remarks).</returns>
            static abstract Vector256<T> Invoke(Vector256<T> x);
            /// <summary>Applies the operator to every element of a 512-bit vector.</summary>
            /// <param name="x">The vector of elements to evaluate.</param>
            /// <returns>A vector whose elements are each all-bits-set or zero (see the type-level remarks).</returns>
            static abstract Vector512<T> Invoke(Vector512<T> x);
        }
 
        /// <summary>Policy used by <c>AggregateAnyAll</c> to fold per-element Boolean results into a single Boolean.</summary>
        /// <remarks>
        /// The aggregation loop returns the negation of <see cref="DefaultResult"/> as soon as one of the
        /// <c>ShouldEarlyExit</c> overloads returns <see langword="true"/>, and returns <see cref="DefaultResult"/>
        /// if the whole input is processed without an early exit.
        /// </remarks>
        private interface IAnyAllAggregator<T>
        {
            /// <summary>Gets the value returned when no operator result triggers an early exit.</summary>
            static abstract bool DefaultResult { get; }
            /// <summary>Determines whether a scalar operator result settles the aggregate.</summary>
            /// <param name="result">The operator result for a single element.</param>
            /// <returns><see langword="true"/> if aggregation should stop and return the negation of <see cref="DefaultResult"/>.</returns>
            static abstract bool ShouldEarlyExit(bool result);
            /// <summary>Determines whether a 128-bit vector of operator results settles the aggregate.</summary>
            /// <param name="result">The per-element operator results (each element expected to be all-bits-set or zero).</param>
            /// <returns><see langword="true"/> if aggregation should stop and return the negation of <see cref="DefaultResult"/>.</returns>
            static abstract bool ShouldEarlyExit(Vector128<T> result);
            /// <summary>Determines whether a 256-bit vector of operator results settles the aggregate.</summary>
            /// <param name="result">The per-element operator results (each element expected to be all-bits-set or zero).</param>
            /// <returns><see langword="true"/> if aggregation should stop and return the negation of <see cref="DefaultResult"/>.</returns>
            static abstract bool ShouldEarlyExit(Vector256<T> result);
            /// <summary>Determines whether a 512-bit vector of operator results settles the aggregate.</summary>
            /// <param name="result">The per-element operator results (each element expected to be all-bits-set or zero).</param>
            /// <returns><see langword="true"/> if aggregation should stop and return the negation of <see cref="DefaultResult"/>.</returns>
            static abstract bool ShouldEarlyExit(Vector512<T> result);
        }
 
        /// <summary>Aggregator with "any" semantics: the aggregate is <see langword="false"/> unless some element's result is true.</summary>
        private readonly struct AnyAggregator<T> : IAnyAllAggregator<T>
        {
            /// <summary>Gets <see langword="false"/>, the result when no element produced a true result.</summary>
            public static bool DefaultResult
            {
                get { return false; }
            }

            /// <summary>A single true result is enough to stop.</summary>
            public static bool ShouldEarlyExit(bool result)
            {
                return result;
            }

#if NET10_0_OR_GREATER
            /// <summary>Stops when any element of <paramref name="result"/> is all-bits-set.</summary>
            public static bool ShouldEarlyExit(Vector128<T> result)
            {
                return Vector128.AnyWhereAllBitsSet(result);
            }

            /// <summary>Stops when any element of <paramref name="result"/> is all-bits-set.</summary>
            public static bool ShouldEarlyExit(Vector256<T> result)
            {
                return Vector256.AnyWhereAllBitsSet(result);
            }

            /// <summary>Stops when any element of <paramref name="result"/> is all-bits-set.</summary>
            public static bool ShouldEarlyExit(Vector512<T> result)
            {
                return Vector512.AnyWhereAllBitsSet(result);
            }
#else
            /// <summary>Stops when any element of <paramref name="result"/> is all-bits-set.</summary>
            public static bool ShouldEarlyExit(Vector128<T> result)
            {
                // Floating-point elements are reinterpreted as same-width integers so the comparison is
                // performed on raw bits (an all-bits-set float/double is a NaN and would not compare equal
                // as a floating-point value).
                if (typeof(T) == typeof(float))
                {
                    return Vector128.EqualsAny(result.AsUInt32(), Vector128<uint>.AllBitsSet);
                }

                if (typeof(T) == typeof(double))
                {
                    return Vector128.EqualsAny(result.AsUInt64(), Vector128<ulong>.AllBitsSet);
                }

                return Vector128.EqualsAny(result, Vector128<T>.AllBitsSet);
            }

            /// <summary>Stops when any element of <paramref name="result"/> is all-bits-set.</summary>
            public static bool ShouldEarlyExit(Vector256<T> result)
            {
                // Same raw-bit comparison for floating-point elements as the 128-bit overload.
                if (typeof(T) == typeof(float))
                {
                    return Vector256.EqualsAny(result.AsUInt32(), Vector256<uint>.AllBitsSet);
                }

                if (typeof(T) == typeof(double))
                {
                    return Vector256.EqualsAny(result.AsUInt64(), Vector256<ulong>.AllBitsSet);
                }

                return Vector256.EqualsAny(result, Vector256<T>.AllBitsSet);
            }

            /// <summary>Stops when any element of <paramref name="result"/> is all-bits-set.</summary>
            public static bool ShouldEarlyExit(Vector512<T> result)
            {
                // Same raw-bit comparison for floating-point elements as the 128-bit overload.
                if (typeof(T) == typeof(float))
                {
                    return Vector512.EqualsAny(result.AsUInt32(), Vector512<uint>.AllBitsSet);
                }

                if (typeof(T) == typeof(double))
                {
                    return Vector512.EqualsAny(result.AsUInt64(), Vector512<ulong>.AllBitsSet);
                }

                return Vector512.EqualsAny(result, Vector512<T>.AllBitsSet);
            }
#endif
        }
 
        /// <summary>Aggregator with "all" semantics: the aggregate is <see langword="true"/> unless some element's result is false.</summary>
        private readonly struct AllAggregator<T> : IAnyAllAggregator<T>
        {
            /// <summary>Gets <see langword="true"/>, the result when no element produced a false result.</summary>
            public static bool DefaultResult
            {
                get { return true; }
            }

            /// <summary>A single false result is enough to stop.</summary>
            public static bool ShouldEarlyExit(bool result)
            {
                return !result;
            }

            /// <summary>Stops when any element of <paramref name="result"/> is zero.</summary>
            public static bool ShouldEarlyExit(Vector128<T> result)
            {
                // Floating-point elements are reinterpreted as same-width integers so the
                // comparison against zero is performed on raw bits.
                if (typeof(T) == typeof(float))
                {
                    return Vector128.EqualsAny(result.AsUInt32(), Vector128<uint>.Zero);
                }

                if (typeof(T) == typeof(double))
                {
                    return Vector128.EqualsAny(result.AsUInt64(), Vector128<ulong>.Zero);
                }

                return Vector128.EqualsAny(result, Vector128<T>.Zero);
            }

            /// <summary>Stops when any element of <paramref name="result"/> is zero.</summary>
            public static bool ShouldEarlyExit(Vector256<T> result)
            {
                // Same raw-bit comparison for floating-point elements as the 128-bit overload.
                if (typeof(T) == typeof(float))
                {
                    return Vector256.EqualsAny(result.AsUInt32(), Vector256<uint>.Zero);
                }

                if (typeof(T) == typeof(double))
                {
                    return Vector256.EqualsAny(result.AsUInt64(), Vector256<ulong>.Zero);
                }

                return Vector256.EqualsAny(result, Vector256<T>.Zero);
            }

            /// <summary>Stops when any element of <paramref name="result"/> is zero.</summary>
            public static bool ShouldEarlyExit(Vector512<T> result)
            {
                // Same raw-bit comparison for floating-point elements as the 128-bit overload.
                if (typeof(T) == typeof(float))
                {
                    return Vector512.EqualsAny(result.AsUInt32(), Vector512<uint>.Zero);
                }

                if (typeof(T) == typeof(double))
                {
                    return Vector512.EqualsAny(result.AsUInt64(), Vector512<ulong>.Zero);
                }

                return Vector512.EqualsAny(result, Vector512<T>.Zero);
            }
        }
 
        /// <summary>Determines whether <typeparamref name="TOperator"/> yields a true result for every element of <paramref name="x"/>.</summary>
        /// <typeparam name="T">The element type.</typeparam>
        /// <typeparam name="TOperator">The Boolean operator evaluated for each element.</typeparam>
        /// <param name="x">The elements to evaluate; the underlying aggregation asserts that it is non-empty.</param>
        /// <returns><see langword="false"/> as soon as an element's result is false; otherwise <see langword="true"/>.</returns>
        private static bool All<T, TOperator>(ReadOnlySpan<T> x)
            where TOperator : struct, IBooleanUnaryOperator<T>
        {
            return AggregateAnyAll<T, TOperator, AllAggregator<T>>(x);
        }
 
        /// <summary>Determines whether <typeparamref name="TOperator"/> yields a true result for at least one element of <paramref name="x"/>.</summary>
        /// <typeparam name="T">The element type.</typeparam>
        /// <typeparam name="TOperator">The Boolean operator evaluated for each element.</typeparam>
        /// <param name="x">The elements to evaluate; the underlying aggregation asserts that it is non-empty.</param>
        /// <returns><see langword="true"/> as soon as an element's result is true; otherwise <see langword="false"/>.</returns>
        private static bool Any<T, TOperator>(ReadOnlySpan<T> x)
            where TOperator : struct, IBooleanUnaryOperator<T>
        {
            return AggregateAnyAll<T, TOperator, AnyAggregator<T>>(x);
        }
 
        /// <summary>
        /// Evaluates <typeparamref name="TOperator"/> over <paramref name="x"/> and folds the results with
        /// <typeparamref name="TAnyAll"/>, stopping at the first result that settles the aggregate.
        /// </summary>
        /// <typeparam name="T">The element type.</typeparam>
        /// <typeparam name="TOperator">The Boolean operator evaluated for each element.</typeparam>
        /// <typeparam name="TAnyAll">The aggregation policy supplying the default result and the early-exit test.</typeparam>
        /// <param name="x">The elements to evaluate. Asserted to be non-empty in debug builds.</param>
        /// <returns>
        /// The negation of <c>TAnyAll.DefaultResult</c> if any evaluated result triggers an early exit;
        /// otherwise <c>TAnyAll.DefaultResult</c>.
        /// </returns>
        private static bool AggregateAnyAll<T, TOperator, TAnyAll>(ReadOnlySpan<T> x)
            where TOperator : struct, IBooleanUnaryOperator<T>
            where TAnyAll : struct, IAnyAllAggregator<T>
        {
            Debug.Assert(!x.IsEmpty);

            ref T source = ref MemoryMarshal.GetReference(x);
            int length = x.Length;
            int offset = 0;

            // Try the widest vector first. A width is only used when the input holds at least one
            // full vector of that width; otherwise control falls through to the next narrower width.
            if (Vector512.IsHardwareAccelerated && TOperator.Vectorizable && Vector512<T>.IsSupported)
            {
                int lastFullStart = length - Vector512<T>.Count;
                if (lastFullStart >= 0)
                {
                    // Evaluate one full vector per iteration.
                    for (; offset <= lastFullStart; offset += Vector512<T>.Count)
                    {
                        Vector512<T> evaluated = TOperator.Invoke(Vector512.LoadUnsafe(ref source, (uint)offset));
                        if (TAnyAll.ShouldEarlyExit(evaluated))
                        {
                            return !TAnyAll.DefaultResult;
                        }
                    }

                    // Leftover elements are covered by one more vector that ends exactly at the end of
                    // the input; it overlaps elements that were already evaluated above.
                    if (offset != length)
                    {
                        Vector512<T> tail = TOperator.Invoke(Vector512.LoadUnsafe(ref source, (uint)lastFullStart));
                        if (TAnyAll.ShouldEarlyExit(tail))
                        {
                            return !TAnyAll.DefaultResult;
                        }
                    }

                    return TAnyAll.DefaultResult;
                }
            }

            if (Vector256.IsHardwareAccelerated && TOperator.Vectorizable && Vector256<T>.IsSupported)
            {
                int lastFullStart = length - Vector256<T>.Count;
                if (lastFullStart >= 0)
                {
                    // Evaluate one full vector per iteration.
                    for (; offset <= lastFullStart; offset += Vector256<T>.Count)
                    {
                        Vector256<T> evaluated = TOperator.Invoke(Vector256.LoadUnsafe(ref source, (uint)offset));
                        if (TAnyAll.ShouldEarlyExit(evaluated))
                        {
                            return !TAnyAll.DefaultResult;
                        }
                    }

                    // Overlapping final vector for any leftover elements.
                    if (offset != length)
                    {
                        Vector256<T> tail = TOperator.Invoke(Vector256.LoadUnsafe(ref source, (uint)lastFullStart));
                        if (TAnyAll.ShouldEarlyExit(tail))
                        {
                            return !TAnyAll.DefaultResult;
                        }
                    }

                    return TAnyAll.DefaultResult;
                }
            }

            if (Vector128.IsHardwareAccelerated && TOperator.Vectorizable && Vector128<T>.IsSupported)
            {
                int lastFullStart = length - Vector128<T>.Count;
                if (lastFullStart >= 0)
                {
                    // Evaluate one full vector per iteration.
                    for (; offset <= lastFullStart; offset += Vector128<T>.Count)
                    {
                        Vector128<T> evaluated = TOperator.Invoke(Vector128.LoadUnsafe(ref source, (uint)offset));
                        if (TAnyAll.ShouldEarlyExit(evaluated))
                        {
                            return !TAnyAll.DefaultResult;
                        }
                    }

                    // Overlapping final vector for any leftover elements.
                    if (offset != length)
                    {
                        Vector128<T> tail = TOperator.Invoke(Vector128.LoadUnsafe(ref source, (uint)lastFullStart));
                        if (TAnyAll.ShouldEarlyExit(tail))
                        {
                            return !TAnyAll.DefaultResult;
                        }
                    }

                    return TAnyAll.DefaultResult;
                }
            }

            // Scalar fallback: reached only when no vector path ran, so offset is still zero here.
            for (; offset < length; offset++)
            {
                bool evaluated = TOperator.Invoke(Unsafe.Add(ref source, offset));
                if (TAnyAll.ShouldEarlyExit(evaluated))
                {
                    return !TAnyAll.DefaultResult;
                }
            }

            return TAnyAll.DefaultResult;
        }
 
        /// <summary>Performs an element-wise operation on <paramref name="x"/> and writes the results to <paramref name="destination"/>.</summary>
        /// <typeparam name="T">The element input type.</typeparam>
        /// <typeparam name="TOperator">Specifies the operation to perform on each element loaded from <paramref name="x"/>.</typeparam>
        private static void InvokeSpanIntoSpan<T, TOperator>(
            ReadOnlySpan<T> x, Span<bool> destination)
            where TOperator : struct, IBooleanUnaryOperator<T>
        {
            if (x.Length > destination.Length)
            {
                ThrowHelper.ThrowArgument_DestinationTooShort();
            }
 
            if (sizeof(T) == 1)
            {
                Vectorized_Size1(x, destination);
            }
            else if (sizeof(T) == 2)
            {
                Vectorized_Size2(x, destination);
            }
            else if (sizeof(T) == 4)
            {
                Vectorized_Size4(x, destination);
            }
            else
            {
                Vectorized_Size8OrOther(x, destination);
            }
 
            static void Vectorized_Size1(ReadOnlySpan<T> x, Span<bool> destination)
            {
                Debug.Assert(sizeof(T) == sizeof(bool));
 
                ref T xRef = ref MemoryMarshal.GetReference(x);
                ref bool destinationRef = ref MemoryMarshal.GetReference(destination);
                int i = 0;
 
                if (Vector512.IsHardwareAccelerated && TOperator.Vectorizable && Vector512<T>.IsSupported)
                {
                    int vectorFromEnd = x.Length - Vector512<T>.Count;
                    if (i <= vectorFromEnd)
                    {
                        // Loop handling one input vector / one output vector at a time.
                        do
                        {
                            Process(ref xRef, ref destinationRef, i);
                            i += Vector512<T>.Count;
                        }
                        while (i <= vectorFromEnd);
 
                        // Handle any remaining elements with a final vector.
                        if (i != x.Length)
                        {
                            i = x.Length - Vector512<T>.Count;
                            Process(ref xRef, ref destinationRef, i);
                        }
 
                        return;
 
                        static void Process(ref T xRef, ref bool destinationRef, int i)
                        {
                            Vector512<byte> v = TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)i)).AsByte();
 
                            (v & Vector512<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i);
                        }
                    }
                }
 
                if (Vector256.IsHardwareAccelerated && TOperator.Vectorizable && Vector256<T>.IsSupported)
                {
                    int vectorFromEnd = x.Length - Vector256<T>.Count;
                    if (i <= vectorFromEnd)
                    {
                        // Loop handling one input vector / one output vector at a time.
                        do
                        {
                            Process(ref xRef, ref destinationRef, i);
                            i += Vector256<T>.Count;
                        }
                        while (i <= vectorFromEnd);
 
                        // Handle any remaining elements with a final vector.
                        if (i != x.Length)
                        {
                            i = x.Length - Vector256<T>.Count;
                            Process(ref xRef, ref destinationRef, i);
                        }
 
                        return;
 
                        static void Process(ref T xRef, ref bool destinationRef, int i)
                        {
                            Vector256<byte> v = TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)i)).AsByte();
 
                            (v & Vector256<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i);
                        }
                    }
                }
 
                if (Vector128.IsHardwareAccelerated && TOperator.Vectorizable && Vector128<T>.IsSupported)
                {
                    int vectorFromEnd = x.Length - Vector128<T>.Count;
                    if (i <= vectorFromEnd)
                    {
                        // Loop handling one input vector / one output vector at a time.
                        do
                        {
                            Process(ref xRef, ref destinationRef, i);
                            i += Vector128<T>.Count;
                        }
                        while (i <= vectorFromEnd);
 
                        // Handle any remaining elements with a final vector.
                        if (i != x.Length)
                        {
                            i = x.Length - Vector128<T>.Count;
                            Process(ref xRef, ref destinationRef, i);
                        }
 
                        return;
 
                        static void Process(ref T xRef, ref bool destinationRef, int i)
                        {
                            Vector128<byte> v = TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)i)).AsByte();
 
                            (v & Vector128<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i);
                        }
                    }
                }
 
                while (i < x.Length)
                {
                    Unsafe.Add(ref destinationRef, i) = TOperator.Invoke(Unsafe.Add(ref xRef, i));
                    i++;
                }
            }
 
            static void Vectorized_Size2(ReadOnlySpan<T> x, Span<bool> destination)
            {
                Debug.Assert(sizeof(T) == 2 * sizeof(bool));
 
                ref T xRef = ref MemoryMarshal.GetReference(x);
                ref bool destinationRef = ref MemoryMarshal.GetReference(destination);
                int i = 0;
 
                if (Vector512.IsHardwareAccelerated && TOperator.Vectorizable && Vector512<T>.IsSupported)
                {
                    int vectorsFromEnd = x.Length - (Vector512<T>.Count * sizeof(T));
                    if (i <= vectorsFromEnd)
                    {
                        // Loop handling two input vectors / one output vector at a time.
                        do
                        {
                            Process(ref xRef, ref destinationRef, i);
                            i += Vector512<T>.Count * sizeof(T);
                        }
                        while (i <= vectorsFromEnd);
 
                        // Handle any remaining elements with final vectors.
                        if (i != x.Length)
                        {
                            i = x.Length - (Vector512<T>.Count * sizeof(T));
                            Process(ref xRef, ref destinationRef, i);
                        }
 
                        return;
 
                        static void Process(ref T xRef, ref bool destinationRef, int i)
                        {
                            Vector512<byte> v =
                                Vector512.Narrow(
                                    TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)i)).AsUInt16(),
                                    TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + Vector512<T>.Count))).AsUInt16());
 
                            (v & Vector512<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i);
                        }
                    }
                }
 
                if (Vector256.IsHardwareAccelerated && TOperator.Vectorizable && Vector256<T>.IsSupported)
                {
                    int vectorsFromEnd = x.Length - (Vector256<T>.Count * sizeof(T));
                    if (i <= vectorsFromEnd)
                    {
                        // Loop handling two input vectors / one output vector at a time.
                        do
                        {
                            Process(ref xRef, ref destinationRef, i);
                            i += Vector256<T>.Count * sizeof(T);
                        }
                        while (i <= vectorsFromEnd);
 
                        // Handle any remaining elements with final vectors.
                        if (i != x.Length)
                        {
                            i = x.Length - (Vector256<T>.Count * sizeof(T));
                            Process(ref xRef, ref destinationRef, i);
                        }
 
                        return;
 
                        static void Process(ref T xRef, ref bool destinationRef, int i)
                        {
                            Vector256<byte> v =
                                Vector256.Narrow(
                                    TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)i)).AsUInt16(),
                                    TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + Vector256<T>.Count))).AsUInt16());
 
                            (v & Vector256<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i);
                        }
                    }
                }
 
                if (Vector128.IsHardwareAccelerated && TOperator.Vectorizable && Vector128<T>.IsSupported)
                {
                    int vectorsFromEnd = x.Length - (Vector128<T>.Count * sizeof(T));
                    if (i <= vectorsFromEnd)
                    {
                        // Loop handling two input vectors / one output vector at a time.
                        do
                        {
                            Process(ref xRef, ref destinationRef, i);
                            i += Vector128<T>.Count * sizeof(T);
                        }
                        while (i <= vectorsFromEnd);
 
                        // Handle any remaining elements with final vectors.
                        if (i != x.Length)
                        {
                            i = x.Length - (Vector128<T>.Count * sizeof(T));
                            Process(ref xRef, ref destinationRef, i);
                        }
 
                        return;
 
                        static void Process(ref T xRef, ref bool destinationRef, int i)
                        {
                            Vector128<byte> v =
                                Vector128.Narrow(
                                    TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)i)).AsUInt16(),
                                    TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + Vector128<T>.Count))).AsUInt16());
 
                            (v & Vector128<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i);
                        }
                    }
                }
 
                while (i < x.Length)
                {
                    Unsafe.Add(ref destinationRef, i) = TOperator.Invoke(Unsafe.Add(ref xRef, i));
                    i++;
                }
            }
 
            static void Vectorized_Size4(ReadOnlySpan<T> x, Span<bool> destination)
            {
                Debug.Assert(sizeof(T) == 4 * sizeof(bool));
 
                ref T xRef = ref MemoryMarshal.GetReference(x);
                ref bool destinationRef = ref MemoryMarshal.GetReference(destination);
                int i = 0;
 
                if (Vector512.IsHardwareAccelerated && TOperator.Vectorizable && Vector512<T>.IsSupported)
                {
                    int vectorsFromEnd = x.Length - (Vector512<T>.Count * sizeof(T));
                    if (i <= vectorsFromEnd)
                    {
                        // Loop handling four input vectors / one output vector at a time.
                        do
                        {
                            Process(ref xRef, ref destinationRef, i);
                            i += Vector512<T>.Count * sizeof(T);
                        }
                        while (i <= vectorsFromEnd);
 
                        // Handle any remaining elements with final vectors.
                        if (i != x.Length)
                        {
                            i = x.Length - (Vector512<T>.Count * sizeof(T));
                            Process(ref xRef, ref destinationRef, i);
                        }
 
                        return;
 
                        static void Process(ref T xRef, ref bool destinationRef, int i)
                        {
                            Vector512<byte> v =
                                Vector512.Narrow(
                                    Vector512.Narrow(
                                        TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)i)).AsUInt32(),
                                        TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + Vector512<T>.Count))).AsUInt32()),
                                    Vector512.Narrow(
                                        TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + (2 * Vector512<T>.Count)))).AsUInt32(),
                                        TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + (3 * Vector512<T>.Count)))).AsUInt32()));
 
                            (v & Vector512<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i);
                        }
                    }
                }
 
                if (Vector256.IsHardwareAccelerated && TOperator.Vectorizable && Vector256<T>.IsSupported)
                {
                    int vectorsFromEnd = x.Length - (Vector256<T>.Count * sizeof(T));
                    if (i <= vectorsFromEnd)
                    {
                        // Loop handling four input vectors / one output vector at a time.
                        do
                        {
                            Process(ref xRef, ref destinationRef, i);
                            i += Vector256<T>.Count * sizeof(T);
                        }
                        while (i <= vectorsFromEnd);
 
                        // Handle any remaining elements with final vectors.
                        if (i != x.Length)
                        {
                            i = x.Length - (Vector256<T>.Count * sizeof(T));
                            Process(ref xRef, ref destinationRef, i);
                        }
 
                        return;
 
                        static void Process(ref T xRef, ref bool destinationRef, int i)
                        {
                            Vector256<byte> v =
                                Vector256.Narrow(
                                    Vector256.Narrow(
                                        TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)i)).AsUInt32(),
                                        TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + Vector256<T>.Count))).AsUInt32()),
                                    Vector256.Narrow(
                                        TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + (2 * Vector256<T>.Count)))).AsUInt32(),
                                        TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + (3 * Vector256<T>.Count)))).AsUInt32()));
 
                            (v & Vector256<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i);
                        }
                    }
                }
 
                if (Vector128.IsHardwareAccelerated && TOperator.Vectorizable && Vector128<T>.IsSupported)
                {
                    int vectorsFromEnd = x.Length - (Vector128<T>.Count * sizeof(T));
                    if (i <= vectorsFromEnd)
                    {
                        // Loop handling four input vectors / one output vector at a time.
                        do
                        {
                            Process(ref xRef, ref destinationRef, i);
                            i += Vector128<T>.Count * sizeof(T);
                        }
                        while (i <= vectorsFromEnd);
 
                        // Handle any remaining elements with final vectors.
                        if (i != x.Length)
                        {
                            i = x.Length - (Vector128<T>.Count * sizeof(T));
                            Process(ref xRef, ref destinationRef, i);
                        }
 
                        return;
 
                        static void Process(ref T xRef, ref bool destinationRef, int i)
                        {
                            Vector128<byte> v =
                                Vector128.Narrow(
                                    Vector128.Narrow(
                                        TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)i)).AsUInt32(),
                                        TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + Vector128<T>.Count))).AsUInt32()),
                                    Vector128.Narrow(
                                        TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + (2 * Vector128<T>.Count)))).AsUInt32(),
                                        TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + (3 * Vector128<T>.Count)))).AsUInt32()));
 
                            (v & Vector128<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i);
                        }
                    }
                }
 
                while (i < x.Length)
                {
                    Unsafe.Add(ref destinationRef, i) = TOperator.Invoke(Unsafe.Add(ref xRef, i));
                    i++;
                }
            }
 
            static void Vectorized_Size8OrOther(ReadOnlySpan<T> x, Span<bool> destination)
            {
                ref T xRef = ref MemoryMarshal.GetReference(x);
                ref bool destinationRef = ref MemoryMarshal.GetReference(destination);
                int i = 0;
 
                if (Vector512.IsHardwareAccelerated && TOperator.Vectorizable && Vector512<T>.IsSupported)
                {
                    Debug.Assert(sizeof(T) == 8 * sizeof(bool));
 
                    int vectorsFromEnd = x.Length - (Vector512<T>.Count * sizeof(T));
                    if (i <= vectorsFromEnd)
                    {
                        // Loop handling eight input vectors / one output vector at a time.
                        do
                        {
                            Process(ref xRef, ref destinationRef, i);
                            i += Vector512<T>.Count * sizeof(T);
                        }
                        while (i <= vectorsFromEnd);
 
                        // Handle any remaining elements with final vectors.
                        if (i != x.Length)
                        {
                            i = x.Length - (Vector512<T>.Count * sizeof(T));
                            Process(ref xRef, ref destinationRef, i);
                        }
 
                        return;
 
                        static void Process(ref T xRef, ref bool destinationRef, int i)
                        {
                            Vector512<byte> v =
                                Vector512.Narrow(
                                    Vector512.Narrow(
                                        Vector512.Narrow(
                                            TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)i)).AsUInt64(),
                                            TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + Vector512<T>.Count))).AsUInt64()),
                                        Vector512.Narrow(
                                            TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + (2 * Vector512<T>.Count)))).AsUInt64(),
                                            TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + (3 * Vector512<T>.Count)))).AsUInt64())),
                                    Vector512.Narrow(
                                        Vector512.Narrow(
                                            TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + (4 * Vector512<T>.Count)))).AsUInt64(),
                                            TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + (5 * Vector512<T>.Count)))).AsUInt64()),
                                        Vector512.Narrow(
                                            TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + (6 * Vector512<T>.Count)))).AsUInt64(),
                                            TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + (7 * Vector512<T>.Count)))).AsUInt64())));
 
                            (v & Vector512<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i);
                        }
                    }
                }
 
                if (Vector256.IsHardwareAccelerated && TOperator.Vectorizable && Vector256<T>.IsSupported)
                {
                    Debug.Assert(sizeof(T) == 8 * sizeof(bool));
 
                    int vectorsFromEnd = x.Length - (Vector256<T>.Count * sizeof(T));
                    if (i <= vectorsFromEnd)
                    {
                        // Loop handling eight input vectors / one output vector at a time.
                        do
                        {
                            Process(ref xRef, ref destinationRef, i);
                            i += Vector256<T>.Count * sizeof(T);
                        }
                        while (i <= vectorsFromEnd);
 
                        // Handle any remaining elements with final vectors.
                        if (i != x.Length)
                        {
                            i = x.Length - (Vector256<T>.Count * sizeof(T));
                            Process(ref xRef, ref destinationRef, i);
                        }
 
                        return;
 
                        // Applies TOperator to the eight consecutive Vector256<T> inputs that start at element
                        // offset i and writes one Vector256<byte> of 0/1 values to the destination at offset i.
                        static void Process(ref T xRef, ref bool destinationRef, int i)
                        {
                            int step = Vector256<T>.Count;

                            // Each operator result is viewed as 64-bit lanes; adjacent results are then narrowed
                            // pairwise in three stages: 64-bit -> 32-bit -> 16-bit -> 8-bit.
                            Vector256<ulong> r0 = TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)i)).AsUInt64();
                            Vector256<ulong> r1 = TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + step))).AsUInt64();
                            Vector256<uint> n01 = Vector256.Narrow(r0, r1);

                            Vector256<ulong> r2 = TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + (2 * step)))).AsUInt64();
                            Vector256<ulong> r3 = TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + (3 * step)))).AsUInt64();
                            Vector256<uint> n23 = Vector256.Narrow(r2, r3);

                            Vector256<ushort> lowerHalf = Vector256.Narrow(n01, n23);

                            Vector256<ulong> r4 = TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + (4 * step)))).AsUInt64();
                            Vector256<ulong> r5 = TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + (5 * step)))).AsUInt64();
                            Vector256<uint> n45 = Vector256.Narrow(r4, r5);

                            Vector256<ulong> r6 = TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + (6 * step)))).AsUInt64();
                            Vector256<ulong> r7 = TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + (7 * step)))).AsUInt64();
                            Vector256<uint> n67 = Vector256.Narrow(r6, r7);

                            Vector256<ushort> upperHalf = Vector256.Narrow(n45, n67);

                            Vector256<byte> packed = Vector256.Narrow(lowerHalf, upperHalf);

                            // Keep only the low bit of every byte so that each stored element is 0 or 1.
                            ref byte destinationBytes = ref Unsafe.As<bool, byte>(ref destinationRef);
                            (packed & Vector256<byte>.One).StoreUnsafe(ref destinationBytes, (uint)i);
                        }
                    }
                }
 
                if (Vector128.IsHardwareAccelerated && TOperator.Vectorizable && Vector128<T>.IsSupported)
                {
                    Debug.Assert(sizeof(T) == 8 * sizeof(bool));
 
                    int vectorsFromEnd = x.Length - (Vector128<T>.Count * sizeof(T));
                    if (i <= vectorsFromEnd)
                    {
                        // Loop handling eight input vectors / one output vector at a time.
                        do
                        {
                            Process(ref xRef, ref destinationRef, i);
                            i += Vector128<T>.Count * sizeof(T);
                        }
                        while (i <= vectorsFromEnd);
 
                        // Handle any remaining elements with final vectors.
                        if (i != x.Length)
                        {
                            i = x.Length - (Vector128<T>.Count * sizeof(T));
                            Process(ref xRef, ref destinationRef, i);
                        }
 
                        return;
 
                        // Applies TOperator to the eight consecutive Vector128<T> inputs that start at element
                        // offset i and writes one Vector128<byte> of 0/1 values to the destination at offset i.
                        static void Process(ref T xRef, ref bool destinationRef, int i)
                        {
                            int step = Vector128<T>.Count;

                            // Each operator result is viewed as 64-bit lanes; adjacent results are then narrowed
                            // pairwise in three stages: 64-bit -> 32-bit -> 16-bit -> 8-bit.
                            Vector128<ulong> r0 = TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)i)).AsUInt64();
                            Vector128<ulong> r1 = TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + step))).AsUInt64();
                            Vector128<uint> n01 = Vector128.Narrow(r0, r1);

                            Vector128<ulong> r2 = TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + (2 * step)))).AsUInt64();
                            Vector128<ulong> r3 = TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + (3 * step)))).AsUInt64();
                            Vector128<uint> n23 = Vector128.Narrow(r2, r3);

                            Vector128<ushort> lowerHalf = Vector128.Narrow(n01, n23);

                            Vector128<ulong> r4 = TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + (4 * step)))).AsUInt64();
                            Vector128<ulong> r5 = TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + (5 * step)))).AsUInt64();
                            Vector128<uint> n45 = Vector128.Narrow(r4, r5);

                            Vector128<ulong> r6 = TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + (6 * step)))).AsUInt64();
                            Vector128<ulong> r7 = TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + (7 * step)))).AsUInt64();
                            Vector128<uint> n67 = Vector128.Narrow(r6, r7);

                            Vector128<ushort> upperHalf = Vector128.Narrow(n45, n67);

                            Vector128<byte> packed = Vector128.Narrow(lowerHalf, upperHalf);

                            // Keep only the low bit of every byte so that each stored element is 0 or 1.
                            ref byte destinationBytes = ref Unsafe.As<bool, byte>(ref destinationRef);
                            (packed & Vector128<byte>.One).StoreUnsafe(ref destinationBytes, (uint)i);
                        }
                    }
                }
 
                while (i < x.Length)
                {
                    Unsafe.Add(ref destinationRef, i) = TOperator.Invoke(Unsafe.Add(ref xRef, i));
                    i++;
                }
            }
        }
    }
}