// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; namespace System.Numerics.Tensors { public static unsafe partial class TensorPrimitives { /// <summary>Unary operator that produces a Boolean result for each element.</summary> /// <remarks>For vector-based methods, the Boolean result is either all-bits-set or zero.</remarks> private interface IBooleanUnaryOperator<T> { static abstract bool Vectorizable { get; } static abstract bool Invoke(T x); static abstract Vector128<T> Invoke(Vector128<T> x); static abstract Vector256<T> Invoke(Vector256<T> x); static abstract Vector512<T> Invoke(Vector512<T> x); } private interface IAnyAllAggregator<T> { static abstract bool DefaultResult { get; } static abstract bool ShouldEarlyExit(bool result); static abstract bool ShouldEarlyExit(Vector128<T> result); static abstract bool ShouldEarlyExit(Vector256<T> result); static abstract bool ShouldEarlyExit(Vector512<T> result); } private readonly struct AnyAggregator<T> : IAnyAllAggregator<T> { public static bool DefaultResult => false; public static bool ShouldEarlyExit(bool result) => result; public static bool ShouldEarlyExit(Vector128<T> result) => Vector128.AnyWhereAllBitsSet(result); public static bool ShouldEarlyExit(Vector256<T> result) => Vector256.AnyWhereAllBitsSet(result); public static bool ShouldEarlyExit(Vector512<T> result) => Vector512.AnyWhereAllBitsSet(result); } private readonly struct AllAggregator<T> : IAnyAllAggregator<T> { public static bool DefaultResult => true; public static bool ShouldEarlyExit(bool result) => !result; public static bool ShouldEarlyExit(Vector128<T> result) => typeof(T) == typeof(float) ? Vector128.EqualsAny(result.AsUInt32(), Vector128<uint>.Zero) : typeof(T) == typeof(double) ? Vector128.EqualsAny(result.AsUInt64(), Vector128<ulong>.Zero) : Vector128.EqualsAny(result, Vector128<T>.Zero); public static bool ShouldEarlyExit(Vector256<T> result) => typeof(T) == typeof(float) ? Vector256.EqualsAny(result.AsUInt32(), Vector256<uint>.Zero) : typeof(T) == typeof(double) ? Vector256.EqualsAny(result.AsUInt64(), Vector256<ulong>.Zero) : Vector256.EqualsAny(result, Vector256<T>.Zero); public static bool ShouldEarlyExit(Vector512<T> result) => typeof(T) == typeof(float) ? Vector512.EqualsAny(result.AsUInt32(), Vector512<uint>.Zero) : typeof(T) == typeof(double) ? Vector512.EqualsAny(result.AsUInt64(), Vector512<ulong>.Zero) : Vector512.EqualsAny(result, Vector512<T>.Zero); } private static bool All<T, TOperator>(ReadOnlySpan<T> x) where TOperator : struct, IBooleanUnaryOperator<T> => AggregateAnyAll<T, TOperator, AllAggregator<T>>(x); private static bool Any<T, TOperator>(ReadOnlySpan<T> x) where TOperator : struct, IBooleanUnaryOperator<T> => AggregateAnyAll<T, TOperator, AnyAggregator<T>>(x); private static bool AggregateAnyAll<T, TOperator, TAnyAll>(ReadOnlySpan<T> x) where TOperator : struct, IBooleanUnaryOperator<T> where TAnyAll : struct, IAnyAllAggregator<T> { Debug.Assert(!x.IsEmpty); ref T xRef = ref MemoryMarshal.GetReference(x); int i = 0, oneVectorFromEnd; if (Vector512.IsHardwareAccelerated && TOperator.Vectorizable && Vector512<T>.IsSupported) { oneVectorFromEnd = x.Length - Vector512<T>.Count; if (i <= oneVectorFromEnd) { // Loop handling one vector at a time. do { if (TAnyAll.ShouldEarlyExit(TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)i)))) { return !TAnyAll.DefaultResult; } i += Vector512<T>.Count; } while (i <= oneVectorFromEnd); // Handle any remaining elements with a final vector. if (i != x.Length && TAnyAll.ShouldEarlyExit(TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(x.Length - Vector512<T>.Count))))) { return !TAnyAll.DefaultResult; } return TAnyAll.DefaultResult; } } if (Vector256.IsHardwareAccelerated && TOperator.Vectorizable && Vector256<T>.IsSupported) { oneVectorFromEnd = x.Length - Vector256<T>.Count; if (i <= oneVectorFromEnd) { // Loop handling one vector at a time. do { if (TAnyAll.ShouldEarlyExit(TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)i)))) { return !TAnyAll.DefaultResult; } i += Vector256<T>.Count; } while (i <= oneVectorFromEnd); // Handle any remaining elements with a final vector. if (i != x.Length && TAnyAll.ShouldEarlyExit(TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(x.Length - Vector256<T>.Count))))) { return !TAnyAll.DefaultResult; } return TAnyAll.DefaultResult; } } if (Vector128.IsHardwareAccelerated && TOperator.Vectorizable && Vector128<T>.IsSupported) { oneVectorFromEnd = x.Length - Vector128<T>.Count; if (i <= oneVectorFromEnd) { // Loop handling one vector at a time. do { if (TAnyAll.ShouldEarlyExit(TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)i)))) { return !TAnyAll.DefaultResult; } i += Vector128<T>.Count; } while (i <= oneVectorFromEnd); // Handle any remaining elements with a final vector. if (i != x.Length && TAnyAll.ShouldEarlyExit(TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(x.Length - Vector128<T>.Count))))) { return !TAnyAll.DefaultResult; } return TAnyAll.DefaultResult; } } while (i < x.Length) { if (TAnyAll.ShouldEarlyExit(TOperator.Invoke(Unsafe.Add(ref xRef, i)))) { return !TAnyAll.DefaultResult; } i++; } return TAnyAll.DefaultResult; } /// <summary>Performs an element-wise operation on <paramref name="x"/> and writes the results to <paramref name="destination"/>.</summary> /// <typeparam name="T">The element input type.</typeparam> /// <typeparam name="TOperator">Specifies the operation to perform on each element loaded from <paramref name="x"/>.</typeparam> private static void InvokeSpanIntoSpan<T, TOperator>( ReadOnlySpan<T> x, Span<bool> destination) where TOperator : struct, IBooleanUnaryOperator<T> { if (x.Length > destination.Length) { ThrowHelper.ThrowArgument_DestinationTooShort(); } if (sizeof(T) == 1) { Vectorized_Size1(x, destination); } else if (sizeof(T) == 2) { Vectorized_Size2(x, destination); } else if (sizeof(T) == 4) { Vectorized_Size4(x, destination); } else { Vectorized_Size8OrOther(x, destination); } static void Vectorized_Size1(ReadOnlySpan<T> x, Span<bool> destination) { Debug.Assert(sizeof(T) == sizeof(bool)); ref T xRef = ref MemoryMarshal.GetReference(x); ref bool destinationRef = ref MemoryMarshal.GetReference(destination); int i = 0; if (Vector512.IsHardwareAccelerated && TOperator.Vectorizable && Vector512<T>.IsSupported) { int vectorFromEnd = x.Length - Vector512<T>.Count; if (i <= vectorFromEnd) { // Loop handling one input vector / one output vector at a time. do { Process(ref xRef, ref destinationRef, i); i += Vector512<T>.Count; } while (i <= vectorFromEnd); // Handle any remaining elements with a final vector. if (i != x.Length) { i = x.Length - Vector512<T>.Count; Process(ref xRef, ref destinationRef, i); } return; static void Process(ref T xRef, ref bool destinationRef, int i) { Vector512<byte> v = TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)i)).AsByte(); (v & Vector512<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i); } } } if (Vector256.IsHardwareAccelerated && TOperator.Vectorizable && Vector256<T>.IsSupported) { int vectorFromEnd = x.Length - Vector256<T>.Count; if (i <= vectorFromEnd) { // Loop handling one input vector / one output vector at a time. do { Process(ref xRef, ref destinationRef, i); i += Vector256<T>.Count; } while (i <= vectorFromEnd); // Handle any remaining elements with a final vector. if (i != x.Length) { i = x.Length - Vector256<T>.Count; Process(ref xRef, ref destinationRef, i); } return; static void Process(ref T xRef, ref bool destinationRef, int i) { Vector256<byte> v = TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)i)).AsByte(); (v & Vector256<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i); } } } if (Vector128.IsHardwareAccelerated && TOperator.Vectorizable && Vector128<T>.IsSupported) { int vectorFromEnd = x.Length - Vector128<T>.Count; if (i <= vectorFromEnd) { // Loop handling one input vector / one output vector at a time. do { Process(ref xRef, ref destinationRef, i); i += Vector128<T>.Count; } while (i <= vectorFromEnd); // Handle any remaining elements with a final vector. if (i != x.Length) { i = x.Length - Vector128<T>.Count; Process(ref xRef, ref destinationRef, i); } return; static void Process(ref T xRef, ref bool destinationRef, int i) { Vector128<byte> v = TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)i)).AsByte(); (v & Vector128<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i); } } } while (i < x.Length) { Unsafe.Add(ref destinationRef, i) = TOperator.Invoke(Unsafe.Add(ref xRef, i)); i++; } } static void Vectorized_Size2(ReadOnlySpan<T> x, Span<bool> destination) { Debug.Assert(sizeof(T) == 2 * sizeof(bool)); ref T xRef = ref MemoryMarshal.GetReference(x); ref bool destinationRef = ref MemoryMarshal.GetReference(destination); int i = 0; if (Vector512.IsHardwareAccelerated && TOperator.Vectorizable && Vector512<T>.IsSupported) { int vectorsFromEnd = x.Length - (Vector512<T>.Count * sizeof(T)); if (i <= vectorsFromEnd) { // Loop handling two input vectors / one output vector at a time. do { Process(ref xRef, ref destinationRef, i); i += Vector512<T>.Count * sizeof(T); } while (i <= vectorsFromEnd); // Handle any remaining elements with final vectors. if (i != x.Length) { i = x.Length - (Vector512<T>.Count * sizeof(T)); Process(ref xRef, ref destinationRef, i); } return; static void Process(ref T xRef, ref bool destinationRef, int i) { Vector512<byte> v = Vector512.Narrow( TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)i)).AsUInt16(), TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + Vector512<T>.Count))).AsUInt16()); (v & Vector512<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i); } } } if (Vector256.IsHardwareAccelerated && TOperator.Vectorizable && Vector256<T>.IsSupported) { int vectorsFromEnd = x.Length - (Vector256<T>.Count * sizeof(T)); if (i <= vectorsFromEnd) { // Loop handling two input vectors / one output vector at a time. do { Process(ref xRef, ref destinationRef, i); i += Vector256<T>.Count * sizeof(T); } while (i <= vectorsFromEnd); // Handle any remaining elements with final vectors. if (i != x.Length) { i = x.Length - (Vector256<T>.Count * sizeof(T)); Process(ref xRef, ref destinationRef, i); } return; static void Process(ref T xRef, ref bool destinationRef, int i) { Vector256<byte> v = Vector256.Narrow( TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)i)).AsUInt16(), TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + Vector256<T>.Count))).AsUInt16()); (v & Vector256<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i); } } } if (Vector128.IsHardwareAccelerated && TOperator.Vectorizable && Vector128<T>.IsSupported) { int vectorsFromEnd = x.Length - (Vector128<T>.Count * sizeof(T)); if (i <= vectorsFromEnd) { // Loop handling two input vectors / one output vector at a time. do { Process(ref xRef, ref destinationRef, i); i += Vector128<T>.Count * sizeof(T); } while (i <= vectorsFromEnd); // Handle any remaining elements with final vectors. if (i != x.Length) { i = x.Length - (Vector128<T>.Count * sizeof(T)); Process(ref xRef, ref destinationRef, i); } return; static void Process(ref T xRef, ref bool destinationRef, int i) { Vector128<byte> v = Vector128.Narrow( TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)i)).AsUInt16(), TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + Vector128<T>.Count))).AsUInt16()); (v & Vector128<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i); } } } while (i < x.Length) { Unsafe.Add(ref destinationRef, i) = TOperator.Invoke(Unsafe.Add(ref xRef, i)); i++; } } static void Vectorized_Size4(ReadOnlySpan<T> x, Span<bool> destination) { Debug.Assert(sizeof(T) == 4 * sizeof(bool)); ref T xRef = ref MemoryMarshal.GetReference(x); ref bool destinationRef = ref MemoryMarshal.GetReference(destination); int i = 0; if (Vector512.IsHardwareAccelerated && TOperator.Vectorizable && Vector512<T>.IsSupported) { int vectorsFromEnd = x.Length - (Vector512<T>.Count * sizeof(T)); if (i <= vectorsFromEnd) { // Loop handling two input vectors / one output vector at a time. do { Process(ref xRef, ref destinationRef, i); i += Vector512<T>.Count * sizeof(T); } while (i <= vectorsFromEnd); // Handle any remaining elements with final vectors. if (i != x.Length) { i = x.Length - (Vector512<T>.Count * sizeof(T)); Process(ref xRef, ref destinationRef, i); } return; static void Process(ref T xRef, ref bool destinationRef, int i) { Vector512<byte> v = Vector512.Narrow( Vector512.Narrow( TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)i)).AsUInt32(), TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + Vector512<T>.Count))).AsUInt32()), Vector512.Narrow( TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + (2 * Vector512<T>.Count)))).AsUInt32(), TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + (3 * Vector512<T>.Count)))).AsUInt32())); (v & Vector512<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i); } } } if (Vector256.IsHardwareAccelerated && TOperator.Vectorizable && Vector256<T>.IsSupported) { int vectorsFromEnd = x.Length - (Vector256<T>.Count * sizeof(T)); if (i <= vectorsFromEnd) { // Loop handling two input vectors / one output vector at a time. do { Process(ref xRef, ref destinationRef, i); i += Vector256<T>.Count * sizeof(T); } while (i <= vectorsFromEnd); // Handle any remaining elements with final vectors. if (i != x.Length) { i = x.Length - (Vector256<T>.Count * sizeof(T)); Process(ref xRef, ref destinationRef, i); } return; static void Process(ref T xRef, ref bool destinationRef, int i) { Vector256<byte> v = Vector256.Narrow( Vector256.Narrow( TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)i)).AsUInt32(), TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + Vector256<T>.Count))).AsUInt32()), Vector256.Narrow( TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + (2 * Vector256<T>.Count)))).AsUInt32(), TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + (3 * Vector256<T>.Count)))).AsUInt32())); (v & Vector256<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i); } } } if (Vector128.IsHardwareAccelerated && TOperator.Vectorizable && Vector128<T>.IsSupported) { int vectorsFromEnd = x.Length - (Vector128<T>.Count * sizeof(T)); if (i <= vectorsFromEnd) { // Loop handling two input vectors / one output vector at a time. do { Process(ref xRef, ref destinationRef, i); i += Vector128<T>.Count * sizeof(T); } while (i <= vectorsFromEnd); // Handle any remaining elements with final vectors. if (i != x.Length) { i = x.Length - (Vector128<T>.Count * sizeof(T)); Process(ref xRef, ref destinationRef, i); } return; static void Process(ref T xRef, ref bool destinationRef, int i) { Vector128<byte> v = Vector128.Narrow( Vector128.Narrow( TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)i)).AsUInt32(), TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + Vector128<T>.Count))).AsUInt32()), Vector128.Narrow( TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + (2 * Vector128<T>.Count)))).AsUInt32(), TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + (3 * Vector128<T>.Count)))).AsUInt32())); (v & Vector128<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i); } } } while (i < x.Length) { Unsafe.Add(ref destinationRef, i) = TOperator.Invoke(Unsafe.Add(ref xRef, i)); i++; } } static void Vectorized_Size8OrOther(ReadOnlySpan<T> x, Span<bool> destination) { ref T xRef = ref MemoryMarshal.GetReference(x); ref bool destinationRef = ref MemoryMarshal.GetReference(destination); int i = 0; if (Vector512.IsHardwareAccelerated && TOperator.Vectorizable && Vector512<T>.IsSupported) { Debug.Assert(sizeof(T) == 8 * sizeof(bool)); int vectorsFromEnd = x.Length - (Vector512<T>.Count * sizeof(T)); if (i <= vectorsFromEnd) { // Loop handling two input vectors / one output vector at a time. do { Process(ref xRef, ref destinationRef, i); i += Vector512<T>.Count * sizeof(T); } while (i <= vectorsFromEnd); // Handle any remaining elements with final vectors. if (i != x.Length) { i = x.Length - (Vector512<T>.Count * sizeof(T)); Process(ref xRef, ref destinationRef, i); } return; static void Process(ref T xRef, ref bool destinationRef, int i) { Vector512<byte> v = Vector512.Narrow( Vector512.Narrow( Vector512.Narrow( TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)i)).AsUInt64(), TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + Vector512<T>.Count))).AsUInt64()), Vector512.Narrow( TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + (2 * Vector512<T>.Count)))).AsUInt64(), TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + (3 * Vector512<T>.Count)))).AsUInt64())), Vector512.Narrow( Vector512.Narrow( TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + (4 * Vector512<T>.Count)))).AsUInt64(), TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + (5 * Vector512<T>.Count)))).AsUInt64()), Vector512.Narrow( TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + (6 * Vector512<T>.Count)))).AsUInt64(), TOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(i + (7 * Vector512<T>.Count)))).AsUInt64()))); (v & Vector512<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i); } } } if (Vector256.IsHardwareAccelerated && TOperator.Vectorizable && Vector256<T>.IsSupported) { Debug.Assert(sizeof(T) == 8 * sizeof(bool)); int vectorsFromEnd = x.Length - (Vector256<T>.Count * sizeof(T)); if (i <= vectorsFromEnd) { // Loop handling two input vectors / one output vector at a time. do { Process(ref xRef, ref destinationRef, i); i += Vector256<T>.Count * sizeof(T); } while (i <= vectorsFromEnd); // Handle any remaining elements with final vectors. if (i != x.Length) { i = x.Length - (Vector256<T>.Count * sizeof(T)); Process(ref xRef, ref destinationRef, i); } return; static void Process(ref T xRef, ref bool destinationRef, int i) { Vector256<byte> v = Vector256.Narrow( Vector256.Narrow( Vector256.Narrow( TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)i)).AsUInt64(), TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + Vector256<T>.Count))).AsUInt64()), Vector256.Narrow( TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + (2 * Vector256<T>.Count)))).AsUInt64(), TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + (3 * Vector256<T>.Count)))).AsUInt64())), Vector256.Narrow( Vector256.Narrow( TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + (4 * Vector256<T>.Count)))).AsUInt64(), TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + (5 * Vector256<T>.Count)))).AsUInt64()), Vector256.Narrow( TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + (6 * Vector256<T>.Count)))).AsUInt64(), TOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(i + (7 * Vector256<T>.Count)))).AsUInt64()))); (v & Vector256<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i); } } } if (Vector128.IsHardwareAccelerated && TOperator.Vectorizable && Vector128<T>.IsSupported) { Debug.Assert(sizeof(T) == 8 * sizeof(bool)); int vectorsFromEnd = x.Length - (Vector128<T>.Count * sizeof(T)); if (i <= vectorsFromEnd) { // Loop handling two input vectors / one output vector at a time. do { Process(ref xRef, ref destinationRef, i); i += Vector128<T>.Count * sizeof(T); } while (i <= vectorsFromEnd); // Handle any remaining elements with final vectors. if (i != x.Length) { i = x.Length - (Vector128<T>.Count * sizeof(T)); Process(ref xRef, ref destinationRef, i); } return; static void Process(ref T xRef, ref bool destinationRef, int i) { Vector128<byte> v = Vector128.Narrow( Vector128.Narrow( Vector128.Narrow( TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)i)).AsUInt64(), TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + Vector128<T>.Count))).AsUInt64()), Vector128.Narrow( TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + (2 * Vector128<T>.Count)))).AsUInt64(), TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + (3 * Vector128<T>.Count)))).AsUInt64())), Vector128.Narrow( Vector128.Narrow( TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + (4 * Vector128<T>.Count)))).AsUInt64(), TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + (5 * Vector128<T>.Count)))).AsUInt64()), Vector128.Narrow( TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + (6 * Vector128<T>.Count)))).AsUInt64(), TOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(i + (7 * Vector128<T>.Count)))).AsUInt64()))); (v & Vector128<byte>.One).StoreUnsafe(ref Unsafe.As<bool, byte>(ref destinationRef), (uint)i); } } } while (i < x.Length) { Unsafe.Add(ref destinationRef, i) = TOperator.Invoke(Unsafe.Add(ref xRef, i)); i++; } } } } } |