File: System\Numerics\Tensors\netcore\TensorPrimitives.PopCount.cs
Project: src\src\libraries\System.Numerics.Tensors\src\System.Numerics.Tensors.csproj (System.Numerics.Tensors)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.Wasm;
 
namespace System.Numerics.Tensors
{
    public static partial class TensorPrimitives
    {
        /// <summary>Computes the population count of all elements in the specified tensor.</summary>
        /// <param name="x">The tensor, represented as a span.</param>
        /// <returns>The sum of the number of bits set in each element in <paramref name="x"/>.</returns>
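        /// <example>
        /// A minimal sketch of calling this overload (illustrative only; not part of the original source):
        /// <code>
        /// byte[] data = { 0b1011, 0b0001 }; // popcounts 3 and 1
        /// long totalBits = TensorPrimitives.PopCount&lt;byte&gt;(data); // 4
        /// </code>
        /// </example>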
        public static long PopCount<T>(ReadOnlySpan<T> x) where T : IBinaryInteger<T>
        {
            long count = 0;
            for (int i = 0; i < x.Length; i++)
            {
                count += long.CreateTruncating(T.PopCount(x[i]));
            }
 
            return count;
        }
 
        /// <summary>Computes the element-wise population count of numbers in the specified tensor.</summary>
        /// <param name="x">The tensor, represented as a span.</param>
        /// <param name="destination">The destination tensor, represented as a span.</param>
        /// <exception cref="ArgumentException">Destination is too short.</exception>
        /// <exception cref="ArgumentException"><paramref name="x"/> and <paramref name="destination"/> reference overlapping memory locations and do not begin at the same location.</exception>
        /// <remarks>
        /// <para>
        /// This method effectively computes <c><paramref name="destination" />[i] = T.PopCount(<paramref name="x" />[i])</c>.
        /// </para>
        /// </remarks>
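        /// <example>
        /// A minimal sketch of calling this overload (illustrative only; not part of the original source):
        /// <code>
        /// byte[] values = { 0xFF, 0x0F, 0x00 };
        /// byte[] counts = new byte[values.Length];
        /// TensorPrimitives.PopCount&lt;byte&gt;(values, counts); // counts is { 8, 4, 0 }
        /// </code>
        /// </example>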
        public static void PopCount<T>(ReadOnlySpan<T> x, Span<T> destination)
            where T : IBinaryInteger<T> =>
            InvokeSpanIntoSpan<T, PopCountOperator<T>>(x, destination);
 
        /// <summary>T.PopCount(x)</summary>
        private readonly unsafe struct PopCountOperator<T> : IUnaryOperator<T, T> where T : IBinaryInteger<T>
        {
            // TODO https://github.com/dotnet/runtime/issues/96162: Use AVX512 popcount operations when available
 
            public static bool Vectorizable =>
                // The implementation uses a vectorized version of the BitOperations.PopCount software fallback:
                // https://github.com/dotnet/runtime/blob/aff061bab1b6d9ccd5731bd16fa8e89ad82ab75a/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs#L496-L508
                // This relies on 64-bit shifts for sizeof(T) == 8, and such shifts aren't accelerated on today's hardware.
                // Alternative approaches, such as doing two 32-bit operations and combining them, were observed not to
                // provide any meaningful speedup over scalar. So for now, we don't vectorize when sizeof(T) == 8,
                // except on Arm, where AdvSimd's native per-byte popcount plus widening pairwise adds handles 64-bit lanes.
                (sizeof(T) is 1 or 2 or 4) || (AdvSimd.IsSupported && sizeof(T) == 8);
 
            public static T Invoke(T x) => T.PopCount(x);
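
            // The vector overloads below implement a lane-parallel version of the classic SWAR popcount
            // (the same algorithm as the BitOperations.PopCount software fallback). For 32-bit lanes:
            //   x -= (x >> 1) & 0x55555555;                      // fold bit pairs into 2-bit sums
            //   x = (x & 0x33333333) + ((x >> 2) & 0x33333333);  // fold into 4-bit sums
            //   x = (x + (x >> 4)) & 0x0F0F0F0F;                 // fold into per-byte sums
            //   x = (x * 0x01010101) >> 24;                      // horizontally add the four byte sums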
 
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            public static Vector128<T> Invoke(Vector128<T> x)
            {
                if (AdvSimd.IsSupported)
                {
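                    // NEON has a native per-byte popcount; each AddPairwiseWidening sums adjacent
                    // lanes and doubles the lane width, so applying it zero to three times yields
                    // byte, ushort, uint, or ulong counts respectively.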
                    Vector128<byte> cnt = AdvSimd.PopCount(x.AsByte());
 
                    if (sizeof(T) == 1)
                    {
                        return cnt.As<byte, T>();
                    }
 
                    if (sizeof(T) == 2)
                    {
                        return AdvSimd.AddPairwiseWidening(cnt).As<ushort, T>();
                    }
 
                    if (sizeof(T) == 4)
                    {
                        return AdvSimd.AddPairwiseWidening(AdvSimd.AddPairwiseWidening(cnt)).As<uint, T>();
                    }
 
                    if (sizeof(T) == 8)
                    {
                        return AdvSimd.AddPairwiseWidening(AdvSimd.AddPairwiseWidening(AdvSimd.AddPairwiseWidening(cnt))).As<ulong, T>();
                    }
                }
 
                if (sizeof(T) == 1)
                {
                    if (PackedSimd.IsSupported)
                    {
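                        // WASM packed SIMD has a native per-byte popcount (i8x16.popcnt).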
                        return PackedSimd.PopCount(x.AsByte()).As<byte, T>();
                    }
 
                    Vector128<byte> c1 = Vector128.Create((byte)0x55);
                    Vector128<byte> c2 = Vector128.Create((byte)0x33);
                    Vector128<byte> c3 = Vector128.Create((byte)0x0F);
 
                    // We don't have a per-element shuffle for bytes on some platforms. However,
                    // a 16-bit shift is always available, and due to how the algorithm works,
                    // we don't need to worry about bits shifting into the lower 8 bits from
                    // the upper 8 bits.
                    Vector128<byte> tmp = x.AsByte();
                    tmp -= (x.AsUInt16() >> 1).AsByte() & c1;
                    tmp = (tmp & c2) + ((tmp.AsUInt16() >> 2).AsByte() & c2);
                    return ((tmp + (tmp.AsUInt16() >> 4).AsByte()) & c3).As<byte, T>();
                }
 
                if (sizeof(T) == 2)
                {
                    Vector128<ushort> c1 = Vector128.Create((ushort)0x5555);
                    Vector128<ushort> c2 = Vector128.Create((ushort)0x3333);
                    Vector128<ushort> c3 = Vector128.Create((ushort)0x0F0F);
                    Vector128<ushort> c4 = Vector128.Create((ushort)0x0101);
 
                    Vector128<ushort> tmp = x.AsUInt16();
                    tmp -= (tmp >> 1) & c1;
                    tmp = (tmp & c2) + ((tmp >> 2) & c2);
                    tmp = (((tmp + (tmp >> 4)) & c3) * c4) >> 8;
                    return tmp.As<ushort, T>();
                }
 
                Debug.Assert(sizeof(T) == 4);
                {
                    Vector128<uint> c1 = Vector128.Create(0x55555555u);
                    Vector128<uint> c2 = Vector128.Create(0x33333333u);
                    Vector128<uint> c3 = Vector128.Create(0x0F0F0F0Fu);
                    Vector128<uint> c4 = Vector128.Create(0x01010101u);
 
                    Vector128<uint> tmp = x.AsUInt32();
                    tmp -= (tmp >> 1) & c1;
                    tmp = (tmp & c2) + ((tmp >> 2) & c2);
                    tmp = (((tmp + (tmp >> 4)) & c3) * c4) >> 24;
                    return tmp.As<uint, T>();
                }
            }
 
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            public static Vector256<T> Invoke(Vector256<T> x)
            {
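                // Same SWAR algorithm as the Vector128 path. AdvSimd and PackedSimd operate on
                // 128-bit vectors only, so there is no intrinsic fast path at this width.
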
                if (sizeof(T) == 1)
                {
                    Vector256<byte> c1 = Vector256.Create((byte)0x55);
                    Vector256<byte> c2 = Vector256.Create((byte)0x33);
                    Vector256<byte> c3 = Vector256.Create((byte)0x0F);
 
                    // We don't have a per-element shuffle for bytes on some platforms. However,
                    // a 16-bit shift is always available, and due to how the algorithm works,
                    // we don't need to worry about bits shifting into the lower 8 bits from
                    // the upper 8 bits.
                    Vector256<byte> tmp = x.AsByte();
                    tmp -= (x.AsUInt16() >> 1).AsByte() & c1;
                    tmp = (tmp & c2) + ((tmp.AsUInt16() >> 2).AsByte() & c2);
                    return ((tmp + (tmp.AsUInt16() >> 4).AsByte()) & c3).As<byte, T>();
                }
 
                if (sizeof(T) == 2)
                {
                    Vector256<ushort> c1 = Vector256.Create((ushort)0x5555);
                    Vector256<ushort> c2 = Vector256.Create((ushort)0x3333);
                    Vector256<ushort> c3 = Vector256.Create((ushort)0x0F0F);
                    Vector256<ushort> c4 = Vector256.Create((ushort)0x0101);
 
                    Vector256<ushort> tmp = x.AsUInt16();
                    tmp -= (tmp >> 1) & c1;
                    tmp = (tmp & c2) + ((tmp >> 2) & c2);
                    tmp = (((tmp + (tmp >> 4)) & c3) * c4) >> 8;
                    return tmp.As<ushort, T>();
                }
 
                Debug.Assert(sizeof(T) == 4);
                {
                    Vector256<uint> c1 = Vector256.Create(0x55555555u);
                    Vector256<uint> c2 = Vector256.Create(0x33333333u);
                    Vector256<uint> c3 = Vector256.Create(0x0F0F0F0Fu);
                    Vector256<uint> c4 = Vector256.Create(0x01010101u);
 
                    Vector256<uint> tmp = x.AsUInt32();
                    tmp -= (tmp >> 1) & c1;
                    tmp = (tmp & c2) + ((tmp >> 2) & c2);
                    tmp = (((tmp + (tmp >> 4)) & c3) * c4) >> 24;
                    return tmp.As<uint, T>();
                }
            }
 
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            public static Vector512<T> Invoke(Vector512<T> x)
            {
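                // Same SWAR algorithm as the Vector128 path, widened to 512-bit vectors.
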
                if (sizeof(T) == 1)
                {
                    Vector512<byte> c1 = Vector512.Create((byte)0x55);
                    Vector512<byte> c2 = Vector512.Create((byte)0x33);
                    Vector512<byte> c3 = Vector512.Create((byte)0x0F);
 
                    // We don't have a per-element shuffle for bytes on some platforms. However,
                    // a 16-bit shift is always available, and due to how the algorithm works,
                    // we don't need to worry about bits shifting into the lower 8 bits from
                    // the upper 8 bits.
                    Vector512<byte> tmp = x.AsByte();
                    tmp -= (x.AsUInt16() >> 1).AsByte() & c1;
                    tmp = (tmp & c2) + ((tmp.AsUInt16() >> 2).AsByte() & c2);
                    return ((tmp + (tmp.AsUInt16() >> 4).AsByte()) & c3).As<byte, T>();
                }
 
                if (sizeof(T) == 2)
                {
                    Vector512<ushort> c1 = Vector512.Create((ushort)0x5555);
                    Vector512<ushort> c2 = Vector512.Create((ushort)0x3333);
                    Vector512<ushort> c3 = Vector512.Create((ushort)0x0F0F);
                    Vector512<ushort> c4 = Vector512.Create((ushort)0x0101);
 
                    Vector512<ushort> tmp = x.AsUInt16();
                    tmp -= (tmp >> 1) & c1;
                    tmp = (tmp & c2) + ((tmp >> 2) & c2);
                    tmp = (((tmp + (tmp >> 4)) & c3) * c4) >> 8;
                    return tmp.As<ushort, T>();
                }
 
                Debug.Assert(sizeof(T) == 4);
                {
                    Vector512<uint> c1 = Vector512.Create(0x55555555u);
                    Vector512<uint> c2 = Vector512.Create(0x33333333u);
                    Vector512<uint> c3 = Vector512.Create(0x0F0F0F0Fu);
                    Vector512<uint> c4 = Vector512.Create(0x01010101u);
 
                    Vector512<uint> tmp = x.AsUInt32();
                    tmp -= (tmp >> 1) & c1;
                    tmp = (tmp & c2) + ((tmp >> 2) & c2);
                    tmp = (((tmp + (tmp >> 4)) & c3) * c4) >> 24;
                    return tmp.As<uint, T>();
                }
            }
        }
    }
}