File: Deprecated\Vector\VectorUtils.cs
Web Access
Project: src\src\Microsoft.ML.Data\Microsoft.ML.Data.csproj (Microsoft.ML.Data)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections.Generic;
using Microsoft.ML.Data;
using Microsoft.ML.Internal.CpuMath;
using Microsoft.ML.Internal.Utilities;
using Microsoft.ML.Runtime;
 
namespace Microsoft.ML.Numeric
{
    /// <summary>
    /// A series of vector utility functions, generally operating over arrays or <see cref="VBuffer{T}"/>
    /// structures. The convention is that if a array or buffer is not modified, that is, it is treated
    /// as a constant, it might have the name <c>a</c> or <c>b</c> or <c>src</c>, but in a situation
    /// where the vector structure might be changed the parameter might have the name <c>dst</c>.
    /// </summary>
    [BestFriend]
    internal static partial class VectorUtils
    {
        public static float DotProduct(float[] a, float[] b)
        {
            Contracts.Check(Utils.Size(a) == Utils.Size(b), "Arrays must have the same length");
            Contracts.Check(Utils.Size(a) > 0);
            return CpuMathUtils.DotProductDense(a, b, a.Length);
        }
 
        public static float DotProduct(float[] a, in VBuffer<float> b)
        {
            Contracts.Check(Utils.Size(a) == b.Length, "Vectors must have the same dimensionality.");
            var bValues = b.GetValues();
            if (bValues.Length == 0)
                return 0;
            if (b.IsDense)
                return CpuMathUtils.DotProductDense(a, bValues, b.Length);
            return CpuMathUtils.DotProductSparse(a, bValues, b.GetIndices(), bValues.Length);
        }
 
        public static float DotProduct(in VBuffer<float> a, in VBuffer<float> b)
        {
            Contracts.Check(a.Length == b.Length, "Vectors must have the same dimensionality.");
 
            var aValues = a.GetValues();
            var bValues = b.GetValues();
            if (aValues.Length == 0 || bValues.Length == 0)
                return 0;
 
            if (a.IsDense)
            {
                if (b.IsDense)
                    return CpuMathUtils.DotProductDense(aValues, bValues, a.Length);
                return CpuMathUtils.DotProductSparse(aValues, bValues, b.GetIndices(), bValues.Length);
            }
 
            if (b.IsDense)
                return CpuMathUtils.DotProductSparse(bValues, aValues, a.GetIndices(), aValues.Length);
            return DotProductSparse(aValues, a.GetIndices(), 0, aValues.Length, bValues, b.GetIndices(), 0, bValues.Length);
        }
 
        /// <summary>
        ///  Sparsify vector A (keep at most <paramref name="top"/>+<paramref name="bottom"/> values)
        /// and optionally rescale values to the [-1, 1] range.
        /// <param name="a">Vector to be sparsified and normalized.</param>
        /// <param name="top">How many top (positive) elements to preserve after sparsification.</param>
        /// <param name="bottom">How many bottom (negative) elements to preserve after sparsification.</param>
        /// <param name="normalize">Whether to normalize results to [-1,1] range.</param>
        /// </summary>
        public static void SparsifyNormalize(ref VBuffer<float> a, int top, int bottom, bool normalize)
        {
            Contracts.CheckParam(top >= 0, nameof(top), "Top count needs to be non-negative");
            Contracts.CheckParam(bottom >= 0, nameof(bottom), "Bottom count needs to be non-negative");
            float absMax = 0;
 
            // In the top heap, we pop the smallest values, so that the 'top' largest remain.
            var topHeap = new Heap<KeyValuePair<int, float>>((left, right) => right.Value < left.Value, top + 1);
            var bottomHeap = new Heap<KeyValuePair<int, float>>((left, right) => right.Value > left.Value, bottom + 1);
            bool isDense = a.IsDense;
 
            var aValues = a.GetValues();
            var aIndices = a.GetIndices();
            for (int i = 0; i < aValues.Length; i++)
            {
                int idx = isDense ? i : aIndices[i];
                var value = aValues[i];
 
                if (value < 0 && bottom > 0)
                {
                    if (bottomHeap.Count == bottom && value > bottomHeap.Top.Value)
                        continue;
 
                    bottomHeap.Add(new KeyValuePair<int, float>(idx, value));
                    if (bottomHeap.Count > bottom)
                    {
                        bottomHeap.Pop();
                        Contracts.Assert(bottomHeap.Count == bottom);
                    }
                }
 
                if (value > 0 && top > 0)
                {
                    if (topHeap.Count == top && value < topHeap.Top.Value)
                        continue;
 
                    topHeap.Add(new KeyValuePair<int, float>(idx, value));
                    if (topHeap.Count > top)
                    {
                        topHeap.Pop();
                        Contracts.Assert(topHeap.Count == top);
                    }
                }
            }
 
            var newCount = topHeap.Count + bottomHeap.Count;
            var aEditor = VBufferEditor.Create(ref a, a.Length, newCount, requireIndicesOnDense: true);
            int count = 0;
            while (topHeap.Count > 0)
            {
                var pair = topHeap.Pop();
                aEditor.Indices[count] = pair.Key;
                aEditor.Values[count++] = pair.Value;
            }
 
            while (bottomHeap.Count > 0)
            {
                var pair = bottomHeap.Pop();
                aEditor.Indices[count] = pair.Key;
                aEditor.Values[count++] = pair.Value;
            }
 
            Contracts.Assert(count == newCount);
 
            if (normalize)
            {
                for (var i = 0; i < newCount; i++)
                {
                    var value = aEditor.Values[i];
                    var absValue = Math.Abs(value);
                    if (absValue > absMax)
                        absMax = absValue;
                }
 
                if (absMax != 0)
                {
                    var ratio = 1 / absMax;
                    for (var i = 0; i < newCount; i++)
                        aEditor.Values[i] = ratio * aEditor.Values[i];
                }
            }
 
            if (!aEditor.Indices.IsEmpty)
                GenericSpanSortHelper<int>.Sort(aEditor.Indices, aEditor.Values, 0, newCount);
            a = aEditor.Commit();
        }
 
        /// <summary>
        /// Multiplies arrays Dst *= A element by element and returns the result in <paramref name="dst"/> (Hadamard product).
        /// </summary>
        public static void MulElementWise(in VBuffer<float> a, ref VBuffer<float> dst)
        {
            Contracts.Check(a.Length == dst.Length, "Vectors must have the same dimensionality.");
 
            if (a.IsDense && dst.IsDense)
            {
                var editor = VBufferEditor.CreateFromBuffer(ref dst);
                CpuMathUtils.MulElementWise(a.GetValues(), dst.GetValues(), editor.Values, a.Length);
            }
            else
                VBufferUtils.ApplyWithEitherDefined(in a, ref dst, (int ind, float v1, ref float v2) => { v2 *= v1; });
        }
 
        private static float L2DistSquaredSparse(ReadOnlySpan<float> valuesA, ReadOnlySpan<int> indicesA, ReadOnlySpan<float> valuesB, ReadOnlySpan<int> indicesB)
        {
            Contracts.Assert(valuesA.Length == indicesA.Length);
            Contracts.Assert(valuesB.Length == indicesB.Length);
 
            float res = 0;
 
            int ia = 0;
            int ib = 0;
            while (ia < indicesA.Length && ib < indicesB.Length)
            {
                int diff = indicesA[ia] - indicesB[ib];
                float d;
                if (diff == 0)
                {
                    d = valuesA[ia] - valuesB[ib];
                    ia++;
                    ib++;
                }
                else if (diff < 0)
                {
                    d = valuesA[ia];
                    ia++;
                }
                else
                {
                    d = valuesB[ib];
                    ib++;
                }
                res += d * d;
            }
 
            while (ia < indicesA.Length)
            {
                var d = valuesA[ia];
                res += d * d;
                ia++;
            }
 
            while (ib < indicesB.Length)
            {
                var d = valuesB[ib];
                res += d * d;
                ib++;
            }
 
            return res;
        }
 
        private static float L2DistSquaredHalfSparse(ReadOnlySpan<float> valuesA, ReadOnlySpan<float> valuesB, ReadOnlySpan<int> indicesB)
        {
            var normA = CpuMathUtils.SumSq(valuesA);
            if (valuesB.Length == 0)
                return normA;
            var normB = CpuMathUtils.SumSq(valuesB);
            var dotP = CpuMathUtils.DotProductSparse(valuesA, valuesB, indicesB, valuesB.Length);
            var res = normA + normB - 2 * dotP;
            return res < 0 ? 0 : res;
        }
 
        private static float L2DiffSquaredDense(ReadOnlySpan<float> valuesA, ReadOnlySpan<float> valuesB, int length)
        {
            Contracts.Assert(0 <= length && length <= valuesA.Length);
            Contracts.Assert(0 <= length && length <= valuesB.Length);
 
            if (length == 0)
                return 0;
            return CpuMathUtils.L2DistSquared(valuesA, valuesB, length);
        }
 
        /// <summary>
        /// Computes the dot product of two arrays
        /// Where "offset" is considered to be a's zero index
        /// </summary>
        /// <param name="a">one array</param>
        /// <param name="b">the second array (given as a VBuffer)</param>
        /// <param name="offset">offset in 'a'</param>
        /// <returns>the dot product</returns>
        public static float DotProductWithOffset(in VBuffer<float> a, int offset, in VBuffer<float> b)
        {
            Contracts.Check(0 <= offset && offset <= a.Length);
            Contracts.Check(b.Length <= a.Length - offset, "VBuffer b must be no longer than a.Length - offset.");
 
            var aValues = a.GetValues();
            var bValues = b.GetValues();
            if (aValues.Length == 0 || bValues.Length == 0)
                return 0;
            if (a.IsDense)
            {
                if (b.IsDense)
                    return CpuMathUtils.DotProductDense(aValues.Slice(offset), bValues, b.Length);
                return CpuMathUtils.DotProductSparse(aValues.Slice(offset), bValues, b.GetIndices(), bValues.Length);
            }
            else
            {
                float result = 0;
                var aIndices = a.GetIndices();
                int aMin = Utils.FindIndexSorted(aIndices, 0, aIndices.Length, offset);
                int aLim = Utils.FindIndexSorted(aIndices, 0, aIndices.Length, offset + b.Length);
                if (b.IsDense)
                {
                    for (int iA = aMin; iA < aLim; ++iA)
                        result += aValues[iA] * bValues[aIndices[iA] - offset];
                    return result;
                }
                var bIndices = b.GetIndices();
                for (int iA = aMin, iB = 0; iA < aLim && iB < bIndices.Length;)
                {
                    int aIndex = aIndices[iA];
                    int bIndex = bIndices[iB];
                    int comp = (aIndex - offset) - bIndex;
                    if (comp == 0)
                        result += aValues[iA++] * bValues[iB++];
                    else if (comp < 0)
                        iA++;
                    else
                        iB++;
                }
                return result;
            }
        }
 
        /// <summary>
        /// Computes the dot product of two arrays
        /// Where "offset" is considered to be a's zero index
        /// </summary>
        /// <param name="a">one array</param>
        /// <param name="b">the second array (given as a VBuffer)</param>
        /// <param name="offset">offset in 'a'</param>
        /// <returns>the dot product</returns>
        public static float DotProductWithOffset(float[] a, int offset, in VBuffer<float> b)
        {
            Contracts.Check(0 <= offset && offset <= a.Length);
            Contracts.Check(b.Length <= a.Length - offset, "VBuffer b must be no longer than a.Length - offset.");
 
            var bValues = b.GetValues();
            if (bValues.Length == 0)
                return 0;
 
            if (b.IsDense)
                return CpuMathUtils.DotProductDense(a.AsSpan(offset), bValues, b.Length);
            return CpuMathUtils.DotProductSparse(a.AsSpan(offset), bValues, b.GetIndices(), bValues.Length);
        }
 
        private static float DotProductSparse(ReadOnlySpan<float> aValues, ReadOnlySpan<int> aIndices, int ia, int iaLim, ReadOnlySpan<float> bValues, ReadOnlySpan<int> bIndices, int ib, int ibLim)
        {
            Contracts.AssertNonEmpty(aValues);
            Contracts.AssertNonEmpty(aIndices);
            Contracts.AssertNonEmpty(bValues);
            Contracts.AssertNonEmpty(bIndices);
            Contracts.Assert(0 <= ia && ia < iaLim && iaLim <= aIndices.Length);
            Contracts.Assert(0 <= ib && ib < ibLim && ibLim <= bIndices.Length);
 
            float res = 0;
 
            // Do binary searches when the indices mismatch by more than this.
            const int thresh = 20;
 
            for (; ; )
            {
                int d = aIndices[ia] - bIndices[ib];
                if (d == 0)
                {
                    res += aValues[ia] * bValues[ib];
                    if (++ia >= iaLim)
                        break;
                    if (++ib >= ibLim)
                        break;
                }
                else if (d < 0)
                {
                    ia++;
                    if (d < -thresh)
                        ia = Utils.FindIndexSorted(aIndices, ia, iaLim, bIndices[ib]);
                    if (ia >= iaLim)
                        break;
                }
                else
                {
                    ib++;
                    if (d > thresh)
                        ib = Utils.FindIndexSorted(bIndices, ib, ibLim, aIndices[ia]);
                    if (ib >= ibLim)
                        break;
                }
            }
 
            return res;
        }
 
        /// <summary>
        /// Computes the L1 distance between two VBuffers
        /// </summary>
        /// <param name="a">one VBuffer</param>
        /// <param name="b">another VBuffer</param>
        /// <returns>L1 Distance from a to b</returns>
        public static float L1Distance(in VBuffer<float> a, in VBuffer<float> b)
        {
            float res = 0;
            VBufferUtils.ForEachEitherDefined(in a, in b,
                (slot, val1, val2) => res += Math.Abs(val1 - val2));
            return res;
        }
 
        /// <summary>
        /// Computes the Euclidean distance between two VBuffers
        /// </summary>
        /// <param name="a">one VBuffer</param>
        /// <param name="b">another VBuffer</param>
        /// <returns>Distance from a to b</returns>
        public static float Distance(in VBuffer<float> a, in VBuffer<float> b)
        {
            return MathUtils.Sqrt(L2DistSquared(in a, in b));
        }
 
        /// <summary>
        /// Computes the Euclidean distance squared between two VBuffers
        /// </summary>
        /// <param name="a">one VBuffer</param>
        /// <param name="b">another VBuffer</param>
        /// <returns>Distance from a to b</returns>
        public static float L2DistSquared(in VBuffer<float> a, in VBuffer<float> b)
        {
            Contracts.Check(a.Length == b.Length, "Vectors must have the same dimensionality.");
            if (a.IsDense)
            {
                if (b.IsDense)
                    return L2DiffSquaredDense(a.GetValues(), b.GetValues(), b.Length);
                return L2DistSquaredHalfSparse(a.GetValues(), b.GetValues(), b.GetIndices());
            }
            if (b.IsDense)
                return L2DistSquaredHalfSparse(b.GetValues(), a.GetValues(), a.GetIndices());
            return L2DistSquaredSparse(a.GetValues(), a.GetIndices(), b.GetValues(), b.GetIndices());
        }
 
        /// <summary>
        /// Given two vectors a and b, calculate their L2 distance squared (|a-b|^2).
        /// </summary>
        /// <param name="a">The first vector, given as an array</param>
        /// <param name="b">The second vector, given as a VBuffer{float}</param>
        /// <returns>The squared L2 distance between a and b</returns>
        public static float L2DistSquared(float[] a, in VBuffer<float> b)
        {
            Contracts.CheckValue(a, nameof(a));
            Contracts.Check(Utils.Size(a) == b.Length, "Vectors must have the same dimensionality.");
            if (b.IsDense)
                return L2DiffSquaredDense(a, b.GetValues(), b.Length);
            return L2DistSquaredHalfSparse(a.AsSpan(0, a.Length), b.GetValues(), b.GetIndices());
        }
 
        /// <summary>
        /// Perform in-place vector addition <c><paramref name="dst"/> += <paramref name="src"/></c>.
        /// </summary>
        public static void Add(float[] src, float[] dst)
        {
            Contracts.CheckValue(src, nameof(src));
            Contracts.CheckValue(dst, nameof(dst));
            Contracts.CheckParam(src.Length == dst.Length, nameof(dst), "Arrays must have the same dimensionality.");
            if (src.Length == 0)
                return;
            CpuMathUtils.Add(src, dst, src.Length);
        }
 
        /// <summary>
        /// Adds a multiple of a <see cref="VBuffer{T}"/> to a <see cref="float"/> array.
        /// </summary>
        /// <param name="src">Buffer to add</param>
        /// <param name="dst">Span to add to</param>
        /// <param name="c">Coefficient</param>
        public static void AddMult(in VBuffer<float> src, Span<float> dst, float c)
        {
            Contracts.CheckParam(src.Length == dst.Length, nameof(dst), "Arrays must have the same dimensionality.");
 
            var srcValues = src.GetValues();
            if (srcValues.Length == 0 || c == 0)
                return;
 
            if (src.IsDense)
                CpuMathUtils.AddScale(c, srcValues, dst, srcValues.Length);
            else
            {
                var srcIndices = src.GetIndices();
                for (int i = 0; i < srcValues.Length; i++)
                    dst[srcIndices[i]] += c * srcValues[i];
            }
        }
 
        /// <summary>
        /// Adds a multiple of a <see cref="VBuffer{T}"/> to a <see cref="float"/> array, with an offset into the destination.
        /// </summary>
        /// <param name="src">Buffer to add</param>
        /// <param name="dst">Array to add to</param>
        /// <param name="offset">The offset into <paramref name="dst"/> at which to add</param>
        /// <param name="c">Coefficient</param>
 
        public static void AddMultWithOffset(in VBuffer<float> src, float[] dst, int offset, float c)
        {
            Contracts.CheckValue(dst, nameof(dst));
            Contracts.Check(0 <= offset && offset <= dst.Length);
            Contracts.Check(src.Length <= dst.Length - offset, "Vector src must be no longer than dst.Length - offset.");
 
            var srcValues = src.GetValues();
            if (srcValues.Length == 0 || c == 0)
                return;
 
            if (src.IsDense)
            {
                for (int i = 0; i < src.Length; i++)
                    dst[i + offset] += c * srcValues[i];
            }
            else
            {
                var srcIndices = src.GetIndices();
                for (int i = 0; i < srcValues.Length; i++)
                    dst[srcIndices[i] + offset] += c * srcValues[i];
            }
        }
 
        /// <summary>
        /// Adds a multiple of an array to a second array.
        /// </summary>
        /// <param name="src">Array to add</param>
        /// <param name="dst">Array to add to</param>
        /// <param name="c">Multiple</param>
        public static void AddMult(float[] src, float[] dst, float c)
        {
            Contracts.Check(src.Length == dst.Length, "Arrays must have the same dimensionality.");
 
            if (c == 0)
                return;
 
            CpuMathUtils.AddScale(c, src, dst, src.Length);
        }
 
        /// <summary>
        /// Returns the L2 norm of the vector (sum of squares of the components).
        /// </summary>
        public static float Norm(float[] a)
        {
            return MathUtils.Sqrt(CpuMathUtils.SumSq(a));
        }
 
        /// <summary>
        /// Returns sum of elements in array
        /// </summary>
        public static float Sum(float[] a)
        {
            if (a == null || a.Length == 0)
                return 0;
            return CpuMathUtils.Sum(a);
        }
 
        /// <summary>
        /// Multiples the array by a real value
        /// </summary>
        /// <param name="dst">The array</param>
        /// <param name="c">Value to multiply vector with</param>
        public static void ScaleBy(float[] dst, float c)
        {
            if (c == 1)
                return;
 
            if (c != 0)
                CpuMathUtils.Scale(c, dst);
            else
                Array.Clear(dst, 0, dst.Length);
        }
 
        public static float Distance(float[] a, float[] b)
        {
            Contracts.Check(a.Length == b.Length, "Arrays must have the same dimensionality.");
 
            float res = 0;
            for (int i = 0; i < a.Length; i++)
            {
                var diff = a[i] - b[i];
                res += diff * diff;
            }
 
            return res;
        }
    }
}