// File: HashBench.cs
// Web Access
// Project: src\test\Microsoft.ML.PerformanceTests\Microsoft.ML.PerformanceTests.csproj (Microsoft.ML.PerformanceTests)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Linq;
using BenchmarkDotNet.Attributes;
using Microsoft.ML.Data;
using Microsoft.ML.PerformanceTests.Harness;
using Microsoft.ML.Runtime;
using Microsoft.ML.Transforms;
 
namespace Microsoft.ML.PerformanceTests
{
    [CIBenchmark]
    public class HashBench : BenchmarkBase
    {
        /// <summary>
        /// A minimal <see cref="DataViewRow"/> exposing exactly one column, named "Foo", at index 0.
        /// The column's values come from a caller-supplied getter delegate, and the row position is
        /// a plain mutable field so that callers can advance it directly.
        /// </summary>
        private sealed class RowImpl : DataViewRow
        {
            /// <summary>
            /// Backing store for <see cref="Position"/>; callers increment it directly.
            /// </summary>
            public long PositionValue;

            public override DataViewSchema Schema { get; }
            public override long Position => PositionValue;
            public override long Batch => 0;

            /// <summary>
            /// Returns a getter that builds the row ID from the current <see cref="Position"/>.
            /// </summary>
            public override ValueGetter<DataViewRowId> GetIdGetter()
                => (ref DataViewRowId val) => val = new DataViewRowId((ulong)Position, 0);

            // Stored untyped because the class itself is not generic; GetGetter recovers the
            // concrete ValueGetter<TValue> with a type test.
            private readonly Delegate _getter;

            /// <summary>
            /// Returns whether the given column is active in this row. The only column this row
            /// has (index 0) is always active.
            /// </summary>
            /// <param name="column">The column to query; its index must be 0.</param>
            /// <exception cref="ArgumentOutOfRangeException">
            /// <paramref name="column"/> has an index other than 0.
            /// </exception>
            public override bool IsColumnActive(DataViewSchema.Column column)
            {
                if (column.Index != 0)
                    throw new ArgumentOutOfRangeException(nameof(column), column.Index, "RowImpl exposes a single column at index 0.");
                return true;
            }

            /// <summary>
            /// Returns the value getter delegate for the given column of this row.
            /// This throws if the column is not the single column of this row, or if the type
            /// <typeparamref name="TValue"/> differs from the type of the stored getter.
            /// </summary>
            /// <typeparam name="TValue">The column's content type.</typeparam>
            /// <param name="column">The column whose getter should be returned; its index must be 0.</param>
            /// <exception cref="ArgumentOutOfRangeException">
            /// <paramref name="column"/> has an index other than 0.
            /// </exception>
            /// <exception cref="InvalidOperationException">
            /// The stored getter is not a <see cref="ValueGetter{TValue}"/> of <typeparamref name="TValue"/>.
            /// </exception>
            public override ValueGetter<TValue> GetGetter<TValue>(DataViewSchema.Column column)
            {
                if (column.Index != 0)
                    throw new ArgumentOutOfRangeException(nameof(column), column.Index, "RowImpl exposes a single column at index 0.");
                if (_getter is ValueGetter<TValue> typedGetter)
                    return typedGetter;
                throw new InvalidOperationException(
                    $"The column getter has type '{_getter?.GetType()}', which cannot produce values of type '{typeof(TValue)}'.");
            }

            /// <summary>
            /// Creates a row whose single column has the given type and is served by the given getter.
            /// </summary>
            /// <typeparam name="T">The raw value type produced by <paramref name="getter"/>.</typeparam>
            /// <param name="type">The column type; its raw type must be exactly <typeparamref name="T"/>.</param>
            /// <param name="getter">The delegate that supplies the column value.</param>
            /// <exception cref="ArgumentException">
            /// The raw type of <paramref name="type"/> is not <typeparamref name="T"/>.
            /// </exception>
            public static RowImpl Create<T>(DataViewType type, ValueGetter<T> getter)
            {
                if (type.RawType != typeof(T))
                    throw new ArgumentException(
                        $"The raw type '{type.RawType}' of the column type does not match the getter value type '{typeof(T)}'.", nameof(type));
                return new RowImpl(type, getter);
            }

            // Private: instances are created through Create<T>, which checks that the getter
            // matches the column type before the delegate is stored untyped.
            private RowImpl(DataViewType type, Delegate getter)
            {
                var builder = new DataViewSchema.Builder();
                builder.AddColumn("Foo", type, null);
                Schema = builder.ToSchema();
                _getter = getter;
            }
        }
 
        // Number of getter invocations performed by each call of RunScalar / RunVector.
        private const int Count = 100_000;

        // ML.NET context shared by all benchmarks in this class, constructed with 1 as its seed argument.
        private readonly MLContext _env = new MLContext(1);

        // Input row handed to the row-to-row mapper; the run loops advance its PositionValue.
        private RowImpl _inRow;
        // Getter for the hashed output column; assigned by the Init* helpers when the input type is not a vector type.
        private ValueGetter<uint> _getter;
        // Getter for the hashed output column; assigned by the Init* helpers when the input type is a VectorDataViewType.
        private ValueGetter<VBuffer<uint>> _vecGetter;
 
        /// <summary>
        /// Prepares a benchmark that hashes input column "Foo" into output column "Bar" using a
        /// <see cref="HashingTransformer"/> constructed in-process.
        /// </summary>
        /// <typeparam name="T">The raw type of the input column.</typeparam>
        /// <param name="val">The constant input value, used when <paramref name="getter"/> is null.</param>
        /// <param name="type">The data view type of the input column.</param>
        /// <param name="numberOfBits">The number of bits passed to the hashing column options.</param>
        /// <param name="getter">Optional input getter; when null, a getter that always yields <paramref name="val"/> is used.</param>
        private void InitMapMurmurHashV2<T>(T val, DataViewType type, int numberOfBits = 20, ValueGetter<T> getter = null)
        {
            if (getter == null)
                getter = (ref T dst) => dst = val;
            _inRow = RowImpl.Create(type, getter);
            // With the default of 20 bits this is 2^20, about one million features, which is a nice, typical number.
            var info = new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: numberOfBits);
            var xf = new HashingTransformer(_env, new[] { info });
            BindOutputGetter(xf, type);
        }

        /// <summary>
        /// Prepares a benchmark that hashes input column "Foo" into output column "Bar" using the
        /// transformer loaded from the saved model "backcompat/MurmurHashV1.zip".
        /// </summary>
        /// <typeparam name="T">The raw type of the input column.</typeparam>
        /// <param name="val">The constant input value, used when <paramref name="getter"/> is null.</param>
        /// <param name="type">The data view type of the input column.</param>
        /// <param name="getter">Optional input getter; when null, a getter that always yields <paramref name="val"/> is used.</param>
        private void InitMapMurmurHashV1<T>(T val, DataViewType type, ValueGetter<T> getter = null)
        {
            if (getter == null)
                getter = (ref T dst) => dst = val;
            _inRow = RowImpl.Create(type, getter);

            var modelPath = GetBenchmarkDataPathAndEnsureData("backcompat/MurmurHashV1.zip");
            // The input schema returned alongside the model is not used here, so it is discarded.
            var transformer = _env.Model.Load(modelPath, out _);
            BindOutputGetter(transformer, type);
        }

        /// <summary>
        /// Maps <see cref="_inRow"/> through <paramref name="transformer"/> and stores the getter of
        /// output column "Bar": in <see cref="_vecGetter"/> when <paramref name="type"/> is a vector
        /// type, otherwise in <see cref="_getter"/>.
        /// </summary>
        /// <param name="transformer">The transformer whose row-to-row mapper produces column "Bar".</param>
        /// <param name="type">The data view type of the input column.</param>
        private void BindOutputGetter(ITransformer transformer, DataViewType type)
        {
            var mapper = transformer.GetRowToRowMapper(_inRow.Schema);
            var column = mapper.OutputSchema["Bar"];
            var outRow = mapper.GetRow(_inRow, column);
            if (type is VectorDataViewType)
                _vecGetter = outRow.GetGetter<VBuffer<uint>>(column);
            else
                _getter = outRow.GetGetter<uint>(column);
        }
 
        /// <summary>
        /// All the scalar mappers have the same output type.
        /// </summary>
        // Shared measurement loop for every scalar benchmark: invokes the scalar output
        // getter Count times, advancing the input row's position after each call.
        private void RunScalar()
        {
            uint hashed = default;
            for (int iteration = 0; iteration < Count; iteration++)
            {
                _getter(ref hashed);
                _inRow.PositionValue++;
            }
        }
 
        /// <summary>
        /// Wraps <paramref name="vals"/> in a <see cref="VBuffer{T}"/> and sets up the in-process
        /// hashing benchmark over a vector column of the same length, using the buffer's
        /// CopyTo method as the input getter.
        /// </summary>
        /// <typeparam name="T">The raw item type of the vector.</typeparam>
        /// <param name="vals">The values of the input vector.</param>
        /// <param name="itemType">The data view type of each vector item.</param>
        /// <param name="numberOfBits">The number of bits forwarded to the hashing setup.</param>
        private void InitDenseVecMap<T>(T[] vals, PrimitiveDataViewType itemType, int numberOfBits = 20)
        {
            var dense = new VBuffer<T>(vals.Length, vals);
            var vectorType = new VectorDataViewType(itemType, vals.Length);
            InitMapMurmurHashV2(dense, vectorType, numberOfBits, dense.CopyTo);
        }
 
        /// <summary>
        /// All the vector mappers have the same output type.
        /// </summary>
        // Shared measurement loop for every vector benchmark: invokes the vector output
        // getter Count times, advancing the input row's position after each call.
        private void RunVector()
        {
            VBuffer<uint> hashed = default;
            for (int iteration = 0; iteration < Count; iteration++)
            {
                _vecGetter(ref hashed);
                _inRow.PositionValue++;
            }
        }
 
        // Setup for HashScalarString: a text column whose value is always "Hello".
        [GlobalSetup(Target = nameof(HashScalarString))]
        public void SetupHashScalarString()
            => InitMapMurmurHashV2("Hello".AsMemory(), TextDataViewType.Instance);
 
        // Runs the scalar loop over the getter prepared by SetupHashScalarString.
        [Benchmark]
        public void HashScalarString() => RunScalar();
 
        // Setup for HashScalarFloat: a Single column whose value is always 5.0f.
        [GlobalSetup(Target = nameof(HashScalarFloat))]
        public void SetupHashScalarFloat()
            => InitMapMurmurHashV2(5.0f, NumberDataViewType.Single);
 
        // Runs the scalar loop over the getter prepared by SetupHashScalarFloat.
        [Benchmark]
        public void HashScalarFloat() => RunScalar();
 
        // Setup for HashScalarDouble: a Double column whose value is always 5.0.
        [GlobalSetup(Target = nameof(HashScalarDouble))]
        public void SetupHashScalarDouble()
            => InitMapMurmurHashV2(5.0, NumberDataViewType.Double);
 
        // Runs the scalar loop over the getter prepared by SetupHashScalarDouble.
        [Benchmark]
        public void HashScalarDouble() => RunScalar();
 
        // Setup for HashScalarKey: a uint-backed key column (key type constructed with 100)
        // whose value is always 6.
        [GlobalSetup(Target = nameof(HashScalarKey))]
        public void SetupHashScalarKey()
            => InitMapMurmurHashV2(6u, new KeyDataViewType(typeof(uint), 100));
 
        // Runs the scalar loop over the getter prepared by SetupHashScalarKey.
        [Benchmark]
        public void HashScalarKey() => RunScalar();
 
 
 
        // Setup for HashVectorString: a dense text vector built from the words of a fixed sentence.
        [GlobalSetup(Target = nameof(HashVectorString))]
        public void SetupHashVectorString()
        {
            // Split() separates on whitespace and keeps empty entries, so the trailing space
            // in the sentence contributes one final empty token to the vector.
            string[] words = "Hello my friend, stay awhile and listen! ".Split();
            var tokens = words.Select(word => word.AsMemory()).ToArray();
            InitDenseVecMap(tokens, TextDataViewType.Instance);
        }
 
        // Runs the vector loop over the getter prepared by SetupHashVectorString.
        [Benchmark]
        public void HashVectorString() => RunVector();
 
        // Setup for HashVectorFloat: a dense Single vector holding 1 through 5.
        [GlobalSetup(Target = nameof(HashVectorFloat))]
        public void SetupHashVectorFloat()
            => InitDenseVecMap(new[] { 1f, 2f, 3f, 4f, 5f }, NumberDataViewType.Single);
 
        // Runs the vector loop over the getter prepared by SetupHashVectorFloat.
        [Benchmark]
        public void HashVectorFloat() => RunVector();
 
        // Setup for HashVectorDouble: a dense Double vector holding 1 through 5.
        [GlobalSetup(Target = nameof(HashVectorDouble))]
        public void SetupHashVectorDouble()
            => InitDenseVecMap(new[] { 1d, 2d, 3d, 4d, 5d }, NumberDataViewType.Double);
 
        // Runs the vector loop over the getter prepared by SetupHashVectorDouble.
        [Benchmark]
        public void HashVectorDouble() => RunVector();
 
        // Setup for HashVectorKey: a dense vector of uint-backed keys (key type constructed with 100).
        // NOTE(review): the 0 entry is presumably there to exercise a missing key value; confirm.
        [GlobalSetup(Target = nameof(HashVectorKey))]
        public void SetupHashVectorKey()
            => InitDenseVecMap(new[] { 1u, 2u, 0u, 4u, 5u }, new KeyDataViewType(typeof(uint), 100));
 
        // Runs the vector loop over the getter prepared by SetupHashVectorKey.
        [Benchmark]
        public void HashVectorKey() => RunVector();
 
        //The benchmarks below use a model that uses MurmurHash Version 1
        // Setup for HashScalarStringV1: a text column whose value is always "Hello",
        // hashed by the transformer loaded from the saved MurmurHashV1 model.
        [GlobalSetup(Target = nameof(HashScalarStringV1))]
        public void SetupHashScalarStringV1()
            => InitMapMurmurHashV1("Hello".AsMemory(), TextDataViewType.Instance);
 
        // Runs the scalar loop over the getter prepared by SetupHashScalarStringV1.
        [Benchmark]
        public void HashScalarStringV1() => RunScalar();
 
        // Setup for HashScalarKeyV1: a uint-backed key column (key type constructed with 100)
        // whose value is always 6, hashed by the transformer loaded from the saved MurmurHashV1 model.
        [GlobalSetup(Target = nameof(HashScalarKeyV1))]
        public void SetupHashScalarKeyV1()
            => InitMapMurmurHashV1(6u, new KeyDataViewType(typeof(uint), 100));
 
        // Runs the scalar loop over the getter prepared by SetupHashScalarKeyV1.
        [Benchmark]
        public void HashScalarKeyV1() => RunScalar();
 
        // Setup for HashScalarFloatV1: a Single column whose value is always 5.0f,
        // hashed by the transformer loaded from the saved MurmurHashV1 model.
        [GlobalSetup(Target = nameof(HashScalarFloatV1))]
        public void SetupHashScalarFloatV1()
            => InitMapMurmurHashV1(5.0f, NumberDataViewType.Single);
 
        // Runs the scalar loop over the getter prepared by SetupHashScalarFloatV1.
        [Benchmark]
        public void HashScalarFloatV1() => RunScalar();
 
        // * Summary *
 
        // BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18363
        // Intel Core i7-7700 CPU 3.60GHz(Kaby Lake), 1 CPU, 8 logical and 4 physical cores
        // .NET Core SDK = 3.1.200
 
        // [Host]     : .NET Core 2.1.16 (CoreCLR 4.6.28516.03, CoreFX 4.6.28516.10), X64 RyuJIT
        // Job-FUEYRN : .NET Core 2.1.16 (CoreCLR 4.6.28516.03, CoreFX 4.6.28516.10), X64 RyuJIT
 
        // Arguments=/p:Configuration=Release Toolchain = netcoreapp2.1  MaxIterationCount=20
        // WarmupCount=1
 
        // |              Method |        Mean |     Error |    StdDev | Extra Metric |
        // |------------------- |------------:|----------:|----------:|-------------:|
        // |   HashScalarString |  3,232.4 us |  49.93 us |  46.71 us |            - |
        // |    HashScalarFloat |    675.3 us |   3.22 us |   2.86 us |            - |
        // |   HashScalarDouble |    824.2 us |   3.32 us |   3.10 us |            - |
        // |      HashScalarKey |    570.8 us |   2.77 us |   2.46 us |            - |
        // |   HashVectorString | 25,937.6 us | 621.62 us | 715.85 us |            - |
        // |    HashVectorFloat | 13,825.1 us |  74.14 us |  69.35 us |            - |
        // |   HashVectorDouble | 14,616.9 us |  43.29 us |  36.15 us |            - |
        // |      HashVectorKey | 13,096.8 us |  56.62 us |  52.96 us |            - |
        // | HashScalarStringV1 |  3,154.5 us |  27.44 us |  25.67 us |            - |
        // |    HashScalarKeyV1 |    546.2 us |   2.35 us |   2.08 us |            - |
        // |  HashScalarFloatV1 |    670.7 us |   3.60 us |   3.36 us |            - |
    }
}