File: HashJoiningTransform.cs
Web Access
Project: src\src\Microsoft.ML.Transforms\Microsoft.ML.Transforms.csproj (Microsoft.ML.Transforms)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Linq;
using System.Reflection;
using System.Text;
using System.Threading;
using Microsoft.ML;
using Microsoft.ML.CommandLine;
using Microsoft.ML.Data;
using Microsoft.ML.EntryPoints;
using Microsoft.ML.Internal.Utilities;
using Microsoft.ML.Runtime;
using Microsoft.ML.Transforms;
 
[assembly: LoadableClass(HashJoiningTransform.Summary, typeof(HashJoiningTransform), typeof(HashJoiningTransform.Arguments), typeof(SignatureDataTransform),
    HashJoiningTransform.UserName, "HashJoinTransform", HashJoiningTransform.RegistrationName)]
 
[assembly: LoadableClass(HashJoiningTransform.Summary, typeof(HashJoiningTransform), null, typeof(SignatureLoadDataTransform),
    HashJoiningTransform.UserName, HashJoiningTransform.LoaderSignature, "HashJoinFunction")]
 
[assembly: EntryPointModule(typeof(HashJoin))]
 
namespace Microsoft.ML.Transforms
{
    /// <summary>
    /// This transform hashes its input columns. Each column is hashed separately, and within each
    /// column there is an option to specify which slots should be hashed together into one output slot.
    /// This transform can be applied either to single valued columns or to known length vector columns.
    /// </summary>
    [BestFriend]
    internal sealed class HashJoiningTransform : OneToOneTransformBase
    {
        // Smallest number of hash bits this transform accepts (inclusive bound).
        public const int NumBitsMin = 1;
        // Exclusive upper bound on the number of hash bits: valid values satisfy NumBitsMin <= bits < NumBitsLim.
        public const int NumBitsLim = 32;
 
        // Default option values used by Arguments and by the convenience constructor.
        private static class Defaults
        {
            // By default the values are combined into a single hash.
            public const bool Join = true;
            // The largest bit count that passes the NumBitsMin/NumBitsLim range check.
            public const int NumberOfBits = NumBitsLim - 1;
            // Default hashing seed.
            public const uint Seed = 314489979;
            // By default the position of each term is included in the hash.
            public const bool Ordered = true;
        }
 
        /// <summary>
        /// Transform-wide options. Each value except <see cref="Columns"/> acts as the fallback
        /// for a column whose corresponding per-column option in <see cref="Column"/> is left unset.
        /// </summary>
        public sealed class Arguments : TransformInputBase
        {
            // The columns to hash; at least one is required.
            [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)",
                Name = "Column",
                ShortName = "col",
                SortOrder = 1)]
            public Column[] Columns;

            // Fallback for Column.Join.
            [Argument(ArgumentType.AtMostOnce, HelpText = "Whether the values need to be combined for a single hash")]
            public bool Join = Defaults.Join;

            // Fallback for Column.NumberOfBits; range-checked by the transform constructor.
            [Argument(ArgumentType.AtMostOnce, HelpText = "Number of bits to hash into. Must be between 1 and 31, inclusive.",
                ShortName = "bits", SortOrder = 2)]
            public int NumberOfBits = Defaults.NumberOfBits;

            // Fallback for Column.Seed.
            [Argument(ArgumentType.AtMostOnce, HelpText = "Hashing seed")]
            public uint Seed = Defaults.Seed;

            // Fallback for Column.Ordered.
            [Argument(ArgumentType.AtMostOnce, HelpText = "Whether the position of each term should be included in the hash", ShortName = "ord")]
            public bool Ordered = Defaults.Ordered;
        }
 
        /// <summary>
        /// Per-column options. Every nullable option left unset falls back to the matching
        /// transform-wide value when the transform is constructed.
        /// </summary>
        public sealed class Column : OneToOneColumn
        {
            // REVIEW: rename to 'combine' (with 'join' as a secondary name) once possible
            [Argument(ArgumentType.AtMostOnce, HelpText = "Whether the values need to be combined for a single hash")]
            public bool? Join;

            // REVIEW: maybe the language could support ranges
            [Argument(ArgumentType.AtMostOnce, HelpText = "Which slots should be combined together. Example: 0,3,5;0,1;3;2,1,0. Overrides 'join'.")]
            public string CustomSlotMap;

            [Argument(ArgumentType.AtMostOnce, HelpText = "Number of bits to hash into. Must be between 1 and 31, inclusive.", ShortName = "bits")]
            public int? NumberOfBits;

            [Argument(ArgumentType.AtMostOnce, HelpText = "Hashing seed")]
            public uint? Seed;

            [Argument(ArgumentType.AtMostOnce, HelpText = "Whether the position of each term should be included in the hash", ShortName = "ord")]
            public bool? Ordered;

            /// <summary>
            /// Builds a column from its string form; yields null when the string cannot be parsed.
            /// </summary>
            internal static Column Parse(string str)
            {
                var column = new Column();
                return column.TryParse(str) ? column : null;
            }

            /// <summary>
            /// Writes the short string form of this column to <paramref name="sb"/>.
            /// Returns false without writing when any hashing option is set on this column.
            /// </summary>
            internal bool TryUnparse(StringBuilder sb)
            {
                Contracts.AssertValue(sb);

                bool hasOverrides =
                    Join.HasValue
                    || !string.IsNullOrEmpty(CustomSlotMap)
                    || NumberOfBits.HasValue
                    || Seed.HasValue
                    || Ordered.HasValue;

                return !hasOverrides && TryUnparseCore(sb);
            }
        }
 
        /// <summary>
        /// Resolved, immutable hashing configuration for one output column.
        /// </summary>
        [BestFriend]
        internal sealed class ColumnOptions
        {
            // Either a vector of uint-backed keys or a single uint-backed key.
            // When the slot map holds at most one group (or is null) the output is a single key;
            // a one-group map corresponds to the join=+ case, although slots may be omitted from the group.
            // The key count is always 2^NumberOfBits.
            public readonly DataViewType OutputColumnType;

            public readonly int NumberOfBits;
            public readonly uint HashSeed;
            public readonly bool Ordered;
            public readonly int[][] SlotMap; // null if the input is a single column

            /// <summary>Number of values in <see cref="OutputColumnType"/>.</summary>
            public int OutputValueCount => OutputColumnType.GetValueCount();

            /// <summary>
            /// Captures the hashing parameters and derives the output column type from them.
            /// </summary>
            /// <param name="slotMap">Groups of source slot indices, one group per output slot; may be null.</param>
            /// <param name="numberOfBits">Number of hash bits; must satisfy NumBitsMin &lt;= bits &lt; NumBitsLim.</param>
            /// <param name="hashSeed">Seed of the hash.</param>
            /// <param name="ordered">Whether slot positions take part in the hash.</param>
            public ColumnOptions(int[][] slotMap, int numberOfBits, uint hashSeed, bool ordered)
            {
                Contracts.CheckValueOrNull(slotMap);
                Contracts.Check(NumBitsMin <= numberOfBits && numberOfBits < NumBitsLim);

                SlotMap = slotMap;
                NumberOfBits = numberOfBits;
                HashSeed = hashSeed;
                Ordered = ordered;

                KeyDataViewType keyType = GetItemType(numberOfBits);
                int groupCount = Utils.Size(slotMap);
                if (groupCount > 1)
                    OutputColumnType = new VectorDataViewType(keyType, groupCount);
                else
                    OutputColumnType = keyType;
            }

            /// <summary>
            /// Builds the uint-backed key type whose count is 2^<paramref name="numberOfBits"/>.
            /// </summary>
            private static KeyDataViewType GetItemType(int numberOfBits)
            {
                return new KeyDataViewType(typeof(uint), 1UL << numberOfBits);
            }
        }
 
        // Name under which the transform registers its host (passed to the base constructor and env.Register).
        internal const string RegistrationName = "HashJoin";

        // Human-readable description, used in the LoadableClass attributes and the entry point description.
        internal const string Summary = "Converts column values into hashes. This transform accepts both numeric and text inputs, both single and vector-valued columns. ";

        // Display name, used in the LoadableClass attributes and the entry point.
        internal const string UserName = "Hash Join Transform";

        // Loader signature written into the model's version info.
        internal const string LoaderSignature = "HashJoinTransform";
        /// <summary>
        /// Describes the serialized model format of this transform.
        /// </summary>
        /// <remarks>
        /// Written-version history:
        ///   0x00010001 - Initial
        ///   0x00010002 - Added slot maps
        ///   0x00010003 - Added per-column hash bits
        ///   0x00010004 - Added per-column hash seed and ordered flag
        ///   0x00010005 - Hash fix (current)
        /// </remarks>
        private static VersionInfo GetVersionInfo()
            => new VersionInfo(
                modelSignature: "HSHJOINF",
                verWrittenCur: 0x00010005,
                verReadableCur: 0x00010005,
                verWeCanReadBack: 0x00010005,
                loaderSignature: LoaderSignature,
                loaderAssemblyName: typeof(HashJoiningTransform).Assembly.FullName);
 
        // Per-column hashing configuration; allocated with Infos.Length entries, so it is indexed in parallel with Infos.
        private readonly ColumnOptions[] _exes;
 
        /// <summary>
        /// Creates a <see cref="HashJoiningTransform"/> that hashes a single column, leaving the
        /// seed and ordering options at their defaults.
        /// </summary>
        /// <param name="env">Host Environment.</param>
        /// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
        /// <param name="name">Name of the output column.</param>
        /// <param name="source">Name of the column to be transformed. If this is null '<paramref name="name"/>' will be used.</param>
        /// <param name="join">Whether the values need to be combined for a single hash.</param>
        /// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 31, inclusive.</param>
        public HashJoiningTransform(IHostEnvironment env,
            IDataView input,
            string name,
            string source = null,
            bool join = Defaults.Join,
            int numberOfBits = Defaults.NumberOfBits)
            : this(
                env,
                new Arguments()
                {
                    Columns = new[]
                    {
                        new Column() { Source = source ?? name, Name = name }
                    },
                    Join = join,
                    NumberOfBits = numberOfBits
                },
                input)
        {
        }
 
        /// <include file='doc.xml' path='doc/members/member[@name="HashJoin"]/*' />
        public HashJoiningTransform(IHostEnvironment env, Arguments args, IDataView input)
            : base(env, RegistrationName, Contracts.CheckRef(args, nameof(args)).Columns, input, TestColumnType)
        {
            Host.AssertNonEmpty(Infos);
            Host.Assert(Infos.Length == Utils.Size(args.Columns));

            // Reject an out-of-range transform-wide bit count before looking at per-column overrides.
            bool globalBitsInRange = NumBitsMin <= args.NumberOfBits && args.NumberOfBits < NumBitsLim;
            if (!globalBitsInRange)
                throw Host.ExceptUserArg(nameof(args.NumberOfBits), "numberOfBits should be between {0} and {1} inclusive", NumBitsMin, NumBitsLim - 1);

            int columnCount = Infos.Length;
            _exes = new ColumnOptions[columnCount];
            for (int iinfo = 0; iinfo < columnCount; iinfo++)
            {
                Column column = args.Columns[iinfo];

                // Each per-column option falls back to the transform-wide value when unset.
                int bits = column.NumberOfBits ?? args.NumberOfBits;
                Host.CheckUserArg(NumBitsMin <= bits && bits < NumBitsLim, nameof(args.NumberOfBits));

                _exes[iinfo] = CreateColumnOptionsEx(
                    column.Join ?? args.Join,
                    column.CustomSlotMap,
                    bits,
                    column.Seed ?? args.Seed,
                    column.Ordered ?? args.Ordered,
                    Infos[iinfo]);
            }

            SetMetadata();
        }
 
        /// <summary>
        /// Deserialization constructor: reads the per-column hashing configuration from
        /// <paramref name="ctx"/> and validates it against the source column types of <paramref name="input"/>.
        /// </summary>
        /// <param name="host">Host used for checks and exceptions.</param>
        /// <param name="ctx">Model load context positioned at this transform's data.</param>
        /// <param name="input">Input data view the loaded transform is applied to.</param>
        private HashJoiningTransform(IHost host, ModelLoadContext ctx, IDataView input)
            : base(host, ctx, input, TestColumnType)
        {
            Host.AssertValue(ctx);

            // *** Binary format ***
            // <base>
            // For each of the infos:
            //     int: hash bits
            //     uint: hash seed
            //     bool: ordered
            //     int N: number of slot maps (must be zero for single-value inputs)
            //     Then, for each of the N maps:
            //          Slot map itself (array of ints, including length)

            Host.AssertNonEmpty(Infos);

            _exes = new ColumnOptions[Infos.Length];
            for (int i = 0; i < Infos.Length; i++)
            {
                int numberOfBits = ctx.Reader.ReadInt32();
                Host.CheckDecode(NumBitsMin <= numberOfBits && numberOfBits < NumBitsLim);

                uint hashSeed = ctx.Reader.ReadUInt32();
                bool ordered = ctx.Reader.ReadBoolByte();

                int slotMapCount = ctx.Reader.ReadInt32();
                Host.CheckDecode(slotMapCount >= 0);

                // Slot maps and vector inputs go together: scalar inputs store none, and vector inputs
                // always store at least one (the training-time path never produces an empty map for a vector).
                // Without this check a vector input with zero maps would load with a null SlotMap and
                // only fail later, with a NullReferenceException, when a getter is requested.
                bool isVectorSource = Infos[i].TypeSrc is VectorDataViewType;
                Host.CheckDecode((slotMapCount > 0) == isVectorSource);

                int[][] slotMap = null;
                if (slotMapCount > 0)
                {
                    // Loop-invariant: every group is validated against the same source vector size.
                    int srcValueCount = Infos[i].TypeSrc.GetValueCount();

                    slotMap = new int[slotMapCount][];
                    for (int j = 0; j < slotMapCount; j++)
                    {
                        slotMap[j] = ctx.Reader.ReadIntArray();
                        Host.CheckDecode(Utils.Size(slotMap[j]) > 0); // null array could be returned by the call above

                        // the slots should be distinct and between 0 and vector size
                        Host.CheckDecode(slotMap[j].Distinct().Count() == slotMap[j].Length);
                        Host.CheckDecode(slotMap[j].All(slot => 0 <= slot && slot < srcValueCount));
                    }
                }

                _exes[i] = new ColumnOptions(slotMap, numberOfBits, hashSeed, ordered);
            }

            SetMetadata();
        }
 
        /// <summary>
        /// Loads a <see cref="HashJoiningTransform"/> from a model context and binds it to <paramref name="input"/>.
        /// </summary>
        public static HashJoiningTransform Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register(RegistrationName);

            host.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel(GetVersionInfo());
            host.CheckValue(input, nameof(input));

            return host.Apply(
                "Loading Model",
                channel => new HashJoiningTransform(host, ctx, input));
        }
 
        /// <summary>
        /// Serializes the base transform state followed by the per-column hashing configuration.
        /// </summary>
        private protected override void SaveModel(ModelSaveContext ctx)
        {
            Host.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel();
            ctx.SetVersionInfo(GetVersionInfo());

            // *** Binary format ***
            // <base>
            // For each of the infos:
            //     int: hash bits
            //     uint: hash seed
            //     bool: ordered
            //     int N: number of slot maps (must be zero for single-value inputs)
            //     Then, for each of the N maps:
            //          Slot map itself (array of ints, including length)

            SaveBase(ctx);

            for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
            {
                ColumnOptions options = _exes[iinfo];

                Host.Assert(NumBitsMin <= options.NumberOfBits && options.NumberOfBits < NumBitsLim);
                ctx.Writer.Write(options.NumberOfBits);
                ctx.Writer.Write(options.HashSeed);
                ctx.Writer.WriteBoolByte(options.Ordered);

                // The count is written even when there is no slot map (as zero).
                int[][] slotMap = options.SlotMap;
                ctx.Writer.Write(Utils.Size(slotMap));
                if (slotMap != null)
                {
                    foreach (int[] slotGroup in slotMap)
                    {
                        Host.Assert(slotGroup.Distinct().Count() == slotGroup.Length);
                        Host.Assert(slotGroup.All(slot => 0 <= slot && slot < Infos[iinfo].TypeSrc.GetValueCount()));
                        ctx.Writer.WriteIntArray(slotGroup);
                    }
                }
            }
        }
 
        /// <summary>
        /// Resolves the slot map for one column and wraps all hashing parameters in a <see cref="ColumnOptions"/>.
        /// Non-vector sources get no slot map; for vector sources a non-blank
        /// <paramref name="customSlotMap"/> takes precedence over <paramref name="join"/>.
        /// </summary>
        private ColumnOptions CreateColumnOptionsEx(bool join, string customSlotMap, int numberOfBits, uint hashSeed, bool ordered, ColInfo colInfo)
        {
            int[][] slotMap = null;
            if (colInfo.TypeSrc is VectorDataViewType vectorType)
            {
                int srcSlotCount = vectorType.Size;
                slotMap = string.IsNullOrWhiteSpace(customSlotMap)
                    ? CreateDefaultSlotMap(join, srcSlotCount)
                    : CompileSlotMap(customSlotMap, srcSlotCount);
                Host.Assert(Utils.Size(slotMap) >= 1);
            }

            return new ColumnOptions(slotMap, numberOfBits, hashSeed, ordered);
        }
 
        /// <summary>
        /// Parses a custom slot map string: groups are separated by ';' and the slot indices
        /// inside a group by ','. Throws when an index is not an integer in [0, <paramref name="srcSlotCount"/>)
        /// or when a group repeats an index.
        /// </summary>
        private int[][] CompileSlotMap(string slotMapString, int srcSlotCount)
        {
            var groupSeparators = new[] { ';' };
            var indexSeparators = new[] { ',' };

            var groups = ReadOnlyMemoryUtils.Split(slotMapString.AsMemory(), groupSeparators).ToArray();
            var compiled = new int[groups.Length][];
            for (int iGroup = 0; iGroup < compiled.Length; iGroup++)
            {
                var tokens = ReadOnlyMemoryUtils.Split(groups[iGroup], indexSeparators).ToArray();
                var indices = new int[tokens.Length];
                compiled[iGroup] = indices;

                for (int iToken = 0; iToken < indices.Length; iToken++)
                {
                    bool isValid = int.TryParse(tokens[iToken].ToString(), out int parsed)
                        && 0 <= parsed
                        && parsed < srcSlotCount;
                    if (!isValid)
                        throw Host.Except("Unexpected slot index '{1}' in group {0}. Expected 0 to {2}", iGroup, tokens[iToken], srcSlotCount - 1);
                    indices[iToken] = parsed;
                }

                // Distinct() can only shrink the sequence, so any length difference means a repeat.
                if (indices.Distinct().Count() != indices.Length)
                    throw Host.Except("Group '{0}' has duplicate slot indices", groups[iGroup]);
            }

            return compiled;
        }
 
        /// <summary>
        /// Builds the slot map used when no custom map is given: a single group holding every
        /// source slot when <paramref name="join"/> is true, otherwise one single-slot group per source slot.
        /// </summary>
        private static int[][] CreateDefaultSlotMap(bool join, int srcSlotCount)
        {
            if (!join)
            {
                // every input slot goes to its own output slot
                return Enumerable.Range(0, srcSlotCount)
                    .Select(slot => new[] { slot })
                    .ToArray();
            }

            // all input slots feed one output slot
            int[] allSlots = Utils.GetIdentityPermutation(srcSlotCount);
            return new[] { allSlots };
        }
 
        /// <summary>
        /// Column type validator passed to the base class: returns null when the type has a
        /// positive value count, otherwise the message "Unknown vector size".
        /// </summary>
        private static string TestColumnType(DataViewType type)
        {
            // REVIEW: list all types that can be hashed.
            return type.GetValueCount() > 0 ? null : "Unknown vector size";
        }
 
        /// <summary>
        /// Registers a slot-names getter for every column whose slot map has more than one group,
        /// then seals the metadata.
        /// </summary>
        private void SetMetadata()
        {
            var metadata = Metadata;
            for (int iinfo = 0; iinfo < _exes.Length; iinfo++)
            {
                int groupCount = Utils.Size(_exes[iinfo].SlotMap);
                if (groupCount > 1)
                {
                    using (var builder = metadata.BuildMetadata(iinfo))
                    {
                        builder.AddGetter<VBuffer<ReadOnlyMemory<char>>>(
                            AnnotationUtils.Kinds.SlotNames,
                            new VectorDataViewType(TextDataViewType.Instance, groupCount),
                            GetSlotNames);
                    }
                }
            }

            metadata.Seal();
        }
 
        /// <summary>
        /// Produces the output slot names of column <paramref name="iinfo"/>. Each name joins the
        /// names of the source slots in the corresponding slot-map group with '+'. When the source
        /// column has no usable slot names, "column[index]" is used for each source slot instead.
        /// </summary>
        private void GetSlotNames(int iinfo, ref VBuffer<ReadOnlyMemory<char>> dst)
        {
            Host.Assert(0 <= iinfo && iinfo < Infos.Length);
            Host.AssertValue(_exes[iinfo].SlotMap);

            var options = _exes[iinfo];
            var info = Infos[iinfo];

            int outputCount = options.OutputValueCount;
            var editor = VBufferEditor.Create(ref dst, outputCount);

            var srcColumn = Source.Schema[info.Source];
            var srcColumnName = srcColumn.Name;

            // Source slot names are only used when present, dense and of the expected length.
            VBuffer<ReadOnlyMemory<char>> srcSlotNames = default;
            bool hasSrcSlotNames = srcColumn.HasSlotNames(info.TypeSrc.GetVectorSize());
            if (hasSrcSlotNames)
            {
                srcColumn.Annotations.GetValue(AnnotationUtils.Kinds.SlotNames, ref srcSlotNames);
                hasSrcSlotNames = srcSlotNames.IsDense
                    && srcSlotNames.Length == info.TypeSrc.GetValueCount();
            }

            var nameBuilder = new StringBuilder();
            var srcNames = srcSlotNames.GetValues();
            for (int outSlot = 0; outSlot < outputCount; outSlot++)
            {
                nameBuilder.Clear();

                int[] srcSlots = options.SlotMap[outSlot];
                for (int k = 0; k < srcSlots.Length; k++)
                {
                    int srcSlot = srcSlots[k];

                    // The separator depends on what has been written so far, not on the position.
                    if (nameBuilder.Length > 0)
                        nameBuilder.Append("+");

                    if (hasSrcSlotNames)
                        nameBuilder.Append(srcNames[srcSlot]);
                    else
                        nameBuilder.AppendFormat("{0}[{1}]", srcColumnName, srcSlot);
                }

                editor.Values[outSlot] = nameBuilder.ToString().AsMemory();
            }

            dst = editor.Commit();
        }
 
        // Hashes a single value of type TSrc, starting from the given seed (or running hash), and returns the new hash.
        private delegate uint HashDelegate<TSrc>(in TSrc value, uint seed);
 
        // generic method generators
        // Cached method descriptors for the three ComposeGetter* generic methods. The <int> type
        // argument is only a placeholder: GetGetterCore passes the real source item type to
        // Utils.MarshalInvoke together with one of these descriptors.

        // Scalar input, scalar output.
        private static readonly FuncInstanceMethodInfo1<HashJoiningTransform, DataViewRow, int, Delegate> _composeGetterOneToOneMethodInfo
            = FuncInstanceMethodInfo1<HashJoiningTransform, DataViewRow, int, Delegate>.Create(target => target.ComposeGetterOneToOne<int>);

        // Vector input, vector output.
        private static readonly FuncInstanceMethodInfo1<HashJoiningTransform, DataViewRow, int, Delegate> _composeGetterVecToVecMethodInfo
            = FuncInstanceMethodInfo1<HashJoiningTransform, DataViewRow, int, Delegate>.Create(target => target.ComposeGetterVecToVec<int>);

        // Vector input, scalar output.
        private static readonly FuncInstanceMethodInfo1<HashJoiningTransform, DataViewRow, int, Delegate> _composeGetterVecToOneMethodInfo
            = FuncInstanceMethodInfo1<HashJoiningTransform, DataViewRow, int, Delegate>.Create(target => target.ComposeGetterVecToOne<int>);
 
        /// <summary>
        /// Creates the getter for output column <paramref name="iinfo"/>, choosing the composer
        /// that matches the shapes of the source and output columns.
        /// </summary>
        protected override Delegate GetGetterCore(IChannel ch, DataViewRow input, int iinfo, out Action disposer)
        {
            Host.AssertValueOrNull(ch);
            Host.AssertValue(input);
            Host.Assert(0 <= iinfo && iinfo < Infos.Length);
            disposer = null;

            // Pick the generic composer (declared over a placeholder <int>) and the actual
            // item type to instantiate it with; MarshalInvoke performs the substitution.
            DataViewType srcItemType;
            FuncInstanceMethodInfo1<HashJoiningTransform, DataViewRow, int, Delegate> composer;
            if (Infos[iinfo].TypeSrc is VectorDataViewType vectorType)
            {
                srcItemType = vectorType.ItemType;
                composer = _exes[iinfo].OutputValueCount == 1
                    ? _composeGetterVecToOneMethodInfo
                    : _composeGetterVecToVecMethodInfo;
            }
            else
            {
                srcItemType = Infos[iinfo].TypeSrc;
                composer = _composeGetterOneToOneMethodInfo;
            }

            return Utils.MarshalInvoke(composer, this, srcItemType.RawType, input, iinfo);
        }
 
        /// <summary>
        /// Builds the getter for a scalar source column of type <typeparamref name="TSrc"/>.
        /// The produced key is the masked hash plus one.
        /// </summary>
        /// <typeparam name="TSrc">Input type. Must be a non-vector</typeparam>
        /// <param name="input">Row input</param>
        /// <param name="iinfo">Index of the getter</param>
        private ValueGetter<uint> ComposeGetterOneToOne<TSrc>(DataViewRow input, int iinfo)
        {
            Host.AssertValue(input);
            Host.Assert(!(Infos[iinfo].TypeSrc is VectorDataViewType));

            var srcGetter = GetSrcGetter<TSrc>(input, iinfo);
            var hasher = ComposeHashDelegate<TSrc>();
            var value = default(TSrc);

            var options = _exes[iinfo];
            var bitMask = (1U << options.NumberOfBits) - 1;
            var seed = options.HashSeed;

            return (ref uint dst) =>
            {
                srcGetter(ref value);
                uint hashed = hasher(in value, seed);
                // +1 to offset from zero, which has special meaning for KeyType
                dst = (hashed & bitMask) + 1;
            };
        }
 
        /// <summary>
        /// Builds the getter for a vector source column with item type <typeparamref name="TSrc"/>
        /// whose output is a vector of hashes, one per slot-map group.
        /// </summary>
        /// <typeparam name="TSrc">Input type. Must be a vector</typeparam>
        /// <param name="input">Row input</param>
        /// <param name="iinfo">Index of the getter</param>
        private ValueGetter<VBuffer<uint>> ComposeGetterVecToVec<TSrc>(DataViewRow input, int iinfo)
        {
            Host.AssertValue(input);
            VectorDataViewType srcType = Infos[iinfo].TypeSrc as VectorDataViewType;
            Host.Assert(srcType != null);

            var srcGetter = GetSrcGetter<VBuffer<TSrc>>(input, iinfo);
            var hasher = ComposeHashDelegate<TSrc>();
            var srcBuffer = default(VBuffer<TSrc>);
            int outputCount = _exes[iinfo].OutputValueCount;
            int requiredLength = srcType.Size;
            int[][] groups = _exes[iinfo].SlotMap;
            // REVIEW: consider adding a fix-zero functionality (subtract emptyTextHash from all hashes)
            var bitMask = (1U << _exes[iinfo].NumberOfBits) - 1;
            var seed = _exes[iinfo].HashSeed;
            bool ordered = _exes[iinfo].Ordered;
            var denseBuffer = default(VBuffer<TSrc>);

            return (ref VBuffer<uint> dst) =>
            {
                srcGetter(ref srcBuffer);
                Host.Check(srcBuffer.Length == requiredLength);
                var editor = VBufferEditor.Create(ref dst, outputCount);
                srcBuffer.CopyToDense(ref denseBuffer);

                for (int outSlot = 0; outSlot < outputCount; outSlot++)
                {
                    int[] srcSlots = groups[outSlot];
                    uint running = seed;
                    for (int k = 0; k < srcSlots.Length; k++)
                    {
                        int srcSlot = srcSlots[k];
                        // REVIEW: some legacy code hashes 0 for srcSlot in ord- case, do we need to preserve this behavior?
                        if (ordered)
                            running = Hashing.MurmurRound(running, (uint)srcSlot);
                        running = hasher(denseBuffer.GetItemOrDefault(srcSlot), running);
                    }

                    // +1 to offset from zero, which has special meaning for KeyType
                    editor.Values[outSlot] = (Hashing.MixHash(running) & bitMask) + 1;
                }

                dst = editor.Commit();
            };
        }
 
        /// <summary>
        /// Builds the getter for a vector source column with item type <typeparamref name="TSrc"/>
        /// whose slots (those listed in the only slot-map group) are folded into one hash.
        /// </summary>
        /// <typeparam name="TSrc">Input type. Must be a vector</typeparam>
        /// <param name="input">Row input</param>
        /// <param name="iinfo">Index of the getter</param>
        private ValueGetter<uint> ComposeGetterVecToOne<TSrc>(DataViewRow input, int iinfo)
        {
            Host.AssertValue(input);
            VectorDataViewType srcType = Infos[iinfo].TypeSrc as VectorDataViewType;
            Host.Assert(srcType != null);
            Host.Assert(Utils.Size(_exes[iinfo].SlotMap) == 1);

            int[] srcSlots = _exes[iinfo].SlotMap[0];

            var srcGetter = GetSrcGetter<VBuffer<TSrc>>(input, iinfo);
            var hasher = ComposeHashDelegate<TSrc>();
            var srcBuffer = default(VBuffer<TSrc>);
            int requiredLength = srcType.Size;
            var bitMask = (1U << _exes[iinfo].NumberOfBits) - 1;
            var seed = _exes[iinfo].HashSeed;
            bool ordered = _exes[iinfo].Ordered;
            var denseBuffer = default(VBuffer<TSrc>);

            return (ref uint dst) =>
            {
                srcGetter(ref srcBuffer);
                Host.Check(srcBuffer.Length == requiredLength);
                srcBuffer.CopyToDense(ref denseBuffer);

                uint running = seed;
                for (int k = 0; k < srcSlots.Length; k++)
                {
                    int srcSlot = srcSlots[k];
                    if (ordered)
                        running = Hashing.MurmurRound(running, (uint)srcSlot);
                    running = hasher(denseBuffer.GetItemOrDefault(srcSlot), running);
                }

                // +1 to offset from zero, which has special meaning for KeyType
                dst = (Hashing.MixHash(running) & bitMask) + 1;
            };
        }
 
        /// <summary>
        /// Selects the hash routine for <typeparamref name="TSrc"/>: float and double use their
        /// specialized routines; every other type is converted to text and hashed as a string.
        /// </summary>
        private HashDelegate<TSrc> ComposeHashDelegate<TSrc>()
        {
            // REVIEW: Add a specialized hashing for ints, once numeric bin mapper is done.
            Type srcType = typeof(TSrc);
            if (srcType == typeof(float))
                return (HashDelegate<TSrc>)(Delegate)ComposeFloatHashDelegate();
            if (srcType == typeof(double))
                return (HashDelegate<TSrc>)(Delegate)ComposeDoubleHashDelegate();

            // Fallback: render the value as text, then hash the characters.
            // The text buffer is captured by the delegate and reused across calls.
            StringBuilder textBuffer = null;
            var toText = Data.Conversion.Conversions.DefaultInstance.GetStringConversion<TSrc>();
            return (in TSrc value, uint seed) =>
            {
                toText(in value, ref textBuffer);
                return Hashing.MurmurHash(seed, textBuffer, 0, textBuffer.Length);
            };
        }
 
        /// <summary>
        /// Returns the float overload of Hash as a delegate.
        /// </summary>
        private HashDelegate<float> ComposeFloatHashDelegate() => Hash;
 
        /// <summary>
        /// Returns the double overload of Hash as a delegate.
        /// </summary>
        private HashDelegate<double> ComposeDoubleHashDelegate()
        {
            return Hash;
        }
 
        // Hashes a float by feeding its bit pattern through one Murmur round.
        private uint Hash(in float value, uint seed)
        {
            var bits = FloatUtils.GetBits(value);
            return Hashing.MurmurRound(seed, bits);
        }
 
        // Hashes a double by feeding the low and then the high half of its bit pattern through Murmur rounds.
        private uint Hash(in double value, uint seed)
        {
            ulong bits = FloatUtils.GetBits(value);
            uint afterLow = Hashing.MurmurRound(seed, Utils.GetLo(bits));
            uint afterHigh = Hashing.MurmurRound(afterLow, Utils.GetHi(bits));
            return afterHigh;
        }
 
        /// <summary>
        /// Returns the output type computed for column <paramref name="iinfo"/>.
        /// </summary>
        protected override DataViewType GetColumnTypeCore(int iinfo)
        {
            Host.Assert(0 <= iinfo && iinfo < _exes.Length);
            var options = _exes[iinfo];
            return options.OutputColumnType;
        }
    }
 
    /// <summary>
    /// Entry point wrapper that applies <see cref="HashJoiningTransform"/> to the input data.
    /// </summary>
    internal static class HashJoin
    {
        /// <summary>
        /// Validates the arguments, builds the transform over <c>input.Data</c> and returns
        /// both the transformed data and the corresponding transform model.
        /// </summary>
        [TlcModule.EntryPoint(Name = "Transforms.HashConverter",
            Desc = HashJoiningTransform.Summary,
            UserName = HashJoiningTransform.UserName,
            ShortName = HashJoiningTransform.RegistrationName)]
        public static CommonOutputs.TransformOutput Apply(IHostEnvironment env, HashJoiningTransform.Arguments input)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(input, nameof(input));

            var host = EntryPointUtils.CheckArgsAndCreateHost(env, "HashJoin", input);
            var transform = new HashJoiningTransform(host, input, input.Data);

            var output = new CommonOutputs.TransformOutput();
            output.Model = new TransformModelImpl(host, transform, input.Data);
            output.OutputData = transform;
            return output;
        }
    }
}