// File: Text\WrappedTextTransformers.cs
// Project: src\src\Microsoft.ML.Transforms\Microsoft.ML.Transforms.csproj (Microsoft.ML.Transforms)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.Data.DataLoadSave;
using Microsoft.ML.Internal.Utilities;
using Microsoft.ML.Runtime;
 
namespace Microsoft.ML.Transforms.Text
{
    /// <summary>
    /// <see cref="IEstimator{TTransformer}"/> for the <see cref="ITransformer"/>.
    /// </summary>
    /// <remarks>
    /// <format type="text/markdown"><![CDATA[
    /// ###  Estimator Characteristics
    /// |  |  |
    /// | -- | -- |
    /// | Does this estimator need to look at the data to train its parameters? | Yes |
    /// | Input column data type | Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
    /// | Output column data type | Vector of known-size of <xref:System.Single> |
    /// | Exportable to ONNX | Yes |
    ///
    /// The resulting <xref:Microsoft.ML.ITransformer> creates a new column, named as specified in the output column name parameters, and
    /// produces a vector of n-gram counts (sequences of n consecutive words) from a given data.
    /// It does so by building a dictionary of n-grams and using the id in the dictionary as the index in the bag.
    ///
    /// <xref:Microsoft.ML.Transforms.Text.WordBagEstimator> is different from <xref:Microsoft.ML.Transforms.Text.NgramExtractingEstimator>
    /// in that the former tokenizes text internally while the latter takes tokenized text as input.
    ///
    /// Check the See Also section for links to usage examples.
    /// ]]>
    /// </format>
    /// </remarks>
    /// <seealso cref="TextCatalog.ProduceWordBags(TransformsCatalog.TextTransforms, string, string, int, int, bool, int, NgramExtractingEstimator.WeightingCriteria)" />
    /// <seealso cref="TextCatalog.ProduceWordBags(TransformsCatalog.TextTransforms, string, string[], int, int, bool, int, NgramExtractingEstimator.WeightingCriteria)" />
    public sealed class WordBagEstimator : IEstimator<ITransformer>
    {
        private readonly IHost _host;
        private readonly (string outputColumnName, string[] sourceColumnsNames)[] _columns;
        private readonly int _ngramLength;
        private readonly int _skipLength;
        private readonly bool _useAllLengths;
        private readonly int _maxNumTerms;
        private readonly NgramExtractingEstimator.WeightingCriteria _weighting;
        private readonly char _termSeparator;
        private readonly char _freqSeparator;

        /// <summary>
        /// Options for how the n-grams are extracted.
        /// </summary>
        public class Options
        {
            /// <summary>
            /// Maximum n-gram length.
            /// </summary>
            public int NgramLength;

            /// <summary>
            /// Maximum number of tokens to skip when constructing an n-gram.
            /// </summary>
            public int SkipLength;

            /// <summary>
            /// Whether to store all n-gram lengths up to <see cref="NgramLength"/>, or only <see cref="NgramLength"/>.
            /// </summary>
            public bool UseAllLengths;

            /// <summary>
            /// The maximum number of grams to store in the dictionary, for each level of n-grams,
            /// from 1 (in position 0) up to <see cref="NgramLength"/> (in position NgramLength-1).
            /// </summary>
            public int[] MaximumNgramsCount;

            /// <summary>
            /// The weighting criteria.
            /// </summary>
            public NgramExtractingEstimator.WeightingCriteria Weighting;

            /// <summary>
            /// Initializes the options with their default values.
            /// NOTE(review): <see cref="NgramLength"/> defaults to 2 here, while the estimator
            /// constructors below default <c>ngramLength</c> to 1 — confirm this asymmetry is intended.
            /// </summary>
            public Options()
            {
                NgramLength = 2;
                SkipLength = NgramExtractingEstimator.Defaults.SkipLength;
                UseAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths;
                MaximumNgramsCount = new int[] { NgramExtractingEstimator.Defaults.MaximumNgramsCount };
                Weighting = NgramExtractingEstimator.Defaults.Weighting;
            }
        }

        /// <summary>
        /// Produces a bag of counts of n-grams (sequences of consecutive words) in <paramref name="inputColumnName"/>
        /// and outputs bag of word vector as <paramref name="outputColumnName"/>.
        /// </summary>
        /// <param name="env">The environment.</param>
        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
        /// <param name="ngramLength">Ngram length.</param>
        /// <param name="skipLength">Maximum number of tokens to skip when constructing an n-gram.</param>
        /// <param name="useAllLengths">Whether to include all n-gram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
        /// <param name="maximumNgramsCount">Maximum number of n-grams to store in the dictionary.</param>
        /// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
        /// <param name="termSeparator">Separator used to separate terms/frequency pairs.</param>
        /// <param name="freqSeparator">Separator used to separate terms from their frequency.</param>
        internal WordBagEstimator(IHostEnvironment env,
            string outputColumnName,
            string inputColumnName = null,
            int ngramLength = 1,
            int skipLength = 0,
            bool useAllLengths = true,
            int maximumNgramsCount = 10000000,
            NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf,
            char termSeparator = default,
            char freqSeparator = default)
            : this(env, outputColumnName, new[] { inputColumnName ?? outputColumnName }, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting, termSeparator, freqSeparator)
        {
        }

        /// <summary>
        /// Produces a bag of counts of n-grams (sequences of consecutive words) in <paramref name="inputColumnNames"/>
        /// and outputs bag of word vector as <paramref name="outputColumnName"/>.
        /// </summary>
        /// <param name="env">The environment.</param>
        /// <param name="outputColumnName">The column containing output tokens.</param>
        /// <param name="inputColumnNames">The columns containing text to compute bag of word vector.</param>
        /// <param name="ngramLength">Ngram length.</param>
        /// <param name="skipLength">Maximum number of tokens to skip when constructing an n-gram.</param>
        /// <param name="useAllLengths">Whether to include all n-gram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
        /// <param name="maximumNgramsCount">Maximum number of n-grams to store in the dictionary.</param>
        /// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
        /// <param name="termSeparator">Separator used to separate terms/frequency pairs.</param>
        /// <param name="freqSeparator">Separator used to separate terms from their frequency.</param>
        internal WordBagEstimator(IHostEnvironment env,
            string outputColumnName,
            string[] inputColumnNames,
            int ngramLength = 1,
            int skipLength = 0,
            bool useAllLengths = true,
            int maximumNgramsCount = 10000000,
            NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf,
            char termSeparator = default,
            char freqSeparator = default)
            : this(env, new[] { (outputColumnName, inputColumnNames) }, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting, termSeparator, freqSeparator)
        {
        }

        /// <summary>
        /// Produces a bag of counts of n-grams (sequences of consecutive words) for each output/input
        /// column pair in <paramref name="columns"/> and outputs one bag of word vector per pair.
        /// </summary>
        /// <param name="env">The environment.</param>
        /// <param name="columns">Pairs of columns to compute bag of word vector.</param>
        /// <param name="ngramLength">Ngram length.</param>
        /// <param name="skipLength">Maximum number of tokens to skip when constructing an n-gram.</param>
        /// <param name="useAllLengths">Whether to include all n-gram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
        /// <param name="maximumNgramsCount">Maximum number of n-grams to store in the dictionary.</param>
        /// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
        /// <param name="termSeparator">Separator used to separate terms/frequency pairs.</param>
        /// <param name="freqSeparator">Separator used to separate terms from their frequency.</param>
        internal WordBagEstimator(IHostEnvironment env,
            (string outputColumnName, string[] inputColumnNames)[] columns,
            int ngramLength = 1,
            int skipLength = 0,
            bool useAllLengths = true,
            int maximumNgramsCount = 10000000,
            NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf,
            char termSeparator = default,
            char freqSeparator = default)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register(nameof(WordBagEstimator));

            // Each mapping needs at least one source column and a non-null output column name.
            foreach (var (outputColumnName, inputColumnNames) in columns)
            {
                _host.CheckUserArg(Utils.Size(inputColumnNames) > 0, nameof(columns));
                _host.CheckValue(outputColumnName, nameof(columns));
            }

            _columns = columns;
            _ngramLength = ngramLength;
            _skipLength = skipLength;
            _useAllLengths = useAllLengths;
            _maxNumTerms = maximumNgramsCount;
            _weighting = weighting;
            _termSeparator = termSeparator;
            _freqSeparator = freqSeparator;
        }

        /// <summary> Trains and returns a <see cref="ITransformer"/>.</summary>
        public ITransformer Fit(IDataView input)
        {
            // Delegate training to the underlying word-bag building estimator, created
            // against the actual schema of the training data.
            var estimator = WordBagBuildingTransformer.CreateEstimator(_host, CreateOptions(), SchemaShape.Create(input.Schema));
            return estimator.Fit(input);
        }

        /// <summary>
        /// Translates this estimator's configuration into the options object consumed by
        /// <see cref="WordBagBuildingTransformer"/>.
        /// </summary>
        private WordBagBuildingTransformer.Options CreateOptions()
        {
            return new WordBagBuildingTransformer.Options
            {
                Columns = _columns.Select(x => new WordBagBuildingTransformer.Column { Name = x.outputColumnName, Source = x.sourceColumnsNames }).ToArray(),
                NgramLength = _ngramLength,
                SkipLength = _skipLength,
                UseAllLengths = _useAllLengths,
                // A single shared cap is applied across all n-gram levels.
                MaxNumTerms = new[] { _maxNumTerms },
                Weighting = _weighting,
                TermSeparator = _termSeparator,
                FreqSeparator = _freqSeparator,
            };
        }

        /// <summary>
        /// Schema propagation for estimators.
        /// Returns the output schema shape of the estimator, if the input schema shape is like the one provided.
        /// </summary>
        public SchemaShape GetOutputSchema(SchemaShape inputSchema)
        {
            _host.CheckValue(inputSchema, nameof(inputSchema));

            var estimator = WordBagBuildingTransformer.CreateEstimator(_host, CreateOptions(), inputSchema);
            return estimator.GetOutputSchema(inputSchema);
        }
    }
 
    /// <summary>
    /// <see cref="IEstimator{TTransformer}"/> for the <see cref="ITransformer"/>.
    /// </summary>
    /// <remarks>
    /// <format type="text/markdown"><![CDATA[
    /// ###  Estimator Characteristics
    /// |  |  |
    /// | -- | -- |
    /// | Does this estimator need to look at the data to train its parameters? | Yes |
    /// | Input column data type | Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
    /// | Output column data type | Vector of known-size of <xref:System.Single> |
    /// | Exportable to ONNX | No |
    ///
    /// The resulting <xref:Microsoft.ML.ITransformer> creates a new column, named as specified in the output column name parameters, and
    /// produces a vector of n-gram counts (sequences of n consecutive words) from a given data.
    /// It does so by hashing each n-gram and using the hash value as the index in the bag.
    ///
    /// <xref:Microsoft.ML.Transforms.Text.WordHashBagEstimator> is different from <xref:Microsoft.ML.Transforms.Text.NgramHashingEstimator>
    /// in that the former tokenizes text internally while the latter takes tokenized text as input.
    ///
    /// Check the See Also section for links to usage examples.
    /// ]]>
    /// </format>
    /// </remarks>
    /// <seealso cref="TextCatalog.ProduceHashedWordBags(TransformsCatalog.TextTransforms, string, string, int, int, int, bool, uint, bool, int)" />
    /// <seealso cref="TextCatalog.ProduceHashedWordBags(TransformsCatalog.TextTransforms, string, string[], int, int, int, bool, uint, bool, int)" />
    public sealed class WordHashBagEstimator : IEstimator<ITransformer>
    {
        private readonly IHost _host;
        private readonly (string outputColumnName, string[] inputColumnNames)[] _columns;
        private readonly int _numberOfBits;
        private readonly int _ngramLength;
        private readonly int _skipLength;
        private readonly bool _useAllLengths;
        private readonly uint _seed;
        private readonly bool _ordered;
        private readonly int _maximumNumberOfInverts;

        /// <summary>
        /// Produces a bag of counts of hashed n-grams in <paramref name="inputColumnName"/>
        /// and outputs bag of word vector as <paramref name="outputColumnName"/>.
        /// </summary>
        /// <param name="env">The environment.</param>
        /// <param name="outputColumnName">The column containing bag of word vector. Null means <paramref name="inputColumnName"/> is replaced.</param>
        /// <param name="inputColumnName">The column containing text to compute bag of word vector.</param>
        /// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
        /// <param name="ngramLength">Ngram length.</param>
        /// <param name="skipLength">Maximum number of tokens to skip when constructing an n-gram.</param>
        /// <param name="useAllLengths">Whether to include all n-gram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
        /// <param name="seed">Hashing seed.</param>
        /// <param name="useOrderedHashing">Whether the position of each source column should be included in the hash (when there are multiple source columns).</param>
        /// <param name="maximumNumberOfInverts">During hashing we construct mappings between original values and the produced hash values.
        /// Text representation of original values are stored in the slot names of the annotations for the new column. Hashing, as such, can map many initial values to one.
        /// <paramref name="maximumNumberOfInverts"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
        /// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
        internal WordHashBagEstimator(IHostEnvironment env,
            string outputColumnName,
            string inputColumnName = null,
            int numberOfBits = 16,
            int ngramLength = 1,
            int skipLength = 0,
            bool useAllLengths = true,
            uint seed = 314489979,
            bool useOrderedHashing = true,
            int maximumNumberOfInverts = 0)
            : this(env, new[] { (outputColumnName, new[] { inputColumnName ?? outputColumnName }) }, numberOfBits: numberOfBits,
                  ngramLength: ngramLength, skipLength: skipLength, useAllLengths: useAllLengths, seed: seed,
                  useOrderedHashing: useOrderedHashing, maximumNumberOfInverts: maximumNumberOfInverts)
        {
        }

        /// <summary>
        /// Produces a bag of counts of hashed n-grams in <paramref name="inputColumnNames"/>
        /// and outputs bag of word vector as <paramref name="outputColumnName"/>.
        /// </summary>
        /// <param name="env">The environment.</param>
        /// <param name="outputColumnName">The column containing output tokens.</param>
        /// <param name="inputColumnNames">The columns containing text to compute bag of word vector.</param>
        /// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
        /// <param name="ngramLength">Ngram length.</param>
        /// <param name="skipLength">Maximum number of tokens to skip when constructing an n-gram.</param>
        /// <param name="useAllLengths">Whether to include all n-gram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
        /// <param name="seed">Hashing seed.</param>
        /// <param name="useOrderedHashing">Whether the position of each source column should be included in the hash (when there are multiple source columns).</param>
        /// <param name="maximumNumberOfInverts">During hashing we construct mappings between original values and the produced hash values.
        /// Text representation of original values are stored in the slot names of the annotations for the new column. Hashing, as such, can map many initial values to one.
        /// <paramref name="maximumNumberOfInverts"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
        /// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
        internal WordHashBagEstimator(IHostEnvironment env,
            string outputColumnName,
            string[] inputColumnNames,
            int numberOfBits = 16,
            int ngramLength = 1,
            int skipLength = 0,
            bool useAllLengths = true,
            uint seed = 314489979,
            bool useOrderedHashing = true,
            int maximumNumberOfInverts = 0)
            : this(env, new[] { (outputColumnName, inputColumnNames) }, numberOfBits: numberOfBits,
                  ngramLength: ngramLength, skipLength: skipLength, useAllLengths: useAllLengths, seed: seed,
                  useOrderedHashing: useOrderedHashing, maximumNumberOfInverts: maximumNumberOfInverts)
        {
        }

        /// <summary>
        /// Produces a bag of counts of hashed n-grams for each output/input column pair in
        /// <paramref name="columns"/> and outputs one bag of word vector per pair.
        /// </summary>
        /// <param name="env">The environment.</param>
        /// <param name="columns">Pairs of columns to compute bag of word vector.</param>
        /// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
        /// <param name="ngramLength">Ngram length.</param>
        /// <param name="skipLength">Maximum number of tokens to skip when constructing an n-gram.</param>
        /// <param name="useAllLengths">Whether to include all n-gram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
        /// <param name="seed">Hashing seed.</param>
        /// <param name="useOrderedHashing">Whether the position of each source column should be included in the hash (when there are multiple source columns).</param>
        /// <param name="maximumNumberOfInverts">During hashing we construct mappings between original values and the produced hash values.
        /// Text representation of original values are stored in the slot names of the annotations for the new column. Hashing, as such, can map many initial values to one.
        /// <paramref name="maximumNumberOfInverts"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
        /// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
        internal WordHashBagEstimator(IHostEnvironment env,
            (string outputColumnName, string[] inputColumnNames)[] columns,
            int numberOfBits = 16,
            int ngramLength = 1,
            int skipLength = 0,
            bool useAllLengths = true,
            uint seed = 314489979,
            bool useOrderedHashing = true,
            int maximumNumberOfInverts = 0)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register(nameof(WordHashBagEstimator));

            // Validate each mapping: a non-null output column name and at least one source column.
            // (Previously the deconstruction named the output column "input" and only checked the
            // output name's length, never that the input column array was non-empty; this aligns
            // the validation with WordBagEstimator.)
            foreach (var (outputColumnName, inputColumnNames) in columns)
            {
                _host.CheckValue(outputColumnName, nameof(columns));
                _host.CheckUserArg(Utils.Size(inputColumnNames) > 0, nameof(columns));
            }

            _columns = columns;
            _numberOfBits = numberOfBits;
            _ngramLength = ngramLength;
            _skipLength = skipLength;
            _useAllLengths = useAllLengths;
            _seed = seed;
            _ordered = useOrderedHashing;
            _maximumNumberOfInverts = maximumNumberOfInverts;
        }

        /// <summary> Trains and returns a <see cref="ITransformer"/>.</summary>
        public ITransformer Fit(IDataView input)
        {
            // Translate this estimator's configuration into the transformer's options object.
            var options = new WordHashBagProducingTransformer.Options
            {
                Columns = _columns.Select(x => new WordHashBagProducingTransformer.Column { Name = x.outputColumnName, Source = x.inputColumnNames }).ToArray(),
                NumberOfBits = _numberOfBits,
                NgramLength = _ngramLength,
                SkipLength = _skipLength,
                UseAllLengths = _useAllLengths,
                Seed = _seed,
                Ordered = _ordered,
                MaximumNumberOfInverts = _maximumNumberOfInverts
            };

            return WordHashBagProducingTransformer.CreateTransformer(_host, options, input);
        }

        /// <summary>
        /// Schema propagation for estimators.
        /// Returns the output schema shape of the estimator, if the input schema shape is like the one provided.
        /// </summary>
        public SchemaShape GetOutputSchema(SchemaShape inputSchema)
        {
            _host.CheckValue(inputSchema, nameof(inputSchema));

            // No dedicated schema-propagation path exists for this estimator, so fit against an
            // empty data view with a fake schema shaped like the input and read the result's schema.
            var fakeSchema = FakeSchemaFactory.Create(inputSchema);
            var transformer = Fit(new EmptyDataView(_host, fakeSchema));
            return SchemaShape.Create(transformer.GetOutputSchema(fakeSchema));
        }
    }
}