File: API\ColumnInference.cs
Web Access
Project: src\src\Microsoft.ML.AutoML\Microsoft.ML.AutoML.csproj (Microsoft.ML.AutoML)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System.Collections.Generic;
using System.Collections.ObjectModel;
using Microsoft.ML.Data;
using Newtonsoft.Json;
 
namespace Microsoft.ML.AutoML
{
    /// <summary>
    /// Contains information AutoML inferred about columns in a dataset.
    /// </summary>
    public sealed class ColumnInferenceResults
    {
        /// <summary>
        /// Inferred <see cref="TextLoader.Options" /> for the dataset.
        /// </summary>
        /// <remarks>
        /// Can be used to instantiate a new <see cref="TextLoader" /> to load
        /// data into an <see cref="IDataView" />.
        /// </remarks>
        [JsonProperty(DefaultValueHandling = DefaultValueHandling.Include)]
        public TextLoader.Options TextLoaderOptions { get; internal set; }
 
        /// <summary>
        /// Information about the inferred columns in the dataset.
        /// </summary>
        /// <remarks>
        /// <para>Contains the inferred purposes of each column. See <see cref="AutoML.ColumnInformation"/> for more details.</para>
        /// <para>This can be fed to the AutoML API when running an experiment.
        /// See <typeref cref="ExperimentBase{TMetrics, TExperimentSettings}.Execute(IDataView, ColumnInformation, IEstimator{ITransformer}, System.IProgress{RunDetail{TMetrics}})" />
        /// for example.</para>
        /// </remarks>
        [JsonProperty(DefaultValueHandling = DefaultValueHandling.Include)]
        public ColumnInformation ColumnInformation { get; internal set; }
    }
 
    /// <summary>
    /// Information about the columns in a dataset.
    /// </summary>
    /// <remarks>
    /// <para>Contains information about the purpose of each column in the dataset. For instance,
    /// it enumerates the dataset columns that AutoML should treat as categorical,
    /// the columns AutoML should ignore, which column is the label, etc.</para>
    /// <para><see cref="ColumnInformation"/> can be fed to the AutoML API when running an experiment.
    /// See <typeref cref="ExperimentBase{TMetrics, TExperimentSettings}.Execute(IDataView, ColumnInformation, IEstimator{ITransformer}, System.IProgress{RunDetail{TMetrics}})" />
    /// for example.</para>
    /// </remarks>
    public sealed class ColumnInformation
    {
        /// <summary>
        /// The dataset column to use as the label.
        /// </summary>
        /// <value>The default value is "Label".</value>
        public string LabelColumnName { get; set; }
 
        /// <summary>
        /// The dataset column to use as a user ID for computation.
        /// </summary>
        public string UserIdColumnName { get; set; }
 
        /// <summary>
        /// The dataset column to use as a group ID for computation in a Ranking Task.
        /// If a SamplingKeyColumnName is provided, then it should be the same as this column.
        /// </summary>
        public string GroupIdColumnName { get; set; }
 
        /// <summary>
        /// The dataset column to use as a item ID for computation.
        /// </summary>
        public string ItemIdColumnName { get; set; }
 
        /// <summary>
        /// The dataset column to use for example weight.
        /// </summary>
        public string ExampleWeightColumnName { get; set; }
 
        /// <summary>
        /// The dataset column to use for grouping rows.
        /// If two examples share the same sampling key column name,
        /// they are guaranteed to appear in the same subset (train or test).
        /// This can be used to ensure no label leakage from the train to the test set.
        /// If <see langword="null"/>, no row grouping will be performed.
        /// </summary>
        public string SamplingKeyColumnName { get; set; }
 
        /// <summary>
        /// The dataset columns that are categorical.
        /// </summary>
        /// <value>The default value is a new, empty <see cref="Collection{String}"/>.</value>
        /// <remarks>
        /// Categorical data columns should generally be columns that contain a small number of unique values.
        /// </remarks>
        [JsonProperty]
        public ICollection<string> CategoricalColumnNames { get; private set; }
 
        /// <summary>
        /// The dataset columns that are numeric.
        /// </summary>
        /// <value>The default value is a new, empty <see cref="Collection{String}"/>.</value>
        [JsonProperty]
        public ICollection<string> NumericColumnNames { get; private set; }
 
        /// <summary>
        /// The dataset columns that are text.
        /// </summary>
        /// <value>The default value is a new, empty <see cref="Collection{String}"/>.</value>
        [JsonProperty]
        public ICollection<string> TextColumnNames { get; private set; }
 
        /// <summary>
        /// The dataset columns that AutoML should ignore.
        /// </summary>
        /// <value>The default value is a new, empty <see cref="Collection{String}"/>.</value>
        [JsonProperty]
        public ICollection<string> IgnoredColumnNames { get; private set; }
 
        /// <summary>
        /// The dataset columns that are image paths.
        /// </summary>
        /// <value>The default value is a new, empty <see cref="Collection{String}"/>.</value>
        [JsonProperty]
        public ICollection<string> ImagePathColumnNames { get; private set; }
 
        public ColumnInformation()
        {
            LabelColumnName = DefaultColumnNames.Label;
            CategoricalColumnNames = new Collection<string>();
            NumericColumnNames = new Collection<string>();
            TextColumnNames = new Collection<string>();
            IgnoredColumnNames = new Collection<string>();
            ImagePathColumnNames = new Collection<string>();
        }
    }
}