File: API\AutoMLExperimentExtension.cs
Project: src\src\Microsoft.ML.AutoML\Microsoft.ML.AutoML.csproj (Microsoft.ML.AutoML)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.IO;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.ML.Runtime;
using Microsoft.ML.SearchSpace.Option;
using static Microsoft.ML.DataOperationsCatalog;
 
namespace Microsoft.ML.AutoML
{
    public static class AutoMLExperimentExtension
    {
        /// <summary>
        /// Set the train and validation datasets for <see cref="AutoMLExperiment"/>. This makes <see cref="AutoMLExperiment"/> use <paramref name="train"/>
        /// to train a model, and <paramref name="validation"/> to evaluate it.
        /// </summary>
        /// <param name="experiment"><see cref="AutoMLExperiment"/></param>
        /// <param name="train">dataset for training a model.</param>
        /// <param name="validation">dataset for validating a model during training.</param>
        /// <param name="subSamplingTrainDataset">determine if subsampling <paramref name="train"/> to train. This will be useful if <paramref name="train"/> is too large to be held in memory.</param>
        /// <returns><see cref="AutoMLExperiment"/></returns>
        public static AutoMLExperiment SetDataset(this AutoMLExperiment experiment, IDataView train, IDataView validation, bool subSamplingTrainDataset = false)
        {
            var datasetManager = new TrainValidateDatasetManager(train, validation);
 
            if (subSamplingTrainDataset)
            {
                var searchSpace = new SearchSpace.SearchSpace();
                searchSpace.Add(datasetManager.SubSamplingKey, new UniformSingleOption(0, 1, false, 0.1f));
                experiment.AddSearchSpace(nameof(TrainValidateDatasetManager), searchSpace);
            }
 
            experiment.ServiceCollection.AddSingleton<IDatasetManager>(datasetManager);
            experiment.ServiceCollection.AddSingleton(datasetManager);
 
            return experiment;
        }
 
        /// <summary>
        /// Set the train and validation datasets for <see cref="AutoMLExperiment"/>. This makes <see cref="AutoMLExperiment"/> use <see cref="TrainTestData.TrainSet"/> from <paramref name="trainValidationSplit"/>
        /// to train a model, and <see cref="TrainTestData.TestSet"/> from <paramref name="trainValidationSplit"/> to evaluate it.
        /// </summary>
        /// <param name="experiment"><see cref="AutoMLExperiment"/></param>
        /// <param name="trainValidationSplit">a <see cref="TrainTestData"/> for train and validation.</param>
        /// <returns><see cref="AutoMLExperiment"/></returns>
        public static AutoMLExperiment SetDataset(this AutoMLExperiment experiment, TrainTestData trainValidationSplit)
        {
            return experiment.SetDataset(trainValidationSplit.TrainSet, trainValidationSplit.TestSet);
        }
 
        /// <summary>
        /// Set the cross-validation dataset for <see cref="AutoMLExperiment"/>. This makes <see cref="AutoMLExperiment"/> use an n-fold (n = <paramref name="fold"/>) cross-validation split of <paramref name="dataset"/>
        /// to train and evaluate a model.
        /// </summary>
        /// <param name="experiment"><see cref="AutoMLExperiment"/></param>
        /// <param name="dataset">dataset for cross-validation split.</param>
        /// <param name="fold">number of cross-validation folds</param>
        /// <param name="samplingKeyColumnName">column name for sampling key</param>
        /// <returns><see cref="AutoMLExperiment"/></returns>
        public static AutoMLExperiment SetDataset(this AutoMLExperiment experiment, IDataView dataset, int fold = 10, string samplingKeyColumnName = null)
        {
            var datasetManager = new CrossValidateDatasetManager(dataset, fold, samplingKeyColumnName);
            experiment.ServiceCollection.AddSingleton<IDatasetManager>(datasetManager);
            experiment.ServiceCollection.AddSingleton(datasetManager);
 
            return experiment;
        }
 
        /// <summary>
        /// Set <see cref="BinaryMetricManager"/> as evaluation manager for <see cref="AutoMLExperiment"/>. This will make
        /// <see cref="AutoMLExperiment"/> uses <paramref name="metric"/> as evaluation metric.
        /// </summary>
        /// <param name="experiment"><see cref="AutoMLExperiment"/></param>
        /// <param name="metric">evaluation metric.</param>
        /// <param name="labelColumn">label column.</param>
        /// <param name="predictedColumn">predicted column.</param>
        /// <returns><see cref="AutoMLExperiment"/></returns>
        public static AutoMLExperiment SetBinaryClassificationMetric(this AutoMLExperiment experiment, BinaryClassificationMetric metric, string labelColumn = "label", string predictedColumn = "PredictedLabel")
        {
            var metricManager = new BinaryMetricManager(metric, labelColumn, predictedColumn);
            return experiment.SetEvaluateMetric(metricManager);
        }
 
        /// <summary>
        /// Set <see cref="MultiClassMetricManager"/> as evaluation manager for <see cref="AutoMLExperiment"/>. This will make
        /// <see cref="AutoMLExperiment"/> uses <paramref name="metric"/> as evaluation metric.
        /// </summary>
        /// <param name="experiment"><see cref="AutoMLExperiment"/></param>
        /// <param name="metric">evaluation metric.</param>
        /// <param name="labelColumn">label column.</param>
        /// <param name="predictedColumn">predicted column.</param>
        /// <returns><see cref="AutoMLExperiment"/></returns>
        public static AutoMLExperiment SetMulticlassClassificationMetric(this AutoMLExperiment experiment, MulticlassClassificationMetric metric, string labelColumn = "label", string predictedColumn = "PredictedLabel")
        {
            var metricManager = new MultiClassMetricManager()
            {
                Metric = metric,
                PredictedColumn = predictedColumn,
                LabelColumn = labelColumn,
            };
 
            return experiment.SetEvaluateMetric(metricManager);
        }
 
        /// <summary>
        /// Set <see cref="RegressionMetricManager"/> as evaluation manager for <see cref="AutoMLExperiment"/>. This will make
        /// <see cref="AutoMLExperiment"/> uses <paramref name="metric"/> as evaluation metric.
        /// </summary>
        /// <param name="experiment"><see cref="AutoMLExperiment"/></param>
        /// <param name="metric">evaluation metric.</param>
        /// <param name="labelColumn">label column.</param>
        /// <param name="scoreColumn">score column.</param>
        /// <returns><see cref="AutoMLExperiment"/></returns>
        public static AutoMLExperiment SetRegressionMetric(this AutoMLExperiment experiment, RegressionMetric metric, string labelColumn = "Label", string scoreColumn = "Score")
        {
            var metricManager = new RegressionMetricManager()
            {
                Metric = metric,
                ScoreColumn = scoreColumn,
                LabelColumn = labelColumn,
            };
 
            return experiment.SetEvaluateMetric(metricManager);
        }
 
        /// <summary>
        /// Set <paramref name="pipeline"/> for training. This also make <see cref="AutoMLExperiment"/> uses <see cref="SweepablePipelineRunner"/>
        /// , <see cref="MLContextMonitor"/> and <see cref="EciCostFrugalTuner"/> for automl traininng as well.
        /// </summary>
        /// <param name="experiment"><see cref="AutoMLExperiment"/></param>
        /// <param name="pipeline"><see cref="SweepablePipeline"/></param>
        /// <returns><see cref="AutoMLExperiment"/></returns>
        public static AutoMLExperiment SetPipeline(this AutoMLExperiment experiment, SweepablePipeline pipeline)
        {
            experiment.AddSearchSpace(AutoMLExperiment.PipelineSearchspaceName, pipeline.SearchSpace);
            experiment.ServiceCollection.AddSingleton(pipeline);
 
            experiment.SetTrialRunner<SweepablePipelineRunner>();
            experiment.SetMonitor<MLContextMonitor>();
            experiment.SetTuner<EciCostFrugalTuner>();
 
            return experiment;
        }
 
        /// <summary>
        /// Set <see cref="DefaultPerformanceMonitor"/> as <see cref="IPerformanceMonitor"/> for <see cref="AutoMLExperiment"/>.
        /// </summary>
        /// <param name="experiment"><see cref="AutoMLExperiment"/></param>
        /// <param name="checkIntervalInMilliseconds">the interval in milliseconds for <see cref="DefaultPerformanceMonitor"/> to sample <see cref="TrialPerformanceMetrics"/></param>
        /// <returns></returns>
        public static AutoMLExperiment SetPerformanceMonitor(this AutoMLExperiment experiment, int checkIntervalInMilliseconds = 1000)
        {
            experiment.SetPerformanceMonitor((service) =>
            {
                var channel = service.GetService<IChannel>();
                var settings = service.GetRequiredService<AutoMLExperiment.AutoMLExperimentSettings>();
                return new DefaultPerformanceMonitor(settings, channel, checkIntervalInMilliseconds);
            });
 
            return experiment;
        }
 
        /// <summary>
        /// Set a custom performance monitor as <see cref="IPerformanceMonitor"/> for <see cref="AutoMLExperiment"/>.
        /// </summary>
        /// <typeparam name="TPerformanceMonitor"></typeparam>
        /// <param name="experiment"><see cref="AutoMLExperiment"/></param>
        /// <param name="factory"></param>
        /// <returns></returns>
        public static AutoMLExperiment SetPerformanceMonitor<TPerformanceMonitor>(this AutoMLExperiment experiment, Func<IServiceProvider, TPerformanceMonitor> factory)
            where TPerformanceMonitor : class, IPerformanceMonitor
        {
            experiment.ServiceCollection.AddTransient<IPerformanceMonitor>(factory);
 
            return experiment;
        }
 
        /// <summary>
        /// Set a custom performance monitor as <see cref="IPerformanceMonitor"/> for <see cref="AutoMLExperiment"/>.
        /// </summary>
        /// <typeparam name="TPerformanceMonitor"></typeparam>
        /// <param name="experiment"><see cref="AutoMLExperiment"/></param>
        /// <returns></returns>
        public static AutoMLExperiment SetPerformanceMonitor<TPerformanceMonitor>(this AutoMLExperiment experiment)
            where TPerformanceMonitor : class, IPerformanceMonitor
        {
            experiment.ServiceCollection.AddTransient<IPerformanceMonitor, TPerformanceMonitor>();
 
            return experiment;
        }
 
        /// <summary>
        /// Set <see cref="SmacTuner"/> as tuner for hyper-parameter optimization. The performance of smac is in a large extend determined 
        /// by <paramref name="numberOfTrees"/>, <paramref name="nMinForSpit"/> and <paramref name="splitRatio"/>, which are used to fit smac's inner 
        /// regressor.
        /// </summary>
        /// <param name="experiment"><see cref="AutoMLExperiment"/></param>
        /// <param name="numberOfTrees">number of regression trees when fitting random forest.</param>
        /// <param name="fitModelEveryNTrials">re-fit random forests in smac for every N trials.</param>
        /// <param name="numberInitialPopulation">Number of points to use for random initialization.</param>
        /// <param name="splitRatio">split ratio for fitting random forest in smac.</param>
        /// <param name="nMinForSpit">minimum number of data points required to be in a node if it is to be split further for fitting random forest in smac.</param>
        /// <param name="localSearchParentCount">Number of search parents to use for local search in maximizing EI acquisition function.</param>
        /// <param name="numRandomEISearchConfigurations">Number of random configurations when maximizing EI acquisition function.</param>
        /// <param name="numNeighboursForNumericalParams">Number of neighbours to sample from when applying one-step mutation for generating new parameters.</param>
        /// <param name="epsilon">the threshold to exit during maximizing EI acquisition function.</param>
        /// <returns></returns>
        public static AutoMLExperiment SetSmacTuner(
            this AutoMLExperiment experiment,
            int numberInitialPopulation = 20,
            int fitModelEveryNTrials = 10,
            int numberOfTrees = 10,
            int nMinForSpit = 2,
            float splitRatio = 0.8f,
            int localSearchParentCount = 5,
            int numRandomEISearchConfigurations = 5000,
            double epsilon = 1e-5,
            int numNeighboursForNumericalParams = 4)
        {
            experiment.SetTuner((service) =>
            {
                var channel = service.GetRequiredService<IChannel>();
                var settings = service.GetRequiredService<AutoMLExperiment.AutoMLExperimentSettings>();
                var context = service.GetRequiredService<MLContext>();
                var smac = new SmacTuner(context, settings.SearchSpace, numberInitialPopulation, fitModelEveryNTrials, numberOfTrees, nMinForSpit, splitRatio, localSearchParentCount, numRandomEISearchConfigurations, epsilon, numNeighboursForNumericalParams, settings.Seed, channel);
 
                return smac;
            });
 
            return experiment;
        }
 
        /// <summary>
        /// Set <see cref="CostFrugalTuner"/> as tuner for hyper-parameter optimization.
        /// </summary>
        /// <param name="experiment"></param>
        /// <returns></returns>
        public static AutoMLExperiment SetCostFrugalTuner(this AutoMLExperiment experiment)
        {
            experiment.SetTuner((service) =>
            {
                var settings = service.GetRequiredService<AutoMLExperiment.AutoMLExperimentSettings>();
                var cfo = new CostFrugalTuner(settings);
 
                return cfo;
            });
 
            return experiment;
        }
 
        /// <summary>
        /// set <see cref="RandomSearchTuner"/> as tuner for hyper parameter optimization. If <paramref name="seed"/> is provided, it will use that 
        /// seed to initialize <see cref="RandomSearchTuner"/>. Otherwise, <see cref="AutoMLExperiment.AutoMLExperimentSettings.Seed"/> will be used.
        /// </summary>
        /// <param name="seed"></param>
        /// <param name="experiment"><see cref="AutoMLExperiment"/></param>
        public static AutoMLExperiment SetRandomSearchTuner(this AutoMLExperiment experiment, int? seed = null)
        {
            experiment.SetTuner((service) =>
            {
                var settings = service.GetRequiredService<AutoMLExperiment.AutoMLExperimentSettings>();
                seed = seed ?? settings.Seed;
                var tuner = new RandomSearchTuner(settings.SearchSpace, seed);
 
                return tuner;
            });
 
            return experiment;
        }
 
        /// <summary>
        /// set <see cref="GridSearchTuner"/> as tuner for hyper parameter optimization.
        /// </summary>
        /// <param name="step">step size for numeric option.</param>
        /// <param name="experiment"><see cref="AutoMLExperiment"/></param>
        public static AutoMLExperiment SetGridSearchTuner(this AutoMLExperiment experiment, int step = 10)
        {
            experiment.SetTuner((service) =>
            {
                var settings = service.GetRequiredService<AutoMLExperiment.AutoMLExperimentSettings>();
                var tuner = new GridSearchTuner(settings.SearchSpace, step);
 
                return tuner;
            });
 
            return experiment;
        }
 
        /// <summary>
        /// Set the checkpoint folder for <see cref="AutoMLExperiment"/>. The checkpoint folder is used to save
        /// temporary output, run history and other state needed to restore the training process
        /// from the last checkpoint and continue training.
        /// </summary>
        /// <param name="experiment"><see cref="AutoMLExperiment"/>.</param>
        /// <param name="folder">checkpoint folder. This folder will be created if it doesn't exist.</param>
        /// <returns><see cref="AutoMLExperiment"/></returns>
        public static AutoMLExperiment SetCheckpoint(this AutoMLExperiment experiment, string folder)
        {
            if (!Directory.Exists(folder))
            {
                Directory.CreateDirectory(folder);
            }
 
            experiment.ServiceCollection.AddSingleton<ITrialResultManager>(serviceProvider =>
            {
                var channel = serviceProvider.GetRequiredService<IChannel>();
                var settings = serviceProvider.GetRequiredService<AutoMLExperiment.AutoMLExperimentSettings>();
 
                // todo
                // pull out the logic of calculating experiment id into a stand-alone service.
                var metricManager = serviceProvider.GetService<IMetricManager>();
                var csvFileName = "trialResults";
                csvFileName += $"-{settings.SearchSpace.GetHashCode()}";
                if (metricManager != null)
                {
                    csvFileName += $"-{metricManager.MetricName}";
                }
                csvFileName += ".csv";
 
                var csvFilePath = Path.Combine(folder, csvFileName);
                var trialResultManager = new CsvTrialResultManager(csvFilePath, settings.SearchSpace, channel);
 
                return trialResultManager;
            });
 
            return experiment;
        }
 
        /// <summary>
        /// set <see cref="EciCostFrugalTuner"/> as tuner for hyper-parameter optimization. This tuner only works with search space from <see cref="SweepablePipeline"/>.
        /// </summary>
        /// <param name="experiment"></param>
        /// <returns></returns>
        public static AutoMLExperiment SetEciCostFrugalTuner(this AutoMLExperiment experiment)
        {
            experiment.SetTuner<EciCostFrugalTuner>();
 
            return experiment;
        }
 
        private static AutoMLExperiment SetEvaluateMetric<TEvaluateMetricManager>(this AutoMLExperiment experiment, TEvaluateMetricManager metricManager)
            where TEvaluateMetricManager : class, IEvaluateMetricManager
        {
            experiment.ServiceCollection.AddSingleton<IMetricManager>(metricManager);
            experiment.ServiceCollection.AddSingleton<IEvaluateMetricManager>(metricManager);
 
            return experiment;
        }
    }
}