File: Tuner\PipelineProposer.cs
Web Access
Project: src\src\Microsoft.ML.AutoML\Microsoft.ML.AutoML.csproj (Microsoft.ML.AutoML)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Microsoft.ML.AutoML.CodeGen;
using Microsoft.ML.SearchSpace.Option;
using Newtonsoft.Json;
using Tensorflow;
using static Microsoft.ML.AutoML.AutoMLExperiment;
 
namespace Microsoft.ML.AutoML
{
    /// <summary>
    /// propose sweepable estimator pipeline from a group of candidates using eci in flaml (https://arxiv.org/abs/1911.04706)
    /// </summary>
    internal class PipelineProposer
    {
        private readonly Dictionary<EstimatorType, double> _estimatorCost;
        private readonly Dictionary<string, double> _learnerInitialCost;
 
        // total time spent on last best error for each learner.
        private Dictionary<string, double> _k1;
 
        // total time spent on second last best error for each learner.
        private Dictionary<string, double> _k2;
 
        // last best error for each learner.
        private Dictionary<string, double> _e1;
 
        // second last best error for each learner.
        private Dictionary<string, double> _e2;
 
        // flaml ECI
        private readonly Dictionary<string, double> _eci;
        private double _globalBestError;
 
        private readonly Random _rand;
        private readonly SweepablePipeline _sweepablePipeline;
        private readonly string[] _pipelineSchemas;
        private bool _initialized = false;
 
        public PipelineProposer(SweepablePipeline sweepablePipeline, AutoMLExperimentSettings settings)
        {
            // this cost is used to initialize eci when started, the smaller the number, the less cost this trainer will use at start, and more likely it will be
            // picked.
            _estimatorCost = new Dictionary<EstimatorType, double>()
            {
                { EstimatorType.LightGbmRegression, 0.788 },
                { EstimatorType.FastTreeRegression, 0.382 },
                { EstimatorType.FastForestRegression, 0.374 },
                { EstimatorType.SdcaRegression, 0.566 },
                { EstimatorType.FastTreeTweedieRegression, 0.401 },
                { EstimatorType.LbfgsPoissonRegressionRegression, 4.73 },
                { EstimatorType.FastForestOva, 4.283 },
                { EstimatorType.FastTreeOva, 3.701 },
                { EstimatorType.LightGbmMulti, 4.765 },
                { EstimatorType.SdcaMaximumEntropyMulti, 10.129 },
                { EstimatorType.SdcaLogisticRegressionOva, 13.16 },
                { EstimatorType.LbfgsMaximumEntropyMulti, 7.980 },
                { EstimatorType.LbfgsLogisticRegressionOva, 11.513 },
                { EstimatorType.LightGbmBinary, 4.765 },
                { EstimatorType.FastTreeBinary, 3.701 },
                { EstimatorType.FastForestBinary, 4.283 },
                { EstimatorType.SdcaLogisticRegressionBinary, 13.16 },
                { EstimatorType.LbfgsLogisticRegressionBinary, 11.513 },
                { EstimatorType.ForecastBySsa, 1 },
                { EstimatorType.ImageClassificationMulti, 1 },
                { EstimatorType.MatrixFactorization, 1 },
            };
 
            _sweepablePipeline = sweepablePipeline;
 
            if (settings.Seed.HasValue)
            {
                _rand = new Random(settings.Seed.Value);
            }
            else
            {
                _rand = new Random();
            }
 
            _pipelineSchemas = _sweepablePipeline.Schema.ToTerms().Select(t => t.ToString()).ToArray();
            _learnerInitialCost = _pipelineSchemas.ToDictionary(kv => kv, kv => GetEstimatedCostForPipeline(kv, _sweepablePipeline));
            // initialize eci with the estimated cost and always start from pipeline which has lowest cost.
            _eci = _pipelineSchemas.ToDictionary(kv => kv, kv => GetEstimatedCostForPipeline(kv, _sweepablePipeline));
        }
 
        public string ProposeSearchSpace()
        {
            string schema;
            if (!_initialized)
            {
                schema = _eci.OrderBy(kv => kv.Value).First().Key;
                _initialized = true;
            }
            else
            {
                var probabilities = _pipelineSchemas.Select(id => _eci[id]).ToArray();
                probabilities = ArrayMath.Inverse(probabilities);
                // _eci (estimator improve cost) might be infinity, which means the estimator cost for finding an improvement is positive infinity
                // in that case, we used to set the probability to be a very small number, in some cases it can be zero ( 1 / double.infinity), so that it will be very unlikely to be picked.
                // however, there's a special situation where all the estimators have infinity eci
                // which could happen when all the estimators have been tried, all retrieves perfect loss and no improvement can be made.
                // in which case, all probablities will be zero and in that case, we will never be able to pick any of them.
                // Therefore, we need to make sure non of the probabilities is zero, and we can do that by adding a very small number (double.epsilon) to each of them after inverse.
                probabilities = probabilities.Select(p => p + double.Epsilon).ToArray();
                probabilities = ArrayMath.Normalize(probabilities);
 
                // sample
                var randdouble = _rand.NextDouble();
                var sum = 0.0;
                // selected pipeline id index
                int i;
 
                for (i = 0; i != _pipelineSchemas.Length; ++i)
                {
                    sum += probabilities[i];
                    if (sum > randdouble)
                    {
                        break;
                    }
                }
 
                schema = _pipelineSchemas[i];
            }
 
            return schema;
        }
 
        public void Update(TrialResult result, string schema)
        {
            var loss = result.Loss;
            var duration = result.DurationInMilliseconds / 1000;
            var isSuccess = duration != 0 && !double.IsNaN(loss) && !double.IsInfinity(loss);
 
            // if k1 is null, it means this is the first completed trial.
            // in that case, initialize k1, k2, e1, e2 in the following way:
            // k1: for every learner, k1[l] = c * duration where c is a ratio defined in learnerInitialCost
            // k2: k2 = k1, which indicates the hypothesis that it costs the same time for learners to reach the next break through.
            // e1: current error
            // e2: e1 + 1.05 * |e1|
 
            if (isSuccess)
            {
                if (_k1 == null)
                {
                    _k1 = _pipelineSchemas.ToDictionary(id => id, id => duration * _learnerInitialCost[id] / _learnerInitialCost[schema]);
                    _k2 = _k1.ToDictionary(kv => kv.Key, kv => kv.Value);
                    _e1 = _pipelineSchemas.ToDictionary(id => id, id => loss);
                    _e2 = _pipelineSchemas.ToDictionary(id => id, id => loss + 0.05 * Math.Abs(loss));
                    _globalBestError = loss;
                }
                else if (loss >= _e1[schema])
                {
                    // if error is larger than current best error, which means there's no improvements for
                    // the last trial with the current learner.
                    // In that case, simply increase the total spent time since the last best error for that learner.
                    _k1[schema] += duration;
                }
                else
                {
                    // there's an improvement.
                    // k2 <= k1 && e2 <= e1, and update k1, e2.
                    _k2[schema] = _k1[schema];
                    _k1[schema] = duration;
                    _e2[schema] = _e1[schema];
                    _e1[schema] = loss;
 
                    // update global best error as well
                    if (loss < _globalBestError)
                    {
                        _globalBestError = loss;
                    }
                }
 
                // update eci
                var eci1 = Math.Max(_k1[schema], _k2[schema]);
                var estimatorCostForBreakThrough = (2 * (loss - _globalBestError) + double.Epsilon) / ((_e2[schema] - _e1[schema]) / (_k2[schema] + _k1[schema]) + double.Epsilon);
                _eci[schema] = Math.Max(eci1, estimatorCostForBreakThrough);
            }
            else
            {
                // double eci of current trial twice of maxium ecis.
                _eci[schema] = _eci.Select(kv => kv.Value).Max() * 2;
            }
 
            _initialized = true;
 
            return;
        }
 
        private double GetEstimatedCostForPipeline(string schema, SweepablePipeline pipeline)
        {
            var entity = Entity.FromExpression(schema);
 
            var estimatorTypes = entity.ValueEntities().Where(v => v is StringEntity s && s.Value != "Nil")
                                         .Select(v =>
                                         {
                                             var s = v as StringEntity;
                                             var estimator = pipeline.Estimators[s.Value];
                                             return estimator.EstimatorType;
                                         });
 
            var res = 1;
 
            foreach (var estimatorType in estimatorTypes)
            {
                if (_estimatorCost.ContainsKey(estimatorType))
                {
                    return _estimatorCost[estimatorType];
                }
            }
 
            return res;
        }
 
        public class Status
        {
            public Dictionary<string, double> K1 { get; set; }
 
            public Dictionary<string, double> K2 { get; set; }
 
            public Dictionary<string, double> E1 { get; set; }
 
            public Dictionary<string, double> E2 { get; set; }
 
            public Dictionary<string, double> Eci { get; set; }
 
            public double GlobalBestError { get; set; }
        }
    }
}