File: Selector\SubModelSelector\BaseSubModelSelector.cs
Project: src\Microsoft.ML.Ensemble\Microsoft.ML.Ensemble.csproj (Microsoft.ML.Ensemble)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.Runtime;
 
namespace Microsoft.ML.Trainers.Ensemble
{
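    // Base class for ensemble sub-model selectors. It provides the shared plumbing for scoring each
    // candidate model on held-out data, evaluating the scores with the evaluator matching the
    // prediction kind, and recording the resulting metrics on the model. Derived selectors supply
    // ValidationDatasetProportion and PredictionKind; Prune can be overridden to choose the final
    // set of models.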
    internal abstract class BaseSubModelSelector<TOutput> : ISubModelSelector<TOutput>
    {
        protected readonly IHost Host;
 
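        // The proportion of instances reserved as a validation set when evaluating sub-models;
        // defined by the derived selector.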
        public abstract Single ValidationDatasetProportion { get; }
 
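        // The prediction task targeted by the derived selector; determines which evaluator and
        // column roles are used when computing metrics.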
        protected abstract PredictionKind PredictionKind { get; }
 
        protected BaseSubModelSelector(IHostEnvironment env, string name)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckNonWhiteSpace(name, nameof(name));
            Host = env.Register(name);
        }
 
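        // Prints each model and the value of the given metric to the channel, assuming the models
        // are already sorted by that metric. Throws if a model's metrics do not include the
        // requested metric.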
        protected void Print(IChannel ch, IList<FeatureSubsetModel<TOutput>> models, string metricName)
        {
            // REVIEW: The output format was faithfully reproduced from the original format, but it's unclear
            // to me that this is right. Why have two bars in the header line, but only one bar in the results?
            ch.Info("List of models and the metrics after sorted");
            ch.Info("| {0}(Sorted) || Name of Model |", metricName);
            foreach (var model in models)
            {
                var metric = 0.0;
                var found = false;
                foreach (var kvp in model.Metrics)
                {
                    if (kvp.Key == metricName)
                    {
                        metric = kvp.Value;
                        found = true;
                    }
                }
                if (!found)
                    throw ch.Except("Metrics did not contain the requested metric '{0}'", metricName);
                ch.Info("| {0} |{1}", metric, model.Predictor.GetType().Name);
            }
        }
 
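        // By default all candidate models are kept; derived selectors override this to filter them.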
        public virtual IList<FeatureSubsetModel<TOutput>> Prune(IList<FeatureSubsetModel<TOutput>> models)
        {
            return models;
        }
 
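        // Returns the evaluator matching this selector's prediction kind (binary classification,
        // regression, or multiclass classification); any other kind is an error.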
        private IEvaluator GetEvaluator(IHostEnvironment env)
        {
            switch (PredictionKind)
            {
                case PredictionKind.BinaryClassification:
                    return new BinaryClassifierEvaluator(env, new BinaryClassifierEvaluator.Arguments());
                case PredictionKind.Regression:
                    return new RegressionEvaluator(env, new RegressionEvaluator.Arguments());
                case PredictionKind.MulticlassClassification:
                    return new MulticlassClassificationEvaluator(env, new MulticlassClassificationEvaluator.Arguments());
                default:
                    throw Host.Except("Unrecognized prediction kind '{0}'", PredictionKind);
            }
        }
 
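        // Scores the model's predictor on the test data for the given subset and batch, evaluates
        // the scored data, and caches the overall metrics on the model. Does nothing when metrics
        // are not needed or have already been computed.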
        public virtual void CalculateMetrics(FeatureSubsetModel<TOutput> model,
            ISubsetSelector subsetSelector, Subset subset, Batch batch, bool needMetrics)
        {
            if (!needMetrics || model == null || model.Metrics != null)
                return;
 
            using (var ch = Host.Start("Calculate metrics"))
            {
                RoleMappedData testData = subsetSelector.GetTestData(subset, batch);
                // Because the training and test datasets are drawn from the same base dataset, the test data role mappings
                // are the same as for the train data.
                IDataScorerTransform scorePipe = ScoreUtils.GetScorer(model.Predictor, testData, Host, testData.Schema);
                RoleMappedData scoredTestData = new RoleMappedData(scorePipe,
                    GetColumnRoles(testData.Schema, scorePipe.Schema));
                // REVIEW: Should we somehow allow the user to customize the evaluator?
                // By what mechanism should we allow that?
                IEvaluator evaluator = GetEvaluator(Host);
                // REVIEW: with the new evaluators, metrics of individual models are no longer
                // printed to the Console. Consider adding an option on the combiner to print them.
                // REVIEW: Consider adding an option to the combiner to save a data view
                // containing all the results of the individual models.
                var metricsDict = evaluator.Evaluate(scoredTestData);
                if (!metricsDict.TryGetValue(MetricKinds.OverallMetrics, out IDataView metricsView))
                    throw Host.Except("Evaluator did not produce any overall metrics");
                // REVIEW: We're assuming that the metrics of interest are always doubles here.
                var metrics = EvaluateUtils.GetMetrics(metricsView, getVectorMetrics: false);
                model.Metrics = metrics.ToArray();
            }
        }
 
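        // Maps the label column and the score column (plus, for binary classification, the optional
        // probability column) of the scored data to the roles expected by the evaluator.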
        private IEnumerable<KeyValuePair<RoleMappedSchema.ColumnRole, string>> GetColumnRoles(
            RoleMappedSchema testSchema, DataViewSchema scoredSchema)
        {
            switch (PredictionKind)
            {
                case PredictionKind.BinaryClassification:
                    yield return RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Label, testSchema.Label.Value.Name);
                    var scoreCol = EvaluateUtils.GetScoreColumn(Host, scoredSchema, null, nameof(BinaryClassifierMamlEvaluator.ArgumentsBase.ScoreColumn),
                        AnnotationUtils.Const.ScoreColumnKind.BinaryClassification);
                    yield return RoleMappedSchema.CreatePair(AnnotationUtils.Const.ScoreValueKind.Score, scoreCol.Name);
                    // Get the optional probability column.
                    var probCol = EvaluateUtils.GetOptAuxScoreColumn(Host, scoredSchema, null, nameof(BinaryClassifierMamlEvaluator.Arguments.ProbabilityColumn),
                        scoreCol.Index, AnnotationUtils.Const.ScoreValueKind.Probability, NumberDataViewType.Single.Equals);
                    if (probCol.HasValue)
                        yield return RoleMappedSchema.CreatePair(AnnotationUtils.Const.ScoreValueKind.Probability, probCol.Value.Name);
                    yield break;
                case PredictionKind.Regression:
                    yield return RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Label, testSchema.Label.Value.Name);
                    scoreCol = EvaluateUtils.GetScoreColumn(Host, scoredSchema, null, nameof(RegressionMamlEvaluator.Arguments.ScoreColumn),
                        AnnotationUtils.Const.ScoreColumnKind.Regression);
                    yield return RoleMappedSchema.CreatePair(AnnotationUtils.Const.ScoreValueKind.Score, scoreCol.Name);
                    yield break;
                case PredictionKind.MulticlassClassification:
                    yield return RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Label, testSchema.Label.Value.Name);
                    scoreCol = EvaluateUtils.GetScoreColumn(Host, scoredSchema, null, nameof(MulticlassClassificationMamlEvaluator.Arguments.ScoreColumn),
                        AnnotationUtils.Const.ScoreColumnKind.MulticlassClassification);
                    yield return RoleMappedSchema.CreatePair(AnnotationUtils.Const.ScoreValueKind.Score, scoreCol.Name);
                    yield break;
                default:
                    throw Host.Except("Unrecognized prediction kind '{0}'", PredictionKind);
            }
        }
    }
}