File: IntrospectiveTraining.cs
Web Access
Project: src\test\Microsoft.ML.IntegrationTests\Microsoft.ML.IntegrationTests.csproj (Microsoft.ML.IntegrationTests)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.IntegrationTests.Datasets;
using Microsoft.ML.TestFrameworkCommon;
using Microsoft.ML.Trainers;
using Microsoft.ML.Trainers.FastTree;
using Microsoft.ML.Transforms;
using Xunit;
using Xunit.Abstractions;
 
namespace Microsoft.ML.IntegrationTests
{
    public class IntrospectiveTraining : IntegrationTestBaseClass
    {
        public IntrospectiveTraining(ITestOutputHelper output) : base(output)
        {
        }
 
        /// <summary>
        /// Introspective Training: Tree ensembles learned from FastForest can be inspected.
        /// </summary>
        [Fact]
        public void InspectFastForestRegresionTrees()
        {
            var mlContext = new MLContext(seed: 1);
 
            // Get the dataset.
            var data = mlContext.Data.LoadFromTextFile<HousingRegression>(TestCommon.GetDataPath(DataDir, TestDatasets.housing.trainFilename), hasHeader: true);
 
            // Create a pipeline to train on the housing data.
            var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
                .Append(mlContext.Regression.Trainers.FastForest(
                    new FastForestRegressionTrainer.Options { NumberOfLeaves = 5, NumberOfTrees = 3, NumberOfThreads = 1 }));
 
            // Fit the pipeline.
            var model = pipeline.Fit(data);
 
            // Extract the boosted tree model.
            var fastForestModel = model.LastTransformer.Model;
 
            // Extract the learned Random Forests model.
            var treeCollection = fastForestModel.TrainedTreeEnsemble;
 
            // Inspect properties in the extracted model.
            Assert.Equal(3, treeCollection.Trees.Count);
            Assert.Equal(3, treeCollection.TreeWeights.Count);
            Assert.All(treeCollection.TreeWeights, weight => Assert.Equal(1.0, weight));
            Assert.All(treeCollection.Trees, tree =>
            {
                Assert.Equal(5, tree.NumberOfLeaves);
                Assert.Equal(4, tree.NumberOfNodes);
                Assert.Equal(tree.SplitGains.Count, tree.NumberOfNodes);
                Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumberOfNodes);
                Assert.All(tree.CategoricalSplitFlags, flag => Assert.False(flag));
                Assert.Empty(tree.GetCategoricalSplitFeaturesAt(0));
                Assert.Empty(tree.GetCategoricalCategoricalSplitFeatureRangeAt(0));
            });
        }
 
        /// <summary>
        /// Introspective Training: Tree ensembles learned from FastTree can be inspected.
        /// </summary>
        [Fact]
        public void InspectFastTreeModelParameters()
        {
            var mlContext = new MLContext(seed: 1);
 
            var data = mlContext.Data.LoadFromTextFile<TweetSentiment>(TestCommon.GetDataPath(DataDir, TestDatasets.Sentiment.trainFilename),
                hasHeader: TestDatasets.Sentiment.fileHasHeader,
                separatorChar: TestDatasets.Sentiment.fileSeparator,
                allowQuoting: TestDatasets.Sentiment.allowQuoting);
 
            // Create a training pipeline.
            var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText")
                .AppendCacheCheckpoint(mlContext)
                .Append(mlContext.BinaryClassification.Trainers.FastTree(
                    new FastTreeBinaryTrainer.Options { NumberOfLeaves = 5, NumberOfTrees = 3, NumberOfThreads = 1 }));
 
            // Fit the pipeline.
            var model = pipeline.Fit(data);
 
            // Extract the boosted tree model.
            var fastTreeModel = model.LastTransformer.Model.SubModel;
 
            // Extract the learned GBDT model.
            var treeCollection = fastTreeModel.TrainedTreeEnsemble;
 
            // Make sure the tree models were formed as expected.
            Assert.Equal(3, treeCollection.Trees.Count);
            Assert.Equal(3, treeCollection.TreeWeights.Count);
            Assert.All(treeCollection.TreeWeights, weight => Assert.Equal(1.0, weight));
            Assert.All(treeCollection.Trees, tree =>
            {
                Assert.Equal(5, tree.NumberOfLeaves);
                Assert.Equal(4, tree.NumberOfNodes);
                Assert.Equal(tree.SplitGains.Count, tree.NumberOfNodes);
                Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumberOfNodes);
                Assert.All(tree.CategoricalSplitFlags, flag => Assert.False(flag));
                Assert.Empty(tree.GetCategoricalSplitFeaturesAt(0));
                Assert.Empty(tree.GetCategoricalCategoricalSplitFeatureRangeAt(0));
            });
 
            // Add baselines for the model.
            // Verify that there is no bias.
            Assert.Equal(0, treeCollection.Bias);
            // Check the parameters of the final tree.
            var finalTree = treeCollection.Trees[2];
            Assert.Equal(finalTree.LeftChild, new int[] { 2, -2, -1, -3 });
            Assert.Equal(finalTree.RightChild, new int[] { 1, 3, -4, -5 });
            Assert.Equal(finalTree.NumericalSplitFeatureIndexes, new int[] { 14, 294, 633, 266 });
            var expectedSplitGains = new double[] { 0.52634223978445616, 0.45899249367725858, 0.44142707650267105, 0.38348634823264854 };
            var expectedThresholds = new float[] { 0.0911167f, 0.06509889f, 0.019873254f, 0.0361835f };
            for (int i = 0; i < finalTree.NumberOfNodes; ++i)
            {
                Assert.Equal(expectedSplitGains[i], finalTree.SplitGains[i], 0.000001);
                Assert.Equal((double)expectedThresholds[i], (double)finalTree.NumericalSplitThresholds[i], 0.000001);
            }
        }
 
        /// <summary>
        /// Introspective Training: GAM Shape Functions are easily accessed.
        /// </summary>
        [Fact]
        public void IntrospectGamShapeFunctions()
        {
            // Concurrency must be 1 to assure that the mapping is done sequentially.
            var mlContext = new MLContext(seed: 1);
 
            // Load the Iris dataset.
            var data = mlContext.Data.LoadFromTextFile<Iris>(
                TestCommon.GetDataPath(DataDir, TestDatasets.iris.trainFilename),
                hasHeader: TestDatasets.iris.fileHasHeader,
                separatorChar: TestDatasets.iris.fileSeparator);
 
            // Compose the transformation.
            var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features)
                .Append(mlContext.Regression.Trainers.Gam(
                    new GamRegressionTrainer.Options { NumberOfIterations = 100, NumberOfThreads = 1 }));
 
            // Fit the pipeline.
            var model = pipeline.Fit(data);
 
            // Extract the normalizer from the trained pipeline.
            var gamModel = model.LastTransformer.Model;
 
            // Take look at the shape functions.
            for (int i = 0; i < gamModel.NumberOfShapeFunctions; i++)
            {
                var shapeFunctionBins = gamModel.GetBinUpperBounds(i);
                var shapeFunctionValues = gamModel.GetBinEffects(i);
 
                // Validate that the shape functions lengths match.
                Assert.Equal(shapeFunctionBins.Count, shapeFunctionValues.Count);
                Common.AssertFiniteNumbers(shapeFunctionBins as IList<double>, shapeFunctionBins.Count - 1);
                Common.AssertFiniteNumbers(shapeFunctionValues as IList<double>);
            }
        }
 
        /// <summary>
        /// Introspective Training: LDA models can be easily inspected.
        /// </summary>
        [Fact]
        public void InspectLdaModelParameters()
        {
            // Test Parameters
            int numTopics = 10;
 
            var mlContext = new MLContext(seed: 1);
 
            // Load the dataset.
            var data = mlContext.Data.LoadFromTextFile<TweetSentiment>(TestCommon.GetDataPath(DataDir, TestDatasets.Sentiment.trainFilename),
                hasHeader: TestDatasets.Sentiment.fileHasHeader,
                separatorChar: TestDatasets.Sentiment.fileSeparator,
                allowQuoting: TestDatasets.Sentiment.allowQuoting);
 
            // Define the pipeline.
            var pipeline = mlContext.Transforms.Text.ProduceWordBags("SentimentBag", "SentimentText")
                .Append(mlContext.Transforms.Text.LatentDirichletAllocation("Features", "SentimentBag",
                numberOfTopics: numTopics, maximumNumberOfIterations: 10));
 
            // Fit the pipeline.
            var model = pipeline.Fit(data);
 
            // Get the trained LDA model.
            var ldaTransform = model.LastTransformer;
 
            // Get the topics and summaries from the model.
            var ldaDetails = ldaTransform.GetLdaDetails(0);
            Assert.False(ldaDetails.ItemScoresPerTopic == null && ldaDetails.WordScoresPerTopic == null);
            if (ldaDetails.ItemScoresPerTopic != null)
                Assert.Equal(numTopics, ldaDetails.ItemScoresPerTopic.Count);
            if (ldaDetails.WordScoresPerTopic != null)
                Assert.Equal(numTopics, ldaDetails.WordScoresPerTopic.Count);
 
 
            // Transform the data.
            var transformedData = model.Transform(data);
 
            // Make sure the model weights array is the same length as the features array.
            var numFeatures = (transformedData.Schema["Features"].Type as VectorDataViewType).Size;
            Assert.Equal(numFeatures, numTopics);
        }
 
        /// <summary>
        /// Introspective Training: Linear model parameters may be inspected.
        /// </summary>
        [Fact]
        public void InspectLinearModelParameters()
        {
            var mlContext = new MLContext(seed: 1);
 
            var data = mlContext.Data.LoadFromTextFile<TweetSentiment>(TestCommon.GetDataPath(DataDir, TestDatasets.Sentiment.trainFilename),
                hasHeader: TestDatasets.Sentiment.fileHasHeader,
                separatorChar: TestDatasets.Sentiment.fileSeparator,
                allowQuoting: TestDatasets.Sentiment.allowQuoting);
 
            // Create a training pipeline.
            var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText")
                .AppendCacheCheckpoint(mlContext)
                .Append(mlContext.BinaryClassification.Trainers.SdcaNonCalibrated(
                    new SdcaNonCalibratedBinaryTrainer.Options { NumberOfThreads = 1 }));
 
            // Fit the pipeline.
            var model = pipeline.Fit(data);
 
            // Transform the data.
            var transformedData = model.Transform(data);
 
            // Extract the linear model from the pipeline.
            var linearModel = model.LastTransformer.Model;
 
            // Get the model bias and weights.
            var bias = linearModel.Bias;
            var weights = linearModel.Weights;
 
            // Make sure the model weights array is the same length as the features array.
            var numFeatures = (transformedData.Schema["Features"].Type as VectorDataViewType).Size;
            Assert.Equal(numFeatures, weights.Count);
        }
 
        /// <summary>
        /// Introspectable Training: Parameters of a trained Normalizer are easily accessed.
        /// </summary>
        [Fact]
        public void IntrospectNormalization()
        {
            // Concurrency must be 1 to assure that the mapping is done sequentially.
            var mlContext = new MLContext(seed: 1);
 
            // Load the Iris dataset.
            var data = mlContext.Data.LoadFromTextFile<Iris>(
                TestCommon.GetDataPath(DataDir, TestDatasets.iris.trainFilename),
                hasHeader: TestDatasets.iris.fileHasHeader,
                separatorChar: TestDatasets.iris.fileSeparator);
 
            // Compose the transformation.
            var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features)
                .Append(mlContext.Transforms.NormalizeMinMax("Features"));
 
            // Fit the pipeline.
            var model = pipeline.Fit(data);
 
            // Extract the normalizer from the trained pipeline.
            var normalizer = model.LastTransformer;
 
            // Extract the normalizer parameters.
            // TODO #2854: Normalizer parameters are easy to find via intellisense.
            var config = normalizer.GetNormalizerModelParameters(0) as NormalizingTransformer.AffineNormalizerModelParameters<ImmutableArray<float>>;
            Assert.NotNull(config);
            Common.AssertFiniteNumbers(config.Offset);
            Common.AssertFiniteNumbers(config.Scale);
        }
        /// <summary>
        /// Introspective Training: I can inspect a pipeline to determine which transformers were included.
        /// </summary>
        [Fact]
        public void InspectPipelineContents()
        {
            var mlContext = new MLContext(seed: 1);
 
            // Get the dataset.
            var data = mlContext.Data.LoadFromTextFile<HousingRegression>(TestCommon.GetDataPath(DataDir, TestDatasets.housing.trainFilename), hasHeader: true);
 
            // Create a pipeline to train on the housing data.
            var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
                .Append(mlContext.Regression.Trainers.FastForest(numberOfLeaves: 5, numberOfTrees: 3));
 
            // Fit the pipeline.
            var model = pipeline.Fit(data);
 
            // Inspect the transforms in the trained pipeline.
            var expectedTypes = new Type[] {typeof(ColumnConcatenatingTransformer),
                typeof(RegressionPredictionTransformer<FastForestRegressionModelParameters>)};
            var expectedColumns = new string[][] {
                new string[] { "Features" },
                new string[] { "Score" },
            };
            int i = 0;
            var currentSchema = data.Schema;
            foreach (var transformer in model)
            {
                // It is possible to get the type at runtime.
                Assert.IsType(expectedTypes[i], transformer);
 
                // It's also possible to inspect the schema output from the transform.
                currentSchema = transformer.GetOutputSchema(currentSchema);
                foreach (var expectedColumn in expectedColumns[i])
                {
                    var column = currentSchema.GetColumnOrNull(expectedColumn);
                    Assert.NotNull(column);
                }
                i++;
            }
        }
 
        /// <summary>
        /// Introspective Training: Hashed values can be mapped back to the original column and value.
        /// </summary>
        [Fact]
        public void InspectSlotNamesForReversibleHash()
        {
            var mlContext = new MLContext(seed: 1);
 
            // Load the Adult dataset.
            var data = mlContext.Data.LoadFromTextFile<Adult>(TestCommon.GetDataPath(DataDir, TestDatasets.adult.trainFilename),
                hasHeader: TestDatasets.adult.fileHasHeader,
                separatorChar: TestDatasets.adult.fileSeparator);
 
            // Create the learning pipeline.
            var pipeline = mlContext.Transforms.Concatenate("NumericalFeatures", Adult.NumericalFeatures)
                .Append(mlContext.Transforms.Concatenate("CategoricalFeatures", Adult.CategoricalFeatures))
                .Append(mlContext.Transforms.Categorical.OneHotHashEncoding("CategoricalFeatures", numberOfBits: 8, // get collisions!
                    maximumNumberOfInverts: -1, outputKind: OneHotEncodingEstimator.OutputKind.Bag));
 
            // Fit the pipeline.
            var model = pipeline.Fit(data);
 
            // Transform the data.
            var transformedData = model.Transform(data);
 
            // Verify that the slotnames can be used to backtrack to the original values by confirming that
            // all unique values in the input data are in the output data slot names.
            // First get a list of the unique values.
            VBuffer<ReadOnlyMemory<char>> categoricalSlotNames = new VBuffer<ReadOnlyMemory<char>>();
            transformedData.Schema["CategoricalFeatures"].GetSlotNames(ref categoricalSlotNames);
            var uniqueValues = new HashSet<string>();
            foreach (var slotName in categoricalSlotNames.GetValues())
            {
                var slotNameString = slotName.ToString();
                if (slotNameString.StartsWith("{"))
                {
                    // Values look like this: {3:Exec-managerial,2:Widowed}.
                    slotNameString = slotNameString.Substring(1, slotNameString.Length - 2);
                    foreach (var name in slotNameString.Split(','))
                        uniqueValues.Add(name);
                }
                else
                    uniqueValues.Add(slotNameString);
            }
 
            // Now validate that all values in the dataset are there.
            var transformedRows = mlContext.Data.CreateEnumerable<Adult>(data, false);
            foreach (var row in transformedRows)
            {
                for (int i = 0; i < Adult.CategoricalFeatures.Length; i++)
                {
                    // Fetch the categorical value.
                    string value = (string)row.GetType().GetProperty(Adult.CategoricalFeatures[i]).GetValue(row, null);
                    Assert.Contains($"{i}:{value}", uniqueValues);
                }
            }
        }
 
        /// <summary>
        /// Introspective Training: I can create nested pipelines, and extract individual components.
        /// </summary>
        [Fact]
        public void InspectNestedPipeline()
        {
            var mlContext = new MLContext(seed: 1);
 
            var data = mlContext.Data.LoadFromTextFile<Iris>(TestCommon.GetDataPath(DataDir, TestDatasets.iris.trainFilename),
                hasHeader: TestDatasets.iris.fileHasHeader,
                separatorChar: TestDatasets.iris.fileSeparator);
 
            // Create a training pipeline.
            var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features)
                .Append(StepOne(mlContext))
                .Append(StepTwo(mlContext));
 
            // Fit the pipeline.
            var model = pipeline.Fit(data);
 
            // Extract the trained models.
            var modelComponents = model.ToList();
            var kMeansModel = (modelComponents[1] as TransformerChain<ClusteringPredictionTransformer<KMeansModelParameters>>).LastTransformer;
            var mcLrModel = (modelComponents[2] as TransformerChain<MulticlassPredictionTransformer<MaximumEntropyModelParameters>>).LastTransformer;
 
            // Validate the k-means model.
            VBuffer<float>[] centroids = default;
            kMeansModel.Model.GetClusterCentroids(ref centroids, out int nCentroids);
            Assert.Equal(4, centroids.Length);
 
            // Validate the MulticlassLogisticRegressionModel.
            VBuffer<float>[] weights = default;
            mcLrModel.Model.GetWeights(ref weights, out int classes);
            Assert.Equal(3, weights.Length);
        }
 
        private IEstimator<TransformerChain<ClusteringPredictionTransformer<KMeansModelParameters>>> StepOne(MLContext mlContext)
        {
            return mlContext.Transforms.Concatenate("LabelAndFeatures", "Label", "Features")
                .Append(mlContext.Clustering.Trainers.KMeans(
                    new KMeansTrainer.Options
                    {
                        InitializationAlgorithm = KMeansTrainer.InitializationAlgorithm.Random,
                        NumberOfClusters = 4,
                        MaximumNumberOfIterations = 10,
                        NumberOfThreads = 1
                    }));
        }
 
        private IEstimator<TransformerChain<MulticlassPredictionTransformer<MaximumEntropyModelParameters>>> StepTwo(MLContext mlContext)
        {
            return mlContext.Transforms.Conversion.MapValueToKey("Label")
                .Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy(
                new SdcaMaximumEntropyMulticlassTrainer.Options
                {
                    MaximumNumberOfIterations = 10,
                    NumberOfThreads = 1
                }));
        }
    }
}