// TreeEnsembleFeaturizerTest.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers.FastTree;
using Xunit;
 
namespace Microsoft.ML.Tests.TrainerEstimators
{
    public partial class TrainerEstimators
    {
        // Trains a small FastTree binary model, builds a TreeEnsembleFeaturizerBindableMapper from it,
        // binds the mapper to the input schema, and verifies name, type, size and annotations of the
        // three output columns (tree values, leaf indicators, path indicators).
        [Fact]
        public void TreeEnsembleFeaturizerOutputSchemaTest()
        {
            // Slot counts asserted below for the 10-tree, 5-leaf ensemble configured in this test.
            const int expectedTreeSlotCount = 10;
            const int expectedLeafSlotCount = 50;
            const int expectedPathSlotCount = 40;

            // Build the training set.
            var samples = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(1000).ToList();
            var trainingData = ML.Data.LoadFromEnumerable(samples);

            // Configure and train the tree model whose trees feed the featurizer.
            var treeOptions = new FastTreeBinaryTrainer.Options
            {
                NumberOfThreads = 1,
                NumberOfTrees = 10,
                NumberOfLeaves = 5,
            };
            var trainedModel = ML.BinaryClassification.Trainers.FastTree(treeOptions).Fit(trainingData);

            // Create the featurizer mapper with caller-chosen output column names.
            const string treesColumnName = "MyTrees";
            const string leavesColumnName = "MyLeaves";
            const string pathsColumnName = "MyPaths";
            var mapperArguments = new TreeEnsembleFeaturizerBindableMapper.Arguments();
            mapperArguments.TreesColumnName = treesColumnName;
            mapperArguments.LeavesColumnName = leavesColumnName;
            mapperArguments.PathsColumnName = pathsColumnName;
            var featurizerMapper = new TreeEnsembleFeaturizerBindableMapper(Env, mapperArguments, trainedModel.Model);

            // Bind(...) needs a RoleMappedSchema that says which columns are the label and the features.
            var schemaWithRoles = new RoleMappedSchema(
                trainingData.Schema,
                label: nameof(SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample.Label),
                feature: nameof(SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample.Features));

            // Bind and read the resulting output schema.
            var bindableMapper = featurizerMapper as ISchemaBindableMapper;
            var outputSchema = bindableMapper.Bind(Env, schemaWithRoles).OutputSchema;

            // ---- Column 0: per-tree output values ----
            var valuesColumn = outputSchema[0];
            Assert.Equal(treesColumnName, valuesColumn.Name);
            var valuesType = valuesColumn.Type as VectorDataViewType;
            Assert.NotNull(valuesType);
            Assert.Equal(NumberDataViewType.Single, valuesType.ItemType);
            Assert.Equal(expectedTreeSlotCount, valuesType.Size);

            // Slot names are the only annotation on this column.
            Assert.Single(valuesColumn.Annotations.Schema);
            VBuffer<ReadOnlyMemory<char>> valuesSlotNames = default;
            valuesColumn.Annotations.GetValue(AnnotationUtils.Kinds.SlotNames, ref valuesSlotNames);
            Assert.Equal(expectedTreeSlotCount, valuesSlotNames.Length);

            // Spot-check the first and the last slot name.
            Assert.Equal("Tree000", valuesSlotNames.GetItemOrDefault(0).ToString());
            Assert.Equal("Tree009", valuesSlotNames.GetItemOrDefault(expectedTreeSlotCount - 1).ToString());

            // ---- Column 1: leaf indicators ----
            var leavesColumn = outputSchema[1];
            Assert.Equal(leavesColumnName, leavesColumn.Name);
            var leavesType = leavesColumn.Type as VectorDataViewType;
            Assert.NotNull(leavesType);
            Assert.Equal(NumberDataViewType.Single, leavesType.ItemType);
            Assert.Equal(expectedLeafSlotCount, leavesType.Size);

            // Two annotations are expected: IsNormalized and SlotNames.
            Assert.Equal(2, leavesColumn.Annotations.Schema.Count);
            bool leavesAreNormalized = false;
            leavesColumn.Annotations.GetValue(AnnotationUtils.Kinds.IsNormalized, ref leavesAreNormalized);
            Assert.True(leavesAreNormalized);
            VBuffer<ReadOnlyMemory<char>> leavesSlotNames = default;
            leavesColumn.Annotations.GetValue(AnnotationUtils.Kinds.SlotNames, ref leavesSlotNames);
            Assert.Equal(expectedLeafSlotCount, leavesSlotNames.Length);

            // Spot-check the first and the last slot name.
            Assert.Equal("Tree000Leaf000", leavesSlotNames.GetItemOrDefault(0).ToString());
            Assert.Equal("Tree009Leaf004", leavesSlotNames.GetItemOrDefault(expectedLeafSlotCount - 1).ToString());

            // ---- Column 2: path indicators ----
            var pathsColumn = outputSchema[2];
            Assert.Equal(pathsColumnName, pathsColumn.Name);
            var pathsType = pathsColumn.Type as VectorDataViewType;
            Assert.NotNull(pathsType);
            Assert.Equal(NumberDataViewType.Single, pathsType.ItemType);
            Assert.Equal(expectedPathSlotCount, pathsType.Size);

            // Two annotations are expected: IsNormalized and SlotNames.
            Assert.Equal(2, pathsColumn.Annotations.Schema.Count);
            bool pathsAreNormalized = false;
            pathsColumn.Annotations.GetValue(AnnotationUtils.Kinds.IsNormalized, ref pathsAreNormalized);
            Assert.True(pathsAreNormalized);
            VBuffer<ReadOnlyMemory<char>> pathsSlotNames = default;
            pathsColumn.Annotations.GetValue(AnnotationUtils.Kinds.SlotNames, ref pathsSlotNames);
            Assert.Equal(expectedPathSlotCount, pathsSlotNames.Length);

            // Spot-check the first and the last slot name.
            Assert.Equal("Tree000Node000", pathsSlotNames.GetItemOrDefault(0).ToString());
            Assert.Equal("Tree009Node003", pathsSlotNames.GetItemOrDefault(expectedPathSlotCount - 1).ToString());
        }
 
        // Trains a one-tree FastTree binary classifier, wraps its tree ensemble in a
        // TreeEnsembleFeaturizationTransformer, and checks each transformed row against the leaf,
        // leaf value and path that the ensemble itself reports for that row's features.
        [Fact]
        public void TreeEnsembleFeaturizerTransformerFastTreeBinary()
        {
            // Create data set
            int dataPointCount = 20;
            var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList();
            var dataView = ML.Data.LoadFromEnumerable(data);

            // Define a tree model whose trees will be extracted to construct a tree featurizer.
            var trainer = ML.BinaryClassification.Trainers.FastTree(
                new FastTreeBinaryTrainer.Options
                {
                    NumberOfThreads = 1,
                    NumberOfTrees = 1,
                    NumberOfLeaves = 4,
                    MinimumExampleCountPerLeaf = 1
                });

            // Train the defined tree model.
            var model = trainer.Fit(dataView);

            // From the trained tree model, a tree featurizer is created.
            const string treesColumnName = "MyTrees";
            const string leavesColumnName = "MyLeaves";
            const string pathsColumnName = "MyPaths";
            var treeFeaturizer = new TreeEnsembleFeaturizationTransformer(ML, dataView.Schema, dataView.Schema["Features"], model.Model.SubModel,
                treesColumnName: treesColumnName, leavesColumnName: leavesColumnName, pathsColumnName: pathsColumnName);

            // Apply TreeEnsembleFeaturizer to the input data.
            var transformed = treeFeaturizer.Transform(dataView);

            // Extract the outputs of TreeEnsembleFeaturizer.
            var features = transformed.GetColumn<float[]>("Features").ToArray();
            var leafValues = transformed.GetColumn<float[]>(treesColumnName).ToArray();
            var leafIds = transformed.GetColumn<float[]>(leavesColumnName).ToArray();
            var paths = transformed.GetColumn<float[]>(pathsColumnName).ToArray();

            // Check if the TreeEnsembleFeaturizer produces the expected values.
            // The model has a single tree, so index 0 is used for every tree lookup below.
            const int treeIndex = 0;
            List<int> path = null;
            for (int dataPointIndex = 0; dataPointIndex < dataPointCount; ++dataPointIndex)
            {
                // Size the buffer from the row itself instead of hard-coding the feature count.
                var featureValues = features[dataPointIndex];
                var leafId = model.Model.SubModel.GetLeaf(treeIndex, new VBuffer<float>(featureValues.Length, featureValues), ref path);
                var leafValue = model.Model.SubModel.GetLeafValue(treeIndex, leafId);

                // xUnit takes (expected, actual): the ensemble's own answer is the reference value.
                Assert.Equal(leafValue, leafValues[dataPointIndex][treeIndex]);
                Assert.Equal(1.0, leafIds[dataPointIndex][leafId]);
                foreach (var nodeId in path)
                    Assert.Equal(1.0, paths[dataPointIndex][nodeId]);
            }
        }
 
        // Trains a one-tree FastForest binary classifier, wraps its tree ensemble in a
        // TreeEnsembleFeaturizationTransformer, and checks each transformed row against the leaf,
        // leaf value and path that the ensemble itself reports for that row's features.
        [Fact]
        public void TreeEnsembleFeaturizerTransformerFastForestBinary()
        {
            // Create data set
            int dataPointCount = 20;
            var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList();
            var dataView = ML.Data.LoadFromEnumerable(data);

            // Define a tree model whose trees will be extracted to construct a tree featurizer.
            var trainer = ML.BinaryClassification.Trainers.FastForest(
                new FastForestBinaryTrainer.Options
                {
                    NumberOfThreads = 1,
                    NumberOfTrees = 1,
                    NumberOfLeaves = 4,
                    MinimumExampleCountPerLeaf = 1
                });

            // Train the defined tree model.
            var model = trainer.Fit(dataView);

            // From the trained tree model, a tree featurizer is created.
            const string treesColumnName = "MyTrees";
            const string leavesColumnName = "MyLeaves";
            const string pathsColumnName = "MyPaths";
            var treeFeaturizer = new TreeEnsembleFeaturizationTransformer(ML, dataView.Schema, dataView.Schema["Features"], model.Model,
                treesColumnName: treesColumnName, leavesColumnName: leavesColumnName, pathsColumnName: pathsColumnName);

            // Apply TreeEnsembleFeaturizer to the input data.
            var transformed = treeFeaturizer.Transform(dataView);

            // Extract the outputs of TreeEnsembleFeaturizer.
            var features = transformed.GetColumn<float[]>("Features").ToArray();
            var leafValues = transformed.GetColumn<float[]>(treesColumnName).ToArray();
            var leafIds = transformed.GetColumn<float[]>(leavesColumnName).ToArray();
            var paths = transformed.GetColumn<float[]>(pathsColumnName).ToArray();

            // Check if the TreeEnsembleFeaturizer produces the expected values.
            // The model has a single tree, so index 0 is used for every tree lookup below.
            const int treeIndex = 0;
            List<int> path = null;
            for (int dataPointIndex = 0; dataPointIndex < dataPointCount; ++dataPointIndex)
            {
                // Size the buffer from the row itself instead of hard-coding the feature count.
                var featureValues = features[dataPointIndex];
                var leafId = model.Model.GetLeaf(treeIndex, new VBuffer<float>(featureValues.Length, featureValues), ref path);
                var leafValue = model.Model.GetLeafValue(treeIndex, leafId);

                // xUnit takes (expected, actual): the ensemble's own answer is the reference value.
                Assert.Equal(leafValue, leafValues[dataPointIndex][treeIndex]);
                Assert.Equal(1.0, leafIds[dataPointIndex][leafId]);
                foreach (var nodeId in path)
                    Assert.Equal(1.0, paths[dataPointIndex][nodeId]);
            }
        }
 
        /// <summary>
        /// A test of <see cref="PretrainedTreeFeaturizationEstimator"/>: a one-tree FastTree binary model is
        /// trained, its ensemble is passed to the estimator through its options, and each transformed row is
        /// checked against the leaf, leaf value and path that the ensemble itself reports for that row.
        /// </summary>
        [Fact]
        public void TestPretrainedTreeFeaturizationEstimator()
        {
            // Create data set
            int dataPointCount = 20;
            var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList();
            var dataView = ML.Data.LoadFromEnumerable(data);
            dataView = ML.Data.Cache(dataView);

            // Define a tree model whose trees will be extracted to construct a tree featurizer.
            var trainer = ML.BinaryClassification.Trainers.FastTree(
                new FastTreeBinaryTrainer.Options
                {
                    NumberOfThreads = 1,
                    NumberOfTrees = 1,
                    NumberOfLeaves = 4,
                    MinimumExampleCountPerLeaf = 1
                });

            // Train the defined tree model.
            var model = trainer.Fit(dataView);

            // From the trained tree model, a tree featurizer is created.
            string featureColumnName = "Features";
            string treesColumnName = "MyTrees"; // a tree-based feature column.
            string leavesColumnName = "MyLeaves"; // a tree-based feature column.
            string pathsColumnName = "MyPaths"; // a tree-based feature column.
            var options = new PretrainedTreeFeaturizationEstimator.Options()
            {
                InputColumnName = featureColumnName,
                ModelParameters = model.Model.SubModel,
                TreesColumnName = treesColumnName,
                LeavesColumnName = leavesColumnName,
                PathsColumnName = pathsColumnName
            };
            var treeFeaturizer = ML.Transforms.FeaturizeByPretrainTreeEnsemble(options).Fit(dataView);

            // Apply TreeEnsembleFeaturizer to the input data.
            var transformed = treeFeaturizer.Transform(dataView);

            // Extract the outputs of TreeEnsembleFeaturizer.
            var features = transformed.GetColumn<float[]>(featureColumnName).ToArray();
            var leafValues = transformed.GetColumn<float[]>(treesColumnName).ToArray();
            var leafIds = transformed.GetColumn<float[]>(leavesColumnName).ToArray();
            var paths = transformed.GetColumn<float[]>(pathsColumnName).ToArray();

            // Check if the TreeEnsembleFeaturizer produces the expected values.
            // The model has a single tree, so index 0 is used for every tree lookup below.
            const int treeIndex = 0;
            List<int> path = null;
            for (int dataPointIndex = 0; dataPointIndex < dataPointCount; ++dataPointIndex)
            {
                // Size the buffer from the row itself instead of hard-coding the feature count.
                var featureValues = features[dataPointIndex];
                var leafId = model.Model.SubModel.GetLeaf(treeIndex, new VBuffer<float>(featureValues.Length, featureValues), ref path);
                var leafValue = model.Model.SubModel.GetLeafValue(treeIndex, leafId);

                // xUnit takes (expected, actual): the ensemble's own answer is the reference value.
                Assert.Equal(leafValue, leafValues[dataPointIndex][treeIndex]);
                Assert.Equal(1.0, leafIds[dataPointIndex][leafId]);
                foreach (var nodeId in path)
                    Assert.Equal(1.0, paths[dataPointIndex][nodeId]);
            }
        }
 
        /// <summary>
        /// This test contains several steps.
        ///   1. It first trains a <see cref="FastTreeBinaryModelParameters"/> using <see cref="FastTreeBinaryTrainer"/>.
        ///   2. Then, it creates a <see cref="PretrainedTreeFeaturizationEstimator"/> from the trained <see cref="FastTreeBinaryModelParameters"/>.
        ///   3. The features produced in step 2 are fed into <see cref="SdcaLogisticRegression"/> to enhance the training accuracy of that linear model.
        ///   4. We train another <see cref="SdcaLogisticRegression"/> without features from trees and finally compare their scores.
        /// </summary>
        [Fact]
        public void TreeEnsembleFeaturizingPipeline()
        {
            // Create data set
            int dataPointCount = 200;
            var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList();
            var dataView = ML.Data.LoadFromEnumerable(data);
            dataView = ML.Data.Cache(dataView);

            // Define a tree model whose trees will be extracted to construct a tree featurizer.
            var trainer = ML.BinaryClassification.Trainers.FastTree(
                new FastTreeBinaryTrainer.Options
                {
                    NumberOfThreads = 1,
                    NumberOfTrees = 10,
                    NumberOfLeaves = 4,
                    MinimumExampleCountPerLeaf = 10
                });

            // Train the defined tree model. This trained model will be used to construct PretrainedTreeFeaturizationEstimator.
            var treeModel = trainer.Fit(dataView);
            // NOTE(review): the result of this call is never read. It is kept so the sequence of operations
            // preceding the seed-sensitive SDCA training below is unchanged — confirm whether it can be dropped.
            var predicted = treeModel.Transform(dataView);

            // Combine the tree featurizer's output columns and the original features as the final training features.
            // Then train a linear model.
            var options = new PretrainedTreeFeaturizationEstimator.Options()
            {
                InputColumnName = "Features",
                TreesColumnName = "Trees",
                LeavesColumnName = "Leaves",
                PathsColumnName = "Paths",
                ModelParameters = treeModel.Model.SubModel
            };
            var pipeline = ML.Transforms.FeaturizeByPretrainTreeEnsemble(options)
                .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"))
                .Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures"));
            var model = pipeline.Fit(dataView);
            var prediction = model.Transform(dataView);
            var metrics = ML.BinaryClassification.Evaluate(prediction);

            // Then train the same linear model without tree features.
            var naivePipeline = ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "Features");
            var naiveModel = naivePipeline.Fit(dataView);
            var naivePrediction = naiveModel.Transform(dataView);
            var naiveMetrics = ML.BinaryClassification.Evaluate(naivePrediction);

            // The linear model trained with tree features should perform better than that without tree features.
            // The messages carry both values so a failing run shows how far apart the two models were.
            Assert.True(metrics.Accuracy > naiveMetrics.Accuracy,
                $"Accuracy with tree features ({metrics.Accuracy}) should exceed accuracy without them ({naiveMetrics.Accuracy}).");
            Assert.True(metrics.LogLoss < naiveMetrics.LogLoss,
                $"LogLoss with tree features ({metrics.LogLoss}) should be below LogLoss without them ({naiveMetrics.LogLoss}).");
            Assert.True(metrics.AreaUnderPrecisionRecallCurve > naiveMetrics.AreaUnderPrecisionRecallCurve,
                $"AUPRC with tree features ({metrics.AreaUnderPrecisionRecallCurve}) should exceed AUPRC without them ({naiveMetrics.AreaUnderPrecisionRecallCurve}).");
        }
 
        // Runs FeaturizeByFastTreeBinary -> Concatenate -> SdcaLogisticRegression end to end on synthetic
        // binary-classification data and checks the resulting metrics against fixed thresholds.
        [Fact]
        public void TestFastTreeBinaryFeaturizationInPipeline()
        {
            // Arrange: generate the samples and wrap the loaded view with ML.Data.Cache.
            const int sampleCount = 200;
            var samples = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(sampleCount).ToList();
            var trainingData = ML.Data.Cache(ML.Data.LoadFromEnumerable(samples));

            // Featurizer configuration; the FastTree trainer settings are nested under TrainerOptions.
            var featurizerOptions = new FastTreeBinaryFeaturizationEstimator.Options
            {
                InputColumnName = "Features",
                TreesColumnName = "Trees",
                LeavesColumnName = "Leaves",
                PathsColumnName = "Paths",
                TrainerOptions = new FastTreeBinaryTrainer.Options
                {
                    NumberOfThreads = 1,
                    NumberOfTrees = 10,
                    NumberOfLeaves = 4,
                    MinimumExampleCountPerLeaf = 10,
                    FeatureColumnName = "Features",
                    LabelColumnName = "Label"
                }
            };

            // Act: fit the three-stage pipeline and score the training data with it.
            var fittedPipeline = ML.Transforms.FeaturizeByFastTreeBinary(featurizerOptions)
                .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"))
                .Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures"))
                .Fit(trainingData);
            var quality = ML.BinaryClassification.Evaluate(fittedPipeline.Transform(trainingData));

            // Assert: metrics must clear the thresholds.
            Assert.True(quality.Accuracy > 0.98);
            Assert.True(quality.LogLoss < 0.05);
            Assert.True(quality.AreaUnderPrecisionRecallCurve > 0.98);
        }
 
        // Runs FeaturizeByFastForestBinary -> Concatenate -> SdcaLogisticRegression end to end on synthetic
        // binary-classification data and checks the resulting metrics against fixed thresholds.
        [Fact]
        public void TestFastForestBinaryFeaturizationInPipeline()
        {
            // Arrange: generate the samples and wrap the loaded view with ML.Data.Cache.
            const int sampleCount = 200;
            var samples = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(sampleCount).ToList();
            var trainingData = ML.Data.Cache(ML.Data.LoadFromEnumerable(samples));

            // Featurizer configuration; the FastForest trainer settings are nested under TrainerOptions.
            var featurizerOptions = new FastForestBinaryFeaturizationEstimator.Options
            {
                InputColumnName = "Features",
                TreesColumnName = "Trees",
                LeavesColumnName = "Leaves",
                PathsColumnName = "Paths",
                TrainerOptions = new FastForestBinaryTrainer.Options
                {
                    NumberOfThreads = 1,
                    NumberOfTrees = 10,
                    NumberOfLeaves = 4,
                    MinimumExampleCountPerLeaf = 10,
                    FeatureColumnName = "Features",
                    LabelColumnName = "Label"
                }
            };

            // Act: fit the three-stage pipeline and score the training data with it.
            var fittedPipeline = ML.Transforms.FeaturizeByFastForestBinary(featurizerOptions)
                .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"))
                .Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures"))
                .Fit(trainingData);
            var quality = ML.BinaryClassification.Evaluate(fittedPipeline.Transform(trainingData));

            // Assert: metrics must clear the thresholds.
            Assert.True(quality.Accuracy > 0.97);
            Assert.True(quality.LogLoss < 0.07);
            Assert.True(quality.AreaUnderPrecisionRecallCurve > 0.98);
        }
 
        // Runs FeaturizeByFastTreeRegression -> Concatenate -> Sdca regression end to end on synthetic
        // regression data and checks the resulting errors against fixed upper bounds.
        [Fact]
        public void TestFastTreeRegressionFeaturizationInPipeline()
        {
            // Arrange: generate the samples and wrap the loaded view with ML.Data.Cache.
            const int sampleCount = 200;
            var samples = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(sampleCount).ToList();
            var trainingData = ML.Data.Cache(ML.Data.LoadFromEnumerable(samples));

            // Featurizer configuration; the FastTree trainer settings are nested under TrainerOptions.
            var featurizerOptions = new FastTreeRegressionFeaturizationEstimator.Options
            {
                InputColumnName = "Features",
                TreesColumnName = "Trees",
                LeavesColumnName = "Leaves",
                PathsColumnName = "Paths",
                TrainerOptions = new FastTreeRegressionTrainer.Options
                {
                    NumberOfThreads = 1,
                    NumberOfTrees = 10,
                    NumberOfLeaves = 4,
                    MinimumExampleCountPerLeaf = 10,
                    FeatureColumnName = "Features",
                    LabelColumnName = "Label"
                }
            };

            // Act: fit the three-stage pipeline and score the training data with it.
            var fittedPipeline = ML.Transforms.FeaturizeByFastTreeRegression(featurizerOptions)
                .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"))
                .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"))
                .Fit(trainingData);
            var quality = ML.Regression.Evaluate(fittedPipeline.Transform(trainingData));

            // Assert: errors must stay under the bounds.
            Assert.True(quality.MeanAbsoluteError < 0.2);
            Assert.True(quality.MeanSquaredError < 0.05);
        }
 
        // Runs FeaturizeByFastForestRegression -> Concatenate -> Sdca regression end to end on synthetic
        // regression data and checks the resulting errors against fixed upper bounds.
        [Fact]
        public void TestFastForestRegressionFeaturizationInPipeline()
        {
            // Arrange: generate the samples and wrap the loaded view with ML.Data.Cache.
            const int sampleCount = 200;
            var samples = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(sampleCount).ToList();
            var trainingData = ML.Data.Cache(ML.Data.LoadFromEnumerable(samples));

            // Featurizer configuration; the FastForest trainer settings are nested under TrainerOptions.
            var featurizerOptions = new FastForestRegressionFeaturizationEstimator.Options
            {
                InputColumnName = "Features",
                TreesColumnName = "Trees",
                LeavesColumnName = "Leaves",
                PathsColumnName = "Paths",
                TrainerOptions = new FastForestRegressionTrainer.Options
                {
                    NumberOfThreads = 1,
                    NumberOfTrees = 10,
                    NumberOfLeaves = 4,
                    MinimumExampleCountPerLeaf = 10,
                    FeatureColumnName = "Features",
                    LabelColumnName = "Label"
                }
            };

            // Act: fit the three-stage pipeline and score the training data with it.
            var fittedPipeline = ML.Transforms.FeaturizeByFastForestRegression(featurizerOptions)
                .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"))
                .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"))
                .Fit(trainingData);
            var quality = ML.Regression.Evaluate(fittedPipeline.Transform(trainingData));

            // Assert: errors must stay under the bounds.
            Assert.True(quality.MeanAbsoluteError < 0.25);
            Assert.True(quality.MeanSquaredError < 0.1);
        }
 
        // Runs FeaturizeByFastTreeTweedie -> Concatenate -> Sdca regression end to end on synthetic
        // regression data and checks the resulting errors against fixed upper bounds.
        [Fact]
        public void TestFastTreeTweedieFeaturizationInPipeline()
        {
            // Arrange: generate the samples and wrap the loaded view with ML.Data.Cache.
            const int sampleCount = 200;
            var samples = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(sampleCount).ToList();
            var trainingData = ML.Data.Cache(ML.Data.LoadFromEnumerable(samples));

            // Featurizer configuration; the Tweedie trainer settings are nested under TrainerOptions.
            var featurizerOptions = new FastTreeTweedieFeaturizationEstimator.Options
            {
                InputColumnName = "Features",
                TreesColumnName = "Trees",
                LeavesColumnName = "Leaves",
                PathsColumnName = "Paths",
                TrainerOptions = new FastTreeTweedieTrainer.Options
                {
                    NumberOfThreads = 1,
                    NumberOfTrees = 10,
                    NumberOfLeaves = 4,
                    MinimumExampleCountPerLeaf = 10,
                    FeatureColumnName = "Features",
                    LabelColumnName = "Label"
                }
            };

            // Act: fit the three-stage pipeline and score the training data with it.
            var fittedPipeline = ML.Transforms.FeaturizeByFastTreeTweedie(featurizerOptions)
                .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"))
                .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"))
                .Fit(trainingData);
            var quality = ML.Regression.Evaluate(fittedPipeline.Transform(trainingData));

            // Assert: errors must stay under the bounds.
            Assert.True(quality.MeanAbsoluteError < 0.25);
            Assert.True(quality.MeanSquaredError < 0.1);
        }
 
        // Runs FeaturizeByFastTreeRanking -> Concatenate -> Sdca regression end to end on synthetic
        // data that carries a group id, and checks the regression errors against fixed upper bounds.
        [Fact]
        public void TestFastTreeRankingFeaturizationInPipeline()
        {
            // Arrange: generate the samples and wrap the loaded view with ML.Data.Cache.
            const int sampleCount = 200;
            var samples = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorUlongGroupIdSamples(sampleCount).ToList();
            var trainingData = ML.Data.Cache(ML.Data.LoadFromEnumerable(samples));

            // Featurizer configuration; the ranking trainer settings are nested under TrainerOptions.
            var featurizerOptions = new FastTreeRankingFeaturizationEstimator.Options
            {
                InputColumnName = "Features",
                TreesColumnName = "Trees",
                LeavesColumnName = "Leaves",
                PathsColumnName = "Paths",
                TrainerOptions = new FastTreeRankingTrainer.Options
                {
                    NumberOfThreads = 1,
                    NumberOfTrees = 10,
                    NumberOfLeaves = 4,
                    MinimumExampleCountPerLeaf = 10,
                    FeatureColumnName = "Features",
                    LabelColumnName = "Label"
                }
            };

            // Act: fit the three-stage pipeline and score the training data with it.
            var fittedPipeline = ML.Transforms.FeaturizeByFastTreeRanking(featurizerOptions)
                .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"))
                .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"))
                .Fit(trainingData);
            var quality = ML.Regression.Evaluate(fittedPipeline.Transform(trainingData));

            // Assert: errors must stay under the bounds.
            Assert.True(quality.MeanAbsoluteError < 0.25);
            Assert.True(quality.MeanSquaredError < 0.1);
        }
 
        // Fits a FeaturizeByFastForestRegression -> Concatenate -> Sdca pipeline, saves it to a temporary
        // file, loads it back, and checks that the reloaded model yields the same regression metrics.
        [Fact]
        public void TestSaveAndLoadTreeFeaturizer()
        {
            // Arrange: generate the samples and wrap the loaded view with ML.Data.Cache.
            const int sampleCount = 200;
            var samples = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(sampleCount).ToList();
            var trainingData = ML.Data.Cache(ML.Data.LoadFromEnumerable(samples));

            // Featurizer configuration; the FastForest trainer settings are nested under TrainerOptions.
            var featurizerOptions = new FastForestRegressionFeaturizationEstimator.Options
            {
                InputColumnName = "Features",
                TreesColumnName = "Trees",
                LeavesColumnName = "Leaves",
                PathsColumnName = "Paths",
                TrainerOptions = new FastForestRegressionTrainer.Options
                {
                    NumberOfThreads = 1,
                    NumberOfTrees = 10,
                    NumberOfLeaves = 4,
                    MinimumExampleCountPerLeaf = 10,
                    FeatureColumnName = "Features",
                    LabelColumnName = "Label"
                }
            };

            // Fit the pipeline and evaluate it on the training data.
            var fittedPipeline = ML.Transforms.FeaturizeByFastForestRegression(featurizerOptions)
                .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"))
                .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"))
                .Fit(trainingData);
            var originalMetrics = ML.Regression.Evaluate(fittedPipeline.Transform(trainingData));

            Assert.True(originalMetrics.MeanAbsoluteError < 0.25);
            Assert.True(originalMetrics.MeanSquaredError < 0.1);

            // Round-trip the fitted model through a temporary file.
            ITransformer reloadedModel;
            var tempPath = Path.GetTempFileName();
            using (var handle = new SimpleFileHandle(Env, tempPath, true, true))
            {
                // The write stream is closed before the read stream is opened.
                using (var writeStream = handle.CreateWriteStream())
                {
                    ML.Model.Save(fittedPipeline, null, writeStream);
                }

                // The schema returned alongside the model is not needed here.
                using (var readStream = handle.OpenReadStream())
                {
                    reloadedModel = ML.Model.Load(readStream, out _);
                }
            }

            // The reloaded model must reproduce the original metrics within tolerance.
            var reloadedMetrics = ML.Regression.Evaluate(reloadedModel.Transform(trainingData));
            Assert.Equal(originalMetrics.MeanAbsoluteError, reloadedMetrics.MeanAbsoluteError, 0.00001);
            Assert.Equal(originalMetrics.MeanSquaredError, reloadedMetrics.MeanSquaredError, 0.00001);
        }
 
        /// <summary>
        /// Fits a pipeline that feeds FastForest regression tree features into an SDCA regressor, round-trips
        /// the fitted model through a temporary file, and checks that the reloaded model reproduces the
        /// evaluation metrics of the in-memory model. A second pipeline that additionally bin-normalizes the
        /// featurizer input is then expected to yield different metrics.
        /// </summary>
        [Fact]
        public void TestSaveAndLoadDoubleTreeFeaturizer()
        {
            var samples = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(200).ToList();
            var dataView = ML.Data.Cache(ML.Data.LoadFromEnumerable(samples));

            // The underlying forest is trained on "Features", whereas the resulting featurizer is applied
            // to "CopiedFeatures".
            var featurizerOptions = new FastForestRegressionFeaturizationEstimator.Options
            {
                InputColumnName = "CopiedFeatures",
                TreesColumnName = "OhMyTrees",
                LeavesColumnName = "OhMyLeaves",
                PathsColumnName = "OhMyPaths",
                TrainerOptions = new FastForestRegressionTrainer.Options
                {
                    NumberOfThreads = 1,
                    NumberOfTrees = 10,
                    NumberOfLeaves = 4,
                    MinimumExampleCountPerLeaf = 10,
                    FeatureColumnName = "Features",
                    LabelColumnName = "Label"
                }
            };

            // First pipeline: copy -> tree featurization -> concatenation -> SDCA regression.
            var featurizedChain = ML.Transforms.CopyColumns("CopiedFeatures", "Features")
                .Append(ML.Transforms.FeaturizeByFastForestRegression(featurizerOptions));
            var concatenatedChain = featurizedChain
                .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths"));
            var firstPipeline = concatenatedChain
                .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"));

            var firstModel = firstPipeline.Fit(dataView);
            var firstMetrics = ML.Regression.Evaluate(firstModel.Transform(dataView));

            Assert.True(firstMetrics.MeanAbsoluteError < 0.25);
            Assert.True(firstMetrics.MeanSquaredError < 0.1);

            // Write the fitted model to a temporary file and immediately read it back.
            ITransformer reloadedModel = null;
            using (var handle = new SimpleFileHandle(Env, Path.GetTempFileName(), true, true))
            {
                using (var writeStream = handle.CreateWriteStream())
                {
                    ML.Model.Save(firstModel, null, writeStream);
                }

                using (var readStream = handle.OpenReadStream())
                {
                    reloadedModel = ML.Model.Load(readStream, out _);
                }
            }

            // Score the same data with the reloaded model.
            var reloadedMetrics = ML.Regression.Evaluate(reloadedModel.Transform(dataView));

            // The reloaded model has to agree with the model it was saved from.
            Assert.Equal(firstMetrics.MeanAbsoluteError, reloadedMetrics.MeanAbsoluteError, 0.00001);
            Assert.Equal(firstMetrics.MeanSquaredError, reloadedMetrics.MeanSquaredError, 0.00001);

            // Second pipeline: identical to the first one except for the binning normalizer applied to
            // "CopiedFeatures" before tree featurization.
            var normalizedChain = ML.Transforms.CopyColumns("CopiedFeatures", "Features")
                .Append(ML.Transforms.NormalizeBinning("CopiedFeatures"));
            var normalizedFeaturizedChain = normalizedChain
                .Append(ML.Transforms.FeaturizeByFastForestRegression(featurizerOptions));
            var normalizedConcatenatedChain = normalizedFeaturizedChain
                .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths"));
            var secondPipeline = normalizedConcatenatedChain
                .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"));

            var secondModel = secondPipeline.Fit(dataView);
            var secondMetrics = ML.Regression.Evaluate(secondModel.Transform(dataView));

            // Because the featurizer now sees bin-normalized input, the two pipelines are not expected to
            // produce the same metrics.
            Assert.NotEqual(firstMetrics.MeanAbsoluteError, secondMetrics.MeanAbsoluteError);
            Assert.NotEqual(firstMetrics.MeanSquaredError, secondMetrics.MeanSquaredError);
        }
 
        /// <summary>
        /// Exercises a FastTree binary featurizer whose <c>TreesColumnName</c> and <c>PathsColumnName</c>
        /// options are null so that only the "Leaves" column is requested. Building and fitting a pipeline
        /// that also concatenates "Trees" and "Paths" must throw, while a pipeline that consumes only
        /// "Leaves" must train and satisfy the metric thresholds asserted at the end of the test.
        /// </summary>
        [Fact]
        public void TestFastTreeBinaryFeaturizationInPipelineWithOptionalOutputs()
        {
            int dataPointCount = 200;
            var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList();
            var dataView = ML.Data.LoadFromEnumerable(data);
            dataView = ML.Data.Cache(dataView);

            var trainerOptions = new FastTreeBinaryTrainer.Options
            {
                NumberOfThreads = 1,
                NumberOfTrees = 10,
                NumberOfLeaves = 4,
                MinimumExampleCountPerLeaf = 10,
                FeatureColumnName = "Features",
                LabelColumnName = "Label"
            };

            // Null output names switch off the corresponding outputs; only "Leaves" is requested.
            var options = new FastTreeBinaryFeaturizationEstimator.Options()
            {
                InputColumnName = "Features",
                TrainerOptions = trainerOptions,
                TreesColumnName = null,
                PathsColumnName = null,
                LeavesColumnName = "Leaves"
            };

            // Only "Leaves" is produced by the tree featurizer, so accessing "Trees" and "Paths" will lead to
            // an error. Assert.ThrowsAny accepts any exception type, exactly like the catch-all it replaces,
            // but reports a descriptive failure when nothing is thrown.
            Assert.ThrowsAny<Exception>(() =>
            {
                var wrongPipeline = ML.Transforms.FeaturizeByFastTreeBinary(options)
                    .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"))
                    .Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures"));
                wrongPipeline.Fit(dataView);
            });

            // A pipeline restricted to the column that is actually produced must train and score well.
            var pipeline = ML.Transforms.FeaturizeByFastTreeBinary(options)
                .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Leaves"))
                .Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures"));
            var model = pipeline.Fit(dataView);
            var prediction = model.Transform(dataView);
            var metrics = ML.BinaryClassification.Evaluate(prediction);

            Assert.True(metrics.Accuracy > 0.98);
            Assert.True(metrics.LogLoss < 0.05);
            Assert.True(metrics.AreaUnderPrecisionRecallCurve > 0.98);
        }
 
        /// <summary>
        /// Uses tree-based featurization for a multiclass problem: the key-typed label is turned into a float
        /// label, a FastForest regression model trained on that float label supplies the tree features, and an
        /// SDCA maximum-entropy classifier is trained on the concatenated tree features.
        /// </summary>
        [Fact]
        public void TreeEnsembleFeaturizingPipelineMulticlass()
        {
            var examples = SamplesUtils.DatasetUtils.GenerateRandomMulticlassClassificationExamples(1000).ToList();
            var dataView = ML.Data.Cache(ML.Data.LoadFromEnumerable(examples));

            // The regression forest behind the featurizer learns from the float copy of the label.
            var featurizerOptions = new FastForestRegressionFeaturizationEstimator.Options
            {
                InputColumnName = "Features",
                TreesColumnName = "Trees",
                LeavesColumnName = "Leaves",
                PathsColumnName = "Paths",
                TrainerOptions = new FastForestRegressionTrainer.Options
                {
                    NumberOfThreads = 1,
                    NumberOfTrees = 10,
                    NumberOfLeaves = 4,
                    MinimumExampleCountPerLeaf = 10,
                    FeatureColumnName = "Features",
                    LabelColumnName = "FloatLabel",
                    ShuffleLabels = true
                }
            };

            // Key-to-float conversion: a raw key value of 0 becomes NaN, every other value is shifted down by one.
            Action<RowWithKey, RowWithFloat> convertKeyToFloat = (source, destination) =>
            {
                if (source.KeyLabel == 0)
                {
                    destination.FloatLabel = float.NaN;
                }
                else
                {
                    destination.FloatLabel = source.KeyLabel - 1;
                }
            };

            // Half of the rows are used for fitting, the other half for evaluation.
            var split = ML.Data.TrainTestSplit(dataView, 0.5);

            var keyedChain = ML.Transforms.Conversion.MapValueToKey("KeyLabel", "Label")
                .Append(ML.Transforms.CustomMapping(convertKeyToFloat, "KeyLabel"));
            var featurizedChain = keyedChain
                .Append(ML.Transforms.FeaturizeByFastForestRegression(featurizerOptions));
            var concatenatedChain = featurizedChain
                .Append(ML.Transforms.Concatenate("CombinedFeatures", "Trees", "Leaves", "Paths"));
            var pipeline = concatenatedChain
                .Append(ML.MulticlassClassification.Trainers.SdcaMaximumEntropy("KeyLabel", "CombinedFeatures"));

            var model = pipeline.Fit(split.TrainSet);
            var scored = model.Transform(split.TestSet);
            var metrics = ML.MulticlassClassification.Evaluate(scored, labelColumnName: "KeyLabel");

            Assert.True(metrics.MacroAccuracy > 0.6);
            Assert.True(metrics.MicroAccuracy > 0.6);
        }
 
        /// <summary>
        /// Input row type of the key-to-float custom mapping used by
        /// <see cref="TreeEnsembleFeaturizingPipelineMulticlass"/>; it exposes the key-typed label column.
        /// </summary>
        private class RowWithKey
        {
            // Raw key value read by the custom mapping, which maps 0 to NaN and any other value to value - 1.
            // NOTE(review): the KeyType argument (4) is presumably the key cardinality; confirm against the
            // generated multiclass dataset.
            [KeyType(4)]
            public uint KeyLabel { get; set; }
        }
 
        /// <summary>
        /// Output row type of the key-to-float custom mapping used by
        /// <see cref="TreeEnsembleFeaturizingPipelineMulticlass"/>; it carries the float label that the
        /// FastForest regression featurizer is configured to train on ("FloatLabel").
        /// </summary>
        private class RowWithFloat
        {
            // Float version of the key label written by the custom mapping (NaN when the raw key value is 0).
            public float FloatLabel { get; set; }
        }
    }
}
// File: TrainerEstimators\TreeEnsembleFeaturizerTest.cs (Web Access)
// Project: src\test\Microsoft.ML.Tests\Microsoft.ML.Tests.csproj (Microsoft.ML.Tests)