File: Dynamic\Transforms\TreeFeaturization\PretrainedTreeEnsembleFeaturizationWithOptions.cs
Web Access
Project: src\docs\samples\Microsoft.ML.Samples\Microsoft.ML.Samples.csproj (Microsoft.ML.Samples)
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers.FastTree;
 
namespace Samples.Dynamic.Transforms.TreeFeaturization
{
    public static class PretrainedTreeEnsembleFeaturizationWithOptions
    {
        public static void Example()
        {
            // Create data set
            int dataPointCount = 200;
            // Create a new context for ML.NET operations. It can be used for
            // exception tracking and logging, as a catalog of available operations
            // and as the source of randomness. Setting the seed to a fixed number
            // in this example to make outputs deterministic.
            var mlContext = new MLContext(seed: 0);
 
            // Create a list of training data points.
            var dataPoints = GenerateRandomDataPoints(dataPointCount).ToList();
 
            // Convert the list of data points to an IDataView object, which is
            // consumable by ML.NET API.
            var dataView = mlContext.Data.LoadFromEnumerable(dataPoints);
 
            // Define input and output columns of tree-based featurizer.
            string labelColumnName = nameof(DataPoint.Label);
            string featureColumnName = nameof(DataPoint.Features);
            string treesColumnName = nameof(TransformedDataPoint.Trees);
            string leavesColumnName = nameof(TransformedDataPoint.Leaves);
            string pathsColumnName = nameof(TransformedDataPoint.Paths);
 
            // Define a tree model whose trees will be extracted to construct a tree
            // featurizer.
            var trainer = mlContext.BinaryClassification.Trainers.FastTree(
                new FastTreeBinaryTrainer.Options
                {
                    NumberOfThreads = 1,
                    NumberOfTrees = 1,
                    NumberOfLeaves = 4,
                    MinimumExampleCountPerLeaf = 1,
                    FeatureColumnName = featureColumnName,
                    LabelColumnName = labelColumnName
                });
 
            // Train the defined tree model.
            var model = trainer.Fit(dataView);
            var predicted = model.Transform(dataView);
 
            // Define the configuration of tree-based featurizer.
            var options = new PretrainedTreeFeaturizationEstimator.Options()
            {
                InputColumnName = featureColumnName,
                ModelParameters = model.Model.SubModel, // Pretrained tree model.
                TreesColumnName = treesColumnName,
                LeavesColumnName = leavesColumnName,
                PathsColumnName = pathsColumnName
            };
 
            // Fit the created featurizer. It doesn't perform actual training
            // because a pretrained model is provided.
            var treeFeaturizer = mlContext.Transforms
                .FeaturizeByPretrainTreeEnsemble(options).Fit(dataView);
 
            // Apply TreeEnsembleFeaturizer to the input data.
            var transformed = treeFeaturizer.Transform(dataView);
 
            // Convert IDataView object to a list. Each element in the resulted list
            // corresponds to a row in the IDataView.
            var transformedDataPoints = mlContext.Data.CreateEnumerable<
                TransformedDataPoint>(transformed, false).ToList();
 
            // Print out the transformation of the first 3 data points.
            for (int i = 0; i < 3; ++i)
            {
                var dataPoint = dataPoints[i];
                var transformedDataPoint = transformedDataPoints[i];
                Console.WriteLine("The original feature vector [" + String.Join(
                    ",", dataPoint.Features) + "] is transformed to three " +
                    "different tree-based feature vectors:");
 
                Console.WriteLine("  Trees' output values: [" + String.Join(",",
                    transformedDataPoint.Trees) + "].");
 
                Console.WriteLine("  Leave IDs' 0-1 representation: [" + String
                    .Join(",", transformedDataPoint.Leaves) + "].");
 
                Console.WriteLine("  Paths IDs' 0-1 representation: [" + String
                    .Join(",", transformedDataPoint.Paths) + "].");
            }
 
            // Expected output:
            //  The original feature vector[0.8173254, 0.7680227, 0.5581612] is
            //  transformed to three different tree - based feature vectors:
            //    Trees' output values: [0.4172185].
            //    Leave IDs' 0-1 representation: [1,0,0,0].
            //    Paths IDs' 0-1 representation: [1,1,1].
            //  The original feature vector[0.7588848, 1.106027, 0.6421779] is
            //   transformed to three different tree - based feature vectors:
            //    Trees' output values: [-1].
            //    Leave IDs' 0-1 representation: [0,0,1,0].
            //    Paths IDs' 0-1 representation: [1,1,0].
            //  The original feature vector[0.2737045, 0.2919063, 0.4673147] is
            //   transformed to three different tree - based feature vectors:
            //    Trees' output values: [0.4172185].
            //    Leave IDs' 0-1 representation: [1,0,0,0].
            //    Paths IDs' 0-1 representation: [1,1,1].
            //
            //   Note that the trained model contains only one tree.
            //
            //            Node 0
            //            /    \
            //           /    Leaf -2
            //         Node 1
            //         /    \
            //        /    Leaf -3
            //      Node 2
            //      /    \
            //     /    Leaf -4
            //   Leaf -1
            //
            //   Thus, if a data point reaches Leaf indexed by -1, its 0-1 path
            //   representation may be [1,1,1] because that data point
            //   went through all Node 0, Node 1, and Node 2.
 
        }
 
        private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count,
            int seed = 0)
        {
            var random = new Random(seed);
            float randomFloat() => (float)random.NextDouble();
            for (int i = 0; i < count; i++)
            {
                var label = randomFloat() > 0.5;
                yield return new DataPoint
                {
                    Label = label,
                    // Create random features that are correlated with the label.
                    // For data points with false label, the feature values are
                    // slightly increased by adding a constant.
                    Features = Enumerable.Repeat(label, 3).Select(x => x ?
                    randomFloat() : randomFloat() + 0.2f).ToArray()
                };
            }
        }
 
        // Example with label and 3 feature values. A data set is a collection of
        // such examples.
        private class DataPoint
        {
            public bool Label { get; set; }
            [VectorType(3)]
            public float[] Features { get; set; }
        }
 
        // Class used to capture the output of tree-base featurization.
        private class TransformedDataPoint : DataPoint
        {
            // The i-th value is the output value of the i-th decision tree.
            public float[] Trees { get; set; }
            // The 0-1 encoding of leaves the input feature vector falls into.
            public float[] Leaves { get; set; }
            // The 0-1 encoding of paths the input feature vector reaches the
            // leaves.
            public float[] Paths { get; set; }
        }
    }
}