PcaTrainer.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.IO;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.CommandLine;
using Microsoft.ML.Data;
using Microsoft.ML.EntryPoints;
using Microsoft.ML.Internal.CpuMath;
using Microsoft.ML.Internal.Internallearn;
using Microsoft.ML.Internal.Utilities;
using Microsoft.ML.Model;
using Microsoft.ML.Numeric;
using Microsoft.ML.Runtime;
using Microsoft.ML.Trainers;
 
[assembly: LoadableClass(RandomizedPcaTrainer.Summary, typeof(RandomizedPcaTrainer), typeof(RandomizedPcaTrainer.Options),
    new[] { typeof(SignatureAnomalyDetectorTrainer), typeof(SignatureTrainer) },
    RandomizedPcaTrainer.UserNameValue,
    RandomizedPcaTrainer.LoadNameValue,
    RandomizedPcaTrainer.ShortName)]
 
[assembly: LoadableClass(typeof(PcaModelParameters), null, typeof(SignatureLoadModel),
    "PCA Anomaly Executor", PcaModelParameters.LoaderSignature)]
 
[assembly: LoadableClass(typeof(void), typeof(RandomizedPcaTrainer), null, typeof(SignatureEntryPointModule), RandomizedPcaTrainer.LoadNameValue)]
 
namespace Microsoft.ML.Trainers
{
    // REVIEW: make RFF transformer an option here.
 
    /// <summary>
    /// The <see cref="IEstimator{TTransformer}"/> for training an approximate PCA using Randomized SVD algorithm.
    /// </summary>
    /// <remarks>
    /// <format type="text/markdown"><![CDATA[
    /// To create this trainer, use [RandomizedPca](xref:Microsoft.ML.PcaCatalog.RandomizedPca(Microsoft.ML.AnomalyDetectionCatalog.AnomalyDetectionTrainers,System.String,System.String,System.Int32,System.Int32,System.Boolean,System.Nullable{System.Int32}))
    /// or [RandomizedPca(Options)](xref:Microsoft.ML.PcaCatalog.RandomizedPca(Microsoft.ML.AnomalyDetectionCatalog.AnomalyDetectionTrainers,Microsoft.ML.Trainers.RandomizedPcaTrainer.Options)).
    ///
    /// [!include[io](~/../docs/samples/docs/api-reference/io-columns-anomaly-detection.md)]
    ///
    /// ### Trainer Characteristics
    /// |  |  |
    /// | -- | -- |
    /// | Machine learning task | Anomaly Detection |
    /// | Is normalization required? | Yes |
    /// | Is caching required? | No |
    /// | Required NuGet in addition to Microsoft.ML | None |
    /// | Exportable to ONNX | No |
    ///
    /// ### Training Algorithm Details
    /// This trainer uses the top eigenvectors to approximate the subspace containing the normal class.
    /// For each new instance, it computes the norm difference between the raw feature vector and the projected feature on that subspace.
    /// If the error is close to 0, the instance is considered normal (non-anomaly).
    ///
    /// More specifically, this trainer trains an approximate PCA using a randomized method for computing the singular value decomposition (SVD) of
    /// the matrix whose rows are the input vectors.
    /// The model generated by this trainer contains three parameters:
    /// - A projection matrix $U$
    /// - The mean vector in the original feature space $m$
    /// - The mean vector in the projected feature space $p$
    ///
    /// For an input feature vector $x$, the anomaly score is computed by comparing the $L_2$
    /// norm of the original input vector, and the $L_2$ norm of the projected vector:
    /// $\sqrt{\left(\|x-m\|_2^2 - \|Ux-p\|_2^2\right)\|x-m\|_2^2}$.
    ///
    /// The method is described [here](https://web.stanford.edu/group/mmds/slides2010/Martinsson.pdf).
    ///
    /// Note that the algorithm can be made into Kernel PCA by applying the <xref:Microsoft.ML.Transforms.ApproximatedKernelTransformer>
    /// to the data before passing it to the trainer.
    ///
    /// Check the See Also section for links to usage examples.
    /// ]]>
    /// </format>
    /// </remarks>
    /// <seealso cref="PcaCatalog.RandomizedPca(AnomalyDetectionCatalog.AnomalyDetectionTrainers, string, string, int, int, bool, int?)"/>
    /// <seealso cref="PcaCatalog.RandomizedPca(AnomalyDetectionCatalog.AnomalyDetectionTrainers, Options)"/>
    /// <seealso cref="Options"/>
    public sealed class RandomizedPcaTrainer : TrainerEstimatorBase<AnomalyPredictionTransformer<PcaModelParameters>, PcaModelParameters>
    {
        internal const string LoadNameValue = "pcaAnomaly";
        internal const string UserNameValue = "PCA Anomaly Detector";
        internal const string ShortName = "pcaAnom";
        internal const string Summary = "This algorithm trains an approximate PCA using Randomized SVD algorithm. "
            + "This PCA can be made into Kernel PCA by using Random Fourier Features transform.";
 
        /// <summary>
        /// Options for the <see cref="RandomizedPcaTrainer"/> as used in
        /// [RandomizedPca(Options)](xref:Microsoft.ML.PcaCatalog.RandomizedPca(Microsoft.ML.AnomalyDetectionCatalog.AnomalyDetectionTrainers,Microsoft.ML.Trainers.RandomizedPcaTrainer.Options)).
        /// </summary>
        public sealed class Options : UnsupervisedTrainerInputBaseWithWeight
        {
            [Argument(ArgumentType.AtMostOnce, HelpText = "The number of components in the PCA", ShortName = "k", SortOrder = 50)]
            [TGUI(SuggestedSweeps = "10,20,40,80")]
            [TlcModule.SweepableDiscreteParam("Rank", new object[] { 10, 20, 40, 80 })]
            public int Rank = Defaults.NumComponents;
 
            [Argument(ArgumentType.AtMostOnce, HelpText = "Oversampling parameter for randomized PCA training", SortOrder = 50)]
            [TGUI(SuggestedSweeps = "10,20,40")]
            [TlcModule.SweepableDiscreteParam("Oversampling", new object[] { 10, 20, 40 })]
            public int Oversampling = Defaults.OversamplingParameters;
 
            [Argument(ArgumentType.AtMostOnce, HelpText = "If enabled, data is centered to be zero mean", Name = "Center", ShortName = "center")]
            [TlcModule.SweepableDiscreteParam("Center", null, isBool: true)]
            public bool EnsureZeroMean = Defaults.EnsureZeroMean;
 
            [Argument(ArgumentType.AtMostOnce, HelpText = "The seed for random number generation", ShortName = "seed")]
            public int? Seed;
 
            internal static class Defaults
            {
                public const int NumComponents = 20;
                public const int OversamplingParameters = 20;
                public const bool EnsureZeroMean = true;
            }
        }
 
        private readonly int _rank;
        private readonly int _oversampling;
        private readonly bool _ensureZeroMean;
        private readonly int _seed;
        private readonly string _featureColumn;
 
        private protected override PredictionKind PredictionKind => PredictionKind.AnomalyDetection;
 
        // The training performs two passes, only. Probably not worth caching.
        private static readonly TrainerInfo _info = new TrainerInfo(caching: false);
        public override TrainerInfo Info => _info;
 
        /// <summary>
        /// Initializes a new instance of <see cref="RandomizedPcaTrainer"/>.
        /// </summary>
        /// <param name="env">The local instance of the <see cref="IHostEnvironment"/>.</param>
        /// <param name="featureColumnName">The name of the feature column.</param>
        /// <param name="exampleWeightColumnName">The name of the weight column.</param>
        /// <param name="rank">The number of components in the PCA.</param>
        /// <param name="oversampling">Oversampling parameter for randomized PCA training.</param>
        /// <param name="ensureZeroMean">If enabled, data is centered to be zero mean.</param>
        /// <param name="seed">The seed for random number generation.</param>
        internal RandomizedPcaTrainer(IHostEnvironment env,
            string featureColumnName,
            string exampleWeightColumnName = null,
            int rank = Options.Defaults.NumComponents,
            int oversampling = Options.Defaults.OversamplingParameters,
            bool ensureZeroMean = Options.Defaults.EnsureZeroMean,
            int? seed = null)
            : this(env, null, featureColumnName, exampleWeightColumnName, rank, oversampling, ensureZeroMean, seed)
        {
        }
 
        internal RandomizedPcaTrainer(IHostEnvironment env, Options options)
            : this(env, options, options.FeatureColumnName, options.ExampleWeightColumnName)
        {
        }
 
        private RandomizedPcaTrainer(IHostEnvironment env, Options options, string featureColumnName, string exampleWeightColumnName,
            int rank = 20, int oversampling = 20, bool center = true, int? seed = null)
            : base(Contracts.CheckRef(env, nameof(env)).Register(LoadNameValue), TrainerUtils.MakeR4VecFeature(featureColumnName), default, TrainerUtils.MakeR4ScalarWeightColumn(exampleWeightColumnName))
        {
            // if the args are not null, we got here from maml, and the internal ctor.
            if (options != null)
            {
                _rank = options.Rank;
                _ensureZeroMean = options.EnsureZeroMean;
                _oversampling = options.Oversampling;
                _seed = options.Seed ?? Host.Rand.Next();
            }
            else
            {
                _rank = rank;
                _ensureZeroMean = center;
                _oversampling = oversampling;
                _seed = seed ?? Host.Rand.Next();
            }
 
            _featureColumn = featureColumnName;
 
            Host.CheckUserArg(_rank > 0, nameof(_rank), "Rank must be positive");
            Host.CheckUserArg(_oversampling >= 0, nameof(_oversampling), "Oversampling must be non-negative");
 
        }
 
        private protected override PcaModelParameters TrainModelCore(TrainContext context)
        {
            Host.CheckValue(context, nameof(context));
 
            context.TrainingSet.CheckFeatureFloatVector(out int dimension);
 
            using (var ch = Host.Start("Training"))
            {
                return TrainCore(ch, context.TrainingSet, dimension);
            }
        }
 
        private static SchemaShape.Column MakeWeightColumn(string weightColumn)
        {
            if (weightColumn == null)
                return default;
            return new SchemaShape.Column(weightColumn, SchemaShape.Column.VectorKind.Scalar, NumberDataViewType.Single, false);
        }
 
        private static SchemaShape.Column MakeFeatureColumn(string featureColumn)
        {
            return new SchemaShape.Column(featureColumn, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false);
        }
 
        //Note: the notations used here are the same as in https://web.stanford.edu/group/mmds/slides2010/Martinsson.pdf (pg. 9)
        private PcaModelParameters TrainCore(IChannel ch, RoleMappedData data, int dimension)
        {
            Host.AssertValue(ch);
            ch.AssertValue(data);
 
            if (_rank > dimension)
                throw ch.Except("Rank ({0}) cannot be larger than the original dimension ({1})", _rank, dimension);
            int oversampledRank = Math.Min(_rank + _oversampling, dimension);
 
            //exact: (size of the 2 big matrices + other minor allocations) / (2^30)
            Double memoryUsageEstimate = 2.0 * dimension * oversampledRank * sizeof(float) / 1e9;
            if (memoryUsageEstimate > 2)
                ch.Info("Estimate memory usage: {0:G2} GB. If running out of memory, reduce rank and oversampling factor.", memoryUsageEstimate);
 
            var y = Zeros(oversampledRank, dimension);
            var mean = _ensureZeroMean ? VBufferUtils.CreateDense<float>(dimension) : VBufferUtils.CreateEmpty<float>(dimension);
 
            var omega = GaussianMatrix(oversampledRank, dimension, _seed);
 
            CursOpt cursorOpt = CursOpt.Features;
            if (data.Schema.Weight.HasValue)
                cursorOpt |= CursOpt.Weight;
 
            var cursorFactory = new FeatureFloatVectorCursor.Factory(data, cursorOpt);
            long numBad;
            Project(Host, cursorFactory, ref mean, omega, y, out numBad);
            if (numBad > 0)
                ch.Warning("Skipped {0} instances with missing features/weights during training", numBad);
 
            //Orthonormalize Y in-place using stabilized Gram Schmidt algorithm.
            //Ref: https://en.wikipedia.org/wiki/Gram-Schmidt#Algorithm
            for (var i = 0; i < oversampledRank; ++i)
            {
                var v = y[i];
                VectorUtils.ScaleBy(v, 1 / VectorUtils.Norm(y[i]));
 
                // Make the next vectors in the queue orthogonal to the orthonormalized vectors.
                for (var j = i + 1; j < oversampledRank; ++j) //subtract the projection of y[j] on v.
                    VectorUtils.AddMult(v, y[j], -VectorUtils.DotProduct(v, y[j]));
            }
            var q = y; // q in QR decomposition.
 
            var b = omega; // reuse the memory allocated by Omega.
            Project(Host, cursorFactory, ref mean, q, b, out numBad);
 
            //Compute B2 = B' * B
            var b2 = new float[oversampledRank * oversampledRank];
            for (var i = 0; i < oversampledRank; ++i)
            {
                for (var j = i; j < oversampledRank; ++j)
                    b2[i * oversampledRank + j] = b2[j * oversampledRank + i] = VectorUtils.DotProduct(b[i], b[j]);
            }
 
            float[] smallEigenvalues;// eigenvectors and eigenvalues of the small matrix B2.
            float[] smallEigenvectors;
            EigenUtils.EigenDecomposition(b2, out smallEigenvalues, out smallEigenvectors);
            PostProcess(b, smallEigenvalues, smallEigenvectors, dimension, oversampledRank);
 
            return new PcaModelParameters(Host, _rank, b, in mean);
        }
 
        private static float[][] Zeros(int k, int d)
        {
            float[][] rv = new float[k][];
            for (var i = 0; i < k; ++i)
                rv[i] = new float[d];
            return rv;
        }
 
        private static float[][] GaussianMatrix(int k, int d, int seed)
        {
            var rv = Zeros(k, d);
            var rng = new Random(seed);
 
            // REVIEW: use a faster Gaussian random matrix generator
            //MKL has a fast vectorized random number generation.
            for (var i = 0; i < k; ++i)
            {
                for (var j = 0; j < d; ++j)
                    rv[i][j] = (float)Stats.SampleFromGaussian(rng); // not fast for large matrix generation
            }
            return rv;
        }
 
        //Project the covariance matrix A on to Omega: Y <- A * Omega
        //A = X' * X / n, where X = data - mean
        //Note that the covariance matrix is not computed explicitly
        private static void Project(IHost host, FeatureFloatVectorCursor.Factory cursorFactory, ref VBuffer<float> mean, float[][] omega, float[][] y, out long numBad)
        {
            Contracts.AssertValue(host, "host");
            host.AssertNonEmpty(omega);
            host.Assert(Utils.Size(y) == omega.Length); // Size of Y and Omega: dimension x oversampled rank
            int numCols = omega.Length;
 
            for (int i = 0; i < y.Length; ++i)
                Array.Clear(y[i], 0, y[i].Length);
 
            bool center = mean.IsDense;
            float n = 0;
            long count = 0;
            using (var pch = host.StartProgressChannel("Project covariance matrix"))
            using (var cursor = cursorFactory.Create())
            {
                pch.SetHeader(new ProgressHeader(new[] { "rows" }), e => e.SetProgress(0, count));
                while (cursor.MoveNext())
                {
                    if (center)
                        VectorUtils.AddMult(in cursor.Features, cursor.Weight, ref mean);
                    for (int i = 0; i < numCols; i++)
                    {
                        VectorUtils.AddMult(
                            in cursor.Features,
                            y[i],
                            cursor.Weight * VectorUtils.DotProduct(omega[i], in cursor.Features));
                    }
                    n += cursor.Weight;
                    count++;
                }
                pch.Checkpoint(count);
                numBad = cursor.SkippedRowCount;
            }
 
            Contracts.Check(n > 0, "Empty training data");
            float invn = 1 / n;
 
            for (var i = 0; i < numCols; ++i)
                VectorUtils.ScaleBy(y[i], invn);
 
            if (center)
            {
                VectorUtils.ScaleBy(ref mean, invn);
                for (int i = 0; i < numCols; i++)
                    VectorUtils.AddMult(in mean, y[i], -VectorUtils.DotProduct(omega[i], in mean));
            }
        }
 
        /// <summary>
        /// Modifies <paramref name="y"/> in place so it becomes <paramref name="y"/> * eigenvectors / eigenvalues.
        /// </summary>
        // REVIEW: improve
        private static void PostProcess(float[][] y, float[] sigma, float[] z, int d, int k)
        {
            var pinv = new float[k];
            var tmp = new float[k];
 
            for (int i = 0; i < k; i++)
                pinv[i] = (float)(1.0) / ((float)(1e-6) + sigma[i]);
 
            for (int i = 0; i < d; i++)
            {
                for (int j = 0; j < k; j++)
                {
                    tmp[j] = 0;
                    for (int l = 0; l < k; l++)
                        tmp[j] += y[l][i] * z[j * k + l];
                }
                for (int j = 0; j < k; j++)
                    y[j][i] = pinv[j] * tmp[j];
            }
        }
 
        private protected override SchemaShape.Column[] GetOutputColumnsCore(SchemaShape inputSchema)
        {
            return new[]
           {
                new SchemaShape.Column(DefaultColumnNames.Score,
                        SchemaShape.Column.VectorKind.Scalar,
                        NumberDataViewType.Single,
                        false,
                        new SchemaShape(AnnotationUtils.GetTrainerOutputAnnotation())),
 
                new SchemaShape.Column(DefaultColumnNames.PredictedLabel,
                        SchemaShape.Column.VectorKind.Scalar,
                        BooleanDataViewType.Instance,
                        false,
                        new SchemaShape(AnnotationUtils.GetTrainerOutputAnnotation()))
            };
        }
 
        private protected override AnomalyPredictionTransformer<PcaModelParameters> MakeTransformer(PcaModelParameters model, DataViewSchema trainSchema)
            => new AnomalyPredictionTransformer<PcaModelParameters>(Host, model, trainSchema, _featureColumn);
 
        [TlcModule.EntryPoint(Name = "Trainers.PcaAnomalyDetector",
            Desc = "Train an PCA Anomaly model.",
            UserName = UserNameValue,
            ShortName = ShortName)]
        internal static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironment env, Options input)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register("TrainPCAAnomaly");
            host.CheckValue(input, nameof(input));
            EntryPointUtils.CheckInputArgs(host, input);
 
            return TrainerEntryPointsUtils.Train<Options, CommonOutputs.AnomalyDetectionOutput>(host, input,
                () => new RandomizedPcaTrainer(host, input),
                getWeight: () => TrainerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.ExampleWeightColumnName));
        }
    }
 
    // REVIEW: move the predictor to a different file and fold EigenUtils.cs to this file.
    /// <summary>
    /// Model parameters for <see cref="RandomizedPcaTrainer"/>.
    /// </summary>
    public sealed class PcaModelParameters : ModelParametersBase<float>,
        IValueMapper,
        ICanGetSummaryAsIDataView,
        ICanSaveInTextFormat,
        ICanSaveSummary
    {
        internal const string LoaderSignature = "pcaAnomExec";
        internal const string RegistrationName = "PCAPredictor";
 
        private static VersionInfo GetVersionInfo()
        {
            return new VersionInfo(
                modelSignature: "PCA ANOM",
                verWrittenCur: 0x00010001, // Initial
                verReadableCur: 0x00010001,
                verWeCanReadBack: 0x00010001,
                loaderSignature: LoaderSignature,
                loaderAssemblyName: typeof(PcaModelParameters).Assembly.FullName);
        }
 
        private readonly int _dimension;
        private readonly int _rank;
        private readonly VBuffer<float>[] _eigenVectors; // top-k eigenvectors of the train data's covariance matrix
        private readonly float[] _meanProjected; // for centering
        private readonly VBuffer<float> _mean; // used to compute (x-mu)^2
        private readonly float _norm2Mean;
 
        private readonly DataViewType _inputType;
 
        private protected override PredictionKind PredictionKind => PredictionKind.AnomalyDetection;
 
        /// <summary>
        /// Instantiate new model parameters from trained model.
        /// </summary>
        /// <param name="env">The host environment.</param>
        /// <param name="rank">The rank of the PCA approximation of the covariance matrix. This is the number of eigenvectors in the model.</param>
        /// <param name="eigenVectors">Array of eigenvectors.</param>
        /// <param name="mean">The mean vector of the training data.</param>
        internal PcaModelParameters(IHostEnvironment env, int rank, float[][] eigenVectors, in VBuffer<float> mean)
            : base(env, RegistrationName)
        {
            _dimension = eigenVectors[0].Length;
            _rank = rank;
            _eigenVectors = new VBuffer<float>[rank];
            _meanProjected = new float[rank];
 
            for (var i = 0; i < rank; ++i) // Only want first k
            {
                _eigenVectors[i] = new VBuffer<float>(eigenVectors[i].Length, eigenVectors[i]);
                _meanProjected[i] = VectorUtils.DotProduct(in _eigenVectors[i], in mean);
                Host.CheckParam(_eigenVectors[i].GetValues().All(FloatUtils.IsFinite),
                    nameof(eigenVectors),
                    "The learnt eigenvectors contained NaN values, consider modifying the dataset or lower the rank or oversampling parameters");
            }
 
            _mean = mean;
            _norm2Mean = VectorUtils.NormSquared(mean);
 
            _inputType = new VectorDataViewType(NumberDataViewType.Single, _dimension);
        }
 
        private PcaModelParameters(IHostEnvironment env, ModelLoadContext ctx)
            : base(env, RegistrationName, ctx)
        {
            // *** Binary format ***
            // int: dimension (aka. number of features)
            // int: rank
            // bool: center
            // If (center)
            //  float[]: mean vector
            // float[][]: eigenvectors
            _dimension = ctx.Reader.ReadInt32();
            Host.CheckDecode(FloatUtils.IsFinite(_dimension));
 
            _rank = ctx.Reader.ReadInt32();
            Host.CheckDecode(FloatUtils.IsFinite(_rank));
 
            bool center = ctx.Reader.ReadBoolByte();
            if (center)
            {
                var meanArray = ctx.Reader.ReadFloatArray(_dimension);
                Host.CheckDecode(meanArray.All(FloatUtils.IsFinite));
                _mean = new VBuffer<float>(_dimension, meanArray);
                _norm2Mean = VectorUtils.NormSquared(_mean);
            }
            else
            {
                _mean = VBufferUtils.CreateEmpty<float>(_dimension);
                _norm2Mean = 0;
            }
 
            _eigenVectors = new VBuffer<float>[_rank];
            _meanProjected = new float[_rank];
            for (int i = 0; i < _rank; ++i)
            {
                var vi = ctx.Reader.ReadFloatArray(_dimension);
                Host.CheckDecode(vi.All(FloatUtils.IsFinite));
                _eigenVectors[i] = new VBuffer<float>(_dimension, vi);
                _meanProjected[i] = VectorUtils.DotProduct(in _eigenVectors[i], in _mean);
            }
            WarnOnOldNormalizer(ctx, GetType(), Host);
 
            _inputType = new VectorDataViewType(NumberDataViewType.Single, _dimension);
        }
 
        private protected override void SaveCore(ModelSaveContext ctx)
        {
            base.SaveCore(ctx);
            ctx.SetVersionInfo(GetVersionInfo());
            var writer = ctx.Writer;
 
            // *** Binary format ***
            // int: dimension (aka. number of features)
            // int: rank
            // bool: center
            // If (center)
            //  float[]: mean vector
            // float[][]: eigenvectors
            writer.Write(_dimension);
            writer.Write(_rank);
 
            if (_mean.IsDense) // centered
            {
                writer.WriteBoolByte(true);
                writer.WriteSinglesNoCount(_mean.GetValues().Slice(0, _dimension));
            }
            else
                writer.WriteBoolByte(false);
 
            for (int i = 0; i < _rank; ++i)
                writer.WriteSinglesNoCount(_eigenVectors[i].GetValues().Slice(0, _dimension));
        }
 
        internal static PcaModelParameters Create(IHostEnvironment env, ModelLoadContext ctx)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel(GetVersionInfo());
            return new PcaModelParameters(env, ctx);
        }
 
        void ICanSaveSummary.SaveSummary(TextWriter writer, RoleMappedSchema schema)
        {
            ((ICanSaveInTextFormat)this).SaveAsText(writer, schema);
        }
 
        void ICanSaveInTextFormat.SaveAsText(TextWriter writer, RoleMappedSchema schema)
        {
            writer.WriteLine("Dimension: {0}", _dimension);
            writer.WriteLine("Rank: {0}", _rank);
 
            if (_mean.IsDense)
            {
                writer.Write("Mean vector:");
                foreach (var value in _mean.Items(all: true))
                    writer.Write(" {0}", value.Value);
 
                writer.WriteLine();
                writer.Write("Projected mean vector:");
                foreach (var value in _meanProjected)
                    writer.Write(" {0}", value);
            }
 
            writer.WriteLine();
            writer.WriteLine("# V");
            for (var i = 0; i < _rank; ++i)
            {
                VBufferUtils.ForEachDefined(in _eigenVectors[i],
                    (ind, val) => { if (val != 0) writer.Write(" {0}:{1}", ind, val); });
                writer.WriteLine();
            }
        }
 
        IDataView ICanGetSummaryAsIDataView.GetSummaryDataView(RoleMappedSchema schema)
        {
            var bldr = new ArrayDataViewBuilder(Host);
 
            var cols = new VBuffer<float>[_rank + 1];
            var names = new string[_rank + 1];
            for (var i = 0; i < _rank; ++i)
            {
                names[i] = "EigenVector" + i;
                cols[i] = _eigenVectors[i];
            }
            names[_rank] = "MeanVector";
            cols[_rank] = _mean;
 
            bldr.AddColumn("VectorName", names);
            bldr.AddColumn("VectorData", NumberDataViewType.Single, cols);
 
            return bldr.GetDataView();
        }
 
        DataViewType IValueMapper.InputType
        {
            get { return _inputType; }
        }
 
        DataViewType IValueMapper.OutputType
        {
            get { return NumberDataViewType.Single; }
        }
 
        ValueMapper<TIn, TOut> IValueMapper.GetMapper<TIn, TOut>()
        {
            Host.Check(typeof(TIn) == typeof(VBuffer<float>));
            Host.Check(typeof(TOut) == typeof(float));
 
            ValueMapper<VBuffer<float>, float> del =
                (in VBuffer<float> src, ref float dst) =>
                {
                    Host.Check(src.Length == _dimension);
                    dst = Score(in src);
                };
            return (ValueMapper<TIn, TOut>)(Delegate)del;
        }
 
        private float Score(in VBuffer<float> src)
        {
            Host.Assert(src.Length == _dimension);
 
            // REVIEW: Can this be done faster in a single pass over src and _mean?
            var mean = _mean;
            float norm2X = VectorUtils.NormSquared(in src) -
                2 * VectorUtils.DotProduct(in mean, in src) + _norm2Mean;
            // Because the distance between src and _mean is computed using the above expression, the result
            // may be negative due to round off error. If this happens, we let the distance be 0.
            if (norm2X < 0)
                norm2X = 0;
 
            float norm2U = 0;
            for (int i = 0; i < _rank; i++)
            {
                float component = VectorUtils.DotProduct(in _eigenVectors[i], in src) - _meanProjected[i];
                norm2U += component * component;
            }
 
            return MathUtils.Sqrt((norm2X - norm2U) / norm2X); // normalized error
        }
 
        /// <summary>
        /// Copies the top eigenvectors of the covariance matrix of the training data
        /// into a set of buffers.
        /// </summary>
        /// <param name="vectors">A possibly reusable set of vectors, which will
        /// be expanded as necessary to accommodate the data.</param>
        /// <param name="rank">Set to the rank, which is also the logical length
        /// of <paramref name="vectors"/>.</param>
        public void GetEigenVectors(ref VBuffer<float>[] vectors, out int rank)
        {
            rank = _eigenVectors.Length;
            Utils.EnsureSize(ref vectors, _eigenVectors.Length, _eigenVectors.Length);
            for (int i = 0; i < _eigenVectors.Length; i++)
                _eigenVectors[i].CopyTo(ref vectors[i]);
        }
 
        /// <summary>
        /// Copies the mean vector of the training data.
        /// </summary>
        public void GetMean(ref VBuffer<float> mean)
        {
            _mean.CopyTo(ref mean);
        }
    }
}
File: PcaTrainer.cs	Web Access
Project: src\src\Microsoft.ML.PCA\Microsoft.ML.PCA.csproj (Microsoft.ML.PCA)