// File: AnomalyDetectionTests.cs
// Web Access
// Project: src\test\Microsoft.ML.Tests\Microsoft.ML.Tests.csproj (Microsoft.ML.Tests)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.RunTests;
using Microsoft.ML.TestFrameworkCommon;
using Xunit;
using Xunit.Abstractions;
 
namespace Microsoft.ML.Tests
{
    public class AnomalyDetectionTests : TestDataPipeBase
    {
        /// <summary>
        /// Creates the test class and forwards the xUnit output helper to the base class constructor.
        /// </summary>
        /// <param name="output">Output helper handed to <see cref="TestDataPipeBase"/>.</param>
        public AnomalyDetectionTests(ITestOutputHelper output)
            : base(output)
        {
        }
 
        /// <summary>
        /// Baseline test for the randomized PCA anomaly detector on the MNIST one-class dataset:
        /// the metrics computed on the scored test file must match the recorded values.
        /// </summary>
        [Fact]
        public void RandomizedPcaTrainerBaselineTest()
        {
            // Score the test file with a model fitted on the training file.
            IDataView scoredTestData = DetectAnomalyInMnistOneClass(
                GetDataPath(TestDatasets.mnistOneClass.trainFilename),
                GetDataPath(TestDatasets.mnistOneClass.testFilename));

            var evaluation = ML.AnomalyDetection.Evaluate(scoredTestData, falsePositiveCount: 5);

            // Compare against the recorded baseline values.
            Assert.Equal(0.98667, evaluation.AreaUnderRocCurve, 0.00001);
            Assert.Equal(0.90000, evaluation.DetectionRateAtFalsePositiveCount, 0.00001);
        }
 
        /// <summary>
        /// Evaluating predictions made on test data without anomalies is expected to throw
        /// <see cref="ArgumentOutOfRangeException"/>.
        /// </summary>
        [Fact]
        public void NoAnomalyTest()
        {
            // The training file doubles as the test file, which is how this test obtains
            // anomaly-free test data.
            var trainingFile = GetDataPath(TestDatasets.mnistOneClass.trainFilename);
            IDataView scored = DetectAnomalyInMnistOneClass(trainingFile, trainingFile);

            Assert.Throws<ArgumentOutOfRangeException>(
                () => ML.AnomalyDetection.Evaluate(scored));
        }
 
        /// <summary>
        /// Runs the in-memory randomized PCA scenario twice: once with a trainer configured through
        /// individual arguments and once with a trainer configured through an options object.
        /// </summary>
        [Fact]
        public static void RandomizedPcaInMemory()
        {
            // The context serves as the catalog of ML.NET operations and as the source of randomness;
            // a fixed seed keeps the outputs deterministic.
            var context = new MLContext(seed: 0);

            // Detector #1: randomized PCA configured through named arguments.
            var argumentTrainer = context.AnomalyDetection.Trainers.RandomizedPca(
                featureColumnName: nameof(DataPoint.Features),
                rank: 1,
                ensureZeroMean: false);
            ExecutePipelineWithGivenRandomizedPcaTrainer(context, argumentTrainer);

            // Detector #2: randomized PCA configured through an options object, with an explicit seed.
            var trainerOptions = new Trainers.RandomizedPcaTrainer.Options
            {
                FeatureColumnName = nameof(DataPoint.Features),
                Rank = 1,
                EnsureZeroMean = false,
                Seed = 10
            };
            var optionsTrainer = context.AnomalyDetection.Trainers.RandomizedPca(trainerOptions);
            ExecutePipelineWithGivenRandomizedPcaTrainer(context, optionsTrainer);
        }
 
        /// <summary>
        /// Runs the changed-threshold scenario twice: once with a trainer configured through individual
        /// arguments and once with a trainer configured through an options object.
        /// </summary>
        [Fact]
        public static void RandomizedPcaChangeThreshold()
        {
            // The context serves as the catalog of ML.NET operations and as the source of randomness;
            // a fixed seed keeps the outputs deterministic.
            var context = new MLContext(seed: 0);

            // Detector #1: randomized PCA configured through named arguments.
            var argumentTrainer = context.AnomalyDetection.Trainers.RandomizedPca(
                featureColumnName: nameof(DataPoint.Features),
                rank: 1,
                ensureZeroMean: false);
            ExecuteRandomizedPcaTrainerChangeThreshold(context, argumentTrainer);

            // Detector #2: randomized PCA configured through an options object, with an explicit seed.
            var trainerOptions = new Trainers.RandomizedPcaTrainer.Options
            {
                FeatureColumnName = nameof(DataPoint.Features),
                Rank = 1,
                EnsureZeroMean = false,
                Seed = 10
            };
            var optionsTrainer = context.AnomalyDetection.Trainers.RandomizedPca(trainerOptions);
            ExecuteRandomizedPcaTrainerChangeThreshold(context, optionsTrainer);
        }
 
        /// <summary>
        /// Input row for the in-memory tests in this class: a single vector column holding three floats.
        /// </summary>
        private sealed class DataPoint
        {
            /// <summary>Feature values; declared as a vector of size 3.</summary>
            [VectorType(3)]
            public float[] Features { get; set; }
        }
 
        /// <summary>
        /// Row type used to read predictions for <see cref="DataPoint"/> rows back out of a transformed
        /// data view in the in-memory randomized PCA tests of this class.
        /// </summary>
        private sealed class Result
        {
            /// <summary>
            /// Predicted label. The tests in this class assert <see langword="true"/> for the outlier rows
            /// and <see langword="false"/> for the inlier rows.
            /// </summary>
            public bool PredictedLabel { get; set; }

            /// <summary>
            /// Anomaly score. The tests in this class assert that outlier rows score in the upper range
            /// (threshold to 1) and inlier rows in the lower range (0 to threshold), i.e. a larger score
            /// means more anomalous.
            /// </summary>
            public float Score { get; set; }
        }
 
        /// <summary>
        /// Helper used by <see cref="RandomizedPcaInMemory"/>: fits <paramref name="trainer"/> on a small
        /// in-memory dataset, applies the fitted model to that same dataset and checks the predictions.
        /// </summary>
        /// <param name="mlContext">Context used to build the data view and to read the predictions back.</param>
        /// <param name="trainer">Randomized PCA trainer to fit and verify.</param>
        private static void ExecutePipelineWithGivenRandomizedPcaTrainer(MLContext mlContext, Trainers.RandomizedPcaTrainer trainer)
        {
            // The first seven rows share the leading feature values (0, 2); the last row (1, 0, 0)
            // is the one expected to be flagged as an outlier.
            var samples = new List<DataPoint>()
            {
                new DataPoint(){ Features = new float[3] {0, 2, 1} },
                new DataPoint(){ Features = new float[3] {0, 2, 3} },
                new DataPoint(){ Features = new float[3] {0, 2, 4} },
                new DataPoint(){ Features = new float[3] {0, 2, 1} },
                new DataPoint(){ Features = new float[3] {0, 2, 2} },
                new DataPoint(){ Features = new float[3] {0, 2, 3} },
                new DataPoint(){ Features = new float[3] {0, 2, 4} },
                new DataPoint(){ Features = new float[3] {1, 0, 0} }
            };

            // Convert the List<DataPoint> to IDataView, a consumable format to ML.NET functions.
            var data = mlContext.Data.LoadFromEnumerable(samples);

            // Train the anomaly detector.
            var model = trainer.Fit(data);

            // Apply the trained model on the training data.
            var transformed = model.Transform(data);

            // Read ML.NET predictions into a list of Result rows.
            var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();

            // Fail with a clear message, rather than an index error below, if rows went missing.
            Assert.Equal(samples.Count, results.Count);

            // Every example except the last one is an inlier.
            int outlierIndex = samples.Count - 1;
            for (int i = 0; i < outlierIndex; ++i)
            {
                // Inlier should be predicted as false.
                Assert.False(results[i].PredictedLabel);
                // Inliers are expected to score in the lower range.
                Assert.InRange(results[i].Score, 0, 0.5);
            }

            // Last example is the outlier: predicted as true, with a score in the upper range.
            Assert.True(results[outlierIndex].PredictedLabel);
            Assert.InRange(results[outlierIndex].Score, 0.5, 1);
        }
 
 
        /// <summary>
        /// Helper used by <see cref="RandomizedPcaChangeThreshold"/>: fits <paramref name="trainer"/> on a
        /// small in-memory dataset, changes the model threshold to 0.3 and checks the predictions made
        /// on that same dataset.
        /// </summary>
        /// <param name="mlContext">Context used to build the data view, change the threshold and read the predictions back.</param>
        /// <param name="trainer">Randomized PCA trainer to fit and verify.</param>
        private static void ExecuteRandomizedPcaTrainerChangeThreshold(MLContext mlContext, Trainers.RandomizedPcaTrainer trainer)
        {
            var samples = new List<DataPoint>()
            {
                new DataPoint(){ Features = new float[3] {0, 2, 1} },
                new DataPoint(){ Features = new float[3] {0, 2, 3} },
                new DataPoint(){ Features = new float[3] {0, 2, 4} },
                new DataPoint(){ Features = new float[3] {0, 2, 1} },
                new DataPoint(){ Features = new float[3] {0, 2, 2} },
                new DataPoint(){ Features = new float[3] {0, 2, 3} },
                new DataPoint(){ Features = new float[3] {0, 2, 4} },
                new DataPoint(){ Features = new float[3] {1, 0, 0} }
            };

            // Convert the List<DataPoint> to IDataView, a consumable format to ML.NET functions.
            var data = mlContext.Data.LoadFromEnumerable(samples);

            // Train the anomaly detector.
            var model = trainer.Fit(data);

            // Obtain a transformer from the trained model with the threshold set to 0.3.
            var transformer = mlContext.AnomalyDetection.ChangeModelThreshold(model, 0.3f);

            // Apply the re-thresholded model on the training data.
            var transformed = transformer.Transform(data);

            // Read ML.NET predictions into a list of Result rows.
            var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();

            // Expected prediction per input row, in sample order (true = outlier).
            var expectedOutlier = new[] { true, false, false, true, false, false, false, true };

            // Fail with a clear message, rather than an index error below, if rows went missing.
            Assert.Equal(expectedOutlier.Length, results.Count);

            for (int i = 0; i < expectedOutlier.Length; ++i)
            {
                if (expectedOutlier[i])
                {
                    // Outlier should be predicted as true, scoring between the threshold and 1.
                    Assert.True(results[i].PredictedLabel);
                    Assert.InRange(results[i].Score, 0.3, 1);
                }
                else
                {
                    // Inlier should be predicted as false, scoring between 0 and the threshold.
                    Assert.False(results[i].PredictedLabel);
                    Assert.InRange(results[i].Score, 0, 0.3);
                }
            }
        }
 
        /// <summary>
        /// Fits a default randomized PCA trainer on the file at <paramref name="trainPath"/> and returns
        /// the result of applying the fitted model to the file at <paramref name="testPath"/>.
        /// </summary>
        /// <param name="trainPath">Path of the text file used for training.</param>
        /// <param name="testPath">Path of the text file to transform with the trained model.</param>
        /// <returns>The transformed test data.</returns>
        private IDataView DetectAnomalyInMnistOneClass(string trainPath, string testPath)
        {
            // Label is read from index 0; features are declared with indices 1 and 784
            // (min/max index per the TextLoader.Column constructor).
            TextLoader.Column[] schema =
            {
                new TextLoader.Column(DefaultColumnNames.Label, DataKind.Single, 0),
                new TextLoader.Column(DefaultColumnNames.Features, DataKind.Single, 1, 784)
            };
            var textLoader = ML.Data.CreateTextLoader(schema, allowSparse: true);

            // Both files are loaded with the same loader before the trainer is created.
            var trainingView = textLoader.Load(trainPath);
            var testView = textLoader.Load(testPath);

            var fitted = ML.AnomalyDetection.Trainers.RandomizedPca().Fit(trainingView);
            return fitted.Transform(testView);
        }
 
        /// <summary>
        /// Check that when PCA created invalid eigenvectors with NaNs a readable exception message is thrown.
        /// </summary>
        [Fact]
        public void PcaTrainerInvalidEigenvectorsException()
        {
            var mlContext = new MLContext(seed: 0);

            var trainer = mlContext.AnomalyDetection.Trainers.RandomizedPca(
                featureColumnName: nameof(DataPoint.Features), rank: 3);

            // Every row is a multiple of (1, 0, 2), so all rows are linearly dependent.
            var samples = new List<DataPoint>()
            {
                new DataPoint(){ Features = new float[3] {1, 0, 2} },
                new DataPoint(){ Features = new float[3] {2, 0, 4} },
                new DataPoint(){ Features = new float[3] {4, 0, 8} },
                new DataPoint(){ Features = new float[3] {8, 0, 16} }
            };

            var data = mlContext.Data.LoadFromEnumerable(samples);

            // With linearly dependent rows, extracting 3 eigenvectors is expected to fail and
            // produce eigenvectors with NaN, which the trainer must report as an exception.
            // ThrowsAny accepts ArgumentOutOfRangeException and types derived from it, which is
            // what a catch clause for that type would accept.
            var exception = Assert.ThrowsAny<ArgumentOutOfRangeException>(() => { trainer.Fit(data); });

            Assert.Contains("The learnt eigenvectors contained NaN values", exception.Message);
        }
    }
}