File: DataTransformation.cs
Web Access
Project: src\test\Microsoft.ML.IntegrationTests\Microsoft.ML.IntegrationTests.csproj (Microsoft.ML.IntegrationTests)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using Microsoft.ML.IntegrationTests.Datasets;
using Microsoft.ML.TestFrameworkCommon;
using Microsoft.ML.Trainers;
using Microsoft.ML.Transforms.Text;
using Xunit;
using Xunit.Abstractions;
using static Microsoft.ML.Transforms.HashingEstimator;
 
namespace Microsoft.ML.IntegrationTests
{
    public class DataTransformation : IntegrationTestBaseClass
    {
        public DataTransformation(ITestOutputHelper output) : base(output)
        {
        }
 
        /// <summary>
        /// Extensibility: Add a new column that is a function of other columns.
        /// </summary>
        [Fact]
        public void ExtensibilityAddAColumnAsAFunctionOfMultipleColumns()
        {
            // Concurrency must be 1 to assure that the mapping is done sequentially.
            var mlContext = new MLContext(seed: 1);
 
            // Load the Iris dataset
            var data = mlContext.Data.LoadFromTextFile<Iris>(
                TestCommon.GetDataPath(DataDir, TestDatasets.iris.trainFilename),
                hasHeader: TestDatasets.iris.fileHasHeader,
                separatorChar: TestDatasets.iris.fileSeparator);
 
            // Subsample it down to the first 10 rows.
            int numSamples = 10;
            data = mlContext.Data.TakeRows(data, numSamples);
 
            // Create a stand-alone function to produce a random number.
            static float angiospermCosine(float petalWidth, float petalLength, float sepalWidth, float sepalLength)
            {
                var petalMagnitude = Math.Sqrt(petalWidth * petalWidth + petalLength * petalLength);
                var sepalMagnitude = Math.Sqrt(sepalWidth * sepalWidth + sepalLength * sepalLength);
                return (float)((petalWidth * sepalWidth + petalLength * sepalLength) / (petalMagnitude * sepalMagnitude));
            }
 
            // Create a function that generates a column.
            Action<Iris, IrisWithOneExtraColumn> generateGroupId = (input, output) =>
            {
                output.Label = input.Label;
                output.Float1 = angiospermCosine(input.PetalLength, input.PetalWidth, input.SepalLength, input.SepalWidth);
                output.PetalLength = input.PetalLength;
                output.PetalWidth = input.PetalWidth;
                output.SepalLength = input.SepalLength;
                output.SepalWidth = input.SepalWidth;
            };
 
            // Create a pipeline to execute the custom function.
            var pipeline = mlContext.Transforms.CustomMapping(generateGroupId, null);
 
            // Transform the data.
            var transformedData = pipeline.Fit(data).Transform(data);
 
            // Verify that the column has the correct data.
            var transformedRows = mlContext.Data.CreateEnumerable<IrisWithOneExtraColumn>(transformedData, reuseRowObject: true);
            foreach (var row in transformedRows)
            {
                var cosineDistance = angiospermCosine(row.PetalLength, row.PetalWidth, row.SepalLength, row.SepalWidth);
                Assert.Equal(cosineDistance, row.Float1);
            }
        }
 
        /// <summary>
        /// Extensibility: Add multiple new columns.
        /// </summary>
        [Fact]
        public void ExtensibilityAddingTwoColumns()
        {
            // Concurrency must be 1 to assure that the mapping is done sequentially.
            var mlContext = new MLContext(seed: 1);
 
            // Load the Iris dataset
            var data = mlContext.Data.LoadFromTextFile<Iris>(
                TestCommon.GetDataPath(DataDir, TestDatasets.iris.trainFilename),
                hasHeader: TestDatasets.iris.fileHasHeader,
                separatorChar: TestDatasets.iris.fileSeparator);
 
            // Subsample it down to the first 10 rows.
            int numSamples = 10;
            data = mlContext.Data.TakeRows(data, numSamples);
 
            // Create a function that generates a column.
            Action<Iris, IrisWithTwoExtraColumns> generateGroupId = (input, output) =>
            {
                output.Label = input.Label;
                output.Float1 = GetRandomNumber(1 + input.Label + input.PetalLength + input.PetalWidth + input.SepalLength + input.SepalWidth);
                output.Float2 = GetRandomNumber(2 + input.Label + input.PetalLength + input.PetalWidth + input.SepalLength + input.SepalWidth);
                output.PetalLength = input.PetalLength;
                output.PetalWidth = input.PetalWidth;
                output.SepalLength = input.SepalLength;
                output.SepalWidth = input.SepalWidth;
            };
 
            // Create a pipeline to execute the custom function.
            var pipeline = mlContext.Transforms.CustomMapping(generateGroupId, null);
 
            // Transform the data.
            var transformedData = pipeline.Fit(data).Transform(data);
 
            // Verify that the column has the correct data.
            var transformedRows = mlContext.Data.CreateEnumerable<IrisWithTwoExtraColumns>(transformedData, reuseRowObject: true);
            foreach (var row in transformedRows)
            {
                var randomNumber1 = GetRandomNumber(1 + row.Label + row.PetalLength + row.PetalWidth + row.SepalLength + row.SepalWidth);
                var randomNumber2 = GetRandomNumber(2 + row.Label + row.PetalLength + row.PetalWidth + row.SepalLength + row.SepalWidth);
                Assert.Equal(randomNumber1, row.Float1);
                Assert.Equal(randomNumber2, row.Float2);
            }
        }
 
        /// <summary>
        /// Extensibility: Featurize text using custom word-grams, char-grams, and normalization.
        /// </summary>
        [Fact]
        public void ExtensibilityModifyTextFeaturization()
        {
            // Concurrency must be 1 to assure that the mapping is done sequentially.
            var mlContext = new MLContext(seed: 1);
 
            var data = mlContext.Data.LoadFromTextFile<TweetSentiment>(TestCommon.GetDataPath(DataDir, TestDatasets.Sentiment.trainFilename),
                hasHeader: TestDatasets.Sentiment.fileHasHeader,
                separatorChar: TestDatasets.Sentiment.fileSeparator);
 
            // Create a training pipeline.
            var pipeline = mlContext.Transforms.Text.FeaturizeText("Features",
                    new TextFeaturizingEstimator.Options
                    {
                        CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false },
                        WordFeatureExtractor = new WordBagEstimator.Options(),
                        Norm = TextFeaturizingEstimator.NormFunction.L1
                    }, "SentimentText")
                .AppendCacheCheckpoint(mlContext)
                .Append(mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
                    new SdcaLogisticRegressionBinaryTrainer.Options { NumberOfThreads = 1 }));
 
            // Train the model.
            var model = pipeline.Fit(data);
 
            // Evaluate the model.
            var scoredData = model.Transform(data);
            var metrics = mlContext.BinaryClassification.Evaluate(scoredData);
 
            // Check that the metrics returned are valid.
            Common.AssertMetrics(metrics);
        }
 
        /// <summary>
        /// Extensibility: Apply a normalizer to columns in the dataset.
        /// </summary>
        [Fact]
        public void ExtensibilityNormalizeColumns()
        {
            // Concurrency must be 1 to assure that the mapping is done sequentially.
            var mlContext = new MLContext(seed: 1);
 
            // Load the Iris dataset.
            var data = mlContext.Data.LoadFromTextFile<Iris>(
                TestCommon.GetDataPath(DataDir, TestDatasets.iris.trainFilename),
                hasHeader: TestDatasets.iris.fileHasHeader,
                separatorChar: TestDatasets.iris.fileSeparator);
 
            // Compose the transformation.
            var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features)
                .Append(mlContext.Transforms.NormalizeMinMax("Features"));
 
            // Transform the data.
            var transformedData = pipeline.Fit(data).Transform(data);
 
            // Validate that the data was normalized to between -1 and 1.
            var dataEnumerator = mlContext.Data.CreateEnumerable<FeatureColumn>(transformedData, true);
            foreach (var row in dataEnumerator)
                // Verify per-slot normalization.
                for (int i = 0; i < row.Features.Length; i++)
                    Assert.InRange(row.Features[i], -1, 1);
        }
 
        [Fact]
        void HashColumns()
        {
            // Concurrency must be 1 to assure that the mapping is done sequentially.
            var mlContext = new MLContext(seed: 1);
 
            // Load the Iris dataset.
            var data = mlContext.Data.LoadFromTextFile<Iris>(
                TestCommon.GetDataPath(DataDir, TestDatasets.iris.trainFilename),
                hasHeader: TestDatasets.iris.fileHasHeader,
                separatorChar: TestDatasets.iris.fileSeparator);
 
            // Compose the transformation.
            var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features)
                .Append(mlContext.Transforms.Conversion.Hash(new[] {
                    new ColumnOptions("Features", "Features", 31, useOrderedHashing: true) }));
 
            // Transform the data.
            var transformedData = pipeline.Fit(data).Transform(data);
 
            // Validate that the data was normalized to between -1 and 1.
            var dataEnumerator = mlContext.Data.CreateEnumerable<HashedFeatureColumn>(transformedData, true);
            foreach (var row in dataEnumerator)
                // Verify per-slot normalization.
                for (int i = 0; i < row.Features.Length; i++)
                    Assert.InRange(row.Features[i], (uint)0, (uint)Math.Pow(2, 31));
        }
 
        private float GetRandomNumber(float number)
        {
            var seed = (int)(10 * number);
            var rng = new Random(seed);
            return (float)rng.NextDouble();
        }
    }
}