File: DataFrameTests.Computations.cs
Web Access
Project: src\test\Microsoft.Data.Analysis.Tests\Microsoft.Data.Analysis.Tests.csproj (Microsoft.Data.Analysis.Tests)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Microsoft.ML.TestFramework;
using Xunit;
using Xunit.Abstractions;
 
namespace Microsoft.Data.Analysis.Tests
{
    public partial class DataFrameTests : BaseTestClass
    {
        [Fact]
        public void TestComputations()
        {
            DataFrame df = MakeDataFrameWithAllMutableColumnTypes(10);
            df["Int"][0] = -10;
            Assert.Equal(-10, df.Columns["Int"][0]);
 
            DataFrameColumn absColumn = df.Columns["Int"].Abs();
            Assert.Equal(10, absColumn[0]);
            Assert.Equal(-10, df.Columns["Int"][0]);
            df.Columns["Int"].Abs(true);
            Assert.Equal(10, df.Columns["Int"][0]);
 
            Assert.Throws<NotSupportedException>(() => df.Columns["Byte"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Byte"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Char"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Char"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["DateTime"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["DateTime"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Decimal"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Decimal"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Double"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Double"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Float"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Float"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Int"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Int"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Long"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Long"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Sbyte"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Sbyte"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Short"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Short"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Uint"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Uint"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Ulong"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Ulong"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Ushort"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Ushort"].Any());
 
            bool any = df.Columns["Bool"].Any();
            bool all = df.Columns["Bool"].All();
            Assert.True(any);
            Assert.False(all);
 
            // Test the computation results
            df.Columns["Double"][0] = 100.0;
            DataFrameColumn doubleColumn = df.Columns["Double"].CumulativeMax();
            for (int i = 0; i < doubleColumn.Length; i++)
            {
                if (i == 5)
                    Assert.Null(doubleColumn[i]);
                else
                    Assert.Equal(100.0, (double)doubleColumn[i]);
            }
            Assert.Equal(1.0, df.Columns["Double"][1]);
            df.Columns["Double"].CumulativeMax(true);
            for (int i = 0; i < df.Columns["Double"].Length; i++)
            {
                if (i == 5)
                    Assert.Null(df.Columns["Double"][i]);
                else
                    Assert.Equal(100.0, (double)df.Columns["Double"][i]);
            }
 
            df.Columns["Float"][0] = -10.0f;
            DataFrameColumn floatColumn = df.Columns["Float"].CumulativeMin();
            for (int i = 0; i < floatColumn.Length; i++)
            {
                if (i == 5)
                    Assert.Null(floatColumn[i]);
                else
                    Assert.Equal(-10.0f, (float)floatColumn[i]);
            }
            Assert.Equal(9.0f, df.Columns["Float"][9]);
            df.Columns["Float"].CumulativeMin(true);
            for (int i = 0; i < df.Columns["Float"].Length; i++)
            {
                if (i == 5)
                    Assert.Null(df.Columns["Float"][i]);
                else
                    Assert.Equal(-10.0f, (float)df.Columns["Float"][i]);
            }
 
            DataFrameColumn uintColumn = df.Columns["Uint"].CumulativeProduct();
            Assert.Equal((uint)0, uintColumn[8]);
            Assert.Equal((uint)8, df.Columns["Uint"][8]);
            df.Columns["Uint"].CumulativeProduct(true);
            Assert.Equal((uint)0, df.Columns["Uint"][9]);
 
            DataFrameColumn ushortColumn = df.Columns["Ushort"].CumulativeSum();
            Assert.Equal((ushort)40, ushortColumn[9]);
            Assert.Equal((ushort)9, df.Columns["Ushort"][9]);
            df.Columns["Ushort"].CumulativeSum(true);
            Assert.Equal((ushort)40, df.Columns["Ushort"][9]);
 
            Assert.Equal(100.0, df.Columns["Double"].Max());
            Assert.Equal(-10.0f, df.Columns["Float"].Min());
            Assert.Equal((uint)0, df.Columns["Uint"].Product());
            Assert.Equal((ushort)130, df.Columns["Ushort"].Sum());
 
            df.Columns["Double"][0] = 100.1;
            Assert.Equal(100.1, df.Columns["Double"][0]);
            DataFrameColumn roundColumn = df.Columns["Double"].Round();
            Assert.Equal(100.0, roundColumn[0]);
            Assert.Equal(100.1, df.Columns["Double"][0]);
            df.Columns["Double"].Round(true);
            Assert.Equal(100.0, df.Columns["Double"][0]);
 
            // Test that none of the numeric column types throw
            for (int i = 0; i < df.Columns.Count; i++)
            {
                DataFrameColumn column = df.Columns[i];
                if (column.DataType == typeof(bool))
                {
                    Assert.Throws<NotSupportedException>(() => column.CumulativeMax());
                    Assert.Throws<NotSupportedException>(() => column.CumulativeMin());
                    Assert.Throws<NotSupportedException>(() => column.CumulativeProduct());
                    Assert.Throws<NotSupportedException>(() => column.CumulativeSum());
                    Assert.Throws<NotSupportedException>(() => column.Max());
                    Assert.Throws<NotSupportedException>(() => column.Min());
                    Assert.Throws<NotSupportedException>(() => column.Product());
                    Assert.Throws<NotSupportedException>(() => column.Sum());
                    continue;
                }
                else if (column.DataType == typeof(string))
                {
                    Assert.Throws<NotImplementedException>(() => column.CumulativeMax());
                    Assert.Throws<NotImplementedException>(() => column.CumulativeMin());
                    Assert.Throws<NotImplementedException>(() => column.CumulativeProduct());
                    Assert.Throws<NotImplementedException>(() => column.CumulativeSum());
                    Assert.Throws<NotImplementedException>(() => column.Max());
                    Assert.Throws<NotImplementedException>(() => column.Min());
                    Assert.Throws<NotImplementedException>(() => column.Product());
                    Assert.Throws<NotImplementedException>(() => column.Sum());
                    continue;
                }
                else if (column.DataType == typeof(DateTime))
                {
                    column.CumulativeMax();
                    column.CumulativeMin();
                    column.Max();
                    column.Min();
 
                    Assert.Throws<NotSupportedException>(() => column.CumulativeProduct());
                    Assert.Throws<NotSupportedException>(() => column.CumulativeSum());
                    Assert.Throws<NotSupportedException>(() => column.Product());
                    Assert.Throws<NotSupportedException>(() => column.Sum());
                    continue;
                }
 
                column.CumulativeMax();
                column.CumulativeMin();
                column.CumulativeProduct();
                column.CumulativeSum();
                column.Max();
                column.Min();
                column.Product();
                column.Sum();
            }
        }
 
        [Fact]
        public void TestComputationsIncludingDateTime()
        {
            DataFrame df = MakeDataFrameWithNumericStringAndDateTimeColumns(10);
            df["Int"][0] = -10;
            Assert.Equal(-10, df.Columns["Int"][0]);
 
            DataFrameColumn absColumn = df.Columns["Int"].Abs();
            Assert.Equal(10, absColumn[0]);
            Assert.Equal(-10, df.Columns["Int"][0]);
            df.Columns["Int"].Abs(true);
            Assert.Equal(10, df.Columns["Int"][0]);
 
            Assert.Throws<NotSupportedException>(() => df.Columns["Byte"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Byte"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Char"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Char"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Decimal"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Decimal"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Double"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Double"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Float"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Float"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Int"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Int"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Long"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Long"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Sbyte"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Sbyte"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Short"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Short"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Uint"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Uint"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Ulong"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Ulong"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["Ushort"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["Ushort"].Any());
            Assert.Throws<NotSupportedException>(() => df.Columns["DateTime"].All());
            Assert.Throws<NotSupportedException>(() => df.Columns["DateTime"].Any());
 
            // Test the computation results
            var maxDate = SampleDateTime.AddDays(100);
            df.Columns["DateTime"][0] = maxDate;
            DataFrameColumn dateTimeColumn = df.Columns["DateTime"].CumulativeMax();
            for (int i = 0; i < dateTimeColumn.Length; i++)
            {
                if (i == 5)
                    Assert.Null(dateTimeColumn[i]);
                else
                    Assert.Equal(maxDate, (DateTime)dateTimeColumn[i]);
            }
            Assert.Equal(maxDate, dateTimeColumn.Max());
 
            df.Columns["Double"][0] = 100.0;
            DataFrameColumn doubleColumn = df.Columns["Double"].CumulativeMax();
            for (int i = 0; i < doubleColumn.Length; i++)
            {
                if (i == 5)
                    Assert.Null(doubleColumn[i]);
                else
                    Assert.Equal(100.0, (double)doubleColumn[i]);
            }
            Assert.Equal(1.0, df.Columns["Double"][1]);
            df.Columns["Double"].CumulativeMax(true);
            for (int i = 0; i < df.Columns["Double"].Length; i++)
            {
                if (i == 5)
                    Assert.Null(df.Columns["Double"][i]);
                else
                    Assert.Equal(100.0, (double)df.Columns["Double"][i]);
            }
 
            df.Columns["Float"][0] = -10.0f;
            DataFrameColumn floatColumn = df.Columns["Float"].CumulativeMin();
            for (int i = 0; i < floatColumn.Length; i++)
            {
                if (i == 5)
                    Assert.Null(floatColumn[i]);
                else
                    Assert.Equal(-10.0f, (float)floatColumn[i]);
            }
            Assert.Equal(9.0f, df.Columns["Float"][9]);
            df.Columns["Float"].CumulativeMin(true);
            for (int i = 0; i < df.Columns["Float"].Length; i++)
            {
                if (i == 5)
                    Assert.Null(df.Columns["Float"][i]);
                else
                    Assert.Equal(-10.0f, (float)df.Columns["Float"][i]);
            }
 
            DataFrameColumn uintColumn = df.Columns["Uint"].CumulativeProduct();
            Assert.Equal((uint)0, uintColumn[8]);
            Assert.Equal((uint)8, df.Columns["Uint"][8]);
            df.Columns["Uint"].CumulativeProduct(true);
            Assert.Equal((uint)0, df.Columns["Uint"][9]);
 
            DataFrameColumn ushortColumn = df.Columns["Ushort"].CumulativeSum();
            Assert.Equal((ushort)40, ushortColumn[9]);
            Assert.Equal((ushort)9, df.Columns["Ushort"][9]);
            df.Columns["Ushort"].CumulativeSum(true);
            Assert.Equal((ushort)40, df.Columns["Ushort"][9]);
 
            Assert.Equal(100.0, df.Columns["Double"].Max());
            Assert.Equal(-10.0f, df.Columns["Float"].Min());
            Assert.Equal((uint)0, df.Columns["Uint"].Product());
            Assert.Equal((ushort)130, df.Columns["Ushort"].Sum());
 
            df.Columns["Double"][0] = 100.1;
            Assert.Equal(100.1, df.Columns["Double"][0]);
            DataFrameColumn roundColumn = df.Columns["Double"].Round();
            Assert.Equal(100.0, roundColumn[0]);
            Assert.Equal(100.1, df.Columns["Double"][0]);
            df.Columns["Double"].Round(true);
            Assert.Equal(100.0, df.Columns["Double"][0]);
 
            // Test that none of the numeric column types throw
            for (int i = 0; i < df.Columns.Count; i++)
            {
                DataFrameColumn column = df.Columns[i];
                if (column.DataType == typeof(bool))
                {
                    Assert.Throws<NotSupportedException>(() => column.CumulativeMax());
                    Assert.Throws<NotSupportedException>(() => column.CumulativeMin());
                    Assert.Throws<NotSupportedException>(() => column.CumulativeProduct());
                    Assert.Throws<NotSupportedException>(() => column.CumulativeSum());
                    Assert.Throws<NotSupportedException>(() => column.Max());
                    Assert.Throws<NotSupportedException>(() => column.Min());
                    Assert.Throws<NotSupportedException>(() => column.Product());
                    Assert.Throws<NotSupportedException>(() => column.Sum());
                    continue;
                }
                else if (column.DataType == typeof(string))
                {
                    Assert.Throws<NotImplementedException>(() => column.CumulativeMax());
                    Assert.Throws<NotImplementedException>(() => column.CumulativeMin());
                    Assert.Throws<NotImplementedException>(() => column.CumulativeProduct());
                    Assert.Throws<NotImplementedException>(() => column.CumulativeSum());
                    Assert.Throws<NotImplementedException>(() => column.Max());
                    Assert.Throws<NotImplementedException>(() => column.Min());
                    Assert.Throws<NotImplementedException>(() => column.Product());
                    Assert.Throws<NotImplementedException>(() => column.Sum());
                    continue;
                }
                else if (column.DataType == typeof(DateTime))
                {
                    Assert.Throws<NotSupportedException>(() => column.CumulativeProduct());
                    Assert.Throws<NotSupportedException>(() => column.CumulativeSum());
                    Assert.Throws<NotSupportedException>(() => column.Product());
                    Assert.Throws<NotSupportedException>(() => column.Sum());
                    continue;
                }
                column.CumulativeMax();
                column.CumulativeMin();
                column.CumulativeProduct();
                column.CumulativeSum();
                column.Max();
                column.Min();
                column.Product();
                column.Sum();
            }
        }
 
        [Fact]
        public void TestIntComputations_MaxMin_WithNulls()
        {
            var column = new Int32DataFrameColumn("Int", new int?[]
                {
                    null,
                    2,
                    1,
                    4,
                    3,
                    null
                });
 
            Assert.Equal(1, column.Min());
            Assert.Equal(4, column.Max());
        }
 
        [Fact]
        public void TestIntSum_OnColumnWithNullsOnly()
        {
            var column = new Int32DataFrameColumn("Int", new int?[] { null, null });
            Assert.Null(column.Sum());
        }
 
        [Fact]
        public void TestIntSum_OnEmptyColumn()
        {
            var column = new Int32DataFrameColumn("Int");
            Assert.Null(column.Sum());
        }
 
        [Fact]
        public void TestIntComputations_MaxMin_OnEmptyColumn()
        {
            var column = new Int32DataFrameColumn("Int");
 
            Assert.Null(column.Min());
            Assert.Null(column.Max());
        }
 
        [Fact]
        public void TestDateTimeComputations_MaxMin_OnEmptyColumn()
        {
            var column = new DateTimeDataFrameColumn("DateTime");
 
            Assert.Null(column.Min());
            Assert.Null(column.Max());
        }
 
        [Fact]
        public void TestDateTimeComputations_MaxMin_WithNulls()
        {
            var dateTimeColumn = new DateTimeDataFrameColumn("DateTime", new DateTime?[]
                {
                    null,
                    new DateTime(2022, 1, 1),
                    new DateTime(2020, 1, 1),
                    new DateTime(2023, 1, 1),
                    new DateTime(2021, 1, 1),
                    null
                });
 
            Assert.Equal(new DateTime(2020, 1, 1), dateTimeColumn.Min());
            Assert.Equal(new DateTime(2023, 1, 1), dateTimeColumn.Max());
        }
 
        [Theory]
        [InlineData(5, 10)]
        [InlineData(-15, 10)]
        [InlineData(-5, 10)]
        public void TestComputations_WithNegativeNumbers_MaxMin_Calculated(int startingFrom, int length)
        {
            // Arrange
 
            IEnumerable<int> range = Enumerable.Range(startingFrom, length);
 
            int max = range.Max();
            int min = range.Min();
 
            DataFrame df = MakeDataFrameWithNumericColumns(length, withNulls: false, startingFrom);
 
            var byteColumn = (PrimitiveDataFrameColumn<byte>)df.Columns["Byte"];
            var decimalColumn = (PrimitiveDataFrameColumn<decimal>)df.Columns["Decimal"];
            var doubleColumn = (PrimitiveDataFrameColumn<double>)df.Columns["Double"];
            var floatColumn = (PrimitiveDataFrameColumn<float>)df.Columns["Float"];
            var intColumn = (PrimitiveDataFrameColumn<int>)df.Columns["Int"];
            var longColumn = (PrimitiveDataFrameColumn<long>)df.Columns["Long"];
            var sbyteColumn = (PrimitiveDataFrameColumn<sbyte>)df.Columns["Sbyte"];
            var shortColumn = (PrimitiveDataFrameColumn<short>)df.Columns["Short"];
            var uintColumn = (PrimitiveDataFrameColumn<uint>)df.Columns["Uint"];
            var ulongColumn = (PrimitiveDataFrameColumn<ulong>)df.Columns["Ulong"];
            var ushortColumn = (PrimitiveDataFrameColumn<ushort>)df.Columns["Ushort"];
 
            // Act, Assert
 
            // We need to iterate over all range with conversion to byte due to negative numbers issue
            Assert.Equal((byte)byteColumn.Max(), range.Select(x => (byte)x).Max());
 
            Assert.Equal((decimal)decimalColumn.Max(), (decimal)max);
            Assert.Equal((double)doubleColumn.Max(), (double)max);
            Assert.Equal((float)floatColumn.Max(), (float)max);
            Assert.Equal((int)intColumn.Max(), (int)max);
            Assert.Equal((long)longColumn.Max(), (long)max);
            Assert.Equal((sbyte)sbyteColumn.Max(), (sbyte)max);
            Assert.Equal((short)shortColumn.Max(), (short)max);
 
            // We need to iterate over all range with conversion to uint due to negative numbers issue
            Assert.Equal((uint)uintColumn.Max(), range.Select(x => (uint)x).Max());
 
            // We need to iterate over all range with conversion to ulong due to negative numbers issue
            Assert.Equal((ulong)ulongColumn.Max(), range.Select(x => (ulong)x).Max());
 
            // We need to iterate over all range with conversion to ushort due to negative numbers issue
            Assert.Equal((ushort)ushortColumn.Max(), range.Select(x => (ushort)x).Max());
 
            // We need to iterate over all range with conversion to byte due to negative numbers issue
            Assert.Equal((byte)byteColumn.Min(), range.Select(x => (byte)x).Min());
 
            Assert.Equal((decimal)decimalColumn.Min(), (decimal)min);
            Assert.Equal((double)doubleColumn.Min(), (double)min);
            Assert.Equal((float)floatColumn.Min(), (float)min);
            Assert.Equal((int)intColumn.Min(), (int)min);
            Assert.Equal((long)longColumn.Min(), (long)min);
            Assert.Equal((sbyte)sbyteColumn.Min(), (sbyte)min);
            Assert.Equal((short)shortColumn.Min(), (short)min);
 
            // We need to iterate over all range with conversion to uint due to negative numbers issue
            Assert.Equal((uint)uintColumn.Min(), range.Select(x => (uint)x).Min());
 
            // We need to iterate over all range with conversion to ulong due to negative numbers issue
            Assert.Equal((ulong)ulongColumn.Min(), range.Select(x => (ulong)x).Min());
 
            // We need to iterate over all range with conversion to ushort due to negative numbers issue
            Assert.Equal((ushort)ushortColumn.Min(), range.Select(x => (ushort)x).Min());
        }
 
        [Fact]
        public void Test_Logical_Computation_And()
        {
            var col1 = new BooleanDataFrameColumn("col1", new Boolean[] { true, false, true });
            var col2 = new BooleanDataFrameColumn("col2", new Boolean[] { false, true, true });
            var dfTest = new DataFrame(col1, col2);
            var col3 = dfTest["col1"].And(dfTest["col2"]);
            var col4 = col1.And(col2);
 
            for (int i = 0; i < col1.Length; i++)
            {
                var exprectedValue = col1[i] & col2[i];
                Assert.Equal(exprectedValue, col3[i]);
                Assert.Equal(exprectedValue, col4[i]);
            }
        }
 
        [Fact]
        public void Test_Logical_Computation_Or()
        {
            var col1 = new BooleanDataFrameColumn("col1", new Boolean[] { true, false, true });
            var col2 = new BooleanDataFrameColumn("col2", new Boolean[] { false, true, true });
            var dfTest = new DataFrame(col1, col2);
            var col3 = dfTest["col1"].Or(dfTest["col2"]);
            var col4 = col1.Or(col2);
 
            for (int i = 0; i < col1.Length; i++)
            {
                var exprectedValue = col1[i] | col2[i];
                Assert.Equal(exprectedValue, col3[i]);
                Assert.Equal(exprectedValue, col4[i]);
            }
        }
    }
}