|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.Linq;
using System.Text;
namespace Microsoft.Data.Analysis
{
/// <summary>
/// Options for <see cref="DataFrame.DropNulls(DropNullOptions)"/> that select which rows are dropped.
/// </summary>
public enum DropNullOptions
{
/// <summary>
/// "Any" drops a row if any of the row values are null.
/// </summary>
Any,
/// <summary>
/// "All" drops a row when all of the row values are null.
/// </summary>
All
}
/// <summary>
/// A DataFrame to support indexing, binary operations, sorting, selection and other APIs. This will eventually also expose an IDataView for ML.NET
/// </summary>
public partial class DataFrame
{
// Row limit used by the parameterless ToString() preview.
internal const int DefaultMaxRowsToShowInPreview = 25;
// Backing field for the Columns property; created in the constructors with OnColumnsChanged as its second argument.
private readonly DataFrameColumnCollection _columnCollection;
// Backing field for the Rows property; created in the constructors over this DataFrame.
private readonly DataFrameRowCollection _rowCollection;
/// <summary>
/// Initializes a <see cref="DataFrame"/> from a sequence of <paramref name="columns"/>.
/// </summary>
/// <param name="columns">The columns of this <see cref="DataFrame"/>.</param>
public DataFrame(IEnumerable<DataFrameColumn> columns)
{
    _columnCollection = new DataFrameColumnCollection(columns, OnColumnsChanged);
    _rowCollection = new DataFrameRowCollection(this);
}

/// <summary>
/// Initializes a <see cref="DataFrame"/> from the <paramref name="columns"/> passed as arguments.
/// </summary>
/// <param name="columns">The columns of this <see cref="DataFrame"/>; may be empty.</param>
public DataFrame(params DataFrameColumn[] columns)
    : this((IEnumerable<DataFrameColumn>)columns)
{
}
/// <summary>
/// Gets the <see cref="DataFrameColumnCollection"/> that holds the columns of this <see cref="DataFrame"/>.
/// </summary>
public DataFrameColumnCollection Columns
{
    get { return _columnCollection; }
}

/// <summary>
/// Gets the <see cref="DataFrameRowCollection"/> that exposes a view of the rows in this <see cref="DataFrame"/>.
/// </summary>
public DataFrameRowCollection Rows
{
    get { return _rowCollection; }
}

// Forwards to the column collection's own list of column names.
internal IReadOnlyList<string> GetColumnNames()
{
    return _columnCollection.GetColumnNames();
}
#region Operators
/// <summary>
/// Gets or sets a single cell value.
/// </summary>
/// <param name="rowIndex">Zero based row index</param>
/// <param name="columnIndex">Zero based column index</param>
/// <returns>The value stored at the intersection of <paramref name="rowIndex"/> and <paramref name="columnIndex"/></returns>
public object this[long rowIndex, int columnIndex]
{
    get
    {
        DataFrameColumn column = _columnCollection[columnIndex];
        return column[rowIndex];
    }
    set
    {
        DataFrameColumn column = _columnCollection[columnIndex];
        column[rowIndex] = value;
    }
}
/// <summary>
/// Builds a new DataFrame by cloning every column with the boolean values in <paramref name="filter"/> as the row selector.
/// </summary>
/// <param name="filter">A column of booleans</param>
public DataFrame Filter(PrimitiveDataFrameColumn<bool> filter)
{
    return Clone(filter);
}

/// <summary>
/// Builds a new DataFrame by cloning every column with the row indices in <paramref name="rowIndices"/> as the row selector.
/// </summary>
/// <param name="rowIndices">A column of row indices</param>
public DataFrame Filter(PrimitiveDataFrameColumn<int> rowIndices)
{
    return Clone(rowIndices);
}

/// <summary>
/// Builds a new DataFrame by cloning every column with the row indices in <paramref name="rowIndices"/> as the row selector.
/// </summary>
/// <param name="rowIndices">A column of row indices</param>
public DataFrame Filter(PrimitiveDataFrameColumn<long> rowIndices)
{
    return Clone(rowIndices);
}
/// <summary>
/// Indexer form of <see cref="Filter(PrimitiveDataFrameColumn{bool})"/>.
/// </summary>
/// <param name="rowFilter">A column of booleans</param>
public DataFrame this[PrimitiveDataFrameColumn<bool> rowFilter]
{
    get { return Filter(rowFilter); }
}

/// <summary>
/// Indexer form of <see cref="Filter(PrimitiveDataFrameColumn{int})"/>.
/// </summary>
/// <param name="rowIndices">A column of row indices</param>
public DataFrame this[PrimitiveDataFrameColumn<int> rowIndices]
{
    get { return Filter(rowIndices); }
}

/// <summary>
/// Indexer form of <see cref="Filter(PrimitiveDataFrameColumn{long})"/>.
/// </summary>
/// <param name="rowIndices">A column of row indices</param>
public DataFrame this[PrimitiveDataFrameColumn<long> rowIndices]
{
    get { return Filter(rowIndices); }
}
/// <summary>
/// Wraps <paramref name="rowIndices"/> in a temporary column named "Filter" and returns a new DataFrame cloned with it.
/// </summary>
/// <param name="rowIndices">Row indices to select</param>
public DataFrame this[IEnumerable<int> rowIndices]
{
    get { return Clone(new PrimitiveDataFrameColumn<int>("Filter", rowIndices)); }
}

/// <summary>
/// Wraps <paramref name="rowIndices"/> in a temporary column named "Filter" and returns a new DataFrame cloned with it.
/// </summary>
/// <param name="rowIndices">Row indices to select</param>
public DataFrame this[IEnumerable<long> rowIndices]
{
    get { return Clone(new PrimitiveDataFrameColumn<long>("Filter", rowIndices)); }
}

/// <summary>
/// Wraps <paramref name="rowFilter"/> in a temporary column named "Filter" and returns a new DataFrame cloned with it.
/// </summary>
/// <param name="rowFilter">Boolean values used as the row selector</param>
public DataFrame this[IEnumerable<bool> rowFilter]
{
    get { return Clone(new PrimitiveDataFrameColumn<bool>("Filter", rowFilter)); }
}
/// <summary>
/// Gets or sets a column by its <see cref="DataFrameColumn.Name"/>.
/// </summary>
/// <param name="columnName">The name of a <see cref="DataFrameColumn"/></param>
/// <returns>A <see cref="DataFrameColumn"/> if it exists.</returns>
/// <exception cref="ArgumentException">Throws if <paramref name="columnName"/> is not present in this <see cref="DataFrame"/></exception>
public DataFrameColumn this[string columnName]
{
    get { return _columnCollection[columnName]; }
    set { _columnCollection[columnName] = value; }
}
/// <summary>
/// Returns the first <paramref name="numberOfRows"/> rows, or every row when the
/// <see cref="DataFrame"/> has fewer than <paramref name="numberOfRows"/> rows.
/// </summary>
/// <param name="numberOfRows">Maximum number of leading rows to return</param>
public DataFrame Head(int numberOfRows)
{
    // Clamp to the row count so the index filter never refers to rows that do not exist.
    // The result always fits in an int because it is never larger than numberOfRows.
    int rowsToTake = (int)Math.Min(numberOfRows, Rows.Count);
    return Clone(new PrimitiveDataFrameColumn<int>("Filter", Enumerable.Range(0, rowsToTake)));
}
/// <summary>
/// Returns the last <paramref name="numberOfRows"/> rows, or every row when the
/// <see cref="DataFrame"/> has fewer than <paramref name="numberOfRows"/> rows.
/// </summary>
/// <param name="numberOfRows">Maximum number of trailing rows to return</param>
public DataFrame Tail(int numberOfRows)
{
    long totalRows = Rows.Count;
    // Clamp to the row count; otherwise the first selected index would be negative.
    long rowsToTake = Math.Min(numberOfRows, totalRows);
    long firstRow = totalRows - rowsToTake;
    PrimitiveDataFrameColumn<long> filter = new PrimitiveDataFrameColumn<long>("Filter", rowsToTake);
    for (long offset = 0; offset < rowsToTake; offset++)
    {
        filter[offset] = firstRow + offset;
    }
    return Clone(filter);
}
// TODO: Add strongly typed versions of these APIs
#endregion
/// <summary>
/// Creates a copy of this <see cref="DataFrame"/> by cloning every column.
/// </summary>
public DataFrame Clone() => Clone(mapIndices: null);

// Clones each column, forwarding mapIndices to DataFrameColumn.Clone, and wraps the clones in a new DataFrame.
private DataFrame Clone(DataFrameColumn mapIndices = null)
{
    var copies = new List<DataFrameColumn>(Columns.Count);
    foreach (DataFrameColumn source in Columns)
    {
        copies.Add(source.Clone(mapIndices));
    }
    return new DataFrame(copies);
}
/// <summary>
/// Builds a summary DataFrame: a leading "Info" label column followed by the result of
/// <c>Info()</c> for each column. Returns an empty DataFrame when there are no columns.
/// </summary>
public DataFrame Info()
{
    DataFrame summary = new DataFrame();
    for (int i = 0; i < Columns.Count; i++)
    {
        if (i == 0)
        {
            // The label column is only added when there is at least one column to describe.
            StringDataFrameColumn labels = new StringDataFrameColumn("Info", 2);
            labels[0] = Strings.DataType;
            labels[1] = Strings.DescriptionMethodLength;
            summary.Columns.Add(labels);
        }
        summary.Columns.Add(Columns[i].Info());
    }
    return summary;
}
/// <summary>
/// Builds a DataFrame of descriptive statistics: a leading "Description" label column followed by the
/// result of <c>Description()</c> for every column whose <c>HasDescription()</c> returns true.
/// </summary>
public DataFrame Description()
{
    DataFrame summary = new DataFrame();
    bool labelsAdded = false;
    foreach (DataFrameColumn column in Columns)
    {
        if (column.HasDescription())
        {
            if (!labelsAdded)
            {
                // Row labels are emitted once, just before the first described column.
                labelsAdded = true;
                StringDataFrameColumn labels = new StringDataFrameColumn("Description", 0);
                labels.Append(Strings.DescriptionMethodLength);
                labels.Append("Max");
                labels.Append("Min");
                labels.Append("Mean");
                summary.Columns.Add(labels);
            }
            summary.Columns.Add(column.Description());
        }
    }
    return summary;
}
/// <summary>
/// Returns a new data frame ordered by the values of one column.
/// </summary>
/// <param name="columnName">The column name to order by.</param>
/// <param name="ascending">Sorting order.</param>
/// <param name="putNullValuesLast">If true, null values are always put at the end.</param>
public DataFrame OrderBy(string columnName, bool ascending = true, bool putNullValuesLast = true)
    => Sort(columnName, ascending, putNullValuesLast);

/// <summary>
/// Returns a new data frame ordered by the values of one column, in descending order.
/// </summary>
/// <param name="columnName">The column name to order by.</param>
/// <param name="putNullValuesLast">If true, null values are always put at the end.</param>
public DataFrame OrderByDescending(string columnName, bool putNullValuesLast = true)
    => Sort(columnName, ascending: false, putNullValuesLast: putNullValuesLast);
/// <summary>
/// Clamps the values of every numeric column to the range given by <paramref name="min"/> and <paramref name="max"/>.
/// Non-numeric columns are left untouched.
/// </summary>
/// <typeparam name="U">Type of the threshold values</typeparam>
/// <param name="min">Minimum value. All values below this threshold will be set to it</param>
/// <param name="max">Maximum value. All values above this threshold will be set to it</param>
/// <param name="inPlace">Indicates if the operation should be performed in place</param>
/// <returns>This <see cref="DataFrame"/> when <paramref name="inPlace"/> is set, otherwise a clamped clone.</returns>
public DataFrame Clamp<U>(U min, U max, bool inPlace = false)
{
    DataFrame target = inPlace ? this : Clone();
    foreach (DataFrameColumn column in target.Columns)
    {
        if (!column.IsNumericColumn())
        {
            continue;
        }
        // target is already either this frame or a private clone, so each column is clamped in place.
        column.Clamp(min, max, inPlace: true);
    }
    return target;
}
/// <summary>
/// Prepends <paramref name="prefix"/> to the name of every column.
/// </summary>
/// <param name="prefix">Text placed in front of each column name</param>
/// <param name="inPlace">If set, renames the columns of this DataFrame; otherwise a clone is renamed and returned</param>
public DataFrame AddPrefix(string prefix, bool inPlace = false)
{
    DataFrame renamed = inPlace ? this : Clone();
    for (int index = 0; index < renamed.Columns.Count; index++)
    {
        DataFrameColumn current = renamed.Columns[index];
        string newName = prefix + current.Name;
        current.SetName(newName);
        // Discard cached data after each rename.
        renamed.OnColumnsChanged();
    }
    return renamed;
}
/// <summary>
/// Appends <paramref name="suffix"/> to the name of every column.
/// </summary>
/// <param name="suffix">Text placed after each column name</param>
/// <param name="inPlace">If set, renames the columns of this DataFrame; otherwise a clone is renamed and returned</param>
public DataFrame AddSuffix(string suffix, bool inPlace = false)
{
    DataFrame renamed = inPlace ? this : Clone();
    for (int index = 0; index < renamed.Columns.Count; index++)
    {
        DataFrameColumn current = renamed.Columns[index];
        string newName = current.Name + suffix;
        current.SetName(newName);
        // Discard cached data after each rename.
        renamed.OnColumnsChanged();
    }
    return renamed;
}
/// <summary>
/// Returns <paramref name="numberOfRows"/> randomly chosen rows, picked by a partial shuffle of the row indices.
/// </summary>
/// <param name="numberOfRows">Number of rows in the returned DataFrame</param>
/// <exception cref="ArgumentException">Thrown when <paramref name="numberOfRows"/> exceeds the number of rows.</exception>
public DataFrame Sample(int numberOfRows)
{
    if (numberOfRows > Rows.Count)
    {
        throw new ArgumentException(string.Format(Strings.ExceedsNumberOfRows, Rows.Count), nameof(numberOfRows));
    }

    int poolSize = (int)Math.Min(Int32.MaxValue, Rows.Count);
    int[] pool = Enumerable.Range(0, poolSize).ToArray();
    Random rng = new Random();

    // Each pass swaps a random element of the not-yet-picked tail into the next slot of the picked prefix.
    int picked = 0;
    for (; picked < numberOfRows; picked++)
    {
        int swapWith = rng.Next(picked, poolSize);
        int held = pool[picked];
        pool[picked] = pool[swapWith];
        pool[swapWith] = held;
    }

    ArraySegment<int> chosen = new ArraySegment<int>(pool, 0, picked);
    PrimitiveDataFrameColumn<int> indices = new PrimitiveDataFrameColumn<int>("indices", chosen);
    return Clone(indices);
}
/// <summary>
/// Groups the rows of the <see cref="DataFrame"/> by the unique values of the <paramref name="columnName"/> column.
/// </summary>
/// <param name="columnName">The column used to group unique values</param>
/// <returns>A GroupBy object that stores the group information.</returns>
/// <exception cref="ArgumentException">Thrown when no column is named <paramref name="columnName"/>.</exception>
public GroupBy GroupBy(string columnName)
{
    int columnIndex = _columnCollection.IndexOf(columnName);
    if (columnIndex != -1)
    {
        return _columnCollection[columnIndex].GroupBy(columnIndex, this);
    }
    throw new ArgumentException(String.Format(Strings.InvalidColumnName, columnName), nameof(columnName));
}
/// <summary>
/// Groups the rows of the <see cref="DataFrame"/> by the unique values of the <paramref name="columnName"/> column
/// and returns the grouping typed by <typeparamref name="TKey"/>.
/// </summary>
/// <typeparam name="TKey">Type of column used for grouping</typeparam>
/// <param name="columnName">The column used to group unique values</param>
/// <returns>A GroupBy object that stores the group information.</returns>
/// <exception cref="InvalidCastException">Thrown when the grouping is not a <see cref="GroupBy{TKey}"/>.</exception>
public GroupBy<TKey> GroupBy<TKey>(string columnName)
{
    if (GroupBy(columnName) is GroupBy<TKey> typedGroup)
    {
        return typedGroup;
    }

    DataFrameColumn column = this[columnName];
    throw new InvalidCastException(String.Format(Strings.BadColumnCastDuringGrouping, columnName, column.DataType, typeof(TKey)));
}
// Columns get resized in GroupBy and ReadCsv calls, after which RowCount must be set to the true length of the DataFrame.
// Throws ArgumentException when any column's Length differs from rowCount.
internal void SetTableRowCount(long rowCount)
{
    // Validate every column even when the stored RowCount already equals rowCount.
    foreach (DataFrameColumn column in Columns)
    {
        if (column.Length == rowCount)
        {
            continue;
        }
        throw new ArgumentException(String.Format("{0} {1}", Strings.MismatchedRowCount, column.Name));
    }
    _columnCollection.RowCount = rowCount;
}
/// <summary>
/// Returns a new DataFrame without the rows selected by <paramref name="options"/>:
/// rows with at least one null for <see cref="DropNullOptions.Any"/>, otherwise rows whose values are all null.
/// </summary>
/// <param name="options">Chooses which rows are dropped</param>
public DataFrame DropNulls(DropNullOptions options = DropNullOptions.Any)
{
    bool dropOnAnyNull = options == DropNullOptions.Any;
    var keepRow = new BooleanDataFrameColumn("Filter");

    // Any: start with every row kept and AND in each column's validity.
    // Otherwise: start with every row dropped and OR in each column's validity.
    keepRow.AppendMany(dropOnAnyNull, Rows.Count);
    var buffers = keepRow.ColumnContainer.Buffers;
    foreach (var column in Columns)
    {
        long rowIndex = 0;
        for (int b = 0; b < buffers.Count; b++)
        {
            var span = buffers.GetOrCreateMutable(b).Span;
            for (int i = 0; i < span.Length; i++, rowIndex++)
            {
                span[i] = dropOnAnyNull
                    ? span[i] && column.IsValid(rowIndex)
                    : span[i] || column.IsValid(rowIndex);
            }
        }
    }
    return this[keepRow];
}
/// <summary>
/// Replaces the <see langword="null" /> values of every column with <paramref name="value"/>.
/// </summary>
/// <param name="value">The value to replace <see langword="null" /> with.</param>
/// <param name="inPlace">A boolean flag to indicate if the operation should be in place</param>
/// <returns>A new <see cref="DataFrame"/> if <paramref name="inPlace"/> is not set. Returns this <see cref="DataFrame"/> otherwise.</returns>
public DataFrame FillNulls(object value, bool inPlace = false)
{
    DataFrame target = inPlace ? this : Clone();
    foreach (DataFrameColumn column in target.Columns)
    {
        // target is already either this frame or a private clone, so each column is filled in place.
        column.FillNulls(value, inPlace: true);
    }
    return target;
}
/// <summary>
/// Fills <see langword="null" /> values in each column with values from <paramref name="values"/>.
/// </summary>
/// <param name="values">The values to replace <see langword="null" /> with, one value per column. Should be equal to the number of columns in this <see cref="DataFrame"/>. </param>
/// <param name="inPlace">A boolean flag to indicate if the operation should be in place</param>
/// <returns>A new <see cref="DataFrame"/> if <paramref name="inPlace"/> is not set. Returns this <see cref="DataFrame"/> otherwise.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="values"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when <paramref name="values"/> does not contain exactly one value per column.</exception>
public DataFrame FillNulls(IList<object> values, bool inPlace = false)
{
    if (values == null)
    {
        throw new ArgumentNullException(nameof(values));
    }
    if (values.Count != Columns.Count)
    {
        throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(values));
    }

    DataFrame ret = inPlace ? this : Clone();
    for (int i = 0; i < ret.Columns.Count; i++)
    {
        // Fill the columns of ret, not of this frame: when inPlace is false the original must stay
        // untouched and the returned clone must carry the filled values.
        ret.Columns[i].FillNulls(values[i], inPlace: true);
    }
    return ret;
}
// Grows column by one element and stores value in the newly added last slot.
private void ResizeByOneAndAppend(DataFrameColumn column, object value)
{
    long newIndex = column.Length;
    column.Resize(newIndex + 1);
    column[newIndex] = value;
}
/// <summary>
/// Appends rows to the DataFrame
/// </summary>
/// <remarks>If an input column's value doesn't match a DataFrameColumn's data type, a conversion will be attempted</remarks>
/// <remarks>If a <seealso cref="DataFrameRow"/> in <paramref name="rows"/> is null, a null value is appended to each column</remarks>
/// <remarks> Values are appended based on the column names</remarks>
/// <param name="rows">The rows to be appended to this DataFrame </param>
/// <param name="inPlace">If set, appends <paramref name="rows"/> in place. Otherwise, a new DataFrame is returned with the <paramref name="rows"/> appended</param>
/// <param name="cultureInfo">culture info for formatting values</param>
public DataFrame Append(IEnumerable<DataFrameRow> rows, bool inPlace = false, CultureInfo cultureInfo = null)
{
    DataFrame ret = inPlace ? this : Clone();
    foreach (DataFrameRow row in rows)
    {
        if (row == null)
        {
            // Honor the documented contract for null rows instead of dereferencing them:
            // the IEnumerable<object> overload appends a null to every column when given null.
            ret.Append((IEnumerable<object>)null, inPlace: true, cultureInfo: cultureInfo);
            continue;
        }
        ret.Append(row.GetValues(), inPlace: true, cultureInfo: cultureInfo);
    }
    return ret;
}
/// <summary>
/// Appends a row to the DataFrame
/// </summary>
/// <remarks>If a column's value doesn't match its column's data type, a conversion will be attempted</remarks>
/// <remarks>If <paramref name="row"/> is null, a null value is appended to each column</remarks>
/// <remarks>If <paramref name="row"/> has fewer values than there are columns, the remaining columns receive null</remarks>
/// <param name="row">The values of the new row, in column order</param>
/// <param name="inPlace">If set, appends a <paramref name="row"/> in place. Otherwise, a new DataFrame is returned with an appended <paramref name="row"/> </param>
/// <param name="cultureInfo">Culture info for formatting values. Defaults to <see cref="CultureInfo.CurrentCulture"/></param>
/// <exception cref="ArgumentException">Thrown when <paramref name="row"/> has more values than there are columns, or a value converts to null.</exception>
public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false, CultureInfo cultureInfo = null)
{
    if (cultureInfo == null)
    {
        cultureInfo = CultureInfo.CurrentCulture;
    }
    DataFrame ret = inPlace ? this : Clone();
    int columnCount = ret.Columns.Count;

    // Convert every value before touching any column, so a failed conversion cannot leave the
    // columns with different lengths. The row is enumerated exactly once (foreach also disposes
    // the enumerator), so lazy or one-shot sequences stay aligned with the converted values.
    List<object> convertedValues = new List<object>(columnCount);
    if (row != null)
    {
        foreach (object rawValue in row)
        {
            if (convertedValues.Count == columnCount)
            {
                throw new ArgumentException(string.Format(Strings.ExceedsNumberOfColumns, Columns.Count), nameof(row));
            }

            DataFrameColumn column = ret.Columns[convertedValues.Count];
            object value = rawValue;
            // StringDataFrameColumn can accept empty strings. The other columns interpret empty values as nulls
            if (value is string stringValue)
            {
                if (stringValue.Length == 0 && column.DataType != typeof(string))
                {
                    value = null;
                }
                else if (stringValue.Equals("null", StringComparison.OrdinalIgnoreCase))
                {
                    value = null;
                }
            }
            if (value != null)
            {
                value = Convert.ChangeType(value, column.DataType, cultureInfo);
                if (value is null)
                {
                    throw new ArgumentException(string.Format(Strings.MismatchedValueType, column.DataType), column.Name);
                }
            }
            convertedValues.Add(value);
        }
    }

    for (int i = 0; i < columnCount; i++)
    {
        // Columns beyond the supplied values are filled with null
        object value = i < convertedValues.Count ? convertedValues[i] : null;
        ret.ResizeByOneAndAppend(ret.Columns[i], value);
    }
    ret.Columns.RowCount++;
    return ret;
}
/// <summary>
/// Appends a row by enumerating column names and values from <paramref name="row"/>
/// </summary>
/// <remarks>If a column's value doesn't match its column's data type, a conversion will be attempted</remarks>
/// <remarks>Columns that are not named in <paramref name="row"/> receive a null value</remarks>
/// <param name="row">An enumeration of column name and value to be appended</param>
/// <param name="inPlace">If set, appends <paramref name="row"/> in place. Otherwise, a new DataFrame is returned with an appended <paramref name="row"/> </param>
/// <param name="cultureInfo">Culture info for formatting values. Defaults to <see cref="CultureInfo.CurrentCulture"/></param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="row"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when <paramref name="row"/> names a column that does not exist, or a value converts to null.</exception>
public DataFrame Append(IEnumerable<KeyValuePair<string, object>> row, bool inPlace = false, CultureInfo cultureInfo = null)
{
    // Validate before cloning so a null argument does not pay for a full copy.
    if (row == null)
    {
        throw new ArgumentNullException(nameof(row));
    }
    if (cultureInfo == null)
    {
        cultureInfo = CultureInfo.CurrentCulture;
    }

    DataFrame ret = inPlace ? this : Clone();

    // Resolve and convert everything before touching any column, so a failure cannot leave the
    // columns with different lengths. The row is enumerated exactly once; the target column is
    // cached next to its converted value so lazy or one-shot sequences stay aligned.
    List<KeyValuePair<DataFrameColumn, object>> pendingValues = new List<KeyValuePair<DataFrameColumn, object>>();
    foreach (KeyValuePair<string, object> columnAndValue in row)
    {
        string columnName = columnAndValue.Key;
        int index = ret.Columns.IndexOf(columnName);
        if (index == -1)
        {
            throw new ArgumentException(String.Format(Strings.InvalidColumnName, columnName), nameof(row));
        }

        DataFrameColumn column = ret.Columns[index];
        object value = columnAndValue.Value;
        if (value != null)
        {
            value = Convert.ChangeType(value, column.DataType, cultureInfo);
            if (value is null)
            {
                throw new ArgumentException(string.Format(Strings.MismatchedValueType, column.DataType), column.Name);
            }
        }
        pendingValues.Add(new KeyValuePair<DataFrameColumn, object>(column, value));
    }

    // Row count of the frame being modified, captured before any column grows.
    long originalRowCount = ret.Rows.Count;
    foreach (KeyValuePair<DataFrameColumn, object> pending in pendingValues)
    {
        ret.ResizeByOneAndAppend(pending.Key, pending.Value);
    }

    foreach (DataFrameColumn column in ret.Columns)
    {
        // Columns that were not named in row still have their old length; pad them with null
        if (column.Length == originalRowCount)
        {
            ret.ResizeByOneAndAppend(column, null);
        }
    }
    ret.Columns.RowCount++;
    return ret;
}
/// <summary>
/// Clears the cached <c>_schema</c> so it is not reused after a column has changed.
/// </summary>
private void OnColumnsChanged() => _schema = null;
// Builds a new DataFrame whose columns are all cloned with the sort indices of the columnName column.
private DataFrame Sort(string columnName, bool ascending, bool putNullValuesLast)
{
    PrimitiveDataFrameColumn<long> order = Columns[columnName].GetSortIndices(ascending, putNullValuesLast);
    var sorted = new List<DataFrameColumn>(Columns.Count);
    foreach (DataFrameColumn source in Columns)
    {
        DataFrameColumn reordered = source.Clone(order);
        // Reordering must not add or lose nulls.
        Debug.Assert(reordered.NullCount == source.NullCount);
        sorted.Add(reordered);
    }
    return new DataFrame(sorted);
}
/// <summary>
/// Renders a preview of this <see cref="DataFrame"/> limited to <see cref="DefaultMaxRowsToShowInPreview"/> rows.
/// </summary>
/// <returns>A preview of the contents of this <see cref="DataFrame"/>.</returns>
public override string ToString()
{
    return ToString(DefaultMaxRowsToShowInPreview);
}
/// <summary>
/// A preview of the contents of this <see cref="DataFrame"/> as a string: a header line with the
/// column names followed by up to <paramref name="rowsToShow"/> rows, each cell padded to a common width.
/// </summary>
/// <param name="rowsToShow">Max amount of rows to show in preview.</param>
/// <returns>The formatted preview. A trailing line reports the row counts when not all rows are shown.</returns>
public string ToString(long rowsToShow)
{
    StringBuilder sb = new StringBuilder();

    int longestColumnName = 0;
    for (int i = 0; i < Columns.Count; i++)
    {
        longestColumnName = Math.Max(longestColumnName, Columns[i].Name.Length);
    }

    // Left align by 10 or more (in case of longer column names)
    int padding = Math.Max(10, longestColumnName + 1);
    for (int i = 0; i < Columns.Count; i++)
    {
        // Append the name verbatim. Routing it through string.Format would treat it as a format
        // string and throw FormatException for names containing '{' or '}'.
        sb.Append(Columns[i].Name.PadRight(padding));
    }
    sb.AppendLine();

    long numberOfRows = Math.Min(Rows.Count, rowsToShow);
    for (long i = 0; i < numberOfRows; i++)
    {
        foreach (object obj in Rows[i])
        {
            sb.Append((obj ?? "null").ToString().PadRight(padding));
        }
        sb.AppendLine();
    }

    if (numberOfRows < Rows.Count)
    {
        sb.Append(String.Format(Strings.AmountOfRowsShown, rowsToShow, Rows.Count));
        sb.AppendLine();
    }
    return sb.ToString();
}
}
}
|