// File: DataViewSchema.cs
// Web Access
// Project: src\src\Microsoft.ML.DataView\Microsoft.ML.DataView.csproj (Microsoft.ML.DataView)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using Microsoft.ML.Data;
 
namespace Microsoft.ML
{
    /// <summary>
    /// Represents the schema of an <see cref="IDataView"/> or a <see cref="DataViewRow"/>.
    /// The schema is a collection of <see cref="DataViewSchema.Column"/>.
    /// </summary>
    [DebuggerTypeProxy(typeof(SchemaDebuggerProxy))]
    public sealed class DataViewSchema : IReadOnlyList<DataViewSchema.Column>
    {
        // The columns of this schema. The constructor asserts (debug builds only) that each
        // column's Index equals its position in this array.
        private readonly Column[] _columns;
        // Maps a column name to a position in _columns. The constructor fills it in ascending
        // index order with overwrite, so a repeated name ends up mapped to its largest index.
        private readonly Dictionary<string, int> _nameMap;
 
        /// <summary>
        /// Gets the total number of columns in this schema, hidden columns included.
        /// </summary>
        public int Count
        {
            get { return _columns.Length; }
        }
 
        /// <summary>
        /// Looks up a column by its name.
        /// When several columns share a name, the one with the largest index is returned; the others are
        /// considered 'hidden' and can only be reached through their index.
        /// </summary>
        /// <param name="name">The name of the column to find.</param>
        /// <exception cref="ArgumentNullException"><paramref name="name"/> is <c>null</c> or empty.</exception>
        /// <exception cref="ArgumentOutOfRangeException">No column named <paramref name="name"/> exists.</exception>
        public Column this[string name]
        {
            get
            {
                if (string.IsNullOrEmpty(name))
                {
                    throw new ArgumentNullException(nameof(name));
                }

                int position;
                if (_nameMap.TryGetValue(name, out position))
                {
                    return _columns[position];
                }

                throw new ArgumentOutOfRangeException(nameof(name), $"Column '{name}' not found");
            }
        }
 
        /// <summary>
        /// Looks up a column by its zero-based position in the schema.
        /// </summary>
        /// <param name="columnIndex">The position of the column; must be in the range [0, <see cref="Count"/>).</param>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="columnIndex"/> is negative or not less than <see cref="Count"/>.</exception>
        public Column this[int columnIndex]
        {
            get
            {
                if (columnIndex < 0 || columnIndex >= _columns.Length)
                {
                    throw new ArgumentOutOfRangeException(nameof(columnIndex));
                }

                return _columns[columnIndex];
            }
        }
 
        /// <summary>
        /// Looks up a column by its name, returning <c>null</c> instead of throwing when no such column exists.
        /// </summary>
        /// <param name="name">The name of the column to find.</param>
        /// <returns>The column mapped to <paramref name="name"/>, or <c>null</c> if there is none.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="name"/> is <c>null</c> or empty.</exception>
        public Column? GetColumnOrNull(string name)
        {
            if (string.IsNullOrEmpty(name))
            {
                throw new ArgumentNullException(nameof(name));
            }

            int position;
            return _nameMap.TryGetValue(name, out position) ? _columns[position] : (Column?)null;
        }
 
        /// <summary>
        /// Returns an enumerator over all columns of the schema, in array (index) order.
        /// </summary>
        public IEnumerator<Column> GetEnumerator()
        {
            // Go through the generic interface so the typed array enumerator is returned.
            IEnumerable<Column> typedColumns = _columns;
            return typedColumns.GetEnumerator();
        }
 
        /// <summary>
        /// Non-generic enumeration; forwards to the generic <see cref="GetEnumerator"/>.
        /// </summary>
        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }
 
        /// <summary>
        /// Returns a short description of the schema: its column count followed by " columns".
        /// </summary>
        public override string ToString()
        {
            return $"{Count} columns";
        }
 
        /// <summary>
        /// This struct describes one column in the particular schema.
        /// </summary>
        public struct Column
        {
            /// <summary>
            /// Gets the column's name.
            /// </summary>
            public string Name { get; }

            /// <summary>
            /// Gets the position of the column within its schema.
            /// </summary>
            public int Index { get; }

            /// <summary>
            /// Gets whether the column is hidden, that is, reachable only by index and not by name.
            /// </summary>
            public bool IsHidden { get; }

            /// <summary>
            /// Gets the data type of the column.
            /// </summary>
            public DataViewType Type { get; }

            /// <summary>
            /// Gets the annotations attached to the column.
            /// </summary>
            public Annotations Annotations { get; }

            /// <summary>
            /// Creates a column description. A <c>null</c> <paramref name="annotations"/> is replaced
            /// by <see cref="DataViewSchema.Annotations.Empty"/>.
            /// </summary>
            /// <exception cref="ArgumentNullException"><paramref name="name"/> is <c>null</c> or empty, or <paramref name="type"/> is <c>null</c>.</exception>
            /// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is negative.</exception>
            internal Column(string name, int index, bool isHidden, DataViewType type, Annotations annotations)
            {
                // Guards run in the order name, index, type so the first failing argument is the one reported.
                if (string.IsNullOrEmpty(name))
                {
                    throw new ArgumentNullException(nameof(name));
                }

                if (index < 0)
                {
                    throw new ArgumentOutOfRangeException(nameof(index));
                }

                Type = type ?? throw new ArgumentNullException(nameof(type));
                Annotations = annotations ?? Annotations.Empty;
                Name = name;
                Index = index;
                IsHidden = isHidden;
            }

            /// <summary>
            /// Formats the column as "Name: Type", followed by the annotation names in braces when there are any.
            /// </summary>
            public override string ToString()
            {
                // Annotations is null only for a default-initialized Column.
                if (Annotations == null || Annotations.Schema.Count == 0)
                {
                    return $"{Name}: {Type}";
                }

                var annotationNames = string.Join(", ", Annotations.Schema.Select(x => x.Name));
                return $"{Name}: {Type} {{{annotationNames}}}";
            }
        }
 
        /// <summary>
        /// This struct represents the schema of one column of a data view, without an attachment to a particular <see cref="DataViewSchema"/>.
        /// </summary>
        public struct DetachedColumn
        {
            /// <summary>
            /// Gets the column's name.
            /// </summary>
            public string Name { get; }

            /// <summary>
            /// Gets the data type of the column.
            /// </summary>
            public DataViewType Type { get; }

            /// <summary>
            /// Gets the annotations attached to the column.
            /// </summary>
            public Annotations Annotations { get; }

            /// <summary>
            /// Creates an instance of a <see cref="DetachedColumn"/> from its parts. A <c>null</c>
            /// <paramref name="annotations"/> is replaced by <see cref="DataViewSchema.Annotations.Empty"/>.
            /// </summary>
            /// <param name="name">The column name.</param>
            /// <param name="type">The column type.</param>
            /// <param name="annotations">The column annotations, or <c>null</c> for none.</param>
            /// <exception cref="ArgumentNullException"><paramref name="name"/> is <c>null</c> or empty, or <paramref name="type"/> is <c>null</c>.</exception>
            public DetachedColumn(string name, DataViewType type, Annotations annotations = null)
            {
                if (string.IsNullOrEmpty(name))
                {
                    throw new ArgumentNullException(nameof(name));
                }

                Type = type ?? throw new ArgumentNullException(nameof(type));
                Annotations = annotations ?? Annotations.Empty;
                Name = name;
            }

            /// <summary>
            /// Creates an instance of <see cref="DetachedColumn"/> by copying the name, type and annotations
            /// of an existing schema's column. The values are copied as-is, without validation.
            /// </summary>
            /// <param name="column">The schema column to copy from.</param>
            public DetachedColumn(Column column)
            {
                Annotations = column.Annotations;
                Type = column.Type;
                Name = column.Name;
            }
        }
 
        /// <summary>
        /// The schema annotations of one <see cref="Column"/>.
        /// </summary>
        [DebuggerTypeProxy(typeof(AnnotationsDebuggerProxy))]
        public sealed class Annotations
        {
            /// <summary>
            /// Annotation getter delegates. Useful to construct annotations out of other annotations.
            /// Holds one delegate per column of <see cref="Schema"/>; the constructor asserts (debug builds only)
            /// that the lengths match.
            /// </summary>
            private readonly Delegate[] _getters;

            /// <summary>
            /// The schema of the annotations row. It is different from the schema that the column belongs to.
            /// </summary>
            public DataViewSchema Schema { get; }

            /// <summary>
            /// An annotations instance built from a schema with no columns and an empty getter array.
            /// It is created once, by the static property initializer.
            /// </summary>
            public static Annotations Empty { get; } = new Annotations(new DataViewSchema(new Column[0]), new Delegate[0]);
 
            /// <summary>
            /// Create an annotations row by supplying the schema columns and the getter delegates for all the values.
            /// </summary>
            /// <remarks>
            /// Note: The <paramref name="getters"/> array is owned by this <see cref="Annotations"/> instance.
            /// </remarks>
            /// <param name="schema">The schema of the annotations row. Only checked for <c>null</c> by a debug assertion.</param>
            /// <param name="getters">One getter per column of <paramref name="schema"/>. The array itself and its
            /// length are only checked by debug assertions.</param>
            /// <exception cref="ArgumentNullException">An element of <paramref name="getters"/> is <c>null</c>.</exception>
            internal Annotations(DataViewSchema schema, Delegate[] getters)
            {
                Debug.Assert(schema != null);
                Debug.Assert(getters != null);
                Debug.Assert(schema.Count == getters.Length);

                // Check all getters before anything is stored.
                for (int i = 0; i < schema.Count; i++)
                {
                    var getter = getters[i];
                    if (getter == null)
                    {
                        // Report the actual parameter name ('getters'); 'getter' is only a local here.
                        throw new ArgumentNullException(nameof(getters), $"Delegate at index '{i}' of {nameof(getters)} was null.");
                    }

                    // NOTE(review): presumably MarshalActionInvoke re-dispatches CheckGetter with the column's
                    // raw type in place of the 'int' placeholder - confirm against Utils.MarshalActionInvoke.
                    Utils.MarshalActionInvoke(CheckGetter<int>, schema[i].Type.RawType, getter);
                }

                Schema = schema;
                _getters = getters;
            }
 
            /// <summary>
            /// Verifies that <paramref name="getter"/> is a <see cref="ValueGetter{TValue}"/> and throws otherwise.
            /// </summary>
            /// <typeparam name="TValue">The value type the getter is expected to produce.</typeparam>
            /// <param name="getter">The delegate to check.</param>
            /// <exception cref="ArgumentNullException"><paramref name="getter"/> is not a <see cref="ValueGetter{TValue}"/>.</exception>
            private void CheckGetter<TValue>(Delegate getter)
            {
                if (getter is ValueGetter<TValue>)
                {
                    return;
                }

                // NOTE(review): this reports a type mismatch with ArgumentNullException; the exception type is
                // left untouched here so existing behavior is preserved.
                throw new ArgumentNullException(nameof(getter), $"Getter of type '{typeof(TValue)}' expected, but {getter.GetType()} found");
            }
 
            /// <summary>
            /// Get a getter delegate for one value of the annotations row.
            /// </summary>
            /// <typeparam name="TValue">The type of the annotation value; must match the type of the stored getter.</typeparam>
            /// <param name="column">The annotation column whose getter is requested. Only its <see cref="Column.Index"/> is used for the lookup.</param>
            /// <returns>The stored getter for the column, typed as <see cref="ValueGetter{TValue}"/>.</returns>
            /// <exception cref="ArgumentException">The index of <paramref name="column"/> is beyond the last getter.</exception>
            /// <exception cref="InvalidOperationException">The stored getter is not a <see cref="ValueGetter{TValue}"/>.</exception>
            public ValueGetter<TValue> GetGetter<TValue>(DataViewSchema.Column column)
            {
                if (column.Index >= _getters.Length)
                {
                    // Pass a real message and the parameter name separately; the single-argument
                    // ArgumentException constructor would treat the parameter name as the message.
                    throw new ArgumentException(
                        $"Column index {column.Index} is out of range; the annotations contain {_getters.Length} column(s).",
                        nameof(column));
                }

                var typedGetter = _getters[column.Index] as ValueGetter<TValue>;
                if (typedGetter == null)
                {
                    // The constructor rejects null getters, so a failed cast means TValue is the wrong type.
                    Debug.Assert(_getters[column.Index] != null);
                    throw new InvalidOperationException($"Invalid call to '{nameof(GetGetter)}'");
                }
                return typedGetter;
            }
 
            /// <summary>
            /// Fetches the value of an annotation, identified by its kind (that is, the annotation column name).
            /// </summary>
            /// <typeparam name="TValue">The type of the annotation value.</typeparam>
            /// <param name="kind">The annotation kind (column name) to read.</param>
            /// <param name="value">Receives the annotation value.</param>
            /// <exception cref="InvalidOperationException">No annotation named <paramref name="kind"/> exists.</exception>
            public void GetValue<TValue>(string kind, ref TValue value)
            {
                DataViewSchema.Column? found = Schema.GetColumnOrNull(kind);
                if (!found.HasValue)
                {
                    throw new InvalidOperationException($"Invalid call to '{nameof(GetValue)}'");
                }

                ValueGetter<TValue> getter = GetGetter<TValue>(found.Value);
                getter(ref value);
            }
 
            /// <summary>
            /// Returns the names of the annotation columns, separated by ", ".
            /// </summary>
            public override string ToString()
            {
                var annotationNames = Schema.Select(x => x.Name);
                return string.Join(", ", annotationNames);
            }
 
            /// <summary>
            /// Returns the untyped getter stored at position <paramref name="index"/>.
            /// The range of <paramref name="index"/> is only checked by a debug assertion.
            /// </summary>
            internal Delegate GetGetterInternal(int index)
            {
                Debug.Assert(index >= 0 && index < Schema.Count);
                Delegate storedGetter = _getters[index];
                return storedGetter;
            }
 
            /// <summary>
            /// Class containing operations to build an <see cref="Annotations"/>.
            /// </summary>
            public sealed class Builder
            {
                // The annotation columns added so far, in insertion order.
                private readonly List<(string Name, DataViewType Type, Delegate Getter, Annotations Annotations)> _items =
                    new List<(string Name, DataViewType Type, Delegate Getter, Annotations Annotations)>();

                /// <summary>
                /// Creates an empty annotations builder.
                /// </summary>
                public Builder()
                {
                }
 
                /// <summary>
                /// Copies into this builder every column of <paramref name="annotations"/> whose name is accepted
                /// by <paramref name="selector"/>. Does nothing when <paramref name="annotations"/> is <c>null</c>.
                /// </summary>
                /// <param name="annotations">The annotations row to take values from.</param>
                /// <param name="selector">The predicate describing which annotation columns to keep.</param>
                /// <exception cref="ArgumentNullException"><paramref name="selector"/> is <c>null</c> while
                /// <paramref name="annotations"/> is not.</exception>
                public void Add(Annotations annotations, Func<string, bool> selector)
                {
                    // A null source is tolerated and checked first, so a null selector is not reported in that case.
                    if (annotations == null)
                    {
                        return;
                    }

                    if (selector == null)
                    {
                        throw new ArgumentNullException(nameof(selector));
                    }

                    var sourceSchema = annotations.Schema;
                    for (int i = 0; i < sourceSchema.Count; i++)
                    {
                        var column = sourceSchema[i];
                        if (!selector(column.Name))
                        {
                            continue;
                        }

                        var getter = annotations.GetGetterInternal(column.Index);
                        _items.Add((column.Name, column.Type, getter, column.Annotations));
                    }
                }
 
                /// <summary>
                /// Adds a single annotation column using a strongly-typed getter.
                /// </summary>
                /// <typeparam name="TValue">The type of the value.</typeparam>
                /// <param name="name">The annotation name.</param>
                /// <param name="type">The annotation type.</param>
                /// <param name="getter">The getter delegate.</param>
                /// <param name="annotations">Annotations of the input column. Note that annotations on an annotation column is somewhat rare
                /// except for certain types (for example, slot names for a vector, key values for something of key type).</param>
                /// <exception cref="ArgumentNullException"><paramref name="name"/> is <c>null</c> or empty, or
                /// <paramref name="type"/> or <paramref name="getter"/> is <c>null</c>.</exception>
                /// <exception cref="ArgumentException">The raw type of <paramref name="type"/> is not <typeparamref name="TValue"/>.</exception>
                public void Add<TValue>(string name, DataViewType type, ValueGetter<TValue> getter, Annotations annotations = null)
                {
                    if (string.IsNullOrEmpty(name))
                    {
                        throw new ArgumentNullException(nameof(name));
                    }

                    if (type == null)
                    {
                        throw new ArgumentNullException(nameof(type));
                    }

                    if (getter == null)
                    {
                        throw new ArgumentNullException(nameof(getter));
                    }

                    bool rawTypeMatches = type.RawType == typeof(TValue);
                    if (!rawTypeMatches)
                    {
                        throw new ArgumentException($"{nameof(type)}.{nameof(type.RawType)} must be of type '{typeof(TValue).FullName}'.", nameof(type));
                    }

                    Delegate untypedGetter = getter;
                    _items.Add((name, type, untypedGetter, annotations));
                }
 
                /// <summary>
                /// Adds a single annotation column using a weakly-typed getter.
                /// </summary>
                /// <param name="name">The annotation name.</param>
                /// <param name="type">The annotation type.</param>
                /// <param name="getter">The getter delegate that provides the value. Note that the type of the getter is still checked
                /// inside this method.</param>
                /// <param name="annotations">Annotations of the input column. Note that annotations on an annotation column is somewhat rare
                /// except for certain types (for example, slot names for a vector, key values for something of key type).</param>
                /// <exception cref="ArgumentNullException"><paramref name="name"/> is <c>null</c> or empty, or
                /// <paramref name="type"/> or <paramref name="getter"/> is <c>null</c>.</exception>
                public void Add(string name, DataViewType type, Delegate getter, Annotations annotations = null)
                {
                    if (string.IsNullOrEmpty(name))
                    {
                        throw new ArgumentNullException(nameof(name));
                    }

                    if (type == null)
                    {
                        throw new ArgumentNullException(nameof(type));
                    }

                    if (getter == null)
                    {
                        throw new ArgumentNullException(nameof(getter));
                    }

                    // NOTE(review): presumably this re-dispatches AddDelegate with type.RawType in place of
                    // the 'int' placeholder - confirm against Utils.MarshalActionInvoke.
                    Utils.MarshalActionInvoke(AddDelegate<int>, type.RawType, name, type, getter, annotations);
                }
 
                /// <summary>
                /// Adds a single annotation column of a primitive type whose getter always yields <paramref name="value"/>.
                /// </summary>
                /// <typeparam name="TValue">The type of the value.</typeparam>
                /// <param name="name">The annotation name.</param>
                /// <param name="type">The annotation type.</param>
                /// <param name="value">The value of the annotation.</param>
                /// <param name="annotations">Annotations of the input column. Note that annotations on an annotation column is somewhat rare
                /// except for certain types (for example, slot names for a vector, key values for something of key type).</param>
                /// <exception cref="ArgumentNullException"><paramref name="name"/> is <c>null</c> or empty, or
                /// <paramref name="type"/> is <c>null</c>.</exception>
                /// <exception cref="ArgumentException">The raw type of <paramref name="type"/> is not <typeparamref name="TValue"/>.</exception>
                public void AddPrimitiveValue<TValue>(string name, PrimitiveDataViewType type, TValue value, Annotations annotations = null)
                {
                    if (string.IsNullOrEmpty(name))
                    {
                        throw new ArgumentNullException(nameof(name));
                    }

                    if (type == null)
                    {
                        throw new ArgumentNullException(nameof(type));
                    }

                    if (type.RawType != typeof(TValue))
                    {
                        throw new ArgumentException($"{nameof(type)}.{nameof(type.RawType)} must be of type '{typeof(TValue).FullName}'.", nameof(type));
                    }

                    // The getter captures 'value' and hands it out on every call.
                    ValueGetter<TValue> constantGetter = (ref TValue dst) => { dst = value; };
                    Add<TValue>(name, type, constantGetter, annotations);
                }
 
                /// <summary>
                /// Produces an <see cref="Annotations"/> row holding everything added to this <see cref="Builder"/> so far.
                /// </summary>
                /// <returns>A new <see cref="Annotations"/> whose schema and getters follow the insertion order.</returns>
                public Annotations ToAnnotations()
                {
                    var schemaBuilder = new DataViewSchema.Builder();
                    var getters = new Delegate[_items.Count];

                    // Build the schema columns and the parallel getter array in one pass.
                    for (int i = 0; i < _items.Count; i++)
                    {
                        var item = _items[i];
                        schemaBuilder.AddColumn(item.Name, item.Type, item.Annotations);
                        getters[i] = item.Getter;
                    }

                    return new Annotations(schemaBuilder.ToSchema(), getters);
                }
 
                /// <summary>
                /// Records one annotation column after checking that <paramref name="getter"/> is a
                /// <see cref="ValueGetter{TValue}"/>. The other arguments are only checked by debug assertions.
                /// </summary>
                /// <exception cref="ArgumentException"><paramref name="getter"/> is not a <see cref="ValueGetter{TValue}"/>.</exception>
                private void AddDelegate<TValue>(string name, DataViewType type, Delegate getter, Annotations annotations)
                {
                    Debug.Assert(!string.IsNullOrEmpty(name));
                    Debug.Assert(type != null);
                    Debug.Assert(getter != null);

                    if (!(getter is ValueGetter<TValue> typedGetter))
                    {
                        throw new ArgumentException($"{nameof(getter)} must be of type '{typeof(ValueGetter<TValue>).FullName}'", nameof(getter));
                    }

                    _items.Add((name, type, typedGetter, annotations));
                }
            }
        }
 
        /// <summary>
        /// Class containing operations to build a <see cref="DataViewSchema"/>.
        /// </summary>
        public sealed class Builder
        {
            // The columns added so far, in insertion order.
            private readonly List<(string Name, DataViewType Type, Annotations Annotations)> _items =
                new List<(string Name, DataViewType Type, Annotations Annotations)>();

            /// <summary>
            /// Creates an empty schema <see cref="Builder"/>.
            /// </summary>
            public Builder()
            {
            }
 
            /// <summary>
            /// Appends a single column to the schema under construction.
            /// </summary>
            /// <param name="name">The column name.</param>
            /// <param name="type">The column type.</param>
            /// <param name="annotations">The column annotations.</param>
            /// <exception cref="ArgumentNullException"><paramref name="name"/> is <c>null</c> or empty, or
            /// <paramref name="type"/> is <c>null</c>.</exception>
            public void AddColumn(string name, DataViewType type, Annotations annotations = null)
            {
                if (string.IsNullOrEmpty(name))
                {
                    throw new ArgumentNullException(nameof(name));
                }

                if (type == null)
                {
                    throw new ArgumentNullException(nameof(type));
                }

                var entry = (name, type, annotations);
                _items.Add(entry);
            }
 
            /// <summary>
            /// Add multiple existing columns to the schema being built.
            /// Each column is appended through <see cref="AddColumn"/>, so its name and type are validated there.
            /// </summary>
            /// <param name="source">Columns to add.</param>
            /// <exception cref="ArgumentNullException"><paramref name="source"/> is <c>null</c>, or a column in it
            /// has a <c>null</c> or empty name or a <c>null</c> type.</exception>
            public void AddColumns(IEnumerable<Column> source)
            {
                // Fail with a clear argument error instead of a NullReferenceException from the loop.
                if (source == null)
                    throw new ArgumentNullException(nameof(source));

                foreach (var column in source)
                    AddColumn(column.Name, column.Type, column.Annotations);
            }
 
            /// <summary>
            /// Add multiple existing detached columns to the schema being built.
            /// Each column is appended through <see cref="AddColumn"/>, so its name and type are validated there.
            /// </summary>
            /// <param name="source">Columns to add.</param>
            /// <exception cref="ArgumentNullException"><paramref name="source"/> is <c>null</c>, or a column in it
            /// has a <c>null</c> or empty name or a <c>null</c> type.</exception>
            public void AddColumns(IEnumerable<DetachedColumn> source)
            {
                // Fail with a clear argument error instead of a NullReferenceException from the loop.
                if (source == null)
                    throw new ArgumentNullException(nameof(source));

                foreach (var column in source)
                    AddColumn(column.Name, column.Type, column.Annotations);
            }
 
            /// <summary>
            /// Produces a <see cref="DataViewSchema"/> holding everything added to this <see cref="Builder"/> so far.
            /// A column is marked hidden when a column added after it has the same name.
            /// </summary>
            /// <returns>A new schema whose columns follow the insertion order.</returns>
            public DataViewSchema ToSchema()
            {
                int count = _items.Count;
                var columns = new Column[count];

                // Walk from the last column to the first: the first time a name is met it belongs to the
                // column with the largest index, which stays visible; every later sighting is hidden.
                var namesSeen = new HashSet<string>();
                for (int i = count - 1; i >= 0; i--)
                {
                    var item = _items[i];
                    bool isHidden = !namesSeen.Add(item.Name);
                    columns[i] = new Column(item.Name, i, isHidden, item.Type, item.Annotations);
                }

                return new DataViewSchema(columns);
            }
        }
 
        /// <summary>
        /// Builds a schema over the given columns and indexes them by name.
        /// This constructor should only be called by <see cref="Builder"/>.
        /// </summary>
        /// <param name="columns">The input columns. The constructed instance takes ownership of the array.</param>
        /// <exception cref="ArgumentNullException"><paramref name="columns"/> is <c>null</c>.</exception>
        private DataViewSchema(Column[] columns)
        {
            _columns = columns ?? throw new ArgumentNullException(nameof(columns));

            // Later columns overwrite earlier ones, so a repeated name maps to its largest index.
            var nameToIndex = new Dictionary<string, int>();
            int position = 0;
            foreach (var column in _columns)
            {
                Debug.Assert(column.Index == position);
                nameToIndex[column.Name] = position;
                position++;
            }

            _nameMap = nameToIndex;
        }
    }
}