|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Text;
using Apache.Arrow;
using Apache.Arrow.Types;
using Microsoft.ML;
using Microsoft.ML.Data;
namespace Microsoft.Data.Analysis
{
/// <summary>
/// An immutable column to hold Arrow style strings
/// </summary>
public partial class ArrowStringDataFrameColumn : DataFrameColumn, IEnumerable<string>
{
private readonly IList<ReadOnlyDataFrameBuffer<byte>> _dataBuffers;
private readonly IList<ReadOnlyDataFrameBuffer<int>> _offsetsBuffers;
private readonly IList<ReadOnlyDataFrameBuffer<byte>> _nullBitMapBuffers;
/// <summary>
/// Constructs an empty <see cref="ArrowStringDataFrameColumn"/> with the given <paramref name="name"/>.
/// </summary>
/// <param name="name">The name of the column.</param>
public ArrowStringDataFrameColumn(string name) : base(name, 0, typeof(string))
{
_dataBuffers = new List<ReadOnlyDataFrameBuffer<byte>>();
_offsetsBuffers = new List<ReadOnlyDataFrameBuffer<int>>();
_nullBitMapBuffers = new List<ReadOnlyDataFrameBuffer<byte>>();
}
/// <summary>
/// Constructs an <see cref="ArrowStringDataFrameColumn"/> with the given <paramref name="name"/>, <paramref name="length"/> and <paramref name="nullCount"/>. The <paramref name="values"/>, <paramref name="offsets"/> and <paramref name="nullBits"/> are the contents of the column in the Arrow format.
/// </summary>
/// <param name="name">The name of the column.</param>
/// <param name="values">The Arrow formatted string values in this column.</param>
/// <param name="offsets">The Arrow formatted offsets in this column.</param>
/// <param name="nullBits">The Arrow formatted null bits in this column.</param>
/// <param name="length">The length of the column.</param>
/// <param name="nullCount">The number of <see langword="null" /> values in this column.</param>
public ArrowStringDataFrameColumn(string name, ReadOnlyMemory<byte> values, ReadOnlyMemory<byte> offsets, ReadOnlyMemory<byte> nullBits, int length, int nullCount) : base(name, length, typeof(string))
{
ReadOnlyDataFrameBuffer<byte> dataBuffer = new ReadOnlyDataFrameBuffer<byte>(values, values.Length);
ReadOnlyDataFrameBuffer<int> offsetBuffer = new ReadOnlyDataFrameBuffer<int>(offsets, length + 1);
ReadOnlyDataFrameBuffer<byte> nullBitMapBuffer = new ReadOnlyDataFrameBuffer<byte>(nullBits, nullBits.Length);
if (length + 1 != offsetBuffer.Length)
throw new ArgumentException(nameof(offsetBuffer));
_dataBuffers = new List<ReadOnlyDataFrameBuffer<byte>>();
_offsetsBuffers = new List<ReadOnlyDataFrameBuffer<int>>();
_nullBitMapBuffers = new List<ReadOnlyDataFrameBuffer<byte>>();
_dataBuffers.Add(dataBuffer);
_offsetsBuffers.Add(offsetBuffer);
_nullBitMapBuffers.Add(nullBitMapBuffer);
_nullCount = nullCount;
}
private long _nullCount;
/// <inheritdoc/>
public override long NullCount => _nullCount;
/// <inheritdoc/>
public override bool IsValid(long index) => NullCount == 0 || GetValidityBit(index);
private bool GetValidityBit(long index)
{
if ((ulong)index > (ulong)Length)
{
throw new ArgumentOutOfRangeException(nameof(index));
}
// First find the right bitMapBuffer
int bitMapIndex = GetBufferIndexContainingRowIndex(index, out int indexInBuffer);
Debug.Assert(_nullBitMapBuffers.Count > bitMapIndex);
ReadOnlyDataFrameBuffer<byte> bitMapBuffer = _nullBitMapBuffers[bitMapIndex];
int bitMapBufferIndex = (int)((uint)index / 8);
Debug.Assert(bitMapBuffer.Length > bitMapBufferIndex);
byte curBitMap = bitMapBuffer[bitMapBufferIndex];
return ((curBitMap >> (indexInBuffer & 7)) & 1) != 0;
}
private void SetValidityBit(long index, bool value)
{
if ((ulong)index > (ulong)Length)
{
throw new ArgumentOutOfRangeException(nameof(index));
}
// First find the right bitMapBuffer
int bitMapIndex = GetBufferIndexContainingRowIndex(index, out int indexInBuffer);
Debug.Assert(_nullBitMapBuffers.Count > bitMapIndex);
DataFrameBuffer<byte> bitMapBuffer = (DataFrameBuffer<byte>)_nullBitMapBuffers[bitMapIndex];
// Set the bit
int bitMapBufferIndex = (int)((uint)indexInBuffer / 8);
Debug.Assert(bitMapBuffer.Length >= bitMapBufferIndex);
if (bitMapBuffer.Length == bitMapBufferIndex)
bitMapBuffer.Append(0);
byte curBitMap = bitMapBuffer[bitMapBufferIndex];
byte newBitMap;
if (value)
{
newBitMap = (byte)(curBitMap | (byte)(1 << (indexInBuffer & 7))); //bit hack for index % 8
if (((curBitMap >> (indexInBuffer & 7)) & 1) == 0 && indexInBuffer < Length - 1 && NullCount > 0)
{
// Old value was null.
_nullCount--;
}
}
else
{
if (((curBitMap >> (indexInBuffer & 7)) & 1) == 1 && indexInBuffer < Length)
{
// old value was NOT null and new value is null
_nullCount++;
}
else if (indexInBuffer == Length - 1)
{
// New entry from an append
_nullCount++;
}
newBitMap = (byte)(curBitMap & (byte)~(1 << (int)((uint)indexInBuffer & 7)));
}
bitMapBuffer[bitMapBufferIndex] = newBitMap;
}
/// <summary>
/// Returns an enumeration of immutable buffers representing the underlying values in the Apache Arrow format
/// </summary>
/// <remarks><see langword="null" /> values are encoded in the buffers returned by GetReadOnlyNullBitmapBuffers in the Apache Arrow format</remarks>
/// <remarks>The offsets buffers returned by GetReadOnlyOffsetBuffers can be used to delineate each value</remarks>
/// <returns>An enumeration of <see cref="ReadOnlyMemory{Byte}"/> whose elements are the raw data buffers for the UTF8 string values.</returns>
public IEnumerable<ReadOnlyMemory<byte>> GetReadOnlyDataBuffers()
{
for (int i = 0; i < _dataBuffers.Count; i++)
{
ReadOnlyDataFrameBuffer<byte> buffer = _dataBuffers[i];
yield return buffer.RawReadOnlyMemory;
}
}
/// <summary>
/// Returns an enumeration of immutable <see cref="ReadOnlyMemory{Byte}"/> buffers representing <see langword="null" /> values in the Apache Arrow format
/// </summary>
/// <remarks>Each <see cref="ReadOnlyMemory{Byte}"/> encodes the indices of <see langword="null" /> values in its corresponding Data buffer</remarks>
/// <returns>An enumeration of <see cref="ReadOnlyMemory{Byte}"/> objects whose elements encode the null bit maps for the column's values</returns>
public IEnumerable<ReadOnlyMemory<byte>> GetReadOnlyNullBitMapBuffers()
{
for (int i = 0; i < _nullBitMapBuffers.Count; i++)
{
ReadOnlyDataFrameBuffer<byte> buffer = _nullBitMapBuffers[i];
yield return buffer.RawReadOnlyMemory;
}
}
/// <summary>
/// Returns an enumeration of immutable <see cref="ReadOnlyMemory{Int32}"/> representing offsets into its corresponding Data buffer.
/// The Apache Arrow format specifies how the offset buffer encodes the length of each value in the Data buffer
/// </summary>
/// <returns>An enumeration of <see cref="ReadOnlyMemory{Int32}"/> objects.</returns>
public IEnumerable<ReadOnlyMemory<int>> GetReadOnlyOffsetsBuffers()
{
for (int i = 0; i < _offsetsBuffers.Count; i++)
{
ReadOnlyDataFrameBuffer<int> buffer = _offsetsBuffers[i];
yield return buffer.ReadOnlyMemory;
}
}
// This is an immutable column, however this method exists to support Clone(). Keep this method private
// Appending a default string is equivalent to appending null. It increases the NullCount and sets a null bitmap bit
// Appending an empty string is valid. It does NOT affect the NullCount. It instead adds a new offset entry
private void Append(ReadOnlySpan<byte> value)
{
if (_dataBuffers.Count == 0)
{
_dataBuffers.Add(new DataFrameBuffer<byte>());
_nullBitMapBuffers.Add(new DataFrameBuffer<byte>());
_offsetsBuffers.Add(new DataFrameBuffer<int>());
}
DataFrameBuffer<int> mutableOffsetsBuffer = (DataFrameBuffer<int>)_offsetsBuffers[_offsetsBuffers.Count - 1];
if (mutableOffsetsBuffer.Length == 0)
{
mutableOffsetsBuffer.Append(0);
}
Length++;
if (value.IsEmpty)
{
mutableOffsetsBuffer.Append(mutableOffsetsBuffer[mutableOffsetsBuffer.Length - 1]);
}
else
{
DataFrameBuffer<byte> mutableDataBuffer = (DataFrameBuffer<byte>)_dataBuffers[_dataBuffers.Count - 1];
if (mutableDataBuffer.Length == ReadOnlyDataFrameBuffer<byte>.MaxCapacity)
{
mutableDataBuffer = new DataFrameBuffer<byte>();
_dataBuffers.Add(mutableDataBuffer);
_nullBitMapBuffers.Add(new DataFrameBuffer<byte>());
mutableOffsetsBuffer = new DataFrameBuffer<int>();
_offsetsBuffers.Add(mutableOffsetsBuffer);
mutableOffsetsBuffer.Append(0);
}
var startIndex = mutableDataBuffer.Length;
mutableDataBuffer.IncreaseSize(value.Length);
value.CopyTo(mutableDataBuffer.RawSpan.Slice(startIndex));
mutableOffsetsBuffer.Append(mutableOffsetsBuffer[mutableOffsetsBuffer.Length - 1] + value.Length);
}
SetValidityBit(Length - 1, !value.IsEmpty);
}
private int GetBufferIndexContainingRowIndex(long rowIndex, out int indexInBuffer)
{
if (rowIndex >= Length)
{
throw new ArgumentOutOfRangeException(Strings.IndexIsGreaterThanColumnLength, nameof(rowIndex));
}
// Since the strings here could be of variable length, scan linearly
int curArrayIndex = 0;
int numBuffers = _offsetsBuffers.Count;
while (curArrayIndex < numBuffers && rowIndex > _offsetsBuffers[curArrayIndex].Length - 1)
{
rowIndex -= _offsetsBuffers[curArrayIndex].Length - 1;
curArrayIndex++;
}
indexInBuffer = (int)rowIndex;
return curArrayIndex;
}
private ReadOnlySpan<byte> GetBytes(long index)
{
int offsetsBufferIndex = GetBufferIndexContainingRowIndex(index, out int indexInBuffer);
ReadOnlySpan<int> offsetBufferSpan = _offsetsBuffers[offsetsBufferIndex].ReadOnlySpan;
int currentOffset = offsetBufferSpan[indexInBuffer];
int nextOffset = offsetBufferSpan[indexInBuffer + 1];
int numberOfBytes = nextOffset - currentOffset;
return _dataBuffers[offsetsBufferIndex].ReadOnlySpan.Slice(currentOffset, numberOfBytes);
}
/// <inheritdoc/>
protected override object GetValue(long rowIndex) => GetValueImplementation(rowIndex);
private string GetValueImplementation(long rowIndex)
{
if (!IsValid(rowIndex))
{
return null;
}
var bytes = GetBytes(rowIndex);
unsafe
{
fixed (byte* data = &MemoryMarshal.GetReference(bytes))
return Encoding.UTF8.GetString(data, bytes.Length);
}
}
/// <inheritdoc/>
protected override IReadOnlyList<object> GetValues(long startIndex, int length)
{
var ret = new List<object>();
while (ret.Count < length)
{
ret.Add(GetValueImplementation(startIndex++));
}
return ret;
}
/// <inheritdoc/>
protected override void SetValue(long rowIndex, object value) => throw new NotSupportedException(Strings.ImmutableColumn);
/// <summary>
/// Indexer to get values. This is an immutable column
/// </summary>
/// <param name="rowIndex">Zero based row index</param>
/// <returns>The value stored at this <paramref name="rowIndex"/></returns>
public new string this[long rowIndex]
{
get => GetValueImplementation(rowIndex);
set => throw new NotSupportedException(Strings.ImmutableColumn);
}
/// <summary>
/// Returns <paramref name="length"/> number of values starting from <paramref name="startIndex"/>.
/// </summary>
/// <param name="startIndex">The index of the first value to return.</param>
/// <param name="length">The number of values to return starting from <paramref name="startIndex"/></param>
/// <returns>A new list of string values</returns>
public new List<string> this[long startIndex, int length]
{
get
{
var ret = new List<string>();
while (ret.Count < length)
{
ret.Add(GetValueImplementation(startIndex++));
}
return ret;
}
}
/// <summary>
/// Returns an enumerator that iterates through the string values in this column.
/// </summary>
public IEnumerator<string> GetEnumerator()
{
for (long i = 0; i < Length; i++)
{
yield return this[i];
}
}
/// <inheritdoc/>
protected override IEnumerator GetEnumeratorCore() => GetEnumerator();
/// <inheritdoc/>
protected internal override Field GetArrowField() => new Field(Name, StringType.Default, NullCount != 0);
/// <inheritdoc/>
protected internal override int GetMaxRecordBatchLength(long startIndex)
{
if (Length == 0)
return 0;
int offsetsBufferIndex = GetBufferIndexContainingRowIndex(startIndex, out int indexInBuffer);
Debug.Assert(indexInBuffer <= Int32.MaxValue);
return _offsetsBuffers[offsetsBufferIndex].Length - indexInBuffer;
}
private int GetNullCount(long startIndex, int numberOfRows)
{
int nullCount = 0;
for (long i = startIndex; i < numberOfRows; i++)
{
if (!IsValid(i))
nullCount++;
}
return nullCount;
}
/// <inheritdoc/>
protected internal override Apache.Arrow.Array ToArrowArray(long startIndex, int numberOfRows)
{
if (numberOfRows == 0)
return new StringArray(numberOfRows, ArrowBuffer.Empty, ArrowBuffer.Empty, ArrowBuffer.Empty);
int offsetsBufferIndex = GetBufferIndexContainingRowIndex(startIndex, out int indexInBuffer);
if (numberOfRows != 0 && numberOfRows > _offsetsBuffers[offsetsBufferIndex].Length - 1 - indexInBuffer)
{
throw new ArgumentException(Strings.SpansMultipleBuffers, nameof(numberOfRows));
}
ArrowBuffer dataBuffer = new ArrowBuffer(_dataBuffers[offsetsBufferIndex].ReadOnlyBuffer);
ArrowBuffer offsetsBuffer = new ArrowBuffer(_offsetsBuffers[offsetsBufferIndex].ReadOnlyBuffer);
ArrowBuffer nullBuffer = new ArrowBuffer(_nullBitMapBuffers[offsetsBufferIndex].ReadOnlyBuffer);
int nullCount = GetNullCount(indexInBuffer, numberOfRows);
return new StringArray(numberOfRows, offsetsBuffer, dataBuffer, nullBuffer, nullCount, indexInBuffer);
}
protected internal override PrimitiveDataFrameColumn<long> GetSortIndices(bool ascending, bool putNullValuesLast) => throw new NotSupportedException();
public new ArrowStringDataFrameColumn Clone(long numberOfNullsToAppend = 0)
{
return (ArrowStringDataFrameColumn)CloneImplementation(numberOfNullsToAppend);
}
public new ArrowStringDataFrameColumn Clone(DataFrameColumn mapIndices, bool invertMapIndices = false, long numberOfNullsToAppend = 0)
{
return (ArrowStringDataFrameColumn)CloneImplementation(mapIndices, invertMapIndices, numberOfNullsToAppend);
}
/// <inheritdoc/>
protected override DataFrameColumn CloneImplementation(long numberOfNullsToAppend)
{
var ret = new ArrowStringDataFrameColumn(Name);
for (long i = 0; i < Length; i++)
ret.Append(IsValid(i) ? GetBytes(i) : default(ReadOnlySpan<byte>));
for (long i = 0; i < numberOfNullsToAppend; i++)
ret.Append(default);
return ret;
}
/// <inheritdoc/>
protected override DataFrameColumn CloneImplementation(DataFrameColumn mapIndices, bool invertMapIndices = false, long numberOfNullsToAppend = 0)
{
ArrowStringDataFrameColumn clone;
if (!(mapIndices is null))
{
Type dataType = mapIndices.DataType;
if (dataType != typeof(long) && dataType != typeof(int) && dataType != typeof(bool))
throw new ArgumentException(String.Format(Strings.MultipleMismatchedValueType, typeof(long), typeof(int), typeof(bool)), nameof(mapIndices));
if (mapIndices.DataType == typeof(long))
clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn<long>, invertMapIndices);
else if (dataType == typeof(int))
clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn<int>, invertMapIndices);
else
clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn<bool>);
for (long i = 0; i < numberOfNullsToAppend; i++)
clone.Append(default);
}
else
{
clone = Clone(numberOfNullsToAppend);
}
return clone;
}
private ArrowStringDataFrameColumn CloneImplementation(PrimitiveDataFrameColumn<bool> boolColumn)
{
if (boolColumn.Length > Length)
throw new ArgumentException(Strings.MapIndicesExceedsColumnLength, nameof(boolColumn));
ArrowStringDataFrameColumn ret = new ArrowStringDataFrameColumn(Name);
for (long i = 0; i < boolColumn.Length; i++)
{
bool? value = boolColumn[i];
if (value == true)
ret.Append(IsValid(i) ? GetBytes(i) : default(ReadOnlySpan<byte>));
}
return ret;
}
private ArrowStringDataFrameColumn CloneImplementation(PrimitiveDataFrameColumn<int> mapIndices, bool invertMapIndices)
{
ArrowStringDataFrameColumn ret = new ArrowStringDataFrameColumn(Name);
for (long i = 0; i < mapIndices.Length; i++)
{
int? index = mapIndices[invertMapIndices ? mapIndices.Length - 1 - i : i];
if (index == null)
{
ret.Append(default);
continue;
}
ret.Append(IsValid(index.Value) ? GetBytes(index.Value) : default(ReadOnlySpan<byte>));
}
return ret;
}
private ArrowStringDataFrameColumn CloneImplementation(PrimitiveDataFrameColumn<long> mapIndices, bool invertMapIndices)
{
ArrowStringDataFrameColumn ret = new ArrowStringDataFrameColumn(Name);
for (long i = 0; i < mapIndices.Length; i++)
{
long? index = mapIndices[invertMapIndices ? mapIndices.Length - 1 - i : i];
if (index == null)
{
ret.Append(default);
continue;
}
ret.Append(IsValid(index.Value) ? GetBytes(index.Value) : default(ReadOnlySpan<byte>));
}
return ret;
}
/// <inheritdoc/>
public override DataFrame ValueCounts()
{
Dictionary<string, ICollection<long>> groupedValues = GroupColumnValues<string>(out HashSet<long> _);
return StringDataFrameColumn.ValueCountsImplementation(groupedValues);
}
/// <inheritdoc/>
public override GroupBy GroupBy(int columnIndex, DataFrame parent)
{
Dictionary<string, ICollection<long>> dictionary = GroupColumnValues<string>(out HashSet<long> _);
return new GroupBy<string>(parent, columnIndex, dictionary);
}
/// <inheritdoc/>
public override Dictionary<TKey, ICollection<long>> GroupColumnValues<TKey>(out HashSet<long> nullIndices)
{
if (typeof(TKey) == typeof(string))
{
nullIndices = new HashSet<long>();
Dictionary<string, ICollection<long>> multimap = new Dictionary<string, ICollection<long>>(EqualityComparer<string>.Default);
for (long i = 0; i < Length; i++)
{
string str = this[i];
if (str != null)
{
bool containsKey = multimap.TryGetValue(str, out ICollection<long> values);
if (containsKey)
{
values.Add(i);
}
else
{
multimap.Add(str, new List<long>() { i });
}
}
else
{
nullIndices.Add(i);
}
}
return multimap as Dictionary<TKey, ICollection<long>>;
}
else
{
throw new NotSupportedException(nameof(TKey));
}
}
/// <inheritdoc/>
public ArrowStringDataFrameColumn FillNulls(string value, bool inPlace = false)
{
if (value == null)
{
throw new ArgumentException(nameof(value));
}
if (inPlace)
{
/* For now throw an exception if inPlace = true. Need to investigate if Apache Arrow
* format supports filling nulls for variable length arrays
*/
throw new NotSupportedException();
}
ArrowStringDataFrameColumn ret = new ArrowStringDataFrameColumn(Name);
for (long i = 0; i < Length; i++)
{
ret.Append(IsValid(i) ? GetBytes(i) : Encoding.UTF8.GetBytes(value));
}
return ret;
}
protected override DataFrameColumn FillNullsImplementation(object value, bool inPlace)
{
if (value is string valueString)
{
return FillNulls(valueString, inPlace);
}
else
{
throw new ArgumentException(String.Format(Strings.MismatchedValueType, typeof(string)), nameof(value));
}
}
/// <inheritdoc/>
public new ArrowStringDataFrameColumn DropNulls()
{
return (ArrowStringDataFrameColumn)DropNullsImplementation();
}
protected override DataFrameColumn DropNullsImplementation()
{
var ret = new ArrowStringDataFrameColumn(Name);
for (long i = 0; i < Length; i++)
{
if (IsValid(i))
ret.Append(GetBytes(i));
}
return ret;
}
public override DataFrameColumn Clamp<U>(U min, U max, bool inPlace = false) => throw new NotSupportedException();
public override DataFrameColumn Filter<U>(U min, U max) => throw new NotSupportedException();
/// <inheritdoc/>
protected internal override void AddDataViewColumn(DataViewSchema.Builder builder)
{
builder.AddColumn(Name, TextDataViewType.Instance);
}
/// <inheritdoc/>
protected internal override Delegate GetDataViewGetter(DataViewRowCursor cursor)
{
return CreateValueGetterDelegate(cursor);
}
private ValueGetter<ReadOnlyMemory<char>> CreateValueGetterDelegate(DataViewRowCursor cursor) =>
(ref ReadOnlyMemory<char> value) => value = this[cursor.Position].AsMemory();
/// <summary>
/// Returns a boolean column that is the result of an elementwise equality comparison of each value in the column with <paramref name="value"/>
/// </summary>
public PrimitiveDataFrameColumn<bool> ElementwiseEquals(string value)
{
ReadOnlySpan<byte> bytes = value != null ? Encoding.UTF8.GetBytes(value) : default(ReadOnlySpan<byte>);
PrimitiveDataFrameColumn<bool> ret = new PrimitiveDataFrameColumn<bool>(Name, Length);
if (value == null)
{
for (long i = 0; i < Length; i++)
{
ret[i] = !IsValid(i);
}
}
else
{
for (long i = 0; i < Length; i++)
{
var strBytes = GetBytes(i);
ret[i] = strBytes.SequenceEqual(bytes);
}
}
return ret;
}
/// <inheritdoc/>
public override PrimitiveDataFrameColumn<bool> ElementwiseEquals<T>(T value)
{
if (value is DataFrameColumn column)
{
return ElementwiseEquals(column);
}
return ElementwiseEquals(value.ToString());
}
/// <inheritdoc/>
public override PrimitiveDataFrameColumn<bool> ElementwiseEquals(DataFrameColumn column)
{
return StringDataFrameColumn.ElementwiseEqualsImplementation(this, column);
}
/// <summary>
/// Returns a boolean column that is the result of an elementwise not-equal comparison of each value in the column with <paramref name="value"/>
/// </summary>
public PrimitiveDataFrameColumn<bool> ElementwiseNotEquals(string value)
{
ReadOnlySpan<byte> bytes = value != null ? Encoding.UTF8.GetBytes(value) : default(ReadOnlySpan<byte>);
PrimitiveDataFrameColumn<bool> ret = new PrimitiveDataFrameColumn<bool>(Name, Length);
if (value == null)
{
for (long i = 0; i < Length; i++)
{
ret[i] = IsValid(i);
}
}
else
{
for (long i = 0; i < Length; i++)
{
var strBytes = GetBytes(i);
ret[i] = !strBytes.SequenceEqual(bytes);
}
}
return ret;
}
/// <inheritdoc/>
public override PrimitiveDataFrameColumn<bool> ElementwiseNotEquals<T>(T value)
{
if (value is DataFrameColumn column)
{
return ElementwiseNotEquals(column);
}
return ElementwiseNotEquals(value.ToString());
}
/// <inheritdoc/>
public override PrimitiveDataFrameColumn<bool> ElementwiseNotEquals(DataFrameColumn column)
{
return StringDataFrameColumn.ElementwiseNotEqualsImplementation(this, column);
}
/// <summary>
/// Applies a function to all the values
/// </summary>
/// <param name="func">The function to apply</param>
/// <returns>A <see cref="ArrowStringDataFrameColumn"/> containing the new string values</returns>
/// <remarks>This function converts from UTF-8 to UTF-16 strings</remarks>
public ArrowStringDataFrameColumn Apply(Func<string, string> func)
{
ArrowStringDataFrameColumn ret = new ArrowStringDataFrameColumn(Name);
Encoding encoding = Encoding.UTF8;
for (long i = 0; i < Length; i++)
{
string cur = this[i];
string funcResult = func(cur);
ret.Append(funcResult != null ? encoding.GetBytes(funcResult) : default(ReadOnlySpan<byte>));
}
return ret;
}
public override Dictionary<long, ICollection<long>> GetGroupedOccurrences(DataFrameColumn other, out HashSet<long> otherColumnNullIndices)
{
return GetGroupedOccurrences<string>(other, out otherColumnNullIndices);
}
}
}
|