Skip to content

Fix incorrect DataFrame min max computation with NULL #6734

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jul 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 39 additions & 18 deletions src/Microsoft.Data.Analysis/DateTimeComputation.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Reflection;
using System.Text;

namespace Microsoft.Data.Analysis
Expand Down Expand Up @@ -189,26 +191,35 @@ public void CumulativeSum(PrimitiveColumnContainer<DateTime> column, IEnumerable
throw new NotSupportedException();
}

/// <summary>
/// Finds the latest non-null <see cref="DateTime"/> stored in <paramref name="column"/>.
/// </summary>
/// <param name="column">Container whose value buffers and matching null bitmaps are scanned.</param>
/// <param name="ret">
/// Receives the largest value whose validity bit is set, or <c>null</c> when no
/// element of any buffer is marked valid.
/// </param>
public void Max(PrimitiveColumnContainer<DateTime> column, out DateTime? ret)
{
    // Track the running maximum as a nullable rather than seeding it with
    // DateTime.MinValue: with a sentinel, a column whose only valid values are
    // DateTime.MinValue never satisfies "val > sentinel" and is reported as null.
    DateTime? maxDate = null;

    for (int b = 0; b < column.Buffers.Count; b++)
    {
        var readOnlySpan = column.Buffers[b].ReadOnlySpan;
        var bitmapSpan = column.NullBitMapBuffers[b].ReadOnlySpan;

        for (int i = 0; i < readOnlySpan.Length; i++)
        {
            // A cleared validity bit means the slot is null - skip it.
            if (!BitmapHelper.IsValid(bitmapSpan, i))
            {
                continue;
            }

            var val = readOnlySpan[i];

            // The first valid element is always accepted; later ones must be strictly greater.
            if (!maxDate.HasValue || val > maxDate.Value)
            {
                maxDate = val;
            }
        }
    }

    ret = maxDate;
}

public void Max(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime ret)
public void Max(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime? ret)
{
ret = default;
var readOnlySpan = column.Buffers[0].ReadOnlySpan;
Expand Down Expand Up @@ -237,26 +248,36 @@ public void Max(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> row
}
}

/// <summary>
/// Finds the earliest non-null <see cref="DateTime"/> stored in <paramref name="column"/>.
/// </summary>
/// <param name="column">Container whose value buffers and matching null bitmaps are scanned.</param>
/// <param name="ret">
/// Receives the smallest value whose validity bit is set, or <c>null</c> when no
/// element of any buffer is marked valid.
/// </param>
public void Min(PrimitiveColumnContainer<DateTime> column, out DateTime? ret)
{
    // Track the running minimum as a nullable rather than seeding it with
    // DateTime.MaxValue: with a sentinel, a column whose only valid values are
    // DateTime.MaxValue never satisfies "val < sentinel" and is reported as null.
    DateTime? minDate = null;

    for (int b = 0; b < column.Buffers.Count; b++)
    {
        var readOnlySpan = column.Buffers[b].ReadOnlySpan;
        var bitmapSpan = column.NullBitMapBuffers[b].ReadOnlySpan;

        for (int i = 0; i < readOnlySpan.Length; i++)
        {
            // A cleared validity bit means the slot is null - skip it.
            if (!BitmapHelper.IsValid(bitmapSpan, i))
            {
                continue;
            }

            var val = readOnlySpan[i];

            // The first valid element is always accepted; later ones must be strictly smaller.
            if (!minDate.HasValue || val < minDate.Value)
            {
                minDate = val;
            }
        }
    }

    ret = minDate;
}

public void Min(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime ret)
public void Min(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime? ret)
{
ret = default;
var readOnlySpan = column.Buffers[0].ReadOnlySpan;
Expand Down Expand Up @@ -285,22 +306,22 @@ public void Min(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> row
}
}

public void Product(PrimitiveColumnContainer<DateTime> column, out DateTime ret)
public void Product(PrimitiveColumnContainer<DateTime> column, out DateTime? ret)
{
throw new NotSupportedException();
}

public void Product(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime ret)
public void Product(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime? ret)
{
throw new NotSupportedException();
}

public void Sum(PrimitiveColumnContainer<DateTime> column, out DateTime ret)
public void Sum(PrimitiveColumnContainer<DateTime> column, out DateTime? ret)
{
throw new NotSupportedException();
}

public void Sum(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime ret)
public void Sum(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime? ret)
{
throw new NotSupportedException();
}
Expand Down
41 changes: 22 additions & 19 deletions src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,24 @@

namespace Microsoft.Data.Analysis
{
/// <summary>
/// Stateless helpers for reading validity bitmaps, where each byte packs eight
/// flags and flag <c>n</c> of a byte lives in bit <c>n</c> counted from the
/// least-significant end.
/// </summary>
internal static class BitmapHelper
{
    /// <summary>
    /// Reports whether the flag for element <paramref name="index"/> is set in the given bitmap span.
    /// </summary>
    /// <param name="bitMapBufferSpan">Bitmap bytes; byte <c>index / 8</c> holds the flag being queried.</param>
    /// <param name="index">Element position within the buffer the bitmap describes.</param>
    /// <returns><c>true</c> when the corresponding bit is 1.</returns>
    public static bool IsValid(ReadOnlySpan<byte> bitMapBufferSpan, int index)
        => IsBitSet(bitMapBufferSpan[index / 8], index);

    /// <summary>
    /// Tests a single bit of <paramref name="curBitMap"/>; only the low three bits of
    /// <paramref name="index"/> select the position, so any element index may be passed directly.
    /// </summary>
    /// <param name="curBitMap">The bitmap byte to inspect.</param>
    /// <param name="index">Element index; reduced with <c>index &amp; 7</c> to a bit position 0-7.</param>
    /// <returns><c>true</c> when the selected bit is 1.</returns>
    public static bool IsBitSet(byte curBitMap, int index)
    {
        int mask = 1 << (index & 7);
        return (curBitMap & mask) != 0;
    }
}

/// <summary>
/// PrimitiveDataFrameColumnContainer is just a store for the column data. APIs that want to change the data must be defined in PrimitiveDataFrameColumn
/// PrimitiveColumnContainer is just a store for the column data. APIs that want to change the data must be defined in PrimitiveDataFrameColumn
/// </summary>
/// <typeparam name="T"></typeparam>
internal partial class PrimitiveColumnContainer<T> : IEnumerable<T?>
Expand Down Expand Up @@ -224,7 +240,7 @@ public void ApplyElementwise(Func<T?, long, T?> func)
for (int i = 0; i < mutableBuffer.Length; i++)
{
long curIndex = i + prevLength;
bool isValid = IsValid(mutableNullBitMapBuffer, i);
bool isValid = BitmapHelper.IsValid(mutableNullBitMapBuffer, i);
T? value = func(isValid ? mutableBuffer[i] : null, curIndex);
mutableBuffer[i] = value.GetValueOrDefault();
SetValidityBit(mutableNullBitMapBuffer, i, value != null);
Expand All @@ -247,22 +263,14 @@ public void Apply<TResult>(Func<T?, TResult?> func, PrimitiveColumnContainer<TRe

for (int i = 0; i < sourceBuffer.Length; i++)
{
bool isValid = IsValid(sourceNullBitMap, i);
bool isValid = BitmapHelper.IsValid(sourceNullBitMap, i);
TResult? value = func(isValid ? sourceBuffer[i] : null);
mutableResultBuffer[i] = value.GetValueOrDefault();
resultContainer.SetValidityBit(mutableResultNullBitMapBuffers, i, value != null);
}
}
}

// Faster to use when we already have a span since it avoids indexing
public bool IsValid(ReadOnlySpan<byte> bitMapBufferSpan, int index)
{
int nullBitMapSpanIndex = index / 8;
byte thisBitMap = bitMapBufferSpan[nullBitMapSpanIndex];
return IsBitSet(thisBitMap, index);
}

public bool IsValid(long index) => NullCount == 0 || GetValidityBit(index);

private byte SetBit(byte curBitMap, int index, bool value)
Expand Down Expand Up @@ -330,11 +338,6 @@ internal void SetValidityBit(long index, bool value)
SetValidityBit(bitMapBuffer.Span, (int)index, value);
}

private bool IsBitSet(byte curBitMap, int index)
{
return ((curBitMap >> (index & 7)) & 1) != 0;
}

private bool GetValidityBit(long index)
{
if ((uint)index >= Length)
Expand All @@ -351,7 +354,7 @@ private bool GetValidityBit(long index)
int bitMapBufferIndex = (int)((uint)index / 8);
Debug.Assert(bitMapBuffer.Length > bitMapBufferIndex);
byte curBitMap = bitMapBuffer[bitMapBufferIndex];
return IsBitSet(curBitMap, (int)index);
return BitmapHelper.IsBitSet(curBitMap, (int)index);
}

public long Length;
Expand Down Expand Up @@ -513,7 +516,7 @@ public PrimitiveColumnContainer<T> Clone<U>(PrimitiveColumnContainer<U> mapIndic
spanIndex = buffer.Length - 1 - i;

long mapRowIndex = mapIndicesIntSpan.IsEmpty ? mapIndicesLongSpan[spanIndex] : mapIndicesIntSpan[spanIndex];
bool mapRowIndexIsValid = mapIndices.IsValid(mapIndicesNullBitMapSpan, spanIndex);
bool mapRowIndexIsValid = BitmapHelper.IsValid(mapIndicesNullBitMapSpan, spanIndex);
if (mapRowIndexIsValid && (mapRowIndex < minRange || mapRowIndex >= maxRange))
{
int bufferIndex = (int)(mapRowIndex / maxCapacity);
Expand All @@ -528,7 +531,7 @@ public PrimitiveColumnContainer<T> Clone<U>(PrimitiveColumnContainer<U> mapIndic
{
mapRowIndex -= minRange;
value = thisSpan[(int)mapRowIndex];
isValid = IsValid(thisNullBitMapSpan, (int)mapRowIndex);
isValid = BitmapHelper.IsValid(thisNullBitMapSpan, (int)mapRowIndex);
}

retSpan[i] = isValid ? value : default;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,49 +93,49 @@ public override DataFrameColumn CumulativeSum(IEnumerable<long> rowIndices, bool
/// <inheritdoc/>
public override object Max()
{
PrimitiveColumnComputation<T>.Instance.Max(_columnContainer, out T ret);
PrimitiveColumnComputation<T>.Instance.Max(_columnContainer, out T? ret);
return ret;
}
/// <inheritdoc/>
public override object Max(IEnumerable<long> rowIndices)
{
PrimitiveColumnComputation<T>.Instance.Max(_columnContainer, rowIndices, out T ret);
PrimitiveColumnComputation<T>.Instance.Max(_columnContainer, rowIndices, out T? ret);
return ret;
}
/// <inheritdoc/>
public override object Min()
{
PrimitiveColumnComputation<T>.Instance.Min(_columnContainer, out T ret);
PrimitiveColumnComputation<T>.Instance.Min(_columnContainer, out T? ret);
return ret;
}
/// <inheritdoc/>
public override object Min(IEnumerable<long> rowIndices)
{
PrimitiveColumnComputation<T>.Instance.Min(_columnContainer, rowIndices, out T ret);
PrimitiveColumnComputation<T>.Instance.Min(_columnContainer, rowIndices, out T? ret);
return ret;
}
/// <inheritdoc/>
public override object Product()
{
PrimitiveColumnComputation<T>.Instance.Product(_columnContainer, out T ret);
PrimitiveColumnComputation<T>.Instance.Product(_columnContainer, out T? ret);
return ret;
}
/// <inheritdoc/>
public override object Product(IEnumerable<long> rowIndices)
{
PrimitiveColumnComputation<T>.Instance.Product(_columnContainer, rowIndices, out T ret);
PrimitiveColumnComputation<T>.Instance.Product(_columnContainer, rowIndices, out T? ret);
return ret;
}
/// <inheritdoc/>
public override object Sum()
{
PrimitiveColumnComputation<T>.Instance.Sum(_columnContainer, out T ret);
PrimitiveColumnComputation<T>.Instance.Sum(_columnContainer, out T? ret);
return ret;
}
/// <inheritdoc/>
public override object Sum(IEnumerable<long> rowIndices)
{
PrimitiveColumnComputation<T>.Instance.Sum(_columnContainer, rowIndices, out T ret);
PrimitiveColumnComputation<T>.Instance.Sum(_columnContainer, rowIndices, out T? ret);
return ret;
}
/// <inheritdoc/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,13 @@ namespace Microsoft.Data.Analysis
PrimitiveColumnComputation<T>.Instance.<#=compMethod.MethodName#>(ret._columnContainer);
return ret;
<# } else if (compMethod.MethodType == MethodType.ElementwiseComputation && compMethod.HasReturnValue == true) {#>
PrimitiveColumnComputation<T>.Instance.<#=compMethod.MethodName#>(_columnContainer, out T ret);
PrimitiveColumnComputation<T>.Instance.<#=compMethod.MethodName#>(_columnContainer, out T? ret);
return ret;
<# } else if (compMethod.MethodType == MethodType.Reduction && compMethod.IsNumeric == true && compMethod.SupportsRowSubsets == true) { #>
PrimitiveColumnComputation<T>.Instance.<#=compMethod.MethodName#>(_columnContainer, rowIndices, out T ret);
PrimitiveColumnComputation<T>.Instance.<#=compMethod.MethodName#>(_columnContainer, rowIndices, out T? ret);
return ret;
<# } else if (compMethod.MethodType == MethodType.Reduction && compMethod.IsNumeric == true) { #>
PrimitiveColumnComputation<T>.Instance.<#=compMethod.MethodName#>(_columnContainer, out T ret);
PrimitiveColumnComputation<T>.Instance.<#=compMethod.MethodName#>(_columnContainer, out T? ret);
return ret;
<# } else { #>
PrimitiveColumnComputation<T>.Instance.<#=compMethod.MethodName#>(_columnContainer, out bool ret);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ private Int64DataFrameColumn GetSortIndices(IComparer<T> comparer, out Int64Data
for (int i = 0; i < sortIndices.Length; i++)
{
int localSortIndex = sortIndices[i];
if (_columnContainer.IsValid(nullBitMapSpan, localSortIndex))
if (BitmapHelper.IsValid(nullBitMapSpan, localSortIndex))
{
nonNullSortIndices.Add(sortIndices[i]);
}
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -584,7 +584,7 @@ public override Dictionary<TKey, ICollection<long>> GroupColumnValues<TKey>(out
for (int i = 0; i < readOnlySpan.Length; i++)
{
long currentLength = i + previousLength;
if (_columnContainer.IsValid(nullBitMapSpan, i))
if (BitmapHelper.IsValid(nullBitMapSpan, i))
{
bool containsKey = multimap.TryGetValue(readOnlySpan[i], out ICollection<long> values);
if (containsKey)
Expand Down
Loading