Skip to content

Commit

Permalink
Fix DataFrame to allow to store columns with size more than 2 Gb (#6710)
Browse files Browse the repository at this point in the history
* Fix error with allocating more than MaxCapacity of Byte Memory Buffer

* Remove Unit test as it consumes too much memory

* Fix issue with increasing buffer capacity over limit when double it size
  • Loading branch information
asmirnov82 committed Jul 6, 2023
1 parent 53c0f26 commit 26c2446
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 4 deletions.
5 changes: 4 additions & 1 deletion src/Microsoft.Data.Analysis/DataFrameBuffer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,10 @@ public void EnsureCapacity(int numberOfValues)

if (newLength > Capacity)
{
var newCapacity = Math.Max(newLength * Size, ReadOnlyBuffer.Length * 2);
//Double buffer size, but not higher than MaxByteCapacity
var doubledSize = (int)Math.Min((long)ReadOnlyBuffer.Length * 2, MaxCapacityInBytes);
var newCapacity = Math.Max(newLength * Size, doubledSize);

var memory = new Memory<byte>(new byte[newCapacity]);
_memory.CopyTo(memory);
_memory = memory;
Expand Down
5 changes: 3 additions & 2 deletions src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,9 @@ public void AppendMany(T? value, long count)
}

DataFrameBuffer<T> mutableLastBuffer = Buffers.GetOrCreateMutable(Buffers.Count - 1);
int allocatable = (int)Math.Min(remaining, ReadOnlyDataFrameBuffer<T>.MaxCapacity);

//Calculate how many values we can additionaly allocate and not exceed the MaxCapacity
int allocatable = (int)Math.Min(remaining, ReadOnlyDataFrameBuffer<T>.MaxCapacity - mutableLastBuffer.Length);
mutableLastBuffer.EnsureCapacity(allocatable);

DataFrameBuffer<byte> lastNullBitMapBuffer = NullBitMapBuffers.GetOrCreateMutable(NullBitMapBuffers.Count - 1);
Expand All @@ -205,7 +207,6 @@ public void AppendMany(T? value, long count)
_modifyNullCountWhileIndexing = true;
}


remaining -= allocatable;
}
}
Expand Down
5 changes: 4 additions & 1 deletion src/Microsoft.Data.Analysis/ReadOnlyDataFrameBuffer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,11 @@ public ReadOnlyMemory<T> RawReadOnlyMemory

protected int Capacity => ReadOnlyBuffer.Length / Size;

//The maximum size in any single dimension for byte array is 0x7FFFFFc7 - 2147483591
//See https://learn.microsoft.com/en-us/dotnet/framework/configure-apps/file-schema/runtime/gcallowverylargeobjects-element
public const int MaxCapacityInBytes = 2147483591;

public static int MaxCapacity => Int32.MaxValue / Size;
public static int MaxCapacity => MaxCapacityInBytes / Size;

public ReadOnlySpan<T> ReadOnlySpan
{
Expand Down
22 changes: 22 additions & 0 deletions test/Microsoft.Data.Analysis.Tests/BufferTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,28 @@ public void TestArrowStringColumnClone()
Assert.Null(clone[i]);
}

/* Don't run tests during build as they fail, because build if build machine doesn't have enought memory
[Fact]
public void TestAppend_SizeMoreThanMaxBufferCapacity()
{
//Check appending value, than can increase buffer size over MaxCapacity (default strategy is to double buffer capacity)
PrimitiveDataFrameColumn<byte> intColumn = new PrimitiveDataFrameColumn<byte>("Byte1", int.MaxValue / 2 - 1);
intColumn.Append(10);
}
[Fact]
public void TestAppendMany_SizeMoreThanMaxBufferCapacity()
{
const int MaxCapacityInBytes = 2147483591;
//Check appending values with extending column size over MaxCapacity of ReadOnlyDataFrameBuffer
PrimitiveDataFrameColumn<byte> intColumn = new PrimitiveDataFrameColumn<byte>("Byte1", MaxCapacityInBytes - 5);
intColumn.AppendMany(5, 10);
Assert.Equal(MaxCapacityInBytes + 5, intColumn.Length);
}
*/

//#if !NETFRAMEWORK // https://github.com/dotnet/corefxlab/issues/2796
// [Fact]
// public void TestPrimitiveColumnGetReadOnlyBuffers()
Expand Down

0 comments on commit 26c2446

Please sign in to comment.