Skip to content

Fix DataFrame to allow to store columns with size more than 2 Gb #6710

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jul 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/Microsoft.Data.Analysis/DataFrameBuffer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,10 @@ public void EnsureCapacity(int numberOfValues)

if (newLength > Capacity)
{
var newCapacity = Math.Max(newLength * Size, ReadOnlyBuffer.Length * 2);
//Double buffer size, but not higher than MaxByteCapacity
var doubledSize = (int)Math.Min((long)ReadOnlyBuffer.Length * 2, MaxCapacityInBytes);
var newCapacity = Math.Max(newLength * Size, doubledSize);

var memory = new Memory<byte>(new byte[newCapacity]);
_memory.CopyTo(memory);
_memory = memory;
Expand Down
5 changes: 3 additions & 2 deletions src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,9 @@ public void AppendMany(T? value, long count)
}

DataFrameBuffer<T> mutableLastBuffer = Buffers.GetOrCreateMutable(Buffers.Count - 1);
int allocatable = (int)Math.Min(remaining, ReadOnlyDataFrameBuffer<T>.MaxCapacity);

//Calculate how many values we can additionaly allocate and not exceed the MaxCapacity
int allocatable = (int)Math.Min(remaining, ReadOnlyDataFrameBuffer<T>.MaxCapacity - mutableLastBuffer.Length);
mutableLastBuffer.EnsureCapacity(allocatable);

DataFrameBuffer<byte> lastNullBitMapBuffer = NullBitMapBuffers.GetOrCreateMutable(NullBitMapBuffers.Count - 1);
Expand All @@ -205,7 +207,6 @@ public void AppendMany(T? value, long count)
_modifyNullCountWhileIndexing = true;
}


remaining -= allocatable;
}
}
Expand Down
5 changes: 4 additions & 1 deletion src/Microsoft.Data.Analysis/ReadOnlyDataFrameBuffer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,11 @@ public ReadOnlyMemory<T> RawReadOnlyMemory

protected int Capacity => ReadOnlyBuffer.Length / Size;

//The maximum size in any single dimension for byte array is 0x7FFFFFc7 - 2147483591
//See https://learn.microsoft.com/en-us/dotnet/framework/configure-apps/file-schema/runtime/gcallowverylargeobjects-element
public const int MaxCapacityInBytes = 2147483591;

public static int MaxCapacity => Int32.MaxValue / Size;
public static int MaxCapacity => MaxCapacityInBytes / Size;

public ReadOnlySpan<T> ReadOnlySpan
{
Expand Down
22 changes: 22 additions & 0 deletions test/Microsoft.Data.Analysis.Tests/BufferTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,28 @@ public void TestArrowStringColumnClone()
Assert.Null(clone[i]);
}

/* Don't run tests during build as they fail, because build if build machine doesn't have enought memory
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@asmirnov82 are you sure this is the reason?

Can we add a flag so these still run locally?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should be able to do a #if and then set a flag in build definition

Copy link
Contributor

@JakeRadMSFT JakeRadMSFT Jul 6, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@asmirnov82 can you open an issue. I don't want to block on this.

@tannergooding said this might actually be a netfx thing? There might need to be something set to support > 2gb. Comment below.

"For the more than 2GB one, it will be allowed but will then fail on .NET Framework anyways due to GC limitations

You have to opt in with gcAllowVeryLargeObjects iirc, and that's only for 64-bit"

Copy link
Contributor Author

@asmirnov82 asmirnov82 Jul 7, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JakeRadMSFT I fixed tests using X64FactAttribute. Please take a look at #6758 and #6759

[Fact]
public void TestAppend_SizeMoreThanMaxBufferCapacity()
{
//Check appending value, than can increase buffer size over MaxCapacity (default strategy is to double buffer capacity)
PrimitiveDataFrameColumn<byte> intColumn = new PrimitiveDataFrameColumn<byte>("Byte1", int.MaxValue / 2 - 1);
intColumn.Append(10);
}

[Fact]
public void TestAppendMany_SizeMoreThanMaxBufferCapacity()
{
const int MaxCapacityInBytes = 2147483591;

//Check appending values with extending column size over MaxCapacity of ReadOnlyDataFrameBuffer
PrimitiveDataFrameColumn<byte> intColumn = new PrimitiveDataFrameColumn<byte>("Byte1", MaxCapacityInBytes - 5);
intColumn.AppendMany(5, 10);

Assert.Equal(MaxCapacityInBytes + 5, intColumn.Length);
}
*/

//#if !NETFRAMEWORK // https://github.com/dotnet/corefxlab/issues/2796
// [Fact]
// public void TestPrimitiveColumnGetReadOnlyBuffers()
Expand Down