Skip to content

Commit 26c2446

Browse files
authored
Fix DataFrame to allow storing columns larger than 2 GB (#6710)
* Fix error with allocating more than MaxCapacity of Byte Memory Buffer * Remove Unit test as it consumes too much memory * Fix issue with increasing buffer capacity over the limit when doubling its size
1 parent 53c0f26 commit 26c2446

File tree

4 files changed

+33
-4
lines changed

4 files changed

+33
-4
lines changed

src/Microsoft.Data.Analysis/DataFrameBuffer.cs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,10 @@ public void EnsureCapacity(int numberOfValues)
6666

6767
if (newLength > Capacity)
6868
{
69-
var newCapacity = Math.Max(newLength * Size, ReadOnlyBuffer.Length * 2);
69+
//Double the buffer size, but do not exceed MaxCapacityInBytes
70+
var doubledSize = (int)Math.Min((long)ReadOnlyBuffer.Length * 2, MaxCapacityInBytes);
71+
var newCapacity = Math.Max(newLength * Size, doubledSize);
72+
7073
var memory = new Memory<byte>(new byte[newCapacity]);
7174
_memory.CopyTo(memory);
7275
_memory = memory;

src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,9 @@ public void AppendMany(T? value, long count)
181181
}
182182

183183
DataFrameBuffer<T> mutableLastBuffer = Buffers.GetOrCreateMutable(Buffers.Count - 1);
184-
int allocatable = (int)Math.Min(remaining, ReadOnlyDataFrameBuffer<T>.MaxCapacity);
184+
185+
//Calculate how many additional values we can allocate without exceeding MaxCapacity
186+
int allocatable = (int)Math.Min(remaining, ReadOnlyDataFrameBuffer<T>.MaxCapacity - mutableLastBuffer.Length);
185187
mutableLastBuffer.EnsureCapacity(allocatable);
186188

187189
DataFrameBuffer<byte> lastNullBitMapBuffer = NullBitMapBuffers.GetOrCreateMutable(NullBitMapBuffers.Count - 1);
@@ -205,7 +207,6 @@ public void AppendMany(T? value, long count)
205207
_modifyNullCountWhileIndexing = true;
206208
}
207209

208-
209210
remaining -= allocatable;
210211
}
211212
}

src/Microsoft.Data.Analysis/ReadOnlyDataFrameBuffer.cs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,11 @@ public ReadOnlyMemory<T> RawReadOnlyMemory
3636

3737
protected int Capacity => ReadOnlyBuffer.Length / Size;
3838

39+
//The maximum size in any single dimension for a byte array is 0x7FFFFFC7 (2147483591)
40+
//See https://learn.microsoft.com/en-us/dotnet/framework/configure-apps/file-schema/runtime/gcallowverylargeobjects-element
41+
public const int MaxCapacityInBytes = 2147483591;
3942

40-
public static int MaxCapacity => Int32.MaxValue / Size;
43+
public static int MaxCapacity => MaxCapacityInBytes / Size;
4144

4245
public ReadOnlySpan<T> ReadOnlySpan
4346
{

test/Microsoft.Data.Analysis.Tests/BufferTests.cs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,28 @@ public void TestArrowStringColumnClone()
188188
Assert.Null(clone[i]);
189189
}
190190

191+
/* Don't run these tests during the build, because they fail if the build machine doesn't have enough memory
192+
[Fact]
193+
public void TestAppend_SizeMoreThanMaxBufferCapacity()
194+
{
195+
//Check appending a value that can increase the buffer size over MaxCapacity (the default strategy is to double the buffer capacity)
196+
PrimitiveDataFrameColumn<byte> intColumn = new PrimitiveDataFrameColumn<byte>("Byte1", int.MaxValue / 2 - 1);
197+
intColumn.Append(10);
198+
}
199+
200+
[Fact]
201+
public void TestAppendMany_SizeMoreThanMaxBufferCapacity()
202+
{
203+
const int MaxCapacityInBytes = 2147483591;
204+
205+
//Check appending values that extend the column size over the MaxCapacity of ReadOnlyDataFrameBuffer
206+
PrimitiveDataFrameColumn<byte> intColumn = new PrimitiveDataFrameColumn<byte>("Byte1", MaxCapacityInBytes - 5);
207+
intColumn.AppendMany(5, 10);
208+
209+
Assert.Equal(MaxCapacityInBytes + 5, intColumn.Length);
210+
}
211+
*/
212+
191213
//#if !NETFRAMEWORK // https://github.com/dotnet/corefxlab/issues/2796
192214
// [Fact]
193215
// public void TestPrimitiveColumnGetReadOnlyBuffers()

0 commit comments

Comments
 (0)