Skip to content

Commit 0f8a080

Browse files
Yicong-HuangYicong Huang
andauthored
GH-343: Fix ListVector offset buffer not properly serialized for nested empty arrays (#967)
## What's Changed Fix `ListVector`/`LargeListVector` IPC serialization when `valueCount` is 0. ### Problem When `valueCount == 0`, `setReaderAndWriterIndex()` was setting `offsetBuffer.writerIndex(0)`, which means `readableBytes() == 0`. IPC serializer uses `readableBytes()` to determine buffer size, so 0 bytes were written to the IPC stream. This crashes IPC readers in other libraries because Arrow spec requires offset buffer to have at least one entry `[0]`. @viirya: > The offset buffers are allocated properly. But during IPC serialization, they are ignored. > ``` > public long readableBytes() { > return writerIndex - readerIndex; > } > ``` > So when ListVector.setReaderAndWriterIndex() sets writerIndex(0) and readerIndex(0), readableBytes() returns 0 - 0 = 0. > > Then when MessageSerializer.writeBatchBuffers() calls WriteChannel.write(buffer), it writes 0 bytes. > > So the flow is: > > valueCount=0 → ListVector.setReaderAndWriterIndex() sets offsetBuffer.writerIndex(0) > VectorUnloader.getFieldBuffers() returns the buffer with writerIndex=0 > MessageSerializer.writeBatchBuffers() writes the buffer > WriteChannel.write(buffer) checks buffer.readableBytes() which is 0 > 0 bytes are written to the IPC stream > PyArrow read the batch with the missing buffer → crash when other libraries to read ### Fix Simplify `setReaderAndWriterIndex()` to always use `(valueCount + 1) * OFFSET_WIDTH` for offset buffer's `writerIndex`. When `valueCount == 0`, this correctly sets `writerIndex` to `OFFSET_WIDTH`, ensuring `offset[0]` is included in serialization. ### Testing Added tests for nested empty lists verifying offset buffer has correct `readableBytes()`. Closes #343. --------- Co-authored-by: Yicong Huang <yicong.huang+data@databricks.com>
1 parent ccaac9a commit 0f8a080

File tree

4 files changed

+50
-4
lines changed

4 files changed

+50
-4
lines changed

vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -309,11 +309,14 @@ private void setReaderAndWriterIndex() {
309309
offsetBuffer.readerIndex(0);
310310
if (valueCount == 0) {
311311
validityBuffer.writerIndex(0);
312-
offsetBuffer.writerIndex(0);
313312
} else {
314313
validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
315-
offsetBuffer.writerIndex((valueCount + 1) * OFFSET_WIDTH);
316314
}
315+
// IPC serializer will determine readable bytes based on `readerIndex` and `writerIndex`.
316+
// Both are set to 0 means 0 bytes are written to the IPC stream which will crash IPC readers
317+
// in other libraries. According to Arrow spec, we should still output the offset buffer which
318+
// is [0].
319+
offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
317320
}
318321

319322
/**

vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -267,11 +267,14 @@ private void setReaderAndWriterIndex() {
267267
offsetBuffer.readerIndex(0);
268268
if (valueCount == 0) {
269269
validityBuffer.writerIndex(0);
270-
offsetBuffer.writerIndex(0);
271270
} else {
272271
validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
273-
offsetBuffer.writerIndex((valueCount + 1) * OFFSET_WIDTH);
274272
}
273+
// IPC serializer will determine readable bytes based on `readerIndex` and `writerIndex`.
274+
// Both are set to 0 means 0 bytes are written to the IPC stream which will crash IPC readers
275+
// in other libraries. According to Arrow spec, we should still output the offset buffer which
276+
// is [0].
277+
offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
275278
}
276279

277280
/**

vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1100,6 +1100,26 @@ public void testCopyValueSafeForExtensionType() throws Exception {
11001100
}
11011101
}
11021102

1103+
@Test
1104+
public void testEmptyLargeListOffsetBuffer() {
1105+
// Test that LargeListVector has correct readableBytes after allocation.
1106+
// According to Arrow spec, offset buffer must have N+1 entries.
1107+
// Even when N=0, it should contain [0].
1108+
try (LargeListVector list = LargeListVector.empty("list", allocator)) {
1109+
list.addOrGetVector(FieldType.nullable(MinorType.INT.getType()));
1110+
list.allocateNew();
1111+
list.setValueCount(0);
1112+
1113+
List<ArrowBuf> buffers = list.getFieldBuffers();
1114+
assertTrue(
1115+
buffers.get(1).readableBytes() >= LargeListVector.OFFSET_WIDTH,
1116+
"Offset buffer should have at least "
1117+
+ LargeListVector.OFFSET_WIDTH
1118+
+ " bytes for offset[0]");
1119+
assertEquals(0L, list.getOffsetBuffer().getLong(0));
1120+
}
1121+
}
1122+
11031123
private void writeIntValues(UnionLargeListWriter writer, int[] values) {
11041124
writer.startList();
11051125
for (int v : values) {

vector/src/test/java/org/apache/arrow/vector/TestListVector.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1379,6 +1379,26 @@ public void testCopyValueSafeForExtensionType() throws Exception {
13791379
}
13801380
}
13811381

1382+
@Test
1383+
public void testEmptyListOffsetBuffer() {
1384+
// Test that ListVector has correct readableBytes after allocation.
1385+
// According to Arrow spec, offset buffer must have N+1 entries.
1386+
// Even when N=0, it should contain [0].
1387+
try (ListVector list = ListVector.empty("list", allocator)) {
1388+
list.addOrGetVector(FieldType.nullable(MinorType.INT.getType()));
1389+
list.allocateNew();
1390+
list.setValueCount(0);
1391+
1392+
List<ArrowBuf> buffers = list.getFieldBuffers();
1393+
assertTrue(
1394+
buffers.get(1).readableBytes() >= BaseRepeatedValueVector.OFFSET_WIDTH,
1395+
"Offset buffer should have at least "
1396+
+ BaseRepeatedValueVector.OFFSET_WIDTH
1397+
+ " bytes for offset[0]");
1398+
assertEquals(0, list.getOffsetBuffer().getInt(0));
1399+
}
1400+
}
1401+
13821402
private void writeIntValues(UnionListWriter writer, int[] values) {
13831403
writer.startList();
13841404
for (int v : values) {

0 commit comments

Comments
 (0)