Skip to content

Commit 76e9eef

Browse files
julienledemkou
authored andcommitted
ARROW-398: Java file format requires bitmaps of all 1's to be written…
… when there are no nulls Author: Julien Le Dem <julien@dremio.com> Closes apache#222 from julienledem/empty_buf and squashes the following commits: c29da53 [Julien Le Dem] fix extraneous bits 4e87d88 [Julien Le Dem] ARROW-398: Java file format requires bitmaps of all 1's to be written when there are no nulls
1 parent 588a916 commit 76e9eef

File tree

11 files changed

+116
-30
lines changed

11 files changed

+116
-30
lines changed

vector/src/main/codegen/templates/NullableValueVectors.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ public List<FieldVector> getChildrenFromFields() {
144144

145145
@Override
146146
public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) {
147-
org.apache.arrow.vector.BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers);
147+
org.apache.arrow.vector.BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers);
148148
bits.valueCount = fieldNode.getLength();
149149
}
150150

vector/src/main/codegen/templates/UnionVector.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ public List<FieldVector> getChildrenFromFields() {
105105

106106
@Override
107107
public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) {
108-
BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers);
108+
BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers);
109109
this.valueCount = fieldNode.getLength();
110110
}
111111

vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.util.List;
2222

2323
import org.apache.arrow.memory.BufferAllocator;
24+
import org.apache.arrow.vector.schema.ArrowFieldNode;
2425

2526
import io.netty.buffer.ArrowBuf;
2627

@@ -29,13 +30,13 @@ public abstract class BaseDataValueVector extends BaseValueVector implements Buf
2930

3031
protected final static byte[] emptyByteArray = new byte[]{}; // Nullable vectors use this
3132

32-
public static void load(List<BufferBacked> vectors, List<ArrowBuf> buffers) {
33+
public static void load(ArrowFieldNode fieldNode, List<BufferBacked> vectors, List<ArrowBuf> buffers) {
3334
int expectedSize = vectors.size();
3435
if (buffers.size() != expectedSize) {
3536
throw new IllegalArgumentException("Illegal buffer count, expected " + expectedSize + ", got: " + buffers.size());
3637
}
3738
for (int i = 0; i < expectedSize; i++) {
38-
vectors.get(i).load(buffers.get(i));
39+
vectors.get(i).load(fieldNode, buffers.get(i));
3940
}
4041
}
4142

@@ -106,7 +107,7 @@ public ArrowBuf getBuffer() {
106107
}
107108

108109
@Override
109-
public void load(ArrowBuf data) {
110+
public void load(ArrowFieldNode fieldNode, ArrowBuf data) {
110111
this.data.release();
111112
this.data = data.retain(allocator);
112113
}

vector/src/main/java/org/apache/arrow/vector/BitVector.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.apache.arrow.vector.complex.reader.FieldReader;
2323
import org.apache.arrow.vector.holders.BitHolder;
2424
import org.apache.arrow.vector.holders.NullableBitHolder;
25+
import org.apache.arrow.vector.schema.ArrowFieldNode;
2526
import org.apache.arrow.vector.types.Types.MinorType;
2627
import org.apache.arrow.vector.types.pojo.Field;
2728
import org.apache.arrow.vector.util.OversizedAllocationException;
@@ -48,6 +49,41 @@ public BitVector(String name, BufferAllocator allocator) {
4849
super(name, allocator);
4950
}
5051

52+
@Override
53+
public void load(ArrowFieldNode fieldNode, ArrowBuf data) {
54+
// When the vector is all nulls or all defined, the content of the buffer can be omitted
55+
if (data.readableBytes() == 0 && fieldNode.getLength() != 0) {
56+
data.release();
57+
int count = fieldNode.getLength();
58+
allocateNew(count);
59+
int n = getSizeFromCount(count);
60+
if (fieldNode.getNullCount() == 0) {
61+
// all defined
62+
// create an all 1s buffer
63+
// set full bytes
64+
int fullBytesCount = count / 8;
65+
for (int i = 0; i < fullBytesCount; ++i) {
66+
this.data.setByte(i, 0xFF);
67+
}
68+
int remainder = count % 8;
69+
// set remaining bits
70+
if (remainder > 0) {
71+
byte bitMask = (byte) (0xFFL >>> ((8 - remainder) & 7));;
72+
this.data.setByte(fullBytesCount, bitMask);
73+
}
74+
} else if (fieldNode.getNullCount() == fieldNode.getLength()) {
75+
// all null
76+
// create an all 0s buffer
77+
zeroVector();
78+
} else {
79+
throw new IllegalArgumentException("The buffer can be empty only if there's no data or it's all null or all defined");
80+
}
81+
this.data.writerIndex(n);
82+
} else {
83+
super.load(fieldNode, data);
84+
}
85+
}
86+
5187
@Override
5288
public Field getField() {
5389
throw new UnsupportedOperationException("internal vector");

vector/src/main/java/org/apache/arrow/vector/BufferBacked.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,16 @@
1717
*/
1818
package org.apache.arrow.vector;
1919

20+
import org.apache.arrow.vector.schema.ArrowFieldNode;
21+
2022
import io.netty.buffer.ArrowBuf;
2123

2224
/**
2325
* Content is backed by a buffer and can be loaded/unloaded
2426
*/
2527
public interface BufferBacked {
2628

27-
void load(ArrowBuf data);
29+
void load(ArrowFieldNode fieldNode, ArrowBuf data);
2830

2931
ArrowBuf unLoad();
3032

vector/src/main/java/org/apache/arrow/vector/ValueVector.java

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -130,13 +130,6 @@ public interface ValueVector extends Closeable, Iterable<ValueVector> {
130130
*/
131131
FieldReader getReader();
132132

133-
/**
134-
* Get the metadata for this field. Used in serialization
135-
*
136-
* @return FieldMetadata for this field.
137-
*/
138-
// SerializedField getMetadata();
139-
140133
/**
141134
* Returns the number of bytes that is used by this vector instance.
142135
*/
@@ -166,16 +159,6 @@ public interface ValueVector extends Closeable, Iterable<ValueVector> {
166159
*/
167160
ArrowBuf[] getBuffers(boolean clear);
168161

169-
/**
170-
* Load the data provided in the buffer. Typically used when deserializing from the wire.
171-
*
172-
* @param metadata
173-
* Metadata used to decode the incoming buffer.
174-
* @param buffer
175-
* The buffer that contains the ValueVector.
176-
*/
177-
// void load(SerializedField metadata, DrillBuf buffer);
178-
179162
/**
180163
* An abstraction that is used to read from this vector instance.
181164
*/

vector/src/main/java/org/apache/arrow/vector/VectorLoader.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,6 @@ private void loadBuffers(FieldVector vector, Field field, Iterator<ArrowBuf> buf
8181
try {
8282
vector.loadFieldBuffers(fieldNode, ownBuffers);
8383
} catch (RuntimeException e) {
84-
e.printStackTrace();
8584
throw new IllegalArgumentException("Could not load buffers for field " +
8685
field + " error message" + e.getMessage(), e);
8786
}

vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ public List<FieldVector> getChildrenFromFields() {
9393

9494
@Override
9595
public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) {
96-
BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers);
96+
BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers);
9797
}
9898

9999
@Override

vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ public NullableMapVector(String name, BufferAllocator allocator, CallBack callBa
6262

6363
@Override
6464
public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) {
65-
BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers);
65+
BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers);
6666
this.valueCount = fieldNode.getLength();
6767
}
6868

vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,13 @@
1717
*/
1818
package org.apache.arrow.vector;
1919

20+
import static java.util.Arrays.asList;
21+
import static org.junit.Assert.assertEquals;
22+
import static org.junit.Assert.assertFalse;
23+
import static org.junit.Assert.assertTrue;
24+
2025
import java.io.IOException;
26+
import java.util.Collections;
2127
import java.util.List;
2228

2329
import org.apache.arrow.memory.BufferAllocator;
@@ -29,12 +35,17 @@
2935
import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter;
3036
import org.apache.arrow.vector.complex.writer.BigIntWriter;
3137
import org.apache.arrow.vector.complex.writer.IntWriter;
38+
import org.apache.arrow.vector.schema.ArrowFieldNode;
3239
import org.apache.arrow.vector.schema.ArrowRecordBatch;
40+
import org.apache.arrow.vector.types.pojo.ArrowType;
41+
import org.apache.arrow.vector.types.pojo.Field;
3342
import org.apache.arrow.vector.types.pojo.Schema;
3443
import org.junit.AfterClass;
3544
import org.junit.Assert;
3645
import org.junit.Test;
3746

47+
import io.netty.buffer.ArrowBuf;
48+
3849
public class TestVectorUnloadLoad {
3950

4051
static final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE);
@@ -88,6 +99,64 @@ public void testUnloadLoad() throws IOException {
8899
}
89100
}
90101

102+
/**
103+
* The validity buffer can be empty if:
104+
* - all values are defined
105+
* - all values are null
106+
* @throws IOException
107+
*/
108+
@Test
109+
public void testLoadEmptyValidityBuffer() throws IOException {
110+
Schema schema = new Schema(asList(
111+
new Field("intDefined", true, new ArrowType.Int(32, true), Collections.<Field>emptyList()),
112+
new Field("intNull", true, new ArrowType.Int(32, true), Collections.<Field>emptyList())
113+
));
114+
int count = 10;
115+
ArrowBuf validity = allocator.getEmpty();
116+
ArrowBuf values = allocator.buffer(count * 4); // integers
117+
for (int i = 0; i < count; i++) {
118+
values.setInt(i * 4, i);
119+
}
120+
try (
121+
ArrowRecordBatch recordBatch = new ArrowRecordBatch(count, asList(new ArrowFieldNode(count, 0), new ArrowFieldNode(count, count)), asList(validity, values, validity, values));
122+
BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE);
123+
VectorSchemaRoot newRoot = new VectorSchemaRoot(schema, finalVectorsAllocator);
124+
) {
125+
126+
// load it
127+
VectorLoader vectorLoader = new VectorLoader(newRoot);
128+
129+
vectorLoader.load(recordBatch);
130+
131+
NullableIntVector intDefinedVector = (NullableIntVector)newRoot.getVector("intDefined");
132+
NullableIntVector intNullVector = (NullableIntVector)newRoot.getVector("intNull");
133+
for (int i = 0; i < count; i++) {
134+
assertFalse("#" + i, intDefinedVector.getAccessor().isNull(i));
135+
assertEquals("#" + i, i, intDefinedVector.getAccessor().get(i));
136+
assertTrue("#" + i, intNullVector.getAccessor().isNull(i));
137+
}
138+
intDefinedVector.getMutator().setSafe(count + 10, 1234);
139+
assertTrue(intDefinedVector.getAccessor().isNull(count + 1));
140+
// empty slots should still default to unset
141+
intDefinedVector.getMutator().setSafe(count + 1, 789);
142+
assertFalse(intDefinedVector.getAccessor().isNull(count + 1));
143+
assertEquals(789, intDefinedVector.getAccessor().get(count + 1));
144+
assertTrue(intDefinedVector.getAccessor().isNull(count));
145+
assertTrue(intDefinedVector.getAccessor().isNull(count + 2));
146+
assertTrue(intDefinedVector.getAccessor().isNull(count + 3));
147+
assertTrue(intDefinedVector.getAccessor().isNull(count + 4));
148+
assertTrue(intDefinedVector.getAccessor().isNull(count + 5));
149+
assertTrue(intDefinedVector.getAccessor().isNull(count + 6));
150+
assertTrue(intDefinedVector.getAccessor().isNull(count + 7));
151+
assertTrue(intDefinedVector.getAccessor().isNull(count + 8));
152+
assertTrue(intDefinedVector.getAccessor().isNull(count + 9));
153+
assertFalse(intDefinedVector.getAccessor().isNull(count + 10));
154+
assertEquals(1234, intDefinedVector.getAccessor().get(count + 10));
155+
} finally {
156+
values.release();
157+
}
158+
}
159+
91160
public static VectorUnloader newVectorUnloader(FieldVector root) {
92161
Schema schema = new Schema(root.getField().getChildren());
93162
int valueCount = root.getAccessor().getValueCount();

0 commit comments

Comments
 (0)