Skip to content

Commit a2ead2f

Browse files
zeapo wesm
authored and committed
ARROW-380: [Java] optimize null count when serializing vectors
I added `getNullCount()` to the `Accessor` interface, although I am not sure this is the best way to achieve it. With this change, both the value count and the null count are immediately accessible from the accessor. Author: Mohamed Zenadi <mohamed@zenadi.com> Closes #207 from zeapo/ARROW-380 and squashes the following commits: 27c0342 [Mohamed Zenadi] implement missing getNullCount implementation for NullableMapVector 9ff3355 [Mohamed Zenadi] implement the base case of getNullCount() ad3f24a [Mohamed Zenadi] the used size is not the same as the allocated size e858432 [Mohamed Zenadi] use the valueCount as basis for counting nulls rather than allocated bytes 0530c85 [Mohamed Zenadi] test the null count byte by byte and the odd length case 95667d3 [Mohamed Zenadi] fix the comment b12a2a5 [Mohamed Zenadi] fix wrong value returned by the method f264250 [Mohamed Zenadi] use getNullCount() rather than isNull baca69c [Mohamed Zenadi] Add methods to count the number null values in the vector
1 parent cfb544d commit a2ead2f

File tree

8 files changed

+99
-9
lines changed

8 files changed

+99
-9
lines changed

java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,18 @@ protected BaseAccessor() { }
7272
public boolean isNull(int index) {
7373
return false;
7474
}
75+
76+
@Override
77+
// override this in case your implementation is faster, see BitVector
78+
public int getNullCount() {
79+
int nullCount = 0;
80+
for (int i = 0; i < getValueCount(); i++) {
81+
if (isNull(i)) {
82+
nullCount ++;
83+
}
84+
}
85+
return nullCount;
86+
}
7587
}
7688

7789
public abstract static class BaseMutator implements ValueVector.Mutator {

java/vector/src/main/java/org/apache/arrow/vector/BitVector.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,28 @@ public final void get(int index, NullableBitHolder holder) {
379379
holder.isSet = 1;
380380
holder.value = get(index);
381381
}
382+
383+
/**
384+
* Gets the number of nulls, which corresponds to the number of bits set to 0 in the vector.
385+
* @return the number of bits set to 0
386+
*/
387+
@Override
388+
public final int getNullCount() {
389+
int count = 0;
390+
int sizeInBytes = getSizeFromCount(valueCount);
391+
392+
for (int i = 0; i < sizeInBytes; ++i) {
393+
byte byteValue = data.getByte(i);
394+
// Java bytes are signed (two's complement), so 11111111_b, which is -1, is sign-extended when widened to int
395+
// and would have 32 bits set to 1. Counting the low 7 bits and then adding the sign bit back avoids this.
396+
count += Integer.bitCount(byteValue & 0x7F) - (byteValue >> 7);
397+
}
398+
int nullCount = (sizeInBytes * 8) - count;
399+
// if valueCount is not a multiple of 8, the unused trailing bits of the last byte were counted as nulls; subtract them
400+
int remainder = valueCount % 8;
401+
nullCount -= remainder == 0 ? 0 : 8 - remainder;
402+
return nullCount;
403+
}
382404
}
383405

384406
/**

java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,11 @@ interface Accessor {
180180
* Returns true if the value at the given index is null, false otherwise.
181181
*/
182182
boolean isNull(int index);
183+
184+
/**
185+
* Returns the number of null values
186+
*/
187+
int getNullCount();
183188
}
184189

185190
/**

java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -60,15 +60,7 @@ public ArrowRecordBatch getRecordBatch() {
6060

6161
private void appendNodes(FieldVector vector, List<ArrowFieldNode> nodes, List<ArrowBuf> buffers) {
6262
Accessor accessor = vector.getAccessor();
63-
int nullCount = 0;
64-
// TODO: should not have to do that
65-
// we can do that a lot more efficiently (for example with Long.bitCount(i))
66-
for (int i = 0; i < accessor.getValueCount(); i++) {
67-
if (accessor.isNull(i)) {
68-
nullCount ++;
69-
}
70-
}
71-
nodes.add(new ArrowFieldNode(accessor.getValueCount(), nullCount));
63+
nodes.add(new ArrowFieldNode(accessor.getValueCount(), accessor.getNullCount()));
7264
List<ArrowBuf> fieldBuffers = vector.getFieldBuffers();
7365
List<ArrowVectorType> expectedBuffers = vector.getField().getTypeLayout().getVectorTypes();
7466
if (fieldBuffers.size() != expectedBuffers.size()) {

java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,11 @@ public int getValueCount() {
6969
public boolean isNull(int index) {
7070
return true;
7171
}
72+
73+
@Override
74+
public int getNullCount() {
75+
return 0;
76+
}
7277
};
7378

7479
private final Mutator defaultMutator = new Mutator() {

java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,11 @@ public Object getObject(int index) {
310310
public boolean isNull(int index) {
311311
return bits.getAccessor().get(index) == 0;
312312
}
313+
314+
@Override
315+
public int getNullCount() {
316+
return bits.getAccessor().getNullCount();
317+
}
313318
}
314319

315320
public class Mutator extends BaseRepeatedMutator {

java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,11 @@ public void get(int index, ComplexHolder holder) {
203203
super.get(index, holder);
204204
}
205205

206+
@Override
207+
public int getNullCount() {
208+
return bits.getAccessor().getNullCount();
209+
}
210+
206211
@Override
207212
public boolean isNull(int index) {
208213
return isSet(index) == 0;

java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,19 +288,24 @@ public void testBitVector() {
288288
try (final BitVector vector = new BitVector(EMPTY_SCHEMA_PATH, allocator)) {
289289
final BitVector.Mutator m = vector.getMutator();
290290
vector.allocateNew(1024);
291+
m.setValueCount(1024);
291292

292293
// Put and set a few values
293294
m.set(0, 1);
294295
m.set(1, 0);
295296
m.set(100, 0);
296297
m.set(1022, 1);
297298

299+
m.setValueCount(1024);
300+
298301
final BitVector.Accessor accessor = vector.getAccessor();
299302
assertEquals(1, accessor.get(0));
300303
assertEquals(0, accessor.get(1));
301304
assertEquals(0, accessor.get(100));
302305
assertEquals(1, accessor.get(1022));
303306

307+
assertEquals(1022, accessor.getNullCount());
308+
304309
// test setting the same value twice
305310
m.set(0, 1);
306311
m.set(0, 1);
@@ -315,8 +320,47 @@ public void testBitVector() {
315320
assertEquals(0, accessor.get(0));
316321
assertEquals(1, accessor.get(1));
317322

323+
// should not change
324+
assertEquals(1022, accessor.getNullCount());
325+
318326
// Ensure unallocated space returns 0
319327
assertEquals(0, accessor.get(3));
328+
329+
// unset the previously set bits
330+
m.set(1, 0);
331+
m.set(1022, 0);
332+
// every bit in the array should now be 0
333+
assertEquals(1024, accessor.getNullCount());
334+
335+
// set all the array to 1
336+
for (int i = 0; i < 1024; ++i) {
337+
assertEquals(1024 - i, accessor.getNullCount());
338+
m.set(i, 1);
339+
}
340+
341+
assertEquals(0, accessor.getNullCount());
342+
343+
vector.allocateNew(1015);
344+
m.setValueCount(1015);
345+
346+
// ensure it has been zeroed
347+
assertEquals(1015, accessor.getNullCount());
348+
349+
m.set(0, 1);
350+
m.set(1014, 1); // ensure that the last item of the last byte is allocated
351+
352+
assertEquals(1013, accessor.getNullCount());
353+
354+
vector.zeroVector();
355+
assertEquals(1015, accessor.getNullCount());
356+
357+
// set all the array to 1
358+
for (int i = 0; i < 1015; ++i) {
359+
assertEquals(1015 - i, accessor.getNullCount());
360+
m.set(i, 1);
361+
}
362+
363+
assertEquals(0, accessor.getNullCount());
320364
}
321365
}
322366

0 commit comments

Comments
 (0)