Skip to content

GH-79: Move splitAndTransferValidityBuffer to BaseValueVector #777

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,7 @@ public abstract class BaseFixedWidthVector extends BaseValueVector

protected final Field field;
private int allocationMonitor;
protected ArrowBuf validityBuffer;
protected ArrowBuf valueBuffer;
protected int valueCount;

/**
* Constructs a new instance.
Expand Down Expand Up @@ -87,7 +85,7 @@ public String getName() {

/* TODO:
* Once the entire hierarchy has been refactored, move common functions
* like getNullCount(), splitAndTransferValidityBuffer to top level
* like getNullCount() to top level
* base class BaseValueVector.
*
* Along with this, some class members (validityBuffer) can also be
Expand Down Expand Up @@ -342,9 +340,9 @@ private void allocateBytes(int valueCount) {
* slice the source buffer so we have to explicitly allocate the validityBuffer of the target
* vector. This is unlike the databuffer which we can always slice for the target vector.
*/
private void allocateValidityBuffer(final int validityBufferSize) {
validityBuffer = allocator.buffer(validityBufferSize);
validityBuffer.readerIndex(0);
@Override
protected void allocateValidityBuffer(final long validityBufferSize) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given many of these are very similar maybe they should delegate to a base implementation with super.allocateValidityBuffer then call any vector-specific logic at the end?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, there is a slight behavior change though -- the validity buffer in BaseFixedWidthVector was not zeroing out its elements post allocation, it is now. Should be fine though I am thinking.

super.allocateValidityBuffer(validityBufferSize);
refreshValueCapacity();
}

Expand Down Expand Up @@ -656,72 +654,18 @@ private void splitAndTransferValueBuffer(
target.refreshValueCapacity();
}

/**
* Validity buffer has multiple cases of split and transfer depending on the starting position of
* the source index.
*/
private void splitAndTransferValidityBuffer(
int startIndex, int length, BaseFixedWidthVector target) {
int firstByteSource = BitVectorHelper.byteIndex(startIndex);
int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1);
int byteSizeTarget = BitVectorHelper.getValidityBufferSizeFromCount(length);
int offset = startIndex % 8;

if (length > 0) {
if (offset == 0) {
/* slice */
if (target.validityBuffer != null) {
target.validityBuffer.getReferenceManager().release();
}
ArrowBuf slicedValidityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget);
target.validityBuffer = transferBuffer(slicedValidityBuffer, target.allocator);
target.refreshValueCapacity();
} else {
/* Copy data
* When the first bit starts from the middle of a byte (offset != 0),
* copy data from src BitVector.
* Each byte in the target is composed by a part in i-th byte,
* another part in (i+1)-th byte.
*/
target.allocateValidityBuffer(byteSizeTarget);

for (int i = 0; i < byteSizeTarget - 1; i++) {
byte b1 =
BitVectorHelper.getBitsFromCurrentByte(
this.validityBuffer, firstByteSource + i, offset);
byte b2 =
BitVectorHelper.getBitsFromNextByte(
this.validityBuffer, firstByteSource + i + 1, offset);

target.validityBuffer.setByte(i, (b1 + b2));
}

/* Copying the last piece is done in the following manner:
* if the source vector has 1 or more bytes remaining, we copy
* the last piece as a byte formed by shifting data
* from the current byte and the next byte.
*
* if the source vector has no more bytes remaining
* (we are at the last byte), we copy the last piece as a byte
* by shifting data from the current byte.
*/
if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) {
byte b1 =
BitVectorHelper.getBitsFromCurrentByte(
this.validityBuffer, firstByteSource + byteSizeTarget - 1, offset);
byte b2 =
BitVectorHelper.getBitsFromNextByte(
this.validityBuffer, firstByteSource + byteSizeTarget, offset);

target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2);
} else {
byte b1 =
BitVectorHelper.getBitsFromCurrentByte(
this.validityBuffer, firstByteSource + byteSizeTarget - 1, offset);
target.validityBuffer.setByte(byteSizeTarget - 1, b1);
}
}
@Override
protected void sliceAndTransferValidityBuffer(
int startIndex, int length, BaseValueVector target) {
final int firstByteSource = BitVectorHelper.byteIndex(startIndex);
final int byteSizeTarget = BitVectorHelper.getValidityBufferSizeFromCount(length);

if (target.validityBuffer != null) {
target.validityBuffer.getReferenceManager().release();
}
ArrowBuf slicedValidityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget);
target.validityBuffer = transferBuffer(slicedValidityBuffer, target.allocator);
((BaseFixedWidthVector) target).refreshValueCapacity();
}

/*----------------------------------------------------------------*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,8 @@ public abstract class BaseLargeVariableWidthVector extends BaseValueVector
/* protected members */
public static final int OFFSET_WIDTH = 8; /* 8 byte unsigned int to track offsets */
protected static final byte[] emptyByteArray = new byte[] {};
protected ArrowBuf validityBuffer;
protected ArrowBuf valueBuffer;
protected ArrowBuf offsetBuffer;
protected int valueCount;
protected int lastSet;
protected final Field field;

Expand Down Expand Up @@ -501,10 +499,9 @@ private ArrowBuf allocateOffsetBuffer(final long size) {
}

/* allocate validity buffer */
private void allocateValidityBuffer(final long size) {
validityBuffer = allocator.buffer(size);
validityBuffer.readerIndex(0);
initValidityBuffer();
@Override
protected void allocateValidityBuffer(final long size) {
super.allocateValidityBuffer(size);
}

/**
Expand Down Expand Up @@ -809,69 +806,17 @@ private void splitAndTransferOffsetBuffer(
target.valueBuffer = transferBuffer(slicedBuffer, target.allocator);
}

/*
* Transfer the validity.
*/
private void splitAndTransferValidityBuffer(
int startIndex, int length, BaseLargeVariableWidthVector target) {
int firstByteSource = BitVectorHelper.byteIndex(startIndex);
int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1);
int byteSizeTarget = BitVectorHelper.getValidityBufferSizeFromCount(length);
int offset = startIndex % 8;
@Override
protected void sliceAndTransferValidityBuffer(
int startIndex, int length, BaseValueVector target) {
final int firstByteSource = BitVectorHelper.byteIndex(startIndex);
final int byteSizeTarget = BitVectorHelper.getValidityBufferSizeFromCount(length);

if (length > 0) {
if (offset == 0) {
// slice
if (target.validityBuffer != null) {
target.validityBuffer.getReferenceManager().release();
}
target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget);
target.validityBuffer.getReferenceManager().retain();
} else {
/* Copy data
* When the first bit starts from the middle of a byte (offset != 0),
* copy data from src BitVector.
* Each byte in the target is composed by a part in i-th byte,
* another part in (i+1)-th byte.
*/
target.allocateValidityBuffer(byteSizeTarget);

for (int i = 0; i < byteSizeTarget - 1; i++) {
byte b1 =
BitVectorHelper.getBitsFromCurrentByte(
this.validityBuffer, firstByteSource + i, offset);
byte b2 =
BitVectorHelper.getBitsFromNextByte(
this.validityBuffer, firstByteSource + i + 1, offset);

target.validityBuffer.setByte(i, (b1 + b2));
}
/* Copying the last piece is done in the following manner:
* if the source vector has 1 or more bytes remaining, we copy
* the last piece as a byte formed by shifting data
* from the current byte and the next byte.
*
* if the source vector has no more bytes remaining
* (we are at the last byte), we copy the last piece as a byte
* by shifting data from the current byte.
*/
if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) {
byte b1 =
BitVectorHelper.getBitsFromCurrentByte(
this.validityBuffer, firstByteSource + byteSizeTarget - 1, offset);
byte b2 =
BitVectorHelper.getBitsFromNextByte(
this.validityBuffer, firstByteSource + byteSizeTarget, offset);

target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2);
} else {
byte b1 =
BitVectorHelper.getBitsFromCurrentByte(
this.validityBuffer, firstByteSource + byteSizeTarget - 1, offset);
target.validityBuffer.setByte(byteSizeTarget - 1, b1);
}
}
if (target.validityBuffer != null) {
target.validityBuffer.getReferenceManager().release();
}
target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget);
target.validityBuffer.getReferenceManager().retain();
}

/*----------------------------------------------------------------*
Expand Down
116 changes: 116 additions & 0 deletions vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ public abstract class BaseValueVector implements ValueVector {

protected volatile FieldReader fieldReader;

protected ArrowBuf validityBuffer;

protected int valueCount;

protected BaseValueVector(BufferAllocator allocator) {
this.allocator = Preconditions.checkNotNull(allocator, "allocator cannot be null");
}
Expand Down Expand Up @@ -255,4 +259,116 @@ public void copyFrom(int fromIndex, int thisIndex, ValueVector from) {
public void copyFromSafe(int fromIndex, int thisIndex, ValueVector from) {
throw new UnsupportedOperationException();
}

/**
* Transfer the validity buffer from `validityBuffer` to the target vector's `validityBuffer`.
* Start at `startIndex` and copy `length` number of elements. If the starting index is 8 byte
* aligned, then the buffer is sliced from that index and ownership is transferred. If not,
* individual bytes are copied.
*
* @param startIndex starting index
* @param length number of elements to be copied
* @param target target vector
*/
protected void splitAndTransferValidityBuffer(
int startIndex, int length, BaseValueVector target) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Accepting BaseValueVector and not ValueVector here to keep the API (int, int, vector). If I accept ValueVector, then there there needs to be some lambda wrangling to pass in the vector-specific allocateValidityBuffer somehow, or that function needs to be added to ValueVector, both of which don't seem that great. From what I can tell everything except StructVector seems to be descend from BaseValueVector so chose this as part of the API.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checked that UnionVector doesn't extend it either (which is good - as it doesn't have a validity buffer)

int offset = startIndex % 8;

if (length <= 0) {
return;
}
if (offset == 0) {
sliceAndTransferValidityBuffer(startIndex, length, target);
} else {
copyValidityBuffer(startIndex, length, target);
}
}

/**
* If the start index is 8 byte aligned, slice `validityBuffer` and transfer ownership to
* `target`'s `validityBuffer`.
*
* @param startIndex starting index
* @param length number of elements to be copied
* @param target target vector
*/
protected void sliceAndTransferValidityBuffer(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This implementation is the one being used by the vectors inside the complex/ directory. BaseValueVector#validityBuffer is protected, so overridden implementations there can't call target.validityBuffer to set a new value. Vectors in the top level org/apache/arrow/vector directory can add their overridden implementations. If at some point some other vector in complex/ wants to have a different version of this function, then I am afraid that vector will need to override splitAndTransferValidityBuffer entirely. Presumably don't want to expose a public setValidityBuffer on BaseValueVector.

int startIndex, int length, BaseValueVector target) {
final int firstByteSource = BitVectorHelper.byteIndex(startIndex);
final int byteSizeTarget = getValidityBufferSizeFromCount(length);

if (target.validityBuffer != null) {
target.validityBuffer.getReferenceManager().release();
}
target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget);
target.validityBuffer.getReferenceManager().retain(1);
}

/**
* Allocate new validity buffer for `target` and copy bytes from `validityBuffer`. Precise details
* in the comments below.
*
* @param startIndex starting index
* @param length number of elements to be copied
* @param target target vector
*/
protected void copyValidityBuffer(int startIndex, int length, BaseValueVector target) {
final int firstByteSource = BitVectorHelper.byteIndex(startIndex);
final int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1);
final int byteSizeTarget = getValidityBufferSizeFromCount(length);
final int offset = startIndex % 8;

/* Copy data
* When the first bit starts from the middle of a byte (offset != 0),
* copy data from src BitVector.
* Each byte in the target is composed by a part in i-th byte,
* another part in (i+1)-th byte.
*/
target.allocateValidityBuffer(byteSizeTarget);

for (int i = 0; i < byteSizeTarget - 1; i++) {
byte b1 =
BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, firstByteSource + i, offset);
byte b2 =
BitVectorHelper.getBitsFromNextByte(this.validityBuffer, firstByteSource + i + 1, offset);

target.validityBuffer.setByte(i, (b1 + b2));
}

/* Copying the last piece is done in the following manner:
* if the source vector has 1 or more bytes remaining, we copy
* the last piece as a byte formed by shifting data
* from the current byte and the next byte.
*
* if the source vector has no more bytes remaining
* (we are at the last byte), we copy the last piece as a byte
* by shifting data from the current byte.
*/
if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) {
byte b1 =
BitVectorHelper.getBitsFromCurrentByte(
this.validityBuffer, firstByteSource + byteSizeTarget - 1, offset);
byte b2 =
BitVectorHelper.getBitsFromNextByte(
this.validityBuffer, firstByteSource + byteSizeTarget, offset);

target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2);
} else {
byte b1 =
BitVectorHelper.getBitsFromCurrentByte(
this.validityBuffer, firstByteSource + byteSizeTarget - 1, offset);
target.validityBuffer.setByte(byteSizeTarget - 1, b1);
}
}

/**
* Allocate new validity buffer for when the bytes need to be copied over.
*
* @param byteSizeTarget desired size of the buffer
*/
protected void allocateValidityBuffer(long byteSizeTarget) {
validityBuffer = allocator.buffer(byteSizeTarget);
validityBuffer.readerIndex(0);
validityBuffer.setZero(0, validityBuffer.capacity());
}
}
Loading
Loading