Skip to content

Commit 71c418c

Browse files
authored
GH-948: Use buffer indexing for UUID vector (#949)
## What's Changed

The current UUID vector implementation creates new buffer slices when reading values through holders, which has several drawbacks:

- Memory overhead: Each slice creates a new ArrowBuf object
- Performance impact: Buffer slicing is slower than direct buffer indexing
- Inconsistency: Other fixed-width types (like Decimal) use buffer indexing with a `start` offset field

### Proposed Changes

1. Add `start` field to UUID holders to track buffer offsets:
   - `UuidHolder`: Add `public int start = 0;`
   - `NullableUuidHolder`: Add `public int start = 0;`
2. Update `UuidVector` to use buffer indexing
3. Update readers and writers

### Related Work

- Original UUID extension type implementation: GH-825 (#903)

Closes #948
1 parent db9fff8 commit 71c418c

File tree

13 files changed

+649
-143
lines changed

13 files changed

+649
-143
lines changed
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.arrow.vector;
18+
19+
import java.util.UUID;
20+
import java.util.concurrent.TimeUnit;
21+
import org.apache.arrow.memory.BufferAllocator;
22+
import org.apache.arrow.memory.RootAllocator;
23+
import org.apache.arrow.vector.complex.impl.UuidWriterImpl;
24+
import org.apache.arrow.vector.holders.NullableUuidHolder;
25+
import org.openjdk.jmh.annotations.Benchmark;
26+
import org.openjdk.jmh.annotations.BenchmarkMode;
27+
import org.openjdk.jmh.annotations.Mode;
28+
import org.openjdk.jmh.annotations.OutputTimeUnit;
29+
import org.openjdk.jmh.annotations.Scope;
30+
import org.openjdk.jmh.annotations.Setup;
31+
import org.openjdk.jmh.annotations.State;
32+
import org.openjdk.jmh.annotations.TearDown;
33+
import org.openjdk.jmh.profile.GCProfiler;
34+
import org.openjdk.jmh.runner.Runner;
35+
import org.openjdk.jmh.runner.RunnerException;
36+
import org.openjdk.jmh.runner.options.Options;
37+
import org.openjdk.jmh.runner.options.OptionsBuilder;
38+
39+
/** Benchmarks for {@link UuidVector}. */
40+
@State(Scope.Benchmark)
41+
public class UuidVectorBenchmarks {
42+
// checkstyle:off: MissingJavadocMethod
43+
44+
private static final int VECTOR_LENGTH = 10_000;
45+
46+
private static final int ALLOCATOR_CAPACITY = 1024 * 1024;
47+
48+
private BufferAllocator allocator;
49+
50+
private UuidVector vector;
51+
52+
private UUID[] testUuids;
53+
54+
@Setup
55+
public void prepare() {
56+
allocator = new RootAllocator(ALLOCATOR_CAPACITY);
57+
vector = new UuidVector("vector", allocator);
58+
vector.allocateNew(VECTOR_LENGTH);
59+
vector.setValueCount(VECTOR_LENGTH);
60+
61+
// Pre-generate UUIDs for consistent benchmarking
62+
testUuids = new UUID[VECTOR_LENGTH];
63+
for (int i = 0; i < VECTOR_LENGTH; i++) {
64+
testUuids[i] = new UUID(i, i * 2L);
65+
}
66+
}
67+
68+
@TearDown
69+
public void tearDown() {
70+
vector.close();
71+
allocator.close();
72+
}
73+
74+
@Benchmark
75+
@BenchmarkMode(Mode.AverageTime)
76+
@OutputTimeUnit(TimeUnit.MICROSECONDS)
77+
public void setWithHolder() {
78+
NullableUuidHolder holder = new NullableUuidHolder();
79+
for (int i = 0; i < VECTOR_LENGTH; i++) {
80+
vector.get(i, holder);
81+
vector.setSafe(i, holder);
82+
}
83+
}
84+
85+
@Benchmark
86+
@BenchmarkMode(Mode.AverageTime)
87+
@OutputTimeUnit(TimeUnit.MICROSECONDS)
88+
public void setUuidDirectly() {
89+
for (int i = 0; i < VECTOR_LENGTH; i++) {
90+
vector.setSafe(i, testUuids[i]);
91+
}
92+
}
93+
94+
@Benchmark
95+
@BenchmarkMode(Mode.AverageTime)
96+
@OutputTimeUnit(TimeUnit.MICROSECONDS)
97+
public void setWithWriter() {
98+
UuidWriterImpl writer = new UuidWriterImpl(vector);
99+
for (int i = 0; i < VECTOR_LENGTH; i++) {
100+
writer.writeExtension(testUuids[i]);
101+
}
102+
}
103+
104+
@Benchmark
105+
@BenchmarkMode(Mode.AverageTime)
106+
@OutputTimeUnit(TimeUnit.MICROSECONDS)
107+
public void getWithUuidHolder() {
108+
NullableUuidHolder holder = new NullableUuidHolder();
109+
for (int i = 0; i < VECTOR_LENGTH; i++) {
110+
vector.get(i, holder);
111+
}
112+
}
113+
114+
@Benchmark
115+
@BenchmarkMode(Mode.AverageTime)
116+
@OutputTimeUnit(TimeUnit.MICROSECONDS)
117+
public void getUuidDirectly() {
118+
for (int i = 0; i < VECTOR_LENGTH; i++) {
119+
UUID uuid = vector.getObject(i);
120+
}
121+
}
122+
123+
public static void main(String[] args) throws RunnerException {
124+
Options opt =
125+
new OptionsBuilder()
126+
.include(UuidVectorBenchmarks.class.getSimpleName())
127+
.forks(1)
128+
.addProfiler(GCProfiler.class)
129+
.build();
130+
131+
new Runner(opt).run();
132+
}
133+
// checkstyle:on: MissingJavadocMethod
134+
}

vector/src/main/java/org/apache/arrow/vector/UuidVector.java

Lines changed: 46 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@
2323
import org.apache.arrow.memory.ArrowBuf;
2424
import org.apache.arrow.memory.BufferAllocator;
2525
import org.apache.arrow.memory.util.ArrowBufPointer;
26+
import org.apache.arrow.memory.util.ByteFunctionHelpers;
2627
import org.apache.arrow.memory.util.hash.ArrowBufHasher;
28+
import org.apache.arrow.util.Preconditions;
2729
import org.apache.arrow.vector.complex.impl.UuidReaderImpl;
2830
import org.apache.arrow.vector.complex.reader.FieldReader;
2931
import org.apache.arrow.vector.extension.UuidType;
@@ -132,7 +134,8 @@ public int hashCode(int index) {
132134

133135
@Override
134136
public int hashCode(int index, ArrowBufHasher hasher) {
135-
return getUnderlyingVector().hashCode(index, hasher);
137+
int start = this.getStartOffset(index);
138+
return ByteFunctionHelpers.hash(hasher, this.getDataBuffer(), start, start + UUID_BYTE_WIDTH);
136139
}
137140

138141
/**
@@ -145,45 +148,31 @@ public int isSet(int index) {
145148
return getUnderlyingVector().isSet(index);
146149
}
147150

148-
/**
149-
* Gets the UUID value at the given index as an ArrowBuf.
150-
*
151-
* @param index the index to retrieve
152-
* @return a buffer slice containing the 16-byte UUID
153-
* @throws IllegalStateException if the value at the index is null and null checking is enabled
154-
*/
155-
public ArrowBuf get(int index) throws IllegalStateException {
156-
if (NullCheckingForGet.NULL_CHECKING_ENABLED && this.isSet(index) == 0) {
157-
throw new IllegalStateException("Value at index is null");
158-
} else {
159-
return getBufferSlicePostNullCheck(index);
160-
}
161-
}
162-
163151
/**
164152
* Reads the UUID value at the given index into a NullableUuidHolder.
165153
*
166154
* @param index the index to read from
167155
* @param holder the holder to populate with the UUID data
168156
*/
169157
public void get(int index, NullableUuidHolder holder) {
170-
if (NullCheckingForGet.NULL_CHECKING_ENABLED && this.isSet(index) == 0) {
158+
Preconditions.checkArgument(index >= 0, "Cannot get negative index in UUID vector.");
159+
if (isSet(index) == 0) {
171160
holder.isSet = 0;
172-
} else {
173-
holder.isSet = 1;
174-
holder.buffer = getBufferSlicePostNullCheck(index);
161+
return;
175162
}
163+
holder.isSet = 1;
164+
holder.buffer = getDataBuffer();
165+
holder.start = getStartOffset(index);
176166
}
177167

178168
/**
179-
* Reads the UUID value at the given index into a UuidHolder.
169+
* Calculates the byte offset for a given index in the data buffer.
180170
*
181-
* @param index the index to read from
182-
* @param holder the holder to populate with the UUID data
171+
* @param index the index of the UUID value
172+
* @return the byte offset in the data buffer
183173
*/
184-
public void get(int index, UuidHolder holder) {
185-
holder.isSet = 1;
186-
holder.buffer = getBufferSlicePostNullCheck(index);
174+
public final int getStartOffset(int index) {
175+
return index * UUID_BYTE_WIDTH;
187176
}
188177

189178
/**
@@ -207,7 +196,7 @@ public void set(int index, UUID value) {
207196
* @param holder the holder containing the UUID data
208197
*/
209198
public void set(int index, UuidHolder holder) {
210-
this.set(index, holder.isSet, holder.buffer);
199+
this.set(index, holder.buffer, holder.start);
211200
}
212201

213202
/**
@@ -217,28 +206,11 @@ public void set(int index, UuidHolder holder) {
217206
* @param holder the holder containing the UUID data
218207
*/
219208
public void set(int index, NullableUuidHolder holder) {
220-
this.set(index, holder.isSet, holder.buffer);
221-
}
222-
223-
/**
224-
* Sets the UUID value at the given index with explicit null flag.
225-
*
226-
* @param index the index to set
227-
* @param isSet 1 if the value is set, 0 if null
228-
* @param buffer the buffer containing the 16-byte UUID data
229-
*/
230-
public void set(int index, int isSet, ArrowBuf buffer) {
231-
getUnderlyingVector().set(index, isSet, buffer);
232-
}
233-
234-
/**
235-
* Sets the UUID value at the given index from an ArrowBuf.
236-
*
237-
* @param index the index to set
238-
* @param value the buffer containing the 16-byte UUID data
239-
*/
240-
public void set(int index, ArrowBuf value) {
241-
getUnderlyingVector().set(index, value);
209+
if (holder.isSet == 0) {
210+
getUnderlyingVector().setNull(index);
211+
} else {
212+
this.set(index, holder.buffer, holder.start);
213+
}
242214
}
243215

244216
/**
@@ -249,10 +221,12 @@ public void set(int index, ArrowBuf value) {
249221
* @param sourceOffset the offset in the source buffer where the UUID data starts
250222
*/
251223
public void set(int index, ArrowBuf source, int sourceOffset) {
252-
// Copy bytes from source buffer to target vector data buffer
253-
ArrowBuf dataBuffer = getUnderlyingVector().getDataBuffer();
254-
dataBuffer.setBytes((long) index * UUID_BYTE_WIDTH, source, sourceOffset, UUID_BYTE_WIDTH);
255-
getUnderlyingVector().setIndexDefined(index);
224+
Preconditions.checkNotNull(source, "Cannot set UUID vector, the source buffer is null.");
225+
226+
BitVectorHelper.setBit(getUnderlyingVector().getValidityBuffer(), index);
227+
getUnderlyingVector()
228+
.getDataBuffer()
229+
.setBytes((long) index * UUID_BYTE_WIDTH, source, sourceOffset, UUID_BYTE_WIDTH);
256230
}
257231

258232
/**
@@ -286,25 +260,34 @@ public void setSafe(int index, UUID value) {
286260
* @param holder the holder containing the UUID data, or null to set a null value
287261
*/
288262
public void setSafe(int index, NullableUuidHolder holder) {
289-
if (holder != null) {
290-
getUnderlyingVector().setSafe(index, holder.isSet, holder.buffer);
291-
} else {
263+
if (holder == null || holder.isSet == 0) {
292264
getUnderlyingVector().setNull(index);
265+
} else {
266+
this.setSafe(index, holder.buffer, holder.start);
293267
}
294268
}
295269

296270
/**
297271
* Sets the UUID value at the given index from a UuidHolder, expanding capacity if needed.
298272
*
299273
* @param index the index to set
300-
* @param holder the holder containing the UUID data, or null to set a null value
274+
* @param holder the holder containing the UUID data
301275
*/
302276
public void setSafe(int index, UuidHolder holder) {
303-
if (holder != null) {
304-
getUnderlyingVector().setSafe(index, holder.isSet, holder.buffer);
305-
} else {
306-
getUnderlyingVector().setNull(index);
307-
}
277+
this.setSafe(index, holder.buffer, holder.start);
278+
}
279+
280+
/**
281+
* Sets the UUID value at the given index by copying from a source buffer, expanding capacity if
282+
* needed.
283+
*
284+
* @param index the index to set
285+
* @param buffer the source buffer to copy from
286+
* @param start the offset in the source buffer where the UUID data starts
287+
*/
288+
public void setSafe(int index, ArrowBuf buffer, int start) {
289+
getUnderlyingVector().handleSafe(index);
290+
this.set(index, buffer, start);
308291
}
309292

310293
/**
@@ -400,15 +383,9 @@ public TransferPair getTransferPair(BufferAllocator allocator) {
400383
return getTransferPair(this.getField().getName(), allocator);
401384
}
402385

403-
private ArrowBuf getBufferSlicePostNullCheck(int index) {
404-
return getUnderlyingVector()
405-
.getDataBuffer()
406-
.slice((long) index * UUID_BYTE_WIDTH, UUID_BYTE_WIDTH);
407-
}
408-
409386
@Override
410387
public int getTypeWidth() {
411-
return getUnderlyingVector().getTypeWidth();
388+
return UUID_BYTE_WIDTH;
412389
}
413390

414391
/** {@link TransferPair} for {@link UuidVector}. */

0 commit comments

Comments
 (0)