Commit cec2294

[libc] Coalesce bitfield access in GPU malloc
Summary: This improves performance by reducing the number of RMW operations we need to issue to a single slot. It speeds up repeated allocations with little contention by about ten percent.
1 parent b78bc35 · commit cec2294
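As a rough sketch of what "coalescing" buys here (standalone C++ using std::atomic, not code from this commit): before the change, every lane issued its own single-bit fetch_or on the shared bitfield word, so N lanes landing in the same word cost N RMW operations; after it, their bits are merged into one mask and a single fetch_or covers all of them.

    #include <atomic>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Before: one RMW per "lane", eight atomic operations on one word.
      std::atomic<uint32_t> serial{0};
      for (uint32_t lane = 0; lane < 8; ++lane)
        serial.fetch_or(1u << lane, std::memory_order_relaxed);

      // After: the eight requests are merged and one leader issues a
      // single RMW on behalf of every lane.
      std::atomic<uint32_t> coalesced{0};
      uint32_t mask = (1u << 8) - 1; // bits 0..7 in one request
      uint32_t before = coalesced.fetch_or(mask, std::memory_order_relaxed);

      std::printf("serial: %#x coalesced: %#x (was %#x)\n",
                  (unsigned)serial.load(), (unsigned)coalesced.load(),
                  (unsigned)before);
    }

Both words end up as 0xff; the difference is eight contended RMWs versus one.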

File tree: 1 file changed

libc/src/__support/GPU/allocator.cpp

Lines changed: 25 additions & 12 deletions
@@ -16,6 +16,7 @@
 
 #include "allocator.h"
 
+#include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/new.h"
@@ -229,24 +230,36 @@ struct Slab {
 
     // The uniform mask represents which lanes contain a uniform target pointer.
     // We attempt to place these next to each other.
-    // TODO: We should coalesce these bits and use the result of `fetch_or` to
-    // search for free bits in parallel.
     void *result = nullptr;
     for (uint64_t mask = lane_mask; mask;
         mask = gpu::ballot(lane_mask, !result)) {
-      uint32_t id = impl::lane_count(uniform & mask);
-      uint32_t index =
-          (gpu::broadcast_value(lane_mask, impl::xorshift32(state)) + id) %
-          usable_bits(chunk_size);
+      if (result)
+        continue;
+
+      uint32_t start = gpu::broadcast_value(lane_mask, impl::xorshift32(state));
 
+      uint32_t id = impl::lane_count(uniform & mask);
+      uint32_t index = (start + id) % usable_bits(chunk_size);
       uint32_t slot = index / BITS_IN_WORD;
       uint32_t bit = index % BITS_IN_WORD;
-      if (!result) {
-        uint32_t before = cpp::AtomicRef(get_bitfield()[slot])
-                              .fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
-        if (~before & (1 << bit))
-          result = ptr_from_index(index, chunk_size);
-      }
+
+      // Get the mask of bits destined for the same slot and coalesce it.
+      uint64_t match = uniform & gpu::match_any(mask, slot);
+      uint32_t length = cpp::popcount(match);
+      uint32_t bitmask =
+          static_cast<uint32_t>(
+              (uint64_t(1) << cpp::min(length, BITS_IN_WORD)) - 1)
+          << bit;
+
+      uint32_t before = 0;
+      if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
+        before = cpp::AtomicRef(get_bitfield()[slot])
+                     .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
+      before = gpu::shuffle(mask, cpp::countr_zero(match), before);
+      if (~before & (1 << bit))
+        result = ptr_from_index(index, chunk_size);
+      else
+        sleep_briefly();
     }
 
     cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
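To make the new mask arithmetic concrete, here is a host-side walk-through with made-up values (five uniform lanes whose indices land in the same 32-bit bitfield word, leader bit offset 3). It assumes, as the diff suggests, that gpu::match_any yields the set of lanes that computed the same slot, that the lowest lane in that set acts as leader for the fetch_or, and that gpu::shuffle broadcasts the leader's result to the rest; all concrete numbers are invented for illustration.

    #include <algorithm>
    #include <bit>
    #include <cstdint>
    #include <cstdio>

    static constexpr uint32_t BITS_IN_WORD = 32;

    int main() {
      uint64_t match = 0b11111; // stands in for uniform & gpu::match_any(...)
      uint32_t bit = 3;         // leader's bit offset within the word

      // One contiguous run of `length` bits, clamped to the word width and
      // shifted to the leader's position: the single fetch_or operand.
      uint32_t length = std::popcount(match);
      uint32_t bitmask =
          static_cast<uint32_t>(
              (uint64_t(1) << std::min(length, BITS_IN_WORD)) - 1)
          << bit;

      // The lowest lane in `match` acts as leader and issues the one RMW;
      // the word's prior contents are then broadcast to the other lanes.
      uint32_t leader = std::countr_zero(match);
      uint32_t before = 0; // pretend the word was empty beforehand

      // Each lane tests only its own bit of `before`: a clear bit means
      // that lane's allocation succeeded; a set bit means it must retry.
      for (uint32_t id = 0; id < length; ++id) {
        uint32_t my_bit = bit + id;
        std::printf("lane offset %u -> bit %u, free before: %d\n", id, my_bit,
                    (before & (1u << my_bit)) == 0);
      }
      std::printf("leader lane %u issues fetch_or(%#x)\n", leader,
                  (unsigned)bitmask);
    }

With before == 0 all five bits were free, so five allocations complete with one atomic operation instead of five; any lane whose bit was already set falls into the sleep_briefly() retry path on the next loop iteration.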
