@@ -229,24 +229,34 @@ struct Slab {
     // The uniform mask represents which lanes contain a uniform target pointer.
     // We attempt to place these next to each other.
-    // TODO: We should coalesce these bits and use the result of `fetch_or` to
-    // search for free bits in parallel.
     void *result = nullptr;
     for (uint64_t mask = lane_mask; mask;
          mask = gpu::ballot(lane_mask, !result)) {
-      uint32_t id = impl::lane_count(uniform & mask);
-      uint32_t index =
-          (gpu::broadcast_value(lane_mask, impl::xorshift32(state)) + id) %
-          usable_bits(chunk_size);
+      if (result)
+        continue;
+
+      uint32_t start = gpu::broadcast_value(lane_mask, impl::xorshift32(state));
+
+      uint32_t id = impl::lane_count(uniform & mask);
+      uint32_t index = (start + id) % usable_bits(chunk_size);
       uint32_t slot = index / BITS_IN_WORD;
       uint32_t bit = index % BITS_IN_WORD;
-      if (!result) {
-        uint32_t before = cpp::AtomicRef(get_bitfield()[slot])
-                              .fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
-        if (~before & (1 << bit))
-          result = ptr_from_index(index, chunk_size);
-      }
+
+      // Get the mask of bits destined for the same slot and coalesce it.
+      uint64_t match = uniform & gpu::match_any(mask, slot);
+      uint32_t length = cpp::popcount(match);
+      uint32_t bitmask = static_cast<uint32_t>((uint64_t(1) << length) - 1)
+                         << bit;
+
+      uint32_t before = 0;
+      if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
+        before = cpp::AtomicRef(get_bitfield()[slot])
+                     .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
+      before = gpu::shuffle(mask, cpp::countr_zero(match), before);
+      if (~before & (1 << bit))
+        result = ptr_from_index(index, chunk_size);
+      else
+        sleep_briefly();
     }
 
     cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
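
To make the pattern concrete outside of the `gpu::` portability wrappers, here is a minimal standalone CUDA sketch of the same coalescing trick, with `__match_any_sync` (sm_70+) standing in for `gpu::match_any` and `__shfl_sync` for `gpu::shuffle`. The bitfield, kernel, and launch shape are hypothetical and exist only to illustrate the pattern; like the patch, it assumes cooperating lanes hold consecutive bit indices, so a group's bits form one contiguous run starting at the leader's bit.

```cuda
// Minimal sketch of warp-coalesced fetch_or, assuming raw CUDA intrinsics
// (sm_70+ for __match_any_sync). All names here are illustrative, not the
// allocator's actual symbols.
#include <cstdint>
#include <cstdio>

__device__ uint32_t bitfield[4]; // 128 bits of free/used state, zero-initialized.

__global__ void claim_bits(uint32_t start) {
  uint32_t lane = threadIdx.x % 32;
  // Mirror the allocator: cooperating lanes hold consecutive bit indices.
  uint32_t index = (start + lane) % 128;
  uint32_t slot = index / 32;
  uint32_t bit = index % 32;

  // Group the lanes whose bits land in the same 32-bit word.
  uint32_t match = __match_any_sync(__activemask(), slot);
  uint32_t leader = __ffs(match) - 1; // Lowest lane id in the group.
  uint32_t length = __popc(match);

  // Because indices are consecutive, the leader holds the group's lowest
  // bit, so `length` contiguous ones starting at its bit cover every lane
  // in the group. One fetch_or replaces `length` separate atomics.
  uint32_t before = 0;
  if (lane == leader) {
    uint32_t word = static_cast<uint32_t>((1ull << length) - 1) << bit;
    before = atomicOr(&bitfield[slot], word);
  }
  // Broadcast the old word so each lane can test its own bit.
  before = __shfl_sync(match, before, leader);
  if (!(before & (1u << bit)))
    printf("lane %2u claimed slot %u bit %2u\n", lane, slot, bit);
}

int main() {
  // start = 5 splits the warp across two words: one atomic per word.
  claim_bits<<<1, 32>>>(5);
  cudaDeviceSynchronize();
  return 0;
}
```

The payoff is that N lanes hitting the same word issue one `fetch_or` instead of N, while the broadcast `before` value still lets each lane decide whether its own bit was previously free. The patch builds the same shape out of `gpu::match_any`, `gpu::shuffle`, and `cpp::AtomicRef::fetch_or`, and its retry path additionally backs off with `sleep_briefly()` when the bit was already taken.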