@@ -16,6 +16,7 @@
 
 #include "allocator.h"
 
+#include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/new.h"
@@ -229,24 +230,36 @@ struct Slab {
 
   // The uniform mask represents which lanes contain a uniform target pointer.
   // We attempt to place these next to each other.
-  // TODO: We should coalesce these bits and use the result of `fetch_or` to
-  // search for free bits in parallel.
   void *result = nullptr;
   for (uint64_t mask = lane_mask; mask;
        mask = gpu::ballot(lane_mask, !result)) {
-    uint32_t id = impl::lane_count(uniform & mask);
-    uint32_t index =
-        (gpu::broadcast_value(lane_mask, impl::xorshift32(state)) + id) %
-        usable_bits(chunk_size);
+    if (result)
+      continue;
+
+    uint32_t start = gpu::broadcast_value(lane_mask, impl::xorshift32(state));
 
+    uint32_t id = impl::lane_count(uniform & mask);
+    uint32_t index = (start + id) % usable_bits(chunk_size);
     uint32_t slot = index / BITS_IN_WORD;
     uint32_t bit = index % BITS_IN_WORD;
-    if (!result) {
-      uint32_t before = cpp::AtomicRef(get_bitfield()[slot])
-                            .fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
-      if (~before & (1 << bit))
-        result = ptr_from_index(index, chunk_size);
-    }
+
+    // Get the mask of bits destined for the same slot and coalesce it.
+    uint64_t match = uniform & gpu::match_any(mask, slot);
+    uint32_t length = cpp::popcount(match);
+    uint32_t bitmask =
+        static_cast<uint32_t>(
+            (uint64_t(1) << cpp::min(length, BITS_IN_WORD)) - 1)
+        << bit;
+
+    uint32_t before = 0;
+    if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
+      before = cpp::AtomicRef(get_bitfield()[slot])
+                   .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
+    before = gpu::shuffle(mask, cpp::countr_zero(match), before);
+    if (~before & (1 << bit))
+      result = ptr_from_index(index, chunk_size);
+    else
+      sleep_briefly();
   }
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
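For readers outside the libc tree, here is a minimal standalone sketch of the same coalescing trick in plain CUDA, using the public intrinsics that libc's `gpu::` wrappers roughly correspond to (`__match_any_sync`, `__shfl_sync`, and friends). The `bitfield` pointer and `claim_bit` helper are hypothetical names for illustration. Unlike the patch, which exploits the fact that its per-lane indices are consecutive to build one contiguous run mask from `popcount`, this sketch OR-reduces arbitrary distinct per-lane bits with `__reduce_or_sync` (sm_80+; `__match_any_sync` needs sm_70+).

```cpp
#include <cstdint>

constexpr uint32_t BITS_IN_WORD = 32;

// Each lane tries to claim a distinct bit `index` in `bitfield`. Lanes whose
// bits land in the same 32-bit word elect a leader, which issues one atomicOr
// for the whole group; the old word value is then broadcast back so every
// lane can check whether its own bit was previously clear.
__device__ bool claim_bit(uint32_t *bitfield, uint32_t index) {
  uint32_t slot = index / BITS_IN_WORD;
  uint32_t bit = index % BITS_IN_WORD;

  uint32_t active = __activemask();
  // Partition the active lanes into groups that target the same word.
  uint32_t group = __match_any_sync(active, slot);
  uint32_t leader = __ffs(group) - 1; // lowest lane id in the group

  // OR-reduce every group member's bit into one combined mask (sm_80+).
  uint32_t bitmask = __reduce_or_sync(group, 1u << bit);

  // One atomic RMW per word instead of one per lane.
  uint32_t before = 0;
  if ((threadIdx.x % 32) == leader) // lane id, assuming a 1-D block
    before = atomicOr(&bitfield[slot], bitmask);
  before = __shfl_sync(group, before, leader);

  // The claim succeeded if this lane's bit was clear in the old value.
  return !(before & (1u << bit));
}
```

Either way the effect is the same: where every lane previously issued its own `fetch_or`, the lanes bound for one 32-bit word now elect a leader that performs a single atomic for the whole group, and the returned old value still tells each lane whether its bit was free.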