//===-- Parallel.cpp - Runtime support for parallel regions ------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
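//
// This file implements a cacheline-sized spin barrier plus weak fallback
// definitions of the __kmpc_* entry points emitted by OpenMP-lowering
// compilers, so generated parallel code can run without the full libomp.
//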

#include <assert.h>
#include <atomic>
#include <cstdlib>
#include <immintrin.h>
#include <omp.h>
#include <stdarg.h>
#include <stdint.h>

#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

#define WEAK_SYMBOL __attribute__((weak))

namespace {
struct barrier_t {
  alignas(64) std::atomic<int32_t> pending_;
  std::atomic<int32_t> rounds_;
  uint64_t total_;
  // pad barrier_t to the size of a cacheline to avoid false sharing
  char padding_[64 - 4 * sizeof(int32_t)];
};

using barrier_idle_func = uint64_t (*)(std::atomic<int32_t> *remaining,
                                       int32_t expected_remain, int32_t tid,
                                       void *args);
} // namespace

extern "C" {
int gc_runtime_keep_alive = 0;
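// Round-based centralized barrier: each arriving thread decrements pending_;
// the last one resets pending_ to total_ and bumps rounds_, which releases
// every waiter. A waiter with an idle_func first calls it (so it can do
// useful work until the round advances) before falling back to a pause-based
// spin on rounds_.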
void gc_arrive_at_barrier(barrier_t *b, barrier_idle_func idle_func,
                          void *idle_args) {
  auto cur_round = b->rounds_.load(std::memory_order_acquire);
  auto cnt = --b->pending_;
  assert(cnt >= 0);
  if (cnt == 0) {
    // the last thread to arrive resets the barrier and releases the waiters
    // by advancing the round number
    b->pending_.store(b->total_);
    b->rounds_.store(cur_round + 1);
  } else {
    if (idle_func) {
      if (cur_round != b->rounds_.load()) {
        return;
      }
      idle_func(&b->rounds_, cur_round + 1, -1, idle_args);
    }
    // spin until the last thread advances the round
    while (cur_round == b->rounds_.load()) {
      _mm_pause();
    }
  }
}

static_assert(sizeof(barrier_t) == 64, "size of barrier_t should be 64 bytes");

void gc_init_barrier(barrier_t *b, int num_barriers, uint64_t thread_count) {
  for (int i = 0; i < num_barriers; i++) {
    b[i].total_ = thread_count;
    b[i].pending_.store(thread_count);
    b[i].rounds_.store(0);
  }
}
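
// Illustrative usage sketch (not part of this file's API surface): a
// hypothetical pool of N worker threads could synchronize with
//   static barrier_t b;
//   gc_init_barrier(&b, /*num_barriers=*/1, /*thread_count=*/N); // once
//   gc_arrive_at_barrier(&b, /*idle_func=*/nullptr, nullptr); // each thread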

#if GC_NEEDS_OMP_WRAPPER
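// The __kmpc_* functions below are defined as weak symbols so that, when the
// real OpenMP runtime (libomp) is linked in, its strong definitions take
// precedence; otherwise these simplified fallbacks implement the compiler's
// OpenMP ABI on top of plain OpenMP pragmas.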
void WEAK_SYMBOL __kmpc_barrier(void *loc, int32_t global_tid) {
#pragma omp barrier
}

int WEAK_SYMBOL __kmpc_global_thread_num(void *loc) {
  return omp_get_thread_num();
}

// The implementation was extracted and simplified from LLVM libomp
// at openmp/runtime/src/kmp_sched.cpp
void WEAK_SYMBOL __kmpc_for_static_init_8u(void *loc, int32_t gtid,
                                           int32_t schedtype,
                                           int32_t *plastiter, uint64_t *plower,
                                           uint64_t *pupper, int64_t *pstride,
                                           int64_t incr, int64_t chunk) {
  // only the non-chunked static schedule (kmp_sch_static == 34) is supported
  if (unlikely(schedtype != 34)) {
    std::abort();
  }
  const int32_t FALSE = 0;
  const int32_t TRUE = 1;
  using UT = uint64_t;
  uint32_t tid = gtid;
  uint32_t nth = omp_get_num_threads();
  UT trip_count;

  /* special handling for zero-trip loops */
  if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
    if (plastiter != nullptr)
      *plastiter = FALSE;
    /* leave pupper and plower set to entire iteration space */
    *pstride = incr; /* value should never be used */
    return;
  }

  if (nth == 1) {
    if (plastiter != nullptr)
      *plastiter = TRUE;
    *pstride =
        (incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1));
    return;
  }

  /* compute trip count */
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }
  if (trip_count < nth) {
    // fewer iterations than threads: one iteration per active thread
    if (tid < trip_count) {
      *pupper = *plower = *plower + tid * incr;
    } else {
      // set bounds so non-active threads execute no iterations
      *plower = *pupper + (incr > 0 ? 1 : -1);
    }
    if (plastiter != nullptr)
      *plastiter = (tid == trip_count - 1);
  } else {
    // divide iterations evenly; the first `extras` threads get one extra
    UT small_chunk = trip_count / nth;
    UT extras = trip_count % nth;
    *plower += incr * (tid * small_chunk + (tid < extras ? tid : extras));
    *pupper = *plower + small_chunk * incr - (tid < extras ? 0 : incr);
    if (plastiter != nullptr)
      *plastiter = (tid == nth - 1);
  }
  *pstride = trip_count;
}
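// Worked example: lower=0, upper=9, incr=1 split across nth=4 threads gives
// trip_count=10, small_chunk=2, extras=2, so the threads receive the
// inclusive ranges [0,2], [3,5], [6,7], [8,9] (3+3+2+2 = 10 iterations).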

void WEAK_SYMBOL __kmpc_for_static_fini(void *ptr, int32_t v) {}

// thread count requested for the next parallel region via
// __kmpc_push_num_threads; 0 means "use the default"
static thread_local int next_num_threads = 0;

/*!
@ingroup PARALLEL
The type for a microtask which gets passed to @ref __kmpc_fork_call().
The arguments to the outlined function are
@param global_tid the global thread identity of the thread executing the
function.
@param bound_tid the local identity of the thread executing the function
@param ... pointers to shared variables accessed by the function.
*/
using kmpc_micro = void (*)(int32_t *global_tid, int32_t *bound_tid, ...);
void WEAK_SYMBOL __kmpc_fork_call(void *loc, int32_t argc, void *pfunc, ...) {
  if (unlikely(argc != 1 && argc != 0)) {
    std::abort();
  }
  va_list ap;
  va_start(ap, pfunc);
  // only read the shared-variable argument if the caller actually passed one
  void *c = argc == 1 ? va_arg(ap, void *) : nullptr;
  va_end(ap);
  int32_t global_tid = 0;
  if (unlikely(next_num_threads)) {
#pragma omp parallel num_threads(next_num_threads)
    {
      kmpc_micro func = (kmpc_micro)(pfunc);
      func(&global_tid, nullptr, c);
    }
    next_num_threads = 0;
  } else {
#pragma omp parallel
    {
      kmpc_micro func = (kmpc_micro)(pfunc);
      func(&global_tid, nullptr, c);
    }
  }
}
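
// Illustrative only: a compiler lowers `#pragma omp parallel` roughly into an
// outlined microtask plus a call like
//   __kmpc_fork_call(&loc, /*argc=*/1, (void *)outlined_fn, &shared_data);
// so this weak definition runs the microtask on an OpenMP team whenever the
// real libomp entry point is not linked in.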

void WEAK_SYMBOL __kmpc_push_num_threads(void *loc, int32_t global_tid,
                                         int32_t num_threads) {
  next_num_threads = num_threads;
}
#endif
} // extern "C"