Opaque Width #133

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open · wants to merge 15 commits into master
5 changes: 5 additions & 0 deletions include/drjit-core/array.h
@@ -253,6 +253,11 @@ template <JitBackend Backend_, typename Value_> struct JitArray {
return jit_var_size(m_index);
}

auto opaque_width_() {
using UInt32 = JitArray<Backend_, uint32_t>;
return UInt32::steal(jit_var_opaque_width(m_index));
}

void resize(size_t size) {
uint32_t index = jit_var_resize(m_index, size);
jit_var_dec_ref(m_index);
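A minimal usage sketch of the new accessor (not part of this diff; the helper name width_of and the float element type are assumptions): unlike size(), which returns a host-side integer that ends up baked into the trace, opaque_width_() yields the width as an opaque UInt32 variable, so a frozen kernel is not specialized to one particular array size.

#include <drjit-core/array.h>

// Hypothetical helper illustrating the new accessor
template <JitBackend Backend>
JitArray<Backend, uint32_t> width_of(JitArray<Backend, float> &x) {
    size_t n_host = x.size();      // host-side size_t, becomes a constant in the trace
    (void) n_host;                 // shown only for contrast
    return x.opaque_width_();      // same value, but as an opaque UInt32 variable
}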
17 changes: 17 additions & 0 deletions include/drjit-core/jit.h
@@ -1121,6 +1121,16 @@ JIT_INLINE void jit_var_dec_ref(uint32_t index) JIT_NOEXCEPT {
#define jit_var_inc_ref jit_var_inc_ref_impl
#endif

/**
* \brief Lock the recursive state mutex.
*/
extern JIT_EXPORT void jit_state_lock();
/**
* \brief Unlock the recursive state mutex.
* This should never be called from a thread that has not locked the mutex.
*/
extern JIT_EXPORT void jit_state_unlock();

/// Query a variable's reference count (used by the test suite)
extern JIT_EXPORT uint32_t jit_var_ref(uint32_t index);

@@ -1179,6 +1189,9 @@ extern JIT_EXPORT uint32_t jit_var_data(uint32_t index, void **ptr_out);
/// Query the size of a given variable
extern JIT_EXPORT size_t jit_var_size(uint32_t index);

/// Query the size of a given variable, returned as an opaque UInt32 variable.
extern JIT_EXPORT uint32_t jit_var_opaque_width(uint32_t index);

/// Query the type of a given variable
extern JIT_EXPORT JIT_ENUM VarType jit_var_type(uint32_t index);

@@ -2423,6 +2436,10 @@ struct VarInfo {
*/
extern JIT_EXPORT VarInfo jit_set_backend(uint32_t index) JIT_NOEXCEPT;

/// Same as \c jit_set_backend, but without setting the default backend.
/// This improves performance, since no TLS access is performed.
extern JIT_EXPORT VarInfo jit_var_info(uint32_t index) JIT_NOEXCEPT;

/**
* \brief Inform Dr.Jit about the current source code location
*
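A minimal RAII sketch built on the two new entry points (an assumption, not part of this diff; the guard name is hypothetical). Because the state mutex is now recursive, nesting such guards on the same thread is safe.

#include <drjit-core/jit.h>

// Hypothetical scoped guard around the recursive state mutex
struct StateLockGuard {
    StateLockGuard() { jit_state_lock(); }
    ~StateLockGuard() { jit_state_unlock(); }
    StateLockGuard(const StateLockGuard &) = delete;
    StateLockGuard &operator=(const StateLockGuard &) = delete;
};

void locked_section() {
    StateLockGuard outer;   // acquires the state lock
    StateLockGuard inner;   // re-entrant acquisition, released in reverse order
}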
4 changes: 2 additions & 2 deletions include/drjit-core/nanostl.h
@@ -172,11 +172,11 @@ template <typename T> struct vector {
m_size++;
}

template <typename... Args> void emplace_back(Args &&...args) {
template <typename... Args> T &emplace_back(Args &&...args) {
if (m_size == m_capacity)
expand();
new (&m_data[m_size]) T(std::forward<Args>(args)...);
m_size++;
return m_data[m_size++];
}

bool operator==(const vector &s) const {
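A short usage sketch for the changed emplace_back() signature (the Record type is an assumption, and the vector is assumed to live in the drjit namespace): the returned reference lets the caller adjust the freshly constructed element without a separate indexing step.

#include <drjit-core/nanostl.h>

struct Record { int key; int value; };

// Hypothetical example, not part of this diff
void fill(drjit::vector<Record> &records) {
    Record &r = records.emplace_back(Record{ 1, 0 });
    r.value = 42;   // mutate the new element in place via the returned reference
}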
49 changes: 34 additions & 15 deletions src/api.cpp
@@ -167,6 +167,10 @@ int jit_flag(JitFlag flag) {
return jitc_flag(flag);
}

void jit_state_lock() { lock_acquire(state.lock); }

void jit_state_unlock() { lock_release(state.lock); }

uint32_t jit_record_checkpoint(JitBackend backend) {
uint32_t result = (uint32_t) thread_state(backend)->side_effects_symbolic.size();
if (jit_flag(JitFlag::SymbolicScope))
@@ -610,6 +614,28 @@ size_t jit_var_size(uint32_t index) {
return (size_t) jitc_var(index)->size;
}

uint32_t jit_var_opaque_width(uint32_t index) {
if (index == 0)
return 0;

lock_guard guard(state.lock);

Variable *var = jitc_var(index);
JitBackend backend = (JitBackend) var->backend;
uint32_t var_size = var->size;

// The variable has to be evaluated to notify the ThreadState
jitc_var_eval(index);

uint32_t width_index =
jitc_var_literal(backend, VarType::UInt32, &var_size, 1, true);

ThreadState *ts = thread_state(backend);
ts->notify_opaque_width(index, width_index);

return width_index;
}

VarState jit_var_state(uint32_t index) {
if (index == 0)
return VarState::Invalid;
@@ -1357,26 +1383,19 @@ const char *jit_type_name(VarType type) noexcept {
}

VarInfo jit_set_backend(uint32_t index) noexcept {
VarInfo info;

lock_guard guard(state.lock);
Variable *var = jitc_var(index);
default_backend = (JitBackend) var->backend;

info.backend = (JitBackend)var->backend;
info.type = (VarType)var->type;
info.state = jitc_var_state(index);
info.size = var->size;
info.is_array = var->is_array();
info.unaligned = var->unaligned;
if(info.state == VarState::Literal)
info.literal = var->literal;
else if (info.state == VarState::Evaluated)
info.data = var->data;

VarInfo info = jitc_var_info(index);
default_backend = info.backend;
return info;
}

VarInfo jit_var_info(uint32_t index) noexcept {
lock_guard guard(state.lock);

return jitc_var_info(index);
}

uint32_t jit_var_loop_start(const char *name, bool symbolic, size_t n_indices, uint32_t *indices) {
lock_guard guard(state.lock);
return jitc_var_loop_start(name, symbolic, n_indices, indices);
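A hedged sketch of how the new jit_var_info() entry point might be used (the helper name is an assumption): it returns the same VarInfo record as jit_set_backend(), but leaves the thread-local default backend untouched.

#include <drjit-core/jit.h>

// Hypothetical helper, not part of this diff
inline bool is_evaluated(uint32_t index) {
    VarInfo info = jit_var_info(index);   // queries metadata without setting the default backend
    return info.state == VarState::Evaluated;
}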
24 changes: 24 additions & 0 deletions src/init.cpp
@@ -7,6 +7,7 @@
license that can be found in the LICENSE file.
*/

#include "drjit-core/jit.h"
#include "internal.h"
#include "cuda_ts.h"
#include "llvm_ts.h"
@@ -768,3 +769,26 @@ void ThreadState::reset_state() {
}
void ThreadState::notify_free(const void *) { }
void ThreadState::notify_expand(uint32_t) { }
void ThreadState::notify_opaque_width(uint32_t, uint32_t) {}
void ThreadState::notify_init_undefined(uint32_t) {}
void ThreadState::reduce_bool_async_4(uint8_t *values, uint32_t size,
uint8_t *out, ReduceOp op) {
/* When \c size is not a multiple of 4, the implementation will initialize
up to 3 bytes beyond the end of the supplied range so that an efficient
32-bit reduction algorithm can be used. This is fine for allocations made
using \ref jit_malloc(), which allow for this. */

uint32_t size_4 = ceil_div(size, 4),
trailing = size_4 * 4 - size;

jitc_log(Debug, "jit_%s(" DRJIT_PTR ", size=%u)",
op == ReduceOp::Or ? "any" : "all", (uintptr_t) values, size);

if (trailing) {
// Pad with the reduction's identity element (false for any/OR, true for all/AND)
bool filler = op != ReduceOp::Or;
memset_async(values + size, trailing, sizeof(bool), &filler);
}

block_reduce(VarType::UInt32, op, size_4, size_4, values, out);
}
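A standalone sketch of the padding arithmetic used by reduce_bool_async_4() above (the concrete numbers are only an example): the boolean buffer is rounded up to a multiple of four bytes so it can be reduced as 32-bit words, and the trailing bytes are filled with the identity element of the reduction.

#include <cstdint>
#include <cstdio>

int main() {
    uint32_t size     = 10;                  // number of valid boolean entries
    uint32_t size_4   = (size + 3) / 4;      // ceil_div(size, 4): 3 words of 4 bytes
    uint32_t trailing = size_4 * 4 - size;   // 2 padding bytes past the end
    bool any_filler   = false;               // identity of OR ("any")
    bool all_filler   = true;                // identity of AND ("all")
    std::printf("words=%u trailing=%u any=%d all=%d\n",
                size_4, trailing, any_filler, all_filler);
    return 0;
}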
17 changes: 17 additions & 0 deletions src/internal.h
@@ -723,6 +723,23 @@ struct ThreadState : public ThreadStateBase {
virtual void reduce_expanded(VarType vt, ReduceOp op, void *data,
uint32_t exp, uint32_t size) = 0;

/// Reduces an array of booleans by filling trailing elements and applying a
/// UInt32 reduction.
virtual void reduce_bool_async_4(uint8_t *values, uint32_t size,
uint8_t *out, ReduceOp op);

/// Some kernels use the width of an array in a computation. When using the
/// kernel freezing feature, this requires special precautions to ensure
/// that the resulting capture remains usable with different array sizes.
/// This notification hook exists so that this special-case handling can
/// be implemented.
virtual void notify_opaque_width(uint32_t index, uint32_t width_index);

/// Notifies the thread state that an allocation should not be initialized
/// as part of the evaluation of an undefined variable. This is required for
/// frozen functions to handle undefined variables.
virtual void notify_init_undefined(uint32_t index);

/// Notify the \c ThreadState that \c jitc_free has been called on a pointer.
/// This is required for kernel freezing.
virtual void notify_free(const void *ptr);
105 changes: 91 additions & 14 deletions src/lock.h
@@ -2,35 +2,112 @@

#if defined(__linux__) && !defined(DRJIT_USE_STD_MUTEX)
#include <pthread.h>
using Lock = pthread_spinlock_t;

struct Lock {
pthread_spinlock_t lock;
pthread_t owner;
int recursion_count;
};

// Danger zone: the drjit-core locks are held for an extremely short amount of
// time and normally uncontended. Switching to a spin lock cuts tracing time 8-10%
inline void lock_init(Lock &lock) { pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE); }
inline void lock_destroy(Lock &lock) { pthread_spin_destroy(&lock); }
inline void lock_acquire(Lock &lock) { pthread_spin_lock(&lock); }
inline void lock_release(Lock &lock) { pthread_spin_unlock(&lock); }
inline void lock_init(Lock &lock) {
pthread_spin_init(&lock.lock, PTHREAD_PROCESS_PRIVATE);
lock.owner = 0;
lock.recursion_count = 0;
}
inline void lock_destroy(Lock &lock) { pthread_spin_destroy(&lock.lock); }
inline void lock_acquire(Lock &lock) {
pthread_t self = pthread_self();
if (pthread_equal(lock.owner, self)) {
lock.recursion_count++;
return;
}

pthread_spin_lock(&lock.lock);
lock.owner = self;
lock.recursion_count = 1;
}
inline void lock_release(Lock &lock) {
lock.recursion_count--;
if (lock.recursion_count == 0) {
lock.owner = 0;
pthread_spin_unlock(&lock.lock);
}
}
#elif defined(__APPLE__) && !defined(DRJIT_USE_STD_MUTEX)
#include <os/lock.h>
#include <pthread.h> // for pthread_self()/pthread_equal() used by the recursive wrapper

using Lock = os_unfair_lock_s;
inline void lock_init(Lock &lock) { lock = OS_UNFAIR_LOCK_INIT; }
struct Lock {
os_unfair_lock_s lock;
pthread_t owner;
int recursion_count;
};

inline void lock_init(Lock &lock) {
lock.lock = OS_UNFAIR_LOCK_INIT;
lock.owner = 0;
lock.recursion_count = 0;
}
inline void lock_destroy(Lock &) { }
inline void lock_acquire(Lock &lock) { os_unfair_lock_lock(&lock); }
inline void lock_release(Lock &lock) { os_unfair_lock_unlock(&lock); }
inline void lock_acquire(Lock &lock) {
pthread_t self = pthread_self();
if (pthread_equal(lock.owner, self)) {
lock.recursion_count++;
return;
}

os_unfair_lock_lock(&lock.lock);
lock.owner = self;
lock.recursion_count = 1;
}
inline void lock_release(Lock &lock) {
lock.recursion_count--;
if (lock.recursion_count == 0) {
lock.owner = 0;
os_unfair_lock_unlock(&lock.lock);
}
}
#else
#if defined(_WIN32)
#include <shared_mutex>
#include <thread> // std::thread::id / std::this_thread::get_id()
using Lock = std::shared_mutex; // Based on the faster Win7 SRWLOCK
struct Lock {
std::shared_mutex lock; // Based on the faster Win7 SRWLOCK
std::thread::id owner;
int recursion_count;
};
#else
#include <mutex>
#include <thread> // std::thread::id / std::this_thread::get_id()
using Lock = std::mutex;
struct Lock {
std::mutex lock;
std::thread::id owner;
int recursion_count;
};
#endif

inline void lock_init(Lock &) { }
inline void lock_init(Lock &lock) {
lock.owner = std::thread::id();
lock.recursion_count = 0;
}
inline void lock_destroy(Lock &) { }
inline void lock_acquire(Lock &lock) { lock.lock(); }
inline void lock_release(Lock &lock) { lock.unlock(); }
inline void lock_acquire(Lock &lock) {
std::thread::id self = std::this_thread::get_id();
if (lock.owner == self) {
lock.recursion_count++;
return;
}

lock.lock.lock();
lock.owner = self;
lock.recursion_count = 1;
}
inline void lock_release(Lock &lock) {
lock.recursion_count--;
if (lock.recursion_count == 0) {
lock.owner = std::thread::id();
lock.lock.unlock();
}
}
#endif

/// RAII helper for scoped lock acquisition
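A small single-threaded sanity check (an assumption, not part of this diff) for the recursive behavior that the reworked Lock now provides on all three platforms: a thread that already owns the lock may re-acquire it, and the underlying primitive is only released once the matching number of lock_release() calls has been made.

#include "lock.h"

int main() {
    Lock lock;
    lock_init(lock);

    lock_acquire(lock);   // first acquisition locks the underlying primitive
    lock_acquire(lock);   // re-entrant acquisition by the same thread, no deadlock
    lock_release(lock);   // recursion count drops to 1, still locked
    lock_release(lock);   // recursion count reaches 0, lock actually released

    lock_destroy(lock);
    return 0;
}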