Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use aligned_alloc() as default allocator for HalideBuffer.h on most platforms #7190

Merged
merged 13 commits into from
Dec 7, 2022
109 changes: 100 additions & 9 deletions src/runtime/HalideBuffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,17 @@
#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <memory>
#include <vector>

#ifdef __APPLE__
#include <AvailabilityVersions.h>
#include <TargetConditionals.h>
#endif

#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
Expand All @@ -39,6 +45,60 @@
#define HALIDE_RUNTIME_BUFFER_CHECK_INDICES 0
#endif

#ifndef HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT
// Conservatively align buffer allocations to 128 bytes by default.
// This is enough alignment for all the platforms currently in use.
// Redefine this in your compiler settings if you desire more/less alignment;
// the value must be a power of two, since it is used as a bit mask when
// rounding sizes and addresses up.
#define HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT 128
#endif

// Unfortunately, not all C++17 runtimes support aligned_alloc
// (it may depend on the OS/SDK version); this is provided as an opt-out
// if you are compiling on a platform that doesn't provide a (good)
// implementation. Predefine HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC to 0 or 1
// in your compiler settings to override the per-platform default chosen below.
#ifndef HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC

// clang-format off
#ifdef _MSC_VER

// MSVC doesn't implement aligned_alloc(), even in C++17 mode, and
// has stated they probably never will, so, always default it off here.
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0

#elif defined(__ANDROID_API__) && __ANDROID_API__ < 28

// Android doesn't provide aligned_alloc until API 28
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0

#elif defined(__APPLE__)

#if TARGET_OS_OSX && (__MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_15)

// macOS doesn't provide aligned_alloc until 10.15
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0

#elif TARGET_OS_IPHONE && (__IPHONE_OS_VERSION_MAX_ALLOWED < 140000)

// iOS doesn't provide aligned_alloc until 14.0.
// iOS versions are encoded as MMmmpp, so 14.0 is 140000. (The macOS-style
// value 101500 would be "iOS 10.15" and would wrongly enable aligned_alloc
// for the iOS 11-13 SDKs.) A numeric literal is used rather than
// __IPHONE_14_0 so the test still works on SDKs that predate that macro.
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0

#else

// Assume it's ok on all other Apple targets
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 1

#endif

#else

// Not Windows, Android, or Apple: just assume it's ok
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 1

#endif
// clang-format on

#endif  // HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC

namespace Halide {
namespace Runtime {

Expand Down Expand Up @@ -803,25 +863,56 @@ class Buffer {
* owned memory. */
void allocate(void *(*allocate_fn)(size_t) = nullptr,
void (*deallocate_fn)(void *) = nullptr) {
// Drop any existing allocation
deallocate();

// Conservatively align images to (usually) 128 bytes. This is enough
// alignment for all the platforms we might use. Also ensure that the allocation
// is such that the logical size is an integral multiple of 128 bytes (or a bit more).
constexpr size_t alignment = HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT;

const auto align_up = [=](size_t value) -> size_t {
return (value + alignment - 1) & ~(alignment - 1);
};

size_t size = size_in_bytes();

#if HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC
// Only use aligned_alloc() if no custom allocators are specified.
if (!allocate_fn && !deallocate_fn) {
// As a practical matter, sizeof(AllocationHeader) is going to be no more than 16 bytes
// on any supported platform, so we will just overallocate by 'alignment'
// so that the user storage also starts at an aligned point. This is a bit
// wasteful, but probably not a big deal.
static_assert(sizeof(AllocationHeader) <= alignment);
void *alloc_storage = std::aligned_alloc(alignment, align_up(size) + alignment);
assert((uintptr_t)alloc_storage == align_up((uintptr_t)alloc_storage));
alloc = new (alloc_storage) AllocationHeader(free);
buf.host = (uint8_t *)((uintptr_t)alloc_storage + alignment);
return;
}
// else fall thru
#endif
if (!allocate_fn) {
allocate_fn = malloc;
}
if (!deallocate_fn) {
deallocate_fn = free;
}

// Drop any existing allocation
deallocate();
static_assert(sizeof(AllocationHeader) <= alignment);

// Conservatively align images to 128 bytes. This is enough
// alignment for all the platforms we might use.
size_t size = size_in_bytes();
const size_t alignment = 128;
size = (size + alignment - 1) & ~(alignment - 1);
void *alloc_storage = allocate_fn(size + sizeof(AllocationHeader) + alignment - 1);
// malloc() and friends must return a pointer aligned to at least alignof(std::max_align_t);
// make sure this is OK for AllocationHeader, since it always goes at the start
static_assert(alignof(AllocationHeader) <= alignof(std::max_align_t));

const size_t requested_size = align_up(size + alignment +
std::max(0, (int)sizeof(AllocationHeader) -
(int)sizeof(std::max_align_t)));
void *alloc_storage = allocate_fn(requested_size);
alloc = new (alloc_storage) AllocationHeader(deallocate_fn);
uint8_t *unaligned_ptr = ((uint8_t *)alloc) + sizeof(AllocationHeader);
buf.host = (uint8_t *)((uintptr_t)(unaligned_ptr + alignment - 1) & ~(alignment - 1));
buf.host = (uint8_t *)align_up((uintptr_t)unaligned_ptr);
}

/** Drop reference to any owned host or device memory, possibly
Expand Down