Skip to content

Commit

Permalink
[uTVM][AOT] Adding workspace byte alignment (#8019)
Browse files Browse the repository at this point in the history
* Adding workspace byte alignment

* This commit adds byte alignment support for workspaces
* Updating AoT tests to use calculate workspaces

Change-Id: I88380d875269e1ffa4a51a9cceefd51b3042f1a7

* Adding workspace byte alignment

* fixed aot_memory cpp tests
* add new error type for stack allocator bad frees

Change-Id: Iadb4770ac761ef5edb80308e18120443d269c83d

* Adding workspace byte alignment

* addressing comments + LIFO change

Change-Id: I1e8ad47e11e220f879bf936da2abb3d111db89f0

* Adding workspace byte alignment

* addressing comments further

Change-Id: Idb07d28b55520d8897d7dbcb9ef4aad5e3e7b35c

* Adding workspace byte alignment

* addressing comments - add a default constant to alignment

Change-Id: Id3f486bfdc0bd57d54b3c4097885cb54675196ca
  • Loading branch information
manupak authored May 14, 2021
1 parent 6069a70 commit 9272c02
Show file tree
Hide file tree
Showing 16 changed files with 245 additions and 92 deletions.
1 change: 1 addition & 0 deletions include/tvm/runtime/crt/error_codes.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ typedef enum {
kTvmErrorPlatformShutdown = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 2),
kTvmErrorPlatformNoMemory = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 3),
kTvmErrorPlatformTimerBadState = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 4),
kTvmErrorPlatformStackAllocBadFree = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 5),

// Common error codes returned from generated functions.
kTvmErrorGeneratedInvalidStorageId = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGenerated, 0),
Expand Down
49 changes: 48 additions & 1 deletion include/tvm/runtime/crt/stack_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,61 @@ typedef struct {
size_t workspace_size; // Total number of bytes in the workspace
} tvm_workspace_t;

/*!
* \brief Initialize the stack-based memory manager
*
* \param tvm_runtime_workspace The tvm_workspace_t struct containing state
* \param g_aot_memory The memory buffer used to allocate within
* \param workspace_size The total size of the workspace buffer workspace
*/
tvm_crt_error_t StackMemoryManager_Init(tvm_workspace_t* tvm_runtime_workspace,
uint8_t* g_aot_memory, size_t workspace_size);

/*!
* \brief The intended user-facing function to allocate within the buffer. It wraps
* StackMemoryManager_Allocate_Body enable and disable the LIFO check that is useful for debugging
* the AoT codegen.
*
* \param tvm_runtime_workspace The tvm_workspace_t struct containing state
* \param nbytes The number of bytes required for the allocation
* \param current_alloc The pointer-to-pointer to be populated with the allocated address
*/
tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes,
void**);
void** current_alloc);

/*!
* \brief The internal function that accepts allocate inputs and an extra byte to say to enable the
* LIFO check that is useful in debugging for debugging the AoT codegen.
*
* \param tvm_runtime_workspace The tvm_workspace_t struct containing state
* \param nbytes The number of bytes required for the allocation
* \param current_alloc The pointer-to-pointer to be populated with the allocated address
* \param do_lifo_check This being non-zero indicates to perform a check LIFO pattern Allocs/Frees
*/
tvm_crt_error_t StackMemoryManager_Allocate_Body(tvm_workspace_t* tvm_runtime_workspace,
int32_t nbytes, void** current_alloc,
uint8_t do_lifo_check);

/*!
* \brief The intended user-facing function to free the tensor within the buffer. It wraps
* StackMemoryManager_Free_Body enable and disable the stack allocator
*
* \param tvm_runtime_workspace The tvm_workspace_t struct containing state
* \param ptr The base pointer of the tensor to be free'd
*/
tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr);

/*!
* \brief The internal function that accepts free inputs and an extra byte to say to enable the LIFO
* check that is useful in debugging for debugging the AoT codegen.
*
* \param tvm_runtime_workspace The tvm_workspace_t struct containing state
* \param ptr The base pointer of tensor to be free'd within the workspace buffer
* \param do_lifo_check This being non-zero indicates to perform a check LIFO pattern Allocs/Frees
*/
tvm_crt_error_t StackMemoryManager_Free_Body(tvm_workspace_t* tvm_runtime_workspace, void* ptr,
uint8_t do_lifo_check);

#ifdef __cplusplus
} // extern "C"
#endif
Expand Down
4 changes: 4 additions & 0 deletions include/tvm/runtime/device_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ constexpr int kTempAllocaAlignment = 128;
/*! \brief Maximum size that can be allocated on stack */
constexpr int kMaxStackAlloca = 1024;

/*! \brief Number of bytes each allocation must align to by default in the workspace buffer to
* service intermediate tensors */
constexpr int kDefaultWorkspaceAlignment = 1;

/*!
* \brief TVM Runtime Device API, abstracts the device
* specific interface for memory management.
Expand Down
5 changes: 4 additions & 1 deletion include/tvm/tir/analysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,11 @@ TVM_DLL size_t CalculateExprComplexity(const PrimExpr& expr);
/*!
* \brief Calculate the workspace size in bytes needed by the TIR allocates inside the TIR PrimFunc
* \param func The TIR PrimFunc for which the workspace size to be calculated
* \param workspace_byte_alignment The byte alignment required for each tensor allocated in this
* workspace
*/
TVM_DLL size_t CalculateWorkspaceBytes(const PrimFunc& func);
TVM_DLL size_t CalculateWorkspaceBytes(const PrimFunc& func,
const Integer& workspace_byte_alignment);

/*!
* \brief Detect the lowest common ancestor(LCA) of buffer access, including both high-level
Expand Down
6 changes: 4 additions & 2 deletions python/tvm/tir/analysis/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,21 +133,23 @@ def get_block_access_region(block, buffer_var_map):
return _ffi_api.get_block_access_region(block, buffer_var_map)


def calculate_workspace_bytes(func: PrimFunc):
def calculate_workspace_bytes(func: PrimFunc, workspace_byte_alignment: int):
"""Calculate the workspace size in bytes needed by the TIR allocates inside the TIR
PrimFunc.
Parameters
----------
func: tvm.tir.PrimFunc
The function to be detected.
workspace_byte_alignment : int
The byte alignment required for each tensor
Returns
-------
result : int
Workspace size in bytes.
"""
return _ffi_api.calculate_workspace_bytes(func)
return _ffi_api.calculate_workspace_bytes(func, workspace_byte_alignment)


def detect_buffer_access_lca(func: PrimFunc) -> Dict[Buffer, Stmt]:
Expand Down
8 changes: 6 additions & 2 deletions src/relay/backend/aot_executor_codegen.cc
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,9 @@ class AOTExecutorCodegen : public ExprVisitor {
* \param func The main function that contains calls to operator tir primitive functions
*/
void UpdateMainWorkspaceSize(const tir::PrimFunc& primfunc, const relay::Function& func) {
Integer workspace_size = CalculateWorkspaceBytes(primfunc);
auto workspace_byte_alignment = target_host_->GetAttr<Integer>("workspace-byte-alignment")
.value_or(tvm::runtime::kDefaultWorkspaceAlignment);
Integer workspace_size = CalculateWorkspaceBytes(primfunc, workspace_byte_alignment);
// Populate FunctionInfo
auto fi_node = make_object<FunctionInfoNode>();
// Initialize all target workspaces to zero
Expand Down Expand Up @@ -318,7 +320,9 @@ class AOTExecutorCodegen : public ExprVisitor {
auto fi_node = make_object<FunctionInfoNode>();
for (const auto& kv : cfunc->funcs->functions) {
auto primfunc = Downcast<tir::PrimFunc>(kv.second);
Integer workspace_size = CalculateWorkspaceBytes(primfunc);
auto workspace_byte_alignment =
target_host_->GetAttr<Integer>("workspace-byte-alignment").value_or(16);
Integer workspace_size = CalculateWorkspaceBytes(primfunc, workspace_byte_alignment);
Target primfunc_target = relay_target;
if (primfunc->attrs->dict.count("target")) {
primfunc_target = Downcast<Target>(primfunc->attrs->dict["target"]);
Expand Down
4 changes: 3 additions & 1 deletion src/relay/backend/graph_executor_codegen.cc
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,9 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
auto fi_node = make_object<FunctionInfoNode>();
for (const auto& kv : cfunc->funcs->functions) {
auto primfunc = Downcast<tir::PrimFunc>(kv.second);
Integer workspace_size = CalculateWorkspaceBytes(primfunc);
auto workspace_byte_alignment = relay_target->GetAttr<Integer>("workspace-byte-alignment")
.value_or(tvm::runtime::kDefaultWorkspaceAlignment);
Integer workspace_size = CalculateWorkspaceBytes(primfunc, workspace_byte_alignment);
Target primfunc_target = relay_target;
if (primfunc->attrs->dict.count("target")) {
primfunc_target = Downcast<Target>(primfunc->attrs->dict["target"]);
Expand Down
3 changes: 0 additions & 3 deletions src/runtime/crt/host/crt_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,6 @@
/*! \brief Maximum length of a PackedFunc function name. */
#define TVM_CRT_MAX_FUNCTION_NAME_LENGTH_BYTES 30

/*! \brief Enable checks to enforce the stack allocator with a FIFO ordering. */
#define TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK

// #define TVM_CRT_FRAMER_ENABLE_LOGS

#endif // TVM_RUNTIME_CRT_HOST_CRT_CONFIG_H_
57 changes: 38 additions & 19 deletions src/runtime/crt/memory/stack_allocator.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,10 @@
*/
// LINT_C_FILE
#include <tvm/runtime/crt/stack_allocator.h>
#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK
#include <tvm/runtime/crt/logging.h>
#endif

tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes,
void** current_alloc) {
tvm_crt_error_t StackMemoryManager_Allocate_Body(tvm_workspace_t* tvm_runtime_workspace,
int32_t nbytes, void** current_alloc,
uint8_t do_lifo_check) {
// reserve bytes at the end of the allocation such that
// next_alloc % TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES == 0.
uint32_t offset_bytes =
Expand All @@ -34,30 +32,51 @@ tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspa
}
(*current_alloc) = tvm_runtime_workspace->next_alloc;
uint8_t* next_alloc = tvm_runtime_workspace->next_alloc + nbytes + offset_bytes;
#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK
if (next_alloc + STACK_ALLOCATOR_TAG_SIZE_BYTES > workspace_end) {
return kTvmErrorPlatformNoMemory;
if (do_lifo_check != 0) {
if (next_alloc + STACK_ALLOCATOR_TAG_SIZE_BYTES > workspace_end) {
return kTvmErrorPlatformNoMemory;
}
const uint32_t total_size = (nbytes + offset_bytes + STACK_ALLOCATOR_TAG_SIZE_BYTES);
*((uint32_t*)next_alloc) = total_size ^ STACK_ALLOCATOR_TAG;
next_alloc += STACK_ALLOCATOR_TAG_SIZE_BYTES;
}
const uint32_t total_size = (nbytes + offset_bytes + STACK_ALLOCATOR_TAG_SIZE_BYTES);
*((uint32_t*)next_alloc) = total_size ^ STACK_ALLOCATOR_TAG;
next_alloc += STACK_ALLOCATOR_TAG_SIZE_BYTES;
#endif

tvm_runtime_workspace->next_alloc = next_alloc;
return kTvmErrorNoError;
}

tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr) {
#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK
uint32_t tag = *(((uint32_t*)tvm_runtime_workspace->next_alloc) - 1);
uint32_t actual_size = (tvm_runtime_workspace->next_alloc - (uint8_t*)ptr);
uint32_t expected_size = tag ^ STACK_ALLOCATOR_TAG;
CHECK_EQ(expected_size, actual_size, "Deallocation not in FIFO ordering");
tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes,
void** current_alloc) {
uint8_t do_lifo_check = 0;
#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK
do_lifo_check = 1;
#endif
tvm_runtime_workspace->next_alloc = ptr;
return StackMemoryManager_Allocate_Body(tvm_runtime_workspace, nbytes, current_alloc,
do_lifo_check);
}

tvm_crt_error_t StackMemoryManager_Free_Body(tvm_workspace_t* tvm_runtime_workspace, void* ptr,
uint8_t do_lifo_check) {
if (do_lifo_check != 0) {
uint32_t tag = *(((uint32_t*)tvm_runtime_workspace->next_alloc) - 1);
uint32_t actual_size = (tvm_runtime_workspace->next_alloc - (uint8_t*)ptr);
uint32_t expected_size = tag ^ STACK_ALLOCATOR_TAG;
if (expected_size != actual_size) {
return kTvmErrorPlatformStackAllocBadFree;
}
}
tvm_runtime_workspace->next_alloc = (uint8_t*)ptr;
return kTvmErrorNoError;
}

tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr) {
uint8_t do_lifo_check = 0;
#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK
do_lifo_check = 1;
#endif
return StackMemoryManager_Free_Body(tvm_runtime_workspace, ptr, do_lifo_check);
}

tvm_crt_error_t StackMemoryManager_Init(tvm_workspace_t* tvm_runtime_workspace,
uint8_t* g_aot_memory, size_t workspace_size) {
tvm_runtime_workspace->next_alloc = g_aot_memory;
Expand Down
1 change: 1 addition & 0 deletions src/target/target_kind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ TVM_REGISTER_TARGET_KIND("c", kDLCPU)
.add_attr_option<String>("mcpu")
.add_attr_option<String>("march")
.add_attr_option<String>("executor")
.add_attr_option<Integer>("workspace-byte-alignment")
.set_default_keys({"cpu"});

TVM_REGISTER_TARGET_KIND("cuda", kDLCUDA)
Expand Down
19 changes: 14 additions & 5 deletions src/tir/analysis/calculate_workspace.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
* \brief Calculate any intermediary memory required by PrimFuncs.
*/
#include <tvm/arith/analyzer.h>
#include <tvm/runtime/device_api.h>
#include <tvm/tir/analysis.h>
#include <tvm/tir/function.h>
#include <tvm/tir/stmt_functor.h>
Expand All @@ -33,10 +34,12 @@ class WorkspaceCalculator : public StmtExprVisitor {
public:
WorkspaceCalculator() = default;
size_t operator()(const PrimFunc& func);
size_t byte_alignment = tvm::runtime::kDefaultWorkspaceAlignment;

private:
void VisitStmt_(const AllocateNode* op) override;
size_t CalculateExtentsSize(const AllocateNode* op);
size_t GetByteAlignedSize(size_t non_aligned_size);
size_t current_size = 0;
size_t max_size = 0;
};
Expand All @@ -46,6 +49,10 @@ size_t WorkspaceCalculator::operator()(const PrimFunc& func) {
return this->max_size;
}

size_t WorkspaceCalculator::GetByteAlignedSize(size_t non_aligned_size) {
return ((non_aligned_size + byte_alignment - 1) / byte_alignment) * byte_alignment;
}

size_t WorkspaceCalculator::CalculateExtentsSize(const AllocateNode* op) {
size_t element_size_bytes = op->dtype.bytes();
size_t num_elements = 1;
Expand All @@ -57,7 +64,7 @@ size_t WorkspaceCalculator::CalculateExtentsSize(const AllocateNode* op) {
num_elements = 0;
}
}
return num_elements * element_size_bytes;
return GetByteAlignedSize(num_elements * element_size_bytes);
}

void WorkspaceCalculator::VisitStmt_(const AllocateNode* op) {
Expand All @@ -70,14 +77,16 @@ void WorkspaceCalculator::VisitStmt_(const AllocateNode* op) {
current_size -= size;
}

size_t CalculateWorkspaceBytes(const PrimFunc& func) {
size_t CalculateWorkspaceBytes(const PrimFunc& func, const Integer& workspace_byte_alignment) {
WorkspaceCalculator wc;
wc.byte_alignment = workspace_byte_alignment->value;
return wc(func);
}

TVM_REGISTER_GLOBAL("tir.analysis.calculate_workspace_bytes").set_body_typed([](PrimFunc func) {
return static_cast<int>(CalculateWorkspaceBytes(func));
});
TVM_REGISTER_GLOBAL("tir.analysis.calculate_workspace_bytes")
.set_body_typed([](PrimFunc func, Integer workspace_byte_alignment) {
return static_cast<int>(CalculateWorkspaceBytes(func, workspace_byte_alignment));
});

} // namespace tir
} // namespace tvm
Loading

0 comments on commit 9272c02

Please sign in to comment.