Skip to content

Commit 33e854b

Browse files
committed
[OpenCL][Texture] Improved texture memory planning and runtime memory allocation
Motivated by the fact that textures can be allocated over a clBuffer object, and that the size of the backing clBuffer can be computed from the hardware image pitch alignment. This optimizes overall memory allocation on the device and greatly helps models with large memory requirements. Improved the graph memory planner so it no longer differentiates buffer and texture storage tokens and can reuse them interchangeably. The texture pool in the OpenCL runtime is rebranded as a memory pool that handles allocation for both buffer and image objects. The NDArray-to-DeviceAPI interface is extended with AllocDataSpaceView and FreeDataSpaceView; these new APIs accommodate accessing the same physical memory as either clBuffer or clImage objects.
1 parent 2d2b727 commit 33e854b

File tree

20 files changed

+614
-489
lines changed

20 files changed

+614
-489
lines changed

apps/android_camera/app/src/main/jni/tvm_runtime.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@
6363
#include "../src/runtime/opencl/opencl_device_api.cc"
6464
#include "../src/runtime/opencl/opencl_module.cc"
6565
#include "../src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc"
66-
#include "../src/runtime/opencl/texture_pool.cc"
66+
#include "../src/runtime/opencl/memory_pool.cc"
6767
#include "../src/runtime/source_utils.cc"
6868
#endif
6969

apps/android_deploy/app/src/main/jni/tvm_runtime.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,6 @@
4848
#include "../src/runtime/opencl/opencl_device_api.cc"
4949
#include "../src/runtime/opencl/opencl_module.cc"
5050
#include "../src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc"
51-
#include "../src/runtime/opencl/texture_pool.cc"
51+
#include "../src/runtime/opencl/memory_pool.cc"
5252
#include "../src/runtime/source_utils.cc"
5353
#endif

apps/android_rpc/app/src/main/jni/tvm_runtime.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
#include "../src/runtime/opencl/opencl_device_api.cc"
6666
#include "../src/runtime/opencl/opencl_module.cc"
6767
#include "../src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc"
68-
#include "../src/runtime/opencl/texture_pool.cc"
68+
#include "../src/runtime/opencl/memory_pool.cc"
6969
#include "../src/runtime/source_utils.cc"
7070
#endif
7171

include/tvm/runtime/device_api.h

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ enum DeviceAttrKind : int {
4848
kMaxRegistersPerBlock = 9,
4949
kGcnArch = 10,
5050
kApiVersion = 11,
51-
kDriverVersion = 12
51+
kDriverVersion = 12,
52+
kImagePitchAlignment = 13
5253
};
5354

5455
#ifdef TVM_KALLOC_ALIGNMENT
@@ -124,12 +125,33 @@ class TVM_DLL DeviceAPI {
124125
*/
125126
virtual void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
126127
Optional<String> mem_scope = NullOpt);
128+
129+
/*!
130+
* \brief Create a new view with given spec over existing tensor.
131+
* \param dev The device device to perform operation.
132+
* \param data The source array.
133+
* \param ndim The number of dimension of allocated tensor.
134+
* \param shape The shape of allocated tensor.
135+
* \param dtype The type of elements.
136+
* \param mem_scope The memory scope of allocated tensor.
137+
* \return The allocated device pointer.
138+
*/
139+
virtual void* AllocDataSpaceView(Device dev, void* data, int ndim, const int64_t* shape,
140+
DLDataType dtype, Optional<String> mem_scope = NullOpt);
127141
/*!
128142
* \brief Free a data space on device.
129143
* \param dev The device device to perform operation.
130144
* \param ptr The data space.
131145
*/
132146
virtual void FreeDataSpace(Device dev, void* ptr) = 0;
147+
148+
/*!
149+
* \brief Free a view data space on device.
150+
* \param dev The device device to perform operation.
151+
* \param ptr The data space view.
152+
*/
153+
virtual void FreeDataSpaceView(Device dev, void* ptr);
154+
133155
/*!
134156
* \brief copy data from one place to another
135157
* \note This API is designed to support special memory with shape dependent layout.

include/tvm/runtime/ndarray.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,9 +128,11 @@ class NDArray : public ObjectRef {
128128
* \brief Create a NDArray that shares the data memory with the current one.
129129
* \param shape The shape of the new array.
130130
* \param dtype The data type of the new array.
131+
* \param mem_scope The memory scope of the array.
131132
* \note The memory size of new array must be smaller than the current one.
132133
*/
133-
TVM_DLL NDArray CreateView(ShapeTuple shape, DLDataType dtype);
134+
TVM_DLL NDArray CreateView(ShapeTuple shape, DLDataType dtype,
135+
Optional<String> mem_scope = NullOpt);
134136
/*!
135137
* \brief Create a reference view of NDArray that
136138
* represents as DLManagedTensor.

src/relay/backend/graph_plan_memory.cc

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,8 @@ class StorageAllocaBaseVisitor : public transform::DeviceAwareExprVisitor {
120120
protected:
121121
/*! \brief internal token map */
122122
std::unordered_map<const ExprNode*, std::vector<StorageToken*>> token_map_;
123+
/*! \brief the virtual device map */
124+
std::unordered_map<const ExprNode*, VirtualDevice> virtual_device_map_;
123125
/*! \brief empty token map */
124126
const std::vector<StorageToken*> no_tokens_;
125127

@@ -246,13 +248,12 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
246248
sid_sizes_byte.reserve(kv.second.size());
247249

248250
for (StorageToken* tok : kv.second) {
249-
VLOG(1) << "token: " << tok->ToString();
250251
if (tok->is_valid()) {
251252
num_annotated_nodes++;
252253
}
253254
num_nodes++;
254255
storage_ids.push_back(tok->storage_id);
255-
virtual_devices.push_back(tok->virtual_device);
256+
virtual_devices.push_back(virtual_device_map_[kv.first]);
256257
sid_sizes_byte.push_back(allocator_.GetMemorySize(tok));
257258
}
258259
auto storage_info = backend::StorageInfo(std::move(storage_ids), std::move(virtual_devices),
@@ -293,6 +294,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
293294
}
294295
}
295296
token_map_[op] = tokens;
297+
virtual_device_map_[op] = virtual_device;
296298
}
297299

298300
// Mark op to reuse the input_token
@@ -356,34 +358,27 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
356358

357359
class TokenAllocator {
358360
public:
359-
StorageToken* Alloc(StorageToken* proto) {
360-
return Is2DStorage(proto) ? token_2d_.Alloc(proto, storage_ids_++)
361-
: token_1d_.Alloc(proto, storage_ids_++);
362-
}
361+
StorageToken* Alloc(StorageToken* proto) { return token_mixed_.Alloc(proto, storage_ids_++); }
363362
StorageToken* Request(StorageToken* proto) {
364-
StorageToken* token =
365-
Is2DStorage(proto) ? token_2d_.Request(proto) : token_1d_.Request(proto);
363+
StorageToken* token = token_mixed_.Request(proto);
366364
return token ? token : this->Alloc(proto);
367365
}
368-
void CheckForRelease(StorageToken* tok) {
369-
return Is2DStorage(tok) ? token_2d_.CheckForRelease(tok) : token_1d_.CheckForRelease(tok);
370-
}
366+
void CheckForRelease(StorageToken* tok) { return token_mixed_.CheckForRelease(tok); }
371367

372368
size_t GetMemorySize(StorageToken* tok) {
373369
// TODO(amalyshe): figure out who requries sizes and for what
374370
// size in case of texture is not enough - we can return any value if it
375371
// assumed to be used for memory allocatoion or we can return real size
376372
// if it is just for information
377-
return Is2DStorage(tok) ? 0 : token_1d_.GetMemorySize(tok);
373+
return token_mixed_.GetMemorySize(tok);
378374
}
379375
static bool Is2DStorage(StorageToken* tok) {
380376
return relay::Is2DStorage(tok->virtual_device->memory_scope);
381377
}
382378

383379
private:
384380
int64_t storage_ids_{0};
385-
TokenAllocator1D token_1d_;
386-
TokenAllocator2D token_2d_;
381+
TokenAllocatorMixed token_mixed_;
387382
};
388383

389384
private:

src/relay/backend/token_allocator.cc

Lines changed: 55 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -31,22 +31,30 @@
3131

3232
namespace tvm {
3333
namespace relay {
34+
constexpr auto Is2DStorage = runtime::IsTextureStorage;
3435

35-
size_t TokenAllocator1D::GetMemorySize(StorageToken* prototype) {
36+
/*
37+
* Mixed mode memory allocator
38+
*/
39+
size_t TokenAllocatorMixed::GetMemorySize(StorageToken* prototype) {
3640
TensorType ttype = prototype->ttype;
3741
ICHECK(ttype.defined());
3842
size_t size = 1;
39-
for (IndexExpr dim : ttype->shape) {
40-
const int64_t* pval = tir::as_const_int(dim);
41-
ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape;
42-
ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval;
43-
size *= static_cast<size_t>(pval[0]);
43+
if (relay::Is2DStorage(prototype->virtual_device->memory_scope)) {
44+
size = GetSize2D(prototype);
45+
} else {
46+
for (IndexExpr dim : ttype->shape) {
47+
const int64_t* pval = tir::as_const_int(dim);
48+
ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape;
49+
ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval;
50+
size *= static_cast<size_t>(pval[0]);
51+
}
52+
size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
4453
}
45-
size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
4654
return size;
4755
}
4856

49-
StorageToken* TokenAllocator1D::Request(StorageToken* prototype) {
57+
StorageToken* TokenAllocatorMixed::Request(StorageToken* prototype) {
5058
// calculate the size;
5159
size_t size = GetMemorySize(prototype);
5260
// search memory block in [size / match_range_, size * match_range_)
@@ -56,145 +64,77 @@ StorageToken* TokenAllocator1D::Request(StorageToken* prototype) {
5664
auto begin = free_.lower_bound(size / match_range_);
5765
auto mid = free_.lower_bound(size);
5866
auto end = free_.upper_bound(size * match_range_);
67+
auto prototype_keys = prototype->virtual_device->target->GetKeys();
68+
bool is_prototype_adreno =
69+
std::find(prototype_keys.begin(), prototype_keys.end(), "adreno") != prototype_keys.end();
5970
// search for memory blocks larger than requested
6071
for (auto it = mid; it != end; ++it) {
6172
StorageToken* tok = it->second;
62-
if (!tok->is_compatible(*prototype)) continue;
63-
ICHECK_EQ(tok->ref_counter, 0);
64-
// Use exect matching strategy
65-
tok->max_bytes = std::max(size, tok->max_bytes);
66-
tok->ref_counter = prototype->ref_counter;
67-
// find a exact match, erase from map and return
68-
free_.erase(it);
69-
return tok;
73+
// TODO(Siva): We need a additional ways of comparing VirtualDevice
74+
auto tok_keys = tok->virtual_device->target->GetKeys();
75+
bool is_tok_adreno = std::find(tok_keys.begin(), tok_keys.end(), "adreno") != tok_keys.end();
76+
77+
if (tok->is_compatible(*prototype) || (is_prototype_adreno && is_tok_adreno)) {
78+
ICHECK_EQ(tok->ref_counter, 0);
79+
// Use exect matching strategy
80+
tok->max_bytes = std::max(size, tok->max_bytes);
81+
tok->ref_counter = prototype->ref_counter;
82+
// find a exact match, erase from map and return
83+
free_.erase(it);
84+
return tok;
85+
}
7086
}
7187
// then search for memory blocks smaller than requested space
7288
for (auto it = mid; it != begin;) {
7389
--it;
7490
StorageToken* tok = it->second;
75-
if (!tok->is_compatible(*prototype)) continue;
76-
ICHECK_EQ(tok->ref_counter, 0);
77-
// Use exect matching strategy
78-
tok->max_bytes = std::max(size, tok->max_bytes);
79-
tok->ref_counter = prototype->ref_counter;
80-
// erase from map and return
81-
free_.erase(it);
82-
return tok;
91+
auto tok_keys = tok->virtual_device->target->GetKeys();
92+
bool is_tok_adreno = std::find(tok_keys.begin(), tok_keys.end(), "adreno") != tok_keys.end();
93+
if (tok->is_compatible(*prototype) || (is_prototype_adreno && is_tok_adreno)) {
94+
ICHECK_EQ(tok->ref_counter, 0);
95+
// Use exect matching strategy
96+
tok->max_bytes = std::max(size, tok->max_bytes);
97+
tok->ref_counter = prototype->ref_counter;
98+
// erase from map and return
99+
free_.erase(it);
100+
return tok;
101+
}
83102
}
84103
return nullptr;
85104
}
86105

87-
StorageToken* TokenAllocator1D::Alloc(StorageToken* prototype, int64_t storage_id) {
106+
StorageToken* TokenAllocatorMixed::Alloc(StorageToken* prototype, int64_t storage_id) {
88107
size_t size = GetMemorySize(prototype);
89108
prototype->max_bytes = size;
90109
prototype->storage_id = storage_id;
91110
data_.push_back(prototype);
92111
return prototype;
93112
}
94113

95-
void TokenAllocator1D::CheckForRelease(StorageToken* tok) {
114+
void TokenAllocatorMixed::CheckForRelease(StorageToken* tok) {
96115
ICHECK_GE(tok->storage_id, 0);
97116
ICHECK_GE(tok->ref_counter, 0);
98117
if (tok->ref_counter == 0) {
99118
free_.insert({tok->max_bytes, tok});
100119
}
101120
}
102121

103-
StorageToken* TokenAllocator2D::Request(StorageToken* prototype) {
104-
auto shape = GetSize2D(prototype);
105-
const int64_t max_ratio = 5;
106-
int64_t min_added_size_x = std::numeric_limits<int64_t>::max();
107-
int64_t min_added_size_y = std::numeric_limits<int64_t>::max();
108-
int64_t min_wasted_size_x = std::numeric_limits<int64_t>::max();
109-
int64_t min_wasted_size_y = std::numeric_limits<int64_t>::max();
110-
int64_t best_storage_id = -1;
111-
MemBlock new_mem;
112-
for (int64_t free_id : free_list_) {
113-
MemBlock& cached = blocks_[free_id];
114-
// Can only reuse texture 2d blocks of the same type
115-
if (cached.token_->ttype->dtype != prototype->ttype->dtype) {
116-
continue;
117-
}
118-
// Can only reuse texture 2d blocks of the same scope
119-
// Because reusing textures with different memory scope may lead to
120-
// accuracy issues, because the data will be packed in a different way for
121-
// different memory scopes.
122-
if (cached.token_->virtual_device->memory_scope != prototype->virtual_device->memory_scope) {
123-
continue;
124-
}
125-
// avoid reusing too small and too big textures
126-
if (shape.width / cached.x_ > max_ratio || cached.x_ / shape.width > max_ratio ||
127-
shape.height / cached.y_ > max_ratio || cached.y_ / shape.height > max_ratio) {
128-
continue;
129-
}
130-
int64_t new_width = std::max(cached.x_, shape.width);
131-
int64_t new_height = std::max(cached.y_, shape.height);
132-
int64_t added_size_x = new_width - cached.x_;
133-
int64_t added_size_y = new_height - cached.y_;
134-
int64_t wasted_size_x = new_width - shape.width;
135-
int64_t wasted_size_y = new_height - shape.height;
136-
// Prioritize minimization of added size first, then minimize
137-
// wasted size among blocks which would not require expansion
138-
if ((min_added_size_x > 0 && added_size_x < min_added_size_x) ||
139-
(min_added_size_y > 0 && added_size_y < min_added_size_y) ||
140-
(min_added_size_x == added_size_x && wasted_size_x < min_wasted_size_x) ||
141-
(min_added_size_y == added_size_y && wasted_size_y < min_wasted_size_y)) {
142-
min_added_size_x = added_size_x;
143-
min_added_size_y = added_size_y;
144-
min_wasted_size_x = wasted_size_x;
145-
min_wasted_size_y = wasted_size_y;
146-
best_storage_id = free_id;
147-
new_mem.x_ = new_width;
148-
new_mem.y_ = new_height;
149-
}
150-
}
151-
152-
if (min_added_size_x == 0 && min_added_size_y == 0) {
153-
// use existing block
154-
free_list_.erase(best_storage_id);
155-
blocks_[best_storage_id].token_->ref_counter += prototype->ref_counter;
156-
return blocks_[best_storage_id].token_;
157-
} else if (min_added_size_x <= shape.width || min_added_size_y <= shape.height) {
158-
// Reset the reference counter of the now live token
159-
free_list_.erase(best_storage_id);
160-
new_mem.token_ = prototype;
161-
new_mem.token_->ref_counter += 1;
162-
new_mem.token_->storage_id = best_storage_id;
163-
blocks_[best_storage_id] = new_mem;
164-
return new_mem.token_;
165-
}
166-
return nullptr;
167-
}
168-
169-
StorageToken* TokenAllocator2D::Alloc(StorageToken* prototype, int64_t storage_id) {
170-
auto shape = GetSize2D(prototype);
171-
MemBlock block;
172-
block.x_ = shape.width;
173-
block.y_ = shape.height;
174-
prototype->storage_id = storage_id;
175-
block.token_ = prototype;
176-
blocks_[prototype->storage_id] = block;
177-
return prototype;
178-
}
179-
180-
void TokenAllocator2D::CheckForRelease(StorageToken* tok) {
181-
ICHECK_GE(tok->storage_id, 0);
182-
ICHECK_GE(tok->ref_counter, 0);
183-
if (tok->ref_counter == 0) {
184-
free_list_.insert(tok->storage_id);
185-
}
186-
}
187-
188-
runtime::Texture2DShape<int64_t> TokenAllocator2D::GetSize2D(StorageToken* prototype) {
122+
size_t TokenAllocatorMixed::GetSize2D(StorageToken* prototype) {
189123
TensorType ttype = prototype->ttype;
190124
ICHECK(ttype.defined());
191-
size_t axis = runtime::DefaultTextureLayoutSeparator(ttype->shape.size(),
192-
prototype->virtual_device->memory_scope);
193125
struct Shape {
194126
const Array<PrimExpr>& shape;
195127
int64_t operator[](size_t i) const { return *tir::as_const_int(shape[i]); }
128+
int size() { return this->shape.size(); }
196129
};
197-
return runtime::ApplyTexture2DFlattening<int64_t>(Shape{ttype->shape}, ttype->shape.size(), axis);
130+
auto shape = Shape{ttype->shape};
131+
int image_row_align =
132+
prototype->virtual_device->target->GetAttr<Integer>("image_base_address_alignment")
133+
.value_or(Integer(64))
134+
->value;
135+
return runtime::GetTextureMemorySize<Shape>(shape, ttype->dtype.bits(), ttype->dtype.lanes(),
136+
prototype->virtual_device->memory_scope,
137+
image_row_align);
198138
}
199139

200140
} // namespace relay

0 commit comments

Comments
 (0)