Skip to content

Commit 2b48572

Browse files
committed
[OpenCL][Texture] Improved texture memory planning and runtime memory allocation
Motivated by the fact that textures can be allocated over a clBuffer object and the size of the backing clBuffer can be computed based on the hardware image pitch alignment. This optimizes the overall memory allocation on device and greatly helps models with large memory requirements. Improved the graph memory planner to not differentiate buffer and texture storage tokens and to reuse them across. The texture pool in the OpenCL runtime is rebranded as a memory pool that handles allocation for both buffer and image objects. The NDArray to DeviceAPI interface is extended with AllocDataSpaceView and FreeDataSpaceView. These new APIs accommodate accessing the same physical memory as clBuffer / clImage objects. * MemoryPool test cases and lint errors. * test cases and fallback support. * bug fix and cpp-runtime test cases for texture views. * various cl device info organized * fix graph plan memory bug and correct the testcase. * device attribute handling * Some fallback for texture plan on devices w/o cl_khr_image2d_from_buffer * Memory Manager Move the VM memory manager to the runtime level. Use this memory manager for graph runtime. * Resolve conflicts for VerifyDataType and Buffer * review comments
1 parent e754bc2 commit 2b48572

34 files changed

+942
-818
lines changed

apps/android_camera/app/src/main/jni/tvm_runtime.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,10 @@
6161
#include "../src/runtime/workspace_pool.cc"
6262

6363
#ifdef TVM_OPENCL_RUNTIME
64+
#include "../src/runtime/opencl/memory_pool.cc"
6465
#include "../src/runtime/opencl/opencl_device_api.cc"
6566
#include "../src/runtime/opencl/opencl_module.cc"
6667
#include "../src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc"
67-
#include "../src/runtime/opencl/texture_pool.cc"
6868
#include "../src/runtime/source_utils.cc"
6969
#endif
7070

apps/android_deploy/app/src/main/jni/tvm_runtime.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,5 @@
4949
#include "../src/runtime/opencl/opencl_device_api.cc"
5050
#include "../src/runtime/opencl/opencl_module.cc"
5151
#include "../src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc"
52-
#include "../src/runtime/opencl/texture_pool.cc"
5352
#include "../src/runtime/source_utils.cc"
5453
#endif

apps/android_rpc/app/src/main/jni/tvm_runtime.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@
6666
#include "../src/runtime/opencl/opencl_device_api.cc"
6767
#include "../src/runtime/opencl/opencl_module.cc"
6868
#include "../src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc"
69-
#include "../src/runtime/opencl/texture_pool.cc"
7069
#include "../src/runtime/source_utils.cc"
7170
#endif
7271

include/tvm/runtime/device_api.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ enum DeviceAttrKind : int {
5050
kApiVersion = 11,
5151
kDriverVersion = 12,
5252
kL2CacheSizeBytes = 13,
53+
kImagePitchAlignment = 14
5354
};
5455

5556
#ifdef TVM_KALLOC_ALIGNMENT
@@ -133,12 +134,32 @@ class TVM_DLL DeviceAPI {
133134
*/
134135
virtual void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
135136
Optional<String> mem_scope = NullOpt);
137+
138+
/*!
139+
* \brief Create a new view with given spec over existing tensor.
140+
* \param dev The device device to perform operation.
141+
* \param data The source array.
142+
* \param shape The shape of allocated tensor.
143+
* \param dtype The type of elements.
144+
* \param mem_scope The memory scope of allocated tensor.
145+
* \return The allocated device pointer.
146+
*/
147+
virtual void* AllocDataSpaceView(Device dev, void* data, ShapeTuple shape, DLDataType dtype,
148+
Optional<String> mem_scope = NullOpt);
136149
/*!
137150
* \brief Free a data space on device.
138151
* \param dev The device device to perform operation.
139152
* \param ptr The data space.
140153
*/
141154
virtual void FreeDataSpace(Device dev, void* ptr) = 0;
155+
156+
/*!
157+
* \brief Free a view data space on device.
158+
* \param dev The device device to perform operation.
159+
* \param ptr The data space view.
160+
*/
161+
virtual void FreeDataSpaceView(Device dev, void* ptr);
162+
142163
/*!
143164
* \brief copy data from one place to another
144165
* \note This API is designed to support special memory with shape dependent layout.

include/tvm/runtime/memory/memory_manager.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,12 @@ class Storage : public ObjectRef {
160160
};
161161

162162
} // namespace memory
163+
164+
using memory::Allocator;
165+
using memory::AllocatorType;
166+
using memory::MemoryManager;
167+
using memory::StorageObj;
168+
163169
} // namespace runtime
164170
} // namespace tvm
165171

include/tvm/runtime/ndarray.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,11 @@ class NDArray : public ObjectRef {
129129
* \brief Create a NDArray that shares the data memory with the current one.
130130
* \param shape The shape of the new array.
131131
* \param dtype The data type of the new array.
132+
* \param mem_scope The memory scope of the array.
132133
* \note The memory size of new array must be smaller than the current one.
133134
*/
134-
TVM_DLL NDArray CreateView(ShapeTuple shape, DLDataType dtype);
135+
TVM_DLL NDArray CreateView(ShapeTuple shape, DLDataType dtype,
136+
Optional<String> mem_scope = NullOpt);
135137
/*!
136138
* \brief Create a reference view of NDArray that
137139
* represents as DLManagedTensor.

src/relay/backend/graph_plan_memory.cc

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,16 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
229229
VLOG_CONTEXT << "StorageAllocator";
230230
VLOG(1) << "planning:" << std::endl << PrettyPrint(func);
231231
prototype_ = StorageAllocaInit(&arena_).GetInitTokenMap(func);
232+
// Backup the virtual devices as token reuse might lost the original memory scope
233+
std::unordered_map<const ExprNode*, std::vector<VirtualDevice>> virtual_device_map_;
234+
for (const auto& kv : prototype_) {
235+
std::vector<VirtualDevice> virtual_devices;
236+
virtual_devices.reserve(kv.second.size());
237+
for (StorageToken* tok : kv.second) {
238+
virtual_devices.push_back(tok->virtual_device);
239+
}
240+
virtual_device_map_.insert({kv.first, virtual_devices});
241+
}
232242
this->Run(func);
233243

234244
// The value of smap contains two integer arrays where the first array
@@ -252,9 +262,13 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
252262
}
253263
num_nodes++;
254264
storage_ids.push_back(tok->storage_id);
255-
virtual_devices.push_back(tok->virtual_device);
256265
sid_sizes_byte.push_back(allocator_.GetMemorySize(tok));
257266
}
267+
ICHECK(kv.second.size() == virtual_device_map_[kv.first].size())
268+
<< "Mismatch of tokens and virtual devices";
269+
for (auto vdev : virtual_device_map_[kv.first]) {
270+
virtual_devices.push_back(vdev);
271+
}
258272
auto storage_info = backend::StorageInfo(std::move(storage_ids), std::move(virtual_devices),
259273
std::move(sid_sizes_byte));
260274
smap.Set(GetRef<Expr>(kv.first), storage_info);
@@ -356,34 +370,27 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
356370

357371
class TokenAllocator {
358372
public:
359-
StorageToken* Alloc(StorageToken* proto) {
360-
return Is2DStorage(proto) ? token_2d_.Alloc(proto, storage_ids_++)
361-
: token_1d_.Alloc(proto, storage_ids_++);
362-
}
373+
StorageToken* Alloc(StorageToken* proto) { return token_mixed_.Alloc(proto, storage_ids_++); }
363374
StorageToken* Request(StorageToken* proto) {
364-
StorageToken* token =
365-
Is2DStorage(proto) ? token_2d_.Request(proto) : token_1d_.Request(proto);
375+
StorageToken* token = token_mixed_.Request(proto);
366376
return token ? token : this->Alloc(proto);
367377
}
368-
void CheckForRelease(StorageToken* tok) {
369-
return Is2DStorage(tok) ? token_2d_.CheckForRelease(tok) : token_1d_.CheckForRelease(tok);
370-
}
378+
void CheckForRelease(StorageToken* tok) { return token_mixed_.CheckForRelease(tok); }
371379

372380
size_t GetMemorySize(StorageToken* tok) {
373381
// TODO(amalyshe): figure out who requries sizes and for what
374382
// size in case of texture is not enough - we can return any value if it
375383
// assumed to be used for memory allocatoion or we can return real size
376384
// if it is just for information
377-
return Is2DStorage(tok) ? 0 : token_1d_.GetMemorySize(tok);
385+
return token_mixed_.GetMemorySize(tok);
378386
}
379387
static bool Is2DStorage(StorageToken* tok) {
380388
return relay::Is2DStorage(tok->virtual_device->memory_scope);
381389
}
382390

383391
private:
384392
int64_t storage_ids_{0};
385-
TokenAllocator1D token_1d_;
386-
TokenAllocator2D token_2d_;
393+
TokenAllocatorMixed token_mixed_;
387394
};
388395

389396
private:

src/relay/backend/token_allocator.cc

Lines changed: 66 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -31,22 +31,39 @@
3131

3232
namespace tvm {
3333
namespace relay {
34+
constexpr auto Is2DStorage = runtime::IsTextureStorage;
3435

35-
size_t TokenAllocator1D::GetMemorySize(StorageToken* prototype) {
36+
/*
37+
* Mixed mode memory allocator
38+
*/
39+
size_t TokenAllocatorMixed::GetMemorySize(StorageToken* prototype) {
3640
TensorType ttype = prototype->ttype;
3741
ICHECK(ttype.defined());
3842
size_t size = 1;
39-
for (IndexExpr dim : ttype->shape) {
40-
const int64_t* pval = tir::as_const_int(dim);
41-
ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape;
42-
ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval;
43-
size *= static_cast<size_t>(pval[0]);
43+
if (relay::Is2DStorage(prototype->virtual_device->memory_scope)) {
44+
size = GetSize2D(prototype);
45+
} else {
46+
for (IndexExpr dim : ttype->shape) {
47+
const int64_t* pval = tir::as_const_int(dim);
48+
ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape;
49+
ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval;
50+
size *= static_cast<size_t>(pval[0]);
51+
}
52+
size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
4453
}
45-
size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
4654
return size;
4755
}
4856

49-
StorageToken* TokenAllocator1D::Request(StorageToken* prototype) {
57+
bool IsTargetContainsKey(StorageToken* tok, String key) {
58+
Target null_tgt{nullptr};
59+
if (null_tgt == tok->virtual_device->target) {
60+
return false;
61+
}
62+
auto prototype_keys = tok->virtual_device->target->GetKeys();
63+
return std::find(prototype_keys.begin(), prototype_keys.end(), key) != prototype_keys.end();
64+
}
65+
66+
StorageToken* TokenAllocatorMixed::Request(StorageToken* prototype) {
5067
// calculate the size;
5168
size_t size = GetMemorySize(prototype);
5269
// search memory block in [size / match_range_, size * match_range_)
@@ -57,144 +74,78 @@ StorageToken* TokenAllocator1D::Request(StorageToken* prototype) {
5774
auto mid = free_.lower_bound(size);
5875
auto end = free_.upper_bound(size * match_range_);
5976
// search for memory blocks larger than requested
77+
bool is_prototype_adreno = IsTargetContainsKey(prototype, "adreno");
6078
for (auto it = mid; it != end; ++it) {
6179
StorageToken* tok = it->second;
62-
if (!tok->is_compatible(*prototype)) continue;
63-
ICHECK_EQ(tok->ref_counter, 0);
64-
// Use exect matching strategy
65-
tok->max_bytes = std::max(size, tok->max_bytes);
66-
tok->ref_counter = prototype->ref_counter;
67-
// find a exact match, erase from map and return
68-
free_.erase(it);
69-
return tok;
80+
// TODO(Siva): We need a additional ways of comparing VirtualDevice
81+
bool is_tok_adreno = IsTargetContainsKey(tok, "adreno");
82+
83+
if (tok->is_compatible(*prototype) || (is_prototype_adreno && is_tok_adreno)) {
84+
ICHECK_EQ(tok->ref_counter, 0);
85+
// Use exect matching strategy
86+
if (size > tok->max_bytes) {
87+
tok->max_bytes = size;
88+
tok->ttype = prototype->ttype;
89+
}
90+
tok->ref_counter = prototype->ref_counter;
91+
// find a exact match, erase from map and return
92+
free_.erase(it);
93+
return tok;
94+
}
7095
}
7196
// then search for memory blocks smaller than requested space
7297
for (auto it = mid; it != begin;) {
7398
--it;
7499
StorageToken* tok = it->second;
75-
if (!tok->is_compatible(*prototype)) continue;
76-
ICHECK_EQ(tok->ref_counter, 0);
77-
// Use exect matching strategy
78-
tok->max_bytes = std::max(size, tok->max_bytes);
79-
tok->ref_counter = prototype->ref_counter;
80-
// erase from map and return
81-
free_.erase(it);
82-
return tok;
100+
bool is_tok_adreno = IsTargetContainsKey(tok, "adreno");
101+
if (tok->is_compatible(*prototype) || (is_prototype_adreno && is_tok_adreno)) {
102+
ICHECK_EQ(tok->ref_counter, 0);
103+
// Use exect matching strategy
104+
if (size > tok->max_bytes) {
105+
tok->max_bytes = size;
106+
tok->ttype = prototype->ttype;
107+
}
108+
tok->ref_counter = prototype->ref_counter;
109+
// erase from map and return
110+
free_.erase(it);
111+
return tok;
112+
}
83113
}
84114
return nullptr;
85115
}
86116

87-
StorageToken* TokenAllocator1D::Alloc(StorageToken* prototype, int64_t storage_id) {
117+
StorageToken* TokenAllocatorMixed::Alloc(StorageToken* prototype, int64_t storage_id) {
88118
size_t size = GetMemorySize(prototype);
89119
prototype->max_bytes = size;
90120
prototype->storage_id = storage_id;
91121
data_.push_back(prototype);
92122
return prototype;
93123
}
94124

95-
void TokenAllocator1D::CheckForRelease(StorageToken* tok) {
125+
void TokenAllocatorMixed::CheckForRelease(StorageToken* tok) {
96126
ICHECK_GE(tok->storage_id, 0);
97127
ICHECK_GE(tok->ref_counter, 0);
98128
if (tok->ref_counter == 0) {
99129
free_.insert({tok->max_bytes, tok});
100130
}
101131
}
102132

103-
StorageToken* TokenAllocator2D::Request(StorageToken* prototype) {
104-
auto shape = GetSize2D(prototype);
105-
const int64_t max_ratio = 5;
106-
int64_t min_added_size_x = std::numeric_limits<int64_t>::max();
107-
int64_t min_added_size_y = std::numeric_limits<int64_t>::max();
108-
int64_t min_wasted_size_x = std::numeric_limits<int64_t>::max();
109-
int64_t min_wasted_size_y = std::numeric_limits<int64_t>::max();
110-
int64_t best_storage_id = -1;
111-
MemBlock new_mem;
112-
for (int64_t free_id : free_list_) {
113-
MemBlock& cached = blocks_[free_id];
114-
// Can only reuse texture 2d blocks of the same type
115-
if (cached.token_->ttype->dtype != prototype->ttype->dtype) {
116-
continue;
117-
}
118-
// Can only reuse texture 2d blocks of the same scope
119-
// Because reusing textures with different memory scope may lead to
120-
// accuracy issues, because the data will be packed in a different way for
121-
// different memory scopes.
122-
if (cached.token_->virtual_device->memory_scope != prototype->virtual_device->memory_scope) {
123-
continue;
124-
}
125-
// avoid reusing too small and too big textures
126-
if (shape.width / cached.x_ > max_ratio || cached.x_ / shape.width > max_ratio ||
127-
shape.height / cached.y_ > max_ratio || cached.y_ / shape.height > max_ratio) {
128-
continue;
129-
}
130-
int64_t new_width = std::max(cached.x_, shape.width);
131-
int64_t new_height = std::max(cached.y_, shape.height);
132-
int64_t added_size_x = new_width - cached.x_;
133-
int64_t added_size_y = new_height - cached.y_;
134-
int64_t wasted_size_x = new_width - shape.width;
135-
int64_t wasted_size_y = new_height - shape.height;
136-
// Prioritize minimization of added size first, then minimize
137-
// wasted size among blocks which would not require expansion
138-
if ((min_added_size_x > 0 && added_size_x < min_added_size_x) ||
139-
(min_added_size_y > 0 && added_size_y < min_added_size_y) ||
140-
(min_added_size_x == added_size_x && wasted_size_x < min_wasted_size_x) ||
141-
(min_added_size_y == added_size_y && wasted_size_y < min_wasted_size_y)) {
142-
min_added_size_x = added_size_x;
143-
min_added_size_y = added_size_y;
144-
min_wasted_size_x = wasted_size_x;
145-
min_wasted_size_y = wasted_size_y;
146-
best_storage_id = free_id;
147-
new_mem.x_ = new_width;
148-
new_mem.y_ = new_height;
149-
}
150-
}
151-
152-
if (min_added_size_x == 0 && min_added_size_y == 0) {
153-
// use existing block
154-
free_list_.erase(best_storage_id);
155-
blocks_[best_storage_id].token_->ref_counter += prototype->ref_counter;
156-
return blocks_[best_storage_id].token_;
157-
} else if (min_added_size_x <= shape.width || min_added_size_y <= shape.height) {
158-
// Reset the reference counter of the now live token
159-
free_list_.erase(best_storage_id);
160-
new_mem.token_ = prototype;
161-
new_mem.token_->ref_counter += 1;
162-
new_mem.token_->storage_id = best_storage_id;
163-
blocks_[best_storage_id] = new_mem;
164-
return new_mem.token_;
165-
}
166-
return nullptr;
167-
}
168-
169-
StorageToken* TokenAllocator2D::Alloc(StorageToken* prototype, int64_t storage_id) {
170-
auto shape = GetSize2D(prototype);
171-
MemBlock block;
172-
block.x_ = shape.width;
173-
block.y_ = shape.height;
174-
prototype->storage_id = storage_id;
175-
block.token_ = prototype;
176-
blocks_[prototype->storage_id] = block;
177-
return prototype;
178-
}
179-
180-
void TokenAllocator2D::CheckForRelease(StorageToken* tok) {
181-
ICHECK_GE(tok->storage_id, 0);
182-
ICHECK_GE(tok->ref_counter, 0);
183-
if (tok->ref_counter == 0) {
184-
free_list_.insert(tok->storage_id);
185-
}
186-
}
187-
188-
runtime::Texture2DShape<int64_t> TokenAllocator2D::GetSize2D(StorageToken* prototype) {
133+
size_t TokenAllocatorMixed::GetSize2D(StorageToken* prototype) {
189134
TensorType ttype = prototype->ttype;
190135
ICHECK(ttype.defined());
191-
size_t axis = runtime::DefaultTextureLayoutSeparator(ttype->shape.size(),
192-
prototype->virtual_device->memory_scope);
193136
struct Shape {
194137
const Array<PrimExpr>& shape;
195138
int64_t operator[](size_t i) const { return *tir::as_const_int(shape[i]); }
139+
int size() { return this->shape.size(); }
196140
};
197-
return runtime::ApplyTexture2DFlattening<int64_t>(Shape{ttype->shape}, ttype->shape.size(), axis);
141+
auto shape = Shape{ttype->shape};
142+
int image_row_align =
143+
prototype->virtual_device->target->GetAttr<Integer>("image_base_address_alignment")
144+
.value_or(Integer(64))
145+
->value;
146+
return runtime::GetTextureMemorySize<Shape>(shape, ttype->dtype.bits(), ttype->dtype.lanes(),
147+
prototype->virtual_device->memory_scope,
148+
image_row_align);
198149
}
199150

200151
} // namespace relay

0 commit comments

Comments
 (0)