[VM][Textures] Enable OpenCL textures for VM (apache#15419)
* [VM][Textures] Enable OpenCL textures for VM

This commit introduces memory scopes to the VM and enables the use of textures.

The following changes have been made:
  - The AnnotateMemoryScope pass is now used in the VM compilation pipeline.
  - The VM now allows using more than one device with the same device type.
    Virtual devices in the VM also carry memory-scope information.
  - The LoadConst and AllocStorage instructions were extended to support
    textures.
  - The VM bytecode was updated to support memory scopes.
  - The annotate texture storage pass was updated to support dynamic shapes.
  - Some other minor changes have been made.

* Implement tests for vm

* Fix lint

* Fix tests

* Use union in allocate_storage struct

* Apply comments

* Fix copy ctor and assignment operator
echuraev authored Aug 8, 2023
1 parent 8cadd1f commit 34cacb0
Showing 27 changed files with 996 additions and 311 deletions.
8 changes: 5 additions & 3 deletions include/tvm/runtime/ndarray.h
@@ -110,9 +110,10 @@ class NDArray : public ObjectRef {
/*!
* \brief Copy the data to another device.
* \param dev The target device.
* \param mem_scope The memory scope of the target array.
* \return The array under another device.
*/
inline NDArray CopyTo(const Device& dev) const;
inline NDArray CopyTo(const Device& dev, Optional<String> mem_scope = NullOpt) const;
/*!
* \brief Load NDArray from stream
* \param stream The input data stream
@@ -398,10 +399,11 @@ inline void NDArray::CopyTo(const NDArray& other) const {
CopyFromTo(&(get_mutable()->dl_tensor), &(other.get_mutable()->dl_tensor));
}

inline NDArray NDArray::CopyTo(const Device& dev) const {
inline NDArray NDArray::CopyTo(const Device& dev, Optional<String> mem_scope) const {
ICHECK(data_ != nullptr);
const DLTensor* dptr = operator->();
NDArray ret = Empty(ShapeTuple(dptr->shape, dptr->shape + dptr->ndim), dptr->dtype, dev);
NDArray ret =
Empty(ShapeTuple(dptr->shape, dptr->shape + dptr->ndim), dptr->dtype, dev, mem_scope);
this->CopyTo(ret);
return ret;
}
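For context, here is a minimal usage sketch of the new overload. The OpenCL device id and the "global.texture" scope string are illustrative assumptions; real scopes are normally produced by the AnnotateMemoryScope pass rather than written by hand.

```cpp
#include <tvm/runtime/ndarray.h>

// Sketch: copy a host-side array into texture-backed memory on an OpenCL device.
void CopyToTextureSketch() {
  using tvm::runtime::NDArray;
  using tvm::runtime::String;
  NDArray host = NDArray::Empty({1, 16, 16, 16, 4}, DLDataType{kDLFloat, 32, 1},
                                DLDevice{kDLCPU, 0});
  // Passing a memory scope forwards it to Empty() on the target device.
  NDArray on_texture = host.CopyTo(DLDevice{kDLOpenCL, 0}, String("global.texture"));
  (void)on_texture;  // hand the texture-backed array to a texture-aware kernel
}
```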
19 changes: 15 additions & 4 deletions include/tvm/runtime/vm/bytecode.h
@@ -157,6 +157,8 @@ struct Instruction {
struct /* LoadConst Operands */ {
/* \brief The index into the constant pool. */
Index const_index;
/*! \brief The index of the device on which the load will be made. */
Index device_index;
};
struct /* LoadConsti Operands */ {
/* \brief The index into the constant pool. */
@@ -195,12 +197,18 @@ struct Instruction {
RegName* free_vars;
};
struct /* AllocStorage Operands */ {
/*! \brief The size of the allocation. */
RegName allocation_size;
/*! \brief The alignment of the allocation. */
Index alignment;
/*! \brief The hint of the dtype. */
DLDataType dtype_hint;
/*! \brief The number of dimensions. */
uint32_t ndim;
union {
/*! \brief The shape of tensor. */
int64_t* shape;
/*! \brief The size of the allocation. */
RegName allocation_size;
};
/*! \brief The index of the device on which the allocation will be made. */
Index device_index;
} alloc_storage;
@@ -332,10 +340,11 @@ struct Instruction {
/*!
* \brief Construct a load constant instruction.
* \param const_index The index of the constant.
* \param device_index The index of the device to load on.
* \param dst The destination register.
* \return The load constant instruction.
*/
static Instruction LoadConst(Index const_index, RegName dst);
static Instruction LoadConst(Index const_index, Index device_index, RegName dst);
/*!
* \brief Construct a load_constanti instruction.
 * \param val The integer constant value.
@@ -356,11 +365,13 @@
* \param alignment The allocation's alignment.
* \param dtype_hint The data type hint for the allocator.
* \param device_index The index of the device to allocate on.
* \param shape The shape of the allocation.
* \param dst The destination to place the storage.
* \return The alloc storage instruction.
*/
static Instruction AllocStorage(RegName size, Index alignment, DLDataType dtype_hint,
Index device_index, RegName dst);
Index device_index, const std::vector<int64_t>& shape,
RegName dst);
/*!
* \brief Get the shape of an input tensor.
* \param tensor The input tensor.
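Below is a sketch of how the extended instructions might be constructed during codegen. The register numbers, device index, and the 5-d shape are made-up values; the assumption, consistent with the union above, is that an empty shape selects the flat allocation_size path while a non-empty shape is recorded for texture allocations.

```cpp
#include <tvm/runtime/vm/bytecode.h>
#include <vector>

void EmitSketch() {
  using tvm::runtime::vm::Instruction;
  DLDataType f32{kDLFloat, 32, 1};

  // Texture-style allocation: a concrete 5-d shape travels with the instruction.
  std::vector<int64_t> texture_shape = {1, 16, 16, 16, 4};
  Instruction alloc_texture = Instruction::AllocStorage(
      /*size=*/0, /*alignment=*/64, f32, /*device_index=*/1, texture_shape, /*dst=*/3);

  // Flat allocation: no shape, so the size register in the union is used instead.
  Instruction alloc_flat = Instruction::AllocStorage(
      /*size=*/2, /*alignment=*/64, f32, /*device_index=*/1, /*shape=*/{}, /*dst=*/4);

  // Loading a constant now also names the device index it is materialized on.
  Instruction load = Instruction::LoadConst(/*const_index=*/0, /*device_index=*/1, /*dst=*/5);

  (void)alloc_texture; (void)alloc_flat; (void)load;
}
```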
5 changes: 3 additions & 2 deletions include/tvm/runtime/vm/executable.h
@@ -34,6 +34,7 @@
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

namespace tvm {
@@ -262,9 +263,9 @@ class TVM_DLL Executable : public ModuleNode {

/*!
* \brief The (compile-time, virtual) devices corresponding to each device index.
* Currently we only support at most one device per device type.
* This vector contains a pair Device and its memory_scope.
*/
std::vector<Device> virtual_devices;
std::vector<std::pair<Device, std::string>> virtual_devices;
/*!
* \brief The device index corresponding to the 'host' device. That will hold and evaluate
* shape-related data and code.
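An illustrative way to read the new layout is shown below; the helper is written only for this example and assumes an already-loaded Executable.

```cpp
#include <tvm/runtime/logging.h>
#include <tvm/runtime/vm/executable.h>

// Sketch: each entry now pairs a device with the memory scope it serves, so two
// entries may share a device type and differ only in scope (e.g. "" vs. "global.texture").
void DumpVirtualDevices(const tvm::runtime::vm::Executable& exec) {
  for (size_t i = 0; i < exec.virtual_devices.size(); ++i) {
    const DLDevice& dev = exec.virtual_devices[i].first;
    const std::string& scope = exec.virtual_devices[i].second;
    LOG(INFO) << "device[" << i << "]: type=" << static_cast<int>(dev.device_type)
              << ", id=" << dev.device_id << ", scope=\"" << scope << "\"";
  }
}
```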
41 changes: 19 additions & 22 deletions src/relay/backend/vm/compiler.cc
@@ -352,19 +352,6 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr& n)> {
return 0;
}

// However, otherwise we allow at most one VirtualDevice per device type.
// TODO(mbs): This will eventually need to account for memory scopes somehow so device_copy
// instructions can do the right thing.
itr = std::find_if(context_->virtual_devices_.begin() + 1, context_->virtual_devices_.end(),
[&virtual_device](const VirtualDevice& existing_virtual_device) {
return existing_virtual_device->device_type() ==
virtual_device->device_type();
});
CHECK(itr == context_->virtual_devices_.end())
<< "The VM does not currently support using more than one device with the same device type "
"for primitives, however the program is using the distinct scopes "
<< virtual_device << " and " << *itr << " of device type " << virtual_device->device_type();

ICHECK(virtual_device != host_virtual_device_);
Index index = context_->virtual_devices_.size();
VLOG(2) << "virtual_device[" << index << "] = " << virtual_device;
@@ -384,7 +371,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr& n)> {
VLOG(2) << "constant[" << const_index << "] on device[" << device_index << "]";
context_->const_device_indexes.push_back(device_index);
context_->constants.push_back(const_node->data);
Emit(Instruction::LoadConst(const_index, NewRegister()));
Emit(Instruction::LoadConst(const_index, device_index, NewRegister()));
}

void VisitExpr_(const VarNode* var_node) final {
@@ -602,13 +589,21 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr& n)> {
})
.Match("memory.alloc_storage",
[this](const Array<Expr>& args, const Attrs& attrs, const Array<Type>& type_arg) {
ICHECK_EQ(args.size(), 2);
ICHECK_EQ(args.size(), 3);
// Compute the size of the allocation.
this->VisitExpr(args[0]);
auto size_register = last_register_;

ICHECK(args[1].as<ConstantNode>()); // Always a literal.
NDArray alignment_arr = args[1].as<ConstantNode>()->data;
auto const_shape = AsIgnoringOnDevice<ConstantNode>(args[1]);
std::vector<int64_t> raw_shape;
if (const_shape) {
NDArray shape = const_shape->data;
// TODO(@jroesch): we need to get an RFC done to standardize shape dtype
raw_shape = ToAllocTensorShape(shape);
}

ICHECK(args[2].as<ConstantNode>()); // Always a literal.
NDArray alignment_arr = args[2].as<ConstantNode>()->data;
ICHECK_EQ(alignment_arr->dtype.code, 0U)
<< "The dtype of constant shape must be int32 or int64, but got "
<< DLDataType2String(alignment_arr->dtype);
@@ -622,7 +617,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr& n)> {

Emit(Instruction::AllocStorage(size_register, alignment, dtype,
GetDeviceIndex(alloc_attrs->virtual_device),
NewRegister()));
raw_shape, NewRegister()));
})
.Match("vm.shape_of",
[this](const Array<Expr>& args, const Attrs& attrs, const Array<Type>& type_arg) {
@@ -739,7 +734,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr& n)> {

/*!
* \brief Compile a match value
* Generate byte code that compute the value specificed in val
* Generate byte code that compute the value specified in val
*
* \return The register number assigned for the final value
*/
@@ -946,9 +941,10 @@ void VMCompiler::LowerImpl(IRModule mod) {
for (const auto& virtual_device : context_.virtual_devices_) {
ICHECK(!virtual_device->IsFullyUnconstrained());
ICHECK_GT(virtual_device->device_type(), 0);
// TODO(mbs): We forget the memory scope.
exec_->virtual_devices.push_back(Device{/*device_type=*/virtual_device->device_type(),
/*device_id=*/virtual_device->virtual_device_id});
exec_->virtual_devices.push_back(
std::make_pair(Device{/*device_type=*/virtual_device->device_type(),
/*device_id=*/virtual_device->virtual_device_id},
virtual_device->memory_scope));
}
exec_->host_device_index = kHostDeviceIndex;

@@ -1068,6 +1064,7 @@ IRModule VMCompiler::OptimizeModuleImpl(IRModule mod) {
}

pass_seqs.push_back(transform::FuseOps());
pass_seqs.push_back(transform::AnnotateMemoryScope());

// Do layout rewrite for auto-scheduler.
transform::PassContext pass_ctx = PassContext::Current();
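Earlier in this file, the memory.alloc_storage matcher now reads a constant shape into a plain vector before emitting AllocStorage. A self-contained sketch of that conversion is shown below, assuming a rank-1 int32/int64 shape constant that lives in host memory (mirroring what ToAllocTensorShape does).

```cpp
#include <tvm/runtime/logging.h>
#include <tvm/runtime/ndarray.h>
#include <vector>

// Sketch only: turn a constant shape tensor into the raw vector carried by the
// AllocStorage instruction.
std::vector<int64_t> ShapeConstantToRawShape(const tvm::runtime::NDArray& shape) {
  std::vector<int64_t> raw_shape;
  ICHECK_EQ(shape->ndim, 1) << "shape constant is expected to be rank 1";
  for (int64_t i = 0; i < shape->shape[0]; ++i) {
    if (shape->dtype.bits == 64) {
      raw_shape.push_back(static_cast<const int64_t*>(shape->data)[i]);
    } else {  // assume int32 otherwise
      raw_shape.push_back(static_cast<const int32_t*>(shape->data)[i]);
    }
  }
  return raw_shape;
}
```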
4 changes: 3 additions & 1 deletion src/relay/backend/vm/manifest_lifetimes.cc
@@ -167,7 +167,9 @@ class AliasEliminator : public MixedModeMutator {
if (copy_props.src_virtual_device->device_type() ==
copy_props.dst_virtual_device->device_type() &&
copy_props.src_virtual_device->virtual_device_id ==
copy_props.dst_virtual_device->virtual_device_id) {
copy_props.dst_virtual_device->virtual_device_id &&
copy_props.src_virtual_device->memory_scope ==
copy_props.dst_virtual_device->memory_scope) {
Expr to_copy = Downcast<Call>(unwrapped)->args[0];
if (const VarNode* alias_of_n = to_copy.as<VarNode>()) {
alias_[var] = Downcast<Var>(VisitExpr_(alias_of_n));
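The extended condition amounts to requiring that the source and destination virtual devices are identical in all three respects; an equivalent helper, written here only for illustration:

```cpp
#include <tvm/target/virtual_device.h>

// Sketch: a device_copy can be treated as an alias only when device type, id,
// and memory scope all match; a buffer-to-texture copy must be kept.
bool SameVirtualDevice(const tvm::VirtualDevice& a, const tvm::VirtualDevice& b) {
  return a->device_type() == b->device_type() &&
         a->virtual_device_id == b->virtual_device_id &&
         a->memory_scope == b->memory_scope;
}
```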
20 changes: 14 additions & 6 deletions src/relay/op/memory/memory.cc
@@ -50,25 +50,32 @@ TVM_REGISTER_NODE_TYPE(AllocTensorAttrs);
// The passing value in attrs and args doesn't seem super great.
// We should consider a better solution, i.e the type relation
// being able to see the arguments as well?
Expr AllocStorage(Expr size, Expr alignment, VirtualDevice virtual_device, DataType dtype_hint) {
Expr AllocStorage(Expr size, Expr shape, Expr alignment, VirtualDevice virtual_device,
DataType dtype_hint) {
auto attrs = make_object<AllocStorageAttrs>();
attrs->dtype = dtype_hint;
attrs->virtual_device = std::move(virtual_device);
static const Op& op = Op::Get("memory.alloc_storage");
return Call(op, {std::move(size), std::move(alignment)}, Attrs(std::move(attrs)), {});
return Call(op, {std::move(size), std::move(shape), std::move(alignment)},
Attrs(std::move(attrs)), {});
}

TVM_REGISTER_GLOBAL("relay.op.memory._make.alloc_storage").set_body_typed(AllocStorage);

bool AllocStorageRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
const TypeReporter& reporter) {
ICHECK_EQ(types.size(), 3u);
ICHECK_EQ(types.size(), 4u);
auto size_type = types[0];
auto tensor_type = size_type.as<TensorTypeNode>();
ICHECK(tensor_type != nullptr);
ICHECK_EQ(tensor_type->dtype, DataType::Int(64));
ICHECK_EQ(tensor_type->shape.size(), 0);
auto align_type = types[1];

// Tensor shape
auto tt = types[1].as<TensorTypeNode>();
ICHECK(tt != nullptr) << "must be tensor type";

auto align_type = types[2];
auto align_ttype = align_type.as<TensorTypeNode>();
ICHECK(align_ttype != nullptr);
ICHECK_EQ(align_ttype->dtype, DataType::Int(64));
@@ -77,14 +84,15 @@ bool AllocStorageRel(const Array<Type>& types, int num_inputs, const Attrs& attr
ICHECK(mod.defined());
auto storage_name = mod->GetGlobalTypeVar("Storage");
auto storage = TypeCall(storage_name, {});
reporter->Assign(types[2], storage);
reporter->Assign(types[3], storage);
return true;
}

RELAY_REGISTER_OP("memory.alloc_storage")
.describe(R"code(Explicitly allocate storage to be used by tensors.)code" TVM_ADD_FILELINE)
.set_num_inputs(2)
.set_num_inputs(3)
.add_argument("size", "Tensor", "The size of the storage to allocate.")
.add_argument("shape", "Tensor", "The shape of the storage to allocate.")
.add_argument("alignment", "Tensor", "The alignment of the storage.")
.add_type_rel("AllocStorage", AllocStorageRel)
.set_attrs_type_key("relay.attrs.AllocStorageAttrs")
5 changes: 3 additions & 2 deletions src/relay/op/memory/memory.h
@@ -34,10 +34,11 @@
namespace tvm {
namespace relay {

Expr AllocStorage(Expr size, Expr alignment, VirtualDevice virtual_device, DataType dtype_hint);
Expr AllocStorage(Expr size, Expr shape, Expr alignment, VirtualDevice virtual_device,
DataType dtype_hint);
/*! \brief Returns the "memory.alloc_tensor" operator. */
const Op& MemoryAllocTensorOp();
Expr AllocTensor(Expr storage, Expr offset, tvm::relay::Expr shape, DataType dtype,
Expr AllocTensor(Expr storage, Expr offset, Expr shape, DataType dtype,
Array<IndexExpr> assert_shape);
Expr ToTupleType(const Type& ty, const std::vector<Expr>& exprs);
std::vector<Expr> FromTupleType(const Type& type, const Expr& expr);
9 changes: 9 additions & 0 deletions src/relay/transforms/annotate_texture_storage.cc
@@ -407,6 +407,15 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
if (pattern <= kCommReduce) {
if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
if (ttype->shape.size() == 5) {
auto node0 = ttype->shape[0].as<IntImmNode>();
auto node1 = ttype->shape[1].as<IntImmNode>();
auto node2 = ttype->shape[2].as<IntImmNode>();
auto node3 = ttype->shape[3].as<IntImmNode>();
auto node4 = ttype->shape[4].as<IntImmNode>();
// if any dimension is not a constant (i.e. the shape is dynamic), textures are not supported
if (!node0 || !node1 || !node2 || !node3 || !node4) {
return false;
}
supports_texture_storage = true;
}
}
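The added check can be read as "all five dimensions must be static"; a compact equivalent, written only as an illustration:

```cpp
#include <tvm/ir/tensor_type.h>

// Sketch: texture storage requires every dimension of the 5-d tensor type to be
// a compile-time constant; any dynamic (non-IntImm) dimension disables it.
bool AllDimsStatic(const tvm::TensorTypeNode* ttype) {
  for (const auto& dim : ttype->shape) {
    if (!dim.as<tvm::IntImmNode>()) {
      return false;  // dynamic dimension (e.g. Any)
    }
  }
  return true;
}
```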
7 changes: 4 additions & 3 deletions src/relay/transforms/device_domains.cc
@@ -236,12 +236,13 @@ DeviceDomainPtr DeviceDomains::DomainForCallee(const Call& call) {
args_and_result.emplace_back(ForVirtualDevice(device_copy_props.body->checked_type(),
device_copy_props.dst_virtual_device));
} else if (call->op == alloc_storage_op) {
ICHECK_EQ(call->args.size(), 2U);
// alloc_storage(size, alignment, virtual_device=<t>)
// alloc_storage: fn(<cpu>, <cpu>):<t>
ICHECK_EQ(call->args.size(), 3U);
// alloc_storage(size, shape, alignment, virtual_device=<t>)
// alloc_storage: fn(<cpu>, <cpu>, <cpu>):<t>
const auto* attrs = call->attrs.as<AllocStorageAttrs>();
args_and_result.emplace_back(host_domain_);
args_and_result.emplace_back(host_domain_);
args_and_result.emplace_back(host_domain_);
args_and_result.emplace_back(ForVirtualDevice(call->checked_type(), attrs->virtual_device));
} else if (call->op == alloc_tensor_op) {
ICHECK_EQ(call->args.size(), 3U);
4 changes: 2 additions & 2 deletions src/relay/transforms/memory_alloc.cc
@@ -260,7 +260,7 @@ class DialectRewriter : public transform::DeviceAwareExprMutator {
Expr alignment = ComputeAlignment(type->dtype);
// Run type inference later to get the correct type.
Var var("storage_" + name_hint, Type(nullptr));
Expr value = AllocStorage(size, alignment, virtual_device, type->dtype);
Expr value = AllocStorage(size, shape, alignment, virtual_device, type->dtype);
auto sto = scope->Push(var, MaybeOnDeviceFixed(value, virtual_device));

// TODO(@jroesch): There is a bug with typing based on the constant shape.
Expand Down Expand Up @@ -366,7 +366,7 @@ class DialectRewriter : public transform::DeviceAwareExprMutator {
// Alignment is directly captured in the instruction so don't wrap in "on_device".
auto alignment = ComputeAlignment(out_type->dtype);
Var sto_var("storage_" + std::to_string(i), Type(nullptr));
auto val = AllocStorage(size, alignment, virtual_device, out_type->dtype);
auto val = AllocStorage(size, out_shape, alignment, virtual_device, out_type->dtype);
storages.push_back(scope->Push(sto_var, MaybeOnDeviceFixed(val, virtual_device)));
}

2 changes: 1 addition & 1 deletion src/runtime/c_runtime_api.cc
@@ -152,7 +152,7 @@ static size_t GetDataAlignment(const DLDataType dtype) {

void* DeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
Optional<String> mem_scope) {
if (!mem_scope.defined() || mem_scope.value() == "global") {
if (!mem_scope.defined() || mem_scope.value() == "" || mem_scope.value() == "global") {
// by default, we can always redirect to the flat memory allocations
DLTensor temp;
temp.data = nullptr;
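The condition added here treats an undefined scope, an empty scope string, and "global" identically; a small sketch of that rule (the helper name is made up):

```cpp
#include <tvm/runtime/container/optional.h>
#include <tvm/runtime/container/string.h>

// Sketch: only a real scope name (e.g. an OpenCL texture scope) leaves the
// default flat-memory allocation path.
bool IsFlatMemoryScope(const tvm::runtime::Optional<tvm::runtime::String>& mem_scope) {
  return !mem_scope.defined() || mem_scope.value() == "" || mem_scope.value() == "global";
}
```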
2 changes: 1 addition & 1 deletion src/runtime/opencl/opencl_device_api.cc
@@ -239,7 +239,7 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment,

void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
Optional<String> mem_scope) {
if (!mem_scope.defined() || mem_scope.value() == "global") {
if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") {
return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope);
}
ICHECK(IsTextureStorage(std::string(mem_scope.value())))
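To show what a non-global scope means on the OpenCL side, here is a hedged allocation sketch. The device id, the 5-d shape, and the "global.texture" scope name are assumptions for illustration; any scope that is not empty or "global" must name a texture scope, or the ICHECK above fires.

```cpp
#include <tvm/runtime/device_api.h>

// Sketch: request a texture-backed allocation directly through the DeviceAPI.
void AllocTextureSketch() {
  DLDevice dev{kDLOpenCL, 0};
  tvm::runtime::DeviceAPI* api = tvm::runtime::DeviceAPI::Get(dev);
  int64_t shape[5] = {1, 16, 16, 16, 4};
  void* tex = api->AllocDataSpace(dev, /*ndim=*/5, shape, DLDataType{kDLFloat, 32, 1},
                                  tvm::runtime::String("global.texture"));
  api->FreeDataSpace(dev, tex);
}
```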