[VM][Textures] Enable OpenCL textures for VM (apache#15419)
* [VM][Textures] Enable OpenCL textures for VM

This commit introduces memory scopes to the VM and enables the use of textures.

The following changes have been made:
  - The AnnotateMemoryScope pass is now used in the VM compilation pipeline.
  - The VM now allows using more than one device with the same device type.
    Virtual devices in the VM also carry memory-scope information.
  - The LoadConst and AllocStorage instructions were extended to support
    textures.
  - The VM bytecode was updated to support memory scopes.
  - The annotate texture storage pass was updated to support dynamic shapes.
  - Some other minor changes have been made.

* Implement tests for vm

* Fix lint

* Fix tests

* Use union in allocate_storage struct

* Apply comments

* Fix copy ctor and assignment operator
echuraev authored Aug 8, 2023
1 parent 8cadd1f commit 34cacb0
Showing 27 changed files with 996 additions and 311 deletions.
8 changes: 5 additions & 3 deletions include/tvm/runtime/ndarray.h
@@ -110,9 +110,10 @@ class NDArray : public ObjectRef {
/*!
* \brief Copy the data to another device.
* \param dev The target device.
* \param mem_scope The memory scope of the target array.
* \return The array under another device.
*/
inline NDArray CopyTo(const Device& dev) const;
inline NDArray CopyTo(const Device& dev, Optional<String> mem_scope = NullOpt) const;
/*!
* \brief Load NDArray from stream
* \param stream The input data stream
@@ -398,10 +399,11 @@ inline void NDArray::CopyTo(const NDArray& other) const {
CopyFromTo(&(get_mutable()->dl_tensor), &(other.get_mutable()->dl_tensor));
}

inline NDArray NDArray::CopyTo(const Device& dev) const {
inline NDArray NDArray::CopyTo(const Device& dev, Optional<String> mem_scope) const {
ICHECK(data_ != nullptr);
const DLTensor* dptr = operator->();
NDArray ret = Empty(ShapeTuple(dptr->shape, dptr->shape + dptr->ndim), dptr->dtype, dev);
NDArray ret =
Empty(ShapeTuple(dptr->shape, dptr->shape + dptr->ndim), dptr->dtype, dev, mem_scope);
this->CopyTo(ret);
return ret;
}
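For context, here is a minimal usage sketch of the new overload. The OpenCL device id and the "global.texture" scope string are illustrative assumptions; real scopes are normally produced by the AnnotateMemoryScope pass rather than written by hand.

```cpp
#include <tvm/runtime/ndarray.h>

// Sketch: copy a host-side array into texture-backed memory on an OpenCL device.
void CopyToTextureSketch() {
  using tvm::runtime::NDArray;
  using tvm::runtime::String;
  NDArray host = NDArray::Empty({1, 16, 16, 16, 4}, DLDataType{kDLFloat, 32, 1},
                                DLDevice{kDLCPU, 0});
  // Passing a memory scope forwards it to Empty() on the target device.
  NDArray on_texture = host.CopyTo(DLDevice{kDLOpenCL, 0}, String("global.texture"));
  (void)on_texture;  // hand the texture-backed array to a texture-aware kernel
}
```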
19 changes: 15 additions & 4 deletions include/tvm/runtime/vm/bytecode.h
@@ -157,6 +157,8 @@ struct Instruction {
struct /* LoadConst Operands */ {
/* \brief The index into the constant pool. */
Index const_index;
/*! \brief The index of the device on which the load will be made. */
Index device_index;
};
struct /* LoadConsti Operands */ {
/* \brief The index into the constant pool. */
@@ -195,12 +197,18 @@ struct Instruction {
RegName* free_vars;
};
struct /* AllocStorage Operands */ {
/*! \brief The size of the allocation. */
RegName allocation_size;
/*! \brief The alignment of the allocation. */
Index alignment;
/*! \brief The hint of the dtype. */
DLDataType dtype_hint;
/*! \brief The number of dimensions. */
uint32_t ndim;
union {
/*! \brief The shape of tensor. */
int64_t* shape;
/*! \brief The size of the allocation. */
RegName allocation_size;
};
/*! \brief The index of the device on which the allocation will be made. */
Index device_index;
} alloc_storage;
@@ -332,10 +340,11 @@ struct Instruction {
/*!
* \brief Construct a load constant instruction.
* \param const_index The index of the constant.
* \param device_index The index of the device to load on.
* \param dst The destination register.
* \return The load constant instruction.
*/
static Instruction LoadConst(Index const_index, RegName dst);
static Instruction LoadConst(Index const_index, Index device_index, RegName dst);
/*!
* \brief Construct a load_constanti instruction.
 * \param val The integer constant value.
@@ -356,11 +365,13 @@
* \param alignment The allocation's alignment.
* \param dtype_hint The data type hint for the allocator.
* \param device_index The index of the device to allocate on.
* \param shape The shape of the allocation.
* \param dst The destination to place the storage.
* \return The alloc storage instruction.
*/
static Instruction AllocStorage(RegName size, Index alignment, DLDataType dtype_hint,
Index device_index, RegName dst);
Index device_index, const std::vector<int64_t>& shape,
RegName dst);
/*!
* \brief Get the shape of an input tensor.
* \param tensor The input tensor.
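Below is a sketch of how the extended instructions might be constructed during codegen. The register numbers, device index, and the 5-d shape are made-up values; the assumption, consistent with the union above, is that an empty shape selects the flat allocation_size path while a non-empty shape is recorded for texture allocations.

```cpp
#include <tvm/runtime/vm/bytecode.h>
#include <vector>

void EmitSketch() {
  using tvm::runtime::vm::Instruction;
  DLDataType f32{kDLFloat, 32, 1};

  // Texture-style allocation: a concrete 5-d shape travels with the instruction.
  std::vector<int64_t> texture_shape = {1, 16, 16, 16, 4};
  Instruction alloc_texture = Instruction::AllocStorage(
      /*size=*/0, /*alignment=*/64, f32, /*device_index=*/1, texture_shape, /*dst=*/3);

  // Flat allocation: no shape, so the size register in the union is used instead.
  Instruction alloc_flat = Instruction::AllocStorage(
      /*size=*/2, /*alignment=*/64, f32, /*device_index=*/1, /*shape=*/{}, /*dst=*/4);

  // Loading a constant now also names the device index it is materialized on.
  Instruction load = Instruction::LoadConst(/*const_index=*/0, /*device_index=*/1, /*dst=*/5);

  (void)alloc_texture; (void)alloc_flat; (void)load;
}
```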
5 changes: 3 additions & 2 deletions include/tvm/runtime/vm/executable.h
@@ -34,6 +34,7 @@
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

namespace tvm {
@@ -262,9 +263,9 @@ class TVM_DLL Executable : public ModuleNode {

/*!
* \brief The (compile-time, virtual) devices corresponding to each device index.
* Currently we only support at most one device per device type.
* This vector contains a pair Device and its memory_scope.
*/
std::vector<Device> virtual_devices;
std::vector<std::pair<Device, std::string>> virtual_devices;
/*!
* \brief The device index corresponding to the 'host' device. That will hold and evaluate
* shape-related data and code.
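An illustrative way to read the new layout is shown below; the helper is written only for this example and assumes an already-loaded Executable.

```cpp
#include <tvm/runtime/logging.h>
#include <tvm/runtime/vm/executable.h>

// Sketch: each entry now pairs a device with the memory scope it serves, so two
// entries may share a device type and differ only in scope (e.g. "" vs. "global.texture").
void DumpVirtualDevices(const tvm::runtime::vm::Executable& exec) {
  for (size_t i = 0; i < exec.virtual_devices.size(); ++i) {
    const DLDevice& dev = exec.virtual_devices[i].first;
    const std::string& scope = exec.virtual_devices[i].second;
    LOG(INFO) << "device[" << i << "]: type=" << static_cast<int>(dev.device_type)
              << ", id=" << dev.device_id << ", scope=\"" << scope << "\"";
  }
}
```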
41 changes: 19 additions & 22 deletions src/relay/backend/vm/compiler.cc
@@ -352,19 +352,6 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr& n)> {
return 0;
}

// However, otherwise we allow at most one VirtualDevice per device type.
// TODO(mbs): This will eventually need to account for memory scopes somehow so device_copy
// instructions can do the right thing.
itr = std::find_if(context_->virtual_devices_.begin() + 1, context_->virtual_devices_.end(),
[&virtual_device](const VirtualDevice& existing_virtual_device) {
return existing_virtual_device->device_type() ==
virtual_device->device_type();
});
CHECK(itr == context_->virtual_devices_.end())
<< "The VM does not currently support using more than one device with the same device type "
"for primitives, however the program is using the distinct scopes "
<< virtual_device << " and " << *itr << " of device type " << virtual_device->device_type();

ICHECK(virtual_device != host_virtual_device_);
Index index = context_->virtual_devices_.size();
VLOG(2) << "virtual_device[" << index << "] = " << virtual_device;
@@ -384,7 +371,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr& n)> {
VLOG(2) << "constant[" << const_index << "] on device[" << device_index << "]";
context_->const_device_indexes.push_back(device_index);
context_->constants.push_back(const_node->data);
Emit(Instruction::LoadConst(const_index, NewRegister()));
Emit(Instruction::LoadConst(const_index, device_index, NewRegister()));
}

void VisitExpr_(const VarNode* var_node) final {
@@ -602,13 +589,21 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr& n)> {
})
.Match("memory.alloc_storage",
[this](const Array<Expr>& args, const Attrs& attrs, const Array<Type>& type_arg) {
ICHECK_EQ(args.size(), 2);
ICHECK_EQ(args.size(), 3);
// Compute the size of the allocation.
this->VisitExpr(args[0]);
auto size_register = last_register_;

ICHECK(args[1].as<ConstantNode>()); // Always a literal.
NDArray alignment_arr = args[1].as<ConstantNode>()->data;
auto const_shape = AsIgnoringOnDevice<ConstantNode>(args[1]);
std::vector<int64_t> raw_shape;
if (const_shape) {
NDArray shape = const_shape->data;
// TODO(@jroesch): we need to get an RFC done to standardize shape dtype
raw_shape = ToAllocTensorShape(shape);
}

ICHECK(args[2].as<ConstantNode>()); // Always a literal.
NDArray alignment_arr = args[2].as<ConstantNode>()->data;
ICHECK_EQ(alignment_arr->dtype.code, 0U)
<< "The dtype of constant shape must be int32 or int64, but got "
<< DLDataType2String(alignment_arr->dtype);
@@ -622,7 +617,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr& n)> {

Emit(Instruction::AllocStorage(size_register, alignment, dtype,
GetDeviceIndex(alloc_attrs->virtual_device),
NewRegister()));
raw_shape, NewRegister()));
})
.Match("vm.shape_of",
[this](const Array<Expr>& args, const Attrs& attrs, const Array<Type>& type_arg) {
@@ -739,7 +734,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr& n)> {

/*!
* \brief Compile a match value
* Generate byte code that compute the value specificed in val
* Generate byte code that compute the value specified in val
*
* \return The register number assigned for the final value
*/
@@ -946,9 +941,10 @@ void VMCompiler::LowerImpl(IRModule mod) {
for (const auto& virtual_device : context_.virtual_devices_) {
ICHECK(!virtual_device->IsFullyUnconstrained());
ICHECK_GT(virtual_device->device_type(), 0);
// TODO(mbs): We forget the memory scope.
exec_->virtual_devices.push_back(Device{/*device_type=*/virtual_device->device_type(),
/*device_id=*/virtual_device->virtual_device_id});
exec_->virtual_devices.push_back(
std::make_pair(Device{/*device_type=*/virtual_device->device_type(),
/*device_id=*/virtual_device->virtual_device_id},
virtual_device->memory_scope));
}
exec_->host_device_index = kHostDeviceIndex;

@@ -1068,6 +1064,7 @@ IRModule VMCompiler::OptimizeModuleImpl(IRModule mod) {
}

pass_seqs.push_back(transform::FuseOps());
pass_seqs.push_back(transform::AnnotateMemoryScope());

// Do layout rewrite for auto-scheduler.
transform::PassContext pass_ctx = PassContext::Current();
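Earlier in this file, the memory.alloc_storage matcher now reads a constant shape into a plain vector before emitting AllocStorage. A self-contained sketch of that conversion is shown below, assuming a rank-1 int32/int64 shape constant that lives in host memory (mirroring what ToAllocTensorShape does).

```cpp
#include <tvm/runtime/logging.h>
#include <tvm/runtime/ndarray.h>
#include <vector>

// Sketch only: turn a constant shape tensor into the raw vector carried by the
// AllocStorage instruction.
std::vector<int64_t> ShapeConstantToRawShape(const tvm::runtime::NDArray& shape) {
  std::vector<int64_t> raw_shape;
  ICHECK_EQ(shape->ndim, 1) << "shape constant is expected to be rank 1";
  for (int64_t i = 0; i < shape->shape[0]; ++i) {
    if (shape->dtype.bits == 64) {
      raw_shape.push_back(static_cast<const int64_t*>(shape->data)[i]);
    } else {  // assume int32 otherwise
      raw_shape.push_back(static_cast<const int32_t*>(shape->data)[i]);
    }
  }
  return raw_shape;
}
```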
4 changes: 3 additions & 1 deletion src/relay/backend/vm/manifest_lifetimes.cc
@@ -167,7 +167,9 @@ class AliasEliminator : public MixedModeMutator {
if (copy_props.src_virtual_device->device_type() ==
copy_props.dst_virtual_device->device_type() &&
copy_props.src_virtual_device->virtual_device_id ==
copy_props.dst_virtual_device->virtual_device_id) {
copy_props.dst_virtual_device->virtual_device_id &&
copy_props.src_virtual_device->memory_scope ==
copy_props.dst_virtual_device->memory_scope) {
Expr to_copy = Downcast<Call>(unwrapped)->args[0];
if (const VarNode* alias_of_n = to_copy.as<VarNode>()) {
alias_[var] = Downcast<Var>(VisitExpr_(alias_of_n));
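The extended condition amounts to requiring that the source and destination virtual devices are identical in all three respects; an equivalent helper, written here only for illustration:

```cpp
#include <tvm/target/virtual_device.h>

// Sketch: a device_copy can be treated as an alias only when device type, id,
// and memory scope all match; a buffer-to-texture copy must be kept.
bool SameVirtualDevice(const tvm::VirtualDevice& a, const tvm::VirtualDevice& b) {
  return a->device_type() == b->device_type() &&
         a->virtual_device_id == b->virtual_device_id &&
         a->memory_scope == b->memory_scope;
}
```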
20 changes: 14 additions & 6 deletions src/relay/op/memory/memory.cc
@@ -50,25 +50,32 @@ TVM_REGISTER_NODE_TYPE(AllocTensorAttrs);
// The passing value in attrs and args doesn't seem super great.
// We should consider a better solution, i.e the type relation
// being able to see the arguments as well?
Expr AllocStorage(Expr size, Expr alignment, VirtualDevice virtual_device, DataType dtype_hint) {
Expr AllocStorage(Expr size, Expr shape, Expr alignment, VirtualDevice virtual_device,
DataType dtype_hint) {
auto attrs = make_object<AllocStorageAttrs>();
attrs->dtype = dtype_hint;
attrs->virtual_device = std::move(virtual_device);
static const Op& op = Op::Get("memory.alloc_storage");
return Call(op, {std::move(size), std::move(alignment)}, Attrs(std::move(attrs)), {});
return Call(op, {std::move(size), std::move(shape), std::move(alignment)},
Attrs(std::move(attrs)), {});
}

TVM_REGISTER_GLOBAL("relay.op.memory._make.alloc_storage").set_body_typed(AllocStorage);

bool AllocStorageRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
const TypeReporter& reporter) {
ICHECK_EQ(types.size(), 3u);
ICHECK_EQ(types.size(), 4u);
auto size_type = types[0];
auto tensor_type = size_type.as<TensorTypeNode>();
ICHECK(tensor_type != nullptr);
ICHECK_EQ(tensor_type->dtype, DataType::Int(64));
ICHECK_EQ(tensor_type->shape.size(), 0);
auto align_type = types[1];

// Tensor shape
auto tt = types[1].as<TensorTypeNode>();
ICHECK(tt != nullptr) << "must be tensor type";

auto align_type = types[2];
auto align_ttype = align_type.as<TensorTypeNode>();
ICHECK(align_ttype != nullptr);
ICHECK_EQ(align_ttype->dtype, DataType::Int(64));
@@ -77,14 +84,15 @@ bool AllocStorageRel(const Array<Type>& types, int num_inputs, const Attrs& attr
ICHECK(mod.defined());
auto storage_name = mod->GetGlobalTypeVar("Storage");
auto storage = TypeCall(storage_name, {});
reporter->Assign(types[2], storage);
reporter->Assign(types[3], storage);
return true;
}

RELAY_REGISTER_OP("memory.alloc_storage")
.describe(R"code(Explicitly allocate storage to be used by tensors.)code" TVM_ADD_FILELINE)
.set_num_inputs(2)
.set_num_inputs(3)
.add_argument("size", "Tensor", "The size of the storage to allocate.")
.add_argument("shape", "Tensor", "The shape of the storage to allocate.")
.add_argument("alignment", "Tensor", "The alignment of the storage.")
.add_type_rel("AllocStorage", AllocStorageRel)
.set_attrs_type_key("relay.attrs.AllocStorageAttrs")
5 changes: 3 additions & 2 deletions src/relay/op/memory/memory.h
@@ -34,10 +34,11 @@
namespace tvm {
namespace relay {

Expr AllocStorage(Expr size, Expr alignment, VirtualDevice virtual_device, DataType dtype_hint);
Expr AllocStorage(Expr size, Expr shape, Expr alignment, VirtualDevice virtual_device,
DataType dtype_hint);
/*! \brief Returns the "memory.alloc_tensor" operator. */
const Op& MemoryAllocTensorOp();
Expr AllocTensor(Expr storage, Expr offset, tvm::relay::Expr shape, DataType dtype,
Expr AllocTensor(Expr storage, Expr offset, Expr shape, DataType dtype,
Array<IndexExpr> assert_shape);
Expr ToTupleType(const Type& ty, const std::vector<Expr>& exprs);
std::vector<Expr> FromTupleType(const Type& type, const Expr& expr);
9 changes: 9 additions & 0 deletions src/relay/transforms/annotate_texture_storage.cc
@@ -407,6 +407,15 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
if (pattern <= kCommReduce) {
if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
if (ttype->shape.size() == 5) {
auto node0 = ttype->shape[0].as<IntImmNode>();
auto node1 = ttype->shape[1].as<IntImmNode>();
auto node2 = ttype->shape[2].as<IntImmNode>();
auto node3 = ttype->shape[3].as<IntImmNode>();
auto node4 = ttype->shape[4].as<IntImmNode>();
// if any dimension is not a constant (i.e. the shape is dynamic), textures are not supported
if (!node0 || !node1 || !node2 || !node3 || !node4) {
return false;
}
supports_texture_storage = true;
}
}
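The added check can be read as "all five dimensions must be static"; a compact equivalent, written only as an illustration:

```cpp
#include <tvm/ir/tensor_type.h>

// Sketch: texture storage requires every dimension of the 5-d tensor type to be
// a compile-time constant; any dynamic (non-IntImm) dimension disables it.
bool AllDimsStatic(const tvm::TensorTypeNode* ttype) {
  for (const auto& dim : ttype->shape) {
    if (!dim.as<tvm::IntImmNode>()) {
      return false;  // dynamic dimension (e.g. Any)
    }
  }
  return true;
}
```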
7 changes: 4 additions & 3 deletions src/relay/transforms/device_domains.cc
@@ -236,12 +236,13 @@ DeviceDomainPtr DeviceDomains::DomainForCallee(const Call& call) {
args_and_result.emplace_back(ForVirtualDevice(device_copy_props.body->checked_type(),
device_copy_props.dst_virtual_device));
} else if (call->op == alloc_storage_op) {
ICHECK_EQ(call->args.size(), 2U);
// alloc_storage(size, alignment, virtual_device=<t>)
// alloc_storage: fn(<cpu>, <cpu>):<t>
ICHECK_EQ(call->args.size(), 3U);
// alloc_storage(size, shape, alignment, virtual_device=<t>)
// alloc_storage: fn(<cpu>, <cpu>, <cpu>):<t>
const auto* attrs = call->attrs.as<AllocStorageAttrs>();
args_and_result.emplace_back(host_domain_);
args_and_result.emplace_back(host_domain_);
args_and_result.emplace_back(host_domain_);
args_and_result.emplace_back(ForVirtualDevice(call->checked_type(), attrs->virtual_device));
} else if (call->op == alloc_tensor_op) {
ICHECK_EQ(call->args.size(), 3U);
4 changes: 2 additions & 2 deletions src/relay/transforms/memory_alloc.cc
@@ -260,7 +260,7 @@ class DialectRewriter : public transform::DeviceAwareExprMutator {
Expr alignment = ComputeAlignment(type->dtype);
// Run type inference later to get the correct type.
Var var("storage_" + name_hint, Type(nullptr));
Expr value = AllocStorage(size, alignment, virtual_device, type->dtype);
Expr value = AllocStorage(size, shape, alignment, virtual_device, type->dtype);
auto sto = scope->Push(var, MaybeOnDeviceFixed(value, virtual_device));

// TODO(@jroesch): There is a bug with typing based on the constant shape.
Expand Down Expand Up @@ -366,7 +366,7 @@ class DialectRewriter : public transform::DeviceAwareExprMutator {
// Alignment is directly captured in the instruction so don't wrap in "on_device".
auto alignment = ComputeAlignment(out_type->dtype);
Var sto_var("storage_" + std::to_string(i), Type(nullptr));
auto val = AllocStorage(size, alignment, virtual_device, out_type->dtype);
auto val = AllocStorage(size, out_shape, alignment, virtual_device, out_type->dtype);
storages.push_back(scope->Push(sto_var, MaybeOnDeviceFixed(val, virtual_device)));
}

2 changes: 1 addition & 1 deletion src/runtime/c_runtime_api.cc
@@ -152,7 +152,7 @@ static size_t GetDataAlignment(const DLDataType dtype) {

void* DeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
Optional<String> mem_scope) {
if (!mem_scope.defined() || mem_scope.value() == "global") {
if (!mem_scope.defined() || mem_scope.value() == "" || mem_scope.value() == "global") {
// by default, we can always redirect to the flat memory allocations
DLTensor temp;
temp.data = nullptr;
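The condition added here treats an undefined scope, an empty scope string, and "global" identically; a small sketch of that rule (the helper name is made up):

```cpp
#include <tvm/runtime/container/optional.h>
#include <tvm/runtime/container/string.h>

// Sketch: only a real scope name (e.g. an OpenCL texture scope) leaves the
// default flat-memory allocation path.
bool IsFlatMemoryScope(const tvm::runtime::Optional<tvm::runtime::String>& mem_scope) {
  return !mem_scope.defined() || mem_scope.value() == "" || mem_scope.value() == "global";
}
```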
2 changes: 1 addition & 1 deletion src/runtime/opencl/opencl_device_api.cc
@@ -239,7 +239,7 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment,

void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
Optional<String> mem_scope) {
if (!mem_scope.defined() || mem_scope.value() == "global") {
if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") {
return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope);
}
ICHECK(IsTextureStorage(std::string(mem_scope.value())))
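To show what a non-global scope means on the OpenCL side, here is a hedged allocation sketch. The device id, the 5-d shape, and the "global.texture" scope name are assumptions for illustration; any scope that is not empty or "global" must name a texture scope, or the ICHECK above fires.

```cpp
#include <tvm/runtime/device_api.h>

// Sketch: request a texture-backed allocation directly through the DeviceAPI.
void AllocTextureSketch() {
  DLDevice dev{kDLOpenCL, 0};
  tvm::runtime::DeviceAPI* api = tvm::runtime::DeviceAPI::Get(dev);
  int64_t shape[5] = {1, 16, 16, 16, 4};
  void* tex = api->AllocDataSpace(dev, /*ndim=*/5, shape, DLDataType{kDLFloat, 32, 1},
                                  tvm::runtime::String("global.texture"));
  api->FreeDataSpace(dev, tex);
}
```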