Merge branch 'pytorch:main' into Arm-backend-Improve-memory-config-and-documentation-in-the-runtime
pytorch · Sep 24, 2024 · 0f5f9d6 · 0f5f9d6
2 parents 005b43f + 3e79ea4
commit 0f5f9d6
Show file tree

Hide file tree

Showing 27 changed files with 841 additions and 124 deletions.
diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -707,8 +707,7 @@ void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) {
   const int dim1_whcn = sizes_.size() - 1 - dim1;
   if (packed_dim_ == dim0_whcn) {
     packed_dim_ = dim1_whcn;
-  }
-  if (packed_dim_ == dim1_whcn) {
+  } else if (packed_dim_ == dim1_whcn) {
     packed_dim_ = dim0_whcn;
   }
 
@@ -719,6 +718,12 @@ void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) {
     VK_CHECK_COND(dim0_whcn < 3 && dim1_whcn < 3);
     std::iter_swap(
         axis_map_.begin() + dim0_whcn, axis_map_.begin() + dim1_whcn);
+    // Update the "identity" of the concatted dimension
+    if (axis_map_.at(3) == dim0_whcn) {
+      axis_map_.at(3) = dim1_whcn;
+    } else if (axis_map_.at(3) == dim1_whcn) {
+      axis_map_.at(3) = dim0_whcn;
+    }
   }
   update_metadata();
 }

diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -198,6 +198,32 @@ std::vector<int64_t> ComputeGraph::sizes_of(const ValueRef idx) const {
   VK_THROW("Could not get sizes of value with type ", val.type());
 }
 
+int64_t ComputeGraph::dim_of(const ValueRef idx) const {
+  const Value& val = values_.at(idx);
+  if (val.isTensor()) {
+    return val.toConstTensor().dim();
+  } else if (val.isTensorRef()) {
+    return val.toConstTensorRef().sizes.size();
+  }
+  VK_THROW("Could not get dim of value with type ", val.type());
+}
+
+std::vector<int64_t> ComputeGraph::dim_order_of(const ValueRef idx) const {
+  const Value& val = values_.at(idx);
+  if (val.isTensor()) {
+    return val.toConstTensor().dim_order();
+  }
+  VK_THROW("Could not get dim order of value with type ", val.type());
+}
+
+std::vector<int64_t> ComputeGraph::strides_of(const ValueRef idx) const {
+  const Value& val = values_.at(idx);
+  if (val.isTensor()) {
+    return val.toConstTensor().strides();
+  }
+  VK_THROW("Could not get strides of value with type ", val.type());
+}
+
 vkapi::ScalarType ComputeGraph::dtype_of(const ValueRef idx) const {
   const Value& val = values_.at(idx);
   if (val.isTensor()) {

diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -282,6 +282,12 @@ class ComputeGraph final {
     VK_THROW("Could not get sizes of value with type ", val.type());
   }
 
+  int64_t dim_of(const ValueRef idx) const;
+
+  std::vector<int64_t> dim_order_of(const ValueRef idx) const;
+
+  std::vector<int64_t> strides_of(const ValueRef idx) const;
+
   vkapi::ScalarType dtype_of(const ValueRef idx) const;
 
   inline const utils::ivec3& logical_limits_of(const ValueRef idx) const {

diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h
@@ -21,16 +21,16 @@ class ComputeGraph;
  * access permission.
  */
 struct ArgGroup {
-  ArgGroup(const ValueRef ref, const vkapi::MemoryAccessType access)
+  ArgGroup(const ValueRef ref, const vkapi::MemoryAccessFlags access)
       : refs{ref}, access(access) {}
 
   ArgGroup(
       const std::vector<ValueRef>& refs,
-      const vkapi::MemoryAccessType access)
+      const vkapi::MemoryAccessFlags access)
       : refs(refs), access(access) {}
 
   const std::vector<ValueRef> refs;
-  const vkapi::MemoryAccessType access;
+  const vkapi::MemoryAccessFlags access;
 };
 
 /*

diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl
@@ -20,20 +20,19 @@ ${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_tensor(1, "r", "existing_out", DTYPE, STORAGE)}
 ${layout_declare_tensor(2, "r", "t_in", DTYPE, STORAGE)}
 
-layout(set = 0, binding = 3) uniform PRECISION restrict CopyArgs {
-  ivec4 out_sizes;
-  ivec4 in_sizes;
+${layout_declare_ubo(3, "ivec4", "out_sizes")}
+${layout_declare_ubo(4, "ivec4", "out_axis_map")}
+${layout_declare_ubo(5, "ivec4", "in_sizes")}
+${layout_declare_ubo(6, "ivec4", "in_axis_map")}
+layout(set = 0, binding = 7) uniform PRECISION restrict CopyArgs {
+  // Operates on (x, y, z) logical extents.
+  ivec3 range;
   // Analogous to the range variable in copy. It defines the number of
   // channels being copied.
   int channel_range;
-  int src_channel_offset;
-  int dst_channel_offset;
-  int unused;
-  // Operates on (x, y, z) extents.
-  ivec3 range;
-  int unused1;
   ivec3 dst_offset;
-  int unused2;
+  int dst_channel_offset;
+  int src_channel_offset;
 };
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
@@ -43,36 +42,36 @@ layout(constant_id = 3) const int packed_dim = C_DIM;
 void main() {
   // Note: Unlike other shaders, the range is often not equal to the destination
   // texture extent.
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-  if (any(greaterThanEqual(pos, range))) {
+  const ivec3 lpos = ivec3(gl_GlobalInvocationID);
+  if (any(greaterThanEqual(lpos, range))) {
     return;
   }
 
-  const ivec3 out_pos = pos + dst_offset;
+  const ivec3 out_lpos = lpos + dst_offset;
 
-  const ivec4 out_whcn = to_tensor_idx(out_pos, out_sizes, packed_dim);
+  const ivec4 out_tidx = lpos_to_tidx(out_lpos, out_sizes, out_axis_map.w, packed_dim);
 
   // First read the existing values to make sure the boundary values stay.
-  VEC4_T v = VEC4_T(texelFetch(existing_out, out_pos, 0));
+  VEC4_T v = load_texel_lpos(existing_out, out_lpos, out_axis_map);
 
+  ivec4 in_tidx = out_tidx;
   for (int i=0; i<4; i++) {
-    ivec4 in_whcn = out_whcn;
 
-    in_whcn.z = out_whcn.z - dst_channel_offset + i;
+    in_tidx[packed_dim] = out_tidx[packed_dim] - dst_channel_offset + i;
 
     // Handle the partial update for beginning of channel in an existing tensor.
     // If the source channel index is below zero or exceeds the range, we skip
     // updating the element to avoid overwriting existing data.
-    if ((in_whcn.z < 0) || (in_whcn.z >= channel_range)) {
+    if ((in_tidx[packed_dim] < 0) || (in_tidx[packed_dim] >= channel_range)) {
       continue;
     }
 
     // Readjust for the source offset.
-    in_whcn.z = in_whcn.z + src_channel_offset;
+    in_tidx[packed_dim] += src_channel_offset;
 
-    ivec4 in_elem_pos = to_texture_elem_pos(in_whcn, in_sizes, packed_dim);
-    v[i] = VEC4_T(texelFetch(t_in, in_elem_pos.xyz, 0))[in_elem_pos.w];
+    ivec4 in_posi = tidx_to_posi(in_tidx, in_sizes, in_axis_map, packed_dim);
+    v[i] = load_texel(t_in, in_posi.xyz)[in_posi.w];
   }
 
-  imageStore(t_out, out_pos, v);
+  write_texel_lpos(t_out, out_lpos, v, out_axis_map);
 }
diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp
@@ -139,28 +139,17 @@ void add_copy_channel_offset_node(
     uvec3 local_size = adaptive_work_group_size(global_size);
 
     const struct Block final {
-      utils::ivec4 out_sizes;
-      utils::ivec4 in_sizes;
-      int32_t channel_range;
-      int32_t src_channel_offset;
-      int32_t dst_channel_offset;
-      int32_t unused;
       ivec3 range;
-      int32_t unused1;
+      int32_t channel_range;
       ivec3 dst_offset;
-      int32_t unused2;
-
+      int32_t dst_channel_offset;
+      int32_t src_channel_offset;
     } channel_offset_params{
-        utils::make_whcn_ivec4(out_sizes),
-        utils::make_whcn_ivec4(in_sizes),
-        channel_range,
-        src_channel_offset,
-        dst_channel_offset,
-        0,
         utils::make_ivec3(global_size),
-        0,
+        channel_range,
         dst_offset,
-        0,
+        dst_channel_offset,
+        src_channel_offset,
     };
 
     auto shader = VK_KERNEL_FROM_STR(kernel_name);
@@ -177,7 +166,13 @@ void add_copy_channel_offset_node(
             {in, vkapi::MemoryAccessType::READ},
         },
         // Parameter buffers
-        {graph.create_params_buffer(channel_offset_params)},
+        {
+            t_out->sizes_ubo(),
+            t_out->axis_map_ubo(),
+            t_in->sizes_ubo(),
+            t_in->axis_map_ubo(),
+            graph.create_params_buffer(channel_offset_params),
+        },
         // Specialization Constants
         {}));
   }