[ET-VK] Integrate axis mapping into naive matrix multiplication shaders #5277

Closed · wants to merge 5 commits
118 changes: 71 additions & 47 deletions backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -89,11 +89,11 @@ std::vector<int64_t> calculate_strides(
* tensor. Thus the axis mapping can be considered to be in WHCN dimension
* order.
*
- * The last value `axis_mapping.at(3)` indicates the WHCN index of the tensor
+ * The last value `axis_map.at(3)` indicates the WHCN index of the tensor
* dimension along which batches will be concatenated. This dimension can be
* referred to as the "inner dimension". To determine which image texture axis is
* used for the concatenation, a double lookup will need to be performed
- * (axis_mapping.at(axis_mapping.at(3))).
+ * (axis_map.at(axis_map.at(3))).
*
* The reason for structuring the axis mapping this way is that, for the batch dim,
* two things need to be easily derived:
@@ -107,7 +107,7 @@ std::vector<int64_t> calculate_strides(
*
* The axis mapping allows for permuted views of texture-backed tensors.
*/
-std::vector<int64_t> default_axis_mapping() {
+std::vector<int64_t> default_axis_map() {
// Currently, all compute shaders have an assumption that the channels dim is
// used to combine with the batch dim of a tensor. However, once dim mapping
// is integrated into the tensor indexing logic for each compute shader, we
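For illustration, here is a minimal standalone sketch of the double lookup described in the comment above. It is not part of the diff; the {0, 1, 2, 2} default axis map is an assumption inferred from the batch-with-channels convention noted in default_axis_map.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Assumed default axis map, in WHCN order: width -> X (0), height -> Y (1),
  // channels -> Z (2); the final entry records that batches are concatenated
  // along the channels dim (WHCN index 2).
  std::vector<int64_t> axis_map = {0, 1, 2, 2};

  // Double lookup: first find which WHCN dim receives the batches, then find
  // which image axis that WHCN dim is mapped to.
  const int64_t concatted_whcn_dim = axis_map.at(3);           // 2 (channels)
  const int64_t batch_axis = axis_map.at(concatted_whcn_dim);  // 2 (image Z)
  assert(concatted_whcn_dim == 2 && batch_axis == 2);
  return 0;
}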
@@ -173,40 +173,40 @@ std::vector<int64_t> calculate_padded_sizes(

utils::uvec3 calculate_image_extents(
const std::vector<int64_t>& padded_sizes,
-const std::vector<int64_t>& axis_mapping,
+const std::vector<int64_t>& axis_map,
const utils::GPUMemoryLayout memory_layout) {
VK_CHECK_COND(padded_sizes.size() == 4);
-VK_CHECK_COND(axis_mapping.size() == 4);
+VK_CHECK_COND(axis_map.size() == 4);

utils::uvec3 extents({1, 1, 1});
-// First three elements of axis_mapping indicate which (X,Y,Z) image axis the
+// First three elements of axis_map indicate which (X,Y,Z) image axis the
// width, height, and channels dim of the tensor maps to.
for (int whcn_dim = 0; whcn_dim < 3; ++whcn_dim) {
-const int64_t axis = axis_mapping.at(whcn_dim);
+const int64_t axis = axis_map.at(whcn_dim);
const int64_t dim = padded_sizes.size() - 1 - whcn_dim;
extents[axis] = utils::safe_downcast<uint32_t>(padded_sizes.at(dim));
}

-// axis_mapping[3] indicates the WHCN index of the dimension used for batch
+// axis_map[3] indicates the WHCN index of the dimension used for batch
// concatenation. Thus a double lookup is required to determine the image axis
// used for batch concatenation.
-const int64_t concatted_whcn_dim = axis_mapping.at(3);
-const int64_t batch_axis = axis_mapping.at(concatted_whcn_dim);
+const int64_t concatted_whcn_dim = axis_map.at(3);
+const int64_t batch_axis = axis_map.at(concatted_whcn_dim);
// Multiply the extents of the batch axis by the batch size.
extents[batch_axis] *= padded_sizes.at(0);

switch (memory_layout) {
case utils::kWidthPacked:
-VK_CHECK_COND(extents[0] % 4 == 0);
-extents[0] /= 4;
+VK_CHECK_COND(extents[axis_map.at(0)] % 4 == 0);
+extents[axis_map.at(0)] /= 4;
break;
case utils::kHeightPacked:
-VK_CHECK_COND(extents[1] % 4 == 0);
-extents[1] /= 4;
+VK_CHECK_COND(extents[axis_map.at(1)] % 4 == 0);
+extents[axis_map.at(1)] /= 4;
break;
case utils::kChannelsPacked:
-VK_CHECK_COND(extents[2] % 4 == 0);
-extents[2] /= 4;
+VK_CHECK_COND(extents[axis_map.at(2)] % 4 == 0);
+extents[axis_map.at(2)] /= 4;
break;
}
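To make the extent calculation above concrete, the following self-contained walk-through mirrors the loop, the batch multiply, and the packed-dim division for one example tensor. It is a sketch, not library code, and assumes the default axis map {0, 1, 2, 2} with channels packing.

#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Padded NCHW sizes: N=2, C=8, H=4, W=6 (channels already a multiple of 4).
  const std::vector<int64_t> padded_sizes = {2, 8, 4, 6};
  const std::vector<int64_t> axis_map = {0, 1, 2, 2};  // assumed default
  std::array<uint32_t, 3> extents = {1, 1, 1};

  // W, H, C map to image axes through the first three axis_map entries.
  for (int whcn_dim = 0; whcn_dim < 3; ++whcn_dim) {
    const int64_t axis = axis_map.at(whcn_dim);
    const int64_t dim = padded_sizes.size() - 1 - whcn_dim;
    extents[axis] = static_cast<uint32_t>(padded_sizes.at(dim));
  }  // extents == {6, 4, 8}

  // Batches are concatenated along axis_map.at(axis_map.at(3)) == Z.
  extents[axis_map.at(axis_map.at(3))] *=
      static_cast<uint32_t>(padded_sizes.at(0));
  // extents == {6, 4, 16}

  // Channels-packed: four consecutive channel values share one texel.
  extents[axis_map.at(2)] /= 4;  // extents == {6, 4, 4}

  std::cout << extents[0] << " x " << extents[1] << " x " << extents[2] << "\n";
  return 0;
}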

@@ -229,25 +229,27 @@ vTensor::vTensor(
// Calculate tensor metadata
sizes_(sizes.begin(), sizes.end()),
dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)),
-axis_mapping_(default_axis_mapping()),
+axis_map_(default_axis_map()),
strides_(calculate_strides(sizes, dim_order_)),
numel_(utils::multiply_integers(sizes_)),
padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
padded_numel_(utils::multiply_integers(padded_sizes_)),
texture_limits_{{0, 0, 0}},
+logical_limits_{{0, 0, 0}},
// Utility Uniform Buffers that can be passed to shaders as arguments
sizes_uniform_(),
strides_uniform_(),
numel_uniform_(),
-axis_mapping_uniform_(),
+axis_map_uniform_(),
texture_limits_uniform_(),
+logical_limits_uniform_(),
// Construct Tensor storage
storage_(
context,
storage_type,
memory_layout_,
-axis_mapping_,
+axis_map_,
padded_sizes_,
dtype_,
allocate_memory) {
@@ -259,6 +261,8 @@ vTensor::vTensor(
utils::safe_downcast<int32_t>(storage_.image_extents_[0]),
utils::safe_downcast<int32_t>(storage_.image_extents_[1]),
utils::safe_downcast<int32_t>(storage_.image_extents_[2])};

+update_logical_limits();
}

if (dtype == vkapi::kHalf) {
@@ -275,7 +279,7 @@ vTensor::vTensor(const vTensor& other)
// Copy tensor size metadata
sizes_(other.sizes_.begin(), other.sizes_.end()),
dim_order_(other.dim_order_.begin(), other.dim_order_.end()),
-axis_mapping_(other.axis_mapping_.begin(), other.axis_mapping_.end()),
+axis_map_(other.axis_map_.begin(), other.axis_map_.end()),
strides_(other.strides_.begin(), other.strides_.end()),
numel_(other.numel_),
padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()},
@@ -284,12 +288,14 @@ vTensor::vTensor(const vTensor& other)
other.unsqueezed_strides_.end()},
padded_numel_(other.padded_numel_),
texture_limits_{other.texture_limits_},
+logical_limits_{other.logical_limits_},
// Empty initialize Utility Uniform Buffers
sizes_uniform_(),
strides_uniform_(),
numel_uniform_(),
-axis_mapping_uniform_(),
+axis_map_uniform_(),
texture_limits_uniform_(),
+logical_limits_uniform_(),
// Copy Tensor storage
storage_(other.storage_) {}

@@ -303,19 +309,21 @@ vTensor::vTensor(
// Copy tensor size metadata
sizes_(sizes.begin(), sizes.end()),
dim_order_(dim_order.begin(), dim_order.end()),
-axis_mapping_(default_axis_mapping()),
+axis_map_(default_axis_map()),
strides_(calculate_strides(sizes_, dim_order_)),
numel_(utils::multiply_integers(sizes_)),
padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
padded_numel_(utils::multiply_integers(padded_sizes_)),
-texture_limits_{{0, 0, 0}},
+texture_limits_{other.texture_limits_},
+logical_limits_(other.logical_limits_),
// Empty initialize Utility Uniform Buffers
sizes_uniform_(),
strides_uniform_(),
numel_uniform_(),
-axis_mapping_uniform_(),
+axis_map_uniform_(),
texture_limits_uniform_(),
+logical_limits_uniform_(),
// Copy Tensor storage
storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) {
VK_CHECK_COND(
@@ -356,12 +364,18 @@ vkapi::VulkanBuffer& vTensor::buffer(
return storage_.buffer_;
}

-utils::uvec3 vTensor::mapped_extents() const {
-utils::uvec3 m_extents;
-m_extents[0] = storage_.image_extents_[axis_mapping_.at(0)];
-m_extents[1] = storage_.image_extents_[axis_mapping_.at(1)];
-m_extents[2] = storage_.image_extents_[axis_mapping_.at(2)];
-return m_extents;
+void vTensor::update_logical_limits() {
+logical_limits_.limits[0] = texture_limits_.limits[axis_map_.at(0)];
+logical_limits_.limits[1] = texture_limits_.limits[axis_map_.at(1)];
+logical_limits_.limits[2] = texture_limits_.limits[axis_map_.at(2)];
}

+utils::uvec3 vTensor::logical_extents() const {
+utils::uvec3 logical_extents(
+{utils::safe_downcast<uint32_t>(logical_limits_.limits[0]),
+utils::safe_downcast<uint32_t>(logical_limits_.limits[1]),
+utils::safe_downcast<uint32_t>(logical_limits_.limits[2])});
+return logical_extents;
+}

const vkapi::BufferBindInfo vTensor::sizes_ubo() {
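For intuition, here is a sketch of the remapping that update_logical_limits() above performs. It uses a hypothetical permuted axis map {1, 0, 2, 2} (width mapped to image Y, height to image X) rather than the default, to show that the logical limits always come out in WHCN order.

#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical permuted axis map: width -> Y, height -> X, channels -> Z.
  const std::vector<int64_t> axis_map = {1, 0, 2, 2};
  // Physical texture limits, in image (X, Y, Z) order.
  const std::array<int32_t, 3> texture_limits = {4, 6, 16};
  std::array<int32_t, 3> logical_limits{};

  // Entry i of logical_limits is the extent of the image axis that WHCN
  // dim i maps to, so shaders can reason in WHCN terms regardless of the
  // physical axis assignment.
  for (int i = 0; i < 3; ++i) {
    logical_limits[i] = texture_limits[axis_map.at(i)];
  }  // logical_limits == {6, 4, 16}: width, height, channels/batch extents

  std::cout << logical_limits[0] << ", " << logical_limits[1] << ", "
            << logical_limits[2] << "\n";
  return 0;
}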
@@ -380,12 +394,12 @@ const vkapi::BufferBindInfo vTensor::strides_ubo() {
return vkapi::BufferBindInfo(strides_uniform_.buffer());
}

-const vkapi::BufferBindInfo vTensor::axis_mapping_ubo() {
-if (!axis_mapping_uniform_.buffer()) {
-axis_mapping_uniform_ =
-ParamsBuffer(storage_.context_, utils::make_ivec4(axis_mapping_));
+const vkapi::BufferBindInfo vTensor::axis_map_ubo() {
+if (!axis_map_uniform_.buffer()) {
+axis_map_uniform_ =
+ParamsBuffer(storage_.context_, utils::make_ivec4(axis_map_));
}
-return vkapi::BufferBindInfo(axis_mapping_uniform_.buffer());
+return vkapi::BufferBindInfo(axis_map_uniform_.buffer());
}

const vkapi::BufferBindInfo vTensor::texture_limits_ubo() {
@@ -395,6 +409,13 @@ const vkapi::BufferBindInfo vTensor::texture_limits_ubo() {
return vkapi::BufferBindInfo(texture_limits_uniform_.buffer());
}

+const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
+if (!logical_limits_uniform_.buffer()) {
+logical_limits_uniform_ = ParamsBuffer(storage_.context_, logical_limits_);
+}
+return vkapi::BufferBindInfo(logical_limits_uniform_.buffer());
+}

const vkapi::BufferBindInfo vTensor::numel_ubo() {
if (!numel_uniform_.buffer()) {
numel_uniform_ = ParamsBuffer(storage_.context_, numel_);
@@ -465,14 +486,16 @@ void vTensor::update_metadata() {
// Calculate the extents of the image texture that would have been required
// for a tensor of the new sizes.
utils::uvec3 virtual_extents =
-calculate_image_extents(padded_sizes_, axis_mapping_, memory_layout_);
+calculate_image_extents(padded_sizes_, axis_map_, memory_layout_);

// Update the texture limits to reflect the new virtual extents.
texture_limits_.limits = utils::ivec3{
utils::safe_downcast<int32_t>(virtual_extents[0]),
utils::safe_downcast<int32_t>(virtual_extents[1]),
utils::safe_downcast<int32_t>(virtual_extents[2])};

+update_logical_limits();

if (sizes_uniform_.buffer()) {
sizes_uniform_.update(utils::make_whcn_ivec4(sizes_));
}
@@ -482,20 +505,23 @@ void vTensor::update_metadata() {
if (numel_uniform_.buffer()) {
numel_uniform_.update(numel_);
}
-if (axis_mapping_uniform_.buffer()) {
-axis_mapping_uniform_.update(utils::make_ivec4(axis_mapping_));
+if (axis_map_uniform_.buffer()) {
+axis_map_uniform_.update(utils::make_ivec4(axis_map_));
}
if (texture_limits_uniform_.buffer()) {
texture_limits_uniform_.update(texture_limits_);
}
+if (logical_limits_uniform_.buffer()) {
+logical_limits_uniform_.update(logical_limits_);
+}
}

void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {
if (storage_type() != utils::kBuffer) {
// For texture storage check that the current texture is large enough for
// the new sizes of the tensor.
utils::uvec3 virtual_extents =
-calculate_image_extents(padded_sizes_, axis_mapping_, memory_layout_);
+calculate_image_extents(padded_sizes_, axis_map_, memory_layout_);

bool valid_resize = virtual_extents[0] <= image_extents()[0];
valid_resize = valid_resize && virtual_extents[1] <= image_extents()[1];
@@ -546,7 +572,7 @@ void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
update_metadata();
storage_.discard_and_reallocate(
calculate_padded_sizes(new_sizes, memory_layout_),
-axis_mapping_,
+axis_map_,
memory_layout_,
dtype_);
}
@@ -624,16 +650,14 @@ vTensorStorage::vTensorStorage(
Context* const context,
const utils::StorageType storage_type,
const utils::GPUMemoryLayout gpu_memory_layout,
-const std::vector<int64_t>& axis_mapping,
+const std::vector<int64_t>& axis_map,
const std::vector<int64_t>& padded_sizes,
const vkapi::ScalarType dtype,
const bool allocate_memory)
: context_(context),
storage_type_{storage_type},
-image_extents_(calculate_image_extents(
-padded_sizes,
-axis_mapping,
-gpu_memory_layout)),
+image_extents_(
+calculate_image_extents(padded_sizes, axis_map, gpu_memory_layout)),
buffer_length_{utils::multiply_integers(padded_sizes)},
buffer_offset_{0},
image_(allocate_image(
@@ -746,7 +770,7 @@ bool vTensorStorage::is_copy_of(const vTensorStorage& other) const {

void vTensorStorage::discard_and_reallocate(
const std::vector<int64_t>& padded_sizes,
-const std::vector<int64_t>& axis_mapping,
+const std::vector<int64_t>& axis_map,
const utils::GPUMemoryLayout gpu_memory_layout,
const vkapi::ScalarType dtype) {
const bool image_owns_memory = image_.owns_memory();
@@ -755,7 +779,7 @@ void vTensorStorage::discard_and_reallocate(
flush();

image_extents_ =
-calculate_image_extents(padded_sizes, axis_mapping, gpu_memory_layout);
+calculate_image_extents(padded_sizes, axis_map, gpu_memory_layout);
image_ = allocate_image(
context_,
image_extents_,