pytorch · jorgep31415 · Mar 8, 2024
diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.h b/backends/vulkan/runtime/graph/ops/impl/Staging.h
@@ -35,6 +35,9 @@ struct StagingParams final {
 
 ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v);
 
+// Expose for the Vulkan Compute API tests.
+StagingParams create_staging_params(const vTensor& t);
+
 } // namespace vulkan
 } // namespace native
 } // namespace at

diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -10,21 +10,14 @@
 
 #include <ATen/native/vulkan/api/api.h>
 
-#include <ATen/native/vulkan/impl/Arithmetic.h>
-#include <ATen/native/vulkan/impl/Common.h>
-#include <ATen/native/vulkan/impl/Packing.h>
-
+#include <executorch/backends/vulkan/runtime/graph/ops/OpUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/StagingUtils.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
 
 using namespace at::native::vulkan;
 
-//
-// Utilities
-//
-
 #define CREATE_FLOAT_TEXTURE(sizes, allocate_memory) \
   vTensor(                                           \
       api::context(),                                \
@@ -43,23 +36,159 @@ using namespace at::native::vulkan;
       api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, \
       allocate_memory);
 
+//
+// Simplified versions of ATen Vulkan legacy functions
+//
+
+void record_nchw_to_buffer_op(
+    api::Context* const context,
+    api::VulkanBuffer& src_buffer,
+    vTensor& v_dst) {
+  uint32_t buf_len = api::utils::safe_downcast<uint32_t>(v_dst.gpu_numel());
+  api::utils::uvec3 global_size = {buf_len, 1u, 1u};
+  api::utils::uvec3 local_size = {32u, 1u, 1u};
+
+  api::UniformParamsBuffer cpu_buffer_metadata(
+      context, v_dst.get_cpu_buffer_metadata());
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      VK_KERNEL(buffer_to_buffer),
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      v_dst.buffer(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_dst.buffer_metadata(),
+      src_buffer,
+      cpu_buffer_metadata.buffer());
+}
+
+bool record_buffer_to_nchw_op(
+    api::Context* const context,
+    vTensor& v_src,
+    api::VulkanBuffer& dst_buffer) {
+  uint32_t buf_len = api::utils::safe_downcast<uint32_t>(v_src.numel());
+  api::utils::uvec3 global_size = {buf_len, 1u, 1u};
+  api::utils::uvec3 local_size = {4u, 1u, 1u};
+
+  api::UniformParamsBuffer cpu_buffer_metadata(
+      context, v_src.get_cpu_buffer_metadata());
+  api::PipelineBarrier pipeline_barrier{};
+
+  return context->submit_compute_job(
+      VK_KERNEL(buffer_to_buffer),
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      dst_buffer,
+      cpu_buffer_metadata.buffer(),
+      v_src.buffer(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_src.buffer_metadata());
+}
+
+void record_nchw_to_image_op(
+    api::Context* const context,
+    api::VulkanBuffer& src_buffer,
+    vTensor& v_dst) {
+  api::utils::uvec3 global_size = v_dst.extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  api::UniformParamsBuffer params(context, create_staging_params(v_dst));
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      get_nchw_to_image_shader(v_dst),
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      v_dst.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      src_buffer,
+      params.buffer());
+}
+
+bool record_image_to_nchw_op(
+    api::Context* const context,
+    vTensor& v_src,
+    api::VulkanBuffer& dst_buffer) {
+  api::utils::uvec3 global_size = v_src.extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  api::UniformParamsBuffer params(context, create_staging_params(v_src));
+  api::PipelineBarrier pipeline_barrier{};
+
+  return context->submit_compute_job(
+      get_image_to_nchw_shader(v_src),
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      v_src.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      dst_buffer,
+      params.buffer());
+}
+
+void record_arithmetic_op(
+    api::Context* const context,
+    const api::ShaderInfo& compute_shader,
+    vTensor& v_in1,
+    vTensor& v_in2,
+    vTensor& v_dst,
+    const float alpha) {
+  api::utils::uvec3 global_size = v_dst.extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  ArithmeticParams block{
+      get_size_as_ivec4(v_dst),
+      get_size_as_ivec4(v_in1),
+      get_size_as_ivec4(v_in2),
+      alpha,
+  };
+  api::UniformParamsBuffer params(context, block);
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      compute_shader,
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      v_dst.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_in1.image(pipeline_barrier, api::PipelineStage::COMPUTE),
+      v_in2.image(pipeline_barrier, api::PipelineStage::COMPUTE),
+      params.buffer());
+}
+
+//
+// Utilities
+//
+
 void fill_vtensor(vTensor& vten, std::vector<float>& data) {
   api::StorageBuffer staging_buffer(api::context(), api::kFloat, data.size());
 
   copy_ptr_to_staging(data.data(), staging_buffer, vten.gpu_nbytes());
 
   if (vten.storage_type() == api::StorageType::BUFFER) {
-    packing::record_nchw_to_buffer_op(
-        api::context(), staging_buffer.buffer(), vten, {}, VK_NULL_HANDLE);
+    record_nchw_to_buffer_op(api::context(), staging_buffer.buffer(), vten);
   } else {
-    api::ShaderInfo compute_shader = packing::get_nchw_to_image_shader(vten);
-    packing::record_nchw_to_image_op(
-        api::context(),
-        compute_shader,
-        staging_buffer.buffer(),
-        vten,
-        {},
-        VK_NULL_HANDLE);
+    record_nchw_to_image_op(api::context(), staging_buffer.buffer(), vten);
   }
 }
 
@@ -75,17 +204,9 @@ void extract_vtensor(vTensor& vten, std::vector<float>& data) {
       api::context(), api::kFloat, vten.gpu_numel());
 
   if (vten.storage_type() == api::StorageType::BUFFER) {
-    packing::record_buffer_to_nchw_op(
-        api::context(), vten, staging_buffer.buffer(), {}, VK_NULL_HANDLE);
+    record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer());
   } else {
-    api::ShaderInfo compute_shader = packing::get_image_to_nchw_shader(vten);
-    packing::record_image_to_nchw_op(
-        api::context(),
-        compute_shader,
-        vten,
-        staging_buffer.buffer(),
-        {},
-        VK_NULL_HANDLE);
+    record_image_to_nchw_op(api::context(), vten, staging_buffer.buffer());
   }
 
   api::VulkanFence fence = api::context()->fences().get_fence();
@@ -208,14 +329,14 @@ TEST_F(VulkanComputeAPITest, texture_add_sanity_check) {
   std::fill(data_b.begin(), data_b.end(), 1.5f);
 
   // Add shader kernel
-  api::ShaderInfo kernel = arithmetic::get_shader(arithmetic::OpType::ADD);
+  api::ShaderInfo kernel = VK_KERNEL(add);
 
   // Fill input tensors
   fill_vtensor(a, data_a);
   fill_vtensor(b, data_b);
 
   // a + b -> c
-  arithmetic::record_op(api::context(), kernel, a, b, c, 1.0f);
+  record_arithmetic_op(api::context(), kernel, a, b, c, 1.0f);
 
   // Extract output tensor
   std::vector<float> data_out(c.gpu_numel());
@@ -244,7 +365,7 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) {
   std::vector<float> data_b(b.gpu_numel());
   std::fill(data_b.begin(), data_b.end(), 1.5f);
 
-  api::ShaderInfo kernel = arithmetic::get_shader(arithmetic::OpType::ADD);
+  api::ShaderInfo kernel = VK_KERNEL(add);
 
   // Allocate memory at the last possible opportunity
   api::MemoryAllocation a_mem = allocate_memory_for(a);
@@ -260,7 +381,7 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) {
   fill_vtensor(a, data_a);
   fill_vtensor(b, data_b);
 
-  arithmetic::record_op(api::context(), kernel, a, b, c, 1.0f);
+  record_arithmetic_op(api::context(), kernel, a, b, c, 1.0f);
 
   std::vector<float> data_c(c.gpu_numel());
   extract_vtensor(c, data_c);
@@ -310,20 +431,20 @@ TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) {
   std::fill(data_d.begin(), data_d.end(), 1.0f);
 
   // Get shader kernel for add
-  api::ShaderInfo kernel = arithmetic::get_shader(arithmetic::OpType::ADD);
+  api::ShaderInfo kernel = VK_KERNEL(add);
 
   // First, fill a and b with data
   fill_vtensor(a, data_a);
   fill_vtensor(b, data_b);
 
   // a + b -> c
-  arithmetic::record_op(api::context(), kernel, a, b, c, 1.0f);
+  record_arithmetic_op(api::context(), kernel, a, b, c, 1.0f);
 
   // Now d can be filled with data
   fill_vtensor(d, data_d);
 
   // c + d -> e
-  arithmetic::record_op(api::context(), kernel, c, d, e, 1.0f);
+  record_arithmetic_op(api::context(), kernel, c, d, e, 1.0f);
 
   // Extract data from e
   std::vector<float> data_e(e.gpu_numel());