
Commit 2459ecf

Update on "[ET-VK] Return fence after waiting is done."

This change returns a fence to the fence pool after it has been waited on.

Differential Revision: [D74484825](https://our.internmc.facebook.com/intern/diff/D74484825/)

[ghstack-poisoned]

2 parents 0283b4c + ab22cbb
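
Note that the merge diff below only brings this branch up to date with main; the fence-pool change the message describes lives elsewhere in the ET-VK runtime and is not shown here. As a rough sketch of the pattern the commit message describes (hypothetical: the FencePool class, its members, and its method names are illustrative assumptions, not the actual ET-VK API):

// Hypothetical sketch of "return the fence to the pool once it has been
// waited on", per the commit message. Not the real ET-VK implementation.
#include <vulkan/vulkan.h>
#include <cstdint>
#include <vector>

class FencePool {
 public:
  explicit FencePool(VkDevice device) : device_(device) {}

  ~FencePool() {
    for (VkFence fence : free_fences_) {
      vkDestroyFence(device_, fence, nullptr);
    }
  }

  // Hand out a recycled fence if one is available, otherwise create one.
  VkFence get_fence() {
    if (!free_fences_.empty()) {
      VkFence fence = free_fences_.back();
      free_fences_.pop_back();
      vkResetFences(device_, 1u, &fence); // reset before reuse
      return fence;
    }
    VkFence fence = VK_NULL_HANDLE;
    const VkFenceCreateInfo create_info{
        VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, nullptr, 0u};
    vkCreateFence(device_, &create_info, nullptr, &fence);
    return fence;
  }

  // The behavior the commit message describes: once the submission guarded
  // by this fence has been waited on, the fence goes straight back to the
  // pool instead of lingering with its owner.
  void wait_and_return(VkFence fence) {
    vkWaitForFences(device_, 1u, &fence, VK_TRUE, UINT64_MAX);
    free_fences_.push_back(fence);
  }

 private:
  VkDevice device_;
  std::vector<VkFence> free_fences_;
};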

File tree: 17 files changed, +273 -91 lines

.github/workflows/pull.yml

Lines changed: 1 addition & 3 deletions

@@ -434,9 +434,7 @@ jobs:
       output=$(ls -la cmake-out/test/size_test)
       arr=($output)
       size=${arr[4]}
-      # threshold=48120 on devserver with gcc11.4
-      # todo(lfq): update once binary size is below 50kb.
-      threshold="47552"
+      threshold="47560"
       if [[ "$size" -le "$threshold" ]]; then
         echo "Success $size <= $threshold"
       else

backends/arm/test/conftest.py

Lines changed: 11 additions & 1 deletion

@@ -44,10 +44,20 @@ def pytest_configure(config):
     if getattr(config.option, "fast_fvp", False):
         pytest._test_options["fast_fvp"] = config.option.fast_fvp  # type: ignore[attr-defined]

+    pytest._test_options["tosa_version"] = "0.80"  # type: ignore[attr-defined]
     if config.option.arm_run_tosa_version:
         pytest._test_options["tosa_version"] = config.option.arm_run_tosa_version

-    pytest._test_options["tosa_ref_model"] = True  # type: ignore[attr-defined]
+    # Not all deployments of ET have the TOSA reference model available.
+    # Make sure we don't try to use it if it's not available.
+    try:
+        if pytest._test_options["tosa_version"] == "0.80":
+            import tosa_tools.v0_80.tosa_reference_model as tosa_reference_model
+        else:
+            import tosa_tools.tosa_ref_model as tosa_reference_model
+    except ImportError:
+        pytest._test_options["tosa_ref_model"] = False  # type: ignore[attr-defined]
+        tosa_reference_model = None  # noqa

     logging.basicConfig(level=logging.INFO, stream=sys.stdout)

backends/cadence/aot/fuse_ops.py

Lines changed: 11 additions & 2 deletions

@@ -885,6 +885,9 @@ class FuseTransposeOrPermuteOpPairsPass(FuseOpPairsAcrossBranchesPass):
     """
     Fuse transpose or permute op pairs to a single view op.
    (transpose or permutation) -> (quant or dequant) -> (transpose or permutation)
+    This happens when op2(op1) == identity, modulo unitary dimensions.
+    'Unitary dimensions' example: a tensor of shape [1, 5, 30] is equivalent (in memory) to [5, 1, 30],
+    so transpose(1, 2) then transpose(0, 2) is a pseudo-identity and should be fused.
     """

     # A list of ops that can be bypassed when looking for a

@@ -908,7 +911,7 @@ def can_fuse_for_chain(
         if not super().can_fuse_for_chain(producer, consumer, consumer_op_packets):
             return False

-        # checking that permut2(permut1(identify)) == identity
+        # checking that permut2(permut1(identity)) == identity, modulo unitary dimensions
         input_shape = cast(torch.fx.Node, producer.args[0]).meta["val"].shape
         ident_dims = list(range(len(input_shape)))
         # this mapping helps to handle both transpose and permutations

@@ -918,14 +921,20 @@
         }
         in_dims = f[producer.target](producer, ident_dims)
         out_dims = f[consumer.target](consumer, in_dims)
-        return out_dims == ident_dims
+        # Filter out unitary dimensions
+        non_unit_ident_dims = [dim for dim in ident_dims if input_shape[dim] != 1]
+        non_unit_out_dims = [dim for dim in out_dims if input_shape[dim] != 1]
+        return non_unit_out_dims == non_unit_ident_dims

     def get_fused_node(
         self,
         producer: torch.fx.Node,
         consumer: torch.fx.Node,
         graph_module: torch.fx.GraphModule,
     ) -> torch.fx.Node:
+        # This step matters because we may fuse transpositions that are not perfect
+        # inverses of one another yet are equivalent once unitary dimensions are ignored.
+        # The fused operation must have the same output shape as the consumer.
         output_shape = consumer.meta["val"].shape
         with graph_module.graph.inserting_after(consumer):
             view = graph_module.graph.call_function(
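
The shape-aware check above can be reproduced standalone. A minimal sketch, assuming the producer's and consumer's dim orders have already been composed into a single permutation (the pass derives these from the FX nodes via its f[...] mapping; the function and parameter names here are illustrative):

// Standalone sketch of the "identity modulo unitary dimensions" test that
// can_fuse_for_chain performs; plain vectors stand in for the FX metadata.
#include <cstdint>
#include <vector>

bool is_pseudo_identity(
    const std::vector<int64_t>& shape,
    const std::vector<size_t>& composed_perm) {
  std::vector<size_t> non_unit_ident;
  std::vector<size_t> non_unit_perm;
  for (size_t d = 0; d < shape.size(); ++d) {
    if (shape[d] != 1) {
      non_unit_ident.push_back(d); // identity order, size-1 dims dropped
    }
  }
  for (size_t d : composed_perm) {
    if (shape[d] != 1) {
      non_unit_perm.push_back(d); // permuted order, size-1 dims dropped
    }
  }
  return non_unit_perm == non_unit_ident;
}

// The docstring's example: on shape [1, 5, 30], transpose(1, 2) followed by
// transpose(0, 2) composes to the permutation [1, 2, 0], which only moves the
// size-1 dimension, so the pair collapses to a view:
//   is_pseudo_identity({1, 5, 30}, {1, 2, 0})  -> true
//   is_pseudo_identity({5, 6, 30}, {1, 2, 0})  -> false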

backends/cadence/aot/tests/test_fusion_ops_passes.py

Lines changed: 44 additions & 0 deletions

@@ -584,6 +584,28 @@ def _create_operator(
            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
            False,
        ),
+        # transpose -> quant -> transpose where the second is not the inverse,
+        # BUT there is a UNITARY dimension so the result is the same in memory => fuse
+        (
+            True,
+            [0, 1],
+            True,
+            [0, 2],
+            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            True,
+            [5, 40, 1],
+        ),
+        # transpose -> quant -> transpose where the second is not the inverse,
+        # and unitary dimensions don't help => don't fuse
+        (
+            True,
+            [0, 1],
+            True,
+            [1, 3],
+            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            False,
+            [5, 40, 1, 4],
+        ),
        # permutation -> quant -> opposite permutation => fuse
        (
            False,

@@ -622,6 +644,28 @@ def _create_operator(
            False,
            [4, 4, 4],
        ),
+        # permutation -> quant -> a non-inverse permutation, BUT there is a UNITARY
+        # dimension so the result is the same in memory => fuse
+        (
+            False,
+            [1, 3, 2, 0],
+            False,
+            [3, 2, 1, 0],
+            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            True,
+            [3, 1, 8, 10],
+        ),
+        # permutation -> quant -> a non-inverse permutation, and unitary dimensions
+        # don't help => don't fuse
+        (
+            False,
+            [1, 3, 2, 0],
+            False,
+            [3, 1, 2, 0],
+            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            False,
+            [3, 1, 8, 10],
+        ),
        # transpose -> quant -> transpose as a permutation => fuse
        (
            True,

backends/cadence/hifi/operators/operators.h

Lines changed: 39 additions & 0 deletions

@@ -12,6 +12,7 @@
   _(uint8_t, Byte) \
   _(int8_t, Char)

+using ::executorch::aten::IntArrayRef;
 using ::executorch::aten::optional;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;

@@ -67,6 +68,44 @@ void quantized_linear_per_tensor_out(
     __ET_UNUSED const optional<Tensor>& offset,
     Tensor& out);

+void quantized_conv_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    const Tensor& weight_zero_point,
+    const Tensor& bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    __ET_UNUSED const Tensor& out_multiplier,
+    __ET_UNUSED const Tensor& out_shift,
+    bool channel_last,
+    Tensor& out);
+
+void quantized_conv_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    double bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    __ET_UNUSED int64_t out_multiplier,
+    __ET_UNUSED int64_t out_shift,
+    bool channel_last,
+    Tensor& out);
+
 } // namespace native
 } // namespace HiFi
 } // namespace impl

backends/vulkan/runtime/api/Context.cpp

Lines changed: 10 additions & 12 deletions

@@ -235,6 +235,15 @@ Context* context() {
        8u, // cmdPoolBatchSize
    };

+    const vkapi::DescriptorPoolConfig descriptor_pool_config{
+        VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorPoolMaxSets
+        VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorUniformBufferCount
+        VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStorageBufferCount
+        VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorCombinedSamplerCount
+        VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStorageImageCount
+        32u, // descriptorPileSizes
+    };
+
    const vkapi::QueryPoolConfig query_pool_config{
        VULKAN_QUERY_POOL_SIZE, // maxQueryCount
        256u, // initialReserveSize

@@ -243,7 +252,7 @@ Context* context() {
    const ContextConfig config{
        cmd_submit_frequency,
        cmd_config,
-        {},
+        descriptor_pool_config,
        query_pool_config,
    };

@@ -257,17 +266,6 @@ Context* context() {
  return context.get();
}

-vkapi::DescriptorPoolConfig default_descriptor_pool_config() {
-  return {
-      VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorPoolMaxSets
-      VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorUniformBufferCount
-      VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStorageBufferCount
-      VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorCombinedSamplerCount
-      VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStorageImageCount
-      32u, // descriptorPileSizes
-  };
-}
-
#ifdef VULKAN_DEBUG

#ifdef VK_KHR_pipeline_executable_properties

backends/vulkan/runtime/api/Context.h

Lines changed: 0 additions & 2 deletions

@@ -267,8 +267,6 @@ bool available();
// a static local variable.
Context* context();

-vkapi::DescriptorPoolConfig default_descriptor_pool_config();
-
namespace detail {

inline void arg_is_empty(

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 6 additions & 6 deletions

@@ -594,13 +594,13 @@ void ComputeGraph::prepare() {
          prepack_descriptor_counts_.field) * \
      config_.descriptor_pool_safety_factor))

-  const uint32_t max_sets = MERGE_FIELD(descriptor_pool_max_sets);
-  const vkapi::DescriptorPoolConfig config{
+  uint32_t max_sets = MERGE_FIELD(descriptor_pool_max_sets);
+  vkapi::DescriptorPoolConfig config{
      max_sets,
-      MERGE_FIELD(descriptor_uniform_buffer_count),
-      MERGE_FIELD(descriptor_storage_buffer_count),
-      MERGE_FIELD(descriptor_combined_sampler_count),
-      MERGE_FIELD(descriptor_storage_image_count),
+      std::max(MERGE_FIELD(descriptor_uniform_buffer_count), max_sets),
+      std::max(MERGE_FIELD(descriptor_storage_buffer_count), max_sets),
+      std::max(MERGE_FIELD(descriptor_combined_sampler_count), max_sets),
+      std::max(MERGE_FIELD(descriptor_storage_image_count), max_sets),
      1u,
  };

backends/vulkan/runtime/graph/ops/DispatchNode.cpp

Lines changed: 2 additions & 1 deletion

@@ -33,7 +33,6 @@ DispatchNode::DispatchNode(
      spec_vars_(spec_vars),
      push_constants_(push_constants) {
  graph.update_descriptor_counts(shader, /*execute = */ true);
-  graph.context()->check_device_capabilities(shader_);
}

void DispatchNode::encode(ComputeGraph* graph) {

@@ -43,6 +42,8 @@ void DispatchNode::encode(ComputeGraph* graph) {
  api::Context* const context = graph->context();
  vkapi::PipelineBarrier pipeline_barrier{};

+  context->check_device_capabilities(shader_);
+
  std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();

  std::array<uint8_t, kMaxPushConstantSize> push_constants_data;

backends/vulkan/runtime/graph/ops/PrepackNode.cpp

Lines changed: 2 additions & 1 deletion

@@ -45,7 +45,6 @@ PrepackNode::PrepackNode(
      push_constants_(push_constants) {
  graph.update_descriptor_counts(shader, /*execute = */ false);
  graph.update_descriptor_counts(noop_shader_, /*execute = */ false);
-  graph.context()->check_device_capabilities(shader_);
}

api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {

@@ -71,6 +70,8 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
void PrepackNode::encode(ComputeGraph* graph) {
  api::Context* const context = graph->context();

+  context->check_device_capabilities(shader_);
+
  vTensorPtr packed = graph->get_tensor(packed_);
  api::StagingBuffer staging = create_staging_buffer(graph);

backends/vulkan/runtime/vk_api/Descriptor.cpp

Lines changed: 5 additions & 1 deletion

@@ -269,7 +269,11 @@ DescriptorPool::DescriptorPool(
      pool_(VK_NULL_HANDLE),
      config_(config),
      mutex_{},
-      piles_{} {}
+      piles_{} {
+  if (config.descriptor_pool_max_sets > 0) {
+    init(config);
+  }
+}

DescriptorPool::~DescriptorPool() {
  if (pool_ == VK_NULL_HANDLE) {

backends/vulkan/test/vulkan_compute_api_test.cpp

Lines changed: 0 additions & 3 deletions

@@ -87,9 +87,6 @@ class VulkanComputeAPITest : public ::testing::Test {
  void SetUp() override {
    // Make sure we are starting with a clean slate
    EXPECT_TRUE(get_vma_allocation_count() == 0);
-    if (!context()->descriptor_pool()) {
-      context()->descriptor_pool().init(default_descriptor_pool_config());
-    }
  }

  void TearDown() override {

exir/passes/constant_prop_pass.py

Lines changed: 2 additions & 0 deletions

@@ -66,6 +66,8 @@ def is_const(
        )
    elif isinstance(arg, _PRIMITIVE_TYPES):
        return True
+    elif arg is None:
+        return True
    elif not isinstance(arg, torch.fx.Node):
        return False
    elif arg in const_node_to_tensor:

exir/tests/test_passes.py

Lines changed: 31 additions & 0 deletions

@@ -1823,3 +1823,34 @@ def _do_checks(
        self.assertTrue(
            torch.allclose(output_no_dim_order[0], output_no_dim_order_revert[0])
        )
+
+    def test_constant_prop_pass_none(self) -> None:
+        """
+        Checks that None arguments are treated as constants in constant_prop_pass.
+        """
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.cst = torch.ones(3, 3, 3, dtype=torch.int8)
+                self.w = torch.ones(3, 3, 3, dtype=torch.int8)
+
+            def forward(self, x):
+                # Note: using e.g. aten.linear would not work, as None is not in the graph
+                a = torch.ops.aten.convolution.default(
+                    self.cst, self.w, None, [1], [0], [1], False, [0], 1
+                )
+                return a + x
+
+        mod = M()
+        x = torch.randn([3, 3, 3])
+        mod(x)
+        edge = to_edge(
+            export(mod, (x,), strict=True),
+            compile_config=exir.EdgeCompileConfig(_check_ir_validity=False),
+        )
+        # 2 constants: self.cst and self.w
+        self.assertEqual(2, len(edge.exported_program().constants))
+        pass_result = constant_prop_pass(edge.exported_program())
+        # 1 constant: a (the convolution of self.cst and self.w, folded)
+        self.assertEqual(1, len(pass_result.constants))
