Arm backend: Add numerically stable (log)softmax decomposition

AdrianLundell · oscarandersson8218 · AdrianLundell · commit 11d6f1a97af6 · 2025-03-11T09:31:28.000+01:00
- Only use the old version for Ethos-U55 compile specs since amax isn't supported in that case.
- Add support for negative indices in amax/amin
- Refactor unittests

Change-Id: I7ed43b8d6b95625f59ce9e71d55a21763fc51358
Signed-off-by: Adrian Lundell <adrian.lundell@arm.com>
Co-authored-by: Oscar Andersson <oscar.andersson@arm.com>
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
@@ -41,8 +41,9 @@
 from executorch.backends.arm._passes.decompose_select import (  # type: ignore[import-not-found]
     DecomposeSelectPass,
 )
-from executorch.backends.arm._passes.decompose_softmaxes_pass import (
-    DecomposeSoftmaxesPass,
+from executorch.backends.arm._passes.decompose_softmax_pass import DecomposeSoftmaxPass
+from executorch.backends.arm._passes.decompose_softmax_unstable_pass import (
+    DecomposeSoftmaxUnstablePass,
 )
 from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass
 from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
@@ -78,7 +79,7 @@
 from executorch.backends.arm._passes.unsqueeze_scalar_placeholders_pass import (
     UnsqueezeScalarPlaceholdersPass,
 )
-from executorch.backends.arm.tosa_specification import TosaSpecification
+from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
 from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform
 
 from executorch.backends.transforms.replace_scalar_with_tensor import (
@@ -151,7 +152,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(DecomposeMeanDimPass())
         self.add_pass(ConvertMeanDimToAveragePoolPass())
         self.add_pass(DecomposeDivPass())
-        self.add_pass(DecomposeSoftmaxesPass())
+        self.add_pass(DecomposeSoftmaxPass())
         self.add_pass(ConvertFullLikeToFullPass())
         self.add_pass(ConvertToClampPass())
         self.add_pass(ConvertMinMaxPass())
@@ -199,6 +200,12 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeVarPass())
         self.add_pass(DecomposeMeanDimPass())
         self.add_pass(DecomposeDivPass())
-        self.add_pass(DecomposeSoftmaxesPass())
+
+        if isinstance(self.tosa_spec, Tosa_0_80) and self.tosa_spec.is_U55_subset:
+            # Numerically stable softmax uses amax which is not supported on Ethos-U55
+            self.add_pass(DecomposeSoftmaxUnstablePass())
+        else:
+            self.add_pass(DecomposeSoftmaxPass())
+
         self.add_pass(ConvertMinMaxPass())
         return self._transform(graph_module)
diff --git a/backends/arm/_passes/decompose_softmax_pass.py b/backends/arm/_passes/decompose_softmax_pass.py
@@ -0,0 +1,77 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+# For BI case: torch.ops.aten overloads of softmax and log_softmax
+torch_softmax = (torch.ops.aten.softmax.int, torch.ops.aten.log_softmax.int)
+# For MI case: edge-dialect overloads of softmax and log_softmax
+edge_softmax = (
+    exir_ops.edge.aten._softmax.default,
+    exir_ops.edge.aten._log_softmax.default,
+)
+log_softmax = (torch.ops.aten.log_softmax.int, exir_ops.edge.aten._log_softmax.default)
+
+
+def _get_logsoftmax_ops(op) -> tuple:
+    """
+    Returns the (log_op, sub_op, amax_op, exp_op, sum_op, reciprocal_op, mul_op) tuple, where the
+    ops depend on whether the softmax op is an aten or edge op. Raises RuntimeError otherwise.
+    """
+    if op in edge_softmax:
+        return (
+            exir_ops.edge.aten.log.default,
+            exir_ops.edge.aten.sub.Tensor,
+            exir_ops.edge.aten.amax.default,
+            exir_ops.edge.aten.exp.default,
+            exir_ops.edge.aten.sum.dim_IntList,
+            exir_ops.edge.aten.reciprocal.default,
+            exir_ops.edge.aten.mul.Tensor,
+        )
+    if op in torch_softmax:
+        return (
+            torch.ops.aten.log.default,
+            torch.ops.aten.sub.Tensor,
+            torch.ops.aten.amax.default,
+            torch.ops.aten.exp.default,
+            torch.ops.aten.sum.dim_IntList,
+            torch.ops.aten.reciprocal.default,
+            torch.ops.aten.mul.Tensor,
+        )
+    raise RuntimeError(f"Can't get logsoftmax decomposition ops for op {op}")
+
+
+class DecomposeSoftmaxPass(ExportPass):
+    """
+    This pass decomposes log_softmax or softmax into more primitive ops, subtracting amax before exp.
+    Example:
+        %op1 = amax(x, dim, keepdim=True)
+        %op2 = sub(x, %op1)
+        %op3 = exp(%op2)
+        %op4 = sum(%op3, dim, keepdim=True)
+        %op5 = reciprocal(%op4)
+        %op6 = mul(%op3, %op5)
+        (in logsoftmax case: %op7 = log(%op6))
+    """
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in torch_softmax + edge_softmax:
+            return super().call_operator(op, args, kwargs, meta)
+        log_op, sub_op, max_op, exp_op, sum_op, reciprocal_op, mul_op = (
+            _get_logsoftmax_ops(op)
+        )
+        _input = args[0]
+        dim = [args[1]]  # the reduction ops below are passed the dim as a one-element list
+        op1 = super().call_operator(max_op, (_input, dim, True), {}, meta)
+        op2 = super().call_operator(sub_op, (_input, op1), {}, meta)
+        op3 = super().call_operator(exp_op, (op2,), {}, meta)
+        op4 = super().call_operator(sum_op, (op3, dim, True), {}, meta)
+        op5 = super().call_operator(reciprocal_op, (op4,), {}, meta)
+        op6 = super().call_operator(mul_op, (op3, op5), {}, meta)
+        if op in log_softmax:  # log variants take the log of the softmax result
+            op6 = super().call_operator(log_op, (op6,), {}, meta)
+        return op6
diff --git a/backends/arm/_passes/decompose_softmax_unstable_pass.py b/backends/arm/_passes/decompose_softmax_unstable_pass.py
@@ -1,5 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
-# All rights reserved.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -46,7 +45,7 @@ def get_logsoftmax_ops(op) -> tuple:
     raise RuntimeError(f"Can't get softmax decomposition ops for op {op}")
 
 
-class DecomposeSoftmaxesPass(ExportPass):
+class DecomposeSoftmaxUnstablePass(ExportPass):
     """
     This pass decomposes log softmax or softmax into more primitive ops.
 
diff --git a/backends/arm/operators/op_amax.py b/backends/arm/operators/op_amax.py
@@ -5,6 +5,7 @@
 from typing import List
 
 import serializer.tosa_serializer as ts
+from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
 from executorch.backends.arm.operators.node_visitor import (
     NodeVisitor,
     register_node_visitor,
@@ -31,6 +32,12 @@ def define_node(
 
         input = inputs[0]
         dim = inputs[1].number
+
+        if dim < 0:
+            tensor = get_first_fake_tensor(node)
+            rank = len(tensor.size())
+            dim = rank + dim
+
         keep_dims = inputs[2].number
         if not keep_dims:
             raise RuntimeError(
diff --git a/backends/arm/operators/op_amin.py b/backends/arm/operators/op_amin.py
@@ -5,6 +5,7 @@
 from typing import List
 
 import serializer.tosa_serializer as ts
+from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
 from executorch.backends.arm.operators.node_visitor import (
     NodeVisitor,
     register_node_visitor,
@@ -31,6 +32,12 @@ def define_node(
 
         input = inputs[0]
         dim = inputs[1].number
+
+        if dim < 0:
+            tensor = get_first_fake_tensor(node)
+            rank = len(tensor.size())
+            dim = rank + dim
+
         keep_dims = inputs[2].number
         if not keep_dims:
             raise RuntimeError(
diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py
diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py