Commit 38720f4

Update
[ghstack-poisoned]
2 parents beecf2a + a01571f commit 38720f4

File tree

134 files changed: +2846 -574 lines changed


CMakeLists.txt

Lines changed: 11 additions & 2 deletions
@@ -186,6 +186,10 @@ option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "Build the Flat Tensor extension"
   OFF
 )
 
+option(EXECUTORCH_BUILD_EXTENSION_LLM "Build the LLM extension"
+       OFF
+)
+
 option(EXECUTORCH_BUILD_EXTENSION_MODULE "Build the Module extension" OFF)
 
 option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension"
@@ -245,7 +249,7 @@ cmake_dependent_option(
 )
 
 if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
-  set(EXECUTORCH_BUILF_EXTENSION_DATA_LOADER ON)
+  set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
@@ -348,6 +352,7 @@ if(EXECUTORCH_BUILD_PTHREADPOOL)
 endif()
 
 if(EXECUTORCH_BUILD_TESTS)
+  set(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
   include(CTest)
 endif()
 
@@ -373,7 +378,7 @@ if(NOT "${_repo_dir_name}" STREQUAL "executorch")
     "fix for this restriction."
   )
 endif()
-set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/runtime/core/portable_type)
+set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/runtime/core/portable_type/c10)
 
 #
 # The `_<target>_srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}.
@@ -717,6 +722,10 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor/serialize)
 endif()
 
+if(EXECUTORCH_BUILD_EXTENSION_LLM)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizer)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_MODULE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
 endif()

backends/apple/coreml/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -134,7 +134,7 @@ target_include_directories(
   coremldelegate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/util
 )
 target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/..)
-target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/runtime/core/portable_type)
+target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)
 target_compile_definitions(coremldelegate PRIVATE C10_USING_CUSTOM_GENERATED_MACROS)
 target_link_libraries(coremldelegate PRIVATE executorch_core)
backends/apple/coreml/partition/coreml_partitioner.py

Lines changed: 15 additions & 1 deletion
@@ -3,7 +3,7 @@
 # Please refer to the license found in the LICENSE file in the root directory of the source tree.
 
 import logging
-from typing import List, Optional
+from typing import Callable, List, Optional, Tuple
 
 import coremltools as ct
 
@@ -104,3 +104,17 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
         )
+
+    def ops_to_not_decompose(
+        self, ep: ExportedProgram
+    ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
+        do_not_decompose = []
+        op_support = OperatorsSupportedForCoreMLBackend()
+        for node in ep.graph.nodes:
+            if (
+                node.op == "call_function"
+                and isinstance(node.target, torch._ops.OpOverload)
+                and op_support.is_node_supported(None, node)
+            ):
+                do_not_decompose.append(node.target)
+        return do_not_decompose, None
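
The new ops_to_not_decompose hook is queried by the to_edge_transform_and_lower flow before decompositions run, so ops the CoreML backend already supports are handed to the delegate intact (the test added below exercises exactly this). A minimal sketch of calling the hook directly; the Model here is a hypothetical stand-in:

import torch

from executorch.backends.apple.coreml.partition import CoreMLPartitioner


class Model(torch.nn.Module):  # hypothetical example model
    def forward(self, q, k, v):
        return torch.nn.functional.scaled_dot_product_attention(q, k, v)


ep = torch.export.export(Model().eval(), (torch.randn(1, 4, 8, 16),) * 3)
partitioner = CoreMLPartitioner()
# Returns the OpOverloads found in the graph that the CoreML op-support
# checker accepts, plus an optional per-node filter (None here).
preserved_ops, node_filter = partitioner.ops_to_not_decompose(ep)
print(preserved_ops)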

backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj

Lines changed: 2 additions & 2 deletions
@@ -922,7 +922,7 @@
 "$(SRCROOT)/../kvstore",
 "$(SRCROOT)/../inmemoryfs",
 "$(SRCROOT)/../include",
-"$(SRCROOT)/../include/executorch/runtime/core/portable_type",
+"$(SRCROOT)/../include/executorch/runtime/core/portable_type/c10",
 "$(SRCROOT)/../sdk",
 "$(SRCROOT)/../util",
 "$(SRCROOT)/../../third-party/nlohmann_json/single_include",
@@ -954,7 +954,7 @@
 "$(SRCROOT)/../kvstore",
 "$(SRCROOT)/../inmemoryfs",
 "$(SRCROOT)/../include",
-"$(SRCROOT)/../include/executorch/runtime/core/portable_type",
+"$(SRCROOT)/../include/executorch/runtime/core/portable_type/c10",
 "$(SRCROOT)/../sdk",
 "$(SRCROOT)/../util",
 "$(SRCROOT)/../../third-party/nlohmann_json/single_include",

backends/apple/coreml/test/test_coreml_partitioner.py

Lines changed: 46 additions & 0 deletions
@@ -13,6 +13,7 @@
 
 from executorch.backends.apple.coreml.compiler import CoreMLBackend
 from executorch.backends.apple.coreml.partition import CoreMLPartitioner
+from executorch.exir.backend.utils import format_delegated_graph
 
 
 class TestCoreMLPartitioner(unittest.TestCase):
@@ -79,6 +80,50 @@ def test_vit_skip_conv(self):
             "getitem",
         ]
 
+    def test_ops_to_not_decompose(self):
+        class Model(torch.nn.Module):
+            def forward(self, q, k, v, mask):
+                return torch.ops.aten.scaled_dot_product_attention.default(
+                    q, k, v, attn_mask=mask
+                )
+
+        model = Model()
+        model.eval()
+
+        batch_size = 1
+        n_heads = 12
+        seq_len = 1
+        max_seq_length = 32
+        embedding_dim = 16
+        q = torch.randn(batch_size, n_heads, seq_len, embedding_dim)
+        k = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
+        v = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
+        mask = torch.randn(seq_len, max_seq_length)
+        example_inputs = (q, k, v, mask)
+        ep = torch.export.export(model, example_inputs)
+        coreml_partitioner = CoreMLPartitioner()
+
+        # Using to_edge_transform_and_lower, we expect SDPA will be preserved and show up in delegated graph
+        edge_program_manager = executorch.exir.to_edge_transform_and_lower(
+            ep, partitioner=[coreml_partitioner]
+        )
+        self.assertTrue(
+            "executorch.exir.dialects.edge._ops.aten.scaled_dot_product_attention.default"
+            in format_delegated_graph(
+                edge_program_manager.exported_program().graph_module
+            )
+        )
+
+        # Using to_edge flow, we expect SDPA will be decomposed and not show up in delegated graph
+        edge_program_manager2 = executorch.exir.to_edge(ep)
+        edge_program_manager2.to_backend(coreml_partitioner)
+        self.assertTrue(
+            "executorch.exir.dialects.edge._ops.aten.scaled_dot_product_attention.default"
+            not in format_delegated_graph(
+                edge_program_manager2.exported_program().graph_module
+            )
+        )
+
     def test_buffer(self):
         embedding_dim = 3
         max_seq_len = 2
@@ -129,4 +174,5 @@ def forward(self, q, k_val, input_pos):
     test_runner = TestCoreMLPartitioner()
     test_runner.test_add_sub_skip_mm()
     test_runner.test_vit_skip_conv()
+    test_runner.test_ops_to_not_decompose()
     test_runner.test_buffer()

backends/arm/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ endif()
 
 include(${EXECUTORCH_ROOT}/build/Utils.cmake)
 
-set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type)
+set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)
 add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS)
 
 # Third-party folder and Ethos-U driver inclued

backends/arm/TARGETS

Lines changed: 4 additions & 1 deletion
@@ -4,7 +4,10 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
 python_library(
     name = "arm_partitioner",
     srcs = [
-        "arm_partitioner.py",
+        "ethosu_backend.py",
+        "ethosu_partitioner.py",
+        "tosa_backend.py",
+        "tosa_partitioner.py",
     ],
     typing = True,
     deps = [

backends/arm/_passes/arm_pass_manager.py

Lines changed: 6 additions & 1 deletion
@@ -77,6 +77,9 @@
 )
 from executorch.backends.arm.tosa_specification import TosaSpecification
 
+from executorch.backends.transforms.replace_scalar_with_tensor import (
+    ReplaceScalarWithTensorArgPass,
+)
 from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass
 from executorch.exir import ExportedProgram
 from executorch.exir.pass_manager import PassManager
@@ -102,6 +105,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertMeanDimToAveragePoolPass())
         self.add_pass(ConvertFullLikeToFullPass())
 
+        self.add_pass(ReplaceScalarWithTensorArgPass())
         self.add_pass(AnnotateDecomposedMatmulPass())
         self.add_pass(QuantizeOperatorArguments())
         self.add_pass(FoldAndAnnotateQParamsPass())  # type: ignore[call-arg]
@@ -125,7 +129,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         return self._transform(exported_program.graph_module)
 
     def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
-
+        self.add_pass(ReplaceScalarWithTensorArgPass())
         self.add_pass(FuseQuantizedActivationPass())
         self.add_pass(RemoveGetItemPass())
         self.add_pass(ConvertSplitToSlicePass())
@@ -176,6 +180,7 @@ def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
 
     def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(ScalarsToAttributePass())
+        self.add_pass(ReplaceScalarWithTensorArgPass())
         self.add_pass(DecomposeLayerNormPass())
         self.add_pass(DecomposeVarPass())
         self.add_pass(DecomposeMeanDimPass())
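
ReplaceScalarWithTensorArgPass, imported from executorch.backends.transforms.replace_scalar_with_tensor, is now run in both TOSA pipelines and in the annotation pipeline. As its name suggests, it rewrites scalar-overload calls into their Tensor overloads with the scalar lifted into a tensor; a rough eager-mode illustration of the equivalence such a rewrite relies on (an assumption-level sketch, not the pass implementation):

import torch

x = torch.randn(4)
scalar = 2.0

# The Scalar overload of an op ...
out_scalar = torch.ops.aten.add.Scalar(x, scalar)
# ... agrees with the Tensor overload once the scalar is materialized
# as a tensor, giving the pipelines one uniform form to handle.
out_tensor = torch.ops.aten.add.Tensor(x, torch.tensor(scalar))
assert torch.allclose(out_scalar, out_tensor)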

backends/arm/_passes/fuse_batchnorm2d_pass.py

Lines changed: 1 addition & 0 deletions
@@ -114,6 +114,7 @@ def try_set_param(
         if not try_set_param(conv_bias_node, fused_conv_bias) and try_set_param(
             bn_bias_node, fused_conv_bias
         ):
+            # pyre-ignore[60]
             # Conv didn't have bias but batchnorm did, steal bias from batchnorm.
             conv_args = (*conv.args[0:2], bn_bias_node, *conv.args[3:])
             conv.args = conv_args
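
For context, the pass folds batchnorm statistics into the preceding convolution, and the branch above covers the case where the conv has no bias of its own, so the folded bias must come from the batchnorm parameters. PyTorch's eval-time folding helper demonstrates the same math (shown for illustration; it is not what the Arm pass calls):

import torch
from torch.nn.utils.fusion import fuse_conv_bn_eval

conv = torch.nn.Conv2d(3, 8, kernel_size=3, bias=False)  # conv without its own bias
bn = torch.nn.BatchNorm2d(8)
conv.eval()
bn.eval()

# Produces a single Conv2d whose weight and newly created bias absorb the
# batchnorm scale, shift, and running statistics.
fused = fuse_conv_bn_eval(conv, bn)

x = torch.randn(1, 3, 16, 16)
assert torch.allclose(bn(conv(x)), fused(x), atol=1e-5)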

backends/arm/_passes/scalars_to_attribute_pass.py

Lines changed: 12 additions & 0 deletions
@@ -76,5 +76,17 @@ def call(self, graph_module: GraphModule) -> PassResult:
                 new_args.append(get_attr_node)
             n.args = tuple(new_args)
 
+            # Replace rsub.Scalar with sub.Tensor as retracing will fail otherwise
+            if n.target == torch.ops.aten.rsub.Scalar:
+                with graph_module.graph.inserting_after(n):
+                    reversed_args = (n.args[1], n.args[0])
+                    sub = graph_module.graph.create_node(
+                        "call_function", torch.ops.aten.sub.Tensor, reversed_args, {}
+                    )
+                    n.replace_all_uses_with(sub)
+                    sub.meta["val"] = n.meta["val"]
+                graph_module.graph.erase_node(n)
+
         graph_module.recompile()
+        graph_module = super().call(graph_module).graph_module
         return PassResult(graph_module, True)
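
The rewrite is sound because rsub is subtraction with its operands reversed: rsub(a, b) == b - a. A quick eager-mode check of the identity the pass applies at the FX-graph level (illustrative only):

import torch

a = torch.randn(4)
b = torch.randn(4)

# aten.rsub computes b - a, so swapping the arguments turns it into a
# plain sub, exactly the node rewrite performed above.
assert torch.allclose(torch.rsub(a, b), torch.sub(b, a))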
