pytorch · perheld · Feb 12, 2026 · Feb 25, 2026 · Feb 26, 2026
@@ -290,13 +290,8 @@
       "name": "arm-ethosu-linux",
       "displayName": "Build ExecuTorch for Arm Ethos-U Linux",
       "inherits": ["common"],
-      "description": "musl declares __assert_fail with int for line; avoid NDEBUG forward-decl mismatch in Release builds",
       "cacheVariables": {
-        "EXECUTORCH_BUILD_ARM_ETHOSU_LINUX": "ON",
-        "EXECUTORCH_BUILD_EXECUTOR_RUNNER": "ON",
-        "EXECUTORCH_BUILD_KERNELS_QUANTIZED": "ON",
-        "CMAKE_C_FLAGS_RELEASE": "-UNDEBUG",
-        "CMAKE_CXX_FLAGS_RELEASE": "-UNDEBUG",
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/arm_ethosu_linux.cmake",
         "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/examples/arm/ethos-u-setup/aarch64-linux-musl-toolchain.cmake"
       }
     }

@@ -76,27 +76,104 @@ The Arm backend can be built using the following command:
 ./install_executorch.sh
 ```
 
-One of the following commands should also be run once to gather the necessary dependencies for your chosen target(s):
+**NOTE:** While developing, it can be convenient to use `./install_executorch.sh --editable`, which creates an editable installation of ExecuTorch.
 
-For the Ethos-U target:
+### Target-specific setup and build
+
+Pick one of the target flows below. Each flow has a one-time setup step and, where applicable, a build command.
+
+### Baremetal (Ethos-U) workflow
+
+Builds ExecuTorch runtime libraries for Cortex-M with Ethos-U acceleration.
+
+Setup:
 
 ```
 ./examples/arm/setup.sh --i-agree-to-the-contained-eula
 ```
 
-For the VGF target:
+Build:
+
+```
+./backends/arm/scripts/build_executorch.sh
+```
+
+### VGF (Vulkan ML extensions) workflow
+
+Setup:
 
 ```
 ./examples/arm/setup.sh --disable-ethos-u-deps --enable-mlsdk-deps
 ```
 
-For both Ethos-U & VGF targets:
+The current flow lowers to TOSA and converts to VGF for use in external projects,
+so the `executor_runner` is not typically used here.
+
+### Direct Drive (experimental, Ethos-U85 on Linux) workflow
+
+Direct Drive enables execution on Ethos-U85 via the Linux driver stack.
+
+Driver stack (Linux) and API:
+
+```
+https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-linux-driver-stack
+```
+
+An FVP with Linux is available for Direct Drive, but it must be built and run
+manually. See:
 
 ```
-./examples/arm/setup.sh --i-agree-to-the-contained-eula --enable-mlsdk-deps
+https://corstone1000.docs.arm.com/en/corstone1000-2025.12/
 ```
 
-**NOTE:** While developing, it can be convenient to use`./install_executorch.sh --editable`, which creates an editable installation of ExecuTorch.
+Setup:
+
+```
+./examples/arm/setup.sh --i-agree-to-the-contained-eula --target-toolchain linux-musl
+source ./examples/arm/arm-scratch/setup_path.sh
+```
+
+Build:
+
+```
+./backends/arm/scripts/build_executorch.sh \
+  --toolchain=aarch64-linux-musl-gcc \
+  --build_type=Debug
+```
+
+**NOTE:** The setup step selects the toolchain with `--target-toolchain linux-musl`, while the build step refers to it by its GCC toolchain name, `--toolchain=aarch64-linux-musl-gcc`.
+
+If your Yocto image enables the dropbear SSH server, you can copy the
+`executor_runner` binary into the running FVP via scp:
+
+```
+scp -P 2222 arm_test/cmake-out/executor_runner root@127.0.0.1:/tmp/
+```
+
+#### Direct Drive model (PTE) workflow
+
+Create a PTE file:
+
+```
+python3 -m examples.arm.aot_arm_compiler \
+  --model_name examples/arm/example_modules/add.py \
+  --delegate \
+  --quantize \
+  --target ethos-u85-256 \
+  --direct_drive
+```
+
+Copy the `executor_runner` binary and the generated PTE file to the running FVP:
+
+```
+scp -P 2222 arm_test/cmake-out/executor_runner add_arm_delegate_ethos-u85-256.pte root@127.0.0.1:/tmp/
+```
+
+Run the model on the FVP:
+
+```
+ssh -p 2222 root@127.0.0.1 -t "/tmp/executor_runner -model_path /tmp/add_arm_delegate_ethos-u85-256.pte -num_executions 1"
+```
 
 ## Testing
 

@@ -347,19 +347,13 @@ Error platform_execute(
     int output_count,
     Span<executorch::runtime::EValue*> args,
     char* /*ethosu_scratch*/) {
-  std::vector<size_t> input_copy_sizes;
-  std::vector<const char*> linux_input_ptrs;
-  if (input_count > 0) {
-    input_copy_sizes.resize(input_count, 0);
-    linux_input_ptrs.resize(input_count, nullptr);
-  }
+  std::vector<size_t> input_copy_sizes(input_count, 0);
+  std::vector<const char*> linux_input_ptrs(input_count, nullptr);
 
-  std::vector<size_t> output_io_bytes;
-  std::vector<char*> linux_output_ptrs;
-  if (output_count > 0) {
-    output_io_bytes.resize(output_count, 0);
-    linux_output_ptrs.resize(output_count, nullptr);
-  }
+  std::vector<size_t> output_io_bytes(output_count, 0);
+  std::vector<char*> linux_output_ptrs(output_count, nullptr);
+  std::vector<std::vector<char>> output_scratch_buffers(output_count);
+  std::vector<bool> output_needs_adjustment(output_count, false);
 
   for (int i = 0; i < input_count; ++i) {
     auto tensor_in = args[i]->toTensor();
@@ -380,16 +374,12 @@ Error platform_execute(
       const size_t tensor_nbytes = tensor_out.nbytes();
       if (i < static_cast<int>(output_io_bytes.size()) &&
           output_io_bytes[i] != tensor_nbytes) {
-        ET_LOG(
-            Error,
-            "Ethos-U Linux backend output size mismatch for index %d: "
-            "driver IO bytes = %zu, tensor bytes = %zu",
-            i,
-            output_io_bytes[i],
-            tensor_nbytes);
-        return Error::InvalidState;
+        output_scratch_buffers[i].resize(output_io_bytes[i]);
+        linux_output_ptrs[i] = output_scratch_buffers[i].data();
+        output_needs_adjustment[i] = true;
+      } else {
+        linux_output_ptrs[i] = tensor_out.mutable_data_ptr<char>();
       }
-      linux_output_ptrs[i] = tensor_out.mutable_data_ptr<char>();
     }
   }
 
@@ -399,13 +389,37 @@ Error platform_execute(
     return Error::InvalidState;
   }
 
-  return invoke_linux_driver(
+  Error status = invoke_linux_driver(
       handles,
       linux_input_ptrs,
       linux_output_ptrs,
       input_copy_sizes,
       output_io_bytes,
       state->options);
+  if (status != Error::Ok) {
+    return status;
+  }
+
+  if (handles.outputs != nullptr) {
+    for (int i = 0; i < output_count; ++i) {
+      if (!output_needs_adjustment[i]) {
+        continue;
+      }
+      auto tensor_out = args[input_count + i]->toTensor();
+      const size_t tensor_nbytes = tensor_out.nbytes();
+      Error adjust_status = copy_with_layout_adjustment(
+          handles.outputs->io[i],
+          i,
+          output_scratch_buffers[i].data(),
+          tensor_out,
+          tensor_nbytes);
+      if (adjust_status != Error::Ok) {
+        return adjust_status;
+      }
+    }
+  }
+
+  return Error::Ok;
 }
 
 } // namespace arm

@@ -0,0 +1,19 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set_overridable_option(EXECUTORCH_BUILD_ARM_ETHOSU_LINUX ON)
+set_overridable_option(EXECUTORCH_BUILD_EXECUTOR_RUNNER ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_EVALUE_UTIL ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL ON)
+set_overridable_option(EXECUTORCH_BUILD_KERNELS_QUANTIZED ON)
+
+set(CMAKE_C_FLAGS_RELEASE
+    "-UNDEBUG"
+    CACHE STRING "Avoid NDEBUG forward-decl mismatch in musl Release builds"
+)
+set(CMAKE_CXX_FLAGS_RELEASE
+    "-UNDEBUG"
+    CACHE STRING "Avoid NDEBUG forward-decl mismatch in musl Release builds"
+)