
Commit 662a6ac

Merge branch 'main' into gh/lucylq/81/orig
2 parents: e56fafb + a6e2961

File tree: 92 files changed (+4198 / -1775 lines)


.ci/scripts/build_llama_android.sh

Lines changed: 3 additions & 13 deletions
@@ -22,18 +22,12 @@ install_executorch_and_backend_lib() {
   ANDROID_NDK=/opt/ndk
   BUCK2=buck2
   ANDROID_ABI=arm64-v8a
-  cmake -DBUCK2="${BUCK2}" \
+  cmake --preset llm \
+    -DBUCK2="${BUCK2}" \
     -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \
     -DANDROID_ABI="${ANDROID_ABI}" \
     -DCMAKE_INSTALL_PREFIX=cmake-android-out \
     -DCMAKE_BUILD_TYPE=Release \
-    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
     -DXNNPACK_ENABLE_ARM_BF16=OFF \
     -Bcmake-android-out .

@@ -51,11 +45,7 @@ build_llama_runner() {
     -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK"/build/cmake/android.toolchain.cmake \
     -DANDROID_ABI="${ANDROID_ABI}" \
     -DCMAKE_INSTALL_PREFIX=cmake-android-out \
-    -DCMAKE_BUILD_TYPE=Release -DPYTHON_EXECUTABLE=python \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+    -DCMAKE_BUILD_TYPE=Release \
     -Bcmake-android-out/examples/models/llama examples/models/llama
 
   cmake --build cmake-android-out/examples/models/llama -j4 --config Release
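Note: the two hunks above fold the long list of per-feature -DEXECUTORCH_BUILD_* flags into the new llm preset. As a rough local equivalent of the configure step the script now performs (a sketch, not the script itself; the NDK path and ABI are only the script's defaults, and the trailing build command is generic CMake rather than anything this hunk shows):

  ANDROID_NDK=/opt/ndk
  ANDROID_ABI=arm64-v8a

  # The llm preset supplies the LLM-related EXECUTORCH_BUILD_* cache variables,
  # so only toolchain/install/build-type settings are passed explicitly.
  cmake --preset llm \
      -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \
      -DANDROID_ABI="${ANDROID_ABI}" \
      -DCMAKE_INSTALL_PREFIX=cmake-android-out \
      -DCMAKE_BUILD_TYPE=Release \
      -Bcmake-android-out .
  cmake --build cmake-android-out -j4 --config Release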

.ci/scripts/test_llama.sh

Lines changed: 1 addition & 15 deletions
@@ -152,21 +152,11 @@ which "${PYTHON_EXECUTABLE}"
 cmake_install_executorch_libraries() {
   echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
   rm -rf cmake-out
-  retry cmake \
+  retry cmake --preset llm \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
-    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-    -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
-    -DEXECUTORCH_BUILD_MPS="$MPS" \
-    -DEXECUTORCH_BUILD_COREML="$COREML" \
     -DEXECUTORCH_BUILD_QNN="$QNN" \
     -DQNN_SDK_ROOT="$QNN_SDK_ROOT" \
-    -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out .
   cmake --build cmake-out -j9 --target install --config "$CMAKE_BUILD_TYPE"
 }

@@ -181,10 +171,6 @@ cmake_build_llama_runner() {
   retry cmake \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
-    -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
-    -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out/${dir} \
     ${dir}
   cmake --build cmake-out/${dir} -j9 --config "$CMAKE_BUILD_TYPE"

.github/workflows/build-presets.yml

Lines changed: 4 additions & 4 deletions
@@ -20,7 +20,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        preset: [macos-arm64, pybind]
+        preset: [macos-arm64, pybind, llm]
     with:
       job-name: build
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}

@@ -32,14 +32,14 @@
         set -eux
         ${CONDA_RUN} ./install_requirements.sh > /dev/null
         ${CONDA_RUN} cmake --preset ${{ matrix.preset }}
-        ${CONDA_RUN} cmake --build cmake-out --parallel
+        ${CONDA_RUN} cmake --build cmake-out -j$(( $(sysctl -n hw.ncpu) - 1 ))
 
   linux:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     strategy:
       fail-fast: false
       matrix:
-        preset: [pybind]
+        preset: [pybind, llm]
       runner: [linux.2xlarge, linux.arm64.2xlarge]
       docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64]
       # Excluding specific runner + docker image combinations that don't make sense:

@@ -65,4 +65,4 @@
 
       ./install_requirements.sh > /dev/null
       cmake --preset ${{ matrix.preset }}
-      cmake --build cmake-out --parallel
+      cmake --build cmake-out -j$(( $(nproc) - 1 ))
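Note: cmake --build --parallel with no value defers to the native build tool's default job count, while the replacement computes an explicit count and holds one core back for the runner itself. The same arithmetic spelled out (Linux form; the macOS job above uses sysctl -n hw.ncpu instead of nproc):

  # $(nproc) prints the number of available cores; the shell arithmetic
  # $(( ... )) subtracts one so the CI runner keeps a core for itself.
  cmake --build cmake-out -j$(( $(nproc) - 1 ))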

.github/workflows/trunk.yml

Lines changed: 26 additions & 0 deletions
@@ -692,3 +692,29 @@ jobs:
     build-mode: Release
     build-tool: cmake
     docker-image: executorch-ubuntu-22.04-clang12
+
+  unittest-nxp-neutron:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Build and install Executorch
+        PYTHON_EXECUTABLE=python \
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
+        .ci/scripts/setup-linux.sh --build-tool "cmake"
+
+        # Run pytest
+        PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh
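Note: the job's script block boils down to two commands, so a local reproduction might look like the following (a sketch; it assumes a repo checkout with a working Python environment already active, which the CI job gets from its conda setup):

  # Build and install ExecuTorch with the NXP Neutron backend enabled,
  # exactly as the job's setup step does.
  PYTHON_EXECUTABLE=python \
  CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
      .ci/scripts/setup-linux.sh --build-tool "cmake"

  # Then run the backend's unit tests.
  PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh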

CMakePresets.json

Lines changed: 20 additions & 0 deletions
@@ -36,6 +36,26 @@
         "string": "${hostSystemName}",
         "list": ["Darwin", "Linux", "Windows"]
       }
+    },
+    {
+      "name": "llm",
+      "displayName": "Build LLM libraries",
+      "inherits": [
+        "common"
+      ],
+      "cacheVariables": {
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/llm.cmake",
+        "CMAKE_OSX_DEPLOYMENT_TARGET": "10.15"
+      },
+      "condition": {
+        "type": "inList",
+        "string": "${hostSystemName}",
+        "list": [
+          "Darwin",
+          "Linux",
+          "Windows"
+        ]
+      }
     }
   ]
 }
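Note: with this preset registered, an LLM-flavored build reduces to the two commands the workflows above already run (a sketch; cmake-out is the binary directory the presets configure, and the -j arithmetic mirrors the CI change):

  # Configure: the preset points EXECUTORCH_BUILD_PRESET_FILE at
  # tools/cmake/preset/llm.cmake and pins CMAKE_OSX_DEPLOYMENT_TARGET to 10.15.
  cmake --preset llm

  # Build, holding one core back as the workflows now do.
  cmake --build cmake-out -j$(( $(nproc) - 1 ))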

backends/apple/coreml/CMakeLists.txt

Lines changed: 0 additions & 3 deletions
@@ -25,8 +25,6 @@ endif()
 
 option(COREML_BUILD_EXECUTOR_RUNNER "Build CoreML executor runner." OFF)
 
-set(CMAKE_OSX_DEPLOYMENT_TARGET 10.15)
-
 # inmemoryfs sources
 set(INMEMORYFS_SOURCES
   runtime/inmemoryfs/inmemory_filesystem.cpp

@@ -240,7 +238,6 @@ if(EXECUTORCH_BUILD_COREML AND EXECUTORCH_BUILD_PYBIND)
 
   pybind11_add_module(executorchcoreml SHARED runtime/inmemoryfs/inmemory_filesystem_py.cpp)
 
-  target_compile_options(executorchcoreml PRIVATE -mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET})
   if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
     target_compile_options(executorchcoreml PRIVATE -g)
   endif()

backends/arm/scripts/install_reference_model.sh

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ tosa_reference_model_url="https://git.gitlab.arm.com/tosa/tosa-reference-model.g
 tosa_reference_model_0_80_branch="v0.80"
 tosa_reference_model_0_80_rev="70ed0b40fa831387e36abdb4f7fb9670a3464f5a"
 tosa_serialization_lib_0_80_rev="v0.80.1"
-tosa_reference_model_1_0_rev="4d17b5b960cd986d8cb8052188fbe3ae494789e8"
+tosa_reference_model_1_0_rev="d102f426dd2e3c1f25bbf23292ec8ee51aa9c677"
 
 script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)

backends/cadence/aot/tests/test_fusion_ops_passes.py

Lines changed: 109 additions & 63 deletions
@@ -13,10 +13,6 @@
 import executorch.backends.cadence.aot.ops_registrations  # noqa
 import torch
 from executorch.backends.cadence.aot import compiler
-from executorch.backends.cadence.aot.compiler import (
-    export_to_edge,
-    quantize_and_export_to_edge,
-)
 from executorch.backends.cadence.aot.fuse_ops import (
     FuseFullThenReshapePass,
     FuseMulScalarIntoDequantPass,

@@ -336,94 +332,144 @@ def test_replace_quant_view_dequant_with_requantize(self):
         )
 
     def test_replace_dequant_quant_with_requantize(self):
-        class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
-            def forward(self, x):
-                x = torch.ops.quantized_decomposed.dequantize_per_tensor(
-                    x, 1.2, 3, 0, 127, torch.int8
-                )
-                x = torch.permute(x, [2, 0, 1, 3])
-                x = torch.ops.quantized_decomposed.quantize_per_tensor(
-                    x, 4.5, 6, 0, 127, torch.int8
-                )
-                return x
-
-        inputs = torch.randn(2, 12, 1, 6).to(torch.int8)
-        model = M()
-        graph_module = export_to_edge(model, (inputs,)).exported_program().graph_module
-        graph_module = FuseQuantDequantToRequantizePass()(graph_module).graph_module
+        builder = GraphBuilder()
+        x = builder.placeholder("x", torch.randn(2, 12, 1, 6, dtype=torch.float32))
+        dequant = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+            args=(x, 1.2, 3, 0, 127, torch.int8),
+        )
+        quant = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            args=(dequant, 4.5, 6, 0, 127, torch.int8),
+        )
+        builder.output(quant)
+        graph_module = FuseQuantDequantToRequantizePass()(
+            builder.get_graph_module()
+        ).graph_module
 
         self.check_op_counts(
             graph_module,
             expected_op_counts={
-                # Verify that dequant -> permute -> quant was replaced with permute -> requantize.
+                # Verify that dequant -> quant was replaced with requantize.
                 exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 0,
                 exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: 0,
                 exir_ops.edge.cadence.requantize.default: 1,
             },
         )
 
     def test_replace_dequant_permute_quant_with_requantize(self):
-        class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
-            def forward(self, x):
-                x = torch.ops.quantized_decomposed.dequantize_per_tensor(
-                    x, 1.2, 3, 0, 127, torch.int8
-                )
-                x = torch.permute(x, [2, 0, 1, 3])
-                x = torch.ops.quantized_decomposed.quantize_per_tensor(
-                    x, 4.5, 6, 0, 127, torch.int8
-                )
-                return x
-
-        inputs = torch.randn(2, 12, 1, 6).to(torch.int8)
-        model = M()
-        graph_module = export_to_edge(model, (inputs,)).exported_program().graph_module
-        graph_module = FuseQuantDequantToRequantizePass()(graph_module).graph_module
+        builder = GraphBuilder()
+        x = builder.placeholder("x", torch.randn(2, 12, 1, 6, dtype=torch.float32))
+        dequant = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+            args=(x, 1.2, 3, 0, 127, torch.int8),
+        )
+        permute = builder.call_operator(
+            op=exir_ops.edge.aten.permute_copy.default, args=(dequant, [2, 0, 1, 3])
+        )
+        quant = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            args=(permute, 4.5, 6, 0, 127, torch.int8),
+        )
+        builder.output(quant)
+        graph_module = FuseQuantDequantToRequantizePass()(
+            builder.get_graph_module()
+        ).graph_module
 
         self.check_op_counts(
             graph_module,
             expected_op_counts={
                 # Verify that dequant -> permute -> quant was replaced with permute -> requantize.
                 exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 0,
                 exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: 0,
+                exir_ops.edge.aten.permute_copy.default: 1,
                 exir_ops.edge.cadence.requantize.default: 1,
             },
         )
 
     def test_remove_nop_dequant_quant(self):
-        class M(torch.nn.Module):
-            def __init__(self):
-                super(M, self).__init__()
-                self.lin1 = torch.nn.Linear(6, 12, bias=False)
-                self.lin2 = torch.nn.Linear(12, 24, bias=False)
+        LEADING_DIMS: Final[int] = 12
+        IN_DIM: Final[int] = 6
+        OUT_DIM: Final[int] = 12
 
-            def forward(self, x):
-                x = self.lin1(x)
-                # redundant dequant+quant will be created around this permute
-                x = torch.permute(x, [0, 2, 1, 3])
-                x = self.lin2(x)
-                return x
-
-        inputs = torch.randn(2, 12, 1, 6)
-        model = M()
-        graph_module = (
-            quantize_and_export_to_edge(model, (inputs,))
-            .exported_program()
-            .graph_module
+        builder = GraphBuilder()
+        x = builder.placeholder(
+            "x", torch.randn(LEADING_DIMS, IN_DIM, dtype=torch.float32)
+        )
+        quant1 = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            args=(x, 4.5, 6, 0, 127, torch.int8),
+        )
+        weights = builder.call_operator(
+            op=exir_ops.edge.aten.full.default, args=([OUT_DIM, IN_DIM], 1)
+        )
+        bias = builder.call_operator(
+            op=exir_ops.edge.aten.full.default, args=([OUT_DIM], 1)
+        )
+        weight_zero_point = builder.call_operator(
+            op=exir_ops.edge.aten.full.default, args=([IN_DIM], 0)
+        )
+        out_multiplier = builder.call_operator(
+            op=exir_ops.edge.aten.full.default, args=([OUT_DIM], 1)
+        )
+        out_shift = builder.call_operator(
+            op=exir_ops.edge.aten.full.default, args=([OUT_DIM], 0)
         )
-        graph_module = FuseQuantDequantToRequantizePass()(graph_module).graph_module
+        linear1 = builder.call_operator(
+            op=exir_ops.edge.cadence.quantized_linear.default,
+            args=(
+                quant1,
+                weights,
+                bias,
+                0,  # src_zero_point
+                weight_zero_point,
+                out_multiplier,
+                out_shift,
+                0,  # out_zero_point
+                None,
+            ),
+        )
+        dequant1 = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+            args=(linear1, 1.2, 3, 0, 127, torch.int8),
+        )
+        permute = builder.call_operator(
+            op=exir_ops.edge.aten.permute_copy.default, args=(dequant1, [1, 0])
+        )
+        quant2 = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            args=(permute, 4.5, 6, 0, 127, torch.int8),
+        )
+        linear2 = builder.call_operator(
+            op=exir_ops.edge.cadence.quantized_linear.default,
+            args=(
+                quant2,
+                weights,
+                bias,
+                0,  # src_zero_point
+                weight_zero_point,
+                out_multiplier,
+                out_shift,
+                0,  # out_zero_point
+                None,
+            ),
+        )
+        dequant2 = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+            args=(linear2, 1.2, 3, 0, 127, torch.int8),
+        )
+        builder.output(dequant2)
+        graph_module = FuseQuantDequantToRequantizePass()(
+            builder.get_graph_module()
+        ).graph_module
         self.check_op_counts(
             graph_module,
             expected_op_counts={
-                # Verify that one dequant/quant pair was removed
-                # Expect 1 quantize ops: 1 input
+                # Verify that one dequant/quant pair was removed from chain:
+                # quant->linear->dequant->permute->quant->linear->dequant
+                # gets converted to:
+                # quant->linear->permute->linear->dequant
                 exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 1,
-                # Expect 1 dequant op at the end (output of second linear)
                 exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: 1,
             },
        )
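Note: to exercise just the rewritten tests, a pytest invocation along these lines should work (a sketch; it assumes pytest is installed, and the -k expression merely name-matches the requantize/nop-pair tests above):

  # Run only the fusion tests touched by this commit.
  python -m pytest backends/cadence/aot/tests/test_fusion_ops_passes.py \
      -k "requantize or nop_dequant_quant"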
