Commit df483db

Merge branch 'main' into tosa_tools_update_1
2 parents 1d0b591 + 3600d4f

108 files changed: +2372 −1825 lines

.github/scripts/extract_benchmark_results.py

Lines changed: 2 additions & 1 deletion
@@ -360,6 +360,7 @@ def transform(
                 "app_type": app_type,
                 # Just keep a copy of the benchmark config here
                 "benchmark_config": json.dumps(benchmark_config),
+                "job_conclusion": "SUCCESS",
             },
         },
         "model": {
@@ -455,7 +456,7 @@ def transform_failure_record(
         },
         "metric": {
             "name": "FAILURE_REPORT",
-            "benchmark_values": 0,
+            "benchmark_values": [0],
             "target_value": 0,
             "extra_info": {
                 "method": "",

.github/scripts/run_nm.py

Lines changed: 171 additions & 0 deletions
@@ -0,0 +1,171 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import re
import subprocess
import sys
from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class Symbol:
    name: str
    addr: int
    size: int
    symbol_type: str


class Parser:
    def __init__(self, elf: str, toolchain_prefix: str = "", filter=None):
        self.elf = elf
        self.toolchain_prefix = toolchain_prefix
        self.symbols: Dict[str, Symbol] = self._get_nm_output()
        self.filter = filter

    @staticmethod
    def run_nm(
        elf_file_path: str, args: Optional[List[str]] = None, nm: str = "nm"
    ) -> str:
        """
        Run the nm command on the specified ELF file.
        """
        args = [] if args is None else args
        cmd = [nm] + args + [elf_file_path]
        try:
            result = subprocess.run(cmd, check=True, capture_output=True, text=True)
            return result.stdout
        except FileNotFoundError:
            print("Error: 'nm' command not found. Please ensure it's installed.")
            sys.exit(1)
        except subprocess.CalledProcessError as e:
            print(f"Error running nm on {elf_file_path}: {e}")
            print(f"stderr: {e.stderr}")
            sys.exit(1)

    def _get_nm_output(self) -> Dict[str, Symbol]:
        args = [
            "--print-size",
            "--size-sort",
            "--reverse-sort",
            "--demangle",
            "--format=bsd",
        ]
        output = Parser.run_nm(
            self.elf,
            args,
            nm=self.toolchain_prefix + "nm" if self.toolchain_prefix else "nm",
        )
        lines = output.splitlines()
        symbols = []
        symbol_pattern = re.compile(
            r"(?P<addr>[0-9a-fA-F]+)\s+(?P<size>[0-9a-fA-F]+)\s+(?P<type>\w)\s+(?P<name>.+)"
        )

        def parse_line(line: str) -> Optional[Symbol]:
            match = symbol_pattern.match(line)
            if match:
                addr = int(match.group("addr"), 16)
                size = int(match.group("size"), 16)
                type_ = match.group("type").strip()
                name = match.group("name").strip()
                return Symbol(name=name, addr=addr, size=size, symbol_type=type_)
            return None

        for line in lines:
            symbol = parse_line(line)
            if symbol:
                symbols.append(symbol)

        assert len(symbols) > 0, "No symbols found in nm output"
        if len(symbols) != len(lines):
            print(
                f"** Warning: Not all lines were parsed, check the output of nm. Parsed {len(symbols)} lines, given {len(lines)}"
            )
        if any(symbol.size == 0 for symbol in symbols):
            print("** Warning: Some symbols have zero size, check the output of nm.")

        # TODO: Populate the section and module fields from the linker map if available (-Wl,-Map=linker.map)
        return {symbol.name: symbol for symbol in symbols}

    def print(self):
        print(f"Elf: {self.elf}")

        def print_table(filter=None, filter_name=None):
            print("\nAddress\t\tSize\tType\tName")
            # Apply filter and sort symbols
            symbols_to_print = {
                name: sym
                for name, sym in self.symbols.items()
                if not filter or filter(sym)
            }
            sorted_symbols = sorted(
                symbols_to_print.items(), key=lambda x: x[1].size, reverse=True
            )

            # Print symbols and calculate total size
            size_total = 0
            for name, sym in sorted_symbols:
                print(f"{hex(sym.addr)}\t\t{sym.size}\t{sym.symbol_type}\t{sym.name}")
                size_total += sym.size

            # Print summary
            symbol_percent = len(symbols_to_print) / len(self.symbols) * 100
            print("-----")
            print(f"> Total bytes: {size_total}")
            print(
                f"Counted: {len(symbols_to_print)}/{len(self.symbols)}, {symbol_percent:0.2f}% (filter: '{filter_name}')"
            )
            print("=====\n")

        # Print tables with different filters
        def is_executorch_symbol(s):
            return "executorch" in s.name or s.name.startswith("et")

        FILTER_NAME_TO_FILTER_AND_LABEL = {
            "all": (None, "All"),
            "executorch": (is_executorch_symbol, "ExecuTorch"),
            "executorch_text": (
                lambda s: is_executorch_symbol(s) and s.symbol_type.lower() == "t",
                "ExecuTorch .text",
            ),
        }

        filter_func, label = FILTER_NAME_TO_FILTER_AND_LABEL.get(
            self.filter, FILTER_NAME_TO_FILTER_AND_LABEL["all"]
        )
        print_table(filter_func, label)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Process ELF file and linker map file."
    )
    parser.add_argument(
        "-e", "--elf-file-path", required=True, help="Path to the ELF file"
    )
    parser.add_argument(
        "-f",
        "--filter",
        required=False,
        default="all",
        help="Filter symbols by pre-defined filters",
        choices=["all", "executorch", "executorch_text"],
    )
    parser.add_argument(
        "-p",
        "--toolchain-prefix",
        required=False,
        default="",
        help="Optional toolchain prefix for nm",
    )

    args = parser.parse_args()
    p = Parser(args.elf_file_path, args.toolchain_prefix, filter=args.filter)
    p.print()
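
For reference, a minimal sketch of the BSD-format nm line the regex above matches; the address, size, and symbol name below are illustrative, not taken from a real build:

import re

# Same pattern as in run_nm.py above.
pattern = re.compile(
    r"(?P<addr>[0-9a-fA-F]+)\s+(?P<size>[0-9a-fA-F]+)\s+(?P<type>\w)\s+(?P<name>.+)"
)

# Hypothetical `nm --print-size --demangle --format=bsd` output line.
line = "000081a4 00000060 T executorch::runtime::Method::execute()"
match = pattern.match(line)
assert match is not None
print(int(match.group("addr"), 16))  # 33188 (0x81a4)
print(int(match.group("size"), 16))  # 96 (0x60)
print(match.group("type"), match.group("name"))

Lines that do not match (headers, symbols reported without a size) are simply skipped, which is what the parsed-versus-given warning in _get_nm_output accounts for.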

.github/workflows/trunk.yml

Lines changed: 54 additions & 0 deletions
@@ -231,6 +231,60 @@ jobs:
         # Run arm unit tests using the simulator
         backends/arm/test/test_arm_baremetal.sh test_pytest_ethosu_fvp

+  test-arm-cortex-m-size-test:
+    name: test-arm-cortex-m-size-test
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-arm-sdk
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        source .ci/scripts/utils.sh
+        install_executorch "--use-pt-pinned-commit"
+        .ci/scripts/setup-arm-baremetal-tools.sh
+        source examples/arm/ethos-u-scratch/setup_path.sh
+
+        # Use baremetal toolchain
+        arm-none-eabi-c++ --version
+        toolchain_cmake=examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake
+        toolchain_cmake=$(realpath ${toolchain_cmake})
+
+        # Build and test size test
+        bash test/build_size_test.sh "-DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON"
+        elf="cmake-out/test/size_test"
+
+        # Dump basic info
+        ls -al ${elf}
+        arm-none-eabi-size ${elf}
+
+        # Dump symbols
+        python .github/scripts/run_nm.py -e ${elf}
+        python .github/scripts/run_nm.py -e ${elf} -f "executorch" -p "arm-none-eabi-"
+        python .github/scripts/run_nm.py -e ${elf} -f "executorch_text" -p "arm-none-eabi-"
+
+        # Add basic guard - TODO: refine this!
+        arm-none-eabi-strip ${elf}
+        output=$(ls -la ${elf})
+        arr=($output)
+        size=${arr[4]}
+        threshold="102400" # 100KiB
+        echo "size: $size, threshold: $threshold"
+        if [[ "$size" -le "$threshold" ]]; then
+          echo "Success $size <= $threshold"
+        else
+          echo "Fail $size > $threshold"
+          exit 1
+        fi
+
   test-coreml-delegate:
     name: test-coreml-delegate
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
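
The size guard above extracts the fifth field of `ls -la` output, which is tied to the ls column layout. A sketch of an equivalent check in Python, assuming the same path and threshold as the workflow step (os.path.getsize avoids the field parsing):

import os
import sys

elf = "cmake-out/test/size_test"  # same artifact the workflow checks
threshold = 100 * 1024  # 100 KiB, matching the "102400" guard above

size = os.path.getsize(elf)
print(f"size: {size}, threshold: {threshold}")
if size > threshold:
    print(f"Fail {size} > {threshold}")
    sys.exit(1)
print(f"Success {size} <= {threshold}")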

CMakeLists.txt

Lines changed: 11 additions & 31 deletions
@@ -645,13 +645,18 @@ target_link_options_shared_lib(executorch)
 # Real integrations should supply their own YAML file that only lists the
 # operators necessary for the models that will run.
 #
+if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
+  # find pytorch lib here to make it available to all
+  # sub-directories. Find it before including portable so that
+  # optimized_portable_kernels can use it.
+  find_package_torch_headers()
+endif()
+
 if(BUILD_EXECUTORCH_PORTABLE_OPS)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
 endif()

 if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
-  # find pytorch lib here to make it available to all sub-directories
-  find_package_torch_headers()
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
 endif()

@@ -764,10 +769,6 @@ if(EXECUTORCH_BUILD_EXTENSION_MODULE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
 endif()

-if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training)
-endif()
-
 if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util)
 endif()

@@ -872,34 +873,13 @@ if(EXECUTORCH_BUILD_PYBIND)

   if(EXECUTORCH_BUILD_EXTENSION_TRAINING)

-    set(_pybind_training_dep_libs
-      ${TORCH_PYTHON_LIBRARY}
-      etdump
-      executorch
-      util
-      torch
-      extension_training
-    )
-
-    if(EXECUTORCH_BUILD_XNNPACK)
-      # need to explicitly specify XNNPACK and microkernels-prod
-      # here otherwise uses XNNPACK and microkernel-prod symbols from libtorch_cpu
-      list(APPEND _pybind_training_dep_libs xnnpack_backend XNNPACK microkernels-prod)
-    endif()
-
-    # pybind training
-    pybind11_add_module(_training_lib SHARED extension/training/pybindings/_training_lib.cpp)
-
-    target_include_directories(_training_lib PRIVATE ${TORCH_INCLUDE_DIRS})
-    target_compile_options(_training_lib PUBLIC ${_pybind_compile_options})
-    target_link_libraries(_training_lib PRIVATE ${_pybind_training_dep_libs})
-
-    install(TARGETS _training_lib
-      LIBRARY DESTINATION executorch/extension/training/pybindings
-    )
   endif()
 endif()

+if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training)
+endif()
+
 if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
   # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops)
backends/apple/coreml/test/test_coreml_quantizer.py

Lines changed: 3 additions & 1 deletion
@@ -32,7 +32,9 @@ def quantize_and_compare(
 ) -> None:
     assert quantization_type in {"PTQ", "QAT"}

-    pre_autograd_aten_dialect = export_for_training(model, example_inputs).module()
+    pre_autograd_aten_dialect = export_for_training(
+        model, example_inputs, strict=True
+    ).module()

     quantization_config = LinearQuantizerConfig.from_dict(
         {
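
This test and the MPS change below now pass strict=True explicitly rather than relying on the default. A minimal sketch of the call pattern, using a trivial stand-in module (TinyModel is hypothetical):

import torch
from torch.export import export_for_training


class TinyModel(torch.nn.Module):
    def forward(self, x):
        return x + 1


# Passing strict=True pins TorchDynamo-based graph capture explicitly, so the
# tests keep the same behavior even if the default changes in a future release.
ep = export_for_training(TinyModel(), (torch.randn(2),), strict=True)
graph_module = ep.module()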

backends/apple/mps/test/test_mps_utils.py

Lines changed: 1 addition & 1 deletion
@@ -207,7 +207,7 @@ def lower_module_and_test_output(
     expected_output = model(*sample_inputs)

     model = torch.export.export_for_training(
-        model, sample_inputs, dynamic_shapes=dynamic_shapes
+        model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True
     ).module()

     edge_program = export_to_edge(

backends/arm/operators/op_table.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def define_node(

         if inputs[0].dtype not in (ts.DType.INT8, ts.DType.INT16):
             raise ValueError(
-                f"TOSA.TABLE only supports int8 or int16 inputs, got {ts.DTypeNames[inputs[0]]}"
+                f"TOSA.TABLE only supports int8 or int16 inputs, got {ts.DTypeNames[inputs[0].dtype]}"
             )

         table = self._exported_program.state_dict[node.name]  # type: ignore[union-attr]
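
The fix indexes ts.DTypeNames by the dtype enum value instead of the argument object itself. Schematically, with a stand-in table (DTypeNames and FakeArg here are illustrative, not the serializer's real definitions):

# Stand-in for ts.DTypeNames: a sequence indexed by the integer DType value.
DTypeNames = ["UNKNOWN", "BOOL", "INT8", "INT16", "INT32"]


class FakeArg:
    """Hypothetical stand-in for a TOSA input argument."""

    def __init__(self, dtype: int):
        self.dtype = dtype


inputs = [FakeArg(dtype=2)]
# DTypeNames[inputs[0]] raises TypeError (list indices must be integers);
# indexing by the .dtype field, as in the fix, works:
print(DTypeNames[inputs[0].dtype])  # -> INT8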

backends/arm/test/misc/test_debug_feats.py

Lines changed: 6 additions & 9 deletions
@@ -192,16 +192,13 @@ def test_collate_tosa_BI_tests(self):
             .to_edge_transform_and_lower()
             .to_executorch()
         )
+
+        test_collate_dir = "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests"
         # test that the output directory is created and contains the expected files
-        assert os.path.exists(
-            "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests"
-        )
-        assert os.path.exists(
-            "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests/output_tag6_TOSA-0.80+BI.tosa"
-        )
-        assert os.path.exists(
-            "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests/desc_tag6_TOSA-0.80+BI.json"
-        )
+        assert os.path.exists(test_collate_dir)
+
+        for file in os.listdir(test_collate_dir):
+            assert file.endswith(("TOSA-0.80+BI.json", "TOSA-0.80+BI.tosa"))

         os.environ.pop("TOSA_TESTCASES_BASE_PATH")
         shutil.rmtree("test_collate_tosa_tests", ignore_errors=True)
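
The rewritten assertion leans on str.endswith accepting a tuple of suffixes, returning True if any of them matches; a quick illustration with made-up file names:

suffixes = ("TOSA-0.80+BI.json", "TOSA-0.80+BI.tosa")
assert "desc_tag6_TOSA-0.80+BI.json".endswith(suffixes)
assert "output_tag6_TOSA-0.80+BI.tosa".endswith(suffixes)
assert not "notes.txt".endswith(suffixes)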

backends/arm/test/misc/test_partition_decomposed_quantized_ops.py

Lines changed: 6 additions & 1 deletion
@@ -145,7 +145,12 @@ def test_linear_residaul_tosa_MI(test_data: input_t1):
     pipeline.run()


-@common.parametrize("test_data", test_data)
+@common.parametrize(
+    "test_data",
+    test_data,
+    {"3d_rand": "MLETORCH-855: Issue with Quantization folding."},
+    strict=False,
+)
 def test_linear_residual_tosa_BI(test_data: input_t1):
     pipeline = TosaPipelineBI[input_t1](
         LinearResidualModule(),
