Bump torchao + add unit tests for torchao kernels #9396

Merged: 5 commits, merged on Mar 20, 2025

85 changes: 85 additions & 0 deletions .ci/scripts/test_llama_torchao_lowbit.sh
@@ -0,0 +1,85 @@
#!/bin/bash
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu

source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"

export EXECUTORCH_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.."

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
PYTHON_EXECUTABLE=python3
fi

which "${PYTHON_EXECUTABLE}"

# Update tokenizers submodule
pushd $EXECUTORCH_ROOT/extension/llm/tokenizers
echo "Update tokenizers submodule"
git submodule update --init
popd

# Install ET with CMake
cmake -DPYTHON_EXECUTABLE=python \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DEXECUTORCH_ENABLE_LOGGING=1 \
-DCMAKE_BUILD_TYPE=Release \
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_BUILD_XNNPACK=OFF \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-Bcmake-out .
cmake --build cmake-out -j16 --target install --config Release

# Install llama runner with torchao
cmake -DPYTHON_EXECUTABLE=python \
-DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \
-DCMAKE_BUILD_TYPE=Release \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_XNNPACK=OFF \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_TORCHAO=ON \
-Bcmake-out/examples/models/llama \
examples/models/llama
cmake --build cmake-out/examples/models/llama -j16 --config Release

# Download stories110M model artifacts
download_stories_model_artifacts

echo "Creating tokenizer.bin"
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin

# Export model
LLAMA_CHECKPOINT=stories110M.pt
LLAMA_PARAMS=params.json
MODEL_OUT=model.pte
TOKENIZER=tokenizer.bin

# Set low-bit quantization parameters
QLINEAR_BITWIDTH=3 # Can be 1-8
QLINEAR_GROUP_SIZE=128 # Must be multiple of 16
QEMBEDDING_BITWIDTH=4 # Can be 1-8
QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16

${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \
--checkpoint "${LLAMA_CHECKPOINT:?}" \
--params "${LLAMA_PARAMS:?}" \
-kv \
--use_sdpa_with_kv_cache \
--output_name=${MODEL_OUT} \
-qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
--group_size ${QLINEAR_GROUP_SIZE} \
-E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
--disable_dynamic_shape \
-d fp32

# Test run
./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time,"
Contributor

Should we do a simple non-brittle sanity check? e.g. output length > 0 or something

Contributor Author

I think the main concern is whether it runs. If it runs, it will produce output.
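A minimal sketch of the kind of non-brittle check suggested above (not part of this PR; it assumes llama_main writes the generated text to stdout):

```
# Hypothetical follow-up: fail the script if the runner produces no output.
OUTPUT=$(./cmake-out/examples/models/llama/llama_main \
  --model_path="${MODEL_OUT}" \
  --tokenizer_path="${TOKENIZER}" \
  --prompt="Once upon a time,")
if [[ -z "${OUTPUT}" ]]; then
  echo "Sanity check failed: runner produced no output"
  exit 1
fi
```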

24 changes: 22 additions & 2 deletions .github/workflows/trunk.yml
@@ -23,8 +23,8 @@ jobs:
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
strategy:
matrix:
# Mac runners are expensive and limited, and non reliable.
# Do some basic testing for macos jobs, and rely mostly on
# test-models-linux-aarch64 job instead.
model: [emformer_join, ic4, llama2, mobilebert, mv3, resnet50, vit, w2l]
backend: [xnnpack-quantization-delegation]
@@ -288,6 +288,26 @@ jobs:
# Test ANE llama
${CONDA_RUN} sh .ci/scripts/test_ane_static_llama.sh

  test-llama-torchao-lowbit:
    name: test-llama-torchao-lowbit
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      runner: macos-m1-stable
      python-version: '3.11'
      submodules: 'true'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        set -eux
        bash .ci/scripts/setup-conda.sh
        eval "$(conda shell.bash hook)"

        # Install requirements
        ${CONDA_RUN} python install_executorch.py
        ${CONDA_RUN} sh examples/models/llama/install_requirements.sh

        # Run test
        ${CONDA_RUN} sh .ci/scripts/test_llama_torchao_lowbit.sh

test-llama-runner-linux:
# Test Both linux x86 and linux aarch64
name: test-llama-runner-linux
23 changes: 15 additions & 8 deletions examples/models/llama/CMakeLists.txt
@@ -115,16 +115,23 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
endif()

if(EXECUTORCH_BUILD_TORCHAO)
set(TORCHAO_BUILD_EXECUTORCH_OPS ON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental)
target_link_options_shared_lib(torchao_ops_executorch)
list(APPEND link_libraries torchao_ops_executorch)
# Currently only enable this on Arm-based Macs
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
set(TORCHAO_BUILD_EXECUTORCH_OPS ON)
set(TORCHAO_BUILD_CPU_AARCH64 ON)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps
${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental/ops/mps)
target_link_options_shared_lib(torchao_ops_mps_executorch)
list(APPEND link_libraries torchao_ops_mps_executorch)
${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental
${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental
)
target_link_options_shared_lib(torchao_ops_executorch)
list(APPEND link_libraries torchao_ops_executorch)
if(EXECUTORCH_BUILD_MPS)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps
${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental/ops/mps)
target_link_options_shared_lib(torchao_ops_mps_executorch)
list(APPEND link_libraries torchao_ops_mps_executorch)
endif()
endif()
endif()

73 changes: 73 additions & 0 deletions examples/models/llama/README.md
@@ -380,6 +380,79 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de
### Android
Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android.html) for full instructions on building the Android LLAMA Demo App.

## Running with low-bit kernels

We now give instructions for quantizing and running your model with low-bit kernels. These kernels are still experimental and require that you develop on an Arm-based Mac. Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results. Currently, dynamic shapes must be disabled when exporting a model with these kernels.

First, export your model for low-bit quantization (step 2 above):

```
# Set these paths to point to the downloaded files
LLAMA_CHECKPOINT=path/to/checkpoint.pth
LLAMA_PARAMS=path/to/params.json

# Set low-bit quantization parameters
QLINEAR_BITWIDTH=3 # Can be 1-8
QLINEAR_GROUP_SIZE=128 # Must be multiple of 16
QEMBEDDING_BITWIDTH=4 # Can be 1-8
QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16

python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${LLAMA_CHECKPOINT:?}" \
--params "${LLAMA_PARAMS:?}" \
-kv \
--use_sdpa_with_kv_cache \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
--output_name="llama3_2.pte" \
-qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
--group_size ${QLINEAR_GROUP_SIZE} \
-E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
--disable_dynamic_shape \
-d fp32
```

Once the model is exported, we need to build ExecuTorch and the runner with the low-bit kernels.

The first step is to install ExecuTorch (the same as step 3.1 above):

```
cmake -DPYTHON_EXECUTABLE=python \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DEXECUTORCH_ENABLE_LOGGING=1 \
-DCMAKE_BUILD_TYPE=Release \
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_BUILD_XNNPACK=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-Bcmake-out .
cmake --build cmake-out -j16 --target install --config Release
```

Next install the llama runner with torchao kernels enabled (similar to step 3.2 above):

```
cmake -DPYTHON_EXECUTABLE=python \
-DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \
-DCMAKE_BUILD_TYPE=Release \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_XNNPACK=OFF \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_TORCHAO=ON \
-Bcmake-out/examples/models/llama \
examples/models/llama
cmake --build cmake-out/examples/models/llama -j16 --config Release
```

Finally run your model (similar to step 3.3 above):

```
cmake-out/examples/models/llama/llama_main --model_path=<model pte file> --tokenizer_path=<tokenizer.model> --prompt=<prompt>
```

## Utility tools for Llama enablement

27 changes: 16 additions & 11 deletions examples/models/llama/source_transformation/quantize.py
@@ -98,18 +98,24 @@ def quantize( # noqa C901
matches = re.findall(pattern, qmode)
assert len(matches) == 1, f"Expected 1 match for pattern but got {len(matches)}"
bitwidth = int(matches[0][0])
_load_torchao_aten_lib(libname="libtorchao_ops_aten")
from torchao.experimental.quant_api import Int8DynActIntxWeightLinearQuantizer

with torch.no_grad():
model = Int8DynActIntxWeightLinearQuantizer(
device="cpu",
precision=torch.float32,
groupsize=group_size,
bitwidth=bitwidth,
has_weight_zeros=False,
).quantize(model)
from torchao.experimental.quant_api import Int8DynamicActivationIntxWeightConfig
from torchao.quantization.granularity import PerGroup, PerRow
from torchao.quantization.quant_api import quantize_
from torchao.utils import unwrap_tensor_subclass

with torch.no_grad():
quantize_(
model,
Int8DynamicActivationIntxWeightConfig(
weight_dtype=getattr(torch, f"int{bitwidth}"),
granularity=(
PerRow() if group_size in [0, -1] else PerGroup(group_size)
),
has_weight_zeros=False,
),
)
model = unwrap_tensor_subclass(model)
if verbose:
print("quantized model:", model)
return model
@@ -752,7 +758,6 @@ def get_quant_embedding_transform(args):
bitwidth, group_size = args.embedding_quantize.split(":")[1].split(",")
group_size = int(group_size)
bitwidth = int(bitwidth)
_load_torchao_aten_lib(libname="libtorchao_ops_aten")
from torchao.experimental.quant_api import IntxWeightEmbeddingQuantizer

def _torchao_embedding_quantizer(model):
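For reference, a minimal sketch of how the new torchao flow used in `quantize()` above can be exercised on a toy module. This is illustrative only: the toy model and parameter values are assumptions, and the calls mirror the diff, assuming the torchao version pinned by this PR.

```
import torch
import torch.nn as nn

from torchao.experimental.quant_api import Int8DynamicActivationIntxWeightConfig
from torchao.quantization.granularity import PerGroup, PerRow
from torchao.quantization.quant_api import quantize_
from torchao.utils import unwrap_tensor_subclass

# Toy module standing in for the llama model (assumption, for illustration).
model = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 64))

bitwidth = 3      # corresponds to -qmode "torchao:8da3w"
group_size = 128  # corresponds to --group_size 128

with torch.no_grad():
    quantize_(
        model,
        Int8DynamicActivationIntxWeightConfig(
            weight_dtype=getattr(torch, f"int{bitwidth}"),
            # group_size of 0 or -1 selects per-row granularity; otherwise per-group
            granularity=PerRow() if group_size in [0, -1] else PerGroup(group_size),
            has_weight_zeros=False,
        ),
    )
    model = unwrap_tensor_subclass(model)
```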
6 changes: 4 additions & 2 deletions install_requirements.py
@@ -6,6 +6,7 @@
# LICENSE file in the root directory of this source tree.

import argparse
import os
import platform
import re
import subprocess
@@ -117,6 +118,8 @@ def install_requirements(use_pytorch_nightly):

# Install packages directly from local copy instead of pypi.
# This is usually not recommended.
new_env = os.environ.copy()
new_env["USE_CPP"] = "1" # install torchao kernels
subprocess.run(
[
sys.executable,
@@ -127,6 +130,7 @@
"--no-build-isolation",
*LOCAL_REQUIREMENTS,
],
env=new_env,
check=True,
)

@@ -143,8 +147,6 @@ def main(args):


if __name__ == "__main__":
import os

# Before doing anything, cd to the directory containing this script.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
if not python_is_compatible():
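For reference, a rough manual equivalent of what the USE_CPP=1 change does when installing the local torchao copy (a sketch; the exact local package path is an assumption):

```
# Hypothetical manual equivalent: build torchao's C++ kernels while
# installing the pinned submodule from the local checkout.
USE_CPP=1 pip install --no-build-isolation ./third-party/ao
```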
2 changes: 1 addition & 1 deletion third-party/ao
Submodule ao updated 493 files