Bump torchao + add unit tests for torchao kernels #9396

Merged: 5 commits, merged on Mar 20, 2025

85 changes: 85 additions & 0 deletions .ci/scripts/test_llama_torchao_lowbit.sh
@@ -0,0 +1,85 @@
#!/bin/bash
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu

source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"

export EXECUTORCH_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.."

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
PYTHON_EXECUTABLE=python3
fi

which "${PYTHON_EXECUTABLE}"

# Update tokenizers submodule
pushd $EXECUTORCH_ROOT/extension/llm/tokenizers
echo "Update tokenizers submodule"
git submodule update --init
popd

# Install ET with CMake
cmake -DPYTHON_EXECUTABLE=python \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DEXECUTORCH_ENABLE_LOGGING=1 \
-DCMAKE_BUILD_TYPE=Release \
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_BUILD_XNNPACK=OFF \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-Bcmake-out .
cmake --build cmake-out -j16 --target install --config Release

# Install llama runner with torchao
cmake -DPYTHON_EXECUTABLE=python \
-DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \
-DCMAKE_BUILD_TYPE=Release \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_XNNPACK=OFF \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_TORCHAO=ON \
-Bcmake-out/examples/models/llama \
examples/models/llama
cmake --build cmake-out/examples/models/llama -j16 --config Release

# Download stories110M model artifacts
download_stories_model_artifacts

echo "Creating tokenizer.bin"
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin

# Export model
LLAMA_CHECKPOINT=stories110M.pt
LLAMA_PARAMS=params.json
MODEL_OUT=model.pte
TOKENIZER=tokenizer.bin

# Set low-bit quantization parameters
QLINEAR_BITWIDTH=3 # Can be 1-8
QLINEAR_GROUP_SIZE=128 # Must be multiple of 16
QEMBEDDING_BITWIDTH=4 # Can be 1-8
QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16

${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \
--checkpoint "${LLAMA_CHECKPOINT:?}" \
--params "${LLAMA_PARAMS:?}" \
-kv \
--use_sdpa_with_kv_cache \
--output_name=${MODEL_OUT} \
-qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
--group_size ${QLINEAR_GROUP_SIZE} \
-E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
--disable_dynamic_shape \
-d fp32

# Test run
./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time,"
Contributor

Should we do a simple non-brittle sanity check? e.g. output length > 0 or something

Contributor Author

I think the main concern is whether it runs. If it runs, it will produce output.
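A minimal sketch of the kind of non-brittle check suggested above (not part of this PR; it assumes llama_main writes the generated text to stdout):

```
# Hypothetical follow-up: fail the script if the runner produces no output.
OUTPUT=$(./cmake-out/examples/models/llama/llama_main \
  --model_path="${MODEL_OUT}" \
  --tokenizer_path="${TOKENIZER}" \
  --prompt="Once upon a time,")
if [[ -z "${OUTPUT}" ]]; then
  echo "Sanity check failed: runner produced no output"
  exit 1
fi
```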

24 changes: 22 additions & 2 deletions .github/workflows/trunk.yml
@@ -23,8 +23,8 @@ jobs:
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
strategy:
matrix:
# Mac runners are expensive and limited, and non reliable.
# Do some basic testing for macos jobs, and rely mostly on
# test-models-linux-aarch64 job instead.
model: [emformer_join, ic4, llama2, mobilebert, mv3, resnet50, vit, w2l]
backend: [xnnpack-quantization-delegation]
@@ -288,6 +288,26 @@ jobs:
# Test ANE llama
${CONDA_RUN} sh .ci/scripts/test_ane_static_llama.sh

  test-llama-torchao-lowbit:
    name: test-llama-torchao-lowbit
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      runner: macos-m1-stable
      python-version: '3.11'
      submodules: 'true'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        set -eux
        bash .ci/scripts/setup-conda.sh
        eval "$(conda shell.bash hook)"

        # Install requirements
        ${CONDA_RUN} python install_executorch.py
        ${CONDA_RUN} sh examples/models/llama/install_requirements.sh

        # Run test
        ${CONDA_RUN} sh .ci/scripts/test_llama_torchao_lowbit.sh

test-llama-runner-linux:
# Test Both linux x86 and linux aarch64
name: test-llama-runner-linux
23 changes: 15 additions & 8 deletions examples/models/llama/CMakeLists.txt
@@ -115,16 +115,23 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
endif()

if(EXECUTORCH_BUILD_TORCHAO)
set(TORCHAO_BUILD_EXECUTORCH_OPS ON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental)
target_link_options_shared_lib(torchao_ops_executorch)
list(APPEND link_libraries torchao_ops_executorch)
# Currently only enable this on Arm-based Macs
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
set(TORCHAO_BUILD_EXECUTORCH_OPS ON)
set(TORCHAO_BUILD_CPU_AARCH64 ON)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps
${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental/ops/mps)
target_link_options_shared_lib(torchao_ops_mps_executorch)
list(APPEND link_libraries torchao_ops_mps_executorch)
${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental
${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental
)
target_link_options_shared_lib(torchao_ops_executorch)
list(APPEND link_libraries torchao_ops_executorch)
if(EXECUTORCH_BUILD_MPS)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps
${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental/ops/mps)
target_link_options_shared_lib(torchao_ops_mps_executorch)
list(APPEND link_libraries torchao_ops_mps_executorch)
endif()
endif()
endif()

73 changes: 73 additions & 0 deletions examples/models/llama/README.md
@@ -380,6 +380,79 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de
### Android
Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android.html) for full instructions on building the Android LLAMA Demo App.

## Running with low-bit kernels

We now give instructions for quantizing and running your model with low-bit kernels. These kernels are still experimental and require that you develop on an Arm-based Mac. Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results. Currently, dynamic shapes must be disabled when exporting a model with these kernels.

First, export your model for low-bit quantization (step 2 above):

```
# Set these paths to point to the downloaded files
LLAMA_CHECKPOINT=path/to/checkpoint.pth
LLAMA_PARAMS=path/to/params.json

# Set low-bit quantization parameters
QLINEAR_BITWIDTH=3 # Can be 1-8
QLINEAR_GROUP_SIZE=128 # Must be multiple of 16
QEMBEDDING_BITWIDTH=4 # Can be 1-8
QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16

python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${LLAMA_CHECKPOINT:?}" \
--params "${LLAMA_PARAMS:?}" \
-kv \
--use_sdpa_with_kv_cache \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
--output_name="llama3_2.pte" \
-qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
--group_size ${QLINEAR_GROUP_SIZE} \
-E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
--disable_dynamic_shape \
-d fp32
```

Once the model is exported, we need to build ExecuTorch and the runner with the low-bit kernels.

The first step is to install ExecuTorch (the same as step 3.1 above):

```
cmake -DPYTHON_EXECUTABLE=python \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DEXECUTORCH_ENABLE_LOGGING=1 \
-DCMAKE_BUILD_TYPE=Release \
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_BUILD_XNNPACK=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-Bcmake-out .
cmake --build cmake-out -j16 --target install --config Release
```

Next install the llama runner with torchao kernels enabled (similar to step 3.2 above):

```
cmake -DPYTHON_EXECUTABLE=python \
-DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \
-DCMAKE_BUILD_TYPE=Release \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_XNNPACK=OFF \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_TORCHAO=ON \
-Bcmake-out/examples/models/llama \
examples/models/llama
cmake --build cmake-out/examples/models/llama -j16 --config Release
```

Finally run your model (similar to step 3.3 above):

```
cmake-out/examples/models/llama/llama_main --model_path=<model pte file> --tokenizer_path=<tokenizer.model> --prompt=<prompt>
```

## Utility tools for Llama enablement

27 changes: 16 additions & 11 deletions examples/models/llama/source_transformation/quantize.py
@@ -98,18 +98,24 @@ def quantize( # noqa C901
matches = re.findall(pattern, qmode)
assert len(matches) == 1, f"Expected 1 match for pattern but got {len(matches)}"
bitwidth = int(matches[0][0])
_load_torchao_aten_lib(libname="libtorchao_ops_aten")
from torchao.experimental.quant_api import Int8DynActIntxWeightLinearQuantizer

with torch.no_grad():
model = Int8DynActIntxWeightLinearQuantizer(
device="cpu",
precision=torch.float32,
groupsize=group_size,
bitwidth=bitwidth,
has_weight_zeros=False,
).quantize(model)
from torchao.experimental.quant_api import Int8DynamicActivationIntxWeightConfig
from torchao.quantization.granularity import PerGroup, PerRow
from torchao.quantization.quant_api import quantize_
from torchao.utils import unwrap_tensor_subclass

with torch.no_grad():
quantize_(
model,
Int8DynamicActivationIntxWeightConfig(
weight_dtype=getattr(torch, f"int{bitwidth}"),
granularity=(
PerRow() if group_size in [0, -1] else PerGroup(group_size)
),
has_weight_zeros=False,
),
)
model = unwrap_tensor_subclass(model)
if verbose:
print("quantized model:", model)
return model
@@ -752,7 +758,6 @@ def get_quant_embedding_transform(args):
bitwidth, group_size = args.embedding_quantize.split(":")[1].split(",")
group_size = int(group_size)
bitwidth = int(bitwidth)
_load_torchao_aten_lib(libname="libtorchao_ops_aten")
from torchao.experimental.quant_api import IntxWeightEmbeddingQuantizer

def _torchao_embedding_quantizer(model):
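For reference, a minimal sketch of how the new torchao flow used in `quantize()` above can be exercised on a toy module. This is illustrative only: the toy model and parameter values are assumptions, and the calls mirror the diff, assuming the torchao version pinned by this PR.

```
import torch
import torch.nn as nn

from torchao.experimental.quant_api import Int8DynamicActivationIntxWeightConfig
from torchao.quantization.granularity import PerGroup, PerRow
from torchao.quantization.quant_api import quantize_
from torchao.utils import unwrap_tensor_subclass

# Toy module standing in for the llama model (assumption, for illustration).
model = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 64))

bitwidth = 3      # corresponds to -qmode "torchao:8da3w"
group_size = 128  # corresponds to --group_size 128

with torch.no_grad():
    quantize_(
        model,
        Int8DynamicActivationIntxWeightConfig(
            weight_dtype=getattr(torch, f"int{bitwidth}"),
            # group_size of 0 or -1 selects per-row granularity; otherwise per-group
            granularity=PerRow() if group_size in [0, -1] else PerGroup(group_size),
            has_weight_zeros=False,
        ),
    )
    model = unwrap_tensor_subclass(model)
```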
6 changes: 4 additions & 2 deletions install_requirements.py
@@ -6,6 +6,7 @@
# LICENSE file in the root directory of this source tree.

import argparse
import os
import platform
import re
import subprocess
@@ -117,6 +118,8 @@ def install_requirements(use_pytorch_nightly):

# Install packages directly from local copy instead of pypi.
# This is usually not recommended.
new_env = os.environ.copy()
new_env["USE_CPP"] = "1" # install torchao kernels
subprocess.run(
[
sys.executable,
@@ -127,6 +130,7 @@
"--no-build-isolation",
*LOCAL_REQUIREMENTS,
],
env=new_env,
check=True,
)

@@ -143,8 +147,6 @@ def main(args):


if __name__ == "__main__":
import os

# Before doing anything, cd to the directory containing this script.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
if not python_is_compatible():
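For reference, a rough manual equivalent of what the USE_CPP=1 change does when installing the local torchao copy (a sketch; the exact local package path is an assumption):

```
# Hypothetical manual equivalent: build torchao's C++ kernels while
# installing the pinned submodule from the local checkout.
USE_CPP=1 pip install --no-build-isolation ./third-party/ao
```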
2 changes: 1 addition & 1 deletion third-party/ao
Submodule ao updated 493 files