pytorch
diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
Lines changed: 1 addition & 1 deletion
diff --git a/.ci/docker/conda-env-ci.txt b/.ci/docker/conda-env-ci.txt
Lines changed: 1 addition & 1 deletion
diff --git a/.ci/scripts/test_llama_torchao_lowbit.sh b/.ci/scripts/test_llama_torchao_lowbit.sh
Lines changed: 85 additions & 0 deletions
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
Lines changed: 22 additions & 2 deletions
diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh
Lines changed: 2 additions & 2 deletions
diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp
Lines changed: 8 additions & 2 deletions
diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
Lines changed: 8 additions & 4 deletions
diff --git a/backends/vulkan/runtime/vk_api/Pipeline.cpp b/backends/vulkan/runtime/vk_api/Pipeline.cpp
Lines changed: 8 additions & 16 deletions
diff --git a/backends/vulkan/runtime/vk_api/Pipeline.h b/backends/vulkan/runtime/vk_api/Pipeline.h
Lines changed: 1 addition & 4 deletions
diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt
Lines changed: 4 additions & 0 deletions
diff --git a/backends/xnnpack/operators/__init__.py b/backends/xnnpack/operators/__init__.py
Lines changed: 1 addition & 2 deletions
diff --git a/backends/xnnpack/operators/op_dequantize_per_tensor.py b/backends/xnnpack/operators/op_dequantize_per_tensor.py
Lines changed: 0 additions & 70 deletions
@@ -1 +1 @@
-08434df1f2f88c9770e59246caa2ff9c6f613270
+295f2ed4d103017f7e19a7b8263ece606cd629db
@@ -1,4 +1,4 @@
-cmake=3.22.1
+cmake=3.26.4
 ninja=1.10.2
 libuv
 llvm-openmp
 
@@ -0,0 +1,85 @@
+#!/bin/bash
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# Smoke test: build ExecuTorch + the llama runner with torchao low-bit kernels, export stories110M, run it.
+set -exu  # -e: stop on first failure, -x: trace every command, -u: unset variables are errors
+
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+export EXECUTORCH_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.."
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3  # default interpreter when the caller did not pick one
+fi
+
+which "${PYTHON_EXECUTABLE}"  # log the interpreter in use; aborts here if it is not on PATH
+
+# Initialize/update the tokenizers submodule from inside its own directory
+pushd "${EXECUTORCH_ROOT}/extension/llm/tokenizers"
+echo "Update tokenizers submodule"
+git submodule update --init
+popd
+
+# Configure, build and install ExecuTorch into ./cmake-out (XNNPACK off), using the interpreter selected above
+cmake -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \
+    -DCMAKE_INSTALL_PREFIX=cmake-out \
+    -DEXECUTORCH_ENABLE_LOGGING=1 \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+    -DEXECUTORCH_BUILD_XNNPACK=OFF \
+    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+    -Bcmake-out .
+cmake --build cmake-out -j16 --target install --config Release
+
+# Build the llama runner with torchao; CMAKE_PREFIX_PATH is the interpreter's site-packages (sysconfig, as distutils is gone in Python 3.12)
+cmake -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \
+    -DCMAKE_PREFIX_PATH="$("${PYTHON_EXECUTABLE}" -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])')" \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_XNNPACK=OFF \
+    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+    -DEXECUTORCH_BUILD_TORCHAO=ON \
+    -Bcmake-out/examples/models/llama \
+    examples/models/llama
+cmake --build cmake-out/examples/models/llama -j16 --config Release
+
+# Download stories llama110m artifacts (helper presumably provided by the utils.sh sourced above)
+download_stories_model_artifacts
+
+echo "Creating tokenizer.bin"
+"${PYTHON_EXECUTABLE}" -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
+
+# Export model: input checkpoint/params and the output artifact names used below
+LLAMA_CHECKPOINT=stories110M.pt
+LLAMA_PARAMS=params.json
+MODEL_OUT=model.pte
+TOKENIZER=tokenizer.bin
+
+# Set low-bit quantization parameters
+QLINEAR_BITWIDTH=3 # Can be 1-8
+QLINEAR_GROUP_SIZE=128 # Must be multiple of 16
+QEMBEDDING_BITWIDTH=4 # Can be 1-8
+QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16
+
+"${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama \
+    --checkpoint "${LLAMA_CHECKPOINT:?}" \
+    --params "${LLAMA_PARAMS:?}" \
+    -kv \
+    --use_sdpa_with_kv_cache \
+    --output_name="${MODEL_OUT}" \
+    -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
+    --group_size "${QLINEAR_GROUP_SIZE}" \
+    -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
+    --disable_dynamic_shape \
+    -d fp32
+
+# Test run: execute the freshly built runner on the exported model with a short prompt
+./cmake-out/examples/models/llama/llama_main --model_path="${MODEL_OUT}" --tokenizer_path="${TOKENIZER}" --prompt="Once upon a time,"
@@ -2,7 +2,7 @@ name: android-perf
 
 on:
   schedule:
-    - cron: 0 0 * * *
+    - cron: 0 0,8,16 * * *
   pull_request:
     paths:
       - .github/workflows/android-perf.yml
 
@@ -23,8 +23,8 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     strategy:
       matrix:
-        # Mac runners are expensive and limited, and non reliable. 
-        # Do some basic testing for macos jobs, and rely mostly on 
+        # Mac runners are expensive, limited, and unreliable.
+        # Do some basic testing for macos jobs, and rely mostly on
         # test-models-linux-aarch64 job instead.
         model: [emformer_join, ic4, llama2, mobilebert, mv3, resnet50, vit, w2l]
         backend: [xnnpack-quantization-delegation]
@@ -288,6 +288,26 @@ jobs:
         # Test ANE llama
         ${CONDA_RUN} sh .ci/scripts/test_ane_static_llama.sh
 
+  test-llama-torchao-lowbit:
+    name: test-llama-torchao-lowbit
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m1-stable
+      python-version: '3.11'
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+        bash .ci/scripts/setup-conda.sh
+        eval "$(conda shell.bash hook)"
+
+        # Install requirements
+        ${CONDA_RUN} python install_executorch.py
+        ${CONDA_RUN} sh examples/models/llama/install_requirements.sh
+
+        # Run test
+        ${CONDA_RUN} sh .ci/scripts/test_llama_torchao_lowbit.sh
+
   test-llama-runner-linux:
     # Test Both linux x86 and linux aarch64
     name: test-llama-runner-linux
 
@@ -130,13 +130,13 @@ test_models_ethosu_fvp() { # End to End model tests using model_test.py
     echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U55"
     python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u55-128 --model=mv2  --extra_flags="-DET_ATOL=2.00 -DET_RTOL=2.00"
     python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u55-64  --model=mv3  --extra_flags="-DET_ATOL=5.00 -DET_RTOL=5.00"
-    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u55-256 --model=lstm --extra_flags="-DET_ATOL=0.02 -DET_RTOL=0.02"
+    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u55-256 --model=lstm --extra_flags="-DET_ATOL=0.03 -DET_RTOL=0.03"
 
     # Ethos-U85
     echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U85"
     python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-256  --model=mv2  --extra_flags="-DET_ATOL=2.00 -DET_RTOL=2.00"
     python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-1024 --model=mv3  --extra_flags="-DET_ATOL=5.00 -DET_RTOL=5.00"
-    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-128  --model=lstm --extra_flags="-DET_ATOL=0.02 -DET_RTOL=0.02"
+    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-128  --model=lstm --extra_flags="-DET_ATOL=0.03 -DET_RTOL=0.03"
     echo "${TEST_SUITE_NAME}: PASS"
     }
 
 
@@ -124,11 +124,17 @@ vkapi::DescriptorSet Context::get_descriptor_set(
   VkPipelineLayout pipeline_layout =
       pipeline_layout_cache().retrieve(shader_layout, push_constants_size);
 
+  vkapi::SpecVarList spec_constants = {
+      SV(local_workgroup_size[0u]),
+      SV(local_workgroup_size[1u]),
+      SV(local_workgroup_size[2u])};
+
+  spec_constants.append(additional_constants);
+
   VkPipeline pipeline = pipeline_cache().retrieve(
       {pipeline_layout_cache().retrieve(shader_layout, push_constants_size),
        shader_cache().retrieve(shader_descriptor),
-       additional_constants,
-       local_workgroup_size});
+       spec_constants});
 
   cmd_.bind_pipeline(pipeline, pipeline_layout, local_workgroup_size);
 
 
@@ -674,7 +674,8 @@ utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
 }
 
 const vkapi::BufferBindInfo vTensor::sizes_ubo() {
-  const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment();
+  const size_t size_per_ubo =
+      storage_.context_->adapter_ptr()->min_ubo_alignment();
   const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo;
   if (!uniforms_.buffer()) {
     uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true);
@@ -692,7 +693,8 @@ const vkapi::BufferBindInfo vTensor::sizes_ubo() {
 }
 
 const vkapi::BufferBindInfo vTensor::strides_ubo() {
-  const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment();
+  const size_t size_per_ubo =
+      storage_.context_->adapter_ptr()->min_ubo_alignment();
   const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo;
   if (!uniforms_.buffer()) {
     uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true);
@@ -712,7 +714,8 @@ const vkapi::BufferBindInfo vTensor::strides_ubo() {
 }
 
 const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
-  const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment();
+  const size_t size_per_ubo =
+      storage_.context_->adapter_ptr()->min_ubo_alignment();
   const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo;
   if (!uniforms_.buffer()) {
     uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true);
@@ -730,7 +733,8 @@ const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
 }
 
 const vkapi::BufferBindInfo vTensor::numel_ubo() {
-  const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment();
+  const size_t size_per_ubo =
+      storage_.context_->adapter_ptr()->min_ubo_alignment();
   const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo;
   if (!uniforms_.buffer()) {
     uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true);
 
@@ -275,23 +275,13 @@ ComputePipeline::ComputePipeline(
     const ComputePipeline::Descriptor& descriptor,
     VkPipelineCache pipeline_cache)
     : device_(device), handle_{VK_NULL_HANDLE} {
-  SpecVarList specialization_constants;
-
-  specialization_constants.reserve(
-      3 + descriptor.specialization_constants.size());
-  specialization_constants.append(descriptor.local_wg_size[0]);
-  specialization_constants.append(descriptor.local_wg_size[1]);
-  specialization_constants.append(descriptor.local_wg_size[2]);
-
-  specialization_constants.append(descriptor.specialization_constants);
-  const std::vector<VkSpecializationMapEntry> map_entries =
-      specialization_constants.generate_map_entries();
+  map_entries_ = descriptor.specialization_constants.generate_map_entries();
 
   const VkSpecializationInfo specialization_info{
-      specialization_constants.size(), // mapEntryCount
-      map_entries.data(), // pMapEntries
-      specialization_constants.data_nbytes(), // dataSize
-      specialization_constants.data(), // pData
+      descriptor.specialization_constants.size(), // mapEntryCount
+      map_entries_.data(), // pMapEntries
+      descriptor.specialization_constants.data_nbytes(), // dataSize
+      descriptor.specialization_constants.data(), // pData
   };
 
   const VkPipelineShaderStageCreateInfo shader_stage_create_info{
@@ -330,7 +320,9 @@ ComputePipeline::ComputePipeline(
 }
 
 ComputePipeline::ComputePipeline(ComputePipeline&& other) noexcept
-    : device_(other.device_), handle_(other.handle_) {
+    : device_(other.device_),
+      handle_(other.handle_),
+      map_entries_(std::move(other.map_entries_)) {
   other.handle_ = VK_NULL_HANDLE;
 }
 
 
@@ -156,7 +156,6 @@ class ComputePipeline final {
     VkPipelineLayout pipeline_layout;
     VkShaderModule shader_module;
     SpecVarList specialization_constants;
-    utils::WorkgroupSize local_wg_size;
   };
 
   explicit ComputePipeline(
@@ -175,6 +174,7 @@ class ComputePipeline final {
  private:
   VkDevice device_;
   VkPipeline handle_;
+  std::vector<VkSpecializationMapEntry> map_entries_;
 
  public:
   inline VkPipeline handle() const {
@@ -274,9 +274,6 @@ class ComputePipelineCache final {
         seed = utils::hash_combine(seed, new_seed);
       }
 
-      seed = utils::hash_combine(
-          seed, std::hash<uint32_t>()((uint32_t)descriptor.local_wg_size));
-
       return seed;
     }
   };
 
@@ -146,6 +146,10 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
   list(APPEND xnn_executor_runner_libs $<LINK_LIBRARY:WHOLE_ARCHIVE,custom_ops>)
 endif()
 
+if(EXECUTORCH_BUILD_KERNELS_QUANTIZED)
+  list(APPEND xnn_executor_runner_libs quantized_ops_lib)
+endif()
+
 list(APPEND xnn_executor_runner_libs xnnpack_backend executorch)
 
 # ios can only build library but not binary
 
@@ -15,7 +15,6 @@
     op_ceiling,
     op_clamp,
     op_conv2d,
-    op_dequantize_per_tensor,
     op_div,
     op_dynamic_dequantize_ops,
     op_dynamic_quantize_ops,
@@ -35,7 +34,7 @@
     op_negate,
     op_permute,
     op_prelu,
-    op_quantize_per_tensor,
+    op_quant_dequant,
     op_relu,
     op_rsqrt,
     op_sdpa,
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-08434df1f2f88c9770e59246caa2ff9c6f613270`
	`1`	`+295f2ed4d103017f7e19a7b8263ece606cd629db`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-cmake=3.22.1`
	`1`	`+cmake=3.26.4`
`2`	`2`	`ninja=1.10.2`
`3`	`3`	`libuv`
`4`	`4`	`llvm-openmp`