Skip to content

Commit 8d69c9c

Browse files
authored
Merge branch 'main' into pr_model_improve
2 parents 1d1f3cf + 05277dd commit 8d69c9c

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+951
-439
lines changed

.ci/docker/ci_commit_pins/pytorch.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
08434df1f2f88c9770e59246caa2ff9c6f613270
1+
295f2ed4d103017f7e19a7b8263ece606cd629db

.ci/docker/conda-env-ci.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
cmake=3.22.1
1+
cmake=3.26.4
22
ninja=1.10.2
33
libuv
44
llvm-openmp
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#!/bin/bash
2+
# Copyright (c) Qualcomm Innovation Center, Inc.
3+
# All rights reserved
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
set -exu
9+
10+
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
11+
12+
export EXECUTORCH_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.."
13+
14+
if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
15+
PYTHON_EXECUTABLE=python3
16+
fi
17+
18+
which "${PYTHON_EXECUTABLE}"
19+
20+
# Update tokenizers submodule
21+
pushd $EXECUTORCH_ROOT/extension/llm/tokenizers
22+
echo "Update tokenizers submodule"
23+
git submodule update --init
24+
popd
25+
26+
# Install ET with CMake
27+
cmake -DPYTHON_EXECUTABLE=python \
28+
-DCMAKE_INSTALL_PREFIX=cmake-out \
29+
-DEXECUTORCH_ENABLE_LOGGING=1 \
30+
-DCMAKE_BUILD_TYPE=Release \
31+
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
32+
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
33+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
34+
-DEXECUTORCH_BUILD_XNNPACK=OFF \
35+
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
36+
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
37+
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
38+
-Bcmake-out .
39+
cmake --build cmake-out -j16 --target install --config Release
40+
41+
# Install llama runner with torchao
42+
cmake -DPYTHON_EXECUTABLE=python \
43+
-DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \
44+
-DCMAKE_BUILD_TYPE=Release \
45+
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
46+
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
47+
-DEXECUTORCH_BUILD_XNNPACK=OFF \
48+
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
49+
-DEXECUTORCH_BUILD_TORCHAO=ON \
50+
-Bcmake-out/examples/models/llama \
51+
examples/models/llama
52+
cmake --build cmake-out/examples/models/llama -j16 --config Release
53+
54+
# Download stories llama110m artifacts
55+
download_stories_model_artifacts
56+
57+
echo "Creating tokenizer.bin"
58+
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
59+
60+
# Export model
61+
LLAMA_CHECKPOINT=stories110M.pt
62+
LLAMA_PARAMS=params.json
63+
MODEL_OUT=model.pte
64+
TOKENIZER=tokenizer.bin
65+
66+
# Set low-bit quantization parameters
67+
QLINEAR_BITWIDTH=3 # Can be 1-8
68+
QLINEAR_GROUP_SIZE=128 # Must be multiple of 16
69+
QEMBEDDING_BITWIDTH=4 # Can be 1-8
70+
QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16
71+
72+
${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \
73+
--checkpoint "${LLAMA_CHECKPOINT:?}" \
74+
--params "${LLAMA_PARAMS:?}" \
75+
-kv \
76+
--use_sdpa_with_kv_cache \
77+
--output_name=${MODEL_OUT} \
78+
-qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
79+
--group_size ${QLINEAR_GROUP_SIZE} \
80+
-E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
81+
--disable_dynamic_shape \
82+
-d fp32
83+
84+
# Test run
85+
./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time,"

.github/workflows/android-perf.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ name: android-perf
22

33
on:
44
schedule:
5-
- cron: 0 0 * * *
5+
- cron: 0 0,8,16 * * *
66
pull_request:
77
paths:
88
- .github/workflows/android-perf.yml

.github/workflows/trunk.yml

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ jobs:
2323
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
2424
strategy:
2525
matrix:
26-
# Mac runners are expensive and limited, and non reliable.
27-
# Do some basic testing for macos jobs, and rely mostly on
26+
# Mac runners are expensive, limited, and unreliable.
27+
# Do some basic testing for macos jobs, and rely mostly on
2828
# test-models-linux-aarch64 job instead.
2929
model: [emformer_join, ic4, llama2, mobilebert, mv3, resnet50, vit, w2l]
3030
backend: [xnnpack-quantization-delegation]
@@ -288,6 +288,26 @@ jobs:
288288
# Test ANE llama
289289
${CONDA_RUN} sh .ci/scripts/test_ane_static_llama.sh
290290
291+
test-llama-torchao-lowbit:
292+
name: test-llama-torchao-lowbit
293+
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
294+
with:
295+
runner: macos-m1-stable
296+
python-version: '3.11'
297+
submodules: 'true'
298+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
299+
script: |
300+
set -eux
301+
bash .ci/scripts/setup-conda.sh
302+
eval "$(conda shell.bash hook)"
303+
304+
# Install requirements
305+
${CONDA_RUN} python install_executorch.py
306+
${CONDA_RUN} sh examples/models/llama/install_requirements.sh
307+
308+
# Run test
309+
${CONDA_RUN} sh .ci/scripts/test_llama_torchao_lowbit.sh
310+
291311
test-llama-runner-linux:
292312
# Test Both linux x86 and linux aarch64
293313
name: test-llama-runner-linux

backends/arm/test/test_arm_baremetal.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,13 +130,13 @@ test_models_ethosu_fvp() { # End to End model tests using model_test.py
130130
echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U55"
131131
python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u55-128 --model=mv2 --extra_flags="-DET_ATOL=2.00 -DET_RTOL=2.00"
132132
python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u55-64 --model=mv3 --extra_flags="-DET_ATOL=5.00 -DET_RTOL=5.00"
133-
python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u55-256 --model=lstm --extra_flags="-DET_ATOL=0.02 -DET_RTOL=0.02"
133+
python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u55-256 --model=lstm --extra_flags="-DET_ATOL=0.03 -DET_RTOL=0.03"
134134

135135
# Ethos-U85
136136
echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U85"
137137
python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-256 --model=mv2 --extra_flags="-DET_ATOL=2.00 -DET_RTOL=2.00"
138138
python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-1024 --model=mv3 --extra_flags="-DET_ATOL=5.00 -DET_RTOL=5.00"
139-
python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-128 --model=lstm --extra_flags="-DET_ATOL=0.02 -DET_RTOL=0.02"
139+
python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-128 --model=lstm --extra_flags="-DET_ATOL=0.03 -DET_RTOL=0.03"
140140
echo "${TEST_SUITE_NAME}: PASS"
141141
}
142142

backends/vulkan/runtime/api/Context.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,11 +124,17 @@ vkapi::DescriptorSet Context::get_descriptor_set(
124124
VkPipelineLayout pipeline_layout =
125125
pipeline_layout_cache().retrieve(shader_layout, push_constants_size);
126126

127+
vkapi::SpecVarList spec_constants = {
128+
SV(local_workgroup_size[0u]),
129+
SV(local_workgroup_size[1u]),
130+
SV(local_workgroup_size[2u])};
131+
132+
spec_constants.append(additional_constants);
133+
127134
VkPipeline pipeline = pipeline_cache().retrieve(
128135
{pipeline_layout_cache().retrieve(shader_layout, push_constants_size),
129136
shader_cache().retrieve(shader_descriptor),
130-
additional_constants,
131-
local_workgroup_size});
137+
spec_constants});
132138

133139
cmd_.bind_pipeline(pipeline, pipeline_layout, local_workgroup_size);
134140

backends/vulkan/runtime/api/containers/Tensor.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -674,7 +674,8 @@ utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
674674
}
675675

676676
const vkapi::BufferBindInfo vTensor::sizes_ubo() {
677-
const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment();
677+
const size_t size_per_ubo =
678+
storage_.context_->adapter_ptr()->min_ubo_alignment();
678679
const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo;
679680
if (!uniforms_.buffer()) {
680681
uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true);
@@ -692,7 +693,8 @@ const vkapi::BufferBindInfo vTensor::sizes_ubo() {
692693
}
693694

694695
const vkapi::BufferBindInfo vTensor::strides_ubo() {
695-
const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment();
696+
const size_t size_per_ubo =
697+
storage_.context_->adapter_ptr()->min_ubo_alignment();
696698
const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo;
697699
if (!uniforms_.buffer()) {
698700
uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true);
@@ -712,7 +714,8 @@ const vkapi::BufferBindInfo vTensor::strides_ubo() {
712714
}
713715

714716
const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
715-
const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment();
717+
const size_t size_per_ubo =
718+
storage_.context_->adapter_ptr()->min_ubo_alignment();
716719
const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo;
717720
if (!uniforms_.buffer()) {
718721
uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true);
@@ -730,7 +733,8 @@ const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
730733
}
731734

732735
const vkapi::BufferBindInfo vTensor::numel_ubo() {
733-
const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment();
736+
const size_t size_per_ubo =
737+
storage_.context_->adapter_ptr()->min_ubo_alignment();
734738
const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo;
735739
if (!uniforms_.buffer()) {
736740
uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true);

backends/vulkan/runtime/vk_api/Pipeline.cpp

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -275,23 +275,13 @@ ComputePipeline::ComputePipeline(
275275
const ComputePipeline::Descriptor& descriptor,
276276
VkPipelineCache pipeline_cache)
277277
: device_(device), handle_{VK_NULL_HANDLE} {
278-
SpecVarList specialization_constants;
279-
280-
specialization_constants.reserve(
281-
3 + descriptor.specialization_constants.size());
282-
specialization_constants.append(descriptor.local_wg_size[0]);
283-
specialization_constants.append(descriptor.local_wg_size[1]);
284-
specialization_constants.append(descriptor.local_wg_size[2]);
285-
286-
specialization_constants.append(descriptor.specialization_constants);
287-
const std::vector<VkSpecializationMapEntry> map_entries =
288-
specialization_constants.generate_map_entries();
278+
map_entries_ = descriptor.specialization_constants.generate_map_entries();
289279

290280
const VkSpecializationInfo specialization_info{
291-
specialization_constants.size(), // mapEntryCount
292-
map_entries.data(), // pMapEntries
293-
specialization_constants.data_nbytes(), // dataSize
294-
specialization_constants.data(), // pData
281+
descriptor.specialization_constants.size(), // mapEntryCount
282+
map_entries_.data(), // pMapEntries
283+
descriptor.specialization_constants.data_nbytes(), // dataSize
284+
descriptor.specialization_constants.data(), // pData
295285
};
296286

297287
const VkPipelineShaderStageCreateInfo shader_stage_create_info{
@@ -330,7 +320,9 @@ ComputePipeline::ComputePipeline(
330320
}
331321

332322
ComputePipeline::ComputePipeline(ComputePipeline&& other) noexcept
333-
: device_(other.device_), handle_(other.handle_) {
323+
: device_(other.device_),
324+
handle_(other.handle_),
325+
map_entries_(std::move(other.map_entries_)) {
334326
other.handle_ = VK_NULL_HANDLE;
335327
}
336328

backends/vulkan/runtime/vk_api/Pipeline.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,6 @@ class ComputePipeline final {
156156
VkPipelineLayout pipeline_layout;
157157
VkShaderModule shader_module;
158158
SpecVarList specialization_constants;
159-
utils::WorkgroupSize local_wg_size;
160159
};
161160

162161
explicit ComputePipeline(
@@ -175,6 +174,7 @@ class ComputePipeline final {
175174
private:
176175
VkDevice device_;
177176
VkPipeline handle_;
177+
std::vector<VkSpecializationMapEntry> map_entries_;
178178

179179
public:
180180
inline VkPipeline handle() const {
@@ -274,9 +274,6 @@ class ComputePipelineCache final {
274274
seed = utils::hash_combine(seed, new_seed);
275275
}
276276

277-
seed = utils::hash_combine(
278-
seed, std::hash<uint32_t>()((uint32_t)descriptor.local_wg_size));
279-
280277
return seed;
281278
}
282279
};

backends/xnnpack/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,10 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
146146
list(APPEND xnn_executor_runner_libs $<LINK_LIBRARY:WHOLE_ARCHIVE,custom_ops>)
147147
endif()
148148

149+
if(EXECUTORCH_BUILD_KERNELS_QUANTIZED)
150+
list(APPEND xnn_executor_runner_libs quantized_ops_lib)
151+
endif()
152+
149153
list(APPEND xnn_executor_runner_libs xnnpack_backend executorch)
150154

151155
# iOS can only build the library, not the binary

backends/xnnpack/operators/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
op_ceiling,
1616
op_clamp,
1717
op_conv2d,
18-
op_dequantize_per_tensor,
1918
op_div,
2019
op_dynamic_dequantize_ops,
2120
op_dynamic_quantize_ops,
@@ -35,7 +34,7 @@
3534
op_negate,
3635
op_permute,
3736
op_prelu,
38-
op_quantize_per_tensor,
37+
op_quant_dequant,
3938
op_relu,
4039
op_rsqrt,
4140
op_sdpa,

backends/xnnpack/operators/op_dequantize_per_tensor.py

Lines changed: 0 additions & 70 deletions
This file was deleted.

0 commit comments

Comments
 (0)