Skip to content

Commit 53858c2

Browse files
committed
Update on "[ET-VK] Allow clients to specify cache_data_path"
Differential Revision: [D75918105](https://our.internmc.facebook.com/intern/diff/D75918105/) [ghstack-poisoned]
2 parents 157e7da + b0f4199 commit 53858c2

File tree

90 files changed

+1349
-686
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

90 files changed

+1349
-686
lines changed

.ci/scripts/test_llama_torchao_lowbit.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ cmake --build cmake-out -j16 --target install --config Release
4040

4141
# Install llama runner with torchao
4242
cmake -DPYTHON_EXECUTABLE=python \
43-
-DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \
4443
-DCMAKE_BUILD_TYPE=Release \
4544
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
4645
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \

.ci/scripts/test_model.sh

Lines changed: 19 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,24 @@ prepare_artifacts_upload() {
4949
}
5050

5151
build_cmake_executor_runner() {
52+
local backend_string_select="${1:-}"
5253
echo "Building executor_runner"
5354
rm -rf ${CMAKE_OUTPUT_DIR}
54-
cmake -DCMAKE_BUILD_TYPE=Debug \
55-
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
56-
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
57-
-B${CMAKE_OUTPUT_DIR} .
58-
59-
cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
55+
mkdir ${CMAKE_OUTPUT_DIR}
56+
if [[ "$backend_string_select" == "XNNPACK" ]]; then
57+
echo "Backend $backend_string_select selected"
58+
(cd ${CMAKE_OUTPUT_DIR} \
59+
&& cmake -DCMAKE_BUILD_TYPE=Release \
60+
-DEXECUTORCH_BUILD_XNNPACK=ON \
61+
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
62+
cmake --build ${CMAKE_OUTPUT_DIR} -j4
63+
else
64+
cmake -DCMAKE_BUILD_TYPE=Debug \
65+
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
66+
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
67+
-B${CMAKE_OUTPUT_DIR} .
68+
cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
69+
fi
6070
}
6171

6272
run_portable_executor_runner() {
@@ -111,19 +121,6 @@ test_model() {
111121
run_portable_executor_runner
112122
}
113123

114-
build_cmake_xnn_executor_runner() {
115-
echo "Building xnn_executor_runner"
116-
117-
(rm -rf ${CMAKE_OUTPUT_DIR} \
118-
&& mkdir ${CMAKE_OUTPUT_DIR} \
119-
&& cd ${CMAKE_OUTPUT_DIR} \
120-
&& retry cmake -DCMAKE_BUILD_TYPE=Release \
121-
-DEXECUTORCH_BUILD_XNNPACK=ON \
122-
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
123-
124-
cmake --build ${CMAKE_OUTPUT_DIR} -j4
125-
}
126-
127124
test_model_with_xnnpack() {
128125
WITH_QUANTIZATION=$1
129126
WITH_DELEGATION=$2
@@ -148,12 +145,11 @@ test_model_with_xnnpack() {
148145

149146
# Run test model
150147
if [[ "${BUILD_TOOL}" == "buck2" ]]; then
148+
# TODO eventually buck should also use consolidated executor runners
151149
buck2 run //examples/xnnpack:xnn_executor_runner -- --model_path "${OUTPUT_MODEL_PATH}"
152150
elif [[ "${BUILD_TOOL}" == "cmake" ]]; then
153-
if [[ ! -f ${CMAKE_OUTPUT_DIR}/backends/xnnpack/xnn_executor_runner ]]; then
154-
build_cmake_xnn_executor_runner
155-
fi
156-
./${CMAKE_OUTPUT_DIR}/backends/xnnpack/xnn_executor_runner --model_path "${OUTPUT_MODEL_PATH}"
151+
build_cmake_executor_runner "XNNPACK"
152+
./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "${OUTPUT_MODEL_PATH}"
157153
else
158154
echo "Invalid build tool ${BUILD_TOOL}. Only buck2 and cmake are supported atm"
159155
exit 1

.ci/scripts/utils.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,7 @@ build_executorch_runner() {
158158
cmake_install_executorch_lib() {
159159
echo "Installing libexecutorch.a and libportable_kernels.a"
160160
clean_executorch_install_folders
161-
retry cmake -DBUCK2="$BUCK" \
162-
-DCMAKE_INSTALL_PREFIX=cmake-out \
161+
retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
163162
-DCMAKE_BUILD_TYPE=Release \
164163
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
165164
-Bcmake-out .

backends/vulkan/_passes/fuse_quantized_ops.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from executorch.exir import ExportedProgram
1818
from executorch.exir.dialects._ops import ops as exir_ops
1919
from executorch.exir.pass_base import ExportPass, PassResult
20+
from executorch.exir.passes import dead_code_elimination_pass
2021

2122
#################
2223
## linear_qcnw ##
@@ -224,6 +225,8 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
224225
)
225226

226227
graph_module.recompile()
227-
graph_module = super().call(graph_module).graph_module
228+
dead_code_elimination_pass(graph_module)
228229

230+
# Re-trace the graph since new nodes were (potentially) inserted
231+
graph_module = super().call(graph_module).graph_module
229232
return PassResult(graph_module, True)

backends/vulkan/_passes/int4_weight_only_quantizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import torch
88
import torch.nn.functional as F
99

10-
from torchao.quantization.GPTQ import _check_linear_int4_k
10+
from torchao.quantization.GPTQ.GPTQ import _check_linear_int4_k
1111
from torchao.quantization.unified import Quantizer
1212
from torchao.quantization.utils import groupwise_affine_quantize_tensor
1313

backends/vulkan/_passes/tag_memory_meta_pass.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
# LICENSE file in the root directory of this source tree.
66

77
import logging
8-
from copy import deepcopy
98
from typing import Any, Optional, Set
109

1110
import executorch.backends.vulkan.utils as utils
@@ -22,6 +21,7 @@
2221
from executorch.exir.dialects._ops import ops as exir_ops
2322

2423
from executorch.exir.pass_base import ExportPass, PassResult
24+
from executorch.exir.tensor import TensorSpec
2525

2626
logger: logging.Logger = logging.getLogger("")
2727
logger.setLevel(logging.INFO)
@@ -52,7 +52,7 @@ def insert_transition_node(
5252
(arg,),
5353
)
5454
clone_node.meta["val"] = arg.meta["val"]
55-
clone_node.meta["spec"] = deepcopy(arg.meta["spec"])
55+
clone_node.meta["spec"] = TensorSpec.from_tensor(clone_node.meta["val"])
5656
clone_node.meta["spec"].const = False
5757
set_memory_metadata(clone_node, storage, layout)
5858
arg.replace_all_uses_with(clone_node, lambda x, y=node: x == y)

backends/vulkan/op_registry.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,14 @@ def update_features_impl(op: OpKey):
230230
exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
231231
# Symbolic integer ops
232232
torch.ops.aten.sym_size.int,
233+
operator.add,
234+
operator.lt,
235+
operator.gt,
236+
operator.ge,
237+
operator.le,
238+
# Guard and assert ops
239+
torch.ops.aten._assert_scalar.default,
240+
torch.ops.aten.sym_constrain_range_for_size.default,
233241
]
234242
)
235243
def register_ephemeral_op(features: OpFeatures):
@@ -500,7 +508,12 @@ def register_sdpa_with_kv_cache_op(features: OpFeatures):
500508
return features
501509

502510

503-
@update_features(["llama::update_cache", "llama::custom_sdpa"])
511+
@update_features(
512+
[
513+
"llama::update_cache",
514+
"llama::custom_sdpa",
515+
]
516+
)
504517
def register_sdpa_ops(features: OpFeatures):
505518
features.resize_fn = False
506519
features.buffer_impl = False
@@ -520,8 +533,17 @@ def register_rotary_emb_op(features: OpFeatures):
520533
return features
521534

522535

523-
@update_features(exir_ops.edge.aten.view_copy.default)
524-
def register_view_op(features: OpFeatures):
536+
@update_features(
537+
[
538+
exir_ops.edge.aten.clone.default,
539+
exir_ops.edge.aten.permute.default,
540+
exir_ops.edge.aten.permute_copy.default,
541+
exir_ops.edge.aten.select_copy.int,
542+
exir_ops.edge.aten.slice_copy.Tensor,
543+
exir_ops.edge.aten.view_copy.default,
544+
]
545+
)
546+
def register_view_ops(features: OpFeatures):
525547
features.texture_impl = TextureImplFeatures(
526548
valid_packed_dims=all_packed_dims,
527549
)
@@ -538,10 +560,8 @@ def register_view_op(features: OpFeatures):
538560
# Indexing and lookup
539561
exir_ops.edge.aten.flip.default,
540562
exir_ops.edge.aten.index_select.default,
541-
exir_ops.edge.aten.select_copy.int,
542563
# Tensor creation
543564
exir_ops.edge.aten.arange.start_step,
544-
exir_ops.edge.aten.clone.default,
545565
exir_ops.edge.aten.constant_pad_nd.default,
546566
exir_ops.edge.aten.full.default,
547567
exir_ops.edge.aten.full_like.default,
@@ -564,12 +584,9 @@ def register_ported_op(features: OpFeatures):
564584
# Ops ported from PyTorch Vulkan backend. These ops are in a separate registry becasue they support all packed dimensions
565585
@update_features(
566586
[
567-
# Indexing and lookup
568-
exir_ops.edge.aten.slice_copy.Tensor,
569587
# Shape Manipulation
570588
exir_ops.edge.aten.squeeze_copy.dims,
571589
exir_ops.edge.aten.unsqueeze_copy.default,
572-
exir_ops.edge.aten.permute_copy.default,
573590
# Tensor combination
574591
exir_ops.edge.aten.cat.default,
575592
exir_ops.edge.aten.repeat.default,

backends/vulkan/partitioner/vulkan_partitioner.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -146,10 +146,11 @@ def op_node_is_compatible( # noqa: C901: Function is too complex
146146
def node_is_compatible(
147147
self, node: torch.fx.Node, features: Optional[OpFeatures] = None
148148
) -> Tuple[bool, str]:
149-
if utils.is_symint_node(node):
150-
return node.target in vulkan_supported_ops, "Op is compatible"
151-
elif utils.is_tensor_node(node):
149+
if utils.is_tensor_node(node):
152150
return self.op_node_is_compatible(node, features=features)
151+
# For non-tensor nodes, just check if the op is registered
152+
elif hasattr(node, "target"):
153+
return node.target in vulkan_supported_ops, "Op is compatible"
153154

154155
return False, f"Unsupported node type: {node.format_node()}"
155156

backends/vulkan/runtime/VulkanBackend.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,7 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
495495
builder.build_graph();
496496

497497
compute_graph->prepare();
498+
compute_graph->prepare_pipelines();
498499

499500
compute_graph->encode_prepack();
500501
compute_graph->prepack();

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,15 @@ ValueRef ComputeGraph::add_symint(const int32_t val) {
449449
return idx;
450450
}
451451

452+
ValueRef ComputeGraph::get_or_add_value_for_int(const int64_t val) {
453+
for (int i = 0; i < values_.size(); ++i) {
454+
if (values_.at(i).isInt() && values_.at(i).toInt() == val) {
455+
return i;
456+
}
457+
}
458+
return add_scalar(val);
459+
}
460+
452461
ValueRef ComputeGraph::set_input_tensor(
453462
const ValueRef idx,
454463
const bool use_staging) {
@@ -552,7 +561,7 @@ void ComputeGraph::update_descriptor_counts(
552561
}
553562
}
554563

555-
void ComputeGraph::update_pipeline_descriptors(
564+
void ComputeGraph::register_pipeline_to_create(
556565
const vkapi::ShaderInfo& shader_info,
557566
const utils::WorkgroupSize& local_workgroup_size,
558567
const vkapi::SpecVarList& spec_vars,
@@ -578,10 +587,14 @@ void ComputeGraph::update_pipeline_descriptors(
578587
context()->shader_cache().retrieve(shader_info),
579588
spec_constants};
580589

590+
if (context_->pipeline_cache().contains(desc)) {
591+
return;
592+
}
581593
auto it = pipeline_descriptors_.find(desc);
582-
if (it == pipeline_descriptors_.cend()) {
583-
pipeline_descriptors_.insert(desc);
594+
if (it != pipeline_descriptors_.cend()) {
595+
return;
584596
}
597+
pipeline_descriptors_.insert(desc);
585598
}
586599

587600
utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) {
@@ -691,14 +704,20 @@ void ComputeGraph::prepare() {
691704
shared_object.allocate(this);
692705
shared_object.bind_users(this);
693706
}
707+
}
694708

709+
void ComputeGraph::prepare_pipelines() {
695710
for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
696711
node->prepare_pipelines(this);
697712
}
698713
for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
699714
node->prepare_pipelines(this);
700715
}
701716
context_->pipeline_cache().create_pipelines(pipeline_descriptors_);
717+
718+
pipeline_descriptors_ = std::unordered_set<
719+
vkapi::ComputePipelineCache::Key,
720+
vkapi::ComputePipelineCache::Hasher>();
702721
}
703722

704723
void ComputeGraph::encode_prepack() {

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,13 @@ class ComputeGraph final {
609609

610610
ValueRef add_symint(const int32_t val);
611611

612+
/*
613+
* Searches the graph's value list for a Int value with the specified value.
614+
* If one is found, returns the index of the value. Otherwise, add a new value
615+
* and return the index of the new value.
616+
*/
617+
ValueRef get_or_add_value_for_int(const int64_t val);
618+
612619
ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true);
613620
ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true);
614621

@@ -709,14 +716,16 @@ class ComputeGraph final {
709716
const vkapi::ShaderInfo& shader_info,
710717
bool execute);
711718

712-
void update_pipeline_descriptors(
719+
void register_pipeline_to_create(
713720
const vkapi::ShaderInfo& shader_info,
714721
const utils::WorkgroupSize& local_workgroup_size,
715722
const vkapi::SpecVarList& spec_vars,
716723
const std::vector<PushConstantDataInfo>& push_constants);
717724

718725
void prepare();
719726

727+
void prepare_pipelines();
728+
720729
//
721730
// Dispatch Utilities
722731
//

backends/vulkan/runtime/graph/ops/DispatchNode.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ DispatchNode::DispatchNode(
3636
}
3737

3838
void DispatchNode::prepare_pipelines(ComputeGraph* graph) {
39-
graph->update_pipeline_descriptors(
39+
graph->register_pipeline_to_create(
4040
shader_, local_workgroup_size_, spec_vars_, push_constants_);
4141
}
4242

backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@ DynamicDispatchNode::DynamicDispatchNode(
2525
const ResizeFunction& resize_fn)
2626
: DispatchNode(
2727
graph,
28-
vkapi::ShaderInfo(),
29-
{1u, 1u, 1u},
28+
pick_shader_fn(&graph, args, resize_args),
3029
{1u, 1u, 1u},
30+
{8u, 8u, 1u},
3131
args,
3232
params,
3333
push_constants,
@@ -37,7 +37,6 @@ DynamicDispatchNode::DynamicDispatchNode(
3737
pick_shader_fn_(pick_shader_fn),
3838
pick_global_wg_fn_(pick_global_wg_fn),
3939
pick_local_wg_fn_(pick_local_wg_fn) {
40-
shader_ = pick_shader_fn(&graph, args, resize_args);
4140
global_workgroup_size_ =
4241
pick_global_wg_fn(&graph, shader_, args, resize_args);
4342
local_workgroup_size_ = utils::WorkgroupSize(pick_local_wg_fn(

backends/vulkan/runtime/graph/ops/PrepackNode.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,10 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
6868
}
6969

7070
void PrepackNode::prepare_pipelines(ComputeGraph* graph) {
71-
graph->update_pipeline_descriptors(
71+
graph->register_pipeline_to_create(
7272
shader_, local_workgroup_size_, spec_vars_, push_constants_);
73-
graph->update_pipeline_descriptors(
74-
noop_shader_, utils::WorkgroupSize(1, 1, 1), {}, push_constants_);
73+
graph->register_pipeline_to_create(
74+
noop_shader_, utils::WorkgroupSize(1, 1, 1), {}, {});
7575
}
7676

7777
void PrepackNode::encode(ComputeGraph* graph) {

backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,13 @@ layout(std430) buffer;
2222

2323
${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
2424
${layout_declare_buffer(B, "r", "nchw_in", "int")}
25-
${layout_declare_ubo(B, "ivec4", "sizes")}
25+
26+
$if USE_PUSH_CONST:
27+
layout(push_constant) uniform restrict Block {
28+
ivec4 sizes;
29+
};
30+
$else:
31+
${layout_declare_ubo(B, "ivec4", "sizes")}
2632

2733
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2834

backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ nchw_to_bitw8_image_nobitw8buffer:
88
parameter_names_with_default_values:
99
STORAGE: texture3d
1010
DTYPE: int8
11+
USE_PUSH_CONST: True
1112
generate_variant_forall:
1213
STORAGE:
1314
- VALUE: texture2d
@@ -17,3 +18,5 @@ nchw_to_bitw8_image_nobitw8buffer:
1718
- VALUE: uint8
1819
shader_variants:
1920
- NAME: nchw_to_bitw8_image_nobitw8buffer
21+
- NAME: nchw_to_bitw8_image_nobitw8buffer_no_pc
22+
USE_PUSH_CONST: False

0 commit comments

Comments
 (0)