Skip to content

Refactor elementwise_util: create variants with out_dtypes in template argument list #9387

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 45 commits into from
Apr 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
31a49e0
Update
swolchok Mar 19, 2025
9fcd885
Update
swolchok Mar 19, 2025
29d6de9
Update
swolchok Mar 19, 2025
79b908c
Update
swolchok Mar 19, 2025
854c991
Update
swolchok Mar 19, 2025
def7ed4
Update
swolchok Mar 19, 2025
40c1b1b
Update
swolchok Mar 19, 2025
7c78357
Update
swolchok Mar 19, 2025
7ba269a
Update
swolchok Mar 19, 2025
b9c545f
Update
swolchok Mar 20, 2025
3091007
Update
swolchok Mar 20, 2025
4a00cac
Update
swolchok Mar 20, 2025
21b81bf
Update
swolchok Mar 20, 2025
b61a8a2
Update
swolchok Mar 25, 2025
91161bd
Update
swolchok Mar 25, 2025
4add706
Update
swolchok Mar 25, 2025
f659627
Update
swolchok Mar 25, 2025
f1c5429
Update
swolchok Mar 25, 2025
b34f04f
Update
swolchok Mar 25, 2025
9a93839
Update
swolchok Mar 26, 2025
bb16a55
Update
swolchok Mar 26, 2025
7f57a19
Update
swolchok Mar 26, 2025
5d95c06
Update
swolchok Mar 26, 2025
4553283
Update
swolchok Mar 26, 2025
b3120fa
Update
swolchok Mar 26, 2025
7086659
Update
swolchok Mar 28, 2025
e13de0e
Update
swolchok Mar 28, 2025
943ab82
Update
swolchok Mar 28, 2025
f22d039
Update
swolchok Mar 28, 2025
45ce46d
Update
swolchok Mar 28, 2025
34eb5d4
Update
swolchok Mar 28, 2025
ea9dc6f
Update
swolchok Mar 28, 2025
7d7859e
Update
swolchok Mar 28, 2025
b98829d
Update
swolchok Mar 28, 2025
3140910
Update
swolchok Mar 28, 2025
7f2bbdb
Update
swolchok Apr 2, 2025
960315e
Update
swolchok Apr 2, 2025
9e42e93
Update
swolchok Apr 2, 2025
96d258e
Update
swolchok Apr 2, 2025
e6f66ab
Update
swolchok Apr 2, 2025
20f3046
Update
swolchok Apr 2, 2025
3aa266d
Update
swolchok Apr 2, 2025
3c88a56
Update
swolchok Apr 2, 2025
153735d
Update
swolchok Apr 2, 2025
cac4293
Update
swolchok Apr 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -645,13 +645,18 @@ target_link_options_shared_lib(executorch)
# Real integrations should supply their own YAML file that only lists the
# operators necessary for the models that will run.
#
if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
# Locate the PyTorch headers here, at this level, so the result is
# available to all sub-directories. This has to happen before the portable
# kernels are included so that optimized_portable_kernels can use it.
find_package_torch_headers()
endif()

# Pull in kernels/portable only when BUILD_EXECUTORCH_PORTABLE_OPS is set.
if(BUILD_EXECUTORCH_PORTABLE_OPS)
  add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable")
endif()

if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
  # The PyTorch headers are already located by the find_package_torch_headers()
  # call made (under this same option) before kernels/portable is added, so
  # only the optimized kernels sub-directory needs to be added here.
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
endif()

Expand Down
1 change: 1 addition & 0 deletions kernels/optimized/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ message("Generated files ${gen_command_sources}")
list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/")
add_library(optimized_kernels ${_optimized_kernels__srcs})
target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS} "${EXECUTORCH_ROOT}/third-party/pocketfft")
target_compile_definitions(optimized_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
target_link_libraries(
optimized_kernels PUBLIC executorch_core cpublas extension_threadpool
)
Expand Down
6 changes: 3 additions & 3 deletions kernels/portable/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,13 @@ gen_operators_lib(
# Portable kernels support optional parallelization (and, in the
# future, perhaps other performance features). If support is present,
# produce an optimized version.
set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL)

if(BUILD_OPTIMIZED_PORTABLE_KERNELS)
if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
add_library(optimized_portable_kernels ${_portable_kernels__srcs})
target_link_libraries(optimized_portable_kernels PRIVATE executorch)
target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool)
target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options})
target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS})
target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
install(
TARGETS optimized_portable_kernels
DESTINATION lib
Expand Down
113 changes: 100 additions & 13 deletions kernels/portable/cpu/util/elementwise_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,8 @@ inline int64_t scalar_to<int64_t>(const Scalar& s) {
}

namespace internal {
template <
typename CTYPE_COMPUTE,
const char* op_name,
typename Op,
typename... Args>
inline void apply_elementwise_fn(
template <typename CTYPE_COMPUTE, typename Op, typename... Args>
inline bool validate_elementwise_fn_inputs(
const Op& compute_fun,
KernelRuntimeContext& ctx,
const Tensor& out,
Expand All @@ -65,7 +61,6 @@ inline void apply_elementwise_fn(
static_assert(
(std::is_same_v<Args, std::pair<const Tensor*, SupportedTensorDtypes>> &&
...));
constexpr auto kNumInputs = sizeof...(inputs);
constexpr auto compute_type = CppTypeToScalarType<CTYPE_COMPUTE>::value;
const auto check_input_dtype = [](auto input, auto compute_type) {
return internal::check_tensor_dtype(
Expand All @@ -75,7 +70,30 @@ inline void apply_elementwise_fn(
ctx,
(check_input_dtype(inputs, compute_type) && ...) &&
internal::check_tensor_dtype(out, out_dtypes, compute_type),
InvalidArgument, );
InvalidArgument,
false);

return true;
}

template <
typename CTYPE_COMPUTE,
const char* op_name,
typename Op,
typename... Args>
inline void apply_elementwise_fn(
const Op& compute_fun,
KernelRuntimeContext& ctx,
const Tensor& out,
SupportedTensorDtypes out_dtypes,
Args... inputs) {
const bool inputs_valid = validate_elementwise_fn_inputs<CTYPE_COMPUTE>(
compute_fun, ctx, out, out_dtypes, inputs...);
if (!inputs_valid) {
return;
}

constexpr auto kNumInputs = sizeof...(inputs);

struct InputInfo {
load_to_compute_fn<CTYPE_COMPUTE> load_to_compute;
Expand Down Expand Up @@ -120,6 +138,7 @@ inline void apply_elementwise_fn(
});
}

/// DEPRECATED: prefer the variant with out_dtypes in the template argument.
template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
inline void apply_unitensor_elementwise_fn(
const Op& compute_fun,
Expand All @@ -132,19 +151,83 @@ inline void apply_unitensor_elementwise_fn(
compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes));
}

/**
 * Single-input elementwise helper. The supported output dtypes are supplied
 * as the template argument out_dtypes instead of as a function argument.
 * The input tensor is paired with its supported dtypes and everything is
 * forwarded to internal::apply_elementwise_fn.
 */
template <
    typename CTYPE_COMPUTE,
    const char* op_name,
    SupportedTensorDtypes out_dtypes,
    typename Op>
inline void apply_unitensor_elementwise_fn(
    const Op& compute_fun,
    KernelRuntimeContext& ctx,
    const Tensor& a,
    SupportedTensorDtypes a_dtypes,
    const Tensor& out) {
  // Bundle the input with the dtypes it is allowed to have.
  const auto a_input = std::make_pair(&a, a_dtypes);
  internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
      compute_fun, ctx, out, out_dtypes, a_input);
}

/**
 * DEPRECATED: prefer the variant with out_dtypes in the template argument list.
 *
 * Two-input elementwise helper taking out_dtypes as a function argument.
 * Each input tensor is paired with its supported dtypes and the call is
 * forwarded to internal::apply_elementwise_fn.
 */
template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
inline void apply_bitensor_elementwise_fn(
    const Op& compute_fun,
    KernelRuntimeContext& ctx,
    const Tensor& a,
    SupportedTensorDtypes a_dtypes,
    const Tensor& b,
    SupportedTensorDtypes b_dtypes,
    const Tensor& out,
    SupportedTensorDtypes out_dtypes) {
  // Bundle every input with the dtypes it is allowed to have.
  const auto a_input = std::make_pair(&a, a_dtypes);
  const auto b_input = std::make_pair(&b, b_dtypes);
  internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
      compute_fun, ctx, out, out_dtypes, a_input, b_input);
}

/**
* Useful for bi-tensor elementwise operators. For each element of the inputs,
* perform a computation and write to the corresponding element of the output.
* Tensor broadcasting is applied wherever it is required.
*/
template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
template <
typename CTYPE_COMPUTE,
const char* op_name,
SupportedTensorDtypes out_dtypes,
typename Op>
inline void apply_bitensor_elementwise_fn(
const Op& compute_fun,
KernelRuntimeContext& ctx,
const Tensor& a,
SupportedTensorDtypes a_dtypes,
const Tensor& b,
SupportedTensorDtypes b_dtypes,
const Tensor& out) {
internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
compute_fun,
ctx,
out,
out_dtypes,
std::make_pair(&a, a_dtypes),
std::make_pair(&b, b_dtypes));
}

/**
* DEPRECATED: prefer the variant with out_dtypes in the template argument list.
*/
template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
inline void apply_tritensor_elementwise_fn(
const Op& compute_fun,
KernelRuntimeContext& ctx,
const Tensor& a,
SupportedTensorDtypes a_dtypes,
const Tensor& b,
SupportedTensorDtypes b_dtypes,
const Tensor& c,
SupportedTensorDtypes c_dtypes,
const Tensor& out,
SupportedTensorDtypes out_dtypes) {
internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
Expand All @@ -153,7 +236,8 @@ inline void apply_bitensor_elementwise_fn(
out,
out_dtypes,
std::make_pair(&a, a_dtypes),
std::make_pair(&b, b_dtypes));
std::make_pair(&b, b_dtypes),
std::make_pair(&c, c_dtypes));
}

/**
Expand All @@ -176,7 +260,11 @@ inline void apply_bitensor_elementwise_fn(
* static constexpr const char op_name[] = "my_op";
* apply_ternary_elementwise_fn<CTYPE_COMPUTE, op_name>.
*/
template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
template <
typename CTYPE_COMPUTE,
const char* op_name,
SupportedTensorDtypes out_dtypes,
typename Op>
inline void apply_tritensor_elementwise_fn(
const Op& compute_fun,
KernelRuntimeContext& ctx,
Expand All @@ -186,8 +274,7 @@ inline void apply_tritensor_elementwise_fn(
SupportedTensorDtypes b_dtypes,
const Tensor& c,
SupportedTensorDtypes c_dtypes,
const Tensor& out,
SupportedTensorDtypes out_dtypes) {
const Tensor& out) {
internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
compute_fun,
ctx,
Expand Down
3 changes: 2 additions & 1 deletion runtime/core/portable_type/c10/c10/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def define_common_targets():
# -Wmacro-redefined, and we only care about getting
# reasonable vectorization and Sleef support.
"-DCPU_CAPABILITY_AVX2",
"-DET_USE_PYTORCH_HEADERS",
"-DHAVE_AVX2_CPU_DEFINITION",
"-DSTANDALONE_TORCH_HEADER",
] + get_sleef_preprocessor_flags(),
Expand All @@ -86,5 +87,5 @@ def define_common_targets():
# linker failure.
"ovr_config//cpu:arm64": get_sleef_preprocessor_flags(),
"DEFAULT": [],
}) + ["-DSTANDALONE_TORCH_HEADER"],
}) + ["-DSTANDALONE_TORCH_HEADER"] + ([] if runtime.is_oss else ["-DET_USE_PYTORCH_HEADERS"]),
)
13 changes: 13 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,5 +68,18 @@ if(CMAKE_BUILD_TYPE EQUAL "Release")
target_link_options(size_test_all_ops PRIVATE "LINKER:--gc-sections")
endif()

#
# size_test_all_optimized_ops: binary with optimized ops and no delegate backend
#
if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
  add_executable(size_test_all_optimized_ops ${_size_test__srcs})
  target_link_options_shared_lib(optimized_native_cpu_ops_lib)
  target_link_libraries(
    size_test_all_optimized_ops PRIVATE executorch optimized_native_cpu_ops_lib
  )
  # Use STREQUAL here: EQUAL is a numeric comparison and is never true for
  # the string "Release", which would silently skip the option below.
  if("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
    target_link_options(
      size_test_all_optimized_ops PRIVATE "LINKER:--gc-sections"
    )
  endif()
endif()

# Print all summary
executorch_print_configuration_summary()
57 changes: 57 additions & 0 deletions test/build_optimized_size_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Unlike build_size_test.sh, this script:
# - does not attempt to disable exceptions and RTTI
# - as a consequence, is able to build optimized kernels
# - uses MinSizeRel builds
# - is not currently intended to run in CI
# - sets -g to make it easier to use tools like bloaty to investigate size

set -e

# shellcheck source=/dev/null
source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh"

# Configure ExecuTorch into cmake-out as a MinSizeRel build with the optimized
# kernels enabled, then build and install it.
# clean_executorch_install_folders, update_tokenizers_git_submodule and retry
# are not defined in this script (presumably provided by the sourced utils.sh;
# confirm there).
cmake_install_executorch_lib() {
  local -a configure_args=(
    -DBUCK2="$BUCK2"
    -DCMAKE_CXX_STANDARD_REQUIRED=ON
    -DCMAKE_INSTALL_PREFIX=cmake-out
    -DCMAKE_BUILD_TYPE=MinSizeRel
    -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF
    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON
    -DOPTIMIZE_SIZE=ON
    -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE"
    -Bcmake-out
    .
  )

  echo "Installing libexecutorch.a"
  clean_executorch_install_folders
  update_tokenizers_git_submodule

  # -g is passed through CXXFLAGS for the configure step only.
  CXXFLAGS="-g" retry cmake "${configure_args[@]}"
  cmake --build cmake-out -j9 --target install --config MinSizeRel
}

# Configure and build the size-test binaries under cmake-out/test, then list
# each resulting binary with `ls -al` so its unstripped size is visible.
test_cmake_size_test() {
  CXXFLAGS="-g" retry cmake \
    -DCMAKE_BUILD_TYPE=MinSizeRel \
    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
    -DCMAKE_INSTALL_PREFIX=cmake-out \
    -Bcmake-out/test \
    test

  echo "Build size test"
  cmake --build cmake-out/test -j9 --config MinSizeRel

  # Each entry is "<description>:<binary name>".
  local entry
  for entry in \
    "no ops:size_test" \
    "portable ops:size_test_all_ops" \
    "optimized ops:size_test_all_optimized_ops"; do
    echo "ExecuTorch with ${entry%%:*} binary size, unstripped:"
    ls -al "cmake-out/test/${entry#*:}"
  done
}

# Fall back to python3 when PYTHON_EXECUTABLE is unset or empty.
: "${PYTHON_EXECUTABLE:=python3}"

# Install the library first, then build the size-test binaries and list them.
cmake_install_executorch_lib
test_cmake_size_test
6 changes: 5 additions & 1 deletion tools/cmake/executorch-config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ endif()
if(TARGET coremldelegate)
  # Set the libraries that consumers of coremldelegate link against.
  # The value is given exactly once: set_target_properties() expects
  # property/value pairs, so a repeated value leaves an unpaired argument.
  set_target_properties(
    coremldelegate PROPERTIES INTERFACE_LINK_LIBRARIES
                              "coreml_inmemoryfs;coreml_util"
  )
endif()

Expand All @@ -167,4 +167,8 @@ if(TARGET optimized_native_cpu_ops_lib)
endif()
if(TARGET extension_threadpool)
  # Consumers of extension_threadpool are compiled with ET_USE_THREADPOOL.
  target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL)
  # Consumers also link against cpuinfo and pthreadpool.
  set_property(
    TARGET extension_threadpool
    PROPERTY INTERFACE_LINK_LIBRARIES cpuinfo pthreadpool
  )
endif()
Loading