diff --git a/Android.bp b/Android.bp
index 6cc85f1928..1f1e591bd1 100644
--- a/Android.bp
+++ b/Android.bp
@@ -65,6 +65,7 @@ opencl_srcs = [
"src/core/CL/cl_kernels/common/roi_align_layer.cl",
"src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl",
"src/core/CL/cl_kernels/common/roi_pooling_layer.cl",
+ "src/core/CL/cl_kernels/common/scatter.cl",
"src/core/CL/cl_kernels/common/select.cl",
"src/core/CL/cl_kernels/common/slice_ops.cl",
"src/core/CL/cl_kernels/common/softmax_layer.cl",
@@ -488,6 +489,8 @@ cc_library_static {
"src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp",
"src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp",
"src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp",
+ "src/cpu/kernels/dequantize/generic/neon/fp16.cpp",
+ "src/cpu/kernels/dequantize/generic/neon/fp32.cpp",
"src/cpu/kernels/directconv2d/nchw/all.cpp",
"src/cpu/kernels/directconv2d/nchw/fp16.cpp",
"src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp",
@@ -553,9 +556,17 @@ cc_library_static {
"src/cpu/kernels/pool3d/neon/fp32.cpp",
"src/cpu/kernels/pool3d/neon/qasymm8.cpp",
"src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp",
+ "src/cpu/kernels/quantize/generic/neon/fp16.cpp",
+ "src/cpu/kernels/quantize/generic/neon/fp32.cpp",
+ "src/cpu/kernels/quantize/generic/neon/integer.cpp",
"src/cpu/kernels/range/generic/neon/fp16.cpp",
"src/cpu/kernels/range/generic/neon/fp32.cpp",
"src/cpu/kernels/range/generic/neon/integer.cpp",
+ "src/cpu/kernels/reduction_layer/generic/neon/fp16.cpp",
+ "src/cpu/kernels/reduction_layer/generic/neon/fp32.cpp",
+ "src/cpu/kernels/reduction_layer/generic/neon/integer.cpp",
+ "src/cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp",
+ "src/cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp",
"src/cpu/kernels/roialign/generic/neon/fp16.cpp",
"src/cpu/kernels/roialign/generic/neon/fp32.cpp",
"src/cpu/kernels/roialign/generic/neon/qasymm8.cpp",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c67479ce41..2cf259d6ad 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
list(APPEND CMAKE_MESSAGE_CONTEXT ArmCompute)
project(
ArmCompute
- VERSION 36.0.0
+ VERSION 37.0.0
DESCRIPTION
"The Arm Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A CPU and Arm® Mali™ GPU architectures"
LANGUAGES C CXX ASM)
diff --git a/README.md b/README.md
index 112f40225d..8e3b6394fd 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
-# Compute Library ![](https://img.shields.io/badge/latest_release-24.04-green)
+# Compute Library ![](https://img.shields.io/badge/latest_release-24.05-green)
The Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A, Arm® Neoverse® and Arm® Mali™ GPUs architectures.
@@ -37,7 +37,7 @@ Key Features:
## Documentation
-[![Documentation](https://img.shields.io/badge/documentation-24.04-green)](https://arm-software.github.io/ComputeLibrary/latest)
+[![Documentation](https://img.shields.io/badge/documentation-24.05-green)](https://arm-software.github.io/ComputeLibrary/latest)
> Note: The documentation includes the reference API, changelogs, build guide, contribution guide, errata, etc.
@@ -50,24 +50,24 @@ All the binaries can be downloaded from [here](https://github.com/ARM-software/C
| Platform | Operating System | Release archive (Download) |
| -------------- | ---------------- | -------------------------- |
-| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-armv7a-neon.tar.gz) |
-| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon.tar.gz) |
-| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon-cl.tar.gz) |
-| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon-cl.tar.gz) |
+| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-armv7a-neon.tar.gz) |
+| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-arm64-v8a-neon.tar.gz) |
+| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-arm64-v8a-neon-cl.tar.gz) |
+| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-arm64-v8a-neon-cl.tar.gz) |
| Architecture | Operating System | Release archive (Download) |
| ------------ | ---------------- | -------------------------- |
-| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-armv7a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-armv7a-neon-cl.tar.gz) |
-| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8a-neon-cl.tar.gz) |
-| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon-cl.tar.gz) |
-| arm64-v8.2-a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8.2-a-neon-cl.tar.gz) |
-| arm64-v8.2-a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8.2-a-neon-cl.tar.gz) |
+| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-armv7a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-armv7a-neon-cl.tar.gz) |
+| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-android-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-android-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-android-arm64-v8a-neon-cl.tar.gz) |
+| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-arm64-v8a-neon-cl.tar.gz) |
+| arm64-v8.2-a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-android-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-android-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-android-arm64-v8.2-a-neon-cl.tar.gz) |
+| arm64-v8.2-a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.05/arm_compute-v24.05-bin-linux-arm64-v8.2-a-neon-cl.tar.gz) |
-Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.04-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.04)
+Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.05-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.05)
Pre-build binaries are generated with the following security / good coding practices related flags:
> -Wall, -Wextra, -Wformat=2, -Winit-self, -Wstrict-overflow=2, -Wswitch-default, -Woverloaded-virtual, -Wformat-security, -Wctor-dtor-privacy, -Wsign-promo, -Weffc++, -pedantic, -fstack-protector-strong
diff --git a/SConscript b/SConscript
index 3430ecf4eb..488f2f3517 100644
--- a/SConscript
+++ b/SConscript
@@ -32,8 +32,8 @@ import json
import codecs
import platform
-VERSION = "v24.04"
-LIBRARY_VERSION_MAJOR = 36
+VERSION = "v24.05"
+LIBRARY_VERSION_MAJOR = 37
LIBRARY_VERSION_MINOR = 0
LIBRARY_VERSION_PATCH = 0
SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH)
@@ -429,6 +429,7 @@ if env['opencl'] and env['embed_kernels']:
'src/core/CL/cl_kernels/common/fill_border.cl',
'src/core/CL/cl_kernels/common/floor.cl',
'src/core/CL/cl_kernels/common/gather.cl',
+ 'src/core/CL/cl_kernels/common/scatter.cl',
'src/core/CL/cl_kernels/common/gemm.cl',
'src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl',
'src/core/CL/cl_kernels/common/gemm_utils.cl',
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index b080a86938..c97751bc0c 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPP_TYPES_H
-#define ARM_COMPUTE_CPP_TYPES_H
+#ifndef ACL_ARM_COMPUTE_CORE_CPP_CPPTYPES_H
+#define ACL_ARM_COMPUTE_CORE_CPP_CPPTYPES_H
#include "arm_compute/core/Error.h"
@@ -170,6 +170,17 @@ class CPUInfo final
* @return Number of CPUs
*/
unsigned int get_cpu_num() const;
+ /** Return the maximum number of CPUs present excluding the little cores
+ * in case of an Android device
+ *
+ * @return Number of CPUs excluding little
+ */
+ unsigned int get_cpu_num_excluding_little() const;
+ /** Return the vector length in bytes for SME2
+ *
+ * @return Vector length in bytes if SME2 is enabled, otherwise 0.
+ */
+ unsigned long get_sme2_vector_length() const;
private:
struct Impl;
@@ -184,4 +195,4 @@ struct ThreadInfo
const CPUInfo *cpu_info{nullptr};
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPP_TYPES_H */
+#endif // ACL_ARM_COMPUTE_CORE_CPP_CPPTYPES_H
diff --git a/arm_compute/runtime/CL/functions/CLScatter.h b/arm_compute/runtime/CL/functions/CLScatter.h
index 1c90d208bd..973953624e 100644
--- a/arm_compute/runtime/CL/functions/CLScatter.h
+++ b/arm_compute/runtime/CL/functions/CLScatter.h
@@ -54,15 +54,16 @@ class CLScatter : public IFunction
/** Default destructor */
~CLScatter();
/** Initialise the kernel's inputs and outputs
+ *
+ * @note Negative indices are treated as out of bounds.
*
* Valid data layouts:
* - All
*
- *
* @param[in] compile_context The compile context to be used.
* @param[in] src Source tensor. Values used to fill output. Can be nullptr when zero initialization is true.
* @param[in] updates Tensor containing values used to update output tensor. Data types supported: same as @p src
- * @param[in] indices Tensor containing Indices to change in the output Tensor. Data types supported : U32
+ * @param[in] indices Tensor containing Indices to change in the output Tensor. Data types supported : S32
* @param[out] output Destination tensor. Data types supported: same as @p src.
* @param[in] info Scatter info object.
*/
@@ -85,7 +86,7 @@ class CLScatter : public IFunction
*
* @param[in] src Source tensor.
* @param[in] updates Tensor containing values used for updating the output Tensor. Data types supported : same as @p src
- * @param[in] indices Tensor containing Indices to change in the output Tensor. Data types supported : U32
+ * @param[in] indices Tensor containing Indices to change in the output Tensor. Data types supported : S32
* @param[in] output Destination tensor. Data types supported: same as @p src.
* @param[in] info Scatter info containing type of scatter.
*
diff --git a/arm_compute/runtime/OMP/OMPScheduler.h b/arm_compute/runtime/OMP/OMPScheduler.h
index b522b403a9..9b39714fea 100644
--- a/arm_compute/runtime/OMP/OMPScheduler.h
+++ b/arm_compute/runtime/OMP/OMPScheduler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_OMPSCHEDULER_H
-#define ARM_COMPUTE_OMPSCHEDULER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
+#define ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
#include "arm_compute/runtime/IScheduler.h"
@@ -79,6 +79,7 @@ class OMPScheduler final : public IScheduler
private:
unsigned int _num_threads;
+ unsigned int _nonlittle_num_cpus;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_OMPSCHEDULER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
diff --git a/docs/Doxyfile b/docs/Doxyfile
index cca32210e8..0ecbb2d030 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME = "Compute Library"
# could be handy for archiving the generated documentation or if some version
# control system is used.
-PROJECT_NUMBER = 24.04
+PROJECT_NUMBER = 24.05
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index b29b81580d..a5f61d669d 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -41,9 +41,14 @@ If there is more than one release in a month then an extra sequential number is
@section S2_2_changelog Changelog
+v24.05 Public major release
+ - Add @ref CLScatter operator for FP32/16, S32/16/8, U32/16/8 data types.
+ - Various fixes to enable FP16 kernels in armv8a multi_isa builds.
+ - Updated logic in the OpenMP scheduler to exclude LITTLE cores.
+
v24.04 Public major release
- Add Bfloat16 data type support for @ref NEMatMul.
- - Add support for SoftMax in SME2 for FP32 and FP16.
+ - Add support for SoftMax in SME2 for FP32, FP16, QASYMM8 and QASYMM8_SIGNED.
- Add support for in place accumulation to CPU GEMM kernels.
- Add low-precision Int8 * Int8 -> FP32 CPU GEMM which dequantizes after multiplication
- Add is_dynamic flag to QuantizationInfo to signal to operators that it may change after configuration
diff --git a/filelist.json b/filelist.json
index 2c3621cd8b..15449b4f1c 100644
--- a/filelist.json
+++ b/filelist.json
@@ -1415,7 +1415,11 @@
"src/cpu/operators/CpuDequantize.cpp",
"src/cpu/kernels/CpuDequantizeKernel.cpp",
"src/runtime/NEON/functions/NEDequantizationLayer.cpp"
- ]
+ ],
+ "neon":{
+ "fp32":["src/cpu/kernels/dequantize/generic/neon/fp32.cpp"],
+ "fp16":["src/cpu/kernels/dequantize/generic/neon/fp16.cpp"]
+ }
}
},
"DetectionPostProcess": {
@@ -1593,7 +1597,6 @@
"neon": {
"common": [
"src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp",
- "src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp",
"src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp",
"src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp",
"src/core/NEON/kernels/arm_gemm/gemm_int16.cpp",
@@ -1605,7 +1608,6 @@
"src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp",
"src/core/NEON/kernels/arm_gemm/interleave-8way.cpp",
"src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp",
- "src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp",
"src/core/NEON/kernels/arm_gemm/mergeresults.cpp",
"src/core/NEON/kernels/arm_gemm/misc.cpp",
"src/core/NEON/kernels/arm_gemm/quantized.cpp",
@@ -1622,13 +1624,8 @@
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp",
@@ -1682,6 +1679,13 @@
"fp32":["src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp",
"src/cpu/kernels/gemm_matrix_add/generic/neon/fp32.cpp"],
"fp16":["src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp",
+ "src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp",
+ "src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp",
"src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp"],
"estate32": [
"src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp",
@@ -1690,6 +1694,7 @@
],
"estate64": [
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp"
+
],
"fixed_format_kernels": [
"src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp",
@@ -2093,7 +2098,12 @@
"src/cpu/operators/CpuQuantize.cpp",
"src/cpu/kernels/CpuQuantizeKernel.cpp",
"src/runtime/NEON/functions/NEQuantizationLayer.cpp"
- ]
+ ],
+ "neon":{
+ "fp32":["src/cpu/kernels/quantize/generic/neon/fp32.cpp"],
+ "fp16":["src/cpu/kernels/quantize/generic/neon/fp16.cpp"],
+ "integer":["src/cpu/kernels/quantize/generic/neon/integer.cpp"]
+ }
}
},
"Range": {
@@ -2115,7 +2125,14 @@
"common": [
"src/core/NEON/kernels/NEReductionOperationKernel.cpp",
"src/runtime/NEON/functions/NEReductionOperation.cpp"
- ]
+ ],
+ "neon":{
+ "fp32":["src/cpu/kernels/reduction_layer/generic/neon/fp32.cpp"],
+ "fp16":["src/cpu/kernels/reduction_layer/generic/neon/fp16.cpp"],
+ "integer":["src/cpu/kernels/reduction_layer/generic/neon/integer.cpp"],
+ "qasymm8":["src/cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp"],
+ "qasymm8_signed":["src/cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp"]
+ }
}
},
"Reorg": {
@@ -2243,7 +2260,9 @@
"sve2":{
"common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"],
"fp32" :["src/cpu/kernels/softmax/generic/sme2/fp32.cpp"],
- "fp16" :["src/cpu/kernels/softmax/generic/sme2/fp16.cpp"]
+ "fp16" :["src/cpu/kernels/softmax/generic/sme2/fp16.cpp"],
+ "qasymm8" :["src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp"],
+ "qasymm8_signed" :["src/cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp"]
}
}
},
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index e3cac07de1..f270824ab4 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -119,6 +119,8 @@ filegroup(
"cpu/kernels/lut/generic/sve2/u8.cpp",
"cpu/kernels/softmax/generic/sme2/fp16.cpp",
"cpu/kernels/softmax/generic/sme2/fp32.cpp",
+ "cpu/kernels/softmax/generic/sme2/qasymm8.cpp",
+ "cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp",
"cpu/kernels/softmax/generic/sve2/impl.cpp"] +
glob(["**/*.h",
"**/*.hpp",
@@ -751,6 +753,8 @@ filegroup(
"cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp",
"cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp",
"cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp",
+ "cpu/kernels/dequantize/generic/neon/fp16.cpp",
+ "cpu/kernels/dequantize/generic/neon/fp32.cpp",
"cpu/kernels/directconv2d/nchw/all.cpp",
"cpu/kernels/directconv2d/nchw/fp16.cpp",
"cpu/kernels/directconv2d/nhwc/neon/fp16.cpp",
@@ -816,9 +820,17 @@ filegroup(
"cpu/kernels/pool3d/neon/fp32.cpp",
"cpu/kernels/pool3d/neon/qasymm8.cpp",
"cpu/kernels/pool3d/neon/qasymm8_signed.cpp",
+ "cpu/kernels/quantize/generic/neon/fp16.cpp",
+ "cpu/kernels/quantize/generic/neon/fp32.cpp",
+ "cpu/kernels/quantize/generic/neon/integer.cpp",
"cpu/kernels/range/generic/neon/fp16.cpp",
"cpu/kernels/range/generic/neon/fp32.cpp",
"cpu/kernels/range/generic/neon/integer.cpp",
+ "cpu/kernels/reduction_layer/generic/neon/fp16.cpp",
+ "cpu/kernels/reduction_layer/generic/neon/fp32.cpp",
+ "cpu/kernels/reduction_layer/generic/neon/integer.cpp",
+ "cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp",
+ "cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp",
"cpu/kernels/roialign/generic/neon/fp16.cpp",
"cpu/kernels/roialign/generic/neon/fp32.cpp",
"cpu/kernels/roialign/generic/neon/qasymm8.cpp",
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 984db79c18..87c5f8b21d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -340,6 +340,8 @@ target_sources(
cpu/kernels/lut/generic/sve2/u8.cpp
cpu/kernels/softmax/generic/sme2/fp16.cpp
cpu/kernels/softmax/generic/sme2/fp32.cpp
+ cpu/kernels/softmax/generic/sme2/qasymm8.cpp
+ cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp
cpu/kernels/softmax/generic/sve2/impl.cpp
)
@@ -742,6 +744,8 @@ target_sources(
cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp
cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp
+ cpu/kernels/dequantize/generic/neon/fp16.cpp
+ cpu/kernels/dequantize/generic/neon/fp32.cpp
cpu/kernels/directconv2d/nchw/all.cpp
cpu/kernels/directconv2d/nchw/fp16.cpp
cpu/kernels/directconv2d/nhwc/neon/fp16.cpp
@@ -807,9 +811,17 @@ target_sources(
cpu/kernels/pool3d/neon/fp32.cpp
cpu/kernels/pool3d/neon/qasymm8.cpp
cpu/kernels/pool3d/neon/qasymm8_signed.cpp
+ cpu/kernels/quantize/generic/neon/fp16.cpp
+ cpu/kernels/quantize/generic/neon/fp32.cpp
+ cpu/kernels/quantize/generic/neon/integer.cpp
cpu/kernels/range/generic/neon/fp16.cpp
cpu/kernels/range/generic/neon/fp32.cpp
cpu/kernels/range/generic/neon/integer.cpp
+ cpu/kernels/reduction_layer/generic/neon/fp16.cpp
+ cpu/kernels/reduction_layer/generic/neon/fp32.cpp
+ cpu/kernels/reduction_layer/generic/neon/integer.cpp
+ cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp
+ cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp
cpu/kernels/roialign/generic/neon/fp16.cpp
cpu/kernels/roialign/generic/neon/fp32.cpp
cpu/kernels/roialign/generic/neon/qasymm8.cpp
diff --git a/src/common/cpuinfo/CpuInfo.cpp b/src/common/cpuinfo/CpuInfo.cpp
index 93f51e599a..0911c61b54 100644
--- a/src/common/cpuinfo/CpuInfo.cpp
+++ b/src/common/cpuinfo/CpuInfo.cpp
@@ -29,6 +29,7 @@
#include "support/StringSupport.h"
#include "support/ToolchainSupport.h"
+#include