Add some f64 Neon optimisations for aarch64

Some documentation improvements.
ARM-software · Sep 29, 2022 · 3d5fb83 · 3d5fb83
1 parent 633b528
commit 3d5fb83
Show file tree

Hide file tree

Showing 36 changed files with 3,186 additions and 1,381 deletions.
diff --git a/ComputeGraph/cg/static/nodes/cpp/StreamingNodes/README.md b/ComputeGraph/cg/static/nodes/cpp/StreamingNodes/README.md
@@ -1,4 +1,16 @@
-# RingBuffer
+# Streaming Nodes
+
+This is an example implementation to be used with Arm Virtual Hardware (AVH).
+
+It is requiring headers and source files provided by AVH.
+
+Those files are not needed at all to use the Compute Graph.
+
+Those files are kept because they are used in the AVH-SystemModeling example.
+
+But there are simpler way to interface the compute graph to an audio interrupt.
+
+## RingBuffer
 
 It is a way to connect the compute graph with static flow to an audio source or sink.
 

diff --git a/ComputeLibrary/Include/NEMath.h b/ComputeLibrary/Include/NEMath.h
@@ -26,6 +26,28 @@
 
 
 #if defined(ARM_MATH_NEON)
+
+#if defined(__aarch64__)
+
+/** Perform a 7th degree polynomial approximation using Estrin's method.
+ *
+ * @param[in] x      Input vector value in F32 format.
+ * @param[in] coeffs Polynomial coefficients table. (array of flattened float32x4_t vectors)
+ *
+ * @return The calculated approximation.
+ */
+static inline float64x2_t vtaylor_polyq_f64(float64x2_t x, const float64_t *coeffs);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+static inline float64x2_t vinvq_f64(float64x2_t x);
+
+#endif /* #if defined(__aarch64__) */
+
 /** Calculate floor of a vector.
  *
  * @param[in] val Input vector value in F32 format.
@@ -182,10 +204,14 @@ static inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n);
 /** Exponent polynomial coefficients */
 extern const float32_t exp_tab[4*8];
 
+extern const float64_t exp_tab_64[2*8];
+
 
 /** Logarithm polynomial coefficients */
 extern const float32_t log_tab[4*8];
 
+extern const float64_t log_tab_64[2*8];
+
 #ifndef DOXYGEN_SKIP_THIS
 inline float32x4_t vfloorq_f32(float32x4_t val)
 {
@@ -231,6 +257,18 @@ inline float32x4_t vinvq_f32(float32x4_t x)
     return recip;
 }
 
+#if defined(__aarch64__)
+
+inline float64x2_t vinvq_f64(float64x2_t x)
+{
+    float64x2_t recip = vrecpeq_f64(x);
+    recip             = vmulq_f64(vrecpsq_f64(x, recip), recip);
+    recip             = vmulq_f64(vrecpsq_f64(x, recip), recip);
+    return recip;
+}
+
+#endif /* #if defined(__aarch64__) */
+
 inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs)
 {
     float32x4_t A   = vmlaq_f32(vld1q_f32(&coeffs[4*0]), vld1q_f32(&coeffs[4*4]), x);
@@ -243,6 +281,23 @@ inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs)
     return res;
 }
 
+#if defined(__aarch64__)
+
+inline float64x2_t vtaylor_polyq_f64(float64x2_t x, const float64_t *coeffs)
+{
+    float64x2_t A   = vmlaq_f64(vld1q_f64(&coeffs[2*0]), vld1q_f64(&coeffs[2*4]), x);
+    float64x2_t B   = vmlaq_f64(vld1q_f64(&coeffs[2*2]), vld1q_f64(&coeffs[2*6]), x);
+    float64x2_t C   = vmlaq_f64(vld1q_f64(&coeffs[2*1]), vld1q_f64(&coeffs[2*5]), x);
+    float64x2_t D   = vmlaq_f64(vld1q_f64(&coeffs[2*3]), vld1q_f64(&coeffs[2*7]), x);
+    float64x2_t x2  = vmulq_f64(x, x);
+    float64x2_t x4  = vmulq_f64(x2, x2);
+    float64x2_t res = vmlaq_f64(vmlaq_f64(A, B, x2), vmlaq_f64(C, D, x2), x4);
+    return res;
+}
+
+#endif /* #if defined(__aarch64__) */
+
+
 inline float32x4_t vexpq_f32(float32x4_t x)
 {
     static const float32_t CONST_LN2[4]          = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2)
@@ -261,6 +316,7 @@ inline float32x4_t vexpq_f32(float32x4_t x)
     poly = vreinterpretq_f32_s32(vqaddq_s32(vreinterpretq_s32_f32(poly), vqshlq_n_s32(m, 23)));
     poly = vbslq_f32(vcltq_s32(m, vld1q_s32(CONST_NEGATIVE_126)), vld1q_f32(CONST_0), poly);
 
+
     return poly;
 }
 
@@ -282,6 +338,7 @@ inline float32x4_t vlogq_f32(float32x4_t x)
     return poly;
 }
 
+
 inline float32x4_t vtanhq_f32(float32x4_t val)
 {
     static const float32_t CONST_1[4]        = {1.f,1.f,1.f,1.f};

diff --git a/ComputeLibrary/README.md b/ComputeLibrary/README.md
@@ -1,7 +1,7 @@
 README
 ======
 
-This folder is containing two files imported, and slightly modified, from the ComputeLibrary:
+This folder is containing two files imported from the ComputeLibrary:
 
     NEMath.h and arm_cl_tables.c 
 
@@ -15,5 +15,7 @@ The tables contained in NEMath.inl have been moved to arm_cl_tables.c and finall
 
 Otherwise, the features and implementations are the same : a few optimized Neon functions.
 
+New aarch64 code have been contributed by the community.
+
 The license covering those files is different : It is a MIT license.
 Other parts of the CMSIS-DSP are covered with an Apache-2.0 license.
diff --git a/ComputeLibrary/Source/arm_cl_tables.c b/ComputeLibrary/Source/arm_cl_tables.c
@@ -39,7 +39,23 @@ const float32_t exp_tab[4*8] =
         0.000195780929062f,0.000195780929062f,0.000195780929062f,0.000195780929062f
 };
 
+
+
 /** Logarithm polynomial coefficients */
+
+/*
+p0
+p4
+p2
+p6
+p1
+p5
+p3
+p7
+where Poly(x) is the Minimax approximation of log(x) over the 
+                range [1, 2]
+*/
+
 const float32_t log_tab[4*8] =
 {
         -2.29561495781f,-2.29561495781f,-2.29561495781f,-2.29561495781f,
@@ -52,4 +68,9 @@ const float32_t log_tab[4*8] =
         0.0141278216615f,0.0141278216615f,0.0141278216615f,0.0141278216615f
 };
 
+
+
+
+
+
 #endif
diff --git a/Include/dsp/matrix_functions.h b/Include/dsp/matrix_functions.h
@@ -1,8 +1,8 @@
 /******************************************************************************
  * @file     matrix_functions.h
  * @brief    Public header file for CMSIS DSP Library
- * @version  V1.10.0
- * @date     08 July 2021
+ * @version  V1.10.1
+ * @date     10 August 2022
  * Target Processor: Cortex-M and Cortex-A cores
  ******************************************************************************/
 /*
@@ -620,6 +620,20 @@ void arm_mat_init_f32(
         uint16_t nColumns,
         float32_t * pData);
 
+/**
+ * @brief  Floating-point matrix initialization.
+ * @param[in,out] S         points to an instance of the floating-point matrix structure.
+ * @param[in]     nRows     number of rows in the matrix.
+ * @param[in]     nColumns  number of columns in the matrix.
+ * @param[in]     pData     points to the matrix data array.
+ */
+void arm_mat_init_f64(
+      arm_matrix_instance_f64 * S,
+      uint16_t nRows,
+      uint16_t nColumns,
+      float64_t * pData);
+
+
 
 
   /**

diff --git a/README.md b/README.md
@@ -11,8 +11,17 @@ It provides optimized compute kernels for Cortex-M and for Cortex-A.
 
 Different variants are available according to the core and most of the functions are using a vectorized version when the Helium or Neon extension is available.
 
+This repository contains the CMSIS-DSP library and several other projects:
 
-### Kernels
+* Test framework for bare metal Cortex-M or Cortex-A
+* Examples for bare metal Cortex-M
+* ComputeGraph
+* PythonWrapper
+
+You don't need any of the other projects to build and use CMSIS-DSP library. Building the other projects may require installation of other libraries (CMSIS), other tools (Arm Virtual Hardware) or CMSIS build tools.
+
+
+### CMSIS-DSP Kernels
 
 Kernels provided by CMSIS-DSP (list not exhaustive):
 
@@ -46,9 +55,9 @@ The Python scripts for the static scheduler generator are part of the CMSIS-DSP
 
 The header files are part of the CMSIS-DSP pack (version 1.10.2 and above).
 
-The audio streaming nodes on top of CMSIS-RTOS2 are not part of the CMSIS-DSP pack but can be found in the repository. They are demo quality only. They can be used with Arm Virtual Hardware.
+The audio streaming nodes on top of CMSIS-RTOS2 are not part of the CMSIS-DSP pack but can be found in the repository. They are demo quality only. They can only be used with Arm Virtual Hardware.
 
-The SDF is making it easier to implement a streaming solution : connecting different compute kernels each consuming and producing different amount of data.
+The Compute Graph is making it easier to implement a streaming solution : connecting different compute kernels each consuming and producing different amount of data.
 
 ## Support / Contact
 
@@ -91,6 +100,8 @@ Some compiler may also require the use of option `-munaligned-access` to specify
 
 ## How to build
 
+You can build CMSIS-DSP with the open CMSIS-Pack, or cmake, or Makefile and it is also easy to build if you use any other build tool.
+
 ### Building with MDK or Open CMSIS-Pack
 
 The standard way to build is by using the CMSIS pack technology. CMSIS-DSP is available as a pack.
@@ -99,10 +110,6 @@ This pack technology is supported by some IDE like [Keil MDK](https://www.keil.c
 
 You can also use those packs using the [Open CMSIS-Pack](https://www.open-cmsis-pack.org/) technology and from command line on any platform.
 
-cmake can also be used to build CMSIS-DSP.
-
-### How to build with Open CMSIS-Pack
-
 You should first install the tools from https://github.com/Open-CMSIS-Pack/devtools
 
 You can get the CMSIS-Toolbox which is containing the package installer, cmsis build and cmsis project manager. Here is some documentation:
@@ -113,7 +120,7 @@ You can get the CMSIS-Toolbox which is containing the package installer, cmsis b
 
 Once you have installed the tools, you'll need to download the pack index using the `cpackget` tool.
 
-Then, you'll need to convert the solution file into `.cprj`. For instance, for the CMSIS-DSP Examples, you can go to: 
+Then, you'll need to convert a solution file into `.cprj`. For instance, for the CMSIS-DSP Examples, you can go to: 
 
 `Examples/cmsis_build` 
 
@@ -164,7 +171,18 @@ Once cmake has generated the makefiles, you can use a GNU Make to build.
 
     make VERBOSE=1
 
+### How to build with any other build system
 
+You need the following folders:
+
+* Source
+* Include
+* PrivateInclude
+* ComputeLibrary (only if you target Neon)
+
+In `Source` subfolders, you may either build all of the source file with a datatype suffix (like `_f32.c`), or just compile the files without a datatype suffix. For instance for `BasicMathFunctions`, you can build all the C files except `BasicMathFunctions.c` and `BasicMathFunctionsF16.c`, or you can just build those two files (they are including all of the other C files of the folder).
+
+`f16` files are not mandatory. You can build with `-DDISABLEFLOAT16`
 
 ### How to build for aarch64
 
@@ -193,9 +211,9 @@ For cmake the equivalent options are:
 
 cmake is automatically including the `ComputeLibrary` folder. If you are using a different build, you need to include this folder too to build with Neon support.
 
-### Running
+### Running the examples
 
-The generated executable can be run on a fast model. 
+If you build the examples with CMSIS build tools, the generated executable can be run on a fast model. 
 For instance, if you built for m7, you could just do:
 
     FVP_MPS2_Cortex-M7.exe -a arm_variance_example
@@ -206,6 +224,15 @@ Of course, on your fast model or virtual hardware you should use the right confi
 
 ## Folders and files
 
+The only folders required to build and use CMSIS-DSP Library are:
+
+* Source
+* Include
+* PrivateInclude
+* ComputeLibrary (only when using Neon)
+
+Other folders are part of different projects, tests or examples.
+
 ### Folders
 
 * cmsisdsp
@@ -214,7 +241,8 @@ Of course, on your fast model or virtual hardware you should use the right confi
 * ComputeLibrary:
   * Some kernels required when building CMSIS-DSP with Neon acceleration
 * Examples:
-  * Examples of use of CMSIS-DSP
+  * Examples of use of CMSIS-DSP on bare metal Cortex-M
+  * Require the use of CMSIS Build tools
 * Include:
   * Include files for CMSIS-DSP
 * PrivateInclude:
@@ -225,13 +253,17 @@ Of course, on your fast model or virtual hardware you should use the right confi
 * Scripts:
   * Debugging scripts
   * Script to generate some coefficient tables used by CMSIS-DSP
-* SDFTools:
-  * Examples for the Synchronous Data Flow
-  * C++ templates for the Synchronous Data Flow
+* Compute Graph:
+  * Not needed to build CMSIS-DSP. This project is relying on CMSIS-DSP library
+  * Examples for the Compute Graph
+  * C++ templates for the Compute Graph
+  * Default (and optional) nodes
+  
 * Source:
   * CMSIS-DSP source
 * Testing:
-  * CMSIS-DSP Test framework
+  * CMSIS-DSP Test framework for bare metal Cortex-M and Cortex-A
+  * Require the use of CMSIS build tools
 
 ### Files