Skip to content

Commit

Permalink
Add some f64 Neon optimisations for aarch64
Browse files Browse the repository at this point in the history
Some documentation improvements.
  • Loading branch information
Silfurion authored and christophe0606 committed Sep 29, 2022
1 parent 633b528 commit 3d5fb83
Show file tree
Hide file tree
Showing 36 changed files with 3,186 additions and 1,381 deletions.
14 changes: 13 additions & 1 deletion ComputeGraph/cg/static/nodes/cpp/StreamingNodes/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,16 @@
# RingBuffer
# Streaming Nodes

This is an example implementation to be used with Arm Virtual Hardware (AVH).

It is requiring headers and source files provided by AVH.

Those files are not needed at all to use the Compute Graph.

Those files are kept because they are used in the AVH-SystemModeling example.

But there are simpler way to interface the compute graph to an audio interrupt.

## RingBuffer

It is a way to connect the compute graph with static flow to an audio source or sink.

Expand Down
57 changes: 57 additions & 0 deletions ComputeLibrary/Include/NEMath.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,28 @@


#if defined(ARM_MATH_NEON)

#if defined(__aarch64__)

/** Perform a 7th degree polynomial approximation using Estrin's method.
*
* @param[in] x Input vector value in F32 format.
* @param[in] coeffs Polynomial coefficients table. (array of flattened float32x4_t vectors)
*
* @return The calculated approximation.
*/
static inline float64x2_t vtaylor_polyq_f64(float64x2_t x, const float64_t *coeffs);

/** Calculate reciprocal.
*
* @param[in] x Input value.
*
* @return The calculated reciprocal.
*/
static inline float64x2_t vinvq_f64(float64x2_t x);

#endif /* #if defined(__aarch64__) */

/** Calculate floor of a vector.
*
* @param[in] val Input vector value in F32 format.
Expand Down Expand Up @@ -182,10 +204,14 @@ static inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n);
/** Exponent polynomial coefficients */
extern const float32_t exp_tab[4*8];

extern const float64_t exp_tab_64[2*8];


/** Logarithm polynomial coefficients */
extern const float32_t log_tab[4*8];

extern const float64_t log_tab_64[2*8];

#ifndef DOXYGEN_SKIP_THIS
inline float32x4_t vfloorq_f32(float32x4_t val)
{
Expand Down Expand Up @@ -231,6 +257,18 @@ inline float32x4_t vinvq_f32(float32x4_t x)
return recip;
}

#if defined(__aarch64__)

inline float64x2_t vinvq_f64(float64x2_t x)
{
float64x2_t recip = vrecpeq_f64(x);
recip = vmulq_f64(vrecpsq_f64(x, recip), recip);
recip = vmulq_f64(vrecpsq_f64(x, recip), recip);
return recip;
}

#endif /* #if defined(__aarch64__) */

inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs)
{
float32x4_t A = vmlaq_f32(vld1q_f32(&coeffs[4*0]), vld1q_f32(&coeffs[4*4]), x);
Expand All @@ -243,6 +281,23 @@ inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs)
return res;
}

#if defined(__aarch64__)

inline float64x2_t vtaylor_polyq_f64(float64x2_t x, const float64_t *coeffs)
{
float64x2_t A = vmlaq_f64(vld1q_f64(&coeffs[2*0]), vld1q_f64(&coeffs[2*4]), x);
float64x2_t B = vmlaq_f64(vld1q_f64(&coeffs[2*2]), vld1q_f64(&coeffs[2*6]), x);
float64x2_t C = vmlaq_f64(vld1q_f64(&coeffs[2*1]), vld1q_f64(&coeffs[2*5]), x);
float64x2_t D = vmlaq_f64(vld1q_f64(&coeffs[2*3]), vld1q_f64(&coeffs[2*7]), x);
float64x2_t x2 = vmulq_f64(x, x);
float64x2_t x4 = vmulq_f64(x2, x2);
float64x2_t res = vmlaq_f64(vmlaq_f64(A, B, x2), vmlaq_f64(C, D, x2), x4);
return res;
}

#endif /* #if defined(__aarch64__) */


inline float32x4_t vexpq_f32(float32x4_t x)
{
static const float32_t CONST_LN2[4] = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2)
Expand All @@ -261,6 +316,7 @@ inline float32x4_t vexpq_f32(float32x4_t x)
poly = vreinterpretq_f32_s32(vqaddq_s32(vreinterpretq_s32_f32(poly), vqshlq_n_s32(m, 23)));
poly = vbslq_f32(vcltq_s32(m, vld1q_s32(CONST_NEGATIVE_126)), vld1q_f32(CONST_0), poly);


return poly;
}

Expand All @@ -282,6 +338,7 @@ inline float32x4_t vlogq_f32(float32x4_t x)
return poly;
}


inline float32x4_t vtanhq_f32(float32x4_t val)
{
static const float32_t CONST_1[4] = {1.f,1.f,1.f,1.f};
Expand Down
4 changes: 3 additions & 1 deletion ComputeLibrary/README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
README
======

This folder is containing two files imported, and slightly modified, from the ComputeLibrary:
This folder is containing two files imported from the ComputeLibrary:

NEMath.h and arm_cl_tables.c

Expand All @@ -15,5 +15,7 @@ The tables contained in NEMath.inl have been moved to arm_cl_tables.c and finall

Otherwise, the features and implementations are the same : a few optimized Neon functions.

New aarch64 code have been contributed by the community.

The license covering those files is different : It is a MIT license.
Other parts of the CMSIS-DSP are covered with an Apache-2.0 license.
21 changes: 21 additions & 0 deletions ComputeLibrary/Source/arm_cl_tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,23 @@ const float32_t exp_tab[4*8] =
0.000195780929062f,0.000195780929062f,0.000195780929062f,0.000195780929062f
};



/** Logarithm polynomial coefficients */

/*
p0
p4
p2
p6
p1
p5
p3
p7
where Poly(x) is the Minimax approximation of log(x) over the
range [1, 2]
*/

const float32_t log_tab[4*8] =
{
-2.29561495781f,-2.29561495781f,-2.29561495781f,-2.29561495781f,
Expand All @@ -52,4 +68,9 @@ const float32_t log_tab[4*8] =
0.0141278216615f,0.0141278216615f,0.0141278216615f,0.0141278216615f
};






#endif
18 changes: 16 additions & 2 deletions Include/dsp/matrix_functions.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
/******************************************************************************
* @file matrix_functions.h
* @brief Public header file for CMSIS DSP Library
* @version V1.10.0
* @date 08 July 2021
* @version V1.10.1
* @date 10 August 2022
* Target Processor: Cortex-M and Cortex-A cores
******************************************************************************/
/*
Expand Down Expand Up @@ -620,6 +620,20 @@ void arm_mat_init_f32(
uint16_t nColumns,
float32_t * pData);

/**
* @brief Floating-point matrix initialization.
* @param[in,out] S points to an instance of the floating-point matrix structure.
* @param[in] nRows number of rows in the matrix.
* @param[in] nColumns number of columns in the matrix.
* @param[in] pData points to the matrix data array.
*/
void arm_mat_init_f64(
arm_matrix_instance_f64 * S,
uint16_t nRows,
uint16_t nColumns,
float64_t * pData);




/**
Expand Down
62 changes: 47 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,17 @@ It provides optimized compute kernels for Cortex-M and for Cortex-A.

Different variants are available according to the core and most of the functions are using a vectorized version when the Helium or Neon extension is available.

This repository contains the CMSIS-DSP library and several other projects:

### Kernels
* Test framework for bare metal Cortex-M or Cortex-A
* Examples for bare metal Cortex-M
* ComputeGraph
* PythonWrapper

You don't need any of the other projects to build and use CMSIS-DSP library. Building the other projects may require installation of other libraries (CMSIS), other tools (Arm Virtual Hardware) or CMSIS build tools.


### CMSIS-DSP Kernels

Kernels provided by CMSIS-DSP (list not exhaustive):

Expand Down Expand Up @@ -46,9 +55,9 @@ The Python scripts for the static scheduler generator are part of the CMSIS-DSP

The header files are part of the CMSIS-DSP pack (version 1.10.2 and above).

The audio streaming nodes on top of CMSIS-RTOS2 are not part of the CMSIS-DSP pack but can be found in the repository. They are demo quality only. They can be used with Arm Virtual Hardware.
The audio streaming nodes on top of CMSIS-RTOS2 are not part of the CMSIS-DSP pack but can be found in the repository. They are demo quality only. They can only be used with Arm Virtual Hardware.

The SDF is making it easier to implement a streaming solution : connecting different compute kernels each consuming and producing different amount of data.
The Compute Graph is making it easier to implement a streaming solution : connecting different compute kernels each consuming and producing different amount of data.

## Support / Contact

Expand Down Expand Up @@ -91,6 +100,8 @@ Some compiler may also require the use of option `-munaligned-access` to specify

## How to build

You can build CMSIS-DSP with the open CMSIS-Pack, or cmake, or Makefile and it is also easy to build if you use any other build tool.

### Building with MDK or Open CMSIS-Pack

The standard way to build is by using the CMSIS pack technology. CMSIS-DSP is available as a pack.
Expand All @@ -99,10 +110,6 @@ This pack technology is supported by some IDE like [Keil MDK](https://www.keil.c

You can also use those packs using the [Open CMSIS-Pack](https://www.open-cmsis-pack.org/) technology and from command line on any platform.

cmake can also be used to build CMSIS-DSP.

### How to build with Open CMSIS-Pack

You should first install the tools from https://github.com/Open-CMSIS-Pack/devtools

You can get the CMSIS-Toolbox which is containing the package installer, cmsis build and cmsis project manager. Here is some documentation:
Expand All @@ -113,7 +120,7 @@ You can get the CMSIS-Toolbox which is containing the package installer, cmsis b

Once you have installed the tools, you'll need to download the pack index using the `cpackget` tool.

Then, you'll need to convert the solution file into `.cprj`. For instance, for the CMSIS-DSP Examples, you can go to:
Then, you'll need to convert a solution file into `.cprj`. For instance, for the CMSIS-DSP Examples, you can go to:

`Examples/cmsis_build`

Expand Down Expand Up @@ -164,7 +171,18 @@ Once cmake has generated the makefiles, you can use a GNU Make to build.

make VERBOSE=1

### How to build with any other build system

You need the following folders:

* Source
* Include
* PrivateInclude
* ComputeLibrary (only if you target Neon)

In `Source` subfolders, you may either build all of the source file with a datatype suffix (like `_f32.c`), or just compile the files without a datatype suffix. For instance for `BasicMathFunctions`, you can build all the C files except `BasicMathFunctions.c` and `BasicMathFunctionsF16.c`, or you can just build those two files (they are including all of the other C files of the folder).

`f16` files are not mandatory. You can build with `-DDISABLEFLOAT16`

### How to build for aarch64

Expand Down Expand Up @@ -193,9 +211,9 @@ For cmake the equivalent options are:
cmake is automatically including the `ComputeLibrary` folder. If you are using a different build, you need to include this folder too to build with Neon support.
### Running
### Running the examples
The generated executable can be run on a fast model.
If you build the examples with CMSIS build tools, the generated executable can be run on a fast model.
For instance, if you built for m7, you could just do:
FVP_MPS2_Cortex-M7.exe -a arm_variance_example
Expand All @@ -206,6 +224,15 @@ Of course, on your fast model or virtual hardware you should use the right confi
## Folders and files
The only folders required to build and use CMSIS-DSP Library are:
* Source
* Include
* PrivateInclude
* ComputeLibrary (only when using Neon)
Other folders are part of different projects, tests or examples.
### Folders
* cmsisdsp
Expand All @@ -214,7 +241,8 @@ Of course, on your fast model or virtual hardware you should use the right confi
* ComputeLibrary:
* Some kernels required when building CMSIS-DSP with Neon acceleration
* Examples:
* Examples of use of CMSIS-DSP
* Examples of use of CMSIS-DSP on bare metal Cortex-M
* Require the use of CMSIS Build tools
* Include:
* Include files for CMSIS-DSP
* PrivateInclude:
Expand All @@ -225,13 +253,17 @@ Of course, on your fast model or virtual hardware you should use the right confi
* Scripts:
* Debugging scripts
* Script to generate some coefficient tables used by CMSIS-DSP
* SDFTools:
* Examples for the Synchronous Data Flow
* C++ templates for the Synchronous Data Flow
* Compute Graph:
* Not needed to build CMSIS-DSP. This project is relying on CMSIS-DSP library
* Examples for the Compute Graph
* C++ templates for the Compute Graph
* Default (and optional) nodes
* Source:
* CMSIS-DSP source
* Testing:
* CMSIS-DSP Test framework
* CMSIS-DSP Test framework for bare metal Cortex-M and Cortex-A
* Require the use of CMSIS build tools
### Files
Expand Down
Loading

0 comments on commit 3d5fb83

Please sign in to comment.