CMSIS-NN: Add arm_nn_write_q15x2_ia

* Add arm_nn_write_q15x2_ia to CMSIS-NN, replacing the use of write_q15x2_ia from CMSIS-DSP.
* Update the clang-format configuration so that braces are not wrapped after extern blocks.
davemgreen · Feb 9, 2021 · a71c2b4 · a71c2b4
1 parent 10b1569
commit a71c2b4
Show file tree

Hide file tree

Showing 7 changed files with 72 additions and 56 deletions.
diff --git a/CMSIS/NN/.clang-format b/CMSIS/NN/.clang-format
@@ -32,13 +32,14 @@ BraceWrapping:
   AfterClass:      false
   AfterControlStatement: true
   AfterEnum:       true
+  AfterExternBlock: false
   AfterFunction:   true
   AfterStruct:     true
   AfterUnion:      true
   BeforeElse:      true
-  IndentBraces:    true
+  IndentBraces:    false
 BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Allman
+BreakBeforeBraces: Custom
 ColumnLimit:     120
 DerivePointerAlignment: false
 IndentWidth:     4

diff --git a/CMSIS/NN/Include/arm_nnsupportfunctions.h b/CMSIS/NN/Include/arm_nnsupportfunctions.h
@@ -50,7 +50,8 @@ extern "C" {
 /**
  * @brief Union for SIMD access of q31/q15/q7 types
  */
-union arm_nnword {
+union arm_nnword
+{
     q31_t word;
     /**< q31 type */
     q15_t half_words[2];
@@ -68,7 +69,8 @@ struct arm_nn_double
     int32_t high;
 };
 
-union arm_nn_long_long {
+union arm_nn_long_long
+{
     int64_t long_long;
     struct arm_nn_double word;
 };
@@ -144,8 +146,8 @@ void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size,
  * @return none.
  *
  * @details  This function does the q7 to q15 expansion with re-ordering of bytes. Re-ordering is a consequence of
- *           the sign extension intrinsic(DSP extension). The tail (i.e., last (N % 4) elements) retains its original
- *           order.
+ *           the sign extension intrinsic (DSP extension). The tail (i.e., last (N % 4) elements) retains its
+ *           original order.
  *
  */
 void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset);
@@ -219,7 +221,7 @@ q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row,
  *              2. NULL if implementation is not available.
  *
  * @details   Supported framework: TensorFlow Lite
-*/
+ */
 q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
                          const q7_t *input_col,
                          const uint16_t output_ch,
@@ -251,7 +253,7 @@ q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
  *          *output += row_base[i] * col_base[i]
  *          sum_col += col_base[i]
  *
-*/
+ */
 arm_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements,
                                      const int8_t *row_base,
                                      const int8_t *col_base,
@@ -279,7 +281,7 @@ arm_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements,
  *                ..
  *          output[3] += row_base[i + (row_elements * 3)] * col_base[i]
  *          sum_col += col_base[i]
-*/
+ */
 arm_status arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
                                      const int32_t offset,
                                      const int8_t *row_base,
@@ -288,34 +290,34 @@ arm_status arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
                                      int32_t *const output);
 
 /**
-* @brief General Matrix-multiplication function with per-channel requantization.
-*        This function assumes:
-*        - LHS input matrix NOT transposed (nt)
-*        - RHS input matrix transposed (t)
-*
-*  @note This operation also performs the broadcast bias addition before the requantization
-*
-* @param[in]  lhs                Pointer to the LHS input matrix
-* @param[in]  rhs                Pointer to the RHS input matrix
-* @param[in]  bias               Pointer to the bias vector. The length of this vector is equal to the number of output
-*                                columns (or RHS input rows)
-* @param[out] dst                Pointer to the output matrix with "m" rows and "n" columns
-* @param[in]  dst_multipliers    Pointer to the multipliers vector needed for the per-channel requantization.
-*                                The length of this vector is equal to the number of output columns (or RHS input rows)
-* @param[in]  dst_shifts         Pointer to the shifts vector needed for the per-channel requantization. The length of
-*                                this vector is equal to
-*                                the number of output columns (or RHS input rows)
-* @param[in]  lhs_rows           Number of LHS input rows
-* @param[in]  rhs_rows           Number of RHS input rows
-* @param[in]  rhs_cols           Number of LHS/RHS input columns
-* @param[in]  lhs_offset         Offset to be applied to the LHS input value
-* @param[in]  dst_offset         Offset to be applied the output result
-* @param[in]  activation_min     Minimum value to clamp down the output. Range : int8
-* @param[in]  activation_max     Maximum value to clamp up the output. Range : int8
-*
-* @return     The function returns <code>ARM_MATH_SUCCESS</code>
-*
-*/
+ * @brief General Matrix-multiplication function with per-channel requantization.
+ *        This function assumes:
+ *        - LHS input matrix NOT transposed (nt)
+ *        - RHS input matrix transposed (t)
+ *
+ *  @note This operation also performs the broadcast bias addition before the requantization
+ *
+ * @param[in]  lhs                Pointer to the LHS input matrix
+ * @param[in]  rhs                Pointer to the RHS input matrix
+ * @param[in]  bias               Pointer to the bias vector. The length of this vector is equal to the number of
+ *                                output columns (or RHS input rows)
+ * @param[out] dst                Pointer to the output matrix with "m" rows and "n" columns
+ * @param[in]  dst_multipliers    Pointer to the multipliers vector needed for the per-channel requantization.
+ *                                The length of this vector is equal to the number of output columns (or RHS input
+ *                                rows)
+ * @param[in]  dst_shifts         Pointer to the shifts vector needed for the per-channel requantization. The length
+ *                                of this vector is equal to the number of output columns (or RHS input rows)
+ * @param[in]  lhs_rows           Number of LHS input rows
+ * @param[in]  rhs_rows           Number of RHS input rows
+ * @param[in]  rhs_cols           Number of LHS/RHS input columns
+ * @param[in]  lhs_offset         Offset to be applied to the LHS input value
+ * @param[in]  dst_offset         Offset to be applied to the output result
+ * @param[in]  activation_min     Minimum value to clamp down the output. Range : int8
+ * @param[in]  activation_max     Maximum value to clamp up the output. Range : int8
+ *
+ * @return     The function returns <code>ARM_MATH_SUCCESS</code>
+ *
+ */
 arm_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs,
                                    const q7_t *rhs,
                                    const q31_t *bias,
@@ -387,8 +389,8 @@ arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
  *                  - Updated output pointer if an implementation is available
  *                  - NULL if no implementation is available.
  *
- * @note           If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read out
- *                 for the following.
+ * @note           If number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read
+ *                 out for the following.
  *                  - Output shift
  *                  - Output multiplier
  *                  - Output bias
@@ -428,8 +430,8 @@ q7_t *arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs,
  *                  - Updated output pointer if an implementation is available
  *                  - NULL if no implementation is available.
  *
- * @note           If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read out
- *                 for the following.
+ * @note           If number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read
+ *                 out for the following.
  *                  - Output shift
  *                  - Output multiplier
  *                  - Output bias
@@ -519,8 +521,8 @@ __STATIC_FORCEINLINE void arm_memset_q7(q7_t *dst, const q7_t val, uint32_t bloc
                    "   vstrb.8                 q0, [%[in]], 16            \n"
                    "   letp                    lr, 2b                     \n"
                    "1:                                                    \n"
-                   : [in] "+r"(dst)
-                   : [cnt] "r"(block_size), [set_val] "r"(val)
+                   : [ in ] "+r"(dst)
+                   : [ cnt ] "r"(block_size), [ set_val ] "r"(val)
                    : "q0", "memory", "r14");
 #else
     memset(dst, val, block_size);
@@ -779,8 +781,8 @@ __STATIC_FORCEINLINE void arm_memcpy_q7(q7_t *__RESTRICT dst, const q7_t *__REST
                    "   vstrb.8                 q0, [%[out]], 16           \n"
                    "   letp                    lr, 2b                     \n"
                    "1:                                                    \n"
-                   : [in] "+r"(src), [out] "+r"(dst)
-                   : [cnt] "r"(block_size)
+                   : [ in ] "+r"(src), [ out ] "+r"(dst)
+                   : [ cnt ] "r"(block_size)
                    : "q0", "memory", "r14");
 #else
     memcpy(dst, src, block_size);
@@ -917,6 +919,20 @@ __STATIC_FORCEINLINE int32_t arm_nn_one_over_one_plus_x_for_x_in_0_1(int32_t val
     return MUL_POW2(x, 1);
 }
 
+/**
+  @brief         Write 2 q15 elements and post increment pointer.
+  @details       Copies 4 bytes from a local copy of src_q31 to the address held in *dest_q15 using memcpy,
+                 then advances *dest_q15 by two q15 elements.
+  @param[in,out] dest_q15  Pointer to pointer that holds address of destination. The pointed-to pointer is
+                           advanced by 2 elements when the function returns.
+  @param[in]     src_q31   Input value to be written.
+  @return        none
+ */
+__STATIC_FORCEINLINE void arm_nn_write_q15x2_ia(q15_t **dest_q15, q31_t src_q31)
+{
+    q31_t val = src_q31;
+
+    memcpy(*dest_q15, &val, 4);
+    *dest_q15 += 2;
+}
+
 #ifdef __cplusplus
 }
 #endif

diff --git a/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c b/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c
@@ -74,7 +74,7 @@ void arm_relu_q15(q15_t *data, uint16_t size)
         /* if MSB=1, mask will be 0xFF, 0x0 otherwise */
         mask = __QSUB16(0x00000000, buf);
 
-        write_q15x2_ia(&output, in & (~mask));
+        arm_nn_write_q15x2_ia(&output, in & (~mask));
         i--;
     }
 

diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c
@@ -28,7 +28,6 @@
  *
  * -------------------------------------------------------------------- */
 
-#include "arm_math_memory.h"
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 
@@ -63,10 +62,10 @@ void arm_nn_accumulate_q7_to_q15(q15_t *pDst, const q7_t *pSrc, uint32_t length)
 #endif
 
         in = arm_nn_read_q15x2(pCnt);
-        write_q15x2_ia(&pCnt, __QADD16(vo1, in));
+        arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo1, in));
 
         in = arm_nn_read_q15x2(pCnt);
-        write_q15x2_ia(&pCnt, __QADD16(vo2, in));
+        arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in));
 
         cnt--;
     }
@@ -80,4 +79,4 @@ void arm_nn_accumulate_q7_to_q15(q15_t *pDst, const q7_t *pSrc, uint32_t length)
 
 /**
  * @} end of NNBasicMath group
- */
+ */
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c b/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c
@@ -86,8 +86,8 @@ void arm_q7_to_q15_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize)
         out1 = (int32_t)__PKHTB(in1, in2, 16);
         out2 = (int32_t)__PKHBT(in2, in1, 16);
 #endif
-        write_q15x2_ia(&pDst, out1);
-        write_q15x2_ia(&pDst, out2);
+        arm_nn_write_q15x2_ia(&pDst, out1);
+        arm_nn_write_q15x2_ia(&pDst, out2);
 
         /* Decrement the loop counter */
         blkCnt--;

diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_with_offset.c b/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_with_offset.c
@@ -71,8 +71,8 @@ void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t b
         out_q15x2_1 = __SXTAB16(offset_q15x2, __ROR((uint32_t)in_q7x4, 8));
         out_q15x2_2 = __SXTAB16(offset_q15x2, in_q7x4);
 
-        write_q15x2_ia(&dst, out_q15x2_2);
-        write_q15x2_ia(&dst, out_q15x2_1);
+        arm_nn_write_q15x2_ia(&dst, out_q15x2_2);
+        arm_nn_write_q15x2_ia(&dst, out_q15x2_1);
 
         block_cnt--;
     }

diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c b/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c
@@ -86,8 +86,8 @@ void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size,
         out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16);
         out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16);
 
-        write_q15x2_ia(&dst, out_q15x2_1);
-        write_q15x2_ia(&dst, out_q15x2_2);
+        arm_nn_write_q15x2_ia(&dst, out_q15x2_1);
+        arm_nn_write_q15x2_ia(&dst, out_q15x2_2);
 
         block_cnt--;
     }