Skip to content

Commit

Permalink
CMSIS-NN: Add arm_nn_write_q15x2_ia
Browse files Browse the repository at this point in the history
* Adding arm_nn_write_q15x2_ia replacing write_q15x2_ia in CMSIS-DSP.
* Updating clang format to not have brace wrapping after extern.
  • Loading branch information
mansnils authored and felix-johnny committed Feb 9, 2021
1 parent 10b1569 commit a71c2b4
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 56 deletions.
5 changes: 3 additions & 2 deletions CMSIS/NN/.clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,14 @@ BraceWrapping:
AfterClass: false
AfterControlStatement: true
AfterEnum: true
AfterExternBlock: false
AfterFunction: true
AfterStruct: true
AfterUnion: true
BeforeElse: true
IndentBraces: true
IndentBraces: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Allman
BreakBeforeBraces: Custom
ColumnLimit: 120
DerivePointerAlignment: false
IndentWidth: 4
Expand Down
102 changes: 59 additions & 43 deletions CMSIS/NN/Include/arm_nnsupportfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ extern "C" {
/**
* @brief Union for SIMD access of q31/q15/q7 types
*/
union arm_nnword {
union arm_nnword
{
q31_t word;
/**< q31 type */
q15_t half_words[2];
Expand All @@ -68,7 +69,8 @@ struct arm_nn_double
int32_t high;
};

union arm_nn_long_long {
union arm_nn_long_long
{
int64_t long_long;
struct arm_nn_double word;
};
Expand Down Expand Up @@ -144,8 +146,8 @@ void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size,
* @return none.
*
* @details This function does the q7 to q15 expansion with re-ordering of bytes. Re-ordering is a consequence of
* the sign extension intrinsic(DSP extension). The tail (i.e., last (N % 4) elements) retains its original
* order.
* the sign extension intrinsic(DSP extension). The tail (i.e., last (N % 4) elements) retains its
* original order.
*
*/
void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset);
Expand Down Expand Up @@ -219,7 +221,7 @@ q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row,
* 2. NULL if implementation is not available.
*
* @details Supported framework: TensorFlow Lite
*/
*/
q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
const q7_t *input_col,
const uint16_t output_ch,
Expand Down Expand Up @@ -251,7 +253,7 @@ q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
* *output += row_base[i] * col_base[i]
* sum_col += col_base[i]
*
*/
*/
arm_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements,
const int8_t *row_base,
const int8_t *col_base,
Expand Down Expand Up @@ -279,7 +281,7 @@ arm_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements,
* ..
* output[3] += row_base[i + (row_elements * 3)] * col_base[i]
* sum_col += col_base[i]
*/
*/
arm_status arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
const int32_t offset,
const int8_t *row_base,
Expand All @@ -288,34 +290,34 @@ arm_status arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
int32_t *const output);

/**
* @brief General Matrix-multiplication function with per-channel requantization.
* This function assumes:
* - LHS input matrix NOT transposed (nt)
* - RHS input matrix transposed (t)
*
* @note This operation also performs the broadcast bias addition before the requantization
*
* @param[in] lhs Pointer to the LHS input matrix
* @param[in] rhs Pointer to the RHS input matrix
* @param[in] bias Pointer to the bias vector. The length of this vector is equal to the number of output
* columns (or RHS input rows)
* @param[out] dst Pointer to the output matrix with "m" rows and "n" columns
* @param[in] dst_multipliers Pointer to the multipliers vector needed for the per-channel requantization.
* The length of this vector is equal to the number of output columns (or RHS input rows)
* @param[in] dst_shifts Pointer to the shifts vector needed for the per-channel requantization. The length of
* this vector is equal to
* the number of output columns (or RHS input rows)
* @param[in] lhs_rows Number of LHS input rows
* @param[in] rhs_rows Number of RHS input rows
* @param[in] rhs_cols Number of LHS/RHS input columns
* @param[in] lhs_offset Offset to be applied to the LHS input value
* @param[in] dst_offset Offset to be applied the output result
* @param[in] activation_min Minimum value to clamp down the output. Range : int8
* @param[in] activation_max Maximum value to clamp up the output. Range : int8
*
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
* @brief General Matrix-multiplication function with per-channel requantization.
* This function assumes:
* - LHS input matrix NOT transposed (nt)
* - RHS input matrix transposed (t)
*
* @note This operation also performs the broadcast bias addition before the requantization
*
* @param[in] lhs Pointer to the LHS input matrix
* @param[in] rhs Pointer to the RHS input matrix
* @param[in] bias Pointer to the bias vector. The length of this vector is equal to the number of
* output columns (or RHS input rows)
* @param[out] dst Pointer to the output matrix with "m" rows and "n" columns
* @param[in] dst_multipliers Pointer to the multipliers vector needed for the per-channel requantization.
* The length of this vector is equal to the number of output columns (or RHS input
* rows)
* @param[in] dst_shifts Pointer to the shifts vector needed for the per-channel requantization. The length
* of this vector is equal to the number of output columns (or RHS input rows)
* @param[in] lhs_rows Number of LHS input rows
* @param[in] rhs_rows Number of RHS input rows
* @param[in] rhs_cols Number of LHS/RHS input columns
* @param[in] lhs_offset Offset to be applied to the LHS input value
* @param[in] dst_offset Offset to be applied to the output result
* @param[in] activation_min Minimum value to clamp down the output. Range : int8
* @param[in] activation_max Maximum value to clamp up the output. Range : int8
*
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
arm_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs,
const q7_t *rhs,
const q31_t *bias,
Expand Down Expand Up @@ -387,8 +389,8 @@ arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
* - Updated output pointer if an implementation is available
* - NULL if no implementation is available.
*
* @note If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read out
* for the following.
* @note If number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read
* out for the following.
* - Output shift
* - Output multiplier
* - Output bias
Expand Down Expand Up @@ -428,8 +430,8 @@ q7_t *arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs,
* - Updated output pointer if an implementation is available
* - NULL if no implementation is available.
*
* @note If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read out
* for the following.
* @note If number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read
* out for the following.
* - Output shift
* - Output multiplier
* - Output bias
Expand Down Expand Up @@ -519,8 +521,8 @@ __STATIC_FORCEINLINE void arm_memset_q7(q7_t *dst, const q7_t val, uint32_t bloc
" vstrb.8 q0, [%[in]], 16 \n"
" letp lr, 2b \n"
"1: \n"
: [in] "+r"(dst)
: [cnt] "r"(block_size), [set_val] "r"(val)
: [ in ] "+r"(dst)
: [ cnt ] "r"(block_size), [ set_val ] "r"(val)
: "q0", "memory", "r14");
#else
memset(dst, val, block_size);
Expand Down Expand Up @@ -779,8 +781,8 @@ __STATIC_FORCEINLINE void arm_memcpy_q7(q7_t *__RESTRICT dst, const q7_t *__REST
" vstrb.8 q0, [%[out]], 16 \n"
" letp lr, 2b \n"
"1: \n"
: [in] "+r"(src), [out] "+r"(dst)
: [cnt] "r"(block_size)
: [ in ] "+r"(src), [ out ] "+r"(dst)
: [ cnt ] "r"(block_size)
: "q0", "memory", "r14");
#else
memcpy(dst, src, block_size);
Expand Down Expand Up @@ -917,6 +919,20 @@ __STATIC_FORCEINLINE int32_t arm_nn_one_over_one_plus_x_for_x_in_0_1(int32_t val
return MUL_POW2(x, 1);
}

/**
  @brief         Write 2 q15 elements and post increment pointer.
  @param[in,out] dest_q15  Pointer to pointer that holds address of destination.
                           On return the held address has been advanced by 2 q15 elements.
  @param[in]     src_q31   Input value to be written; its bytes are copied to the destination as-is.
  @return        none
 */
__STATIC_FORCEINLINE void arm_nn_write_q15x2_ia(q15_t **dest_q15, q31_t src_q31)
{
    /*
     * src_q31 is already a private by-value copy, so it is copied from directly.
     * memcpy performs a byte-wise store and therefore places no alignment
     * requirement on the destination address.
     * NOTE(review): sizeof(src_q31) replaces the literal 4 used previously; this
     * assumes q31_t is 32 bits wide (i.e. two q15 elements) - confirm against the
     * type definitions.
     */
    memcpy(*dest_q15, &src_q31, sizeof(src_q31));

    /* Post-increment: step the caller's pointer past the two elements just written. */
    *dest_q15 += 2;
}

#ifdef __cplusplus
}
#endif
Expand Down
2 changes: 1 addition & 1 deletion CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ void arm_relu_q15(q15_t *data, uint16_t size)
/* if MSB=1, mask will be 0xFF, 0x0 otherwise */
mask = __QSUB16(0x00000000, buf);

write_q15x2_ia(&output, in & (~mask));
arm_nn_write_q15x2_ia(&output, in & (~mask));
i--;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
*
* -------------------------------------------------------------------- */

#include "arm_math_memory.h"
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

Expand Down Expand Up @@ -63,10 +62,10 @@ void arm_nn_accumulate_q7_to_q15(q15_t *pDst, const q7_t *pSrc, uint32_t length)
#endif

in = arm_nn_read_q15x2(pCnt);
write_q15x2_ia(&pCnt, __QADD16(vo1, in));
arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo1, in));

in = arm_nn_read_q15x2(pCnt);
write_q15x2_ia(&pCnt, __QADD16(vo2, in));
arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in));

cnt--;
}
Expand All @@ -80,4 +79,4 @@ void arm_nn_accumulate_q7_to_q15(q15_t *pDst, const q7_t *pSrc, uint32_t length)

/**
* @} end of NNBasicMath group
*/
*/
4 changes: 2 additions & 2 deletions CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,8 @@ void arm_q7_to_q15_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize)
out1 = (int32_t)__PKHTB(in1, in2, 16);
out2 = (int32_t)__PKHBT(in2, in1, 16);
#endif
write_q15x2_ia(&pDst, out1);
write_q15x2_ia(&pDst, out2);
arm_nn_write_q15x2_ia(&pDst, out1);
arm_nn_write_q15x2_ia(&pDst, out2);

/* Decrement the loop counter */
blkCnt--;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t b
out_q15x2_1 = __SXTAB16(offset_q15x2, __ROR((uint32_t)in_q7x4, 8));
out_q15x2_2 = __SXTAB16(offset_q15x2, in_q7x4);

write_q15x2_ia(&dst, out_q15x2_2);
write_q15x2_ia(&dst, out_q15x2_1);
arm_nn_write_q15x2_ia(&dst, out_q15x2_2);
arm_nn_write_q15x2_ia(&dst, out_q15x2_1);

block_cnt--;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,8 @@ void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size,
out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16);
out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16);

write_q15x2_ia(&dst, out_q15x2_1);
write_q15x2_ia(&dst, out_q15x2_2);
arm_nn_write_q15x2_ia(&dst, out_q15x2_1);
arm_nn_write_q15x2_ia(&dst, out_q15x2_2);

block_cnt--;
}
Expand Down

0 comments on commit a71c2b4

Please sign in to comment.