Official ARM version: v5.6.0

2020-07-28 11:24:49 +01:00 · 2020-07-28 11:24:49 +01:00 · 96d6da4e25
commit 96d6da4e25
parent 9f95ff5b6b
2939 changed files with 339304 additions and 113320 deletions
--- a/NN/Include/arm_nn_tables.h
+++ b/NN/Include/arm_nn_tables.h
@ -53,7 +53,4 @@ extern const q15_t tanhTable_q15[256];
 extern const q15_t sigmoidHTable_q15[192];
 extern const q15_t sigmoidLTable_q15[128];

-extern const q15_t sigmoidLTable_q15[128];
-extern const q15_t sigmoidHTable_q15[192];
-
 #endif                          /*  ARM_NN_TABLES_H */
--- a/NN/Include/arm_nnfunctions.h
+++ b/NN/Include/arm_nnfunctions.h
@ -34,7 +34,7 @@
   * ------------
   *
   * This user manual describes the CMSIS NN software library,
-   * a collection of efficient neural network kernels developed to maximize the 
+   * a collection of efficient neural network kernels developed to maximize the
   * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
   *
   * The library is divided into a number of functions each covering a specific category:
@ -47,8 +47,8 @@
   *
   * The library has separate functions for operating on different weight and activation data
   * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The descrition of the
-   * kernels are included in the function description. The implementation details are also 
-   * described in this paper [1]. 
+   * kernels are included in the function description. The implementation details are also
+   * described in this paper [1].
   *
   * Block Diagram
   * --------
@ -86,7 +86,7 @@

 /**
 * @defgroup groupNN Neural Network Functions
- * These functions perform basic operations for neural network layers. 
+ * These functions perform basic operations for neural network layers.
 */

 #ifndef _ARM_NNFUNCTIONS_H
@ -111,12 +111,12 @@ extern    "C"
 *
 * The convolution is implemented in 2 steps: im2col and GEMM
 *
- * im2col is a process of converting each patch of image data into 
+ * im2col is a process of converting each patch of image data into
 * a column. After im2col, the convolution is computed as matrix-matrix
 * multiplication.
- * 
+ *
 * To reduce the memory footprint, the im2col is performed partially.
- * Each iteration, only a few column (i.e., patches) are generated and 
+ * Each iteration, only a few column (i.e., patches) are generated and
 * computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions.
 *
 */
@ -136,9 +136,9 @@ extern    "C"
   * @param[in]       out_shift   amount of right-shift for output
   * @param[in,out]   Im_out      pointer to output tensor
   * @param[in]       dim_im_out  output tensor dimension
-   * @param[in,out]   bufferA     pointer to buffer space for input 
+   * @param[in,out]   bufferA     pointer to buffer space for input
   * @param[in,out]   bufferB     pointer to buffer space for output
-   * @return     The function returns <code>ARM_MATH_SUCCESS</code> 
+   * @return     The function returns <code>ARM_MATH_SUCCESS</code>
   *
   */

@ -153,9 +153,9 @@ extern    "C"
                                         const q7_t * bias,
                                         const uint16_t bias_shift,
                                         const uint16_t out_shift,
-                                         q7_t * Im_out, 
-                                         const uint16_t dim_im_out, 
-                                         q15_t * bufferA, 
+                                         q7_t * Im_out,
+                                         const uint16_t dim_im_out,
+                                         q15_t * bufferA,
                                         q7_t * bufferB);

  /**
@ -180,7 +180,7 @@ extern    "C"
   * @param[in]       dim_im_out_y output tensor dimension y
   * @param[in,out]   bufferA      pointer to buffer space for input
   * @param[in,out]   bufferB      pointer to buffer space for output
-   * @return     The function returns <code>ARM_MATH_SUCCESS</code> 
+   * @return     The function returns <code>ARM_MATH_SUCCESS</code>
   */

    arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
@ -219,9 +219,9 @@ extern    "C"
   * @param[in]       out_shift   amount of right-shift for output
   * @param[in,out]   Im_out      pointer to output tensor
   * @param[in]       dim_im_out  output tensor dimension
-   * @param[in,out]   bufferA     pointer to buffer space for input 
+   * @param[in,out]   bufferA     pointer to buffer space for input
   * @param[in,out]   bufferB     pointer to buffer space for output
-   * @return     The function returns <code>ARM_MATH_SUCCESS</code> 
+   * @return     The function returns <code>ARM_MATH_SUCCESS</code>
   *
   */

@ -236,9 +236,9 @@ extern    "C"
                                          const q15_t * bias,
                                          const uint16_t bias_shift,
                                          const uint16_t out_shift,
-                                          q15_t * Im_out, 
-                                          const uint16_t dim_im_out, 
-                                          q15_t * bufferA, 
+                                          q15_t * Im_out,
+                                          const uint16_t dim_im_out,
+                                          q15_t * bufferA,
                                          q7_t * bufferB);

  /**
@ -256,7 +256,7 @@ extern    "C"
   * @param[in]       out_shift   amount of right-shift for output
   * @param[in,out]   Im_out      pointer to output tensor
   * @param[in]       dim_im_out  output tensor dimension
-   * @param[in,out]   bufferA     pointer to buffer space for input 
+   * @param[in,out]   bufferA     pointer to buffer space for input
   * @param[in,out]   bufferB     pointer to buffer space for output
   * @return     The function returns either
   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@ -278,9 +278,9 @@ extern    "C"
                                        const q7_t * bias,
                                        const uint16_t bias_shift,
                                        const uint16_t out_shift,
-                                        q7_t * Im_out, 
-                                        const uint16_t dim_im_out, 
-                                        q15_t * bufferA, 
+                                        q7_t * Im_out,
+                                        const uint16_t dim_im_out,
+                                        q15_t * bufferA,
                                        q7_t * bufferB);

  /**
@ -303,7 +303,7 @@ extern    "C"
   * @param[in,out]   Im_out       pointer to output tensor
   * @param[in]       dim_im_out_x output tensor dimension x
   * @param[in]       dim_im_out_y output tensor dimension y
-   * @param[in,out]   bufferA      pointer to buffer space for input 
+   * @param[in,out]   bufferA      pointer to buffer space for input
   * @param[in,out]   bufferB      pointer to buffer space for output
   * @return     The function returns either
   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@ -355,7 +355,7 @@ extern    "C"
   * @param[in,out]   Im_out       pointer to output tensor
   * @param[in]       dim_im_out_x output tensor dimension x
   * @param[in]       dim_im_out_y output tensor dimension y
-   * @param[in,out]   bufferA      pointer to buffer space for input 
+   * @param[in,out]   bufferA      pointer to buffer space for input
   * @param[in,out]   bufferB      pointer to buffer space for output
   * @return     The function returns either
   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@ -405,7 +405,7 @@ extern    "C"
   * @param[in]       out_shift   amount of right-shift for output
   * @param[in,out]   Im_out      pointer to output tensor
   * @param[in]       dim_im_out  output tensor dimension
-   * @param[in,out]   bufferA     pointer to buffer space for input 
+   * @param[in,out]   bufferA     pointer to buffer space for input
   * @param[in,out]   bufferB     pointer to buffer space for output
   * @return     The function returns either
   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@ -426,9 +426,9 @@ extern    "C"
                                       const q7_t * bias,
                                       const uint16_t bias_shift,
                                       const uint16_t out_shift,
-                                       q7_t * Im_out, 
-                                       const uint16_t dim_im_out, 
-                                       q15_t * bufferA, 
+                                       q7_t * Im_out,
+                                       const uint16_t dim_im_out,
+                                       q15_t * bufferA,
                                       q7_t * bufferB);

  /**
@ -446,7 +446,7 @@ extern    "C"
   * @param[in]       out_shift   amount of right-shift for output
   * @param[in,out]   Im_out      pointer to output tensor
   * @param[in]       dim_im_out  output tensor dimension
-   * @param[in,out]   bufferA     pointer to buffer space for input 
+   * @param[in,out]   bufferA     pointer to buffer space for input
   * @param[in,out]   bufferB     pointer to buffer space for output
   * @return     The function returns either
   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@ -468,9 +468,9 @@ extern    "C"
                                         const q15_t * bias,
                                         const uint16_t bias_shift,
                                         const uint16_t out_shift,
-                                         q15_t * Im_out, 
-                                         const uint16_t dim_im_out, 
-                                         q15_t * bufferA, 
+                                         q15_t * Im_out,
+                                         const uint16_t dim_im_out,
+                                         q15_t * bufferA,
                                         q7_t * bufferB);

  /**
@ -493,7 +493,7 @@ extern    "C"
   * @param[in,out]   Im_out       pointer to output tensor
   * @param[in]       dim_im_out_x output tensor dimension x
   * @param[in]       dim_im_out_y output tensor dimension y
-   * @param[in,out]   bufferA      pointer to buffer space for input 
+   * @param[in,out]   bufferA      pointer to buffer space for input
   * @param[in,out]   bufferB      pointer to buffer space for output
   * @return     The function returns either
   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@ -508,7 +508,7 @@ extern    "C"
   *
   * <b>Input dimension constraints:</b>
   *
-   * ch_im_in is multiple of 2 
+   * ch_im_in is multiple of 2
   *
   * ch_im_out is multipe of 2
   *
@ -532,10 +532,10 @@ extern    "C"
                              const uint16_t out_shift,
                              q15_t * Im_out,
                              const uint16_t dim_im_out_x,
-                              const uint16_t dim_im_out_y, 
-                              q15_t * bufferA, 
+                              const uint16_t dim_im_out_y,
+                              q15_t * bufferA,
                              q7_t * bufferB);
-										 
+
  /**
   * @brief Q7 depthwise separable convolution function
   * @param[in]       Im_in       pointer to input tensor
@ -551,7 +551,7 @@ extern    "C"
   * @param[in]       out_shift   amount of right-shift for output
   * @param[in,out]   Im_out      pointer to output tensor
   * @param[in]       dim_im_out  output tensor dimension
-   * @param[in,out]   bufferA     pointer to buffer space for input 
+   * @param[in,out]   bufferA     pointer to buffer space for input
   * @param[in,out]   bufferB     pointer to buffer space for output
   * @return     The function returns either
   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@ -574,8 +574,8 @@ extern    "C"
                                                   const uint16_t bias_shift,
                                                   const uint16_t out_shift,
                                                   q7_t * Im_out,
-                                                   const uint16_t dim_im_out, 
-                                                   q15_t * bufferA, 
+                                                   const uint16_t dim_im_out,
+                                                   q15_t * bufferA,
                                                   q7_t * bufferB);

  /**
@ -598,7 +598,7 @@ extern    "C"
   * @param[in,out]   Im_out        pointer to output tensor
   * @param[in]       dim_im_out_x  output tensor dimension x
   * @param[in]       dim_im_out_y  output tensor dimension y
-   * @param[in,out]   bufferA       pointer to buffer space for input 
+   * @param[in,out]   bufferA       pointer to buffer space for input
   * @param[in,out]   bufferB       pointer to buffer space for output
   * @return     The function returns either
   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@ -642,7 +642,7 @@ extern    "C"
 *
 * Here we have two types of kernel functions. The basic function
 * implements the function using regular GEMV approach. The opt functions
- * operates with weights in interleaved formats. 
+ * operates with weights in interleaved formats.
 *
 */

@ -666,9 +666,9 @@ extern    "C"
                                      const uint16_t dim_vec,
                                      const uint16_t num_of_rows,
                                      const uint16_t bias_shift,
-                                      const uint16_t out_shift, 
-                                      const q7_t * bias, 
-                                      q7_t * pOut, 
+                                      const uint16_t out_shift,
+                                      const q7_t * bias,
+                                      q7_t * pOut,
                                      q15_t * vec_buffer);

  /**
@ -691,9 +691,9 @@ extern    "C"
                                          const uint16_t dim_vec,
                                          const uint16_t num_of_rows,
                                          const uint16_t bias_shift,
-                                          const uint16_t out_shift, 
-                                          const q7_t * bias, 
-                                          q7_t * pOut, 
+                                          const uint16_t out_shift,
+                                          const q7_t * bias,
+                                          q7_t * pOut,
                                          q15_t * vec_buffer);

  /**
@ -716,9 +716,9 @@ extern    "C"
                                       const uint16_t dim_vec,
                                       const uint16_t num_of_rows,
                                       const uint16_t bias_shift,
-                                       const uint16_t out_shift, 
-                                       const q15_t * bias, 
-                                       q15_t * pOut, 
+                                       const uint16_t out_shift,
+                                       const q15_t * bias,
+                                       q15_t * pOut,
                                       q15_t * vec_buffer);

  /**
@ -742,8 +742,8 @@ extern    "C"
                                           const uint16_t num_of_rows,
                                           const uint16_t bias_shift,
                                           const uint16_t out_shift,
-                                           const q15_t * bias, 
-                                           q15_t * pOut, 
+                                           const q15_t * bias,
+                                           q15_t * pOut,
                                           q15_t * vec_buffer);

  /**
@ -767,8 +767,8 @@ extern    "C"
                                                  const uint16_t num_of_rows,
                                                  const uint16_t bias_shift,
                                                  const uint16_t out_shift,
-                                                  const q7_t * bias, 
-                                                  q15_t * pOut, 
+                                                  const q7_t * bias,
+                                                  q15_t * pOut,
                                                  q15_t * vec_buffer);

  /**
@ -792,16 +792,16 @@ extern    "C"
                                                      const uint16_t num_of_rows,
                                                      const uint16_t bias_shift,
                                                      const uint16_t out_shift,
-                                                      const q7_t * bias, 
-                                                      q15_t * pOut, 
+                                                      const q7_t * bias,
+                                                      q15_t * pOut,
                                                      q15_t * vec_buffer);

 /**
 * @brief Matrix-Multiplication Kernels for Convolution
 *
- * These functions are used within convolution layer functions for 
+ * These functions are used within convolution layer functions for
 * matrix multiplication.
- * 
+ *
 * The implementation is similar to CMSIS-DSP arm_mat_mult functions
 * with one Q7 and one Q15 operands. The Q15 operand is the im2col
 * output which is always with 2 columns.
@ -826,8 +826,8 @@ extern    "C"
                                            const uint16_t ch_im_out,
                                            const uint16_t numCol_A,
                                            const uint16_t bias_shift,
-                                            const uint16_t out_shift, 
-                                            const q7_t * bias, 
+                                            const uint16_t out_shift,
+                                            const q7_t * bias,
                                            q7_t * pOut);

  /**
@ -848,8 +848,8 @@ extern    "C"
                                                      const uint16_t ch_im_out,
                                                      const uint16_t numCol_A,
                                                      const uint16_t bias_shift,
-                                                      const uint16_t out_shift, 
-                                                      const q7_t * bias, 
+                                                      const uint16_t out_shift,
+                                                      const q7_t * bias,
                                                      q7_t * pOut);

 #ifdef __cplusplus
@ -902,7 +902,7 @@ extern    "C"
   * @return none.
   */

-    void      arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width, 
+    void      arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width,
                                           arm_nn_activation_type type);

  /**
@ -944,9 +944,9 @@ extern    "C"
                                 const uint16_t ch_im_in,
                                 const uint16_t dim_kernel,
                                 const uint16_t padding,
-                                 const uint16_t stride, 
-                                 const uint16_t dim_im_out, 
-                                 q7_t * bufferA, 
+                                 const uint16_t stride,
+                                 const uint16_t dim_im_out,
+                                 q7_t * bufferA,
                                 q7_t * Im_out);

  /**
@ -969,9 +969,9 @@ extern    "C"
                                 const uint16_t ch_im_in,
                                 const uint16_t dim_kernel,
                                 const uint16_t padding,
-                                 const uint16_t stride, 
-                                 const uint16_t dim_im_out, 
-                                 q7_t * bufferA, 
+                                 const uint16_t stride,
+                                 const uint16_t dim_im_out,
+                                 q7_t * bufferA,
                                 q7_t * Im_out);

 /**
@ -1003,6 +1003,71 @@ extern    "C"

    void      arm_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);

+  /**
+   * @brief uint8 depthwise convolution function with asymmetric quantization for even number of channel multiplier
+   *        and input channels. Unless specified otherwise, arguments are mandatory.
+   *
+   * @param[in]     input     Pointer to input tensor
+   * @param[in]     input_x   Width of input tensor
+   * @param[in]     input_y   Height of input tensor
+   * @param[in]     input_ch  Channels in input tensor
+   * @param[in]     kernel    Pointer to kernel weights
+   * @param[in]     kernel_x  Width of kernel
+   * @param[in]     kernel_y  Height of kernel
+   * @param[in]     ch_mult   Channel multiplier
+   * @param[in]     pad_x     Padding size along x
+   * @param[in]     pad_y     Padding size along y
+   * @param[in]     stride_x  Convolution stride along the width
+   * @param[in]     stride_y  Convolution stride along the height
+   * @param[in]     dilation_x Dilation along width. Not used and intended for future enhancement.
+   * @param[in]     dilation_y Dilation along height. Not used and intended for future enhancement.
+   * @param[in]     bias       Pointer to optional bias values. If no bias is
+   *                           available, NULL is expected
+   * @param[in]     input_offset  Input tensor zero offset
+   * @param[in]     filter_offset Kernel tensor zero offset
+   * @param[in]     output_offset Output tensor zero offset
+   * @param[in,out] output        Pointer to output tensor
+   * @param[in]     output_x  Width of output tensor
+   * @param[in]     output_y  Height of output tensor
+   * @param[in]     output_activation_min   Minimum value to clamp the output to. Range : {0, 255}
+   * @param[in]     output_activation_max   Maximum value to clamp the output to. Range : {0, 255}
+   * @param[in]     out_shift  Amount of right-shift for output
+   * @param[in]     out_mult   Output multiplier for requantization
+   * @return        The function returns one of the following
+   *                <code>ARM_MATH_SIZE_MISMATCH</code> - Not supported dimension of tensors
+   *                <code>ARM_MATH_SUCCESS</code> - Successful operation
+   *                <code>ARM_MATH_ARGUMENT_ERROR</code> - Implementation not available
+   *
+   * <b> Input constraints</b>
+   * ch_mult  is multiple of 2
+   * kernel_x is multiple of 2
+   * NOTE(review): the brief mentions even input channels while this list constrains kernel_x - confirm which applies.
+   */
+    arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input,
+                                                const uint16_t input_x,
+                                                const uint16_t input_y,
+                                                const uint16_t input_ch,
+                                                const uint8_t *kernel,
+                                                const uint16_t kernel_x,
+                                                const uint16_t kernel_y,
+                                                const int16_t ch_mult,
+                                                const int16_t pad_x,
+                                                const int16_t pad_y,
+                                                const int16_t stride_x,
+                                                const int16_t stride_y,
+                                                const int16_t dilation_x,
+                                                const int16_t dilation_y,
+                                                const int32_t *bias,
+                                                const int32_t input_offset,
+                                                const int32_t filter_offset,
+                                                const int32_t output_offset,
+                                                uint8_t *output,
+                                                const uint16_t output_x,
+                                                const uint16_t output_y,
+                                                const int32_t output_activation_min,
+                                                const int32_t output_activation_max,
+                                                const int32_t out_shift,
+                                                const int32_t out_mult);
 #ifdef __cplusplus
 }
 #endif
--- a/NN/Include/arm_nnsupportfunctions.h
+++ b/NN/Include/arm_nnsupportfunctions.h
@ -32,13 +32,17 @@

 #include "arm_math.h"
 #include "arm_common_tables.h"
-//#include <cstring>

 #ifdef __cplusplus
 extern    "C"
 {
 #endif

+#define LEFT_SHIFT(_shift)  (_shift > 0 ? _shift : 0)   /* _shift when positive, otherwise 0. NOTE(review): argument is not parenthesized */
+#define RIGHT_SHIFT(_shift) (_shift > 0 ? 0 : -_shift)  /* -_shift when _shift <= 0, otherwise 0. NOTE(review): argument is not parenthesized */
+#define Q31_MIN (0x80000000L)   /* presumably the most negative q31_t bit pattern. NOTE(review): the type of this literal depends on the width of long */
+#define Q31_MAX (0x7FFFFFFFL)   /* largest positive 32-bit signed value */
+
 /**
 * @brief Union for SIMD access of Q31/Q15/Q7 types
 */
@ -72,11 +76,11 @@ typedef enum
 */

 /**
- * @brief Converts the elements of the Q7 vector to Q15 vector without left-shift 
- * @param[in]       *pSrc points to the Q7 input vector    
- * @param[out]      *pDst points to the Q15 output vector   
- * @param[in]       blockSize length of the input vector    
- * @return none.    
+ * @brief Converts the elements of the Q7 vector to Q15 vector without left-shift
+ * @param[in]       *pSrc points to the Q7 input vector
+ * @param[out]      *pDst points to the Q15 output vector
+ * @param[in]       blockSize length of the input vector
+ * @return none.
 *
 */

@ -84,10 +88,10 @@ void      arm_q7_to_q15_no_shift(const q7_t * pSrc, q15_t * pDst, uint32_t block

 /**
 * @brief  Converts the elements of the Q7 vector to reordered Q15 vector without left-shift
- * @param[in]       *pSrc points to the Q7 input vector    
- * @param[out]      *pDst points to the Q15 output vector   
- * @param[in]       blockSize length of the input vector    
- * @return none.    
+ * @param[in]       *pSrc points to the Q7 input vector
+ * @param[out]      *pDst points to the Q15 output vector
+ * @param[in]       blockSize length of the input vector
+ * @return none.
 *
 */

@ -163,7 +167,7 @@ void arm_nn_mult_q15(
  q15_t * pDst,
  const uint16_t out_shift,
  uint32_t blockSize);
-  
+
 /**
 * @brief           Q7 vector multiplication with variable output shifts
 * @param[in]       *pSrcA        pointer to the first input vector
@ -185,16 +189,79 @@ void arm_nn_mult_q7(
  q7_t * pDst,
  const uint16_t out_shift,
  uint32_t blockSize);
- 
+
 /**
- * @brief defition to adding rouding offset
+ * @brief macro for adding rounding offset
 */
 #ifndef ARM_NN_TRUNCATE
-    #define NN_ROUND(out_shift) ( 0x1 << (out_shift - 1) )
+    #define NN_ROUND(out_shift) ( (0x1u << out_shift) >> 1 )
 #else
    #define NN_ROUND(out_shift) 0
 #endif

+/**
+ * @brief           Saturating doubling high multiply. Documented to match the
+ *                  NEON instruction VQRDMULH.
+ * @param[in]       m1        Multiplicand
+ * @param[in]       m2        Multiplier
+ * @return          (m1 * m2 + rounding offset) / 2^31, or Q31_MAX when m1 and m2 both equal Q31_MIN.
+ *                  NOTE(review): the divisor `1UL << 31` looks written for a 32-bit unsigned long - confirm on LP64 hosts.
+ */
+__STATIC_FORCEINLINE q31_t arm_nn_sat_doubling_high_mult(const q31_t m1, const q31_t m2)
+{
+    q31_t result = 0;
+    // Rounding offset: 2^30 is half of the 2^31 divisor applied below
+    q63_t mult = 1 << 30;
+
+    if ((m1 < 0) ^ (m2 < 0)) // exactly one operand is negative
+    {
+        mult = 1 - mult; // use the offset 1 - 2^30 instead
+    }
+    // Multiply in q63_t and add the offset (the original note says this resolves to an SMLAL instruction)
+    mult = mult + (q63_t)m1 * m2;
+
+    // Divide the accumulated value by 2^31, keeping the upper bits of the
+    // product; this is the doubling step as well.
+    result = mult / (1UL << 31);
+
+    if ((m1 == m2) && (m1 == Q31_MIN)) // both operands are Q31_MIN
+    {
+        result = Q31_MAX; // saturate
+    }
+    return result;
+}
+
+/**
+ * @brief           Rounding divide by power of two.
+ * @param[in]       dividend - Dividend
+ * @param[in]       exponent - Divisor = power(2, exponent)
+ *                             Range: [0, 31]
+ * @return          Rounded result of division. Midpoint is rounded away from zero.
+ *                  NOTE(review): exponent == 31 makes `1l << exponent` overflow where long is 32 bits - confirm the upper bound.
+ */
+__STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two(const q31_t dividend, const q31_t exponent)
+{
+    q31_t result = 0;
+    const q31_t remainder_mask = (1l << exponent) - 1; // low 'exponent' bits set
+    int32_t remainder = remainder_mask & dividend;     // bits that the shift below discards
+
+    // Shift-based division. NOTE(review): assumes >> on a negative dividend is an arithmetic shift
+    result = dividend >> exponent;
+
+    // Bump 'result' when the discarded bits exceed the threshold (mid point away from zero)
+    q31_t threshold = remainder_mask >> 1;
+    if (result < 0)
+    {
+        threshold++; // for a negative quotient an exact midpoint is not bumped
+    }
+    if (remainder > threshold)
+    {
+        result++;
+    }
+
+    return result;
+}
+
 #ifdef __cplusplus
 }
 #endif