Official ARM version: v5.6.0

This commit is contained in:
rihab kouki 2020-07-28 11:24:49 +01:00
parent 9f95ff5b6b
commit 96d6da4e25
2939 changed files with 339304 additions and 113320 deletions

View file

@ -53,7 +53,4 @@ extern const q15_t tanhTable_q15[256];
extern const q15_t sigmoidHTable_q15[192];
extern const q15_t sigmoidLTable_q15[128];
extern const q15_t sigmoidLTable_q15[128];
extern const q15_t sigmoidHTable_q15[192];
#endif /* ARM_NN_TABLES_H */

View file

@ -34,7 +34,7 @@
* ------------
*
* This user manual describes the CMSIS NN software library,
* a collection of efficient neural network kernels developed to maximize the
* a collection of efficient neural network kernels developed to maximize the
* performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
*
* The library is divided into a number of functions each covering a specific category:
@ -47,8 +47,8 @@
*
* The library has separate functions for operating on different weight and activation data
* types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
* kernels are included in the function description. The implementation details are also
* described in this paper [1].
* kernels are included in the function description. The implementation details are also
* described in this paper [1].
*
* Block Diagram
* --------
@ -86,7 +86,7 @@
/**
* @defgroup groupNN Neural Network Functions
* These functions perform basic operations for neural network layers.
* These functions perform basic operations for neural network layers.
*/
#ifndef _ARM_NNFUNCTIONS_H
@ -111,12 +111,12 @@ extern "C"
*
* The convolution is implemented in 2 steps: im2col and GEMM
*
* im2col is a process of converting each patch of image data into
* im2col is a process of converting each patch of image data into
* a column. After im2col, the convolution is computed as matrix-matrix
* multiplication.
*
*
* To reduce the memory footprint, the im2col is performed partially.
* Each iteration, only a few column (i.e., patches) are generated and
* Each iteration, only a few column (i.e., patches) are generated and
* computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions.
*
*/
@ -136,9 +136,9 @@ extern "C"
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns <code>ARM_MATH_SUCCESS</code>
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
@ -153,9 +153,9 @@ extern "C"
const q7_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t * Im_out,
const uint16_t dim_im_out,
q15_t * bufferA,
q7_t * Im_out,
const uint16_t dim_im_out,
q15_t * bufferA,
q7_t * bufferB);
/**
@ -180,7 +180,7 @@ extern "C"
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns <code>ARM_MATH_SUCCESS</code>
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*/
arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
@ -219,9 +219,9 @@ extern "C"
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns <code>ARM_MATH_SUCCESS</code>
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
@ -236,9 +236,9 @@ extern "C"
const q15_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q15_t * Im_out,
const uint16_t dim_im_out,
q15_t * bufferA,
q15_t * Im_out,
const uint16_t dim_im_out,
q15_t * bufferA,
q7_t * bufferB);
/**
@ -256,7 +256,7 @@ extern "C"
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@ -278,9 +278,9 @@ extern "C"
const q7_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t * Im_out,
const uint16_t dim_im_out,
q15_t * bufferA,
q7_t * Im_out,
const uint16_t dim_im_out,
q15_t * bufferA,
q7_t * bufferB);
/**
@ -303,7 +303,7 @@ extern "C"
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@ -355,7 +355,7 @@ extern "C"
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@ -405,7 +405,7 @@ extern "C"
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@ -426,9 +426,9 @@ extern "C"
const q7_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t * Im_out,
const uint16_t dim_im_out,
q15_t * bufferA,
q7_t * Im_out,
const uint16_t dim_im_out,
q15_t * bufferA,
q7_t * bufferB);
/**
@ -446,7 +446,7 @@ extern "C"
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@ -468,9 +468,9 @@ extern "C"
const q15_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q15_t * Im_out,
const uint16_t dim_im_out,
q15_t * bufferA,
q15_t * Im_out,
const uint16_t dim_im_out,
q15_t * bufferA,
q7_t * bufferB);
/**
@ -493,7 +493,7 @@ extern "C"
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@ -508,7 +508,7 @@ extern "C"
*
* <b>Input dimension constraints:</b>
*
* ch_im_in is multiple of 2
* ch_im_in is multiple of 2
*
* ch_im_out is multiple of 2
*
@ -532,10 +532,10 @@ extern "C"
const uint16_t out_shift,
q15_t * Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t * bufferA,
const uint16_t dim_im_out_y,
q15_t * bufferA,
q7_t * bufferB);
/**
* @brief Q7 depthwise separable convolution function
* @param[in] Im_in pointer to input tensor
@ -551,7 +551,7 @@ extern "C"
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@ -574,8 +574,8 @@ extern "C"
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t * Im_out,
const uint16_t dim_im_out,
q15_t * bufferA,
const uint16_t dim_im_out,
q15_t * bufferA,
q7_t * bufferB);
/**
@ -598,7 +598,7 @@ extern "C"
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@ -642,7 +642,7 @@ extern "C"
*
* Here we have two types of kernel functions. The basic function
* implements the function using regular GEMV approach. The opt functions
* operates with weights in interleaved formats.
* operates with weights in interleaved formats.
*
*/
@ -666,9 +666,9 @@ extern "C"
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t * bias,
q7_t * pOut,
const uint16_t out_shift,
const q7_t * bias,
q7_t * pOut,
q15_t * vec_buffer);
/**
@ -691,9 +691,9 @@ extern "C"
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t * bias,
q7_t * pOut,
const uint16_t out_shift,
const q7_t * bias,
q7_t * pOut,
q15_t * vec_buffer);
/**
@ -716,9 +716,9 @@ extern "C"
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q15_t * bias,
q15_t * pOut,
const uint16_t out_shift,
const q15_t * bias,
q15_t * pOut,
q15_t * vec_buffer);
/**
@ -742,8 +742,8 @@ extern "C"
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q15_t * bias,
q15_t * pOut,
const q15_t * bias,
q15_t * pOut,
q15_t * vec_buffer);
/**
@ -767,8 +767,8 @@ extern "C"
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t * bias,
q15_t * pOut,
const q7_t * bias,
q15_t * pOut,
q15_t * vec_buffer);
/**
@ -792,16 +792,16 @@ extern "C"
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t * bias,
q15_t * pOut,
const q7_t * bias,
q15_t * pOut,
q15_t * vec_buffer);
/**
* @brief Matrix-Multiplication Kernels for Convolution
*
* These functions are used within convolution layer functions for
* These functions are used within convolution layer functions for
* matrix multiplication.
*
*
* The implementation is similar to CMSIS-DSP arm_mat_mult functions
* with one Q7 and one Q15 operands. The Q15 operand is the im2col
* output which is always with 2 columns.
@ -826,8 +826,8 @@ extern "C"
const uint16_t ch_im_out,
const uint16_t numCol_A,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t * bias,
const uint16_t out_shift,
const q7_t * bias,
q7_t * pOut);
/**
@ -848,8 +848,8 @@ extern "C"
const uint16_t ch_im_out,
const uint16_t numCol_A,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t * bias,
const uint16_t out_shift,
const q7_t * bias,
q7_t * pOut);
#ifdef __cplusplus
@ -902,7 +902,7 @@ extern "C"
* @return none.
*/
void arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width,
void arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width,
arm_nn_activation_type type);
/**
@ -944,9 +944,9 @@ extern "C"
const uint16_t ch_im_in,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const uint16_t dim_im_out,
q7_t * bufferA,
const uint16_t stride,
const uint16_t dim_im_out,
q7_t * bufferA,
q7_t * Im_out);
/**
@ -969,9 +969,9 @@ extern "C"
const uint16_t ch_im_in,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const uint16_t dim_im_out,
q7_t * bufferA,
const uint16_t stride,
const uint16_t dim_im_out,
q7_t * bufferA,
q7_t * Im_out);
/**
@ -1003,6 +1003,71 @@ extern "C"
void arm_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);
/**
* @brief uint8 depthwise convolution function with asymmetric quantization for even number of channel multiplier
* and input channels. Unless specified otherwise, arguments are mandatory.
*
* @param[in] input Pointer to input tensor
* @param[in] input_x Width of input tensor
* @param[in] input_y Height of input tensor
* @param[in] input_ch Channels in input tensor
* @param[in] kernel Pointer to kernel weights
* @param[in] kernel_x Width of kernel
* @param[in] kernel_y Height of kernel
* @param[in] ch_mult Number of channel multiplier
* @param[in] pad_x Padding sizes x
* @param[in] pad_y Padding sizes y
* @param[in] stride_x Convolution stride along the width
* @param[in] stride_y Convolution stride along the height
* @param[in] dilation_x Dilation along width. Not used and intended for future enhancement.
* @param[in] dilation_y Dilation along height. Not used and intended for future enhancement.
* @param[in] bias Pointer to optional bias values. If no bias is
* available, NULL is expected
* @param[in] input_offset Input tensor zero offset
* @param[in] filter_offset Kernel tensor zero offset
* @param[in] output_offset Output tensor zero offset
* @param[in,out] output Pointer to output tensor
* @param[in] output_x Width of output tensor
* @param[in] output_y Height of output tensor
* @param[in] output_activation_min Minimum value to clamp the output to. Range : {0, 255}
* @param[in] output_activation_max Maximum value to clamp the output to. Range : {0, 255}
* @param[in] out_shift Amount of right-shift for output
* @param[in] out_mult Output multiplier for requantization
* @return The function returns one of the following
* <code>ARM_MATH_SIZE_MISMATCH</code> - Not supported dimension of tensors
* <code>ARM_MATH_SUCCESS</code> - Successful operation
* <code>ARM_MATH_ARGUMENT_ERROR</code> - Implementation not available
*
* <b> Input constraints</b>
* ch_mult is multiple of 2
* kernel_x is multiple of 2
*
*/
arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_ch,
const uint8_t *kernel,
const uint16_t kernel_x,
const uint16_t kernel_y,
const int16_t ch_mult,
const int16_t pad_x,
const int16_t pad_y,
const int16_t stride_x,
const int16_t stride_y,
const int16_t dilation_x,
const int16_t dilation_y,
const int32_t *bias,
const int32_t input_offset,
const int32_t filter_offset,
const int32_t output_offset,
uint8_t *output,
const uint16_t output_x,
const uint16_t output_y,
const int32_t output_activation_min,
const int32_t output_activation_max,
const int32_t out_shift,
const int32_t out_mult);
#ifdef __cplusplus
}
#endif

View file

@ -32,13 +32,17 @@
#include "arm_math.h"
#include "arm_common_tables.h"
//#include <cstring>
#ifdef __cplusplus
extern "C"
{
#endif
/*
 * Split a signed shift amount into its two directions:
 *   LEFT_SHIFT  yields the amount itself when it is positive, otherwise 0;
 *   RIGHT_SHIFT yields the negated amount when it is zero or negative, otherwise 0.
 * The argument is fully parenthesized so that expressions such as `a - b` or
 * `x & y` expand correctly. It is evaluated more than once, so it must be free
 * of side effects.
 */
#define LEFT_SHIFT(_shift) ((_shift) > 0 ? (_shift) : 0)
#define RIGHT_SHIFT(_shift) ((_shift) > 0 ? 0 : -(_shift))
/*
 * Limits of the signed 32-bit (Q31) value range.
 * Q31_MIN is derived from Q31_MAX instead of being written as 0x80000000L:
 * that hexadecimal literal is never negative (it is an unsigned long where
 * long is 32 bits wide, and the positive value 2147483648 where long is
 * 64 bits wide), which breaks signed comparisons against it.
 */
#define Q31_MAX (0x7FFFFFFFL)
#define Q31_MIN (-Q31_MAX - 1)
/**
* @brief Union for SIMD access of Q31/Q15/Q7 types
*/
@ -72,11 +76,11 @@ typedef enum
*/
/**
* @brief Converts the elements of the Q7 vector to Q15 vector without left-shift
* @param[in] *pSrc points to the Q7 input vector
* @param[out] *pDst points to the Q15 output vector
* @param[in] blockSize length of the input vector
* @return none.
* @brief Converts the elements of the Q7 vector to Q15 vector without left-shift
* @param[in] *pSrc points to the Q7 input vector
* @param[out] *pDst points to the Q15 output vector
* @param[in] blockSize length of the input vector
* @return none.
*
*/
@ -84,10 +88,10 @@ void arm_q7_to_q15_no_shift(const q7_t * pSrc, q15_t * pDst, uint32_t block
/**
* @brief Converts the elements of the Q7 vector to reordered Q15 vector without left-shift
* @param[in] *pSrc points to the Q7 input vector
* @param[out] *pDst points to the Q15 output vector
* @param[in] blockSize length of the input vector
* @return none.
* @param[in] *pSrc points to the Q7 input vector
* @param[out] *pDst points to the Q15 output vector
* @param[in] blockSize length of the input vector
* @return none.
*
*/
@ -163,7 +167,7 @@ void arm_nn_mult_q15(
q15_t * pDst,
const uint16_t out_shift,
uint32_t blockSize);
/**
* @brief Q7 vector multiplication with variable output shifts
* @param[in] *pSrcA pointer to the first input vector
@ -185,16 +189,79 @@ void arm_nn_mult_q7(
q7_t * pDst,
const uint16_t out_shift,
uint32_t blockSize);
/**
 * @brief macro for adding rounding offset
 *
 * Expands to half of (1 << out_shift), i.e. the value to add before a right
 * shift by out_shift so that the shift rounds instead of truncating. The
 * result is 0 when out_shift is 0. When ARM_NN_TRUNCATE is defined the
 * offset is always 0.
 */
#ifndef ARM_NN_TRUNCATE
#define NN_ROUND(out_shift) ((0x1u << (out_shift)) >> 1)
#else
#define NN_ROUND(out_shift) 0
#endif
/**
 * @brief Saturating doubling high multiply. Result matches
 * NEON instruction VQRDMULH.
 * @param[in] m1 Multiplicand
 * @param[in] m2 Multiplier
 * @return (m1 * m2) divided by 2^31 after adding a rounding offset;
 * Q31_MAX when both operands are the most negative 32-bit value.
 *
 */
__STATIC_FORCEINLINE q31_t arm_nn_sat_doubling_high_mult(const q31_t m1, const q31_t m2)
{
    // The only overflowing case: (-2^31) * (-2^31) doubled does not fit in 32 bits.
    // The cast keeps the comparison signed regardless of how Q31_MIN is spelled.
    if ((m1 == m2) && (m1 == (q31_t)Q31_MIN))
    {
        return (q31_t)Q31_MAX;
    }

    // Rounding offset to add for a right shift of 31
    q63_t mult = 1 << 30;

    // Negative products get a negative offset because the division below
    // truncates toward zero
    if ((m1 < 0) ^ (m2 < 0))
    {
        mult = 1 - mult;
    }

    // Widen before multiplying so the product is computed in 64 bits
    mult = mult + (q63_t)m1 * m2;

    // Utilize all of the upper 32 bits. This is the doubling step as well.
    // The divisor is a signed 64-bit constant: with an unsigned long divisor
    // the division would be performed unsigned wherever unsigned long is
    // 64 bits wide, corrupting negative results.
    return (q31_t)(mult / (1LL << 31));
}
/**
 * @brief Rounding divide by power of two.
 * @param[in] dividend - Dividend
 * @param[in] exponent - Divisor = power(2, exponent)
 * Range: [0, 31]
 * @return Rounded result of division. Midpoint is rounded away from zero.
 *
 */
__STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two(const q31_t dividend, const q31_t exponent)
{
    // Mask of the low 'exponent' bits. Built with unsigned arithmetic so that
    // exponent == 31 does not shift a 1 into the sign bit of a signed type.
    const q31_t remainder_mask = (q31_t)(((uint32_t)1 << exponent) - 1);
    const int32_t remainder = remainder_mask & dividend;

    // Basic division
    q31_t result = dividend >> exponent;

    // Adjust 'result' for rounding (mid point away from zero): a negative
    // quotient needs a remainder strictly above the midpoint to move up.
    q31_t threshold = remainder_mask >> 1;
    if (result < 0)
    {
        threshold++;
    }
    if (remainder > threshold)
    {
        result++;
    }

    return result;
}
#ifdef __cplusplus
}
#endif