- *
- * The functions support in-place computation allowing the source and
- * destination pointers to reference the same memory buffer.
- * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ @defgroup BasicAbs Vector Absolute Value
+
+ Computes the absolute value of a vector on an element-by-element basis.
+
+
+ pDst[n] = abs(pSrc[n]), 0 <= n < blockSize.
+
+
+ The functions support in-place computation allowing the source and
+ destination pointers to reference the same memory buffer.
+ There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
- * @addtogroup BasicAbs
- * @{
+ @addtogroup BasicAbs
+ @{
*/
/**
- * @brief Floating-point vector absolute value.
- * @param[in] *pSrc points to the input buffer
- * @param[out] *pDst points to the output buffer
- * @param[in] blockSize number of samples in each vector
- * @return none.
+ @brief Floating-point vector absolute value.
+ @param[in] pSrc points to the input vector
+ @param[out] pDst points to the output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
*/
void arm_abs_f32(
- float32_t * pSrc,
- float32_t * pDst,
- uint32_t blockSize)
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
-#if defined (ARM_MATH_DSP)
+#if defined(ARM_MATH_NEON)
+ float32x4_t vec1;
+ float32x4_t res;
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- float32_t in1, in2, in3, in4; /* temporary variables */
+ /* Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
- /*loop Unrolling */
+ while (blkCnt > 0U)
+ {
+ /* C = |A| */
+
+ /* Calculate absolute values and then store the results in the destination buffer. */
+ vec1 = vld1q_f32(pSrc);
+ res = vabsq_f32(vec1);
+ vst1q_f32(pDst, res);
+
+ /* Increment pointers */
+ pSrc += 4;
+ pDst += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Tail */
+ blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = |A| */
- /* Calculate absolute and then store the results in the destination buffer. */
- /* read sample from source */
- in1 = *pSrc;
- in2 = *(pSrc + 1);
- in3 = *(pSrc + 2);
- /* find absolute value */
- in1 = fabsf(in1);
+ /* Calculate absolute and store result in destination buffer. */
+ *pDst++ = fabsf(*pSrc++);
- /* read sample from source */
- in4 = *(pSrc + 3);
+ *pDst++ = fabsf(*pSrc++);
- /* find absolute value */
- in2 = fabsf(in2);
+ *pDst++ = fabsf(*pSrc++);
- /* read sample from source */
- *pDst = in1;
+ *pDst++ = fabsf(*pSrc++);
- /* find absolute value */
- in3 = fabsf(in3);
-
- /* find absolute value */
- in4 = fabsf(in4);
-
- /* store result to destination */
- *(pDst + 1) = in2;
-
- /* store result to destination */
- *(pDst + 2) = in3;
-
- /* store result to destination */
- *(pDst + 3) = in4;
-
-
- /* Update source pointer to process next sampels */
- pSrc += 4U;
-
- /* Update destination pointer to process next sampels */
- pDst += 4U;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = |A| */
- /* Calculate absolute and then store the results in the destination buffer. */
+
+ /* Calculate absolute and store result in destination buffer. */
*pDst++ = fabsf(*pSrc++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
+
}
/**
- * @} end of BasicAbs group
+ @} end of BasicAbs group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_abs_q15.c b/DSP/Source/BasicMathFunctions/arm_abs_q15.c
index ec47fff..eb944ce 100644
--- a/DSP/Source/BasicMathFunctions/arm_abs_q15.c
+++ b/DSP/Source/BasicMathFunctions/arm_abs_q15.c
@@ -3,13 +3,13 @@
* Title: arm_abs_q15.c
* Description: Q15 vector absolute value
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,139 +29,104 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @addtogroup BasicAbs
- * @{
+ @addtogroup BasicAbs
+ @{
*/
/**
- * @brief Q15 vector absolute value.
- * @param[in] *pSrc points to the input buffer
- * @param[out] *pDst points to the output buffer
- * @param[in] blockSize number of samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
+ @brief Q15 vector absolute value.
+ @param[in] pSrc points to the input vector
+ @param[out] pDst points to the output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
*/
void arm_abs_q15(
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t blockSize)
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ q15_t in; /* Temporary input variable */
-#if defined (ARM_MATH_DSP)
- __SIMD32_TYPE *simd;
+#if defined (ARM_MATH_LOOPUNROLL)
-/* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q15_t in1; /* Input value1 */
- q15_t in2; /* Input value2 */
-
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- simd = __SIMD32_CONST(pDst);
while (blkCnt > 0U)
{
/* C = |A| */
- /* Read two inputs */
- in1 = *pSrc++;
- in2 = *pSrc++;
-
-
- /* Store the Absolute result in the destination buffer by packing the two values, in a single cycle */
-#ifndef ARM_MATH_BIG_ENDIAN
- *simd++ =
- __PKHBT(((in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1)),
- ((in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2)), 16);
+ /* Calculate absolute of input (if -1 then saturated to 0x7fff) and store result in destination buffer. */
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
+ *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif
-
- *simd++ =
- __PKHBT(((in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2)),
- ((in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1)), 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- in1 = *pSrc++;
- in2 = *pSrc++;
-
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- *simd++ =
- __PKHBT(((in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1)),
- ((in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2)), 16);
-
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
+ *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
+#else
+ *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif
- *simd++ =
- __PKHBT(((in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2)),
- ((in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1)), 16);
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
+#else
+ *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- pDst = (q15_t *)simd;
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
- while (blkCnt > 0U)
- {
- /* C = |A| */
- /* Read the input */
- in1 = *pSrc++;
-
- /* Calculate absolute value of input and then store the result in the destination buffer. */
- *pDst++ = (in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
-
- q15_t in; /* Temporary input variable */
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* C = |A| */
- /* Read the input */
+
+ /* Calculate absolute of input (if -1 then saturated to 0x7fff) and store result in destination buffer. */
in = *pSrc++;
-
- /* Calculate absolute value of input and then store the result in the destination buffer. */
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
+#else
*pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of BasicAbs group
+ @} end of BasicAbs group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_abs_q31.c b/DSP/Source/BasicMathFunctions/arm_abs_q31.c
index 2733f51..bf7608b 100644
--- a/DSP/Source/BasicMathFunctions/arm_abs_q31.c
+++ b/DSP/Source/BasicMathFunctions/arm_abs_q31.c
@@ -3,13 +3,13 @@
* Title: arm_abs_q31.c
* Description: Q31 vector absolute value
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,90 +29,104 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @addtogroup BasicAbs
- * @{
+ @addtogroup BasicAbs
+ @{
*/
-
/**
- * @brief Q31 vector absolute value.
- * @param[in] *pSrc points to the input buffer
- * @param[out] *pDst points to the output buffer
- * @param[in] blockSize number of samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
+ @brief Q31 vector absolute value.
+ @param[in] pSrc points to the input vector
+ @param[out] pDst points to the output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
*/
void arm_abs_q31(
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
- q31_t in; /* Input value */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t in; /* Temporary variable */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- q31_t in1, in2, in3, in4;
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = |A| */
- /* Calculate absolute of input (if -1 then saturated to 0x7fffffff) and then store the results in the destination buffer. */
- in1 = *pSrc++;
- in2 = *pSrc++;
- in3 = *pSrc++;
- in4 = *pSrc++;
- *pDst++ = (in1 > 0) ? in1 : (q31_t)__QSUB(0, in1);
- *pDst++ = (in2 > 0) ? in2 : (q31_t)__QSUB(0, in2);
- *pDst++ = (in3 > 0) ? in3 : (q31_t)__QSUB(0, in3);
- *pDst++ = (in4 > 0) ? in4 : (q31_t)__QSUB(0, in4);
+ /* Calculate absolute of input (if -1 then saturated to 0x7fffffff) and store result in destination buffer. */
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+ *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
- /* Decrement the loop counter */
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+ *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+ *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+ *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = |A| */
- /* Calculate absolute value of the input (if -1 then saturated to 0x7fffffff) and then store the results in the destination buffer. */
- in = *pSrc++;
- *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
- /* Decrement the loop counter */
+ /* Calculate absolute of input (if -1 then saturated to 0x7fffffff) and store result in destination buffer. */
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+ *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
}
/**
- * @} end of BasicAbs group
+ @} end of BasicAbs group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_abs_q7.c b/DSP/Source/BasicMathFunctions/arm_abs_q7.c
index d0acbfc..a6c4a6c 100644
--- a/DSP/Source/BasicMathFunctions/arm_abs_q7.c
+++ b/DSP/Source/BasicMathFunctions/arm_abs_q7.c
@@ -3,13 +3,13 @@
* Title: arm_abs_q7.c
* Description: Q7 vector absolute value
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,117 +29,106 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @addtogroup BasicAbs
- * @{
+ @addtogroup BasicAbs
+ @{
*/
/**
- * @brief Q7 vector absolute value.
- * @param[in] *pSrc points to the input buffer
- * @param[out] *pDst points to the output buffer
- * @param[in] blockSize number of samples in each vector
- * @return none.
- *
- * \par Conditions for optimum performance
- * Input and output buffers should be aligned by 32-bit
- *
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
+ @brief Q7 vector absolute value.
+ @param[in] pSrc points to the input vector
+ @param[out] pDst points to the output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Conditions for optimum performance
+ Input and output buffers should be aligned by 32-bit
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
*/
void arm_abs_q7(
- q7_t * pSrc,
- q7_t * pDst,
- uint32_t blockSize)
+ const q7_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
- q7_t in; /* Input value1 */
+ uint32_t blkCnt; /* Loop counter */
+ q7_t in; /* Temporary input variable */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- q31_t in1, in2, in3, in4; /* temporary input variables */
- q31_t out1, out2, out3, out4; /* temporary output variables */
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = |A| */
- /* Read inputs */
- in1 = (q31_t) * pSrc;
- in2 = (q31_t) * (pSrc + 1);
- in3 = (q31_t) * (pSrc + 2);
- /* find absolute value */
- out1 = (in1 > 0) ? in1 : (q31_t)__QSUB8(0, in1);
+ /* Calculate absolute of input (if -1 then saturated to 0x7f) and store result in destination buffer. */
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (in > 0) ? in : (q7_t)__QSUB(0, in);
+#else
+ *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif
- /* read input */
- in4 = (q31_t) * (pSrc + 3);
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (in > 0) ? in : (q7_t)__QSUB(0, in);
+#else
+ *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif
- /* find absolute value */
- out2 = (in2 > 0) ? in2 : (q31_t)__QSUB8(0, in2);
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (in > 0) ? in : (q7_t)__QSUB(0, in);
+#else
+ *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif
- /* store result to destination */
- *pDst = (q7_t) out1;
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (in > 0) ? in : (q7_t)__QSUB(0, in);
+#else
+ *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif
- /* find absolute value */
- out3 = (in3 > 0) ? in3 : (q31_t)__QSUB8(0, in3);
-
- /* find absolute value */
- out4 = (in4 > 0) ? in4 : (q31_t)__QSUB8(0, in4);
-
- /* store result to destination */
- *(pDst + 1) = (q7_t) out2;
-
- /* store result to destination */
- *(pDst + 2) = (q7_t) out3;
-
- /* store result to destination */
- *(pDst + 3) = (q7_t) out4;
-
- /* update pointers to process next samples */
- pSrc += 4U;
- pDst += 4U;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
+
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #define ARM_MATH_CM0_FAMILY */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = |A| */
- /* Read the input */
+
+ /* Calculate absolute of input (if -1 then saturated to 0x7f) and store result in destination buffer. */
in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (in > 0) ? in : (q7_t) __QSUB(0, in);
+#else
+ *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif
- /* Store the Absolute result in the destination buffer */
- *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? 0x7f : -in);
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
+
}
/**
- * @} end of BasicAbs group
+ @} end of BasicAbs group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_add_f32.c b/DSP/Source/BasicMathFunctions/arm_add_f32.c
index 78feb64..1c66a24 100644
--- a/DSP/Source/BasicMathFunctions/arm_add_f32.c
+++ b/DSP/Source/BasicMathFunctions/arm_add_f32.c
@@ -3,13 +3,13 @@
* Title: arm_add_f32.c
* Description: Floating-point vector addition
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,110 +29,117 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @defgroup BasicAdd Vector Addition
- *
- * Element-by-element addition of two vectors.
- *
- *
- *
- * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ @defgroup BasicAdd Vector Addition
+
+ Element-by-element addition of two vectors.
+
+
- *
- * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ @defgroup BasicDotProd Vector Dot Product
+
+ Computes the dot product of two vectors.
+ The vectors are multiplied element-by-element and then summed.
+
+
+
+ There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
- * @addtogroup dot_prod
- * @{
+ @addtogroup BasicDotProd
+ @{
*/
/**
- * @brief Dot product of floating-point vectors.
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[in] blockSize number of samples in each vector
- * @param[out] *result output result returned here
- * @return none.
+ @brief Dot product of floating-point vectors.
+ @param[in] pSrcA points to the first input vector.
+ @param[in] pSrcB points to the second input vector.
+ @param[in] blockSize number of samples in each vector.
+ @param[out] result output result returned here.
+ @return none
*/
-
void arm_dot_prod_f32(
- float32_t * pSrcA,
- float32_t * pSrcB,
- uint32_t blockSize,
- float32_t * result)
+ const float32_t * pSrcA,
+ const float32_t * pSrcB,
+ uint32_t blockSize,
+ float32_t * result)
{
- float32_t sum = 0.0f; /* Temporary result storage */
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ float32_t sum = 0.0f; /* Temporary return variable */
+#if defined(ARM_MATH_NEON)
+ float32x4_t vec1;
+ float32x4_t vec2;
+ float32x4_t res;
+ float32x4_t accum = vdupq_n_f32(0);
-#if defined (ARM_MATH_DSP)
+ /* Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
-/* Run the below code for Cortex-M4 and Cortex-M3 */
- /*loop Unrolling */
+ vec1 = vld1q_f32(pSrcA);
+ vec2 = vld1q_f32(pSrcB);
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */
+ /* Calculate dot product and then store the result in a temporary buffer. */
+
+ accum = vmlaq_f32(accum, vec1, vec2);
+
+ /* Increment pointers */
+ pSrcA += 4;
+ pSrcB += 4;
+
+ vec1 = vld1q_f32(pSrcA);
+ vec2 = vld1q_f32(pSrcB);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+#if __aarch64__
+ sum = vpadds_f32(vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)));
+#else
+ sum = (vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)))[0] + (vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)))[1];
+#endif
+
+ /* Tail */
+ blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
- /* Calculate dot product and then store the result in a temporary buffer */
- sum += (*pSrcA++) * (*pSrcB++);
- sum += (*pSrcA++) * (*pSrcB++);
- sum += (*pSrcA++) * (*pSrcB++);
+
+ /* Calculate dot product and store result in a temporary buffer. */
sum += (*pSrcA++) * (*pSrcB++);
- /* Decrement the loop counter */
+ sum += (*pSrcA++) * (*pSrcB++);
+
+ sum += (*pSrcA++) * (*pSrcB++);
+
+ sum += (*pSrcA++) * (*pSrcB++);
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
-
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
- /* Calculate dot product and then store the result in a temporary buffer. */
+
+ /* Calculate dot product and store result in a temporary buffer. */
sum += (*pSrcA++) * (*pSrcB++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Store the result back in the destination buffer */
+
+ /* Store result in destination buffer */
*result = sum;
}
/**
- * @} end of dot_prod group
+ @} end of BasicDotProd group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_dot_prod_q15.c b/DSP/Source/BasicMathFunctions/arm_dot_prod_q15.c
index dec4ec5..e303b09 100644
--- a/DSP/Source/BasicMathFunctions/arm_dot_prod_q15.c
+++ b/DSP/Source/BasicMathFunctions/arm_dot_prod_q15.c
@@ -3,13 +3,13 @@
* Title: arm_dot_prod_q15.c
* Description: Q15 dot product
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,100 +29,92 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @addtogroup dot_prod
- * @{
+ @addtogroup BasicDotProd
+ @{
*/
/**
- * @brief Dot product of Q15 vectors.
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[in] blockSize number of samples in each vector
- * @param[out] *result output result returned here
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The intermediate multiplications are in 1.15 x 1.15 = 2.30 format and these
- * results are added to a 64-bit accumulator in 34.30 format.
- * Nonsaturating additions are used and given that there are 33 guard bits in the accumulator
- * there is no risk of overflow.
- * The return result is in 34.30 format.
+ @brief Dot product of Q15 vectors.
+ @param[in] pSrcA points to the first input vector
+ @param[in] pSrcB points to the second input vector
+ @param[in] blockSize number of samples in each vector
+ @param[out] result output result returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The intermediate multiplications are in 1.15 x 1.15 = 2.30 format and these
+ results are added to a 64-bit accumulator in 34.30 format.
+ Nonsaturating additions are used and given that there are 33 guard bits in the accumulator
+ there is no risk of overflow.
+ The return result is in 34.30 format.
*/
void arm_dot_prod_q15(
- q15_t * pSrcA,
- q15_t * pSrcB,
- uint32_t blockSize,
- q63_t * result)
+ const q15_t * pSrcA,
+ const q15_t * pSrcB,
+ uint32_t blockSize,
+ q63_t * result)
{
- q63_t sum = 0; /* Temporary result storage */
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Temporary return variable */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
-/* Run the below code for Cortex-M4 and Cortex-M3 */
-
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
- /* Calculate dot product and then store the result in a temporary buffer. */
- sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);
- sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);
- /* Decrement the loop counter */
+#if defined (ARM_MATH_DSP)
+ /* Calculate dot product and store result in a temporary buffer. */
+ sum = __SMLALD(read_q15x2_ia ((q15_t **) &pSrcA), read_q15x2_ia ((q15_t **) &pSrcB), sum);
+ sum = __SMLALD(read_q15x2_ia ((q15_t **) &pSrcA), read_q15x2_ia ((q15_t **) &pSrcB), sum);
+#else
+ sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
+ sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
+ sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
+ sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
- while (blkCnt > 0U)
- {
- /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
- /* Calculate dot product and then store the results in a temporary buffer. */
- sum = __SMLALD(*pSrcA++, *pSrcB++, sum);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
-
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
- /* Calculate dot product and then store the results in a temporary buffer. */
- sum += (q63_t) ((q31_t) * pSrcA++ * *pSrcB++);
- /* Decrement the loop counter */
+ /* Calculate dot product and store result in a temporary buffer. */
+//#if defined (ARM_MATH_DSP)
+// sum = __SMLALD(*pSrcA++, *pSrcB++, sum);
+//#else
+ sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
+//#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
- /* Store the result in the destination buffer in 34.30 format */
+ /* Store result in destination buffer in 34.30 format */
*result = sum;
-
}
/**
- * @} end of dot_prod group
+ @} end of BasicDotProd group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_dot_prod_q31.c b/DSP/Source/BasicMathFunctions/arm_dot_prod_q31.c
index 67ae887..76cd577 100644
--- a/DSP/Source/BasicMathFunctions/arm_dot_prod_q31.c
+++ b/DSP/Source/BasicMathFunctions/arm_dot_prod_q31.c
@@ -3,13 +3,13 @@
* Title: arm_dot_prod_q31.c
* Description: Q31 dot product
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,103 +29,87 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @addtogroup dot_prod
- * @{
+ @addtogroup BasicDotProd
+ @{
*/
/**
- * @brief Dot product of Q31 vectors.
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[in] blockSize number of samples in each vector
- * @param[out] *result output result returned here
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
- * are truncated to 2.48 format by discarding the lower 14 bits.
- * The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
- * There are 15 guard bits in the accumulator and there is no risk of overflow as long as
- * the length of the vectors is less than 2^16 elements.
- * The return result is in 16.48 format.
+ @brief Dot product of Q31 vectors.
+ @param[in] pSrcA points to the first input vector.
+ @param[in] pSrcB points to the second input vector.
+ @param[in] blockSize number of samples in each vector.
+ @param[out] result output result returned here.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
+ are truncated to 2.48 format by discarding the lower 14 bits.
+ The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
+ There are 15 guard bits in the accumulator and there is no risk of overflow as long as
+ the length of the vectors is less than 2^16 elements.
+ The return result is in 16.48 format.
*/
void arm_dot_prod_q31(
- q31_t * pSrcA,
- q31_t * pSrcB,
- uint32_t blockSize,
- q63_t * result)
+ const q31_t * pSrcA,
+ const q31_t * pSrcB,
+ uint32_t blockSize,
+ q63_t * result)
{
- q63_t sum = 0; /* Temporary result storage */
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Temporary return variable */
+#if defined (ARM_MATH_LOOPUNROLL)
-#if defined (ARM_MATH_DSP)
-
-/* Run the below code for Cortex-M4 and Cortex-M3 */
- q31_t inA1, inA2, inA3, inA4;
- q31_t inB1, inB2, inB3, inB4;
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
- /* Calculate dot product and then store the result in a temporary buffer. */
- inA1 = *pSrcA++;
- inA2 = *pSrcA++;
- inA3 = *pSrcA++;
- inA4 = *pSrcA++;
- inB1 = *pSrcB++;
- inB2 = *pSrcB++;
- inB3 = *pSrcB++;
- inB4 = *pSrcB++;
- sum += ((q63_t) inA1 * inB1) >> 14U;
- sum += ((q63_t) inA2 * inB2) >> 14U;
- sum += ((q63_t) inA3 * inB3) >> 14U;
- sum += ((q63_t) inA4 * inB4) >> 14U;
+ /* Calculate dot product and store result in a temporary buffer. */
+ sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
- /* Decrement the loop counter */
+ sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
+
+ sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
+
+ sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
-
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
- /* Calculate dot product and then store the result in a temporary buffer. */
- sum += ((q63_t) * pSrcA++ * *pSrcB++) >> 14U;
- /* Decrement the loop counter */
+ /* Calculate dot product and store result in a temporary buffer. */
+ sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* Store the result in the destination buffer in 16.48 format */
+ /* Store result in destination buffer in 16.48 format */
*result = sum;
}
/**
- * @} end of dot_prod group
+ @} end of BasicDotProd group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_dot_prod_q7.c b/DSP/Source/BasicMathFunctions/arm_dot_prod_q7.c
index 487efe3..8e18a73 100644
--- a/DSP/Source/BasicMathFunctions/arm_dot_prod_q7.c
+++ b/DSP/Source/BasicMathFunctions/arm_dot_prod_q7.c
@@ -3,13 +3,13 @@
* Title: arm_dot_prod_q7.c
* Description: Q7 dot product
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,61 +29,58 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @addtogroup dot_prod
- * @{
+ @addtogroup BasicDotProd
+ @{
*/
/**
- * @brief Dot product of Q7 vectors.
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[in] blockSize number of samples in each vector
- * @param[out] *result output result returned here
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The intermediate multiplications are in 1.7 x 1.7 = 2.14 format and these
- * results are added to an accumulator in 18.14 format.
- * Nonsaturating additions are used and there is no danger of wrap around as long as
- * the vectors are less than 2^18 elements long.
- * The return result is in 18.14 format.
+ @brief Dot product of Q7 vectors.
+ @param[in] pSrcA points to the first input vector
+ @param[in] pSrcB points to the second input vector
+ @param[in] blockSize number of samples in each vector
+ @param[out] result output result returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The intermediate multiplications are in 1.7 x 1.7 = 2.14 format and these
+ results are added to an accumulator in 18.14 format.
+ Nonsaturating additions are used and there is no danger of wrap around as long as
+ the vectors are less than 2^18 elements long.
+ The return result is in 18.14 format.
*/
void arm_dot_prod_q7(
- q7_t * pSrcA,
- q7_t * pSrcB,
- uint32_t blockSize,
- q31_t * result)
+ const q7_t * pSrcA,
+ const q7_t * pSrcB,
+ uint32_t blockSize,
+ q31_t * result)
{
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t sum = 0; /* Temporary return variable */
- q31_t sum = 0; /* Temporary variables to store output */
+#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
+ q31_t input1, input2; /* Temporary variables */
+ q31_t inA1, inA2, inB1, inB2; /* Temporary variables */
+#endif
-/* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q31_t input1, input2; /* Temporary variables to store input */
- q31_t inA1, inA2, inB1, inB2; /* Temporary variables to store input */
-
-
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
+ /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+#if defined (ARM_MATH_DSP)
/* read 4 samples at a time from sourceA */
- input1 = *__SIMD32(pSrcA)++;
+ input1 = read_q7x4_ia ((q7_t **) &pSrcA);
/* read 4 samples at a time from sourceB */
- input2 = *__SIMD32(pSrcB)++;
+ input2 = read_q7x4_ia ((q7_t **) &pSrcB);
/* extract two q7_t samples to q15_t samples */
inA1 = __SXTB16(__ROR(input1, 8));
@@ -97,51 +94,46 @@ void arm_dot_prod_q7(
/* multiply and accumulate two samples at a time */
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
+#else
+ sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
+ sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
+ sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
+ sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
+#endif
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
- while (blkCnt > 0U)
- {
- /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
- /* Dot product and then store the results in a temporary buffer. */
- sum = __SMLAD(*pSrcA++, *pSrcB++, sum);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
-
-
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
- /* Dot product and then store the results in a temporary buffer. */
- sum += (q31_t) ((q15_t) * pSrcA++ * *pSrcB++);
- /* Decrement the loop counter */
+ /* Calculate dot product and store result in a temporary buffer. */
+//#if defined (ARM_MATH_DSP)
+// sum = __SMLAD(*pSrcA++, *pSrcB++, sum);
+//#else
+ sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
+//#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
-
- /* Store the result in the destination buffer in 18.14 format */
+ /* Store result in destination buffer in 18.14 format */
*result = sum;
}
/**
- * @} end of dot_prod group
+ @} end of BasicDotProd group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_mult_f32.c b/DSP/Source/BasicMathFunctions/arm_mult_f32.c
index e4a9ef2..53ad73c 100644
--- a/DSP/Source/BasicMathFunctions/arm_mult_f32.c
+++ b/DSP/Source/BasicMathFunctions/arm_mult_f32.c
@@ -3,13 +3,13 @@
* Title: arm_mult_f32.c
* Description: Floating-point vector multiplication
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,134 +29,120 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @defgroup BasicMult Vector Multiplication
- *
- * Element-by-element multiplication of two vectors.
- *
- *
- *
- * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ @defgroup BasicMult Vector Multiplication
+
+ Element-by-element multiplication of two vectors.
+
+
+
+ There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
- * @addtogroup BasicMult
- * @{
+ @addtogroup BasicMult
+ @{
*/
/**
- * @brief Floating-point vector multiplication.
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in each vector
- * @return none.
+ @brief Floating-point vector multiplication.
+ @param[in] pSrcA points to the first input vector.
+ @param[in] pSrcB points to the second input vector.
+ @param[out] pDst points to the output vector.
+ @param[in] blockSize number of samples in each vector.
+ @return none
*/
void arm_mult_f32(
- float32_t * pSrcA,
- float32_t * pSrcB,
- float32_t * pDst,
- uint32_t blockSize)
+ const float32_t * pSrcA,
+ const float32_t * pSrcB,
+ float32_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counters */
-#if defined (ARM_MATH_DSP)
+ uint32_t blkCnt; /* Loop counter */
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- float32_t inA1, inA2, inA3, inA4; /* temporary input variables */
- float32_t inB1, inB2, inB3, inB4; /* temporary input variables */
- float32_t out1, out2, out3, out4; /* temporary output variables */
+#if defined(ARM_MATH_NEON)
+ float32x4_t vec1;
+ float32x4_t vec2;
+ float32x4_t res;
- /* loop Unrolling */
+ /* Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * B */
+
+ /* Multiply the inputs and then store the results in the destination buffer. */
+ vec1 = vld1q_f32(pSrcA);
+ vec2 = vld1q_f32(pSrcB);
+ res = vmulq_f32(vec1, vec2);
+ vst1q_f32(pDst, res);
+
+ /* Increment pointers */
+ pSrcA += 4;
+ pSrcB += 4;
+ pDst += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Tail */
+ blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A * B */
- /* Multiply the inputs and store the results in output buffer */
- /* read sample from sourceA */
- inA1 = *pSrcA;
- /* read sample from sourceB */
- inB1 = *pSrcB;
- /* read sample from sourceA */
- inA2 = *(pSrcA + 1);
- /* read sample from sourceB */
- inB2 = *(pSrcB + 1);
- /* out = sourceA * sourceB */
- out1 = inA1 * inB1;
+ /* Multiply inputs and store result in destination buffer. */
+ *pDst++ = (*pSrcA++) * (*pSrcB++);
- /* read sample from sourceA */
- inA3 = *(pSrcA + 2);
- /* read sample from sourceB */
- inB3 = *(pSrcB + 2);
+ *pDst++ = (*pSrcA++) * (*pSrcB++);
- /* out = sourceA * sourceB */
- out2 = inA2 * inB2;
+ *pDst++ = (*pSrcA++) * (*pSrcB++);
- /* read sample from sourceA */
- inA4 = *(pSrcA + 3);
+ *pDst++ = (*pSrcA++) * (*pSrcB++);
- /* store result to destination buffer */
- *pDst = out1;
-
- /* read sample from sourceB */
- inB4 = *(pSrcB + 3);
-
- /* out = sourceA * sourceB */
- out3 = inA3 * inB3;
-
- /* store result to destination buffer */
- *(pDst + 1) = out2;
-
- /* out = sourceA * sourceB */
- out4 = inA4 * inB4;
- /* store result to destination buffer */
- *(pDst + 2) = out3;
- /* store result to destination buffer */
- *(pDst + 3) = out4;
-
-
- /* update pointers to process next samples */
- pSrcA += 4U;
- pSrcB += 4U;
- pDst += 4U;
-
- /* Decrement the blockSize loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = A * B */
- /* Multiply the inputs and store the results in output buffer */
+
+ /* Multiply input and store result in destination buffer. */
*pDst++ = (*pSrcA++) * (*pSrcB++);
- /* Decrement the blockSize loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
+
}
/**
- * @} end of BasicMult group
+ @} end of BasicMult group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_mult_q15.c b/DSP/Source/BasicMathFunctions/arm_mult_q15.c
index 8e20963..37aa924 100644
--- a/DSP/Source/BasicMathFunctions/arm_mult_q15.c
+++ b/DSP/Source/BasicMathFunctions/arm_mult_q15.c
@@ -3,13 +3,13 @@
* Title: arm_mult_q15.c
* Description: Q15 vector multiplication
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,65 +29,65 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @addtogroup BasicMult
- * @{
+ @addtogroup BasicMult
+ @{
*/
-
/**
- * @brief Q15 vector multiplication
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+ @brief Q15 vector multiplication
+ @param[in] pSrcA points to first input vector
+ @param[in] pSrcB points to second input vector
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
void arm_mult_q15(
- q15_t * pSrcA,
- q15_t * pSrcB,
- q15_t * pDst,
- uint32_t blockSize)
+ const q15_t * pSrcA,
+ const q15_t * pSrcB,
+ q15_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counters */
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
+ q31_t inA1, inA2, inB1, inB2; /* Temporary input variables */
+ q15_t out1, out2, out3, out4; /* Temporary output variables */
+ q31_t mul1, mul2, mul3, mul4; /* Temporary variables */
+#endif
-/* Run the below code for Cortex-M4 and Cortex-M3 */
- q31_t inA1, inA2, inB1, inB2; /* temporary input variables */
- q15_t out1, out2, out3, out4; /* temporary output variables */
- q31_t mul1, mul2, mul3, mul4; /* temporary variables */
-
- /* loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* read two samples at a time from sourceA */
- inA1 = *__SIMD32(pSrcA)++;
- /* read two samples at a time from sourceB */
- inB1 = *__SIMD32(pSrcB)++;
- /* read two samples at a time from sourceA */
- inA2 = *__SIMD32(pSrcA)++;
- /* read two samples at a time from sourceB */
- inB2 = *__SIMD32(pSrcB)++;
+ /* C = A * B */
+
+#if defined (ARM_MATH_DSP)
+ /* read 2 samples at a time from sourceA */
+ inA1 = read_q15x2_ia ((q15_t **) &pSrcA);
+ /* read 2 samples at a time from sourceB */
+ inB1 = read_q15x2_ia ((q15_t **) &pSrcB);
+ /* read 2 samples at a time from sourceA */
+ inA2 = read_q15x2_ia ((q15_t **) &pSrcA);
+ /* read 2 samples at a time from sourceB */
+ inB2 = read_q15x2_ia ((q15_t **) &pSrcB);
/* multiply mul = sourceA * sourceB */
mul1 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
- mul2 = (q31_t) ((q15_t) inA1 * (q15_t) inB1);
+ mul2 = (q31_t) ((q15_t) (inA1 ) * (q15_t) (inB1 ));
mul3 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB2 >> 16));
- mul4 = (q31_t) ((q15_t) inA2 * (q15_t) inB2);
+ mul4 = (q31_t) ((q15_t) (inA2 ) * (q15_t) (inB2 ));
/* saturate result to 16 bit */
out1 = (q15_t) __SSAT(mul1 >> 15, 16);
@@ -95,48 +95,49 @@ void arm_mult_q15(
out3 = (q15_t) __SSAT(mul3 >> 15, 16);
out4 = (q15_t) __SSAT(mul4 >> 15, 16);
- /* store the result */
+ /* store result to destination */
#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
- *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
-
+ write_q15x2_ia (&pDst, __PKHBT(out2, out1, 16));
+ write_q15x2_ia (&pDst, __PKHBT(out4, out3, 16));
#else
-
- *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
- *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
-
+ write_q15x2_ia (&pDst, __PKHBT(out1, out2, 16));
+ write_q15x2_ia (&pDst, __PKHBT(out3, out4, 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
- /* Decrement the blockSize loop counter */
+#else
+ *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+ *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+ *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+ *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
-
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * B */
- /* Multiply the inputs and store the result in the destination buffer */
+
+ /* Multiply inputs and store result in destination buffer. */
*pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
- /* Decrement the blockSize loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
+
}
/**
- * @} end of BasicMult group
+ @} end of BasicMult group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_mult_q31.c b/DSP/Source/BasicMathFunctions/arm_mult_q31.c
index c302b01..9592684 100644
--- a/DSP/Source/BasicMathFunctions/arm_mult_q31.c
+++ b/DSP/Source/BasicMathFunctions/arm_mult_q31.c
@@ -3,13 +3,13 @@
* Title: arm_mult_q31.c
* Description: Q31 vector multiplication
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,120 +29,91 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @addtogroup BasicMult
- * @{
+ @addtogroup BasicMult
+ @{
*/
/**
- * @brief Q31 vector multiplication.
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
+ @brief Q31 vector multiplication.
+ @param[in] pSrcA points to the first input vector.
+ @param[in] pSrcB points to the second input vector.
+ @param[out] pDst points to the output vector.
+ @param[in] blockSize number of samples in each vector.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
*/
void arm_mult_q31(
- q31_t * pSrcA,
- q31_t * pSrcB,
- q31_t * pDst,
- uint32_t blockSize)
+ const q31_t * pSrcA,
+ const q31_t * pSrcB,
+ q31_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counters */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t out; /* Temporary output variable */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
-/* Run the below code for Cortex-M4 and Cortex-M3 */
- q31_t inA1, inA2, inA3, inA4; /* temporary input variables */
- q31_t inB1, inB2, inB3, inB4; /* temporary input variables */
- q31_t out1, out2, out3, out4; /* temporary output variables */
-
- /* loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A * B */
- /* Multiply the inputs and then store the results in the destination buffer. */
- inA1 = *pSrcA++;
- inA2 = *pSrcA++;
- inA3 = *pSrcA++;
- inA4 = *pSrcA++;
- inB1 = *pSrcB++;
- inB2 = *pSrcB++;
- inB3 = *pSrcB++;
- inB4 = *pSrcB++;
- out1 = ((q63_t) inA1 * inB1) >> 32;
- out2 = ((q63_t) inA2 * inB2) >> 32;
- out3 = ((q63_t) inA3 * inB3) >> 32;
- out4 = ((q63_t) inA4 * inB4) >> 32;
+ /* Multiply inputs and store result in destination buffer. */
+ out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+ out = __SSAT(out, 31);
+ *pDst++ = out << 1U;
- out1 = __SSAT(out1, 31);
- out2 = __SSAT(out2, 31);
- out3 = __SSAT(out3, 31);
- out4 = __SSAT(out4, 31);
+ out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+ out = __SSAT(out, 31);
+ *pDst++ = out << 1U;
- *pDst++ = out1 << 1U;
- *pDst++ = out2 << 1U;
- *pDst++ = out3 << 1U;
- *pDst++ = out4 << 1U;
+ out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+ out = __SSAT(out, 31);
+ *pDst++ = out << 1U;
- /* Decrement the blockSize loop counter */
+ out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+ out = __SSAT(out, 31);
+ *pDst++ = out << 1U;
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
- while (blkCnt > 0U)
- {
- /* C = A * B */
- /* Multiply the inputs and then store the results in the destination buffer. */
- inA1 = *pSrcA++;
- inB1 = *pSrcB++;
- out1 = ((q63_t) inA1 * inB1) >> 32;
- out1 = __SSAT(out1, 31);
- *pDst++ = out1 << 1U;
-
- /* Decrement the blockSize loop counter */
- blkCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * B */
- /* Multiply the inputs and then store the results in the destination buffer. */
- *pDst++ =
- (q31_t) clip_q63_to_q31(((q63_t) (*pSrcA++) * (*pSrcB++)) >> 31);
- /* Decrement the blockSize loop counter */
+ /* Multiply inputs and store result in destination buffer. */
+ out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+ out = __SSAT(out, 31);
+ *pDst++ = out << 1U;
+
+ /* Decrement loop counter */
blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
}
/**
- * @} end of BasicMult group
+ @} end of BasicMult group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_mult_q7.c b/DSP/Source/BasicMathFunctions/arm_mult_q7.c
index d8a2f8a..5587ce5 100644
--- a/DSP/Source/BasicMathFunctions/arm_mult_q7.c
+++ b/DSP/Source/BasicMathFunctions/arm_mult_q7.c
@@ -3,13 +3,13 @@
* Title: arm_mult_q7.c
* Description: Q7 vector multiplication
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,87 +29,91 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @addtogroup BasicMult
- * @{
+ @addtogroup BasicMult
+ @{
*/
/**
- * @brief Q7 vector multiplication
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
+ @brief Q7 vector multiplication
+ @param[in] pSrcA points to the first input vector
+ @param[in] pSrcB points to the second input vector
+ @param[out] pDst points to the output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
*/
void arm_mult_q7(
- q7_t * pSrcA,
- q7_t * pSrcB,
- q7_t * pDst,
- uint32_t blockSize)
+ const q7_t * pSrcA,
+ const q7_t * pSrcB,
+ q7_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counters */
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
+ q7_t out1, out2, out3, out4; /* Temporary output variables */
+#endif
-/* Run the below code for Cortex-M4 and Cortex-M3 */
- q7_t out1, out2, out3, out4; /* Temporary variables to store the product */
-
- /* loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A * B */
- /* Multiply the inputs and store the results in temporary variables */
+
+#if defined (ARM_MATH_DSP)
+ /* Multiply inputs and store results in temporary variables */
out1 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
out2 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
out3 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
out4 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
- /* Store the results of 4 inputs in the destination buffer in single cycle by packing */
- *__SIMD32(pDst)++ = __PACKq7(out1, out2, out3, out4);
+ /* Pack and store result in destination buffer (in single write) */
+ write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4));
+#else
+ *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+ *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+ *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+ *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+#endif
- /* Decrement the blockSize loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
-
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * B */
- /* Multiply the inputs and store the result in the destination buffer */
+
+ /* Multiply input and store result in destination buffer. */
*pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
- /* Decrement the blockSize loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
+
}
/**
- * @} end of BasicMult group
+ @} end of BasicMult group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_negate_f32.c b/DSP/Source/BasicMathFunctions/arm_negate_f32.c
index e39624c..f807112 100644
--- a/DSP/Source/BasicMathFunctions/arm_negate_f32.c
+++ b/DSP/Source/BasicMathFunctions/arm_negate_f32.c
@@ -3,13 +3,13 @@
* Title: arm_negate_f32.c
* Description: Negates floating-point vectors
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,106 +29,117 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @defgroup negate Vector Negate
- *
- * Negates the elements of a vector.
- *
- *
- * pDst[n] = -pSrc[n], 0 <= n < blockSize.
- *
- *
- * The functions support in-place computation allowing the source and
- * destination pointers to reference the same memory buffer.
- * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ @defgroup BasicNegate Vector Negate
+
+ Negates the elements of a vector.
+
+
+ pDst[n] = -pSrc[n], 0 <= n < blockSize.
+
+
+ The functions support in-place computation allowing the source and
+ destination pointers to reference the same memory buffer.
+ There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
- * @addtogroup negate
- * @{
+ @addtogroup BasicNegate
+ @{
*/
/**
- * @brief Negates the elements of a floating-point vector.
- * @param[in] *pSrc points to the input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in the vector
- * @return none.
+ @brief Negates the elements of a floating-point vector.
+ @param[in] pSrc points to input vector.
+ @param[out] pDst points to output vector.
+ @param[in] blockSize number of samples in each vector.
+ @return none
*/
void arm_negate_f32(
- float32_t * pSrc,
- float32_t * pDst,
- uint32_t blockSize)
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+ float32x4_t vec1;
+ float32x4_t res;
-#if defined (ARM_MATH_DSP)
+ /* Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
-/* Run the below code for Cortex-M4 and Cortex-M3 */
- float32_t in1, in2, in3, in4; /* temporary variables */
+ while (blkCnt > 0U)
+ {
+ /* C = -A */
- /*loop Unrolling */
- blkCnt = blockSize >> 2U;
+ /* Negate and then store the results in the destination buffer. */
+ vec1 = vld1q_f32(pSrc);
+ res = vnegq_f32(vec1);
+ vst1q_f32(pDst, res);
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while (blkCnt > 0U)
- {
- /* read inputs from source */
- in1 = *pSrc;
- in2 = *(pSrc + 1);
- in3 = *(pSrc + 2);
- in4 = *(pSrc + 3);
+ /* Increment pointers */
+ pSrc += 4;
+ pDst += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
- /* negate the input */
- in1 = -in1;
- in2 = -in2;
- in3 = -in3;
- in4 = -in4;
-
- /* store the result to destination */
- *pDst = in1;
- *(pDst + 1) = in2;
- *(pDst + 2) = in3;
- *(pDst + 3) = in4;
-
- /* update pointers to process next samples */
- pSrc += 4U;
- pDst += 4U;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
+ /* Tail */
+ blkCnt = blockSize & 0x3;
#else
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M0 */
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
-
-#endif /* #if defined (ARM_MATH_DSP) */
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = -A */
- /* Negate and then store the results in the destination buffer. */
+
+ /* Negate and store result in destination buffer. */
*pDst++ = -*pSrc++;
- /* Decrement the loop counter */
+ *pDst++ = -*pSrc++;
+
+ *pDst++ = -*pSrc++;
+
+ *pDst++ = -*pSrc++;
+
+ /* Decrement loop counter */
blkCnt--;
}
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = -A */
+
+ /* Negate and store result in destination buffer. */
+ *pDst++ = -*pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
}
/**
- * @} end of negate group
+ @} end of BasicNegate group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_negate_q15.c b/DSP/Source/BasicMathFunctions/arm_negate_q15.c
index 9624160..267e4cc 100644
--- a/DSP/Source/BasicMathFunctions/arm_negate_q15.c
+++ b/DSP/Source/BasicMathFunctions/arm_negate_q15.c
@@ -3,13 +3,13 @@
* Title: arm_negate_q15.c
* Description: Negates Q15 vectors
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,103 +29,98 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @addtogroup negate
- * @{
+ @addtogroup BasicNegate
+ @{
*/
/**
- * @brief Negates the elements of a Q15 vector.
- * @param[in] *pSrc points to the input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in the vector
- * @return none.
- *
- * \par Conditions for optimum performance
- * Input and output buffers should be aligned by 32-bit
- *
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
+ @brief Negates the elements of a Q15 vector.
+ @param[in] pSrc points to the input vector.
+ @param[out] pDst points to the output vector.
+ @param[in] blockSize number of samples in each vector.
+ @return none
+
+ @par Conditions for optimum performance
+ Input and output buffers should be aligned by 32-bit
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
*/
void arm_negate_q15(
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t blockSize)
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
- q15_t in;
+ uint32_t blkCnt; /* Loop counter */
+ q15_t in; /* Temporary input variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
+ q31_t in1; /* Temporary input variables */
+#endif
-/* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q31_t in1, in2; /* Temporary variables */
-
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = -A */
- /* Read two inputs at a time */
- in1 = _SIMD32_OFFSET(pSrc);
- in2 = _SIMD32_OFFSET(pSrc + 2);
- /* negate two samples at a time */
- in1 = __QSUB16(0, in1);
+#if defined (ARM_MATH_DSP)
+ /* Negate and store result in destination buffer (2 samples at a time). */
+ in1 = read_q15x2_ia ((q15_t **) &pSrc);
+ write_q15x2_ia (&pDst, __QSUB16(0, in1));
- /* negate two samples at a time */
- in2 = __QSUB16(0, in2);
+ in1 = read_q15x2_ia ((q15_t **) &pSrc);
+ write_q15x2_ia (&pDst, __QSUB16(0, in1));
+#else
+ in = *pSrc++;
+ *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
- /* store the result to destination 2 samples at a time */
- _SIMD32_OFFSET(pDst) = in1;
- /* store the result to destination 2 samples at a time */
- _SIMD32_OFFSET(pDst + 2) = in2;
+ in = *pSrc++;
+ *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
+ in = *pSrc++;
+ *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
- /* update pointers to process next samples */
- pSrc += 4U;
- pDst += 4U;
+ in = *pSrc++;
+ *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
+#endif
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = -A */
- /* Negate and then store the result in the destination buffer. */
- in = *pSrc++;
- *pDst++ = (in == (q15_t) 0x8000) ? 0x7fff : -in;
- /* Decrement the loop counter */
+ /* Negate and store result in destination buffer. */
+ in = *pSrc++;
+ *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
+
+ /* Decrement loop counter */
blkCnt--;
}
+
}
/**
- * @} end of negate group
+ @} end of BasicNegate group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_negate_q31.c b/DSP/Source/BasicMathFunctions/arm_negate_q31.c
index 4a5a58d..645fb0a 100644
--- a/DSP/Source/BasicMathFunctions/arm_negate_q31.c
+++ b/DSP/Source/BasicMathFunctions/arm_negate_q31.c
@@ -3,13 +3,13 @@
* Title: arm_negate_q31.c
* Description: Negates Q31 vectors
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,89 +29,104 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @addtogroup negate
- * @{
+ @addtogroup BasicNegate
+ @{
*/
/**
- * @brief Negates the elements of a Q31 vector.
- * @param[in] *pSrc points to the input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in the vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
+ @brief Negates the elements of a Q31 vector.
+ @param[in] pSrc points to the input vector.
+ @param[out] pDst points to the output vector.
+ @param[in] blockSize number of samples in each vector.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
*/
void arm_negate_q31(
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
{
- q31_t in; /* Temporary variable */
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t in; /* Temporary input variable */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
-/* Run the below code for Cortex-M4 and Cortex-M3 */
- q31_t in1, in2, in3, in4;
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = -A */
- /* Negate and then store the results in the destination buffer. */
- in1 = *pSrc++;
- in2 = *pSrc++;
- in3 = *pSrc++;
- in4 = *pSrc++;
- *pDst++ = __QSUB(0, in1);
- *pDst++ = __QSUB(0, in2);
- *pDst++ = __QSUB(0, in3);
- *pDst++ = __QSUB(0, in4);
+ /* Negate and store result in destination buffer. */
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = __QSUB(0, in);
+#else
+ *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
- /* Decrement the loop counter */
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = __QSUB(0, in);
+#else
+ *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
+
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = __QSUB(0, in);
+#else
+ *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
+
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = __QSUB(0, in);
+#else
+ *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
-
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = -A */
- /* Negate and then store the result in the destination buffer. */
- in = *pSrc++;
- *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
- /* Decrement the loop counter */
+ /* Negate and store result in destination buffer. */
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = __QSUB(0, in);
+#else
+ *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
+
}
/**
- * @} end of negate group
+ @} end of BasicNegate group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_negate_q7.c b/DSP/Source/BasicMathFunctions/arm_negate_q7.c
index d72c317..40a373e 100644
--- a/DSP/Source/BasicMathFunctions/arm_negate_q7.c
+++ b/DSP/Source/BasicMathFunctions/arm_negate_q7.c
@@ -3,13 +3,13 @@
* Title: arm_negate_q7.c
* Description: Negates Q7 vectors
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,85 +29,98 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @addtogroup negate
- * @{
+ @addtogroup BasicNegate
+ @{
*/
/**
- * @brief Negates the elements of a Q7 vector.
- * @param[in] *pSrc points to the input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in the vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
+ @brief Negates the elements of a Q7 vector.
+ @param[in] pSrc points to the input vector.
+ @param[out] pDst points to the output vector.
+ @param[in] blockSize number of samples in each vector.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ The Q7 value -1 (0x80) is saturated to the maximum allowable positive value 0x7F.
*/
void arm_negate_q7(
- q7_t * pSrc,
- q7_t * pDst,
- uint32_t blockSize)
+ const q7_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
- q7_t in;
+ uint32_t blkCnt; /* Loop counter */
+ q7_t in; /* Temporary input variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
+ q31_t in1; /* Temporary input variable */
+#endif
-/* Run the below code for Cortex-M4 and Cortex-M3 */
- q31_t input; /* Input values1-4 */
- q31_t zero = 0x00000000;
-
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = -A */
- /* Read four inputs */
- input = *__SIMD32(pSrc)++;
- /* Store the Negated results in the destination buffer in a single cycle by packing the results */
- *__SIMD32(pDst)++ = __QSUB8(zero, input);
+#if defined (ARM_MATH_DSP)
+ /* Negate and store result in destination buffer (4 samples at a time). */
+ in1 = read_q7x4_ia ((q7_t **) &pSrc);
+ write_q7x4_ia (&pDst, __QSUB8(0, in1));
+#else
+ in = *pSrc++;
+ *pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
- /* Decrement the loop counter */
+ in = *pSrc++;
+ *pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
+
+ in = *pSrc++;
+ *pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
+
+ in = *pSrc++;
+ *pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = -A */
- /* Negate and then store the results in the destination buffer. */ \
- in = *pSrc++;
- *pDst++ = (in == (q7_t) 0x80) ? 0x7f : -in;
- /* Decrement the loop counter */
+ /* Negate and store result in destination buffer. */
+ in = *pSrc++;
+
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (q7_t) __QSUB(0, in);
+#else
+ *pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
+
}
/**
- * @} end of negate group
+ @} end of BasicNegate group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_offset_f32.c b/DSP/Source/BasicMathFunctions/arm_offset_f32.c
index ebc20a4..b10e3f1 100644
--- a/DSP/Source/BasicMathFunctions/arm_offset_f32.c
+++ b/DSP/Source/BasicMathFunctions/arm_offset_f32.c
@@ -3,13 +3,13 @@
* Title: arm_offset_f32.c
* Description: Floating-point vector offset
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,126 +29,119 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @defgroup offset Vector Offset
- *
- * Adds a constant offset to each element of a vector.
- *
- *
- *
- * The functions support in-place computation allowing the source and
- * destination pointers to reference the same memory buffer.
- * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ @defgroup BasicOffset Vector Offset
+
+ Adds a constant offset to each element of a vector.
+
+
- *
- * In the fixed-point Q7, Q15, and Q31 functions, scale is represented by
- * a fractional multiplication scaleFract and an arithmetic shift shift.
- * The shift allows the gain of the scaling operation to exceed 1.0.
- * The algorithm used with fixed-point data is:
- *
- *
- *
- * The overall scale factor applied to the fixed-point data is
- *
- * scale = scaleFract * 2^shift.
- *
- *
- * The functions support in-place computation allowing the source and destination
- * pointers to reference the same memory buffer.
+ @defgroup BasicScale Vector Scale
+
+ Multiply a vector by a scalar value. For floating-point data, the algorithm used is:
+
+
+
+ In the fixed-point Q7, Q15, and Q31 functions, scale is represented by
+ a fractional multiplication scaleFract and an arithmetic shift shift.
+ The shift allows the gain of the scaling operation to exceed 1.0.
+ The algorithm used with fixed-point data is:
+
+
- *
- * If shift is positive then the elements of the vector are shifted to the left.
- * If shift is negative then the elements of the vector are shifted to the right.
- *
- * The functions support in-place computation allowing the source and destination
- * pointers to reference the same memory buffer.
+ @defgroup BasicShift Vector Shift
+
+ Shifts the elements of a fixed-point vector by a specified number of bits.
+ There are separate functions for Q7, Q15, and Q31 data types.
+ The underlying algorithm used is:
+
+
- *
- * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ @defgroup BasicSub Vector Subtraction
+
+ Element-by-element subtraction of two vectors.
+
+
+
+ There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
- * @addtogroup BasicSub
- * @{
+ @addtogroup BasicSub
+ @{
*/
-
/**
- * @brief Floating-point vector subtraction.
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in each vector
- * @return none.
+ @brief Floating-point vector subtraction.
+ @param[in] pSrcA points to the first input vector
+ @param[in] pSrcB points to the second input vector
+ @param[out] pDst points to the output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
*/
void arm_sub_f32(
- float32_t * pSrcA,
- float32_t * pSrcB,
- float32_t * pDst,
- uint32_t blockSize)
+ const float32_t * pSrcA,
+ const float32_t * pSrcB,
+ float32_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
-#if defined (ARM_MATH_DSP)
+#if defined(ARM_MATH_NEON)
+ float32x4_t vec1;
+ float32x4_t vec2;
+ float32x4_t res;
-/* Run the below code for Cortex-M4 and Cortex-M3 */
- float32_t inA1, inA2, inA3, inA4; /* temporary variables */
- float32_t inB1, inB2, inB3, inB4; /* temporary variables */
+ /* Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
- /*loop Unrolling */
+ while (blkCnt > 0U)
+ {
+ /* C = A - B */
+
+ /* Subtract and then store the results in the destination buffer. */
+ vec1 = vld1q_f32(pSrcA);
+ vec2 = vld1q_f32(pSrcB);
+ res = vsubq_f32(vec1, vec2);
+ vst1q_f32(pDst, res);
+
+ /* Increment pointers */
+ pSrcA += 4;
+ pSrcB += 4;
+ pDst += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Tail */
+ blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A - B */
- /* Subtract and then store the results in the destination buffer. */
- /* Read 4 input samples from sourceA and sourceB */
- inA1 = *pSrcA;
- inB1 = *pSrcB;
- inA2 = *(pSrcA + 1);
- inB2 = *(pSrcB + 1);
- inA3 = *(pSrcA + 2);
- inB3 = *(pSrcB + 2);
- inA4 = *(pSrcA + 3);
- inB4 = *(pSrcB + 3);
- /* dst = srcA - srcB */
- /* subtract and store the result */
- *pDst = inA1 - inB1;
- *(pDst + 1) = inA2 - inB2;
- *(pDst + 2) = inA3 - inB3;
- *(pDst + 3) = inA4 - inB4;
+ /* Subtract and store result in destination buffer. */
+ *pDst++ = (*pSrcA++) - (*pSrcB++);
+ *pDst++ = (*pSrcA++) - (*pSrcB++);
- /* Update pointers to process next sampels */
- pSrcA += 4U;
- pSrcB += 4U;
- pDst += 4U;
+ *pDst++ = (*pSrcA++) - (*pSrcB++);
- /* Decrement the loop counter */
+ *pDst++ = (*pSrcA++) - (*pSrcB++);
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = A - B */
- /* Subtract and then store the results in the destination buffer. */
+
+ /* Subtract and store result in destination buffer. */
*pDst++ = (*pSrcA++) - (*pSrcB++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
+
}
/**
- * @} end of BasicSub group
+ @} end of BasicSub group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_sub_q15.c b/DSP/Source/BasicMathFunctions/arm_sub_q15.c
index 17942eb..835917e 100644
--- a/DSP/Source/BasicMathFunctions/arm_sub_q15.c
+++ b/DSP/Source/BasicMathFunctions/arm_sub_q15.c
@@ -3,13 +3,13 @@
* Title: arm_sub_q15.c
* Description: Q15 vector subtraction
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,100 +29,98 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @addtogroup BasicSub
- * @{
+ @addtogroup BasicSub
+ @{
*/
/**
- * @brief Q15 vector subtraction.
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+ @brief Q15 vector subtraction.
+ @param[in] pSrcA points to the first input vector
+ @param[in] pSrcB points to the second input vector
+ @param[out] pDst points to the output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
void arm_sub_q15(
- q15_t * pSrcA,
- q15_t * pSrcB,
- q15_t * pDst,
- uint32_t blockSize)
+ const q15_t * pSrcA,
+ const q15_t * pSrcB,
+ q15_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
-
-/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t inA1, inA2;
q31_t inB1, inB2;
+#endif
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A - B */
- /* Subtract and then store the results in the destination buffer two samples at a time. */
- inA1 = *__SIMD32(pSrcA)++;
- inA2 = *__SIMD32(pSrcA)++;
- inB1 = *__SIMD32(pSrcB)++;
- inB2 = *__SIMD32(pSrcB)++;
- *__SIMD32(pDst)++ = __QSUB16(inA1, inB1);
- *__SIMD32(pDst)++ = __QSUB16(inA2, inB2);
+#if defined (ARM_MATH_DSP)
+ /* read 2 times 2 samples at a time from sourceA */
+ inA1 = read_q15x2_ia ((q15_t **) &pSrcA);
+ inA2 = read_q15x2_ia ((q15_t **) &pSrcA);
+ /* read 2 times 2 samples at a time from sourceB */
+ inB1 = read_q15x2_ia ((q15_t **) &pSrcB);
+ inB2 = read_q15x2_ia ((q15_t **) &pSrcB);
- /* Decrement the loop counter */
+ /* Subtract and store 2 times 2 samples at a time */
+ write_q15x2_ia (&pDst, __QSUB16(inA1, inB1));
+ write_q15x2_ia (&pDst, __QSUB16(inA2, inB2));
+#else
+ *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
+ *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
+ *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
+ *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
- while (blkCnt > 0U)
- {
- /* C = A - B */
- /* Subtract and then store the result in the destination buffer. */
- *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* C = A - B */
- /* Subtract and then store the result in the destination buffer. */
- *pDst++ = (q15_t) __SSAT(((q31_t) * pSrcA++ - *pSrcB++), 16);
- /* Decrement the loop counter */
+ /* Subtract and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+ *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
+#else
+ *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
-
}
/**
- * @} end of BasicSub group
+ @} end of BasicSub group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_sub_q31.c b/DSP/Source/BasicMathFunctions/arm_sub_q31.c
index 72b8597..bac1927 100644
--- a/DSP/Source/BasicMathFunctions/arm_sub_q31.c
+++ b/DSP/Source/BasicMathFunctions/arm_sub_q31.c
@@ -3,13 +3,13 @@
* Title: arm_sub_q31.c
* Description: Q31 vector subtraction
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,106 +29,80 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @addtogroup BasicSub
- * @{
+ @addtogroup BasicSub
+ @{
*/
/**
- * @brief Q31 vector subtraction.
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
+ @brief Q31 vector subtraction.
+ @param[in] pSrcA points to the first input vector
+ @param[in] pSrcB points to the second input vector
+ @param[out] pDst points to the output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
void arm_sub_q31(
- q31_t * pSrcA,
- q31_t * pSrcB,
- q31_t * pDst,
- uint32_t blockSize)
+ const q31_t * pSrcA,
+ const q31_t * pSrcB,
+ q31_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
-#if defined (ARM_MATH_DSP)
-
-/* Run the below code for Cortex-M4 and Cortex-M3 */
- q31_t inA1, inA2, inA3, inA4;
- q31_t inB1, inB2, inB3, inB4;
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A - B */
- /* Subtract and then store the results in the destination buffer. */
- inA1 = *pSrcA++;
- inA2 = *pSrcA++;
- inB1 = *pSrcB++;
- inB2 = *pSrcB++;
- inA3 = *pSrcA++;
- inA4 = *pSrcA++;
- inB3 = *pSrcB++;
- inB4 = *pSrcB++;
-
- *pDst++ = __QSUB(inA1, inB1);
- *pDst++ = __QSUB(inA2, inB2);
- *pDst++ = __QSUB(inA3, inB3);
- *pDst++ = __QSUB(inA4, inB4);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* C = A - B */
- /* Subtract and then store the result in the destination buffer. */
+ /* Subtract and store result in destination buffer. */
*pDst++ = __QSUB(*pSrcA++, *pSrcB++);
- /* Decrement the loop counter */
+ *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
+
+ *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
+
+ *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
+
+ /* Decrement loop counter */
blkCnt--;
}
-#else
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
- /* Run the below code for Cortex-M0 */
+#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* C = A - B */
- /* Subtract and then store the result in the destination buffer. */
- *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrcA++ - *pSrcB++);
- /* Decrement the loop counter */
+ /* Subtract and store result in destination buffer. */
+ *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
+
+ /* Decrement loop counter */
blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of BasicSub group
+ @} end of BasicSub group
*/
diff --git a/DSP/Source/BasicMathFunctions/arm_sub_q7.c b/DSP/Source/BasicMathFunctions/arm_sub_q7.c
index d211f40..a55a8fd 100644
--- a/DSP/Source/BasicMathFunctions/arm_sub_q7.c
+++ b/DSP/Source/BasicMathFunctions/arm_sub_q7.c
@@ -3,13 +3,13 @@
* Title: arm_sub_q7.c
* Description: Q7 vector subtraction
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,91 +29,81 @@
#include "arm_math.h"
/**
- * @ingroup groupMath
+ @ingroup groupMath
*/
/**
- * @addtogroup BasicSub
- * @{
+ @addtogroup BasicSub
+ @{
*/
/**
- * @brief Q7 vector subtraction.
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
+ @brief Q7 vector subtraction.
+ @param[in] pSrcA points to the first input vector
+ @param[in] pSrcB points to the second input vector
+ @param[out] pDst points to the output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
*/
void arm_sub_q7(
- q7_t * pSrcA,
- q7_t * pSrcB,
- q7_t * pDst,
- uint32_t blockSize)
+ const q7_t * pSrcA,
+ const q7_t * pSrcB,
+ q7_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
-/* Run the below code for Cortex-M4 and Cortex-M3 */
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A - B */
- /* Subtract and then store the results in the destination buffer 4 samples at a time. */
- *__SIMD32(pDst)++ = __QSUB8(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
- /* Decrement the loop counter */
+#if defined (ARM_MATH_DSP)
+ /* Subtract and store result in destination buffer (4 samples at a time). */
+ write_q7x4_ia (&pDst, __QSUB8(read_q7x4_ia ((q7_t **) &pSrcA), read_q7x4_ia ((q7_t **) &pSrcB)));
+#else
+ *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
+ *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
+ *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
+ *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
- while (blkCnt > 0U)
- {
- /* C = A - B */
- /* Subtract and then store the result in the destination buffer. */
- *pDst++ = __SSAT(*pSrcA++ - *pSrcB++, 8);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* C = A - B */
- /* Subtract and then store the result in the destination buffer. */
- *pDst++ = (q7_t) __SSAT((q15_t) * pSrcA++ - *pSrcB++, 8);
- /* Decrement the loop counter */
+ /* Subtract and store result in destination buffer. */
+ *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
+
+ /* Decrement loop counter */
blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
-
}
/**
- * @} end of BasicSub group
+ @} end of BasicSub group
*/
diff --git a/DSP/Source/CMakeLists.txt b/DSP/Source/CMakeLists.txt
new file mode 100644
index 0000000..f5c58a7
--- /dev/null
+++ b/DSP/Source/CMakeLists.txt
@@ -0,0 +1,223 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSP)
+
+# Needed to find the config modules
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/..)
+
+# Select which parts of the CMSIS-DSP must be compiled.
+# There are some dependencies between the parts but they are not tracked
+# by this cmake. So, enabling some functions may require to enable some
+# other ones.
+option(BASICMATH "Basic Math Functions" ON)
+option(COMPLEXMATH "Complex Math Functions" ON)
+option(CONTROLLER "Controller Functions" ON)
+option(FASTMATH "Fast Math Functions" ON)
+option(FILTERING "Filtering Functions" ON)
+option(MATRIX "Matrix Functions" ON)
+option(STATISTICS "Statistics Functions" ON)
+option(SUPPORT "Support Functions" ON)
+option(TRANSFORM "Transform Functions" ON)
+
+# When OFF it is the default behavior : all tables are included.
+option(CONFIGTABLE "Configuration of table allowed" OFF)
+
+# When CONFIGTABLE is ON, select if all interpolation tables must be included
+option(ALLFAST "All interpolation tables included" OFF)
+# When CONFIGTABLE is ON, select if all FFT tables must be included
+option(ALLFFT "All fft tables included" OFF)
+
+# Features which require inclusion of a data table.
+# Since some tables may be big, the corresponding feature can be
+# disabled.
+# Those options are taken into account only when CONFIGTABLE is ON
+option(ARM_COS_F32 "cos f32" OFF)
+option(ARM_COS_Q31 "cos q31" OFF)
+option(ARM_COS_Q15 "cos q15" OFF)
+option(ARM_SIN_F32 "sin f32" OFF)
+option(ARM_SIN_Q31 "sin q31" OFF)
+option(ARM_SIN_Q15 "sin q15" OFF)
+option(ARM_SIN_COS_F32 "sin cos f32" OFF)
+option(ARM_SIN_COS_Q31 "sin cos q31" OFF)
+
+option(ARM_LMS_NORM_Q31 "lms norm q31" OFF)
+option(ARM_LMS_NORM_Q15 "lms norm q15" OFF)
+
+option(CFFT_F32_16 "cfft f32 16" OFF)
+option(CFFT_F32_32 "cfft f32 32" OFF)
+option(CFFT_F32_64 "cfft f32 64" OFF)
+option(CFFT_F32_128 "cfft f32 128" OFF)
+option(CFFT_F32_256 "cfft f32 256" OFF)
+option(CFFT_F32_512 "cfft f32 512" OFF)
+option(CFFT_F32_1024 "cfft f32 1024" OFF)
+option(CFFT_F32_2048 "cfft f32 2048" OFF)
+option(CFFT_F32_4096 "cfft f32 4096" OFF)
+
+option(CFFT_Q31_16 "cfft q31 16" OFF)
+option(CFFT_Q31_32 "cfft q31 32" OFF)
+option(CFFT_Q31_64 "cfft q31 64" OFF)
+option(CFFT_Q31_128 "cfft q31 128" OFF)
+option(CFFT_Q31_256 "cfft q31 256" OFF)
+option(CFFT_Q31_512 "cfft q31 512" OFF)
+option(CFFT_Q31_1024 "cfft q31 1024" OFF)
+option(CFFT_Q31_2048 "cfft q31 2048" OFF)
+option(CFFT_Q31_4096 "cfft q31 4096" OFF)
+
+option(CFFT_Q15_16 "cfft q15 16" OFF)
+option(CFFT_Q15_32 "cfft q15 32" OFF)
+option(CFFT_Q15_64 "cfft q15 64" OFF)
+option(CFFT_Q15_128 "cfft q15 128" OFF)
+option(CFFT_Q15_256 "cfft q15 256" OFF)
+option(CFFT_Q15_512 "cfft q15 512" OFF)
+option(CFFT_Q15_1024 "cfft q15 1024" OFF)
+option(CFFT_Q15_2048 "cfft q15 2048" OFF)
+option(CFFT_Q15_4096 "cfft q15 4096" OFF)
+
+option(RFFT_FAST_F32_32 "rfft fast f32 32" OFF)
+option(RFFT_FAST_F32_64 "rfft fast f32 64" OFF)
+option(RFFT_FAST_F32_128 "rfft fast f32 128" OFF)
+option(RFFT_FAST_F32_256 "rfft fast f32 256" OFF)
+option(RFFT_FAST_F32_512 "rfft fast f32 512" OFF)
+option(RFFT_FAST_F32_1024 "rfft fast f32 1024" OFF)
+option(RFFT_FAST_F32_2048 "rfft fast f32 2048" OFF)
+option(RFFT_FAST_F32_4096 "rfft fast f32 4096" OFF)
+
+
+option(RFFT_F32_128 "rfft f32 128" OFF)
+option(RFFT_F32_512 "rfft f32 512" OFF)
+option(RFFT_F32_2048 "rfft f32 2048" OFF)
+option(RFFT_F32_8192 "rfft f32 8192" OFF)
+
+option(RFFT_Q31_32 "rfft q31 32" OFF)
+option(RFFT_Q31_64 "rfft q31 64" OFF)
+option(RFFT_Q31_128 "rfft q31 128" OFF)
+option(RFFT_Q31_256 "rfft q31 256" OFF)
+option(RFFT_Q31_512 "rfft q31 512" OFF)
+option(RFFT_Q31_1024 "rfft q31 1024" OFF)
+option(RFFT_Q31_2048 "rfft q31 2048" OFF)
+option(RFFT_Q31_4096 "rfft q31 4096" OFF)
+option(RFFT_Q31_8192 "rfft q31 8192" OFF)
+
+option(RFFT_Q15_32 "rfft q15 32" OFF)
+option(RFFT_Q15_64 "rfft q15 64" OFF)
+option(RFFT_Q15_128 "rfft q15 128" OFF)
+option(RFFT_Q15_256 "rfft q15 256" OFF)
+option(RFFT_Q15_512 "rfft q15 512" OFF)
+option(RFFT_Q15_1024 "rfft q15 1024" OFF)
+option(RFFT_Q15_2048 "rfft q15 2048" OFF)
+option(RFFT_Q15_4096 "rfft q15 4096" OFF)
+option(RFFT_Q15_8192 "rfft q15 8192" OFF)
+
+option(DCT4_F32_128 "dct4 f32 128" OFF)
+option(DCT4_F32_512 "dct4 f32 512" OFF)
+option(DCT4_F32_2048 "dct4 f32 2048" OFF)
+option(DCT4_F32_8192 "dct4 f32 8192" OFF)
+
+option(DCT4_Q31_128 "dct4 q31 128" OFF)
+option(DCT4_Q31_512 "dct4 q31 512" OFF)
+option(DCT4_Q31_2048 "dct4 q31 2048" OFF)
+option(DCT4_Q31_8192 "dct4 q31 8192" OFF)
+
+option(DCT4_Q15_128 "dct4 q15 128" OFF)
+option(DCT4_Q15_512 "dct4 q15 512" OFF)
+option(DCT4_Q15_2048 "dct4 q15 2048" OFF)
+option(DCT4_Q15_8192 "dct4 q15 8192" OFF)
+
+
+###########################
+#
+# CMSIS DSP
+#
+###########################
+
+# DSP Sources
+SET(DSP ".")
+
+add_library(CMSISDSP INTERFACE)
+
+include(config)
+
+
+if (BASICMATH)
+ add_subdirectory(BasicMathFunctions)
+ target_link_libraries(CMSISDSP INTERFACE CMSISDSPBasicMath)
+endif()
+
+if (COMPLEXMATH)
+ add_subdirectory(ComplexMathFunctions)
+ target_link_libraries(CMSISDSP INTERFACE CMSISDSPComplexMath)
+endif()
+
+if (CONTROLLER)
+ add_subdirectory(ControllerFunctions)
+ # Fast tables inclusion is allowed
+ if (CONFIGTABLE)
+ target_compile_definitions(CMSISDSPController PUBLIC ARM_FAST_ALLOW_TABLES)
+ endif()
+ target_link_libraries(CMSISDSP INTERFACE CMSISDSPController)
+endif()
+
+if (FASTMATH)
+ add_subdirectory(FastMathFunctions)
+ # Fast tables inclusion is allowed
+ if (CONFIGTABLE)
+ target_compile_definitions(CMSISDSPFastMath PUBLIC ARM_FAST_ALLOW_TABLES)
+ endif()
+ target_link_libraries(CMSISDSP INTERFACE CMSISDSPFastMath)
+endif()
+
+if (FILTERING)
+ add_subdirectory(FilteringFunctions)
+ # Fast tables inclusion is allowed
+ if (CONFIGTABLE)
+ target_compile_definitions(CMSISDSPFiltering PUBLIC ARM_FAST_ALLOW_TABLES)
+ endif()
+ target_link_libraries(CMSISDSP INTERFACE CMSISDSPFiltering)
+endif()
+
+if (MATRIX)
+ add_subdirectory(MatrixFunctions)
+ target_link_libraries(CMSISDSP INTERFACE CMSISDSPMatrix)
+endif()
+
+if (STATISTICS)
+ add_subdirectory(StatisticsFunctions)
+ target_link_libraries(CMSISDSP INTERFACE CMSISDSPStatistics)
+endif()
+
+if (SUPPORT)
+ add_subdirectory(SupportFunctions)
+ target_link_libraries(CMSISDSP INTERFACE CMSISDSPSupport)
+endif()
+
+if (TRANSFORM)
+ add_subdirectory(TransformFunctions)
+ # FFT tables inclusion is allowed
+ if (CONFIGTABLE)
+ target_compile_definitions(CMSISDSPTransform PUBLIC ARM_FFT_ALLOW_TABLES)
+ endif()
+ target_link_libraries(CMSISDSP INTERFACE CMSISDSPTransform)
+endif()
+
+if (FILTERING OR CONTROLLER OR FASTMATH OR TRANSFORM)
+ add_subdirectory(CommonTables)
+ if (TRANSFORM)
+ # FFT tables inclusion is allowed
+ if (CONFIGTABLE)
+ target_compile_definitions(CMSISDSPCommon PUBLIC ARM_FFT_ALLOW_TABLES)
+ endif()
+ endif()
+ if (FILTERING OR CONTROLLER OR FASTMATH)
+ # Select which tables to include
+ if (CONFIGTABLE)
+ target_compile_definitions(CMSISDSPCommon PUBLIC ARM_FAST_ALLOW_TABLES)
+ endif()
+ endif()
+ target_link_libraries(CMSISDSP INTERFACE CMSISDSPCommon)
+endif()
+
+### Includes
+target_include_directories(CMSISDSP INTERFACE "${DSP}/../Include")
+
+
+
diff --git a/DSP/Source/CommonTables/CMakeLists.txt b/DSP/Source/CommonTables/CMakeLists.txt
new file mode 100644
index 0000000..7bdad93
--- /dev/null
+++ b/DSP/Source/CommonTables/CMakeLists.txt
@@ -0,0 +1,31 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPCommon)
+
+
+
+add_library(CMSISDSPCommon STATIC arm_common_tables.c)
+
+if (CONFIGTABLE AND ALLFFT)
+ target_compile_definitions(CMSISDSPCommon PUBLIC ARM_ALL_FFT_TABLES)
+endif()
+
+if (CONFIGTABLE AND ALLFAST)
+ target_compile_definitions(CMSISDSPCommon PUBLIC ARM_ALL_FAST_TABLES)
+endif()
+
+include(fft)
+fft(CMSISDSPCommon)
+
+include(interpol)
+interpol(CMSISDSPCommon)
+
+target_sources(CMSISDSPCommon PRIVATE arm_const_structs.c)
+
+configdsp(CMSISDSPCommon ..)
+
+### Includes
+target_include_directories(CMSISDSPCommon PUBLIC "${DSP}/../../Include")
+
+
+
diff --git a/DSP/Source/CommonTables/CommonTables.c b/DSP/Source/CommonTables/CommonTables.c
new file mode 100644
index 0000000..acda9f8
--- /dev/null
+++ b/DSP/Source/CommonTables/CommonTables.c
@@ -0,0 +1,31 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: CommonTables.c
+ * Description: Combination of all common table source files.
+ *
+ * $Date: 18. March 2019
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_common_tables.c"
+#include "arm_const_structs.c"
+
diff --git a/DSP/Source/CommonTables/arm_common_tables.c b/DSP/Source/CommonTables/arm_common_tables.c
index 1f8f589..4b49b34 100644
--- a/DSP/Source/CommonTables/arm_common_tables.c
+++ b/DSP/Source/CommonTables/arm_common_tables.c
@@ -3,13 +3,13 @@
* Title: arm_common_tables.c
* Description: common tables like fft twiddle factors, Bitreverse, reciprocal etc
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -30,165 +30,167 @@
#include "arm_common_tables.h"
/**
- * @ingroup ComplexFFT
+ @ingroup ComplexFFT
*/
/**
- * @addtogroup CFFT_CIFFT Complex FFT Tables
- * @{
+ @addtogroup CFFT_CIFFT Complex FFT Tables
+ @{
*/
/**
-* \par
-* Pseudo code for Generation of Bit reversal Table is
-* \par
-*
for(l=1;l <= N/4;l++)
-* {
-* for(i=0;i> 1;
-* }
-* \par
-* where N = 4096 logN2 = 12
-* \par
-* N is the maximum FFT Size supported
+ @par
+ Pseudo code for Generation of Bit reversal Table is
+ @par
+
for (l = 1; l <= N/4; l++)
+ {
+ for (i = 0; i< logN2; i++)
+ {
+ a[i] = l & (1 << i);
+ }
+ for (j = 0; j < logN2; j++)
+ {
+ if (a[j] != 0)
+ y[l] += (1 << ((logN2 - 1) - j));
+ }
+ y[l] = y[l] >> 1;
+ }
-* \par
-* where N = 16 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are in interleaved fashion
-*
+ @par
+ Example code for Floating-point Twiddle factors Generation:
+ @par
+
for (i = 0; i < N/; i++)
+ {
+ twiddleCoef[2*i] = cos(i * 2*PI/(float)N);
+ twiddleCoef[2*i+1] = sin(i * 2*PI/(float)N);
+ }
+ @par
+ where N = 16, PI = 3.14159265358979
+ @par
+ Cos and Sin values are in interleaved fashion
*/
const float32_t twiddleCoef_16[32] = {
1.000000000f, 0.000000000f,
@@ -209,20 +211,23 @@ const float32_t twiddleCoef_16[32] = {
0.923879533f, -0.382683432f
};
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_32)
+
/**
-* \par
-* Example code for Floating-point Twiddle factors Generation:
-* \par
-*
-* \par
-* where N = 32 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are in interleaved fashion
-*
+ @par
+ Example code for Floating-point Twiddle factors Generation:
+ @par
+
-* \par
-* where N = 64 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are in interleaved fashion
-*
+ @par
+ Example code for Floating-point Twiddle factors Generation:
+ @par
+
-* \par
-* where N = 128 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are in interleaved fashion
-*
-*/
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_128)
+/**
+ @par
+ Example code for Floating-point Twiddle factors Generation:
+ @par
+
-* \par
-* where N = 256 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are in interleaved fashion
-*
+ @par
+ Example code for Floating-point Twiddle factors Generation:
+ @par
+
-* \par
-* where N = 512 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are in interleaved fashion
-*
+ @par
+ Example code for Floating-point Twiddle factors Generation:
+ @par
+
-* \par
-* where N = 1024 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are in interleaved fashion
-*
+ @par
+ Example code for Floating-point Twiddle factors Generation:
+ @par
+
-* \par
-* where N = 2048 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are in interleaved fashion
-*
+ @par
+ Example code for Floating-point Twiddle factors Generation:
+ @par
+
-* \par
-* where N = 4096 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are in interleaved fashion
-*
+ @par
+ Example code for Floating-point Twiddle factors Generation:
+ @par
+
-* \par
-* where N = 16 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to Q31(Fixed point 1.31):
-* round(twiddleCoefQ31(i) * pow(2, 31))
-*
+ @brief Q31 Twiddle factors Table
*/
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_16)
+/**
+ @par
+ Example code for Q31 Twiddle factors Generation::
+ @par
+
-* \par
-* where N = 32 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to Q31(Fixed point 1.31):
-* round(twiddleCoefQ31(i) * pow(2, 31))
-*
-*/
+ @par
+ Example code for Q31 Twiddle factors Generation::
+ @par
+
-* \par
-* where N = 64 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to Q31(Fixed point 1.31):
-* round(twiddleCoefQ31(i) * pow(2, 31))
-*
-*/
+ @par
+ Example code for Q31 Twiddle factors Generation::
+ @par
+
-* \par
-* where N = 128 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to Q31(Fixed point 1.31):
-* round(twiddleCoefQ31(i) * pow(2, 31))
-*
-*/
+ @par
+ Example code for Q31 Twiddle factors Generation::
+ @par
+
for (i = 0; i < 3N/4; i++)
+ {
+ twiddleCoefQ31[2*i] = cos(i * 2*PI/(float)N);
+ twiddleCoefQ31[2*i+1] = sin(i * 2*PI/(float)N);
+ }
+ @par
+ where N = 128, PI = 3.14159265358979
+ @par
+ Cos and Sin values are interleaved fashion
+ @par
+ Convert Floating point to Q31(Fixed point 1.31):
+ round(twiddleCoefQ31(i) * pow(2, 31))
+ */
const q31_t twiddleCoef_128_q31[192] = {
(q31_t)0x7FFFFFFF, (q31_t)0x00000000, (q31_t)0x7FD8878D,
(q31_t)0x0647D97C, (q31_t)0x7F62368F, (q31_t)0x0C8BD35E,
@@ -8734,24 +8760,27 @@ const q31_t twiddleCoef_128_q31[192] = {
(q31_t)0x809DC970, (q31_t)0xF9B82683, (q31_t)0x80277872
};
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_256)
/**
-* \par
-* Example code for Q31 Twiddle factors Generation::
-* \par
-*
-* \par
-* where N = 256 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to Q31(Fixed point 1.31):
-* round(twiddleCoefQ31(i) * pow(2, 31))
-*
-*/
+ @par
+ Example code for Q31 Twiddle factors Generation::
+ @par
+
-* \par
-* where N = 512 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to Q31(Fixed point 1.31):
-* round(twiddleCoefQ31(i) * pow(2, 31))
-*
-*/
+ @par
+ Example code for Q31 Twiddle factors Generation::
+ @par
+
-* \par
-* where N = 1024 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to Q31(Fixed point 1.31):
-* round(twiddleCoefQ31(i) * pow(2, 31))
-*
-*/
+ @par
+ Example code for Q31 Twiddle factors Generation::
+ @par
+
-* \par
-* where N = 2048 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to Q31(Fixed point 1.31):
-* round(twiddleCoefQ31(i) * pow(2, 31))
-*
-*/
+ @par
+ Example code for Q31 Twiddle factors Generation::
+ @par
+
-* \par
-* where N = 4096 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to Q31(Fixed point 1.31):
-* round(twiddleCoefQ31(i) * pow(2, 31))
-*
-*/
+ @par
+ Example code for Q31 Twiddle factors Generation::
+ @par
+
-* \par
-* where N = 32 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to q15(Fixed point 1.15):
-* round(twiddleCoefq15(i) * pow(2, 15))
-*
-*/
+ @par
+ Example code for q15 Twiddle factors Generation::
+ @par
+
-* \par
-* where N = 64 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to q15(Fixed point 1.15):
-* round(twiddleCoefq15(i) * pow(2, 15))
-*
-*/
+ @par
+ Example code for q15 Twiddle factors Generation::
+ @par
+
-* \par
-* where N = 128 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to q15(Fixed point 1.15):
-* round(twiddleCoefq15(i) * pow(2, 15))
-*
-*/
+ @par
+ Example code for q15 Twiddle factors Generation::
+ @par
+
-* \par
-* where N = 256 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to q15(Fixed point 1.15):
-* round(twiddleCoefq15(i) * pow(2, 15))
-*
-*/
+ @par
+ Example code for q15 Twiddle factors Generation::
+ @par
+
-* \par
-* where N = 512 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to q15(Fixed point 1.15):
-* round(twiddleCoefq15(i) * pow(2, 15))
-*
-*/
+ @par
+ Example code for q15 Twiddle factors Generation::
+ @par
+
-* \par
-* where N = 1024 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to q15(Fixed point 1.15):
-* round(twiddleCoefq15(i) * pow(2, 15))
-*
-*/
+ @par
+ Example code for q15 Twiddle factors Generation::
+ @par
+
-* \par
-* where N = 2048 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to q15(Fixed point 1.15):
-* round(twiddleCoefq15(i) * pow(2, 15))
-*
-*/
+ @par
+ Example code for q15 Twiddle factors Generation::
+ @par
+
-* \par
-* where N = 4096 and PI = 3.14159265358979
-* \par
-* Cos and Sin values are interleaved fashion
-* \par
-* Convert Floating point to q15(Fixed point 1.15):
-* round(twiddleCoefq15(i) * pow(2, 15))
-*
-*/
+ @par
+ Example code for q15 Twiddle factors Generation::
+ @par
+
+ @par
+ where N is the Number of weights to be calculated and c is pi/(2*N)
+ @par
+ In the tables below the real and imaginary values are placed alternatively, hence the
+ array length is 2*N.
+ */
+
+
+/**
+ @par
+ cosFactor tables are generated using the formula :
+ @par
+ where N is the Number of weights to be calculated and c is pi/(2*N)
+ @par
+ Converted the output to q15 format by multiplying with 2^31 and saturated if required.
+ @par
+ In the tables below the real and imaginary values are placed alternatively, hence the
+ array length is 2*N.
+ */
+
+/**
+ @par
+ cosFactor tables are generated using the formula :
cos_factors[n] = 2 * cos((2n+1)*pi/(4*N))
+ @par
+ C command to generate the table
+
+ for (i = 0; i< N; i++)
+ {
+ cos_factors[i] = 2 * cos((2*i+1)*c/2);
+ }
+ for (i = 0; i< N; i++)
+ {
+ weights[(2*i)] = cos(i*c);
+ weights[(2*i)+1] = -sin(i*c);
+ }
+ @par
+ where N is the Number of weights to be calculated and c is pi/(2*N)
+ @par
+ Convert the output to q31 format by multiplying with 2^31 and saturated if required.
+ @par
+ In the tables below the real and imaginary values are placed alternatively, hence the
+ array length is 2*N.
+ */
+
+/**
+ @par
+ cosFactor tables are generated using the formula :
cos_factors[n] = 2 * cos((2n+1)*pi/(4*N))
+ @par
+ C command to generate the table
+
+ for (i = 0; i< N; i++)
+ {
+ cos_factors[i] = 2 * cos((2*i+1)*c/2);
+ }
- * where pi value is 3.14159265358979
- * \par
- * Second, convert floating-point to Q31 (Fixed point):
- * (sinTable[i] * pow(2, 31))
- * \par
- * Finally, round to the nearest integer value:
- * sinTable[i] += (sinTable[i] > 0 ? 0.5 :-0.5);
+ @par
+ Table values are in Q31 (1.31 fixed-point format) and generation is done in
+ three steps. First, generate sin values in floating point:
+
- * where pi value is 3.14159265358979
- * \par
- * Second, convert floating-point to Q15 (Fixed point):
- * (sinTable[i] * pow(2, 15))
- * \par
- * Finally, round to the nearest integer value:
- * sinTable[i] += (sinTable[i] > 0 ? 0.5 :-0.5);
+ @par
+ Table values are in Q15 (1.15 fixed-point format) and generation is done in
+ three steps. First, generate sin values in floating point:
+
- * for(n=0; n
- *
- * There are separate functions for floating-point, Q15, and Q31 data types.
+ @defgroup cmplx_conj Complex Conjugate
+
+ Conjugates the elements of a complex data vector.
+
+ The pSrc points to the source data and
+ pDst points to the destination data where the result should be written.
+ numSamples specifies the number of complex samples
+ and the data in each array is stored in an interleaved fashion
+ (real, imag, real, imag, ...).
+ Each array has a total of 2*numSamples values.
+
+ The underlying algorithm is used:
+
+ for (n = 0; n < numSamples; n++) {
+ pDst[(2*n) ] = pSrc[(2*n) ]; // real part
+ pDst[(2*n)+1] = -pSrc[(2*n)+1]; // imag part
+ }
+
+
+ There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
- * @addtogroup cmplx_conj
- * @{
+ @addtogroup cmplx_conj
+ @{
*/
/**
- * @brief Floating-point complex conjugate.
- * @param *pSrc points to the input vector
- * @param *pDst points to the output vector
- * @param numSamples number of complex samples in each vector
- * @return none.
+ @brief Floating-point complex conjugate.
+ @param[in] pSrc points to the input vector
+ @param[out] pDst points to the output vector
+ @param[in] numSamples number of samples in each vector
+ @return none
*/
+
void arm_cmplx_conj_f32(
- float32_t * pSrc,
- float32_t * pDst,
- uint32_t numSamples)
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t numSamples)
{
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
-#if defined (ARM_MATH_DSP)
+#if defined(ARM_MATH_NEON)
+ float32x4_t zero;
+ float32x4x2_t vec;
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- float32_t inR1, inR2, inR3, inR4;
- float32_t inI1, inI2, inI3, inI4;
+ zero = vdupq_n_f32(0.0);
- /*loop Unrolling */
+ /* Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C[0]+jC[1] = A[0]+(-1)*jA[1] */
+ /* Calculate Complex Conjugate and then store the results in the destination buffer. */
+ vec = vld2q_f32(pSrc);
+ vec.val[1] = vsubq_f32(zero,vec.val[1]);
+ vst2q_f32(pDst,vec);
+
+ /* Increment pointers */
+ pSrc += 8;
+ pDst += 8;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Tail */
+ blkCnt = numSamples & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C[0]+jC[1] = A[0]+ j (-1) A[1] */
- /* Calculate Complex Conjugate and then store the results in the destination buffer. */
- /* read real input samples */
- inR1 = pSrc[0];
- /* store real samples to destination */
- pDst[0] = inR1;
- inR2 = pSrc[2];
- pDst[2] = inR2;
- inR3 = pSrc[4];
- pDst[4] = inR3;
- inR4 = pSrc[6];
- pDst[6] = inR4;
+ /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
- /* read imaginary input samples */
- inI1 = pSrc[1];
- inI2 = pSrc[3];
+ /* Calculate Complex Conjugate and store result in destination buffer. */
+ *pDst++ = *pSrc++;
+ *pDst++ = -*pSrc++;
- /* conjugate input */
- inI1 = -inI1;
+ *pDst++ = *pSrc++;
+ *pDst++ = -*pSrc++;
- /* read imaginary input samples */
- inI3 = pSrc[5];
+ *pDst++ = *pSrc++;
+ *pDst++ = -*pSrc++;
- /* conjugate input */
- inI2 = -inI2;
+ *pDst++ = *pSrc++;
+ *pDst++ = -*pSrc++;
- /* read imaginary input samples */
- inI4 = pSrc[7];
-
- /* conjugate input */
- inI3 = -inI3;
-
- /* store imaginary samples to destination */
- pDst[1] = inI1;
- pDst[3] = inI2;
-
- /* conjugate input */
- inI4 = -inI4;
-
- /* store imaginary samples to destination */
- pDst[5] = inI3;
-
- /* increment source pointer by 8 to process next sampels */
- pSrc += 8U;
-
- /* store imaginary sample to destination */
- pDst[7] = inI4;
-
- /* increment destination pointer by 8 to store next samples */
- pDst += 8U;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
blkCnt = numSamples;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined (ARM_MATH_NEON) */
while (blkCnt > 0U)
{
- /* realOut + j (imagOut) = realIn + j (-1) imagIn */
- /* Calculate Complex Conjugate and then store the results in the destination buffer. */
- *pDst++ = *pSrc++;
+ /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
+
+ /* Calculate Complex Conjugate and store result in destination buffer. */
+ *pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
+
}
/**
- * @} end of cmplx_conj group
+ @} end of cmplx_conj group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
index 7950229..073a337 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_conj_q15.c
* Description: Q15 complex conjugate
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,68 +29,66 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @addtogroup cmplx_conj
- * @{
+ @addtogroup cmplx_conj
+ @{
*/
/**
- * @brief Q15 complex conjugate.
- * @param *pSrc points to the input vector
- * @param *pDst points to the output vector
- * @param numSamples number of complex samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
+ @brief Q15 complex conjugate.
+ @param[in] pSrc points to the input vector
+ @param[out] pDst points to the output vector
+ @param[in] numSamples number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
*/
void arm_cmplx_conj_q15(
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t numSamples)
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t numSamples)
{
+ uint32_t blkCnt; /* Loop counter */
+ q31_t in1; /* Temporary input variable */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in2, in3, in4; /* Temporary input variables */
+#endif
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- uint32_t blkCnt; /* loop counter */
- q31_t in1, in2, in3, in4;
- q31_t zero = 0;
- /*loop Unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C[0]+jC[1] = A[0]+ j (-1) A[1] */
- /* Calculate Complex Conjugate and then store the results in the destination buffer. */
- in1 = *__SIMD32(pSrc)++;
- in2 = *__SIMD32(pSrc)++;
- in3 = *__SIMD32(pSrc)++;
- in4 = *__SIMD32(pSrc)++;
+ /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
+
+ /* Calculate Complex Conjugate and store result in destination buffer. */
+
+ #if defined (ARM_MATH_DSP)
+ in1 = read_q15x2_ia ((q15_t **) &pSrc);
+ in2 = read_q15x2_ia ((q15_t **) &pSrc);
+ in3 = read_q15x2_ia ((q15_t **) &pSrc);
+ in4 = read_q15x2_ia ((q15_t **) &pSrc);
#ifndef ARM_MATH_BIG_ENDIAN
-
- in1 = __QASX(zero, in1);
- in2 = __QASX(zero, in2);
- in3 = __QASX(zero, in3);
- in4 = __QASX(zero, in4);
-
+ in1 = __QASX(0, in1);
+ in2 = __QASX(0, in2);
+ in3 = __QASX(0, in3);
+ in4 = __QASX(0, in4);
#else
-
- in1 = __QSAX(zero, in1);
- in2 = __QSAX(zero, in2);
- in3 = __QSAX(zero, in3);
- in4 = __QSAX(zero, in4);
-
+ in1 = __QSAX(0, in1);
+ in2 = __QSAX(0, in2);
+ in3 = __QSAX(0, in3);
+ in4 = __QSAX(0, in4);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
in1 = ((uint32_t) in1 >> 16) | ((uint32_t) in1 << 16);
@@ -98,52 +96,62 @@ void arm_cmplx_conj_q15(
in3 = ((uint32_t) in3 >> 16) | ((uint32_t) in3 << 16);
in4 = ((uint32_t) in4 >> 16) | ((uint32_t) in4 << 16);
- *__SIMD32(pDst)++ = in1;
- *__SIMD32(pDst)++ = in2;
- *__SIMD32(pDst)++ = in3;
- *__SIMD32(pDst)++ = in4;
+ write_q15x2_ia (&pDst, in1);
+ write_q15x2_ia (&pDst, in2);
+ write_q15x2_ia (&pDst, in3);
+ write_q15x2_ia (&pDst, in4);
+#else
+ *pDst++ = *pSrc++;
+ in1 = *pSrc++;
+ *pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
- /* Decrement the loop counter */
+ *pDst++ = *pSrc++;
+ in1 = *pSrc++;
+ *pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
+
+ *pDst++ = *pSrc++;
+ in1 = *pSrc++;
+ *pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
+
+ *pDst++ = *pSrc++;
+ in1 = *pSrc++;
+ *pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
- while (blkCnt > 0U)
- {
- /* C[0]+jC[1] = A[0]+ j (-1) A[1] */
- /* Calculate Complex Conjugate and then store the results in the destination buffer. */
- *pDst++ = *pSrc++;
- *pDst++ = __SSAT(-*pSrc++, 16);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
#else
- q15_t in;
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
- /* Run the below code for Cortex-M0 */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
- while (numSamples > 0U)
+ while (blkCnt > 0U)
{
- /* realOut + j (imagOut) = realIn+ j (-1) imagIn */
- /* Calculate Complex Conjugate and then store the results in the destination buffer. */
- *pDst++ = *pSrc++;
- in = *pSrc++;
- *pDst++ = (in == (q15_t) 0x8000) ? 0x7fff : -in;
+ /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
- /* Decrement the loop counter */
- numSamples--;
+ /* Calculate Complex Conjugate and store result in destination buffer. */
+ *pDst++ = *pSrc++;
+ in1 = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = __SSAT(-in1, 16);
+#else
+ *pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
+#endif
+
+ /* Decrement loop counter */
+ blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of cmplx_conj group
+ @} end of cmplx_conj group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
index 709ce0e..6ef1ddb 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_conj_q31.c
* Description: Q31 complex conjugate
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,141 +29,109 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @addtogroup cmplx_conj
- * @{
+ @addtogroup cmplx_conj
+ @{
*/
/**
- * @brief Q31 complex conjugate.
- * @param *pSrc points to the input vector
- * @param *pDst points to the output vector
- * @param numSamples number of complex samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
+ @brief Q31 complex conjugate.
+ @param[in] pSrc points to the input vector
+ @param[out] pDst points to the output vector
+ @param[in] numSamples number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
*/
void arm_cmplx_conj_q31(
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t numSamples)
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t numSamples)
{
- uint32_t blkCnt; /* loop counter */
- q31_t in; /* Input value */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t in; /* Temporary input variable */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- q31_t inR1, inR2, inR3, inR4; /* Temporary real variables */
- q31_t inI1, inI2, inI3, inI4; /* Temporary imaginary variables */
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C[0]+jC[1] = A[0]+ j (-1) A[1] */
- /* Calculate Complex Conjugate and then store the results in the destination buffer. */
- /* Saturated to 0x7fffffff if the input is -1(0x80000000) */
- /* read real input sample */
- inR1 = pSrc[0];
- /* store real input sample */
- pDst[0] = inR1;
+ /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
- /* read imaginary input sample */
- inI1 = pSrc[1];
+ /* Calculate Complex Conjugate and store result in destination buffer. */
+ *pDst++ = *pSrc++;
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = __QSUB(0, in);
+#else
+ *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
- /* read real input sample */
- inR2 = pSrc[2];
- /* store real input sample */
- pDst[2] = inR2;
+ *pDst++ = *pSrc++;
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = __QSUB(0, in);
+#else
+ *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
- /* read imaginary input sample */
- inI2 = pSrc[3];
+ *pDst++ = *pSrc++;
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = __QSUB(0, in);
+#else
+ *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
- /* negate imaginary input sample */
- inI1 = __QSUB(0, inI1);
+ *pDst++ = *pSrc++;
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = __QSUB(0, in);
+#else
+ *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
- /* read real input sample */
- inR3 = pSrc[4];
- /* store real input sample */
- pDst[4] = inR3;
-
- /* read imaginary input sample */
- inI3 = pSrc[5];
-
- /* negate imaginary input sample */
- inI2 = __QSUB(0, inI2);
-
- /* read real input sample */
- inR4 = pSrc[6];
- /* store real input sample */
- pDst[6] = inR4;
-
- /* negate imaginary input sample */
- inI3 = __QSUB(0, inI3);
-
- /* store imaginary input sample */
- inI4 = pSrc[7];
-
- /* store imaginary input samples */
- pDst[1] = inI1;
-
- /* negate imaginary input sample */
- inI4 = __QSUB(0, inI4);
-
- /* store imaginary input samples */
- pDst[3] = inI2;
-
- /* increment source pointer by 8 to proecess next samples */
- pSrc += 8U;
-
- /* store imaginary input samples */
- pDst[5] = inI3;
- pDst[7] = inI4;
-
- /* increment destination pointer by 8 to process next samples */
- pDst += 8U;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
blkCnt = numSamples;
-
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
- /* C[0]+jC[1] = A[0]+ j (-1) A[1] */
- /* Calculate Complex Conjugate and then store the results in the destination buffer. */
- /* Saturated to 0x7fffffff if the input is -1(0x80000000) */
- *pDst++ = *pSrc++;
- in = *pSrc++;
- *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+ /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
- /* Decrement the loop counter */
+ /* Calculate Complex Conjugate and store result in destination buffer. */
+ *pDst++ = *pSrc++;
+ in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+ *pDst++ = __QSUB(0, in);
+#else
+ *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
+
}
/**
- * @} end of cmplx_conj group
+ @} end of cmplx_conj group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
index bfc352b..06f1bfa 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_dot_prod_f32.c
* Description: Floating-point complex dot product
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,163 +29,205 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @defgroup cmplx_dot_prod Complex Dot Product
- *
- * Computes the dot product of two complex vectors.
- * The vectors are multiplied element-by-element and then summed.
- *
- * The pSrcA points to the first complex input vector and
- * pSrcB points to the second complex input vector.
- * numSamples specifies the number of complex samples
- * and the data in each array is stored in an interleaved fashion
- * (real, imag, real, imag, ...).
- * Each array has a total of 2*numSamples values.
- *
- * The underlying algorithm is used:
- *
- * realResult=0;
- * imagResult=0;
- * for(n=0; n
- *
- * There are separate functions for floating-point, Q15, and Q31 data types.
+ @defgroup cmplx_dot_prod Complex Dot Product
+
+ Computes the dot product of two complex vectors.
+ The vectors are multiplied element-by-element and then summed.
+
+ The pSrcA points to the first complex input vector and
+ pSrcB points to the second complex input vector.
+ numSamples specifies the number of complex samples
+ and the data in each array is stored in an interleaved fashion
+ (real, imag, real, imag, ...).
+ Each array has a total of 2*numSamples values.
+
+ The underlying algorithm is used:
+
+
+
+ There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
- * @addtogroup cmplx_dot_prod
- * @{
+ @addtogroup cmplx_dot_prod
+ @{
*/
/**
- * @brief Floating-point complex dot product
- * @param *pSrcA points to the first input vector
- * @param *pSrcB points to the second input vector
- * @param numSamples number of complex samples in each vector
- * @param *realResult real part of the result returned here
- * @param *imagResult imaginary part of the result returned here
- * @return none.
+ @brief Floating-point complex dot product.
+ @param[in] pSrcA points to the first input vector
+ @param[in] pSrcB points to the second input vector
+ @param[in] numSamples number of samples in each vector
+ @param[out] realResult real part of the result returned here
+ @param[out] imagResult imaginary part of the result returned here
+ @return none
*/
void arm_cmplx_dot_prod_f32(
- float32_t * pSrcA,
- float32_t * pSrcB,
- uint32_t numSamples,
- float32_t * realResult,
- float32_t * imagResult)
+ const float32_t * pSrcA,
+ const float32_t * pSrcB,
+ uint32_t numSamples,
+ float32_t * realResult,
+ float32_t * imagResult)
{
- float32_t real_sum = 0.0f, imag_sum = 0.0f; /* Temporary result storage */
- float32_t a0,b0,c0,d0;
+ uint32_t blkCnt; /* Loop counter */
+ float32_t real_sum = 0.0f, imag_sum = 0.0f; /* Temporary result variables */
+ float32_t a0,b0,c0,d0;
-#if defined (ARM_MATH_DSP)
+#if defined(ARM_MATH_NEON)
+ float32x4x2_t vec1,vec2,vec3,vec4;
+ float32x4_t accR,accI;
+ float32x2_t accum = vdup_n_f32(0);
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- uint32_t blkCnt; /* loop counter */
+ accR = vdupq_n_f32(0.0);
+ accI = vdupq_n_f32(0.0);
- /*loop Unrolling */
+ /* Loop unrolling: Compute 8 outputs at a time */
+ blkCnt = numSamples >> 3U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0]+jA[1])*(B[0]+jB[1]) + ... */
+ /* Calculate dot product and then store the result in a temporary buffer. */
+
+ vec1 = vld2q_f32(pSrcA);
+ vec2 = vld2q_f32(pSrcB);
+
+ /* Increment pointers */
+ pSrcA += 8;
+ pSrcB += 8;
+
+ /* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
+ accR = vmlaq_f32(accR,vec1.val[0],vec2.val[0]);
+ accR = vmlsq_f32(accR,vec1.val[1],vec2.val[1]);
+
+ /* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
+ accI = vmlaq_f32(accI,vec1.val[1],vec2.val[0]);
+ accI = vmlaq_f32(accI,vec1.val[0],vec2.val[1]);
+
+ vec3 = vld2q_f32(pSrcA);
+ vec4 = vld2q_f32(pSrcB);
+
+ /* Increment pointers */
+ pSrcA += 8;
+ pSrcB += 8;
+
+ /* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
+ accR = vmlaq_f32(accR,vec3.val[0],vec4.val[0]);
+ accR = vmlsq_f32(accR,vec3.val[1],vec4.val[1]);
+
+ /* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
+ accI = vmlaq_f32(accI,vec3.val[1],vec4.val[0]);
+ accI = vmlaq_f32(accI,vec3.val[0],vec4.val[1]);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ accum = vpadd_f32(vget_low_f32(accR), vget_high_f32(accR));
+ real_sum += accum[0] + accum[1];
+
+ accum = vpadd_f32(vget_low_f32(accI), vget_high_f32(accI));
+ imag_sum += accum[0] + accum[1];
+
+ /* Tail */
+ blkCnt = numSamples & 0x7;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
+ a0 = *pSrcA++;
+ b0 = *pSrcA++;
+ c0 = *pSrcB++;
+ d0 = *pSrcB++;
- real_sum += a0 * c0;
- imag_sum += a0 * d0;
- real_sum -= b0 * d0;
- imag_sum += b0 * c0;
+ real_sum += a0 * c0;
+ imag_sum += a0 * d0;
+ real_sum -= b0 * d0;
+ imag_sum += b0 * c0;
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
+ a0 = *pSrcA++;
+ b0 = *pSrcA++;
+ c0 = *pSrcB++;
+ d0 = *pSrcB++;
- real_sum += a0 * c0;
- imag_sum += a0 * d0;
- real_sum -= b0 * d0;
- imag_sum += b0 * c0;
+ real_sum += a0 * c0;
+ imag_sum += a0 * d0;
+ real_sum -= b0 * d0;
+ imag_sum += b0 * c0;
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
+ a0 = *pSrcA++;
+ b0 = *pSrcA++;
+ c0 = *pSrcB++;
+ d0 = *pSrcB++;
- real_sum += a0 * c0;
- imag_sum += a0 * d0;
- real_sum -= b0 * d0;
- imag_sum += b0 * c0;
+ real_sum += a0 * c0;
+ imag_sum += a0 * d0;
+ real_sum -= b0 * d0;
+ imag_sum += b0 * c0;
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
+ a0 = *pSrcA++;
+ b0 = *pSrcA++;
+ c0 = *pSrcB++;
+ d0 = *pSrcB++;
- real_sum += a0 * c0;
- imag_sum += a0 * d0;
- real_sum -= b0 * d0;
- imag_sum += b0 * c0;
+ real_sum += a0 * c0;
+ imag_sum += a0 * d0;
+ real_sum -= b0 * d0;
+ imag_sum += b0 * c0;
- /* Decrement the loop counter */
- blkCnt--;
+ /* Decrement loop counter */
+ blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = numSamples & 0x3U;
-
- while (blkCnt > 0U)
- {
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
-
- real_sum += a0 * c0;
- imag_sum += a0 * d0;
- real_sum -= b0 * d0;
- imag_sum += b0 * c0;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
- while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
+
+ while (blkCnt > 0U)
{
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
+ a0 = *pSrcA++;
+ b0 = *pSrcA++;
+ c0 = *pSrcB++;
+ d0 = *pSrcB++;
- real_sum += a0 * c0;
- imag_sum += a0 * d0;
- real_sum -= b0 * d0;
- imag_sum += b0 * c0;
+ real_sum += a0 * c0;
+ imag_sum += a0 * d0;
+ real_sum -= b0 * d0;
+ imag_sum += b0 * c0;
- /* Decrement the loop counter */
- numSamples--;
+ /* Decrement loop counter */
+ blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
- /* Store the real and imaginary results in the destination buffers */
+ /* Store real and imaginary result in destination buffer. */
*realResult = real_sum;
*imagResult = imag_sum;
}
/**
- * @} end of cmplx_dot_prod group
+ @} end of cmplx_dot_prod group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
index 9e23a01..2ecd801 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_dot_prod_q15.c
* Description: Processing function for the Q15 Complex Dot product
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,143 +29,120 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @addtogroup cmplx_dot_prod
- * @{
+ @addtogroup cmplx_dot_prod
+ @{
*/
/**
- * @brief Q15 complex dot product
- * @param *pSrcA points to the first input vector
- * @param *pSrcB points to the second input vector
- * @param numSamples number of complex samples in each vector
- * @param *realResult real part of the result returned here
- * @param *imagResult imaginary part of the result returned here
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The intermediate 1.15 by 1.15 multiplications are performed with full precision and yield a 2.30 result.
- * These are accumulated in a 64-bit accumulator with 34.30 precision.
- * As a final step, the accumulators are converted to 8.24 format.
- * The return results realResult and imagResult are in 8.24 format.
+ @brief Q15 complex dot product.
+ @param[in] pSrcA points to the first input vector
+ @param[in] pSrcB points to the second input vector
+ @param[in] numSamples number of samples in each vector
+ @param[out] realResult real part of the result returned here
+ @param[out] imagResult imaginary part of the result returned her
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The intermediate 1.15 by 1.15 multiplications are performed with full precision and yield a 2.30 result.
+ These are accumulated in a 64-bit accumulator with 34.30 precision.
+ As a final step, the accumulators are converted to 8.24 format.
+ The return results realResult and imagResult are in 8.24 format.
*/
void arm_cmplx_dot_prod_q15(
- q15_t * pSrcA,
- q15_t * pSrcB,
- uint32_t numSamples,
- q31_t * realResult,
- q31_t * imagResult)
+ const q15_t * pSrcA,
+ const q15_t * pSrcB,
+ uint32_t numSamples,
+ q31_t * realResult,
+ q31_t * imagResult)
{
- q63_t real_sum = 0, imag_sum = 0; /* Temporary result storage */
- q15_t a0,b0,c0,d0;
+ uint32_t blkCnt; /* Loop counter */
+ q63_t real_sum = 0, imag_sum = 0; /* Temporary result variables */
+ q15_t a0,b0,c0,d0;
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- uint32_t blkCnt; /* loop counter */
-
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
+ a0 = *pSrcA++;
+ b0 = *pSrcA++;
+ c0 = *pSrcB++;
+ d0 = *pSrcB++;
- real_sum += (q31_t)a0 * c0;
- imag_sum += (q31_t)a0 * d0;
- real_sum -= (q31_t)b0 * d0;
- imag_sum += (q31_t)b0 * c0;
+ real_sum += (q31_t)a0 * c0;
+ imag_sum += (q31_t)a0 * d0;
+ real_sum -= (q31_t)b0 * d0;
+ imag_sum += (q31_t)b0 * c0;
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
+ a0 = *pSrcA++;
+ b0 = *pSrcA++;
+ c0 = *pSrcB++;
+ d0 = *pSrcB++;
- real_sum += (q31_t)a0 * c0;
- imag_sum += (q31_t)a0 * d0;
- real_sum -= (q31_t)b0 * d0;
- imag_sum += (q31_t)b0 * c0;
+ real_sum += (q31_t)a0 * c0;
+ imag_sum += (q31_t)a0 * d0;
+ real_sum -= (q31_t)b0 * d0;
+ imag_sum += (q31_t)b0 * c0;
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
+ a0 = *pSrcA++;
+ b0 = *pSrcA++;
+ c0 = *pSrcB++;
+ d0 = *pSrcB++;
- real_sum += (q31_t)a0 * c0;
- imag_sum += (q31_t)a0 * d0;
- real_sum -= (q31_t)b0 * d0;
- imag_sum += (q31_t)b0 * c0;
+ real_sum += (q31_t)a0 * c0;
+ imag_sum += (q31_t)a0 * d0;
+ real_sum -= (q31_t)b0 * d0;
+ imag_sum += (q31_t)b0 * c0;
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
+ a0 = *pSrcA++;
+ b0 = *pSrcA++;
+ c0 = *pSrcB++;
+ d0 = *pSrcB++;
- real_sum += (q31_t)a0 * c0;
- imag_sum += (q31_t)a0 * d0;
- real_sum -= (q31_t)b0 * d0;
- imag_sum += (q31_t)b0 * c0;
+ real_sum += (q31_t)a0 * c0;
+ imag_sum += (q31_t)a0 * d0;
+ real_sum -= (q31_t)b0 * d0;
+ imag_sum += (q31_t)b0 * c0;
- /* Decrement the loop counter */
- blkCnt--;
+ /* Decrement loop counter */
+ blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
- while (blkCnt > 0U)
- {
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
-
- real_sum += (q31_t)a0 * c0;
- imag_sum += (q31_t)a0 * d0;
- real_sum -= (q31_t)b0 * d0;
- imag_sum += (q31_t)b0 * c0;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
- while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
{
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
+ a0 = *pSrcA++;
+ b0 = *pSrcA++;
+ c0 = *pSrcB++;
+ d0 = *pSrcB++;
- real_sum += a0 * c0;
- imag_sum += a0 * d0;
- real_sum -= b0 * d0;
- imag_sum += b0 * c0;
+ real_sum += (q31_t)a0 * c0;
+ imag_sum += (q31_t)a0 * d0;
+ real_sum -= (q31_t)b0 * d0;
+ imag_sum += (q31_t)b0 * c0;
-
- /* Decrement the loop counter */
- numSamples--;
+ /* Decrement loop counter */
+ blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
- /* Store the real and imaginary results in 8.24 format */
+ /* Store real and imaginary result in 8.24 format */
/* Convert real data in 34.30 to 8.24 by 6 right shifts */
*realResult = (q31_t) (real_sum >> 6);
/* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
@@ -173,5 +150,5 @@ void arm_cmplx_dot_prod_q15(
}
/**
- * @} end of cmplx_dot_prod group
+ @} end of cmplx_dot_prod group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
index 6eb5b6e..d715d98 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_dot_prod_q31.c
* Description: Q31 complex dot product
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,147 +29,125 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @addtogroup cmplx_dot_prod
- * @{
+ @addtogroup cmplx_dot_prod
+ @{
*/
/**
- * @brief Q31 complex dot product
- * @param *pSrcA points to the first input vector
- * @param *pSrcB points to the second input vector
- * @param numSamples number of complex samples in each vector
- * @param *realResult real part of the result returned here
- * @param *imagResult imaginary part of the result returned here
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
- * The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
- * Additions are nonsaturating and no overflow will occur as long as numSamples is less than 32768.
- * The return results realResult and imagResult are in 16.48 format.
- * Input down scaling is not required.
+ @brief Q31 complex dot product.
+ @param[in] pSrcA points to the first input vector
+ @param[in] pSrcB points to the second input vector
+ @param[in] numSamples number of samples in each vector
+ @param[out] realResult real part of the result returned here
+ @param[out] imagResult imaginary part of the result returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
+ The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
+ Additions are nonsaturating and no overflow will occur as long as numSamples is less than 32768.
+ The return results realResult and imagResult are in 16.48 format.
+ Input down scaling is not required.
*/
void arm_cmplx_dot_prod_q31(
- q31_t * pSrcA,
- q31_t * pSrcB,
- uint32_t numSamples,
- q63_t * realResult,
- q63_t * imagResult)
+ const q31_t * pSrcA,
+ const q31_t * pSrcB,
+ uint32_t numSamples,
+ q63_t * realResult,
+ q63_t * imagResult)
{
- q63_t real_sum = 0, imag_sum = 0; /* Temporary result storage */
- q31_t a0,b0,c0,d0;
+ uint32_t blkCnt; /* Loop counter */
+ q63_t real_sum = 0, imag_sum = 0; /* Temporary result variables */
+ q31_t a0,b0,c0,d0;
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- uint32_t blkCnt; /* loop counter */
-
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
+ a0 = *pSrcA++;
+ b0 = *pSrcA++;
+ c0 = *pSrcB++;
+ d0 = *pSrcB++;
- real_sum += ((q63_t)a0 * c0) >> 14;
- imag_sum += ((q63_t)a0 * d0) >> 14;
- real_sum -= ((q63_t)b0 * d0) >> 14;
- imag_sum += ((q63_t)b0 * c0) >> 14;
+ real_sum += ((q63_t)a0 * c0) >> 14;
+ imag_sum += ((q63_t)a0 * d0) >> 14;
+ real_sum -= ((q63_t)b0 * d0) >> 14;
+ imag_sum += ((q63_t)b0 * c0) >> 14;
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
+ a0 = *pSrcA++;
+ b0 = *pSrcA++;
+ c0 = *pSrcB++;
+ d0 = *pSrcB++;
- real_sum += ((q63_t)a0 * c0) >> 14;
- imag_sum += ((q63_t)a0 * d0) >> 14;
- real_sum -= ((q63_t)b0 * d0) >> 14;
- imag_sum += ((q63_t)b0 * c0) >> 14;
+ real_sum += ((q63_t)a0 * c0) >> 14;
+ imag_sum += ((q63_t)a0 * d0) >> 14;
+ real_sum -= ((q63_t)b0 * d0) >> 14;
+ imag_sum += ((q63_t)b0 * c0) >> 14;
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
+ a0 = *pSrcA++;
+ b0 = *pSrcA++;
+ c0 = *pSrcB++;
+ d0 = *pSrcB++;
- real_sum += ((q63_t)a0 * c0) >> 14;
- imag_sum += ((q63_t)a0 * d0) >> 14;
- real_sum -= ((q63_t)b0 * d0) >> 14;
- imag_sum += ((q63_t)b0 * c0) >> 14;
+ real_sum += ((q63_t)a0 * c0) >> 14;
+ imag_sum += ((q63_t)a0 * d0) >> 14;
+ real_sum -= ((q63_t)b0 * d0) >> 14;
+ imag_sum += ((q63_t)b0 * c0) >> 14;
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
+ a0 = *pSrcA++;
+ b0 = *pSrcA++;
+ c0 = *pSrcB++;
+ d0 = *pSrcB++;
- real_sum += ((q63_t)a0 * c0) >> 14;
- imag_sum += ((q63_t)a0 * d0) >> 14;
- real_sum -= ((q63_t)b0 * d0) >> 14;
- imag_sum += ((q63_t)b0 * c0) >> 14;
+ real_sum += ((q63_t)a0 * c0) >> 14;
+ imag_sum += ((q63_t)a0 * d0) >> 14;
+ real_sum -= ((q63_t)b0 * d0) >> 14;
+ imag_sum += ((q63_t)b0 * c0) >> 14;
- /* Decrement the loop counter */
- blkCnt--;
+ /* Decrement loop counter */
+ blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
- while (blkCnt > 0U)
- {
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
-
- real_sum += ((q63_t)a0 * c0) >> 14;
- imag_sum += ((q63_t)a0 * d0) >> 14;
- real_sum -= ((q63_t)b0 * d0) >> 14;
- imag_sum += ((q63_t)b0 * c0) >> 14;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
- while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
{
- a0 = *pSrcA++;
- b0 = *pSrcA++;
- c0 = *pSrcB++;
- d0 = *pSrcB++;
+ a0 = *pSrcA++;
+ b0 = *pSrcA++;
+ c0 = *pSrcB++;
+ d0 = *pSrcB++;
- real_sum += ((q63_t)a0 * c0) >> 14;
- imag_sum += ((q63_t)a0 * d0) >> 14;
- real_sum -= ((q63_t)b0 * d0) >> 14;
- imag_sum += ((q63_t)b0 * c0) >> 14;
+ real_sum += ((q63_t)a0 * c0) >> 14;
+ imag_sum += ((q63_t)a0 * d0) >> 14;
+ real_sum -= ((q63_t)b0 * d0) >> 14;
+ imag_sum += ((q63_t)b0 * c0) >> 14;
- /* Decrement the loop counter */
- numSamples--;
+ /* Decrement loop counter */
+ blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
- /* Store the real and imaginary results in 16.48 format */
+ /* Store real and imaginary result in 16.48 format */
*realResult = real_sum;
*imagResult = imag_sum;
}
/**
- * @} end of cmplx_dot_prod group
+ @} end of cmplx_dot_prod group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
index 95aaf1e..84812dc 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_f32.c
* Description: Floating-point complex magnitude
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,125 +29,160 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @defgroup cmplx_mag Complex Magnitude
- *
- * Computes the magnitude of the elements of a complex data vector.
- *
- * The pSrc points to the source data and
- * pDst points to the where the result should be written.
- * numSamples specifies the number of complex samples
- * in the input array and the data is stored in an interleaved fashion
- * (real, imag, real, imag, ...).
- * The input array has a total of 2*numSamples values;
- * the output array has a total of numSamples values.
- * The underlying algorithm is used:
- *
- *
- * for(n=0; n
- *
- * There are separate functions for floating-point, Q15, and Q31 data types.
+ @defgroup cmplx_mag Complex Magnitude
+
+ Computes the magnitude of the elements of a complex data vector.
+
+ The pSrc points to the source data and
+ pDst points to the where the result should be written.
+ numSamples specifies the number of complex samples
+ in the input array and the data is stored in an interleaved fashion
+ (real, imag, real, imag, ...).
+ The input array has a total of 2*numSamples values;
+ the output array has a total of numSamples values.
+
+ The underlying algorithm is used:
+
+
+ for (n = 0; n < numSamples; n++) {
+ pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
+ }
+
+
+ There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
- * @addtogroup cmplx_mag
- * @{
- */
-/**
- * @brief Floating-point complex magnitude.
- * @param[in] *pSrc points to complex input buffer
- * @param[out] *pDst points to real output buffer
- * @param[in] numSamples number of complex samples in the input vector
- * @return none.
- *
+ @addtogroup cmplx_mag
+ @{
*/
+/**
+ @brief Floating-point complex magnitude.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] numSamples number of samples in each vector
+ @return none
+ */
void arm_cmplx_mag_f32(
- float32_t * pSrc,
- float32_t * pDst,
- uint32_t numSamples)
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t numSamples)
{
- float32_t realIn, imagIn; /* Temporary variables to hold input values */
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counter */
+ float32_t real, imag; /* Temporary variables to hold input values */
- /*loop Unrolling */
- blkCnt = numSamples >> 2U;
+#if defined(ARM_MATH_NEON)
+
+ float32x4x2_t vecA;
+ float32x4_t vRealA;
+ float32x4_t vImagA;
+ float32x4_t vMagSqA;
+
+ float32x4x2_t vecB;
+ float32x4_t vRealB;
+ float32x4_t vImagB;
+ float32x4_t vMagSqB;
+
+ /* Loop unrolling: Compute 8 outputs at a time */
+ blkCnt = numSamples >> 3;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
+ /* out = sqrt((real * real) + (imag * imag)) */
- /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
- realIn = *pSrc++;
- imagIn = *pSrc++;
- /* store the result in the destination buffer. */
- arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
+ vecA = vld2q_f32(pSrc);
+ pSrc += 8;
- realIn = *pSrc++;
- imagIn = *pSrc++;
- arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
+ vecB = vld2q_f32(pSrc);
+ pSrc += 8;
- realIn = *pSrc++;
- imagIn = *pSrc++;
- arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
+ vRealA = vmulq_f32(vecA.val[0], vecA.val[0]);
+ vImagA = vmulq_f32(vecA.val[1], vecA.val[1]);
+ vMagSqA = vaddq_f32(vRealA, vImagA);
- realIn = *pSrc++;
- imagIn = *pSrc++;
- arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
+ vRealB = vmulq_f32(vecB.val[0], vecB.val[0]);
+ vImagB = vmulq_f32(vecB.val[1], vecB.val[1]);
+ vMagSqB = vaddq_f32(vRealB, vImagB);
+ /* Store the result in the destination buffer. */
+ vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(vMagSqA));
+ pDst += 4;
+
+ vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(vMagSqB));
+ pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = numSamples % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
- realIn = *pSrc++;
- imagIn = *pSrc++;
- /* store the result in the destination buffer. */
- arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
+ blkCnt = numSamples & 7;
#else
- /* Run the below code for Cortex-M0 */
+#if defined (ARM_MATH_LOOPUNROLL)
- while (numSamples > 0U)
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+
+ while (blkCnt > 0U)
{
- /* out = sqrt((real * real) + (imag * imag)) */
- realIn = *pSrc++;
- imagIn = *pSrc++;
- /* store the result in the destination buffer. */
- arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
+ /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
- /* Decrement the loop counter */
- numSamples--;
+ real = *pSrc++;
+ imag = *pSrc++;
+
+ /* store result in destination buffer. */
+ arm_sqrt_f32((real * real) + (imag * imag), pDst++);
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ arm_sqrt_f32((real * real) + (imag * imag), pDst++);
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ arm_sqrt_f32((real * real) + (imag * imag), pDst++);
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ arm_sqrt_f32((real * real) + (imag * imag), pDst++);
+
+ /* Decrement loop counter */
+ blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
+
+ while (blkCnt > 0U)
+ {
+ /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
+
+ real = *pSrc++;
+ imag = *pSrc++;
+
+ /* store result in destination buffer. */
+ arm_sqrt_f32((real * real) + (imag * imag), pDst++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
}
/**
- * @} end of cmplx_mag group
+ @} end of cmplx_mag group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
index 03d9b2a..a493274 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_q15.c
* Description: Q15 complex magnitude
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,113 +29,134 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @addtogroup cmplx_mag
- * @{
+ @addtogroup cmplx_mag
+ @{
*/
-
/**
- * @brief Q15 complex magnitude
- * @param *pSrc points to the complex input vector
- * @param *pDst points to the real output vector
- * @param numSamples number of complex samples in the input vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function implements 1.15 by 1.15 multiplications and finally output is converted into 2.14 format.
+ @brief Q15 complex magnitude.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] numSamples number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function implements 1.15 by 1.15 multiplications and finally output is converted into 2.14 format.
*/
void arm_cmplx_mag_q15(
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t numSamples)
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t numSamples)
{
- q31_t acc0, acc1; /* Accumulators */
+ uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
+ q31_t in;
+ q31_t acc0; /* Accumulators */
+#else
+ q15_t real, imag; /* Temporary input variables */
+ q31_t acc0, acc1; /* Accumulators */
+#endif
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- uint32_t blkCnt; /* loop counter */
- q31_t in1, in2, in3, in4;
- q31_t acc2, acc3;
+#if defined (ARM_MATH_LOOPUNROLL)
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while (blkCnt > 0U)
- {
-
- /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
- in1 = *__SIMD32(pSrc)++;
- in2 = *__SIMD32(pSrc)++;
- in3 = *__SIMD32(pSrc)++;
- in4 = *__SIMD32(pSrc)++;
-
- acc0 = __SMUAD(in1, in1);
- acc1 = __SMUAD(in2, in2);
- acc2 = __SMUAD(in3, in3);
- acc3 = __SMUAD(in4, in4);
-
- /* store the result in 2.14 format in the destination buffer. */
- arm_sqrt_q15((q15_t) ((acc0) >> 17), pDst++);
- arm_sqrt_q15((q15_t) ((acc1) >> 17), pDst++);
- arm_sqrt_q15((q15_t) ((acc2) >> 17), pDst++);
- arm_sqrt_q15((q15_t) ((acc3) >> 17), pDst++);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = numSamples % 0x4U;
-
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
- in1 = *__SIMD32(pSrc)++;
- acc0 = __SMUAD(in1, in1);
- /* store the result in 2.14 format in the destination buffer. */
+#if defined (ARM_MATH_DSP)
+ in = read_q15x2_ia ((q15_t **) &pSrc);
+ acc0 = __SMUAD(in, in);
+ /* store result in 2.14 format in destination buffer. */
arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
- /* Decrement the loop counter */
+ in = read_q15x2_ia ((q15_t **) &pSrc);
+ acc0 = __SMUAD(in, in);
+ arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
+
+ in = read_q15x2_ia ((q15_t **) &pSrc);
+ acc0 = __SMUAD(in, in);
+ arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
+
+ in = read_q15x2_ia ((q15_t **) &pSrc);
+ acc0 = __SMUAD(in, in);
+ arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
+#else
+ real = *pSrc++;
+ imag = *pSrc++;
+ acc0 = ((q31_t) real * real);
+ acc1 = ((q31_t) imag * imag);
+
+ /* store result in 2.14 format in destination buffer. */
+ arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ acc0 = ((q31_t) real * real);
+ acc1 = ((q31_t) imag * imag);
+ arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ acc0 = ((q31_t) real * real);
+ acc1 = ((q31_t) imag * imag);
+ arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ acc0 = ((q31_t) real * real);
+ acc1 = ((q31_t) imag * imag);
+ arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
blkCnt--;
}
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
+
#else
- /* Run the below code for Cortex-M0 */
- q15_t real, imag; /* Temporary variables to hold input values */
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
- while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
{
- /* out = sqrt(real * real + imag * imag) */
+ /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
+
+#if defined (ARM_MATH_DSP)
+ in = read_q15x2_ia ((q15_t **) &pSrc);
+ acc0 = __SMUAD(in, in);
+
+ /* store result in 2.14 format in destination buffer. */
+ arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
+#else
real = *pSrc++;
imag = *pSrc++;
+ acc0 = ((q31_t) real * real);
+ acc1 = ((q31_t) imag * imag);
- acc0 = (real * real);
- acc1 = (imag * imag);
-
- /* store the result in 2.14 format in the destination buffer. */
+ /* store result in 2.14 format in destination buffer. */
arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
+#endif
- /* Decrement the loop counter */
- numSamples--;
+ /* Decrement loop counter */
+ blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of cmplx_mag group
+ @} end of cmplx_mag group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
index 830ecb9..873e566 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_q31.c
* Description: Q31 complex magnitude
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,145 +29,102 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @addtogroup cmplx_mag
- * @{
+ @addtogroup cmplx_mag
+ @{
*/
/**
- * @brief Q31 complex magnitude
- * @param *pSrc points to the complex input vector
- * @param *pDst points to the real output vector
- * @param numSamples number of complex samples in the input vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function implements 1.31 by 1.31 multiplications and finally output is converted into 2.30 format.
- * Input down scaling is not required.
+ @brief Q31 complex magnitude.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] numSamples number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function implements 1.31 by 1.31 multiplications and finally output is converted into 2.30 format.
+ Input down scaling is not required.
*/
void arm_cmplx_mag_q31(
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t numSamples)
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t numSamples)
{
- q31_t real, imag; /* Temporary variables to hold input values */
- q31_t acc0, acc1; /* Accumulators */
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t real, imag; /* Temporary input variables */
+ q31_t acc0, acc1; /* Accumulators */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- q31_t real1, real2, imag1, imag2; /* Temporary variables to hold input values */
- q31_t out1, out2, out3, out4; /* Accumulators */
- q63_t mul1, mul2, mul3, mul4; /* Temporary variables */
-
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while (blkCnt > 0U)
- {
- /* read complex input from source buffer */
- real1 = pSrc[0];
- imag1 = pSrc[1];
- real2 = pSrc[2];
- imag2 = pSrc[3];
-
- /* calculate power of input values */
- mul1 = (q63_t) real1 *real1;
- mul2 = (q63_t) imag1 *imag1;
- mul3 = (q63_t) real2 *real2;
- mul4 = (q63_t) imag2 *imag2;
-
- /* get the result to 3.29 format */
- out1 = (q31_t) (mul1 >> 33);
- out2 = (q31_t) (mul2 >> 33);
- out3 = (q31_t) (mul3 >> 33);
- out4 = (q31_t) (mul4 >> 33);
-
- /* add real and imaginary accumulators */
- out1 = out1 + out2;
- out3 = out3 + out4;
-
- /* read complex input from source buffer */
- real1 = pSrc[4];
- imag1 = pSrc[5];
- real2 = pSrc[6];
- imag2 = pSrc[7];
-
- /* calculate square root */
- arm_sqrt_q31(out1, &pDst[0]);
-
- /* calculate power of input values */
- mul1 = (q63_t) real1 *real1;
-
- /* calculate square root */
- arm_sqrt_q31(out3, &pDst[1]);
-
- /* calculate power of input values */
- mul2 = (q63_t) imag1 *imag1;
- mul3 = (q63_t) real2 *real2;
- mul4 = (q63_t) imag2 *imag2;
-
- /* get the result to 3.29 format */
- out1 = (q31_t) (mul1 >> 33);
- out2 = (q31_t) (mul2 >> 33);
- out3 = (q31_t) (mul3 >> 33);
- out4 = (q31_t) (mul4 >> 33);
-
- /* add real and imaginary accumulators */
- out1 = out1 + out2;
- out3 = out3 + out4;
-
- /* calculate square root */
- arm_sqrt_q31(out1, &pDst[2]);
-
- /* increment destination by 8 to process next samples */
- pSrc += 8U;
-
- /* calculate square root */
- arm_sqrt_q31(out3, &pDst[3]);
-
- /* increment destination by 4 to process next samples */
- pDst += 4U;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = numSamples % 0x4U;
-
-#else
-
- /* Run the below code for Cortex-M0 */
- blkCnt = numSamples;
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
+
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
- /* store the result in 2.30 format in the destination buffer. */
+
+ /* store result in 2.30 format in destination buffer. */
arm_sqrt_q31(acc0 + acc1, pDst++);
- /* Decrement the loop counter */
+ real = *pSrc++;
+ imag = *pSrc++;
+ acc0 = (q31_t) (((q63_t) real * real) >> 33);
+ acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
+ arm_sqrt_q31(acc0 + acc1, pDst++);
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ acc0 = (q31_t) (((q63_t) real * real) >> 33);
+ acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
+ arm_sqrt_q31(acc0 + acc1, pDst++);
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ acc0 = (q31_t) (((q63_t) real * real) >> 33);
+ acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
+ arm_sqrt_q31(acc0 + acc1, pDst++);
+
+ /* Decrement loop counter */
blkCnt--;
}
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ acc0 = (q31_t) (((q63_t) real * real) >> 33);
+ acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
+
+ /* store result in 2.30 format in destination buffer. */
+ arm_sqrt_q31(acc0 + acc1, pDst++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
}
/**
- * @} end of cmplx_mag group
+ @} end of cmplx_mag group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
index 59127a2..99f051c 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_squared_f32.c
* Description: Floating-point complex magnitude squared
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,176 +29,156 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @defgroup cmplx_mag_squared Complex Magnitude Squared
- *
- * Computes the magnitude squared of the elements of a complex data vector.
- *
- * The pSrc points to the source data and
- * pDst points to the where the result should be written.
- * numSamples specifies the number of complex samples
- * in the input array and the data is stored in an interleaved fashion
- * (real, imag, real, imag, ...).
- * The input array has a total of 2*numSamples values;
- * the output array has a total of numSamples values.
- *
- * The underlying algorithm is used:
- *
- *
- * for(n=0; n
- *
- * There are separate functions for floating-point, Q15, and Q31 data types.
+ @defgroup cmplx_mag_squared Complex Magnitude Squared
+
+ Computes the magnitude squared of the elements of a complex data vector.
+
+ The pSrc points to the source data and
+ pDst points to the where the result should be written.
+ numSamples specifies the number of complex samples
+ in the input array and the data is stored in an interleaved fashion
+ (real, imag, real, imag, ...).
+ The input array has a total of 2*numSamples values;
+ the output array has a total of numSamples values.
+
+ The underlying algorithm is used:
+
+
+ for (n = 0; n < numSamples; n++) {
+ pDst[n] = pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2;
+ }
+
+
+ There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
- * @addtogroup cmplx_mag_squared
- * @{
+ @addtogroup cmplx_mag_squared
+ @{
*/
-
/**
- * @brief Floating-point complex magnitude squared
- * @param[in] *pSrc points to the complex input vector
- * @param[out] *pDst points to the real output vector
- * @param[in] numSamples number of complex samples in the input vector
- * @return none.
+ @brief Floating-point complex magnitude squared.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] numSamples number of samples in each vector
+ @return none
*/
void arm_cmplx_mag_squared_f32(
- float32_t * pSrc,
- float32_t * pDst,
- uint32_t numSamples)
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t numSamples)
{
- float32_t real, imag; /* Temporary variables to store real and imaginary values */
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ float32_t real, imag; /* Temporary input variables */
-#if defined (ARM_MATH_DSP)
- float32_t real1, real2, real3, real4; /* Temporary variables to hold real values */
- float32_t imag1, imag2, imag3, imag4; /* Temporary variables to hold imaginary values */
- float32_t mul1, mul2, mul3, mul4; /* Temporary variables */
- float32_t mul5, mul6, mul7, mul8; /* Temporary variables */
- float32_t out1, out2, out3, out4; /* Temporary variables to hold output values */
+#if defined(ARM_MATH_NEON)
+ float32x4x2_t vecA;
+ float32x4_t vRealA;
+ float32x4_t vImagA;
+ float32x4_t vMagSqA;
- /*loop Unrolling */
- blkCnt = numSamples >> 2U;
+ float32x4x2_t vecB;
+ float32x4_t vRealB;
+ float32x4_t vImagB;
+ float32x4_t vMagSqB;
+
+ /* Loop unrolling: Compute 8 outputs at a time */
+ blkCnt = numSamples >> 3;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
- /* read real input sample from source buffer */
- real1 = pSrc[0];
- /* read imaginary input sample from source buffer */
- imag1 = pSrc[1];
+ /* out = sqrt((real * real) + (imag * imag)) */
- /* calculate power of real value */
- mul1 = real1 * real1;
+ vecA = vld2q_f32(pSrc);
+ pSrc += 8;
- /* read real input sample from source buffer */
- real2 = pSrc[2];
+ vRealA = vmulq_f32(vecA.val[0], vecA.val[0]);
+ vImagA = vmulq_f32(vecA.val[1], vecA.val[1]);
+ vMagSqA = vaddq_f32(vRealA, vImagA);
- /* calculate power of imaginary value */
- mul2 = imag1 * imag1;
+ vecB = vld2q_f32(pSrc);
+ pSrc += 8;
- /* read imaginary input sample from source buffer */
- imag2 = pSrc[3];
+ vRealB = vmulq_f32(vecB.val[0], vecB.val[0]);
+ vImagB = vmulq_f32(vecB.val[1], vecB.val[1]);
+ vMagSqB = vaddq_f32(vRealB, vImagB);
- /* calculate power of real value */
- mul3 = real2 * real2;
+ /* Store the result in the destination buffer. */
+ vst1q_f32(pDst, vMagSqA);
+ pDst += 4;
- /* read real input sample from source buffer */
- real3 = pSrc[4];
-
- /* calculate power of imaginary value */
- mul4 = imag2 * imag2;
-
- /* read imaginary input sample from source buffer */
- imag3 = pSrc[5];
-
- /* calculate power of real value */
- mul5 = real3 * real3;
- /* calculate power of imaginary value */
- mul6 = imag3 * imag3;
-
- /* read real input sample from source buffer */
- real4 = pSrc[6];
-
- /* accumulate real and imaginary powers */
- out1 = mul1 + mul2;
-
- /* read imaginary input sample from source buffer */
- imag4 = pSrc[7];
-
- /* accumulate real and imaginary powers */
- out2 = mul3 + mul4;
-
- /* calculate power of real value */
- mul7 = real4 * real4;
- /* calculate power of imaginary value */
- mul8 = imag4 * imag4;
-
- /* store output to destination */
- pDst[0] = out1;
-
- /* accumulate real and imaginary powers */
- out3 = mul5 + mul6;
-
- /* store output to destination */
- pDst[1] = out2;
-
- /* accumulate real and imaginary powers */
- out4 = mul7 + mul8;
-
- /* store output to destination */
- pDst[2] = out3;
-
- /* increment destination pointer by 8 to process next samples */
- pSrc += 8U;
-
- /* store output to destination */
- pDst[3] = out4;
-
- /* increment destination pointer by 4 to process next samples */
- pDst += 4U;
+ vst1q_f32(pDst, vMagSqB);
+ pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ blkCnt = numSamples & 7;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ *pDst++ = (real * real) + (imag * imag);
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ *pDst++ = (real * real) + (imag * imag);
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ *pDst++ = (real * real) + (imag * imag);
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ *pDst++ = (real * real) + (imag * imag);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
+ /* Initialize blkCnt with number of samples */
blkCnt = numSamples;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
+
real = *pSrc++;
imag = *pSrc++;
- /* out = (real * real) + (imag * imag) */
- /* store the result in the destination buffer. */
+ /* store result in destination buffer. */
*pDst++ = (real * real) + (imag * imag);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
+
}
/**
- * @} end of cmplx_mag_squared group
+ @} end of cmplx_mag_squared group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
index 3f740c3..fa5f4e6 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_squared_q15.c
* Description: Q15 complex magnitude squared
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,108 +29,133 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @addtogroup cmplx_mag_squared
- * @{
+ @addtogroup cmplx_mag_squared
+ @{
*/
/**
- * @brief Q15 complex magnitude squared
- * @param *pSrc points to the complex input vector
- * @param *pDst points to the real output vector
- * @param numSamples number of complex samples in the input vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
+ @brief Q15 complex magnitude squared.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] numSamples number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
*/
void arm_cmplx_mag_squared_q15(
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t numSamples)
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t numSamples)
{
- q31_t acc0, acc1; /* Accumulators */
+ uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
+ q31_t in;
+ q31_t acc0; /* Accumulators */
+#else
+ q15_t real, imag; /* Temporary input variables */
+ q31_t acc0, acc1; /* Accumulators */
+#endif
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- uint32_t blkCnt; /* loop counter */
- q31_t in1, in2, in3, in4;
- q31_t acc2, acc3;
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
- in1 = *__SIMD32(pSrc)++;
- in2 = *__SIMD32(pSrc)++;
- in3 = *__SIMD32(pSrc)++;
- in4 = *__SIMD32(pSrc)++;
- acc0 = __SMUAD(in1, in1);
- acc1 = __SMUAD(in2, in2);
- acc2 = __SMUAD(in3, in3);
- acc3 = __SMUAD(in4, in4);
-
- /* store the result in 3.13 format in the destination buffer. */
+#if defined (ARM_MATH_DSP)
+ in = read_q15x2_ia ((q15_t **) &pSrc);
+ acc0 = __SMUAD(in, in);
+ /* store result in 3.13 format in destination buffer. */
*pDst++ = (q15_t) (acc0 >> 17);
- *pDst++ = (q15_t) (acc1 >> 17);
- *pDst++ = (q15_t) (acc2 >> 17);
- *pDst++ = (q15_t) (acc3 >> 17);
- /* Decrement the loop counter */
+ in = read_q15x2_ia ((q15_t **) &pSrc);
+ acc0 = __SMUAD(in, in);
+ *pDst++ = (q15_t) (acc0 >> 17);
+
+ in = read_q15x2_ia ((q15_t **) &pSrc);
+ acc0 = __SMUAD(in, in);
+ *pDst++ = (q15_t) (acc0 >> 17);
+
+ in = read_q15x2_ia ((q15_t **) &pSrc);
+ acc0 = __SMUAD(in, in);
+ *pDst++ = (q15_t) (acc0 >> 17);
+#else
+ real = *pSrc++;
+ imag = *pSrc++;
+ acc0 = ((q31_t) real * real);
+ acc1 = ((q31_t) imag * imag);
+ /* store result in 3.13 format in destination buffer. */
+ *pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ acc0 = ((q31_t) real * real);
+ acc1 = ((q31_t) imag * imag);
+ *pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ acc0 = ((q31_t) real * real);
+ acc1 = ((q31_t) imag * imag);
+ *pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ acc0 = ((q31_t) real * real);
+ acc1 = ((q31_t) imag * imag);
+ *pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
- while (blkCnt > 0U)
- {
- /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
- in1 = *__SIMD32(pSrc)++;
- acc0 = __SMUAD(in1, in1);
-
- /* store the result in 3.13 format in the destination buffer. */
- *pDst++ = (q15_t) (acc0 >> 17);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
- q15_t real, imag; /* Temporary variables to store real and imaginary values */
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
- while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
{
- /* out = ((real * real) + (imag * imag)) */
+ /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
+
+#if defined (ARM_MATH_DSP)
+ in = read_q15x2_ia ((q15_t **) &pSrc);
+ acc0 = __SMUAD(in, in);
+
+ /* store result in 3.13 format in destination buffer. */
+ *pDst++ = (q15_t) (acc0 >> 17);
+#else
real = *pSrc++;
imag = *pSrc++;
- acc0 = (real * real);
- acc1 = (imag * imag);
- /* store the result in 3.13 format in the destination buffer. */
+ acc0 = ((q31_t) real * real);
+ acc1 = ((q31_t) imag * imag);
+
+ /* store result in 3.13 format in destination buffer. */
*pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
+#endif
- /* Decrement the loop counter */
- numSamples--;
+ /* Decrement loop counter */
+ blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of cmplx_mag_squared group
+ @} end of cmplx_mag_squared group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
index c2b2c50..54863ef 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_squared_q31.c
* Description: Q31 complex magnitude squared
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,55 +29,43 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @addtogroup cmplx_mag_squared
- * @{
+ @addtogroup cmplx_mag_squared
+ @{
*/
-
/**
- * @brief Q31 complex magnitude squared
- * @param *pSrc points to the complex input vector
- * @param *pDst points to the real output vector
- * @param numSamples number of complex samples in the input vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
- * Input down scaling is not required.
+ @brief Q31 complex magnitude squared.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] numSamples number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
+ Input down scaling is not required.
*/
void arm_cmplx_mag_squared_q31(
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t numSamples)
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t numSamples)
{
- q31_t real, imag; /* Temporary variables to store real and imaginary values */
- q31_t acc0, acc1; /* Accumulators */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t real, imag; /* Temporary input variables */
+ q31_t acc0, acc1; /* Accumulators */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- uint32_t blkCnt; /* loop counter */
-
- /* loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
- real = *pSrc++;
- imag = *pSrc++;
- acc0 = (q31_t) (((q63_t) real * real) >> 33);
- acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
- /* store the result in 3.29 format in the destination buffer. */
- *pDst++ = acc0 + acc1;
real = *pSrc++;
imag = *pSrc++;
@@ -90,60 +78,52 @@ void arm_cmplx_mag_squared_q31(
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
- /* store the result in 3.29 format in the destination buffer. */
*pDst++ = acc0 + acc1;
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
- /* store the result in 3.29 format in the destination buffer. */
*pDst++ = acc0 + acc1;
- /* Decrement the loop counter */
+ real = *pSrc++;
+ imag = *pSrc++;
+ acc0 = (q31_t) (((q63_t) real * real) >> 33);
+ acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
+ *pDst++ = acc0 + acc1;
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
- while (blkCnt > 0U)
- {
- /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
- real = *pSrc++;
- imag = *pSrc++;
- acc0 = (q31_t) (((q63_t) real * real) >> 33);
- acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
- /* store the result in 3.29 format in the destination buffer. */
- *pDst++ = acc0 + acc1;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
- while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
{
- /* out = ((real * real) + (imag * imag)) */
+ /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
+
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
- /* store the result in 3.29 format in the destination buffer. */
+
+ /* store result in 3.29 format in destination buffer. */
*pDst++ = acc0 + acc1;
- /* Decrement the loop counter */
- numSamples--;
+ /* Decrement loop counter */
+ blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of cmplx_mag_squared group
+ @} end of cmplx_mag_squared group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
index 3717591..8d14821 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_cmplx_f32.c
* Description: Floating-point complex-by-complex multiplication
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,168 +29,166 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @defgroup CmplxByCmplxMult Complex-by-Complex Multiplication
- *
- * Multiplies a complex vector by another complex vector and generates a complex result.
- * The data in the complex arrays is stored in an interleaved fashion
- * (real, imag, real, imag, ...).
- * The parameter numSamples represents the number of complex
- * samples processed. The complex arrays have a total of 2*numSamples
- * real values.
- *
- * The underlying algorithm is used:
- *
- *
- * for(n=0; n
- *
- * There are separate functions for floating-point, Q15, and Q31 data types.
+ @defgroup CmplxByCmplxMult Complex-by-Complex Multiplication
+
+ Multiplies a complex vector by another complex vector and generates a complex result.
+ The data in the complex arrays is stored in an interleaved fashion
+ (real, imag, real, imag, ...).
+ The parameter numSamples represents the number of complex
+ samples processed. The complex arrays have a total of 2*numSamples
+ real values.
+
+ The underlying algorithm is used:
+
+
+
+ There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
- * @addtogroup CmplxByCmplxMult
- * @{
+ @addtogroup CmplxByCmplxMult
+ @{
*/
-
/**
- * @brief Floating-point complex-by-complex multiplication
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] numSamples number of complex samples in each vector
- * @return none.
+ @brief Floating-point complex-by-complex multiplication.
+ @param[in] pSrcA points to first input vector
+ @param[in] pSrcB points to second input vector
+ @param[out] pDst points to output vector
+ @param[in] numSamples number of samples in each vector
+ @return none
*/
void arm_cmplx_mult_cmplx_f32(
- float32_t * pSrcA,
- float32_t * pSrcB,
- float32_t * pDst,
- uint32_t numSamples)
+ const float32_t * pSrcA,
+ const float32_t * pSrcB,
+ float32_t * pDst,
+ uint32_t numSamples)
{
- float32_t a1, b1, c1, d1; /* Temporary variables to store real and imaginary values */
- uint32_t blkCnt; /* loop counters */
+ uint32_t blkCnt; /* Loop counter */
+ float32_t a, b, c, d; /* Temporary variables to store real and imaginary values */
-#if defined (ARM_MATH_DSP)
+#if defined(ARM_MATH_NEON)
+ float32x4x2_t va, vb;
+ float32x4_t real, imag;
+ float32x4x2_t outCplx;
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- float32_t a2, b2, c2, d2; /* Temporary variables to store real and imaginary values */
- float32_t acc1, acc2, acc3, acc4;
+ /* Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+ while (blkCnt > 0U)
+ {
+ va = vld2q_f32(pSrcA); // load & separate real/imag pSrcA (de-interleave 2)
+ vb = vld2q_f32(pSrcB); // load & separate real/imag pSrcB
- /* loop Unrolling */
+ /* Increment pointers */
+ pSrcA += 8;
+ pSrcB += 8;
+
+ /* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
+ outCplx.val[0] = vmulq_f32(va.val[0], vb.val[0]);
+ outCplx.val[0] = vmlsq_f32(outCplx.val[0], va.val[1], vb.val[1]);
+
+ /* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
+ outCplx.val[1] = vmulq_f32(va.val[0], vb.val[1]);
+ outCplx.val[1] = vmlaq_f32(outCplx.val[1], va.val[1], vb.val[0]);
+
+ vst2q_f32(pDst, outCplx);
+
+ /* Increment pointer */
+ pDst += 8;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Tail */
+ blkCnt = numSamples & 3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
- /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
- a1 = *pSrcA; /* A[2 * i] */
- c1 = *pSrcB; /* B[2 * i] */
+ /* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
+ /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
- b1 = *(pSrcA + 1); /* A[2 * i + 1] */
- acc1 = a1 * c1; /* acc1 = A[2 * i] * B[2 * i] */
+ a = *pSrcA++;
+ b = *pSrcA++;
+ c = *pSrcB++;
+ d = *pSrcB++;
+ /* store result in destination buffer. */
+ *pDst++ = (a * c) - (b * d);
+ *pDst++ = (a * d) + (b * c);
- a2 = *(pSrcA + 2); /* A[2 * i + 2] */
- acc2 = (b1 * c1); /* acc2 = A[2 * i + 1] * B[2 * i] */
+ a = *pSrcA++;
+ b = *pSrcA++;
+ c = *pSrcB++;
+ d = *pSrcB++;
+ *pDst++ = (a * c) - (b * d);
+ *pDst++ = (a * d) + (b * c);
- d1 = *(pSrcB + 1); /* B[2 * i + 1] */
- c2 = *(pSrcB + 2); /* B[2 * i + 2] */
- acc1 -= b1 * d1; /* acc1 = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1] */
+ a = *pSrcA++;
+ b = *pSrcA++;
+ c = *pSrcB++;
+ d = *pSrcB++;
+ *pDst++ = (a * c) - (b * d);
+ *pDst++ = (a * d) + (b * c);
- d2 = *(pSrcB + 3); /* B[2 * i + 3] */
- acc3 = a2 * c2; /* acc3 = A[2 * i + 2] * B[2 * i + 2] */
+ a = *pSrcA++;
+ b = *pSrcA++;
+ c = *pSrcB++;
+ d = *pSrcB++;
+ *pDst++ = (a * c) - (b * d);
+ *pDst++ = (a * d) + (b * c);
- b2 = *(pSrcA + 3); /* A[2 * i + 3] */
- acc2 += (a1 * d1); /* acc2 = A[2 * i + 1] * B[2 * i] + A[2 * i] * B[2 * i + 1] */
-
- a1 = *(pSrcA + 4); /* A[2 * i + 4] */
- acc4 = (a2 * d2); /* acc4 = A[2 * i + 2] * B[2 * i + 3] */
-
- c1 = *(pSrcB + 4); /* B[2 * i + 4] */
- acc3 -= (b2 * d2); /* acc3 = A[2 * i + 2] * B[2 * i + 2] - A[2 * i + 3] * B[2 * i + 3] */
- *pDst = acc1; /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1] */
-
- b1 = *(pSrcA + 5); /* A[2 * i + 5] */
- acc4 += b2 * c2; /* acc4 = A[2 * i + 2] * B[2 * i + 3] + A[2 * i + 3] * B[2 * i + 2] */
-
- *(pDst + 1) = acc2; /* C[2 * i + 1] = A[2 * i + 1] * B[2 * i] + A[2 * i] * B[2 * i + 1] */
- acc1 = (a1 * c1);
-
- d1 = *(pSrcB + 5);
- acc2 = (b1 * c1);
-
- *(pDst + 2) = acc3;
- *(pDst + 3) = acc4;
-
- a2 = *(pSrcA + 6);
- acc1 -= (b1 * d1);
-
- c2 = *(pSrcB + 6);
- acc2 += (a1 * d1);
-
- b2 = *(pSrcA + 7);
- acc3 = (a2 * c2);
-
- d2 = *(pSrcB + 7);
- acc4 = (b2 * c2);
-
- *(pDst + 4) = acc1;
- pSrcA += 8U;
-
- acc3 -= (b2 * d2);
- acc4 += (a2 * d2);
-
- *(pDst + 5) = acc2;
- pSrcB += 8U;
-
- *(pDst + 6) = acc3;
- *(pDst + 7) = acc4;
-
- pDst += 8U;
-
- /* Decrement the numSamples loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
blkCnt = numSamples;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
- /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
- /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
- a1 = *pSrcA++;
- b1 = *pSrcA++;
- c1 = *pSrcB++;
- d1 = *pSrcB++;
+ /* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
+ /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
- /* store the result in the destination buffer. */
- *pDst++ = (a1 * c1) - (b1 * d1);
- *pDst++ = (a1 * d1) + (b1 * c1);
+ a = *pSrcA++;
+ b = *pSrcA++;
+ c = *pSrcB++;
+ d = *pSrcB++;
- /* Decrement the numSamples loop counter */
+ /* store result in destination buffer. */
+ *pDst++ = (a * c) - (b * d);
+ *pDst++ = (a * d) + (b * c);
+
+ /* Decrement loop counter */
blkCnt--;
}
+
}
/**
- * @} end of CmplxByCmplxMult group
+ @} end of CmplxByCmplxMult group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
index 2869837..6659427 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_cmplx_q15.c
* Description: Q15 complex-by-complex multiplication
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,153 +29,108 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @addtogroup CmplxByCmplxMult
- * @{
+ @addtogroup CmplxByCmplxMult
+ @{
*/
/**
- * @brief Q15 complex-by-complex multiplication
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] numSamples number of complex samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
+ @brief Q15 complex-by-complex multiplication.
+ @param[in] pSrcA points to first input vector
+ @param[in] pSrcB points to second input vector
+ @param[out] pDst points to output vector
+ @param[in] numSamples number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
*/
void arm_cmplx_mult_cmplx_q15(
- q15_t * pSrcA,
- q15_t * pSrcB,
- q15_t * pDst,
- uint32_t numSamples)
+ const q15_t * pSrcA,
+ const q15_t * pSrcB,
+ q15_t * pDst,
+ uint32_t numSamples)
{
- q15_t a, b, c, d; /* Temporary variables to store real and imaginary values */
+ uint32_t blkCnt; /* Loop counter */
+ q15_t a, b, c, d; /* Temporary variables */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- uint32_t blkCnt; /* loop counters */
-
- /* loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
- /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
- a = *pSrcA++;
- b = *pSrcA++;
- c = *pSrcB++;
- d = *pSrcB++;
-
- /* store the result in 3.13 format in the destination buffer. */
- *pDst++ =
- (q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
- /* store the result in 3.13 format in the destination buffer. */
- *pDst++ =
- (q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
+ /* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
+ /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
-
- /* store the result in 3.13 format in the destination buffer. */
- *pDst++ =
- (q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
- /* store the result in 3.13 format in the destination buffer. */
- *pDst++ =
- (q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
+ /* store result in 3.13 format in destination buffer. */
+ *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
+ *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
-
- /* store the result in 3.13 format in the destination buffer. */
- *pDst++ =
- (q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
- /* store the result in 3.13 format in the destination buffer. */
- *pDst++ =
- (q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
+ *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
+ *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
+ *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
+ *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
- /* store the result in 3.13 format in the destination buffer. */
- *pDst++ =
- (q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
- /* store the result in 3.13 format in the destination buffer. */
- *pDst++ =
- (q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
+ a = *pSrcA++;
+ b = *pSrcA++;
+ c = *pSrcB++;
+ d = *pSrcB++;
+ *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
+ *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
- /* Decrement the blockSize loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
- while (blkCnt > 0U)
- {
- /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
- /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
- a = *pSrcA++;
- b = *pSrcA++;
- c = *pSrcB++;
- d = *pSrcB++;
-
- /* store the result in 3.13 format in the destination buffer. */
- *pDst++ =
- (q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
- /* store the result in 3.13 format in the destination buffer. */
- *pDst++ =
- (q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
-
- /* Decrement the blockSize loop counter */
- blkCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
- while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
{
- /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
- /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
+ /* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
+ /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
+
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
- /* store the result in 3.13 format in the destination buffer. */
- *pDst++ =
- (q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
- /* store the result in 3.13 format in the destination buffer. */
- *pDst++ =
- (q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
+ /* store result in 3.13 format in destination buffer. */
+ *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
+ *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
- /* Decrement the blockSize loop counter */
- numSamples--;
+ /* Decrement loop counter */
+ blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of CmplxByCmplxMult group
+ @} end of CmplxByCmplxMult group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
index b01c4f6..f6d6dc6 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_cmplx_q31.c
* Description: Q31 complex-by-complex multiplication
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,286 +29,109 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @addtogroup CmplxByCmplxMult
- * @{
+ @addtogroup CmplxByCmplxMult
+ @{
*/
-
/**
- * @brief Q31 complex-by-complex multiplication
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] numSamples number of complex samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
- * Input down scaling is not required.
+ @brief Q31 complex-by-complex multiplication.
+ @param[in] pSrcA points to first input vector
+ @param[in] pSrcB points to second input vector
+ @param[out] pDst points to output vector
+ @param[in] numSamples number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
+ Input down scaling is not required.
*/
void arm_cmplx_mult_cmplx_q31(
- q31_t * pSrcA,
- q31_t * pSrcB,
- q31_t * pDst,
- uint32_t numSamples)
+ const q31_t * pSrcA,
+ const q31_t * pSrcB,
+ q31_t * pDst,
+ uint32_t numSamples)
{
- q31_t a, b, c, d; /* Temporary variables to store real and imaginary values */
- uint32_t blkCnt; /* loop counters */
- q31_t mul1, mul2, mul3, mul4;
- q31_t out1, out2;
+ uint32_t blkCnt; /* Loop counter */
+ q31_t a, b, c, d; /* Temporary variables */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- /* loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
- /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
- a = *pSrcA++;
- b = *pSrcA++;
- c = *pSrcB++;
- d = *pSrcB++;
-
- mul1 = (q31_t) (((q63_t) a * c) >> 32);
- mul2 = (q31_t) (((q63_t) b * d) >> 32);
- mul3 = (q31_t) (((q63_t) a * d) >> 32);
- mul4 = (q31_t) (((q63_t) b * c) >> 32);
-
- mul1 = (mul1 >> 1);
- mul2 = (mul2 >> 1);
- mul3 = (mul3 >> 1);
- mul4 = (mul4 >> 1);
-
- out1 = mul1 - mul2;
- out2 = mul3 + mul4;
-
- /* store the real result in 3.29 format in the destination buffer. */
- *pDst++ = out1;
- /* store the imag result in 3.29 format in the destination buffer. */
- *pDst++ = out2;
+ /* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
+ /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
-
- mul1 = (q31_t) (((q63_t) a * c) >> 32);
- mul2 = (q31_t) (((q63_t) b * d) >> 32);
- mul3 = (q31_t) (((q63_t) a * d) >> 32);
- mul4 = (q31_t) (((q63_t) b * c) >> 32);
-
- mul1 = (mul1 >> 1);
- mul2 = (mul2 >> 1);
- mul3 = (mul3 >> 1);
- mul4 = (mul4 >> 1);
-
- out1 = mul1 - mul2;
- out2 = mul3 + mul4;
-
- /* store the real result in 3.29 format in the destination buffer. */
- *pDst++ = out1;
- /* store the imag result in 3.29 format in the destination buffer. */
- *pDst++ = out2;
+ /* store result in 3.29 format in destination buffer. */
+ *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
+ *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
-
- mul1 = (q31_t) (((q63_t) a * c) >> 32);
- mul2 = (q31_t) (((q63_t) b * d) >> 32);
- mul3 = (q31_t) (((q63_t) a * d) >> 32);
- mul4 = (q31_t) (((q63_t) b * c) >> 32);
-
- mul1 = (mul1 >> 1);
- mul2 = (mul2 >> 1);
- mul3 = (mul3 >> 1);
- mul4 = (mul4 >> 1);
-
- out1 = mul1 - mul2;
- out2 = mul3 + mul4;
-
- /* store the real result in 3.29 format in the destination buffer. */
- *pDst++ = out1;
- /* store the imag result in 3.29 format in the destination buffer. */
- *pDst++ = out2;
+ *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
+ *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
+ *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
+ *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
- mul1 = (q31_t) (((q63_t) a * c) >> 32);
- mul2 = (q31_t) (((q63_t) b * d) >> 32);
- mul3 = (q31_t) (((q63_t) a * d) >> 32);
- mul4 = (q31_t) (((q63_t) b * c) >> 32);
+ a = *pSrcA++;
+ b = *pSrcA++;
+ c = *pSrcB++;
+ d = *pSrcB++;
+ *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
+ *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
- mul1 = (mul1 >> 1);
- mul2 = (mul2 >> 1);
- mul3 = (mul3 >> 1);
- mul4 = (mul4 >> 1);
-
- out1 = mul1 - mul2;
- out2 = mul3 + mul4;
-
- /* store the real result in 3.29 format in the destination buffer. */
- *pDst++ = out1;
- /* store the imag result in 3.29 format in the destination buffer. */
- *pDst++ = out2;
-
- /* Decrement the blockSize loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
- while (blkCnt > 0U)
- {
- /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
- /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
- a = *pSrcA++;
- b = *pSrcA++;
- c = *pSrcB++;
- d = *pSrcB++;
-
- mul1 = (q31_t) (((q63_t) a * c) >> 32);
- mul2 = (q31_t) (((q63_t) b * d) >> 32);
- mul3 = (q31_t) (((q63_t) a * d) >> 32);
- mul4 = (q31_t) (((q63_t) b * c) >> 32);
-
- mul1 = (mul1 >> 1);
- mul2 = (mul2 >> 1);
- mul3 = (mul3 >> 1);
- mul4 = (mul4 >> 1);
-
- out1 = mul1 - mul2;
- out2 = mul3 + mul4;
-
- /* store the real result in 3.29 format in the destination buffer. */
- *pDst++ = out1;
- /* store the imag result in 3.29 format in the destination buffer. */
- *pDst++ = out2;
-
- /* Decrement the blockSize loop counter */
- blkCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
- /* loop Unrolling */
- blkCnt = numSamples >> 1U;
-
- /* First part of the processing with loop unrolling. Compute 2 outputs at a time.
- ** a second loop below computes the remaining 1 sample. */
- while (blkCnt > 0U)
- {
- /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
- /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
- a = *pSrcA++;
- b = *pSrcA++;
- c = *pSrcB++;
- d = *pSrcB++;
-
- mul1 = (q31_t) (((q63_t) a * c) >> 32);
- mul2 = (q31_t) (((q63_t) b * d) >> 32);
- mul3 = (q31_t) (((q63_t) a * d) >> 32);
- mul4 = (q31_t) (((q63_t) b * c) >> 32);
-
- mul1 = (mul1 >> 1);
- mul2 = (mul2 >> 1);
- mul3 = (mul3 >> 1);
- mul4 = (mul4 >> 1);
-
- out1 = mul1 - mul2;
- out2 = mul3 + mul4;
-
- /* store the real result in 3.29 format in the destination buffer. */
- *pDst++ = out1;
- /* store the imag result in 3.29 format in the destination buffer. */
- *pDst++ = out2;
-
- a = *pSrcA++;
- b = *pSrcA++;
- c = *pSrcB++;
- d = *pSrcB++;
-
- mul1 = (q31_t) (((q63_t) a * c) >> 32);
- mul2 = (q31_t) (((q63_t) b * d) >> 32);
- mul3 = (q31_t) (((q63_t) a * d) >> 32);
- mul4 = (q31_t) (((q63_t) b * c) >> 32);
-
- mul1 = (mul1 >> 1);
- mul2 = (mul2 >> 1);
- mul3 = (mul3 >> 1);
- mul4 = (mul4 >> 1);
-
- out1 = mul1 - mul2;
- out2 = mul3 + mul4;
-
- /* store the real result in 3.29 format in the destination buffer. */
- *pDst++ = out1;
- /* store the imag result in 3.29 format in the destination buffer. */
- *pDst++ = out2;
-
- /* Decrement the blockSize loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = numSamples % 0x2U;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
- /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
- /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
+ /* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
+ /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
+
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
- mul1 = (q31_t) (((q63_t) a * c) >> 32);
- mul2 = (q31_t) (((q63_t) b * d) >> 32);
- mul3 = (q31_t) (((q63_t) a * d) >> 32);
- mul4 = (q31_t) (((q63_t) b * c) >> 32);
+ /* store result in 3.29 format in destination buffer. */
+ *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
+ *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
- mul1 = (mul1 >> 1);
- mul2 = (mul2 >> 1);
- mul3 = (mul3 >> 1);
- mul4 = (mul4 >> 1);
-
- out1 = mul1 - mul2;
- out2 = mul3 + mul4;
-
- /* store the real result in 3.29 format in the destination buffer. */
- *pDst++ = out1;
- /* store the imag result in 3.29 format in the destination buffer. */
- *pDst++ = out2;
-
- /* Decrement the blockSize loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of CmplxByCmplxMult group
+ @} end of CmplxByCmplxMult group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
index 8c7ca31..9651999 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_real_f32.c
* Description: Floating-point complex by real multiplication
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,185 +29,141 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @defgroup CmplxByRealMult Complex-by-Real Multiplication
- *
- * Multiplies a complex vector by a real vector and generates a complex result.
- * The data in the complex arrays is stored in an interleaved fashion
- * (real, imag, real, imag, ...).
- * The parameter numSamples represents the number of complex
- * samples processed. The complex arrays have a total of 2*numSamples
- * real values while the real array has a total of numSamples
- * real values.
- *
- * The underlying algorithm is used:
- *
- *
- * for(n=0; n
- *
- * There are separate functions for floating-point, Q15, and Q31 data types.
+ @defgroup CmplxByRealMult Complex-by-Real Multiplication
+
+ Multiplies a complex vector by a real vector and generates a complex result.
+ The data in the complex arrays is stored in an interleaved fashion
+ (real, imag, real, imag, ...).
+ The parameter numSamples represents the number of complex
+ samples processed. The complex arrays have a total of 2*numSamples
+ real values while the real array has a total of numSamples
+ real values.
+
+ The underlying algorithm is used:
+
+
+
+ There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
- * @addtogroup CmplxByRealMult
- * @{
+ @addtogroup CmplxByRealMult
+ @{
*/
-
/**
- * @brief Floating-point complex-by-real multiplication
- * @param[in] *pSrcCmplx points to the complex input vector
- * @param[in] *pSrcReal points to the real input vector
- * @param[out] *pCmplxDst points to the complex output vector
- * @param[in] numSamples number of samples in each vector
- * @return none.
+ @brief Floating-point complex-by-real multiplication.
+ @param[in] pSrcCmplx points to complex input vector
+ @param[in] pSrcReal points to real input vector
+ @param[out] pCmplxDst points to complex output vector
+ @param[in] numSamples number of samples in each vector
+ @return none
*/
void arm_cmplx_mult_real_f32(
- float32_t * pSrcCmplx,
- float32_t * pSrcReal,
- float32_t * pCmplxDst,
- uint32_t numSamples)
+ const float32_t * pSrcCmplx,
+ const float32_t * pSrcReal,
+ float32_t * pCmplxDst,
+ uint32_t numSamples)
{
- float32_t in; /* Temporary variable to store input value */
- uint32_t blkCnt; /* loop counters */
+ uint32_t blkCnt; /* Loop counter */
+ float32_t in; /* Temporary variable */
-#if defined (ARM_MATH_DSP)
+#if defined(ARM_MATH_NEON)
+ float32x4_t r;
+ float32x4x2_t ab,outCplx;
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- float32_t inA1, inA2, inA3, inA4; /* Temporary variables to hold input data */
- float32_t inA5, inA6, inA7, inA8; /* Temporary variables to hold input data */
- float32_t inB1, inB2, inB3, inB4; /* Temporary variables to hold input data */
- float32_t out1, out2, out3, out4; /* Temporary variables to hold output data */
- float32_t out5, out6, out7, out8; /* Temporary variables to hold output data */
+ /* Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
- /* loop Unrolling */
+ while (blkCnt > 0U)
+ {
+ ab = vld2q_f32(pSrcCmplx); // load & separate real/imag pSrcA (de-interleave 2)
+ r = vld1q_f32(pSrcReal); // load & separate real/imag pSrcB
+
+ /* Increment pointers */
+ pSrcCmplx += 8;
+ pSrcReal += 4;
+
+ outCplx.val[0] = vmulq_f32(ab.val[0], r);
+ outCplx.val[1] = vmulq_f32(ab.val[1], r);
+
+ vst2q_f32(pCmplxDst, outCplx);
+ pCmplxDst += 8;
+
+ blkCnt--;
+ }
+
+ /* Tail */
+ blkCnt = numSamples & 3;
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C[2 * i] = A[2 * i] * B[i]. */
- /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
- /* read input from complex input buffer */
- inA1 = pSrcCmplx[0];
- inA2 = pSrcCmplx[1];
- /* read input from real input buffer */
- inB1 = pSrcReal[0];
+ /* C[2 * i ] = A[2 * i ] * B[i]. */
+ /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
- /* read input from complex input buffer */
- inA3 = pSrcCmplx[2];
+ in = *pSrcReal++;
+ /* store result in destination buffer. */
+ *pCmplxDst++ = *pSrcCmplx++ * in;
+ *pCmplxDst++ = *pSrcCmplx++ * in;
- /* multiply complex buffer real input with real buffer input */
- out1 = inA1 * inB1;
+ in = *pSrcReal++;
+ *pCmplxDst++ = *pSrcCmplx++ * in;
+ *pCmplxDst++ = *pSrcCmplx++ * in;
- /* read input from complex input buffer */
- inA4 = pSrcCmplx[3];
+ in = *pSrcReal++;
+ *pCmplxDst++ = *pSrcCmplx++ * in;
+ *pCmplxDst++ = *pSrcCmplx++ * in;
- /* multiply complex buffer imaginary input with real buffer input */
- out2 = inA2 * inB1;
+ in = *pSrcReal++;
+ *pCmplxDst++ = *pSrcCmplx++* in;
+ *pCmplxDst++ = *pSrcCmplx++ * in;
- /* read input from real input buffer */
- inB2 = pSrcReal[1];
- /* read input from complex input buffer */
- inA5 = pSrcCmplx[4];
-
- /* multiply complex buffer real input with real buffer input */
- out3 = inA3 * inB2;
-
- /* read input from complex input buffer */
- inA6 = pSrcCmplx[5];
- /* read input from real input buffer */
- inB3 = pSrcReal[2];
-
- /* multiply complex buffer imaginary input with real buffer input */
- out4 = inA4 * inB2;
-
- /* read input from complex input buffer */
- inA7 = pSrcCmplx[6];
-
- /* multiply complex buffer real input with real buffer input */
- out5 = inA5 * inB3;
-
- /* read input from complex input buffer */
- inA8 = pSrcCmplx[7];
-
- /* multiply complex buffer imaginary input with real buffer input */
- out6 = inA6 * inB3;
-
- /* read input from real input buffer */
- inB4 = pSrcReal[3];
-
- /* store result to destination bufer */
- pCmplxDst[0] = out1;
-
- /* multiply complex buffer real input with real buffer input */
- out7 = inA7 * inB4;
-
- /* store result to destination bufer */
- pCmplxDst[1] = out2;
-
- /* multiply complex buffer imaginary input with real buffer input */
- out8 = inA8 * inB4;
-
- /* store result to destination bufer */
- pCmplxDst[2] = out3;
- pCmplxDst[3] = out4;
- pCmplxDst[4] = out5;
-
- /* incremnet complex input buffer by 8 to process next samples */
- pSrcCmplx += 8U;
-
- /* store result to destination bufer */
- pCmplxDst[5] = out6;
-
- /* increment real input buffer by 4 to process next samples */
- pSrcReal += 4U;
-
- /* store result to destination bufer */
- pCmplxDst[6] = out7;
- pCmplxDst[7] = out8;
-
- /* increment destination buffer by 8 to process next sampels */
- pCmplxDst += 8U;
-
- /* Decrement the numSamples loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
blkCnt = numSamples;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
- /* C[2 * i] = A[2 * i] * B[i]. */
- /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
- in = *pSrcReal++;
- /* store the result in the destination buffer. */
- *pCmplxDst++ = (*pSrcCmplx++) * (in);
- *pCmplxDst++ = (*pSrcCmplx++) * (in);
+ /* C[2 * i ] = A[2 * i ] * B[i]. */
+ /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
- /* Decrement the numSamples loop counter */
+ in = *pSrcReal++;
+ /* store result in destination buffer. */
+ *pCmplxDst++ = *pSrcCmplx++ * in;
+ *pCmplxDst++ = *pSrcCmplx++ * in;
+
+ /* Decrement loop counter */
blkCnt--;
}
+
}
/**
- * @} end of CmplxByRealMult group
+ @} end of CmplxByRealMult group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
index 340d852..4877d20 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_real_q15.c
* Description: Q15 complex by real multiplication
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,77 +29,71 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @addtogroup CmplxByRealMult
- * @{
+ @addtogroup CmplxByRealMult
+ @{
*/
-
/**
- * @brief Q15 complex-by-real multiplication
- * @param[in] *pSrcCmplx points to the complex input vector
- * @param[in] *pSrcReal points to the real input vector
- * @param[out] *pCmplxDst points to the complex output vector
- * @param[in] numSamples number of samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+ @brief Q15 complex-by-real multiplication.
+ @param[in] pSrcCmplx points to complex input vector
+ @param[in] pSrcReal points to real input vector
+ @param[out] pCmplxDst points to complex output vector
+ @param[in] numSamples number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
void arm_cmplx_mult_real_q15(
- q15_t * pSrcCmplx,
- q15_t * pSrcReal,
- q15_t * pCmplxDst,
- uint32_t numSamples)
+ const q15_t * pSrcCmplx,
+ const q15_t * pSrcReal,
+ q15_t * pCmplxDst,
+ uint32_t numSamples)
{
- q15_t in; /* Temporary variable to store input value */
+ uint32_t blkCnt; /* Loop counter */
+ q15_t in; /* Temporary variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
+ q31_t inA1, inA2; /* Temporary variables to hold input data */
+ q31_t inB1; /* Temporary variables to hold input data */
+ q15_t out1, out2, out3, out4; /* Temporary variables to hold output data */
+ q31_t mul1, mul2, mul3, mul4; /* Temporary variables to hold intermediate data */
+#endif
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- uint32_t blkCnt; /* loop counters */
- q31_t inA1, inA2; /* Temporary variables to hold input data */
- q31_t inB1; /* Temporary variables to hold input data */
- q15_t out1, out2, out3, out4; /* Temporary variables to hold output data */
- q31_t mul1, mul2, mul3, mul4; /* Temporary variables to hold intermediate data */
-
- /* loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C[2 * i] = A[2 * i] * B[i]. */
- /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
- /* read complex number both real and imaginary from complex input buffer */
- inA1 = *__SIMD32(pSrcCmplx)++;
- /* read two real values at a time from real input buffer */
- inB1 = *__SIMD32(pSrcReal)++;
- /* read complex number both real and imaginary from complex input buffer */
- inA2 = *__SIMD32(pSrcCmplx)++;
+ /* C[2 * i ] = A[2 * i ] * B[i]. */
+ /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
+
+#if defined (ARM_MATH_DSP)
+ /* read 2 complex numbers both real and imaginary from complex input buffer */
+ inA1 = read_q15x2_ia ((q15_t **) &pSrcCmplx);
+ inA2 = read_q15x2_ia ((q15_t **) &pSrcCmplx);
+ /* read 2 real values at a time from real input buffer */
+ inB1 = read_q15x2_ia ((q15_t **) &pSrcReal);
/* multiply complex number with real numbers */
#ifndef ARM_MATH_BIG_ENDIAN
-
- mul1 = (q31_t) ((q15_t) (inA1) * (q15_t) (inB1));
+ mul1 = (q31_t) ((q15_t) (inA1) * (q15_t) (inB1));
mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1));
- mul3 = (q31_t) ((q15_t) (inA2) * (q15_t) (inB1 >> 16));
+ mul3 = (q31_t) ((q15_t) (inA2) * (q15_t) (inB1 >> 16));
mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB1 >> 16));
-
#else
-
mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
- mul1 = (q31_t) ((q15_t) inA1 * (q15_t) (inB1 >> 16));
+ mul1 = (q31_t) ((q15_t) inA1 * (q15_t) (inB1 >> 16));
mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) inB1);
- mul3 = (q31_t) ((q15_t) inA2 * (q15_t) inB1);
-
+ mul3 = (q31_t) ((q15_t) inA2 * (q15_t) inB1);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* saturate the result */
@@ -109,27 +103,23 @@ void arm_cmplx_mult_real_q15(
out4 = (q15_t) __SSAT(mul4 >> 15U, 16);
/* pack real and imaginary outputs and store them to destination */
- *__SIMD32(pCmplxDst)++ = __PKHBT(out1, out2, 16);
- *__SIMD32(pCmplxDst)++ = __PKHBT(out3, out4, 16);
+ write_q15x2_ia (&pCmplxDst, __PKHBT(out1, out2, 16));
+ write_q15x2_ia (&pCmplxDst, __PKHBT(out3, out4, 16));
- inA1 = *__SIMD32(pSrcCmplx)++;
- inB1 = *__SIMD32(pSrcReal)++;
- inA2 = *__SIMD32(pSrcCmplx)++;
+ inA1 = read_q15x2_ia ((q15_t **) &pSrcCmplx);
+ inA2 = read_q15x2_ia ((q15_t **) &pSrcCmplx);
+ inB1 = read_q15x2_ia ((q15_t **) &pSrcReal);
#ifndef ARM_MATH_BIG_ENDIAN
-
- mul1 = (q31_t) ((q15_t) (inA1) * (q15_t) (inB1));
+ mul1 = (q31_t) ((q15_t) (inA1) * (q15_t) (inB1));
mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1));
- mul3 = (q31_t) ((q15_t) (inA2) * (q15_t) (inB1 >> 16));
+ mul3 = (q31_t) ((q15_t) (inA2) * (q15_t) (inB1 >> 16));
mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB1 >> 16));
-
#else
-
mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
- mul1 = (q31_t) ((q15_t) inA1 * (q15_t) (inB1 >> 16));
+ mul1 = (q31_t) ((q15_t) inA1 * (q15_t) (inB1 >> 16));
mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) inB1);
mul3 = (q31_t) ((q15_t) inA2 * (q15_t) inB1);
-
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
out1 = (q15_t) __SSAT(mul1 >> 15U, 16);
@@ -137,55 +127,56 @@ void arm_cmplx_mult_real_q15(
out3 = (q15_t) __SSAT(mul3 >> 15U, 16);
out4 = (q15_t) __SSAT(mul4 >> 15U, 16);
- *__SIMD32(pCmplxDst)++ = __PKHBT(out1, out2, 16);
- *__SIMD32(pCmplxDst)++ = __PKHBT(out3, out4, 16);
-
- /* Decrement the numSamples loop counter */
- blkCnt--;
- }
-
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = numSamples % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* C[2 * i] = A[2 * i] * B[i]. */
- /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
+ write_q15x2_ia (&pCmplxDst, __PKHBT(out1, out2, 16));
+ write_q15x2_ia (&pCmplxDst, __PKHBT(out3, out4, 16));
+#else
in = *pSrcReal++;
- /* store the result in the destination buffer. */
- *pCmplxDst++ =
- (q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
- *pCmplxDst++ =
- (q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
+ *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+ *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
- /* Decrement the numSamples loop counter */
+ in = *pSrcReal++;
+ *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+ *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+
+ in = *pSrcReal++;
+ *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+ *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+
+ in = *pSrcReal++;
+ *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+ *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
+
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
- while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
{
- /* realOut = realA * realB. */
- /* imagOut = imagA * realB. */
+ /* C[2 * i ] = A[2 * i ] * B[i]. */
+ /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
+
in = *pSrcReal++;
/* store the result in the destination buffer. */
- *pCmplxDst++ =
- (q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
- *pCmplxDst++ =
- (q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
+ *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+ *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
- /* Decrement the numSamples loop counter */
- numSamples--;
+ /* Decrement loop counter */
+ blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of CmplxByRealMult group
+ @} end of CmplxByRealMult group
*/
diff --git a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
index 19fc55b..906410f 100644
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_real_q31.c
* Description: Q31 complex by real multiplication
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,183 +29,120 @@
#include "arm_math.h"
/**
- * @ingroup groupCmplxMath
+ @ingroup groupCmplxMath
*/
/**
- * @addtogroup CmplxByRealMult
- * @{
+ @addtogroup CmplxByRealMult
+ @{
*/
-
/**
- * @brief Q31 complex-by-real multiplication
- * @param[in] *pSrcCmplx points to the complex input vector
- * @param[in] *pSrcReal points to the real input vector
- * @param[out] *pCmplxDst points to the complex output vector
- * @param[in] numSamples number of samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
+ @brief Q31 complex-by-real multiplication.
+ @param[in] pSrcCmplx points to complex input vector
+ @param[in] pSrcReal points to real input vector
+ @param[out] pCmplxDst points to complex output vector
+ @param[in] numSamples number of samples in each vector
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
*/
void arm_cmplx_mult_real_q31(
- q31_t * pSrcCmplx,
- q31_t * pSrcReal,
- q31_t * pCmplxDst,
- uint32_t numSamples)
+ const q31_t * pSrcCmplx,
+ const q31_t * pSrcReal,
+ q31_t * pCmplxDst,
+ uint32_t numSamples)
{
- q31_t inA1; /* Temporary variable to store input value */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t in; /* Temporary variable */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- uint32_t blkCnt; /* loop counters */
- q31_t inA2, inA3, inA4; /* Temporary variables to hold input data */
- q31_t inB1, inB2; /* Temporary variabels to hold input data */
- q31_t out1, out2, out3, out4; /* Temporary variables to hold output data */
-
- /* loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C[2 * i] = A[2 * i] * B[i]. */
- /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
- /* read real input from complex input buffer */
- inA1 = *pSrcCmplx++;
- inA2 = *pSrcCmplx++;
- /* read input from real input bufer */
- inB1 = *pSrcReal++;
- inB2 = *pSrcReal++;
- /* read imaginary input from complex input buffer */
- inA3 = *pSrcCmplx++;
- inA4 = *pSrcCmplx++;
+ /* C[2 * i ] = A[2 * i ] * B[i]. */
+ /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
- /* multiply complex input with real input */
- out1 = ((q63_t) inA1 * inB1) >> 32;
- out2 = ((q63_t) inA2 * inB1) >> 32;
- out3 = ((q63_t) inA3 * inB2) >> 32;
- out4 = ((q63_t) inA4 * inB2) >> 32;
+ in = *pSrcReal++;
+#if defined (ARM_MATH_DSP)
+ /* store saturated result in 1.31 format to destination buffer */
+ *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+ *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+#else
+ /* store result in destination buffer. */
+ *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+ *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+#endif
- /* sature the result */
- out1 = __SSAT(out1, 31);
- out2 = __SSAT(out2, 31);
- out3 = __SSAT(out3, 31);
- out4 = __SSAT(out4, 31);
+ in = *pSrcReal++;
+#if defined (ARM_MATH_DSP)
+ *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+ *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+#else
+ *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+ *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+#endif
- /* get result in 1.31 format */
- out1 = out1 << 1;
- out2 = out2 << 1;
- out3 = out3 << 1;
- out4 = out4 << 1;
+ in = *pSrcReal++;
+#if defined (ARM_MATH_DSP)
+ *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+ *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+#else
+ *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+ *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+#endif
- /* store the result to destination buffer */
- *pCmplxDst++ = out1;
- *pCmplxDst++ = out2;
- *pCmplxDst++ = out3;
- *pCmplxDst++ = out4;
+ in = *pSrcReal++;
+#if defined (ARM_MATH_DSP)
+ *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+ *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+#else
+ *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+ *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+#endif
- /* read real input from complex input buffer */
- inA1 = *pSrcCmplx++;
- inA2 = *pSrcCmplx++;
- /* read input from real input bufer */
- inB1 = *pSrcReal++;
- inB2 = *pSrcReal++;
- /* read imaginary input from complex input buffer */
- inA3 = *pSrcCmplx++;
- inA4 = *pSrcCmplx++;
-
- /* multiply complex input with real input */
- out1 = ((q63_t) inA1 * inB1) >> 32;
- out2 = ((q63_t) inA2 * inB1) >> 32;
- out3 = ((q63_t) inA3 * inB2) >> 32;
- out4 = ((q63_t) inA4 * inB2) >> 32;
-
- /* sature the result */
- out1 = __SSAT(out1, 31);
- out2 = __SSAT(out2, 31);
- out3 = __SSAT(out3, 31);
- out4 = __SSAT(out4, 31);
-
- /* get result in 1.31 format */
- out1 = out1 << 1;
- out2 = out2 << 1;
- out3 = out3 << 1;
- out4 = out4 << 1;
-
- /* store the result to destination buffer */
- *pCmplxDst++ = out1;
- *pCmplxDst++ = out2;
- *pCmplxDst++ = out3;
- *pCmplxDst++ = out4;
-
- /* Decrement the numSamples loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
- while (blkCnt > 0U)
- {
- /* C[2 * i] = A[2 * i] * B[i]. */
- /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
- /* read real input from complex input buffer */
- inA1 = *pSrcCmplx++;
- inA2 = *pSrcCmplx++;
- /* read input from real input bufer */
- inB1 = *pSrcReal++;
-
- /* multiply complex input with real input */
- out1 = ((q63_t) inA1 * inB1) >> 32;
- out2 = ((q63_t) inA2 * inB1) >> 32;
-
- /* sature the result */
- out1 = __SSAT(out1, 31);
- out2 = __SSAT(out2, 31);
-
- /* get result in 1.31 format */
- out1 = out1 << 1;
- out2 = out2 << 1;
-
- /* store the result to destination buffer */
- *pCmplxDst++ = out1;
- *pCmplxDst++ = out2;
-
- /* Decrement the numSamples loop counter */
- blkCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
- while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
{
- /* realOut = realA * realB. */
- /* imagReal = imagA * realB. */
- inA1 = *pSrcReal++;
- /* store the result in the destination buffer. */
- *pCmplxDst++ =
- (q31_t) clip_q63_to_q31(((q63_t) * pSrcCmplx++ * inA1) >> 31);
- *pCmplxDst++ =
- (q31_t) clip_q63_to_q31(((q63_t) * pSrcCmplx++ * inA1) >> 31);
+ /* C[2 * i ] = A[2 * i ] * B[i]. */
+ /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
- /* Decrement the numSamples loop counter */
- numSamples--;
+ in = *pSrcReal++;
+#if defined (ARM_MATH_DSP)
+ /* store saturated result in 1.31 format to destination buffer */
+ *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+ *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+#else
+ /* store result in destination buffer. */
+ *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+ *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+#endif
+
+ /* Decrement loop counter */
+ blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of CmplxByRealMult group
+ @} end of CmplxByRealMult group
*/
diff --git a/DSP/Source/ControllerFunctions/CMakeLists.txt b/DSP/Source/ControllerFunctions/CMakeLists.txt
new file mode 100644
index 0000000..705f5b8
--- /dev/null
+++ b/DSP/Source/ControllerFunctions/CMakeLists.txt
@@ -0,0 +1,37 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPController)
+
+add_library(CMSISDSPController STATIC)
+
+configdsp(CMSISDSPController ..)
+
+include(interpol)
+interpol(CMSISDSPController)
+
+if (CONFIGTABLE AND ALLFAST)
+ target_compile_definitions(CMSISDSPController PUBLIC ARM_ALL_FAST_TABLES)
+endif()
+
+target_sources(CMSISDSPController PRIVATE arm_pid_init_f32.c)
+target_sources(CMSISDSPController PRIVATE arm_pid_init_q15.c)
+target_sources(CMSISDSPController PRIVATE arm_pid_init_q31.c)
+target_sources(CMSISDSPController PRIVATE arm_pid_reset_f32.c)
+target_sources(CMSISDSPController PRIVATE arm_pid_reset_q15.c)
+target_sources(CMSISDSPController PRIVATE arm_pid_reset_q31.c)
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_COS_F32)
+target_sources(CMSISDSPController PRIVATE arm_sin_cos_f32.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_COS_Q31)
+target_sources(CMSISDSPController PRIVATE arm_sin_cos_q31.c)
+endif()
+
+
+
+### Includes
+target_include_directories(CMSISDSPController PUBLIC "${DSP}/../../Include")
+
+
+
diff --git a/DSP/Source/ControllerFunctions/ControllerFunctions.c b/DSP/Source/ControllerFunctions/ControllerFunctions.c
new file mode 100644
index 0000000..51720bc
--- /dev/null
+++ b/DSP/Source/ControllerFunctions/ControllerFunctions.c
@@ -0,0 +1,37 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: ControllerFunctions.c
+ * Description: Combination of all controller function source files.
+ *
+ * $Date: 18. March 2019
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_pid_init_f32.c"
+#include "arm_pid_init_q15.c"
+#include "arm_pid_init_q31.c"
+#include "arm_pid_reset_f32.c"
+#include "arm_pid_reset_q15.c"
+#include "arm_pid_reset_q31.c"
+#include "arm_sin_cos_f32.c"
+#include "arm_sin_cos_q31.c"
+
diff --git a/DSP/Source/ControllerFunctions/arm_pid_init_f32.c b/DSP/Source/ControllerFunctions/arm_pid_init_f32.c
index f75d61f..433a65a 100644
--- a/DSP/Source/ControllerFunctions/arm_pid_init_f32.c
+++ b/DSP/Source/ControllerFunctions/arm_pid_init_f32.c
@@ -3,13 +3,13 @@
* Title: arm_pid_init_f32.c
* Description: Floating-point PID Control initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -28,29 +28,30 @@
#include "arm_math.h"
- /**
- * @addtogroup PID
- * @{
+/**
+ @addtogroup PID
+ @{
*/
/**
- * @brief Initialization function for the floating-point PID Control.
- * @param[in,out] *S points to an instance of the PID structure.
- * @param[in] resetStateFlag flag to reset the state. 0 = no change in state & 1 = reset the state.
- * @return none.
- * \par Description:
- * \par
- * The resetStateFlag specifies whether to set state to zero or not. \n
- * The function computes the structure fields: A0, A1A2
- * using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
- * also sets the state variables to all zeros.
+ @brief Initialization function for the floating-point PID Control.
+ @param[in,out] S points to an instance of the PID structure
+ @param[in] resetStateFlag
+ - value = 0: no change in state
+ - value = 1: reset state
+ @return none
+
+ @par Details
+ The resetStateFlag specifies whether to set state to zero or not. \n
+ The function computes the structure fields: A0, A1A2
+ using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
+ also sets the state variables to all zeros.
*/
void arm_pid_init_f32(
arm_pid_instance_f32 * S,
int32_t resetStateFlag)
{
-
/* Derived coefficient A0 */
S->A0 = S->Kp + S->Ki + S->Kd;
@@ -63,12 +64,12 @@ void arm_pid_init_f32(
/* Check whether state needs reset or not */
if (resetStateFlag)
{
- /* Clear the state buffer. The size will be always 3 samples */
+ /* Reset state to zero, The size will be always 3 samples */
memset(S->state, 0, 3U * sizeof(float32_t));
}
}
/**
- * @} end of PID group
+ @} end of PID group
*/
diff --git a/DSP/Source/ControllerFunctions/arm_pid_init_q15.c b/DSP/Source/ControllerFunctions/arm_pid_init_q15.c
index 61049cf..c88a3d9 100644
--- a/DSP/Source/ControllerFunctions/arm_pid_init_q15.c
+++ b/DSP/Source/ControllerFunctions/arm_pid_init_q15.c
@@ -3,13 +3,13 @@
* Title: arm_pid_init_q15.c
* Description: Q15 PID Control initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -28,22 +28,24 @@
#include "arm_math.h"
- /**
- * @addtogroup PID
- * @{
+/**
+ @addtogroup PID
+ @{
*/
/**
- * @details
- * @param[in,out] *S points to an instance of the Q15 PID structure.
- * @param[in] resetStateFlag flag to reset the state. 0 = no change in state 1 = reset the state.
- * @return none.
- * \par Description:
- * \par
- * The resetStateFlag specifies whether to set state to zero or not. \n
- * The function computes the structure fields: A0, A1A2
- * using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
- * also sets the state variables to all zeros.
+ @brief Initialization function for the Q15 PID Control.
+ @param[in,out] S points to an instance of the Q15 PID structure
+ @param[in] resetStateFlag
+ - value = 0: no change in state
+ - value = 1: reset state
+ @return none
+
+ @par Details
+ The resetStateFlag specifies whether to set state to zero or not. \n
+ The function computes the structure fields: A0, A1A2
+ using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
+ also sets the state variables to all zeros.
*/
void arm_pid_init_q15(
@@ -53,35 +55,20 @@ void arm_pid_init_q15(
#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
/* Derived coefficient A0 */
S->A0 = __QADD16(__QADD16(S->Kp, S->Ki), S->Kd);
/* Derived coefficients and pack into A1 */
#ifndef ARM_MATH_BIG_ENDIAN
-
S->A1 = __PKHBT(-__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), S->Kd, 16);
-
#else
-
S->A1 = __PKHBT(S->Kd, -__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Check whether state needs reset or not */
- if (resetStateFlag)
- {
- /* Clear the state buffer. The size will be always 3 samples */
- memset(S->state, 0, 3U * sizeof(q15_t));
- }
+#endif
#else
- /* Run the below code for Cortex-M0 */
-
- q31_t temp; /*to store the sum */
+ q31_t temp; /* to store the sum */
/* Derived coefficient A0 */
temp = S->Kp + S->Ki + S->Kd;
@@ -92,19 +79,17 @@ void arm_pid_init_q15(
S->A1 = (q15_t) __SSAT(temp, 16);
S->A2 = S->Kd;
-
+#endif /* #if defined (ARM_MATH_DSP) */
/* Check whether state needs reset or not */
if (resetStateFlag)
{
- /* Clear the state buffer. The size will be always 3 samples */
+ /* Reset state to zero, The size will be always 3 samples */
memset(S->state, 0, 3U * sizeof(q15_t));
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of PID group
+ @} end of PID group
*/
diff --git a/DSP/Source/ControllerFunctions/arm_pid_init_q31.c b/DSP/Source/ControllerFunctions/arm_pid_init_q31.c
index 17b3b09..1625a5f 100644
--- a/DSP/Source/ControllerFunctions/arm_pid_init_q31.c
+++ b/DSP/Source/ControllerFunctions/arm_pid_init_q31.c
@@ -3,13 +3,13 @@
* Title: arm_pid_init_q31.c
* Description: Q31 PID Control initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -28,22 +28,24 @@
#include "arm_math.h"
- /**
- * @addtogroup PID
- * @{
+/**
+ @addtogroup PID
+ @{
*/
/**
- * @brief Initialization function for the Q31 PID Control.
- * @param[in,out] *S points to an instance of the Q31 PID structure.
- * @param[in] resetStateFlag flag to reset the state. 0 = no change in state 1 = reset the state.
- * @return none.
- * \par Description:
- * \par
- * The resetStateFlag specifies whether to set state to zero or not. \n
- * The function computes the structure fields: A0, A1A2
- * using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
- * also sets the state variables to all zeros.
+ @brief Initialization function for the Q31 PID Control.
+ @param[in,out] S points to an instance of the Q31 PID structure
+ @param[in] resetStateFlag
+ - value = 0: no change in state
+ - value = 1: reset state
+ @return none
+
+ @par Details
+ The resetStateFlag specifies whether to set state to zero or not. \n
+ The function computes the structure fields: A0, A1A2
+ using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
+ also sets the state variables to all zeros.
*/
void arm_pid_init_q31(
@@ -53,20 +55,15 @@ void arm_pid_init_q31(
#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
/* Derived coefficient A0 */
S->A0 = __QADD(__QADD(S->Kp, S->Ki), S->Kd);
/* Derived coefficient A1 */
S->A1 = -__QADD(__QADD(S->Kd, S->Kd), S->Kp);
-
#else
- /* Run the below code for Cortex-M0 */
-
- q31_t temp;
+ q31_t temp; /* to store the sum */
/* Derived coefficient A0 */
temp = clip_q63_to_q31((q63_t) S->Kp + S->Ki);
@@ -84,12 +81,12 @@ void arm_pid_init_q31(
/* Check whether state needs reset or not */
if (resetStateFlag)
{
- /* Clear the state buffer. The size will be always 3 samples */
+ /* Reset state to zero, The size will be always 3 samples */
memset(S->state, 0, 3U * sizeof(q31_t));
}
}
/**
- * @} end of PID group
+ @} end of PID group
*/
diff --git a/DSP/Source/ControllerFunctions/arm_pid_reset_f32.c b/DSP/Source/ControllerFunctions/arm_pid_reset_f32.c
index 318ec89..d839e55 100644
--- a/DSP/Source/ControllerFunctions/arm_pid_reset_f32.c
+++ b/DSP/Source/ControllerFunctions/arm_pid_reset_f32.c
@@ -3,13 +3,13 @@
* Title: arm_pid_reset_f32.c
* Description: Floating-point PID Control reset function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -28,26 +28,27 @@
#include "arm_math.h"
- /**
- * @addtogroup PID
- * @{
+/**
+ @addtogroup PID
+ @{
*/
/**
-* @brief Reset function for the floating-point PID Control.
-* @param[in] *S Instance pointer of PID control data structure.
-* @return none.
-* \par Description:
-* The function resets the state buffer to zeros.
-*/
+ @brief Reset function for the floating-point PID Control.
+ @param[in,out] S points to an instance of the floating-point PID structure
+ @return none
+
+ @par Details
+ The function resets the state buffer to zeros.
+ */
+
void arm_pid_reset_f32(
arm_pid_instance_f32 * S)
{
-
- /* Clear the state buffer. The size will be always 3 samples */
+ /* Reset state to zero, The size will be always 3 samples */
memset(S->state, 0, 3U * sizeof(float32_t));
}
/**
- * @} end of PID group
+ @} end of PID group
*/
diff --git a/DSP/Source/ControllerFunctions/arm_pid_reset_q15.c b/DSP/Source/ControllerFunctions/arm_pid_reset_q15.c
index 93c0e7c..256fd8c 100644
--- a/DSP/Source/ControllerFunctions/arm_pid_reset_q15.c
+++ b/DSP/Source/ControllerFunctions/arm_pid_reset_q15.c
@@ -3,13 +3,13 @@
* Title: arm_pid_reset_q15.c
* Description: Q15 PID Control reset function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -28,18 +28,20 @@
#include "arm_math.h"
- /**
- * @addtogroup PID
- * @{
+/**
+ @addtogroup PID
+ @{
*/
/**
-* @brief Reset function for the Q15 PID Control.
-* @param[in] *S Instance pointer of PID control data structure.
-* @return none.
-* \par Description:
-* The function resets the state buffer to zeros.
-*/
+ @brief Reset function for the Q15 PID Control.
+ @param[in,out] S points to an instance of the Q15 PID structure
+ @return none
+
+ @par Details
+ The function resets the state buffer to zeros.
+ */
+
void arm_pid_reset_q15(
arm_pid_instance_q15 * S)
{
@@ -48,5 +50,5 @@ void arm_pid_reset_q15(
}
/**
- * @} end of PID group
+ @} end of PID group
*/
diff --git a/DSP/Source/ControllerFunctions/arm_pid_reset_q31.c b/DSP/Source/ControllerFunctions/arm_pid_reset_q31.c
index 4c5b14e..2aa391c 100644
--- a/DSP/Source/ControllerFunctions/arm_pid_reset_q31.c
+++ b/DSP/Source/ControllerFunctions/arm_pid_reset_q31.c
@@ -3,13 +3,13 @@
* Title: arm_pid_reset_q31.c
* Description: Q31 PID Control reset function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -28,26 +28,27 @@
#include "arm_math.h"
- /**
- * @addtogroup PID
- * @{
+/**
+ @addtogroup PID
+ @{
*/
/**
-* @brief Reset function for the Q31 PID Control.
-* @param[in] *S Instance pointer of PID control data structure.
-* @return none.
-* \par Description:
-* The function resets the state buffer to zeros.
-*/
+ @brief Reset function for the Q31 PID Control.
+ @param[in,out] S points to an instance of the Q31 PID structure
+ @return none
+
+ @par Details
+ The function resets the state buffer to zeros.
+ */
+
void arm_pid_reset_q31(
arm_pid_instance_q31 * S)
{
-
- /* Clear the state buffer. The size will be always 3 samples */
+ /* Reset state to zero, The size will be always 3 samples */
memset(S->state, 0, 3U * sizeof(q31_t));
}
/**
- * @} end of PID group
+ @} end of PID group
*/
diff --git a/DSP/Source/ControllerFunctions/arm_sin_cos_f32.c b/DSP/Source/ControllerFunctions/arm_sin_cos_f32.c
index 7ec1b53..12a1c83 100644
--- a/DSP/Source/ControllerFunctions/arm_sin_cos_f32.c
+++ b/DSP/Source/ControllerFunctions/arm_sin_cos_f32.c
@@ -3,13 +3,13 @@
* Title: arm_sin_cos_f32.c
* Description: Sine and Cosine calculation for floating-point values
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -30,115 +30,117 @@
#include "arm_common_tables.h"
/**
- * @ingroup groupController
+ @ingroup groupController
*/
/**
- * @defgroup SinCos Sine Cosine
- *
- * Computes the trigonometric sine and cosine values using a combination of table lookup
- * and linear interpolation.
- * There are separate functions for Q31 and floating-point data types.
- * The input to the floating-point version is in degrees while the
- * fixed-point Q31 have a scaled input with the range
- * [-1 0.9999] mapping to [-180 +180] degrees.
- *
- * The floating point function also allows values that are out of the usual range. When this happens, the function will
- * take extra time to adjust the input value to the range of [-180 180].
- *
- * The result is accurate to 5 digits after the decimal point.
- *
- * The implementation is based on table lookup using 360 values together with linear interpolation.
- * The steps used are:
- * -# Calculation of the nearest integer table index.
- * -# Compute the fractional portion (fract) of the input.
- * -# Fetch the value corresponding to \c index from sine table to \c y0 and also value from \c index+1 to \c y1.
- * -# Sine value is computed as *psinVal = y0 + (fract * (y1 - y0)).
- * -# Fetch the value corresponding to \c index from cosine table to \c y0 and also value from \c index+1 to \c y1.
- * -# Cosine value is computed as *pcosVal = y0 + (fract * (y1 - y0)).
- */
+ @defgroup SinCos Sine Cosine
- /**
- * @addtogroup SinCos
- * @{
+ Computes the trigonometric sine and cosine values using a combination of table lookup
+ and linear interpolation.
+ There are separate functions for Q31 and floating-point data types.
+ The input to the floating-point version is in degrees while the
+ fixed-point Q31 have a scaled input with the range
+ [-1 0.9999] mapping to [-180 +180] degrees.
+
+ The floating point function also allows values that are out of the usual range. When this happens, the function will
+ take extra time to adjust the input value to the range of [-180 180].
+
+ The result is accurate to 5 digits after the decimal point.
+
+ The implementation is based on table lookup using 360 values together with linear interpolation.
+ The steps used are:
+ -# Calculation of the nearest integer table index.
+ -# Compute the fractional portion (fract) of the input.
+ -# Fetch the value corresponding to \c index from sine table to \c y0 and also value from \c index+1 to \c y1.
+ -# Sine value is computed as *psinVal = y0 + (fract * (y1 - y0)).
+ -# Fetch the value corresponding to \c index from cosine table to \c y0 and also value from \c index+1 to \c y1.
+ -# Cosine value is computed as *pcosVal = y0 + (fract * (y1 - y0)).
*/
/**
- * @brief Floating-point sin_cos function.
- * @param[in] theta input value in degrees
- * @param[out] *pSinVal points to the processed sine output.
- * @param[out] *pCosVal points to the processed cos output.
- * @return none.
+ @addtogroup SinCos
+ @{
+ */
+
+/**
+ @brief Floating-point sin_cos function.
+ @param[in] theta input value in degrees
+ @param[out] pSinVal points to processed sine output
+ @param[out] pCosVal points to processed cosine output
+ @return none
*/
void arm_sin_cos_f32(
- float32_t theta,
- float32_t * pSinVal,
- float32_t * pCosVal)
+ float32_t theta,
+ float32_t * pSinVal,
+ float32_t * pCosVal)
{
- float32_t fract, in; /* Temporary variables for input, output */
- uint16_t indexS, indexC; /* Index variable */
- float32_t f1, f2, d1, d2; /* Two nearest output values */
- float32_t findex, Dn, Df, temp;
+ float32_t fract, in; /* Temporary input, output variables */
+ uint16_t indexS, indexC; /* Index variable */
+ float32_t f1, f2, d1, d2; /* Two nearest output values */
+ float32_t Dn, Df;
+ float32_t temp, findex;
- /* input x is in degrees */
- /* Scale the input, divide input by 360, for cosine add 0.25 (pi/2) to read sine table */
- in = theta * 0.00277777777778f;
+ /* input x is in degrees */
+ /* Scale input, divide input by 360, for cosine add 0.25 (pi/2) to read sine table */
+ in = theta * 0.00277777777778f;
- if (in < 0.0f)
- {
- in = -in;
- }
+ if (in < 0.0f)
+ {
+ in = -in;
+ }
- in = in - (int32_t)in;
+ in = in - (int32_t)in;
- /* Calculation of index of the table */
- findex = (float32_t) FAST_MATH_TABLE_SIZE * in;
- indexS = ((uint16_t)findex) & 0x1ff;
- indexC = (indexS + (FAST_MATH_TABLE_SIZE / 4)) & 0x1ff;
+ /* Calculate the nearest index */
+ findex = (float32_t)FAST_MATH_TABLE_SIZE * in;
+ indexS = ((uint16_t)findex) & 0x1ff;
+ indexC = (indexS + (FAST_MATH_TABLE_SIZE / 4)) & 0x1ff;
- /* fractional value calculation */
- fract = findex - (float32_t) indexS;
+ /* Calculation of fractional value */
+ fract = findex - (float32_t) indexS;
- /* Read two nearest values of input value from the cos & sin tables */
- f1 = sinTable_f32[indexC+0];
- f2 = sinTable_f32[indexC+1];
- d1 = -sinTable_f32[indexS+0];
- d2 = -sinTable_f32[indexS+1];
+ /* Read two nearest values of input value from the cos & sin tables */
+ f1 = sinTable_f32[indexC ];
+ f2 = sinTable_f32[indexC+1];
+ d1 = -sinTable_f32[indexS ];
+ d2 = -sinTable_f32[indexS+1];
- temp = (1.0f - fract) * f1 + fract * f2;
+ temp = (1.0f - fract) * f1 + fract * f2;
- Dn = 0.0122718463030f; // delta between the two points (fixed), in this case 2*pi/FAST_MATH_TABLE_SIZE
- Df = f2 - f1; // delta between the values of the functions
+ Dn = 0.0122718463030f; /* delta between the two points (fixed), in this case 2*pi/FAST_MATH_TABLE_SIZE */
+ Df = f2 - f1; /* delta between the values of the functions */
- temp = Dn *(d1 + d2) - 2 * Df;
- temp = fract * temp + (3 * Df - (d2 + 2 * d1) * Dn);
- temp = fract * temp + d1 * Dn;
+ temp = Dn * (d1 + d2) - 2 * Df;
+ temp = fract * temp + (3 * Df - (d2 + 2 * d1) * Dn);
+ temp = fract * temp + d1 * Dn;
- /* Calculation of cosine value */
- *pCosVal = fract * temp + f1;
+ /* Calculation of cosine value */
+ *pCosVal = fract * temp + f1;
- /* Read two nearest values of input value from the cos & sin tables */
- f1 = sinTable_f32[indexS+0];
- f2 = sinTable_f32[indexS+1];
- d1 = sinTable_f32[indexC+0];
- d2 = sinTable_f32[indexC+1];
+ /* Read two nearest values of input value from the cos & sin tables */
+ f1 = sinTable_f32[indexS ];
+ f2 = sinTable_f32[indexS+1];
+ d1 = sinTable_f32[indexC ];
+ d2 = sinTable_f32[indexC+1];
- temp = (1.0f - fract) * f1 + fract * f2;
+ temp = (1.0f - fract) * f1 + fract * f2;
- Df = f2 - f1; // delta between the values of the functions
- temp = Dn*(d1 + d2) - 2*Df;
- temp = fract*temp + (3*Df - (d2 + 2*d1)*Dn);
- temp = fract*temp + d1*Dn;
+ Df = f2 - f1; // delta between the values of the functions
+ temp = Dn * (d1 + d2) - 2 * Df;
+ temp = fract * temp + (3 * Df - (d2 + 2 * d1) * Dn);
+ temp = fract * temp + d1 * Dn;
- /* Calculation of sine value */
- *pSinVal = fract*temp + f1;
+ /* Calculation of sine value */
+ *pSinVal = fract * temp + f1;
- if (theta < 0.0f)
- {
- *pSinVal = -*pSinVal;
- }
+ if (theta < 0.0f)
+ {
+ *pSinVal = -*pSinVal;
+ }
}
+
/**
- * @} end of SinCos group
+ @} end of SinCos group
*/
diff --git a/DSP/Source/ControllerFunctions/arm_sin_cos_q31.c b/DSP/Source/ControllerFunctions/arm_sin_cos_q31.c
index d661830..84ee3d2 100644
--- a/DSP/Source/ControllerFunctions/arm_sin_cos_q31.c
+++ b/DSP/Source/ControllerFunctions/arm_sin_cos_q31.c
@@ -3,13 +3,13 @@
* Title: arm_sin_cos_q31.c
* Description: Cosine & Sine calculation for Q31 values
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -30,23 +30,22 @@
#include "arm_common_tables.h"
/**
- * @ingroup groupController
- */
-
- /**
- * @addtogroup SinCos
- * @{
+ @ingroup groupController
*/
/**
- * @brief Q31 sin_cos function.
- * @param[in] theta scaled input value in degrees
- * @param[out] *pSinVal points to the processed sine output.
- * @param[out] *pCosVal points to the processed cosine output.
- * @return none.
- *
- * The Q31 input value is in the range [-1 0.999999] and is mapped to a degree value in the range [-180 179].
- *
+ @addtogroup SinCos
+ @{
+ */
+
+/**
+ @brief Q31 sin_cos function.
+ @param[in] theta scaled input value in degrees
+ @param[out] pSinVal points to processed sine output
+ @param[out] pCosVal points to processed cosine output
+ @return none
+
+ The Q31 input value is in the range [-1 0.999999] and is mapped to a degree value in the range [-180 179].
*/
void arm_sin_cos_q31(
@@ -54,9 +53,9 @@ void arm_sin_cos_q31(
q31_t * pSinVal,
q31_t * pCosVal)
{
- q31_t fract; /* Temporary variables for input, output */
- uint16_t indexS, indexC; /* Index variable */
- q31_t f1, f2, d1, d2; /* Two nearest output values */
+ q31_t fract; /* Temporary input, output variables */
+ uint16_t indexS, indexC; /* Index variable */
+ q31_t f1, f2, d1, d2; /* Two nearest output values */
q31_t Dn, Df;
q63_t temp;
@@ -68,43 +67,44 @@ void arm_sin_cos_q31(
fract = (theta - (indexS << CONTROLLER_Q31_SHIFT)) << 8;
/* Read two nearest values of input value from the cos & sin tables */
- f1 = sinTable_q31[indexC+0];
- f2 = sinTable_q31[indexC+1];
- d1 = -sinTable_q31[indexS+0];
+ f1 = sinTable_q31[indexC ];
+ f2 = sinTable_q31[indexC+1];
+ d1 = -sinTable_q31[indexS ];
d2 = -sinTable_q31[indexS+1];
- Dn = 0x1921FB5; // delta between the two points (fixed), in this case 2*pi/FAST_MATH_TABLE_SIZE
- Df = f2 - f1; // delta between the values of the functions
- temp = Dn*((q63_t)d1 + d2);
+ Dn = 0x1921FB5; /* delta between the two points (fixed), in this case 2*pi/FAST_MATH_TABLE_SIZE */
+ Df = f2 - f1; /* delta between the values of the functions */
+
+ temp = Dn * ((q63_t)d1 + d2);
temp = temp - ((q63_t)Df << 32);
- temp = (q63_t)fract*(temp >> 31);
- temp = temp + ((3*(q63_t)Df << 31) - (d2 + ((q63_t)d1 << 1))*Dn);
- temp = (q63_t)fract*(temp >> 31);
- temp = temp + (q63_t)d1*Dn;
- temp = (q63_t)fract*(temp >> 31);
+ temp = (q63_t)fract * (temp >> 31);
+ temp = temp + ((3 * (q63_t)Df << 31) - (d2 + ((q63_t)d1 << 1)) * Dn);
+ temp = (q63_t)fract * (temp >> 31);
+ temp = temp + (q63_t)d1 * Dn;
+ temp = (q63_t)fract * (temp >> 31);
/* Calculation of cosine value */
*pCosVal = clip_q63_to_q31((temp >> 31) + (q63_t)f1);
/* Read two nearest values of input value from the cos & sin tables */
- f1 = sinTable_q31[indexS+0];
+ f1 = sinTable_q31[indexS ];
f2 = sinTable_q31[indexS+1];
- d1 = sinTable_q31[indexC+0];
+ d1 = sinTable_q31[indexC ];
d2 = sinTable_q31[indexC+1];
Df = f2 - f1; // delta between the values of the functions
- temp = Dn*((q63_t)d1 + d2);
+ temp = Dn * ((q63_t)d1 + d2);
temp = temp - ((q63_t)Df << 32);
- temp = (q63_t)fract*(temp >> 31);
- temp = temp + ((3*(q63_t)Df << 31) - (d2 + ((q63_t)d1 << 1))*Dn);
- temp = (q63_t)fract*(temp >> 31);
- temp = temp + (q63_t)d1*Dn;
- temp = (q63_t)fract*(temp >> 31);
+ temp = (q63_t)fract * (temp >> 31);
+ temp = temp + ((3 * (q63_t)Df << 31) - (d2 + ((q63_t)d1 << 1)) * Dn);
+ temp = (q63_t)fract * (temp >> 31);
+ temp = temp + (q63_t)d1 * Dn;
+ temp = (q63_t)fract * (temp >> 31);
/* Calculation of sine value */
*pSinVal = clip_q63_to_q31((temp >> 31) + (q63_t)f1);
}
/**
- * @} end of SinCos group
+ @} end of SinCos group
*/
diff --git a/DSP/Source/FastMathFunctions/CMakeLists.txt b/DSP/Source/FastMathFunctions/CMakeLists.txt
new file mode 100644
index 0000000..6719b41
--- /dev/null
+++ b/DSP/Source/FastMathFunctions/CMakeLists.txt
@@ -0,0 +1,51 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPFastMath)
+
+
+file(GLOB SRC "./*_*.c")
+
+add_library(CMSISDSPFastMath STATIC)
+
+include(interpol)
+interpol(CMSISDSPFastMath)
+
+if (CONFIGTABLE AND ALLFAST)
+ target_compile_definitions(CMSISDSPFastMath PUBLIC ARM_ALL_FAST_TABLES)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_COS_F32)
+target_sources(CMSISDSPFastMath PRIVATE arm_cos_f32.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_COS_Q15)
+target_sources(CMSISDSPFastMath PRIVATE arm_cos_q15.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_COS_Q31)
+target_sources(CMSISDSPFastMath PRIVATE arm_cos_q31.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_F32)
+target_sources(CMSISDSPFastMath PRIVATE arm_sin_f32.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_Q15)
+target_sources(CMSISDSPFastMath PRIVATE arm_sin_q15.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_Q31)
+target_sources(CMSISDSPFastMath PRIVATE arm_sin_q31.c)
+endif()
+
+target_sources(CMSISDSPFastMath PRIVATE arm_sqrt_q15.c)
+target_sources(CMSISDSPFastMath PRIVATE arm_sqrt_q31.c)
+
+
+configdsp(CMSISDSPFastMath ..)
+
+### Includes
+target_include_directories(CMSISDSPFastMath PUBLIC "${DSP}/../../Include")
+
+
+
diff --git a/DSP/Source/FastMathFunctions/FastMathFunctions.c b/DSP/Source/FastMathFunctions/FastMathFunctions.c
new file mode 100644
index 0000000..abd919e
--- /dev/null
+++ b/DSP/Source/FastMathFunctions/FastMathFunctions.c
@@ -0,0 +1,37 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: FastMathFunctions.c
+ * Description: Combination of all fast math function source files.
+ *
+ * $Date: 18. March 2019
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_cos_f32.c"
+#include "arm_cos_q15.c"
+#include "arm_cos_q31.c"
+#include "arm_sin_f32.c"
+#include "arm_sin_q15.c"
+#include "arm_sin_q31.c"
+#include "arm_sqrt_q15.c"
+#include "arm_sqrt_q31.c"
+
diff --git a/DSP/Source/FastMathFunctions/arm_cos_f32.c b/DSP/Source/FastMathFunctions/arm_cos_f32.c
index e604b3c..26bd66e 100644
--- a/DSP/Source/FastMathFunctions/arm_cos_f32.c
+++ b/DSP/Source/FastMathFunctions/arm_cos_f32.c
@@ -3,13 +3,13 @@
* Title: arm_cos_f32.c
* Description: Fast cosine calculation for floating-point values
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -28,56 +28,57 @@
#include "arm_math.h"
#include "arm_common_tables.h"
+
/**
- * @ingroup groupFastMath
+ @ingroup groupFastMath
*/
/**
- * @defgroup cos Cosine
- *
- * Computes the trigonometric cosine function using a combination of table lookup
- * and linear interpolation. There are separate functions for
- * Q15, Q31, and floating-point data types.
- * The input to the floating-point version is in radians and in the range [0 2*pi) while the
- * fixed-point Q15 and Q31 have a scaled input with the range
- * [0 +0.9999] mapping to [0 2*pi). The fixed-point range is chosen so that a
- * value of 2*pi wraps around to 0.
- *
- * The implementation is based on table lookup using 256 values together with linear interpolation.
- * The steps used are:
- * -# Calculation of the nearest integer table index
- * -# Compute the fractional portion (fract) of the table index.
- * -# The final result equals (1.0f-fract)*a + fract*b;
- *
- * where
- *
- * b=Table[index+0];
- * c=Table[index+1];
- *
- */
+ @defgroup cos Cosine
- /**
- * @addtogroup cos
- * @{
+ Computes the trigonometric cosine function using a combination of table lookup
+ and linear interpolation. There are separate functions for
+ Q15, Q31, and floating-point data types.
+ The input to the floating-point version is in radians while the
+ fixed-point Q15 and Q31 have a scaled input with the range
+ [0 +0.9999] mapping to [0 2*pi). The fixed-point range is chosen so that a
+ value of 2*pi wraps around to 0.
+
+ The implementation is based on table lookup using 256 values together with linear interpolation.
+ The steps used are:
+ -# Calculation of the nearest integer table index
+ -# Compute the fractional portion (fract) of the table index.
+ -# The final result equals (1.0f-fract)*a + fract*b;
+
+ where
+
+ b = Table[index];
+ c = Table[index+1];
+
*/
/**
- * @brief Fast approximation to the trigonometric cosine function for floating-point data.
- * @param[in] x input value in radians.
- * @return cos(x).
+ @addtogroup cos
+ @{
+ */
+
+/**
+ @brief Fast approximation to the trigonometric cosine function for floating-point data.
+ @param[in] x input value in radians
+ @return cos(x)
*/
float32_t arm_cos_f32(
float32_t x)
{
- float32_t cosVal, fract, in; /* Temporary variables for input, output */
+ float32_t cosVal, fract, in; /* Temporary input, output variables */
uint16_t index; /* Index variable */
float32_t a, b; /* Two nearest output values */
int32_t n;
float32_t findex;
/* input x is in radians */
- /* Scale the input to [0 1] range from [0 2*PI] , divide input by 2*pi, add 0.25 (pi/2) to read sine table */
+ /* Scale input to [0 1] range from [0 2*PI] , divide input by 2*pi, add 0.25 (pi/2) to read sine table */
in = x * 0.159154943092f + 0.25f;
/* Calculation of floor value of input */
@@ -93,8 +94,14 @@ float32_t arm_cos_f32(
in = in - (float32_t) n;
/* Calculation of index of the table */
- findex = (float32_t) FAST_MATH_TABLE_SIZE * in;
- index = ((uint16_t)findex) & 0x1ff;
+ findex = (float32_t)FAST_MATH_TABLE_SIZE * in;
+ index = (uint16_t)findex;
+
+ /* when "in" is exactly 1, we need to rotate the index down to 0 */
+ if (index >= FAST_MATH_TABLE_SIZE) {
+ index = 0;
+ findex -= (float32_t)FAST_MATH_TABLE_SIZE;
+ }
/* fractional value calculation */
fract = findex - (float32_t) index;
@@ -104,12 +111,12 @@ float32_t arm_cos_f32(
b = sinTable_f32[index+1];
/* Linear interpolation process */
- cosVal = (1.0f-fract)*a + fract*b;
+ cosVal = (1.0f - fract) * a + fract * b;
- /* Return the output value */
+ /* Return output value */
return (cosVal);
}
/**
- * @} end of cos group
+ @} end of cos group
*/
diff --git a/DSP/Source/FastMathFunctions/arm_cos_q15.c b/DSP/Source/FastMathFunctions/arm_cos_q15.c
index 7fa2e18..3bb829c 100644
--- a/DSP/Source/FastMathFunctions/arm_cos_q15.c
+++ b/DSP/Source/FastMathFunctions/arm_cos_q15.c
@@ -3,13 +3,13 @@
* Title: arm_cos_q15.c
* Description: Fast cosine calculation for Q15 values
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -30,36 +30,35 @@
#include "arm_common_tables.h"
/**
- * @ingroup groupFastMath
- */
-
- /**
- * @addtogroup cos
- * @{
+ @ingroup groupFastMath
*/
/**
- * @brief Fast approximation to the trigonometric cosine function for Q15 data.
- * @param[in] x Scaled input value in radians.
- * @return cos(x).
- *
- * The Q15 input value is in the range [0 +0.9999] and is mapped to a radian
- * value in the range [0 2*pi).
+ @addtogroup cos
+ @{
+ */
+
+/**
+ @brief Fast approximation to the trigonometric cosine function for Q15 data.
+ @param[in] x Scaled input value in radians
+ @return cos(x)
+
+ The Q15 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*PI).
*/
q15_t arm_cos_q15(
q15_t x)
{
- q15_t cosVal; /* Temporary variables for input, output */
- int32_t index; /* Index variables */
- q15_t a, b; /* Four nearest output values */
+ q15_t cosVal; /* Temporary input, output variables */
+ int32_t index; /* Index variable */
+ q15_t a, b; /* Two nearest output values */
q15_t fract; /* Temporary values for fractional values */
/* add 0.25 (pi/2) to read sine table */
x = (uint16_t)x + 0x2000;
if (x < 0)
- { /* convert negative numbers to corresponding positive ones */
- x = (uint16_t)x + 0x8000;
+ { /* convert negative numbers to corresponding positive ones */
+ x = (uint16_t)x + 0x8000;
}
/* Calculate the nearest index */
@@ -73,12 +72,13 @@ q15_t arm_cos_q15(
b = sinTable_q15[index+1];
/* Linear interpolation process */
- cosVal = (q31_t)(0x8000-fract)*a >> 16;
- cosVal = (q15_t)((((q31_t)cosVal << 16) + ((q31_t)fract*b)) >> 16);
+ cosVal = (q31_t) (0x8000 - fract) * a >> 16;
+ cosVal = (q15_t) ((((q31_t) cosVal << 16) + ((q31_t) fract * b)) >> 16);
- return cosVal << 1;
+ /* Return output value */
+ return (cosVal << 1);
}
/**
- * @} end of cos group
+ @} end of cos group
*/
diff --git a/DSP/Source/FastMathFunctions/arm_cos_q31.c b/DSP/Source/FastMathFunctions/arm_cos_q31.c
index fde5368..8b7ff78 100644
--- a/DSP/Source/FastMathFunctions/arm_cos_q31.c
+++ b/DSP/Source/FastMathFunctions/arm_cos_q31.c
@@ -3,13 +3,13 @@
* Title: arm_cos_q31.c
* Description: Fast cosine calculation for Q31 values
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -30,36 +30,35 @@
#include "arm_common_tables.h"
/**
- * @ingroup groupFastMath
- */
-
- /**
- * @addtogroup cos
- * @{
+ @ingroup groupFastMath
*/
/**
- * @brief Fast approximation to the trigonometric cosine function for Q31 data.
- * @param[in] x Scaled input value in radians.
- * @return cos(x).
- *
- * The Q31 input value is in the range [0 +0.9999] and is mapped to a radian
- * value in the range [0 2*pi).
+ @addtogroup cos
+ @{
+ */
+
+/**
+ @brief Fast approximation to the trigonometric cosine function for Q31 data.
+ @param[in] x Scaled input value in radians
+ @return cos(x)
+
+ The Q31 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*PI).
*/
q31_t arm_cos_q31(
q31_t x)
{
- q31_t cosVal; /* Temporary variables for input, output */
- int32_t index; /* Index variables */
- q31_t a, b; /* Four nearest output values */
+ q31_t cosVal; /* Temporary input, output variables */
+ int32_t index; /* Index variable */
+ q31_t a, b; /* Two nearest output values */
q31_t fract; /* Temporary values for fractional values */
/* add 0.25 (pi/2) to read sine table */
x = (uint32_t)x + 0x20000000;
if (x < 0)
- { /* convert negative numbers to corresponding positive ones */
- x = (uint32_t)x + 0x80000000;
+ { /* convert negative numbers to corresponding positive ones */
+ x = (uint32_t)x + 0x80000000;
}
/* Calculate the nearest index */
@@ -73,12 +72,13 @@ q31_t arm_cos_q31(
b = sinTable_q31[index+1];
/* Linear interpolation process */
- cosVal = (q63_t)(0x80000000-fract)*a >> 32;
- cosVal = (q31_t)((((q63_t)cosVal << 32) + ((q63_t)fract*b)) >> 32);
+ cosVal = (q63_t) (0x80000000 - fract) * a >> 32;
+ cosVal = (q31_t) ((((q63_t) cosVal << 32) + ((q63_t) fract * b)) >> 32);
- return cosVal << 1;
+ /* Return output value */
+ return (cosVal << 1);
}
/**
- * @} end of cos group
+ @} end of cos group
*/
diff --git a/DSP/Source/FastMathFunctions/arm_sin_f32.c b/DSP/Source/FastMathFunctions/arm_sin_f32.c
index ce8b9b9..97c6902 100644
--- a/DSP/Source/FastMathFunctions/arm_sin_f32.c
+++ b/DSP/Source/FastMathFunctions/arm_sin_f32.c
@@ -3,13 +3,13 @@
* Title: arm_sin_f32.c
* Description: Fast sine calculation for floating-point values
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -28,70 +28,64 @@
#include "arm_math.h"
#include "arm_common_tables.h"
-#include
/**
- * @ingroup groupFastMath
+ @ingroup groupFastMath
*/
/**
- * @defgroup sin Sine
- *
- * Computes the trigonometric sine function using a combination of table lookup
- * and linear interpolation. There are separate functions for
- * Q15, Q31, and floating-point data types.
- * The input to the floating-point version is in radians and in the range [0 2*pi) while the
- * fixed-point Q15 and Q31 have a scaled input with the range
- * [0 +0.9999] mapping to [0 2*pi). The fixed-point range is chosen so that a
- * value of 2*pi wraps around to 0.
- *
- * The implementation is based on table lookup using 256 values together with linear interpolation.
- * The steps used are:
- * -# Calculation of the nearest integer table index
- * -# Compute the fractional portion (fract) of the table index.
- * -# The final result equals (1.0f-fract)*a + fract*b;
- *
- * where
- *
- * b=Table[index+0];
- * c=Table[index+1];
- *
+ @defgroup sin Sine
+
+ Computes the trigonometric sine function using a combination of table lookup
+ and linear interpolation. There are separate functions for
+ Q15, Q31, and floating-point data types.
+ The input to the floating-point version is in radians while the
+ fixed-point Q15 and Q31 have a scaled input with the range
+ [0 +0.9999] mapping to [0 2*pi). The fixed-point range is chosen so that a
+ value of 2*pi wraps around to 0.
+
+ The implementation is based on table lookup using 256 values together with linear interpolation.
+ The steps used are:
+ -# Calculation of the nearest integer table index
+ -# Compute the fractional portion (fract) of the table index.
+ -# The final result equals (1.0f-fract)*a + fract*b;
+
+ where
+
+ b = Table[index];
+ c = Table[index+1];
+
*/
/**
- * @addtogroup sin
- * @{
+ @addtogroup sin
+ @{
*/
/**
- * @brief Fast approximation to the trigonometric sine function for floating-point data.
- * @param[in] x input value in radians.
- * @return sin(x).
+ @brief Fast approximation to the trigonometric sine function for floating-point data.
+ @param[in] x input value in radians.
+ @return sin(x)
*/
float32_t arm_sin_f32(
float32_t x)
{
- float32_t sinVal, fract, in; /* Temporary variables for input, output */
- uint16_t index; /* Index variable */
- float32_t a, b; /* Two nearest output values */
+ float32_t sinVal, fract, in; /* Temporary input, output variables */
+ uint16_t index; /* Index variable */
+ float32_t a, b; /* Two nearest output values */
int32_t n;
float32_t findex;
- /* Special case for small negative inputs */
- if ((x < 0.0f) && (x >= -1.9e-7f)) {
- return x;
- }
-
/* input x is in radians */
- /* Scale the input to [0 1] range from [0 2*PI] , divide input by 2*pi */
+ /* Scale input to [0 1] range from [0 2*PI] , divide input by 2*pi */
in = x * 0.159154943092f;
/* Calculation of floor value of input */
n = (int32_t) in;
/* Make negative values towards -infinity */
- if (x < 0.0f)
+ if (in < 0.0f)
{
n--;
}
@@ -100,9 +94,14 @@ float32_t arm_sin_f32(
in = in - (float32_t) n;
/* Calculation of index of the table */
- findex = (float32_t) FAST_MATH_TABLE_SIZE * in;
+ findex = (float32_t)FAST_MATH_TABLE_SIZE * in;
+ index = (uint16_t)findex;
- index = ((uint16_t)findex) & 0x1ff;
+ /* when "in" is exactly 1, we need to rotate the index down to 0 */
+ if (index >= FAST_MATH_TABLE_SIZE) {
+ index = 0;
+ findex -= (float32_t)FAST_MATH_TABLE_SIZE;
+ }
/* fractional value calculation */
fract = findex - (float32_t) index;
@@ -112,12 +111,12 @@ float32_t arm_sin_f32(
b = sinTable_f32[index+1];
/* Linear interpolation process */
- sinVal = (1.0f-fract)*a + fract*b;
+ sinVal = (1.0f - fract) * a + fract * b;
- /* Return the output value */
+ /* Return output value */
return (sinVal);
}
/**
- * @} end of sin group
+ @} end of sin group
*/
diff --git a/DSP/Source/FastMathFunctions/arm_sin_q15.c b/DSP/Source/FastMathFunctions/arm_sin_q15.c
index 7c8f627..1f0c2bf 100644
--- a/DSP/Source/FastMathFunctions/arm_sin_q15.c
+++ b/DSP/Source/FastMathFunctions/arm_sin_q15.c
@@ -3,13 +3,13 @@
* Title: arm_sin_q15.c
* Description: Fast sine calculation for Q15 values
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -30,28 +30,28 @@
#include "arm_common_tables.h"
/**
- * @ingroup groupFastMath
- */
-
- /**
- * @addtogroup sin
- * @{
+ @ingroup groupFastMath
*/
/**
- * @brief Fast approximation to the trigonometric sine function for Q15 data.
- * @param[in] x Scaled input value in radians.
- * @return sin(x).
- *
- * The Q15 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*pi).
+ @addtogroup sin
+ @{
+ */
+
+/**
+ @brief Fast approximation to the trigonometric sine function for Q15 data.
+ @param[in] x Scaled input value in radians
+ @return sin(x)
+
+ The Q15 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*PI).
*/
q15_t arm_sin_q15(
q15_t x)
{
- q15_t sinVal; /* Temporary variables for input, output */
- int32_t index; /* Index variables */
- q15_t a, b; /* Four nearest output values */
+ q15_t sinVal; /* Temporary input, output variables */
+ int32_t index; /* Index variable */
+ q15_t a, b; /* Two nearest output values */
q15_t fract; /* Temporary values for fractional values */
/* Calculate the nearest index */
@@ -65,12 +65,13 @@ q15_t arm_sin_q15(
b = sinTable_q15[index+1];
/* Linear interpolation process */
- sinVal = (q31_t)(0x8000-fract)*a >> 16;
- sinVal = (q15_t)((((q31_t)sinVal << 16) + ((q31_t)fract*b)) >> 16);
+ sinVal = (q31_t) (0x8000 - fract) * a >> 16;
+ sinVal = (q15_t) ((((q31_t) sinVal << 16) + ((q31_t) fract * b)) >> 16);
- return sinVal << 1;
+ /* Return output value */
+ return (sinVal << 1);
}
/**
- * @} end of sin group
+ @} end of sin group
*/
diff --git a/DSP/Source/FastMathFunctions/arm_sin_q31.c b/DSP/Source/FastMathFunctions/arm_sin_q31.c
index 8d3c7ac..8cefabb 100644
--- a/DSP/Source/FastMathFunctions/arm_sin_q31.c
+++ b/DSP/Source/FastMathFunctions/arm_sin_q31.c
@@ -3,13 +3,13 @@
* Title: arm_sin_q31.c
* Description: Fast sine calculation for Q31 values
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -30,27 +30,28 @@
#include "arm_common_tables.h"
/**
- * @ingroup groupFastMath
- */
-
- /**
- * @addtogroup sin
- * @{
+ @ingroup groupFastMath
*/
/**
- * @brief Fast approximation to the trigonometric sine function for Q31 data.
- * @param[in] x Scaled input value in radians.
- * @return sin(x).
- *
- * The Q31 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*pi). */
+ @addtogroup sin
+ @{
+ */
+
+/**
+ @brief Fast approximation to the trigonometric sine function for Q31 data.
+ @param[in] x Scaled input value in radians
+ @return sin(x)
+
+ The Q31 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*PI).
+ */
q31_t arm_sin_q31(
q31_t x)
{
q31_t sinVal; /* Temporary variables for input, output */
- int32_t index; /* Index variables */
- q31_t a, b; /* Four nearest output values */
+ int32_t index; /* Index variable */
+ q31_t a, b; /* Two nearest output values */
q31_t fract; /* Temporary values for fractional values */
/* Calculate the nearest index */
@@ -64,12 +65,13 @@ q31_t arm_sin_q31(
b = sinTable_q31[index+1];
/* Linear interpolation process */
- sinVal = (q63_t)(0x80000000-fract)*a >> 32;
- sinVal = (q31_t)((((q63_t)sinVal << 32) + ((q63_t)fract*b)) >> 32);
+ sinVal = (q63_t) (0x80000000 - fract) * a >> 32;
+ sinVal = (q31_t) ((((q63_t) sinVal << 32) + ((q63_t) fract * b)) >> 32);
- return sinVal << 1;
+ /* Return output value */
+ return (sinVal << 1);
}
/**
- * @} end of sin group
+ @} end of sin group
*/
diff --git a/DSP/Source/FastMathFunctions/arm_sqrt_q15.c b/DSP/Source/FastMathFunctions/arm_sqrt_q15.c
index 8487ed3..fab0a32 100644
--- a/DSP/Source/FastMathFunctions/arm_sqrt_q15.c
+++ b/DSP/Source/FastMathFunctions/arm_sqrt_q15.c
@@ -3,13 +3,13 @@
* Title: arm_sqrt_q15.c
* Description: Q15 square root function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,31 +29,30 @@
#include "arm_math.h"
#include "arm_common_tables.h"
-
/**
- * @ingroup groupFastMath
+ @ingroup groupFastMath
*/
/**
- * @addtogroup SQRT
- * @{
+ @addtogroup SQRT
+ @{
*/
- /**
- * @brief Q15 square root function.
- * @param[in] in input value. The range of the input value is [0 +1) or 0x0000 to 0x7FFF.
- * @param[out] *pOut square root of input value.
- * @return The function returns ARM_MATH_SUCCESS if the input value is positive
- * and ARM_MATH_ARGUMENT_ERROR if the input is negative. For
- * negative inputs, the function returns *pOut = 0.
- */
+/**
+ @brief Q15 square root function.
+ @param[in] in input value. The range of the input value is [0 +1) or 0x0000 to 0x7FFF
+ @param[out] pOut points to square root of input value
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : input value is positive
+ - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
+ */
arm_status arm_sqrt_q15(
q15_t in,
q15_t * pOut)
{
- q15_t number, temp1, var1, signBits1, half;
q31_t bits_val1;
+ q15_t number, temp1, var1, signBits1, half;
float32_t temp_float1;
union
{
@@ -85,7 +84,7 @@ arm_status arm_sqrt_q15(
/* Convert to float */
temp_float1 = number * 3.051757812500000e-005f;
- /*Store as integer */
+ /* Store as integer */
tempconv.floatval = temp_float1;
bits_val1 = tempconv.fracval;
/* Subtract the shifted value from the magic number to give intial guess */
@@ -135,10 +134,11 @@ arm_status arm_sqrt_q15(
else
{
*pOut = 0;
+
return (ARM_MATH_ARGUMENT_ERROR);
}
}
/**
- * @} end of SQRT group
+ @} end of SQRT group
*/
diff --git a/DSP/Source/FastMathFunctions/arm_sqrt_q31.c b/DSP/Source/FastMathFunctions/arm_sqrt_q31.c
index 0deea04..9889b13 100644
--- a/DSP/Source/FastMathFunctions/arm_sqrt_q31.c
+++ b/DSP/Source/FastMathFunctions/arm_sqrt_q31.c
@@ -3,13 +3,13 @@
* Title: arm_sqrt_q31.c
* Description: Q31 square root function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -30,33 +30,34 @@
#include "arm_common_tables.h"
/**
- * @ingroup groupFastMath
+ @ingroup groupFastMath
*/
/**
- * @addtogroup SQRT
- * @{
+ @addtogroup SQRT
+ @{
*/
/**
- * @brief Q31 square root function.
- * @param[in] in input value. The range of the input value is [0 +1) or 0x00000000 to 0x7FFFFFFF.
- * @param[out] *pOut square root of input value.
- * @return The function returns ARM_MATH_SUCCESS if the input value is positive
- * and ARM_MATH_ARGUMENT_ERROR if the input is negative. For
- * negative inputs, the function returns *pOut = 0.
+ @brief Q31 square root function.
+ @param[in] in input value. The range of the input value is [0 +1) or 0x00000000 to 0x7FFFFFFF
+ @param[out] pOut points to square root of input value
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : input value is positive
+ - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
*/
arm_status arm_sqrt_q31(
q31_t in,
q31_t * pOut)
{
- q31_t number, temp1, bits_val1, var1, signBits1, half;
+ q31_t bits_val1;
+ q31_t number, temp1, var1, signBits1, half;
float32_t temp_float1;
union
{
- q31_t fracval;
- float32_t floatval;
+ q31_t fracval;
+ float32_t floatval;
} tempconv;
number = in;
@@ -81,9 +82,9 @@ arm_status arm_sqrt_q31(
/* Store the number for later use */
temp1 = number;
- /*Convert to float */
+ /* Convert to float */
temp_float1 = number * 4.6566128731e-010f;
- /*Store as integer */
+ /* Store as integer */
tempconv.floatval = temp_float1;
bits_val1 = tempconv.fracval;
/* Subtract the shifted value from the magic number to give intial guess */
@@ -133,10 +134,11 @@ arm_status arm_sqrt_q31(
else
{
*pOut = 0;
+
return (ARM_MATH_ARGUMENT_ERROR);
}
}
/**
- * @} end of SQRT group
+ @} end of SQRT group
*/
diff --git a/DSP/Source/FilteringFunctions/CMakeLists.txt b/DSP/Source/FilteringFunctions/CMakeLists.txt
new file mode 100644
index 0000000..59471ad
--- /dev/null
+++ b/DSP/Source/FilteringFunctions/CMakeLists.txt
@@ -0,0 +1,128 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPFiltering)
+
+
+add_library(CMSISDSPFiltering STATIC)
+
+include(interpol)
+interpol(CMSISDSPFiltering)
+
+configdsp(CMSISDSPFiltering ..)
+
+if (CONFIGTABLE AND ALLFAST)
+target_compile_definitions(CMSISDSPFiltering PUBLIC ARM_ALL_FAST_TABLES)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_LMS_NORM_Q31)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_init_q31.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_LMS_NORM_Q15)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_init_q15.c)
+endif()
+
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_32x64_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_32x64_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_fast_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_fast_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_init_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df2T_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df2T_f64.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df2T_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df2T_init_f64.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_stereo_df2T_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_stereo_df2T_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_fast_opt_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_fast_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_fast_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_opt_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_opt_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_fast_opt_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_fast_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_fast_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_opt_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_opt_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_fast_opt_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_fast_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_fast_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_opt_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_opt_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_fast_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_fast_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_init_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_fast_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_fast_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_init_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_init_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_init_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_init_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_init_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_init_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_init_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_init_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_q31.c)
+
+
+### Includes
+target_include_directories(CMSISDSPFiltering PUBLIC "${DSP}/../../Include")
+
+
+
diff --git a/DSP/Source/FilteringFunctions/FilteringFunctions.c b/DSP/Source/FilteringFunctions/FilteringFunctions.c
new file mode 100644
index 0000000..7ce0cdb
--- /dev/null
+++ b/DSP/Source/FilteringFunctions/FilteringFunctions.c
@@ -0,0 +1,127 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: FilteringFunctions.c
+ * Description: Combination of all filtering function source files.
+ *
+ * $Date: 18. March 2019
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_biquad_cascade_df1_32x64_init_q31.c"
+#include "arm_biquad_cascade_df1_32x64_q31.c"
+#include "arm_biquad_cascade_df1_f32.c"
+#include "arm_biquad_cascade_df1_fast_q15.c"
+#include "arm_biquad_cascade_df1_fast_q31.c"
+#include "arm_biquad_cascade_df1_init_f32.c"
+#include "arm_biquad_cascade_df1_init_q15.c"
+#include "arm_biquad_cascade_df1_init_q31.c"
+#include "arm_biquad_cascade_df1_q15.c"
+#include "arm_biquad_cascade_df1_q31.c"
+#include "arm_biquad_cascade_df2T_f32.c"
+#include "arm_biquad_cascade_df2T_f64.c"
+#include "arm_biquad_cascade_df2T_init_f32.c"
+#include "arm_biquad_cascade_df2T_init_f64.c"
+#include "arm_biquad_cascade_stereo_df2T_f32.c"
+#include "arm_biquad_cascade_stereo_df2T_init_f32.c"
+#include "arm_conv_f32.c"
+#include "arm_conv_fast_opt_q15.c"
+#include "arm_conv_fast_q15.c"
+#include "arm_conv_fast_q31.c"
+#include "arm_conv_opt_q15.c"
+#include "arm_conv_opt_q7.c"
+#include "arm_conv_partial_f32.c"
+#include "arm_conv_partial_fast_opt_q15.c"
+#include "arm_conv_partial_fast_q15.c"
+#include "arm_conv_partial_fast_q31.c"
+#include "arm_conv_partial_opt_q15.c"
+#include "arm_conv_partial_opt_q7.c"
+#include "arm_conv_partial_q15.c"
+#include "arm_conv_partial_q31.c"
+#include "arm_conv_partial_q7.c"
+#include "arm_conv_q15.c"
+#include "arm_conv_q31.c"
+#include "arm_conv_q7.c"
+#include "arm_correlate_f32.c"
+#include "arm_correlate_fast_opt_q15.c"
+#include "arm_correlate_fast_q15.c"
+#include "arm_correlate_fast_q31.c"
+#include "arm_correlate_opt_q15.c"
+#include "arm_correlate_opt_q7.c"
+#include "arm_correlate_q15.c"
+#include "arm_correlate_q31.c"
+#include "arm_correlate_q7.c"
+#include "arm_fir_decimate_f32.c"
+#include "arm_fir_decimate_fast_q15.c"
+#include "arm_fir_decimate_fast_q31.c"
+#include "arm_fir_decimate_init_f32.c"
+#include "arm_fir_decimate_init_q15.c"
+#include "arm_fir_decimate_init_q31.c"
+#include "arm_fir_decimate_q15.c"
+#include "arm_fir_decimate_q31.c"
+#include "arm_fir_f32.c"
+#include "arm_fir_fast_q15.c"
+#include "arm_fir_fast_q31.c"
+#include "arm_fir_init_f32.c"
+#include "arm_fir_init_q15.c"
+#include "arm_fir_init_q31.c"
+#include "arm_fir_init_q7.c"
+#include "arm_fir_interpolate_f32.c"
+#include "arm_fir_interpolate_init_f32.c"
+#include "arm_fir_interpolate_init_q15.c"
+#include "arm_fir_interpolate_init_q31.c"
+#include "arm_fir_interpolate_q15.c"
+#include "arm_fir_interpolate_q31.c"
+#include "arm_fir_lattice_f32.c"
+#include "arm_fir_lattice_init_f32.c"
+#include "arm_fir_lattice_init_q15.c"
+#include "arm_fir_lattice_init_q31.c"
+#include "arm_fir_lattice_q15.c"
+#include "arm_fir_lattice_q31.c"
+#include "arm_fir_q15.c"
+#include "arm_fir_q31.c"
+#include "arm_fir_q7.c"
+#include "arm_fir_sparse_f32.c"
+#include "arm_fir_sparse_init_f32.c"
+#include "arm_fir_sparse_init_q15.c"
+#include "arm_fir_sparse_init_q31.c"
+#include "arm_fir_sparse_init_q7.c"
+#include "arm_fir_sparse_q15.c"
+#include "arm_fir_sparse_q31.c"
+#include "arm_fir_sparse_q7.c"
+#include "arm_iir_lattice_f32.c"
+#include "arm_iir_lattice_init_f32.c"
+#include "arm_iir_lattice_init_q15.c"
+#include "arm_iir_lattice_init_q31.c"
+#include "arm_iir_lattice_q15.c"
+#include "arm_iir_lattice_q31.c"
+#include "arm_lms_f32.c"
+#include "arm_lms_init_f32.c"
+#include "arm_lms_init_q15.c"
+#include "arm_lms_init_q31.c"
+#include "arm_lms_norm_f32.c"
+#include "arm_lms_norm_init_f32.c"
+#include "arm_lms_norm_init_q15.c"
+#include "arm_lms_norm_init_q31.c"
+#include "arm_lms_norm_q15.c"
+#include "arm_lms_norm_q31.c"
+#include "arm_lms_q15.c"
+#include "arm_lms_q31.c"
diff --git a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c
index 8f92496..ac2313f 100644
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c
@@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_32x64_init_q31.c
* Description: High precision Q31 Biquad cascade filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,53 +29,49 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup BiquadCascadeDF1_32x64
- * @{
+ @addtogroup BiquadCascadeDF1_32x64
+ @{
*/
/**
- * @details
- *
- * @param[in,out] *S points to an instance of the high precision Q31 Biquad cascade filter structure.
- * @param[in] numStages number of 2nd order stages in the filter.
- * @param[in] *pCoeffs points to the filter coefficients.
- * @param[in] *pState points to the state buffer.
- * @param[in] postShift Shift to be applied after the accumulator. Varies according to the coefficients format.
- * @return none
- *
- * Coefficient and State Ordering:
- *
- * \par
- * The coefficients are stored in the array pCoeffs in the following order:
- *
- * where b1x and a1x are the coefficients for the first stage,
- * b2x and a2x are the coefficients for the second stage,
- * and so on. The pCoeffs array contains a total of 5*numStages values.
- *
- * \par
- * The pState points to state variables array and size of each state variable is 1.63 format.
- * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2].
- * The state variables are arranged in the state array as:
- *
- * {x[n-1], x[n-2], y[n-1], y[n-2]}
- *
- * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
- * The state array has a total length of 4*numStages values.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ @brief Initialization function for the Q31 Biquad cascade 32x64 filter.
+ @param[in,out] S points to an instance of the high precision Q31 Biquad cascade filter structure
+ @param[in] numStages number of 2nd order stages in the filter
+ @param[in] pCoeffs points to the filter coefficients
+ @param[in] pState points to the state buffer
+ @param[in] postShift Shift to be applied after the accumulator. Varies according to the coefficients format
+ @return none
+
+ @par Coefficient and State Ordering
+ The coefficients are stored in the array pCoeffs in the following order:
+
+ where b1x and a1x are the coefficients for the first stage,
+ b2x and a2x are the coefficients for the second stage,
+ and so on. The pCoeffs array contains a total of 5*numStages values.
+ @par
+ The pState points to state variables array and size of each state variable is 1.63 format.
+ Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2].
+ The state variables are arranged in the state array as:
+
+ {x[n-1], x[n-2], y[n-1], y[n-2]}
+
+ The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+ The state array has a total length of 4*numStages values.
+ The state variables are updated after each block of data is processed; the coefficients are untouched.
*/
void arm_biquad_cas_df1_32x64_init_q31(
- arm_biquad_cas_df1_32x64_ins_q31 * S,
- uint8_t numStages,
- q31_t * pCoeffs,
- q63_t * pState,
- uint8_t postShift)
+ arm_biquad_cas_df1_32x64_ins_q31 * S,
+ uint8_t numStages,
+ const q31_t * pCoeffs,
+ q63_t * pState,
+ uint8_t postShift)
{
/* Assign filter stages */
S->numStages = numStages;
@@ -94,5 +90,5 @@ void arm_biquad_cas_df1_32x64_init_q31(
}
/**
- * @} end of BiquadCascadeDF1_32x64 group
+ @} end of BiquadCascadeDF1_32x64 group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c
index c77cc8e..9a284b8 100644
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c
@@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_32x64_q31.c
* Description: High precision Q31 Biquad cascade filter processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,174 +29,169 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @defgroup BiquadCascadeDF1_32x64 High Precision Q31 Biquad Cascade Filter
- *
- * This function implements a high precision Biquad cascade filter which operates on
- * Q31 data values. The filter coefficients are in 1.31 format and the state variables
- * are in 1.63 format. The double precision state variables reduce quantization noise
- * in the filter and provide a cleaner output.
- * These filters are particularly useful when implementing filters in which the
- * singularities are close to the unit circle. This is common for low pass or high
- * pass filters with very low cutoff frequencies.
- *
- * The function operates on blocks of input and output data
- * and each call to the function processes blockSize samples through
- * the filter. pSrc and pDst points to input and output arrays
- * containing blockSize Q31 values.
- *
- * \par Algorithm
- * Each Biquad stage implements a second order filter using the difference equation:
- *
- * A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
- * \image html Biquad.gif "Single Biquad filter stage"
- * Coefficients b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients.
- * Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients.
- * Pay careful attention to the sign of the feedback coefficients.
- * Some design tools use the difference equation
- *
- * In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library.
- *
- * \par
- * Higher order filters are realized as a cascade of second order sections.
- * numStages refers to the number of second order stages used.
- * For example, an 8th order filter would be realized with numStages=4 second order stages.
- * \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
- * A 9th order filter would be realized with numStages=5 second order stages with the coefficients for one of the stages configured as a first order filter (b2=0 and a2=0).
- *
- * \par
- * The pState points to state variables array .
- * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2] and each state variable in 1.63 format to improve precision.
- * The state variables are arranged in the array as:
- *
- * {x[n-1], x[n-2], y[n-1], y[n-2]}
- *
- *
- * \par
- * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
- * The state array has a total length of 4*numStages values of data in 1.63 format.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
- *
- * \par Instance Structure
- * The coefficients and state variables for a filter are stored together in an instance data structure.
- * A separate instance structure must be defined for each filter.
- * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
- *
- * \par Init Function
- * There is also an associated initialization function which performs the following operations:
- * - Sets the values of the internal structure fields.
- * - Zeros out the values in the state buffer.
- * To do this manually without calling the init function, assign the follow subfields of the instance structure:
- * numStages, pCoeffs, postShift, pState. Also set all of the values in pState to zero.
- *
- * \par
- * Use of the initialization function is optional.
- * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
- * To place an instance structure into a const data section, the instance structure must be manually initialized.
- * Set the values in the state buffer to zeros before static initialization.
- * For example, to statically initialize the filter instance structure use
- *
- * where numStages is the number of Biquad stages in the filter; pState is the address of the state buffer;
- * pCoeffs is the address of the coefficient buffer; postShift shift to be applied which is described in detail below.
- * \par Fixed-Point Behavior
- * Care must be taken while using Biquad Cascade 32x64 filter function.
- * Following issues must be considered:
- * - Scaling of coefficients
- * - Filter gain
- * - Overflow and saturation
- *
- * \par
- * Filter coefficients are represented as fractional values and
- * restricted to lie in the range [-1 +1).
- * The processing function has an additional scaling parameter postShift
- * which allows the filter coefficients to exceed the range [+1 -1).
- * At the output of the filter's accumulator is a shift register which shifts the result by postShift bits.
- * \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
- * This essentially scales the filter coefficients by 2^postShift.
- * For example, to realize the coefficients
- *
- * {1.5, -0.8, 1.2, 1.6, -0.9}
- *
- * set the Coefficient array to:
- *
- * {0.75, -0.4, 0.6, 0.8, -0.45}
- *
- * and set postShift=1
- *
- * \par
- * The second thing to keep in mind is the gain through the filter.
- * The frequency response of a Biquad filter is a function of its coefficients.
- * It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies.
- * This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter.
- * To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed.
- *
- * \par
- * The third item to consider is the overflow and saturation behavior of the fixed-point Q31 version.
- * This is described in the function specific documentation below.
+ @defgroup BiquadCascadeDF1_32x64 High Precision Q31 Biquad Cascade Filter
+
+ This function implements a high precision Biquad cascade filter which operates on
+ Q31 data values. The filter coefficients are in 1.31 format and the state variables
+ are in 1.63 format. The double precision state variables reduce quantization noise
+ in the filter and provide a cleaner output.
+ These filters are particularly useful when implementing filters in which the
+ singularities are close to the unit circle. This is common for low pass or high
+ pass filters with very low cutoff frequencies.
+
+ The function operates on blocks of input and output data
+ and each call to the function processes blockSize samples through
+ the filter. pSrc and pDst points to input and output arrays
+ containing blockSize Q31 values.
+
+ @par Algorithm
+ Each Biquad stage implements a second order filter using the difference equation:
+
+ A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
+ \image html Biquad.gif "Single Biquad filter stage"
+ Coefficients b0, b1 and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients.
+ Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients.
+ Pay careful attention to the sign of the feedback coefficients.
+ Some design tools use the difference equation
+
+ In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library.
+ @par
+ Higher order filters are realized as a cascade of second order sections.
+ numStages refers to the number of second order stages used.
+ For example, an 8th order filter would be realized with numStages=4 second order stages.
+ \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
+ A 9th order filter would be realized with numStages=5 second order stages
+ with the coefficients for one of the stages configured as a first order filter
+ (b2=0 and a2=0).
+ @par
+ The pState points to state variables array.
+ Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2] and each state variable in 1.63 format to improve precision.
+ The state variables are arranged in the array as:
+
+ {x[n-1], x[n-2], y[n-1], y[n-2]}
+
+ @par
+ The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+ The state array has a total length of 4*numStages values of data in 1.63 format.
+ The state variables are updated after each block of data is processed, the coefficients are untouched.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter.
+ Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+
+ @par Init Function
+ There is also an associated initialization function which performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ To do this manually without calling the init function, assign the follow subfields of the instance structure:
+ numStages, pCoeffs, postShift, pState. Also set all of the values in pState to zero.
+
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ Set the values in the state buffer to zeros before static initialization.
+ For example, to statically initialize the filter instance structure use
+
+ where numStages is the number of Biquad stages in the filter;
+ pState is the address of the state buffer;
+ pCoeffs is the address of the coefficient buffer;
+ postShift shift to be applied which is described in detail below.
+ @par Fixed-Point Behavior
+ Care must be taken while using Biquad Cascade 32x64 filter function.
+ Following issues must be considered:
+ - Scaling of coefficients
+ - Filter gain
+ - Overflow and saturation
+
+ @par
+ Filter coefficients are represented as fractional values and
+ restricted to lie in the range [-1 +1).
+ The processing function has an additional scaling parameter postShift
+ which allows the filter coefficients to exceed the range [+1 -1).
+ At the output of the filter's accumulator is a shift register which shifts the result by postShift bits.
+ \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
+ This essentially scales the filter coefficients by 2^postShift.
+ For example, to realize the coefficients
+
+ {1.5, -0.8, 1.2, 1.6, -0.9}
+
+ set the Coefficient array to:
+
+ {0.75, -0.4, 0.6, 0.8, -0.45}
+
+ and set postShift=1
+ @par
+ The second thing to keep in mind is the gain through the filter.
+ The frequency response of a Biquad filter is a function of its coefficients.
+ It is possible for the gain through the filter to exceed 1.0 meaning that the
+ filter increases the amplitude of certain frequencies.
+ This means that an input signal with amplitude < 1.0 may result in an output > 1.0
+ and these are saturated or overflowed based on the implementation of the filter.
+ To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0
+ or the input signal must be scaled down so that the combination of input and filter are never overflowed.
+ @par
+ The third item to consider is the overflow and saturation behavior of the fixed-point Q31 version.
+ This is described in the function specific documentation below.
*/
/**
- * @addtogroup BiquadCascadeDF1_32x64
- * @{
+ @addtogroup BiquadCascadeDF1_32x64
+ @{
*/
/**
- * @details
+ @brief Processing function for the Q31 Biquad cascade 32x64 filter.
+ @param[in] S points to an instance of the high precision Q31 Biquad cascade filter
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
- * @param[in] *S points to an instance of the high precision Q31 Biquad cascade filter.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in] blockSize number of samples to process.
- * @return none.
- *
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * Thus, if the accumulator result overflows it wraps around rather than clip.
- * In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
- * After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by postShift bits and the result truncated to
- * 1.31 format by discarding the low 32 bits.
- *
- * \par
- * Two related functions are provided in the CMSIS DSP library.
- * arm_biquad_cascade_df1_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q63 accumulator.
- * arm_biquad_cascade_df1_fast_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q31 accumulator.
+ @par Details
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around rather than clip.
+ In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
+ After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by postShift bits and the result truncated to
+ 1.31 format by discarding the low 32 bits.
+ @par
+ Two related functions are provided in the CMSIS DSP library.
+ - \ref arm_biquad_cascade_df1_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q63 accumulator.
+ - \ref arm_biquad_cascade_df1_fast_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q31 accumulator.
*/
void arm_biquad_cas_df1_32x64_q31(
const arm_biquad_cas_df1_32x64_ins_q31 * S,
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
+ q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
{
- q31_t *pIn = pSrc; /* input pointer initialization */
- q31_t *pOut = pDst; /* output pointer initialization */
- q63_t *pState = S->pState; /* state pointer initialization */
- q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
- q63_t acc; /* accumulator */
- q31_t Xn1, Xn2; /* Input Filter state variables */
- q63_t Yn1, Yn2; /* Output Filter state variables */
- q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
- q31_t Xn; /* temporary input */
- int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
- uint32_t sample, stage = S->numStages; /* loop counters */
- q31_t acc_l, acc_h; /* temporary output */
- uint32_t uShift = ((uint32_t) S->postShift + 1U);
- uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
-
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q31_t *pIn = pSrc; /* input pointer initialization */
+ q31_t *pOut = pDst; /* output pointer initialization */
+ q63_t *pState = S->pState; /* state pointer initialization */
+ const q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
+ q63_t acc; /* accumulator */
+ q31_t Xn1, Xn2; /* Input Filter state variables */
+ q63_t Yn1, Yn2; /* Output Filter state variables */
+ q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
+ q31_t Xn; /* temporary input */
+ int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
+ uint32_t sample, stage = S->numStages; /* loop counters */
+ q31_t acc_l, acc_h; /* temporary output */
+ uint32_t uShift = ((uint32_t) S->postShift + 1U);
+ uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
do
{
@@ -213,16 +208,16 @@ void arm_biquad_cas_df1_32x64_q31(
Yn1 = pState[2];
Yn2 = pState[3];
+#if defined (ARM_MATH_LOOPUNROLL)
+
/* Apply loop unrolling and compute 4 output values simultaneously. */
- /* The variable acc hold output value that is being computed and
- * stored in the destination buffer
+ /* Variable acc hold output value that is being computed and stored in destination buffer
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
*/
+ /* Loop unrolling: Compute 4 outputs at a time */
sample = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (sample > 0U)
{
/* Read the input */
@@ -231,13 +226,13 @@ void arm_biquad_cas_df1_32x64_q31(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
- acc = (q63_t) Xn *b0;
+ acc = (q63_t) Xn * b0;
/* acc += b1 * x[n-1] */
- acc += (q63_t) Xn1 *b1;
+ acc += (q63_t) Xn1 * b1;
/* acc += b[2] * x[n-2] */
- acc += (q63_t) Xn2 *b2;
+ acc += (q63_t) Xn2 * b2;
/* acc += a1 * y[n-1] */
acc += mult32x64(Yn1, a1);
@@ -266,13 +261,13 @@ void arm_biquad_cas_df1_32x64_q31(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc += b1 * x[n-1] */
- acc = (q63_t) Xn *b1;
+ acc = (q63_t) Xn * b1;
/* acc = b0 * x[n] */
- acc += (q63_t) Xn2 *b0;
+ acc += (q63_t) Xn2 * b0;
/* acc += b[2] * x[n-2] */
- acc += (q63_t) Xn1 *b2;
+ acc += (q63_t) Xn1 * b2;
/* acc += a1 * y[n-1] */
acc += mult32x64(Yn2, a1);
@@ -302,13 +297,13 @@ void arm_biquad_cas_df1_32x64_q31(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
- acc = (q63_t) Xn1 *b0;
+ acc = (q63_t) Xn1 * b0;
/* acc += b1 * x[n-1] */
- acc += (q63_t) Xn2 *b1;
+ acc += (q63_t) Xn2 * b1;
/* acc += b[2] * x[n-2] */
- acc += (q63_t) Xn *b2;
+ acc += (q63_t) Xn * b2;
/* acc += a1 * y[n-1] */
acc += mult32x64(Yn1, a1);
@@ -336,13 +331,13 @@ void arm_biquad_cas_df1_32x64_q31(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
- acc = (q63_t) Xn *b0;
+ acc = (q63_t) Xn * b0;
/* acc += b1 * x[n-1] */
- acc += (q63_t) Xn1 *b1;
+ acc += (q63_t) Xn1 * b1;
/* acc += b[2] * x[n-2] */
- acc += (q63_t) Xn2 *b2;
+ acc += (q63_t) Xn2 * b2;
/* acc += a1 * y[n-1] */
acc += mult32x64(Yn2, a1);
@@ -366,139 +361,55 @@ void arm_biquad_cas_df1_32x64_q31(
*(pOut + 3U) = acc_h;
/* Every time after the output is computed state should be updated. */
- /* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
/* update output pointer */
pOut += 4U;
- /* decrement the loop counter */
+ /* decrement loop counter */
sample--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- sample = (blockSize & 0x3U);
-
- while (sample > 0U)
- {
- /* Read the input */
- Xn = *pIn++;
-
- /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
- /* acc = b0 * x[n] */
- acc = (q63_t) Xn *b0;
- /* acc += b1 * x[n-1] */
- acc += (q63_t) Xn1 *b1;
- /* acc += b[2] * x[n-2] */
- acc += (q63_t) Xn2 *b2;
- /* acc += a1 * y[n-1] */
- acc += mult32x64(Yn1, a1);
- /* acc += a2 * y[n-2] */
- acc += mult32x64(Yn2, a2);
-
- /* Every time after the output is computed state should be updated. */
- /* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
- Xn2 = Xn1;
- Xn1 = Xn;
- Yn2 = Yn1;
- /* The result is converted to 1.63, Yn1 variable is reused */
- Yn1 = acc << shift;
-
- /* Calc lower part of acc */
- acc_l = acc & 0xffffffff;
-
- /* Calc upper part of acc */
- acc_h = (acc >> 32) & 0xffffffff;
-
- /* Apply shift for lower part of acc and upper part of acc */
- acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
-
- /* Store the output in the destination buffer in 1.31 format. */
- *pOut++ = acc_h;
- /* Yn1 = acc << shift; */
-
- /* Store the output in the destination buffer in 1.31 format. */
-/* *pOut++ = (q31_t) (acc >> (32 - shift)); */
-
- /* decrement the loop counter */
- sample--;
- }
-
- /* The first stage output is given as input to the second stage. */
- pIn = pDst;
-
- /* Reset to destination buffer working pointer */
- pOut = pDst;
-
- /* Store the updated state variables back into the pState array */
- /* Store the updated state variables back into the pState array */
- *pState++ = (q63_t) Xn1;
- *pState++ = (q63_t) Xn2;
- *pState++ = Yn1;
- *pState++ = Yn2;
-
- } while (--stage);
+ /* Loop unrolling: Compute remaining outputs */
+ sample = blockSize & 0x3U;
#else
- /* Run the below code for Cortex-M0 */
-
- do
- {
- /* Reading the coefficients */
- b0 = *pCoeffs++;
- b1 = *pCoeffs++;
- b2 = *pCoeffs++;
- a1 = *pCoeffs++;
- a2 = *pCoeffs++;
-
- /* Reading the state values */
- Xn1 = pState[0];
- Xn2 = pState[1];
- Yn1 = pState[2];
- Yn2 = pState[3];
-
- /* The variable acc hold output value that is being computed and
- * stored in the destination buffer
- * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
- */
-
+ /* Initialize blkCnt with number of samples */
sample = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (sample > 0U)
{
/* Read the input */
Xn = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+
/* acc = b0 * x[n] */
- acc = (q63_t) Xn *b0;
+ acc = (q63_t) Xn * b0;
/* acc += b1 * x[n-1] */
- acc += (q63_t) Xn1 *b1;
+ acc += (q63_t) Xn1 * b1;
/* acc += b[2] * x[n-2] */
- acc += (q63_t) Xn2 *b2;
+ acc += (q63_t) Xn2 * b2;
/* acc += a1 * y[n-1] */
acc += mult32x64(Yn1, a1);
/* acc += a2 * y[n-2] */
acc += mult32x64(Yn2, a2);
/* Every time after the output is computed state should be updated. */
- /* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
Yn2 = Yn1;
@@ -517,17 +428,16 @@ void arm_biquad_cas_df1_32x64_q31(
/* Store the output in the destination buffer in 1.31 format. */
*pOut++ = acc_h;
-
/* Yn1 = acc << shift; */
/* Store the output in the destination buffer in 1.31 format. */
- /* *pOut++ = (q31_t) (acc >> (32 - shift)); */
+/* *pOut++ = (q31_t) (acc >> (32 - shift)); */
- /* decrement the loop counter */
+ /* decrement loop counter */
sample--;
}
- /* The first stage output is given as input to the second stage. */
+ /* The first stage output is given as input to the second stage. */
pIn = pDst;
/* Reset to destination buffer working pointer */
@@ -541,9 +451,8 @@ void arm_biquad_cas_df1_32x64_q31(
} while (--stage);
-#endif /* #if defined (ARM_MATH_DSP) */
}
- /**
- * @} end of BiquadCascadeDF1_32x64 group
- */
+/**
+ @} end of BiquadCascadeDF1_32x64 group
+ */
diff --git a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c
index 0ffb29e..d28509d 100644
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c
@@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_f32.c
* Description: Processing function for the floating-point Biquad cascade DirectFormI(DF1) filter
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,159 +29,308 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @defgroup BiquadCascadeDF1 Biquad Cascade IIR Filters Using Direct Form I Structure
- *
- * This set of functions implements arbitrary order recursive (IIR) filters.
- * The filters are implemented as a cascade of second order Biquad sections.
- * The functions support Q15, Q31 and floating-point data types.
- * Fast version of Q15 and Q31 also supported on CortexM4 and Cortex-M3.
- *
- * The functions operate on blocks of input and output data and each call to the function
- * processes blockSize samples through the filter.
- * pSrc points to the array of input data and
- * pDst points to the array of output data.
- * Both arrays contain blockSize values.
- *
- * \par Algorithm
- * Each Biquad stage implements a second order filter using the difference equation:
- *
- * A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
- * \image html Biquad.gif "Single Biquad filter stage"
- * Coefficients b0, b1 and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients.
- * Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients.
- * Pay careful attention to the sign of the feedback coefficients.
- * Some design tools use the difference equation
- *
- * In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library.
- *
- * \par
- * Higher order filters are realized as a cascade of second order sections.
- * numStages refers to the number of second order stages used.
- * For example, an 8th order filter would be realized with numStages=4 second order stages.
- * \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
- * A 9th order filter would be realized with numStages=5 second order stages with the coefficients for one of the stages configured as a first order filter (b2=0 and a2=0).
- *
- * \par
- * The pState points to state variables array.
- * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2].
- * The state variables are arranged in the pState array as:
- *
- * {x[n-1], x[n-2], y[n-1], y[n-2]}
- *
- *
- * \par
- * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
- * The state array has a total length of 4*numStages values.
- * The state variables are updated after each block of data is processed, the coefficients are untouched.
- *
- * \par Instance Structure
- * The coefficients and state variables for a filter are stored together in an instance data structure.
- * A separate instance structure must be defined for each filter.
- * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
- * There are separate instance structure declarations for each of the 3 supported data types.
- *
- * \par Init Functions
- * There is also an associated initialization function for each data type.
- * The initialization function performs following operations:
- * - Sets the values of the internal structure fields.
- * - Zeros out the values in the state buffer.
- * To do this manually without calling the init function, assign the follow subfields of the instance structure:
- * numStages, pCoeffs, pState. Also set all of the values in pState to zero.
- *
- * \par
- * Use of the initialization function is optional.
- * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
- * To place an instance structure into a const data section, the instance structure must be manually initialized.
- * Set the values in the state buffer to zeros before static initialization.
- * The code below statically initializes each of the 3 different data type filter instance structures
- *
- * where numStages is the number of Biquad stages in the filter; pState is the address of the state buffer;
- * pCoeffs is the address of the coefficient buffer; postShift shift to be applied.
- *
- * \par Fixed-Point Behavior
- * Care must be taken when using the Q15 and Q31 versions of the Biquad Cascade filter functions.
- * Following issues must be considered:
- * - Scaling of coefficients
- * - Filter gain
- * - Overflow and saturation
- *
- * \par
- * Scaling of coefficients:
- * Filter coefficients are represented as fractional values and
- * coefficients are restricted to lie in the range [-1 +1).
- * The fixed-point functions have an additional scaling parameter postShift
- * which allow the filter coefficients to exceed the range [+1 -1).
- * At the output of the filter's accumulator is a shift register which shifts the result by postShift bits.
- * \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
- * This essentially scales the filter coefficients by 2^postShift.
- * For example, to realize the coefficients
- *
- * {1.5, -0.8, 1.2, 1.6, -0.9}
- *
- * set the pCoeffs array to:
- *
- * {0.75, -0.4, 0.6, 0.8, -0.45}
- *
- * and set postShift=1
- *
- * \par
- * Filter gain:
- * The frequency response of a Biquad filter is a function of its coefficients.
- * It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies.
- * This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter.
- * To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed.
- *
- * \par
- * Overflow and saturation:
- * For Q15 and Q31 versions, it is described separately as part of the function specific documentation below.
+ @defgroup BiquadCascadeDF1 Biquad Cascade IIR Filters Using Direct Form I Structure
+
+ This set of functions implements arbitrary order recursive (IIR) filters.
+ The filters are implemented as a cascade of second order Biquad sections.
+ The functions support Q15, Q31 and floating-point data types.
+ Fast version of Q15 and Q31 also available.
+
+ The functions operate on blocks of input and output data and each call to the function
+ processes blockSize samples through the filter.
+ pSrc points to the array of input data and
+ pDst points to the array of output data.
+ Both arrays contain blockSize values.
+
+ @par Algorithm
+ Each Biquad stage implements a second order filter using the difference equation:
+
+ A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
+ \image html Biquad.gif "Single Biquad filter stage"
+ Coefficients b0, b1 and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients.
+ Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients.
+ Pay careful attention to the sign of the feedback coefficients.
+ Some design tools use the difference equation
+
+ In this case the feedback coefficients a1 and a2
+ must be negated when used with the CMSIS DSP Library.
+
+ @par
+ Higher order filters are realized as a cascade of second order sections.
+ numStages refers to the number of second order stages used.
+ For example, an 8th order filter would be realized with numStages=4 second order stages.
+ \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
+ A 9th order filter would be realized with numStages=5 second order stages with the coefficients for one of the stages configured as a first order filter (b2=0 and a2=0).
+
+ @par
+ The pState points to state variables array.
+ Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2].
+ The state variables are arranged in the pState array as:
+
+ {x[n-1], x[n-2], y[n-1], y[n-2]}
+
+
+ @par
+ The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+ The state array has a total length of 4*numStages values.
+ The state variables are updated after each block of data is processed, the coefficients are untouched.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter.
+ Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ There are separate instance structure declarations for each of the 3 supported data types.
+
+ @par Init Function
+ There is also an associated initialization function for each data type.
+ The initialization function performs following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ To do this manually without calling the init function, assign the follow subfields of the instance structure:
+ numStages, pCoeffs, pState. Also set all of the values in pState to zero.
+
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ Set the values in the state buffer to zeros before static initialization.
+ The code below statically initializes each of the 3 different data type filter instance structures
+
+ where numStages is the number of Biquad stages in the filter;
+ pState is the address of the state buffer;
+ pCoeffs is the address of the coefficient buffer;
+ postShift shift to be applied.
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the Q15 and Q31 versions of the Biquad Cascade filter functions.
+ Following issues must be considered:
+ - Scaling of coefficients
+ - Filter gain
+ - Overflow and saturation
+
+ @par Scaling of coefficients
+ Filter coefficients are represented as fractional values and
+ coefficients are restricted to lie in the range [-1 +1).
+ The fixed-point functions have an additional scaling parameter postShift
+ which allow the filter coefficients to exceed the range [+1 -1).
+ At the output of the filter's accumulator is a shift register which shifts the result by postShift bits.
+ \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
+ This essentially scales the filter coefficients by 2^postShift.
+ For example, to realize the coefficients
+
+ {1.5, -0.8, 1.2, 1.6, -0.9}
+
+ set the pCoeffs array to:
+
+ {0.75, -0.4, 0.6, 0.8, -0.45}
+
+ and set postShift=1
+
+ @par Filter gain
+ The frequency response of a Biquad filter is a function of its coefficients.
+ It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies.
+ This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter.
+ To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed.
+
+ @par Overflow and saturation
+ For Q15 and Q31 versions, it is described separately as part of the function specific documentation below.
*/
/**
- * @addtogroup BiquadCascadeDF1
- * @{
+ @addtogroup BiquadCascadeDF1
+ @{
*/
/**
- * @param[in] *S points to an instance of the floating-point Biquad cascade structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in] blockSize number of samples to process per call.
- * @return none.
- *
+ @brief Processing function for the floating-point Biquad cascade filter.
+ @param[in] S points to an instance of the floating-point Biquad cascade structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
*/
+#if defined(ARM_MATH_NEON)
void arm_biquad_cascade_df1_f32(
const arm_biquad_casd_df1_inst_f32 * S,
- float32_t * pSrc,
+ const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
- float32_t *pIn = pSrc; /* source pointer */
+
+ const float32_t *pIn = pSrc; /* source pointer */
float32_t *pOut = pDst; /* destination pointer */
float32_t *pState = S->pState; /* pState pointer */
- float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */
float32_t acc; /* Simulates the accumulator */
- float32_t b0, b1, b2, a1, a2; /* Filter coefficients */
- float32_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
- float32_t Xn; /* temporary input */
+
uint32_t sample, stage = S->numStages; /* loop counters */
+ float32x4_t Xn;
+ float32x2_t Yn;
+ float32x2_t a;
+ float32x4_t b;
+
+ float32x4_t x,tmp;
+ float32x2_t t;
+ float32x2x2_t y;
-#if defined (ARM_MATH_DSP)
+ float32_t Xns;
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ while (stage > 0U)
+ {
+ /* Reading the coefficients */
+ Xn = vld1q_f32(pState);
+ Yn = vld1_f32(pState + 2);
+
+ b = vld1q_f32(pCoeffs);
+ b = vrev64q_f32(b);
+ b = vcombine_f32(vget_high_f32(b), vget_low_f32(b));
+
+ a = vld1_f32(pCoeffs + 3);
+ a = vrev64_f32(a);
+ b[0] = 0.0;
+ pCoeffs += 5;
+
+ /* Reading the pState values */
+
+ /* Apply loop unrolling and compute 4 output values simultaneously. */
+ /* The variable acc hold output values that are being computed:
+ *
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ */
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ sample = blockSize >> 2U;
+
+ while (sample > 0U)
+ {
+ /* Read the first 4 inputs */
+ x = vld1q_f32(pIn);
+
+ pIn += 4;
+
+ tmp = vextq_f32(Xn, x, 1);
+ t = vmul_f32(vget_high_f32(b), vget_high_f32(tmp));
+ t = vmla_f32(t, vget_low_f32(b), vget_low_f32(tmp));
+ t = vmla_f32(t, a, Yn);
+ t = vpadd_f32(t, t);
+ Yn = vext_f32(Yn, t, 1);
+
+ tmp = vextq_f32(Xn, x, 2);
+ t = vmul_f32(vget_high_f32(b), vget_high_f32(tmp));
+ t = vmla_f32(t, vget_low_f32(b), vget_low_f32(tmp));
+ t = vmla_f32(t, a, Yn);
+ t = vpadd_f32(t, t);
+ Yn = vext_f32(Yn, t, 1);
+
+ y.val[0] = Yn;
+
+ tmp = vextq_f32(Xn, x, 3);
+ t = vmul_f32(vget_high_f32(b), vget_high_f32(tmp));
+ t = vmla_f32(t, vget_low_f32(b), vget_low_f32(tmp));
+ t = vmla_f32(t, a, Yn);
+ t = vpadd_f32(t, t);
+ Yn = vext_f32(Yn, t, 1);
+
+ Xn = x;
+ t = vmul_f32(vget_high_f32(b), vget_high_f32(Xn));
+ t = vmla_f32(t, vget_low_f32(b), vget_low_f32(Xn));
+ t = vmla_f32(t, a, Yn);
+ t = vpadd_f32(t, t);
+ Yn = vext_f32(Yn, t, 1);
+
+ y.val[1] = Yn;
+
+ tmp = vcombine_f32(y.val[0], y.val[1]);
+
+ /* Store the 4 outputs and increment the pointer */
+ vst1q_f32(pOut, tmp);
+ pOut += 4;
+
+ /* Decrement the loop counter */
+ sample--;
+ }
+
+ /* If the block size is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ sample = blockSize & 0x3U;
+
+ while (sample > 0U)
+ {
+ /* Read the input */
+ Xns = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+ acc = (b[1] * Xn[2]) + (b[2] * Xn[3]) + (b[3] * Xns) + (a[0] * Yn[0]) + (a[1] * Yn[1]);
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = acc;
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ Xn[2] = Xn[3];
+ Xn[3] = Xns;
+ Yn[0] = Yn[1];
+ Yn[1] = acc;
+
+ /* Decrement the loop counter */
+ sample--;
+
+ }
+
+ vst1q_f32(pState,vcombine_f32(vrev64_f32(vget_high_f32(Xn)),vrev64_f32(Yn)));
+ pState += 4;
+ /* Store the updated state variables back into the pState array */
+
+ /* The first stage goes from the input buffer to the output buffer. */
+ /* Subsequent numStages occur in-place in the output buffer */
+ pIn = pDst;
+
+ /* Reset the output pointer */
+ pOut = pDst;
+
+ /* Decrement the loop counter */
+ stage--;
+ }
+}
+
+#else
+void arm_biquad_cascade_df1_f32(
+ const arm_biquad_casd_df1_inst_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ const float32_t *pIn = pSrc; /* Source pointer */
+ float32_t *pOut = pDst; /* Destination pointer */
+ float32_t *pState = S->pState; /* pState pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t acc; /* Accumulator */
+ float32_t b0, b1, b2, a1, a2; /* Filter coefficients */
+ float32_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
+ float32_t Xn; /* Temporary input */
+ uint32_t sample, stage = S->numStages; /* Loop counters */
do
{
@@ -198,19 +347,20 @@ void arm_biquad_cascade_df1_f32(
Yn1 = pState[2];
Yn2 = pState[3];
+#if defined (ARM_MATH_LOOPUNROLL)
+
/* Apply loop unrolling and compute 4 output values simultaneously. */
- /* The variable acc hold output values that are being computed:
+ /* Variable acc hold output values that are being computed:
*
- * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
- * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
- * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
- * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
*/
+ /* Loop unrolling: Compute 4 outputs at a time */
sample = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (sample > 0U)
{
/* Read the first input */
@@ -219,15 +369,15 @@ void arm_biquad_cascade_df1_f32(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
Yn2 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
- /* Store the result in the accumulator in the destination buffer. */
+ /* Store output in destination buffer. */
*pOut++ = Yn2;
/* Every time after the output is computed state should be updated. */
- /* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
/* Read the second input */
Xn2 = *pIn++;
@@ -235,15 +385,15 @@ void arm_biquad_cascade_df1_f32(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
Yn1 = (b0 * Xn2) + (b1 * Xn) + (b2 * Xn1) + (a1 * Yn2) + (a2 * Yn1);
- /* Store the result in the accumulator in the destination buffer. */
+ /* Store output in destination buffer. */
*pOut++ = Yn1;
/* Every time after the output is computed state should be updated. */
- /* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
/* Read the third input */
Xn1 = *pIn++;
@@ -251,15 +401,15 @@ void arm_biquad_cascade_df1_f32(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
Yn2 = (b0 * Xn1) + (b1 * Xn2) + (b2 * Xn) + (a1 * Yn1) + (a2 * Yn2);
- /* Store the result in the accumulator in the destination buffer. */
+ /* Store output in destination buffer. */
*pOut++ = Yn2;
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
/* Read the forth input */
Xn = *pIn++;
@@ -267,97 +417,32 @@ void arm_biquad_cascade_df1_f32(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
Yn1 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn2) + (a2 * Yn1);
- /* Store the result in the accumulator in the destination buffer. */
+ /* Store output in destination buffer. */
*pOut++ = Yn1;
/* Every time after the output is computed state should be updated. */
- /* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
- /* decrement the loop counter */
+ /* decrement loop counter */
sample--;
-
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
sample = blockSize & 0x3U;
- while (sample > 0U)
- {
- /* Read the input */
- Xn = *pIn++;
-
- /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
- acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = acc;
-
- /* Every time after the output is computed state should be updated. */
- /* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
- Xn2 = Xn1;
- Xn1 = Xn;
- Yn2 = Yn1;
- Yn1 = acc;
-
- /* decrement the loop counter */
- sample--;
-
- }
-
- /* Store the updated state variables back into the pState array */
- *pState++ = Xn1;
- *pState++ = Xn2;
- *pState++ = Yn1;
- *pState++ = Yn2;
-
- /* The first stage goes from the input buffer to the output buffer. */
- /* Subsequent numStages occur in-place in the output buffer */
- pIn = pDst;
-
- /* Reset the output pointer */
- pOut = pDst;
-
- /* decrement the loop counter */
- stage--;
-
- } while (stage > 0U);
-
#else
- /* Run the below code for Cortex-M0 */
-
- do
- {
- /* Reading the coefficients */
- b0 = *pCoeffs++;
- b1 = *pCoeffs++;
- b2 = *pCoeffs++;
- a1 = *pCoeffs++;
- a2 = *pCoeffs++;
-
- /* Reading the pState values */
- Xn1 = pState[0];
- Xn2 = pState[1];
- Yn1 = pState[2];
- Yn2 = pState[3];
-
- /* The variables acc holds the output value that is computed:
- * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
- */
-
+ /* Initialize blkCnt with number of samples */
sample = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (sample > 0U)
{
/* Read the input */
@@ -366,47 +451,45 @@ void arm_biquad_cascade_df1_f32(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
- /* Store the result in the accumulator in the destination buffer. */
+ /* Store output in destination buffer. */
*pOut++ = acc;
/* Every time after the output is computed state should be updated. */
- /* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
Yn2 = Yn1;
Yn1 = acc;
- /* decrement the loop counter */
+ /* decrement loop counter */
sample--;
}
- /* Store the updated state variables back into the pState array */
+ /* Store the updated state variables back into the pState array */
*pState++ = Xn1;
*pState++ = Xn2;
*pState++ = Yn1;
*pState++ = Yn2;
- /* The first stage goes from the input buffer to the output buffer. */
- /* Subsequent numStages occur in-place in the output buffer */
+ /* The first stage goes from the input buffer to the output buffer. */
+ /* Subsequent numStages occur in-place in the output buffer */
pIn = pDst;
- /* Reset the output pointer */
+ /* Reset output pointer */
pOut = pDst;
- /* decrement the loop counter */
+ /* decrement loop counter */
stage--;
} while (stage > 0U);
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
-
- /**
- * @} end of BiquadCascadeDF1 group
- */
+#endif /* #if defined(ARM_MATH_NEON) */
+/**
+ @} end of BiquadCascadeDF1 group
+ */
diff --git a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c
index ab517d8..1a568d7 100644
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c
@@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_fast_q15.c
* Description: Fast processing function for the Q15 Biquad cascade filter
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,90 +29,87 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup BiquadCascadeDF1
- * @{
+ @addtogroup BiquadCascadeDF1
+ @{
*/
/**
- * @details
- * @param[in] *S points to an instance of the Q15 Biquad cascade structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in] blockSize number of samples to process per call.
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * This fast version uses a 32-bit accumulator with 2.30 format.
- * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * Thus, if the accumulator result overflows it wraps around and distorts the result.
- * In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25).
- * The 2.30 accumulator is then shifted by postShift bits and the result truncated to 1.15 format by discarding the low 16 bits.
- *
- * \par
- * Refer to the function arm_biquad_cascade_df1_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure.
- * Use the function arm_biquad_cascade_df1_init_q15() to initialize the filter structure.
- *
+ @brief Processing function for the Q15 Biquad cascade filter (fast variant).
+ @param[in] S points to an instance of the Q15 Biquad cascade structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process per call
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This fast version uses a 32-bit accumulator with 2.30 format.
+ The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around and distorts the result.
+ In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25).
+ The 2.30 accumulator is then shifted by postShift bits and the result truncated to 1.15 format by discarding the low 16 bits.
+ @remark
+ Refer to \ref arm_biquad_cascade_df1_q15() for a slower implementation of this function
+ which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure.
+ Use the function \ref arm_biquad_cascade_df1_init_q15() to initialize the filter structure.
*/
void arm_biquad_cascade_df1_fast_q15(
const arm_biquad_casd_df1_inst_q15 * S,
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t blockSize)
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
{
- q15_t *pIn = pSrc; /* Source pointer */
- q15_t *pOut = pDst; /* Destination pointer */
- q31_t in; /* Temporary variable to hold input value */
- q31_t out; /* Temporary variable to hold output value */
- q31_t b0; /* Temporary variable to hold bo value */
- q31_t b1, a1; /* Filter coefficients */
- q31_t state_in, state_out; /* Filter state variables */
- q31_t acc; /* Accumulator */
- int32_t shift = (int32_t) (15 - S->postShift); /* Post shift */
- q15_t *pState = S->pState; /* State pointer */
- q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- uint32_t sample, stage = S->numStages; /* Stage loop counter */
-
-
+ const q15_t *pIn = pSrc; /* Source pointer */
+ q15_t *pOut = pDst; /* Destination pointer */
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t acc; /* Accumulator */
+ q31_t in; /* Temporary variable to hold input value */
+ q31_t out; /* Temporary variable to hold output value */
+ q31_t b0; /* Temporary variable to hold bo value */
+ q31_t b1, a1; /* Filter coefficients */
+ q31_t state_in, state_out; /* Filter state variables */
+ int32_t shift = (int32_t) (15 - S->postShift); /* Post shift */
+ uint32_t sample, stage = S->numStages; /* Loop counters */
do
{
-
/* Read the b0 and 0 coefficients using SIMD */
- b0 = *__SIMD32(pCoeffs)++;
+ b0 = read_q15x2_ia ((q15_t **) &pCoeffs);
/* Read the b1 and b2 coefficients using SIMD */
- b1 = *__SIMD32(pCoeffs)++;
+ b1 = read_q15x2_ia ((q15_t **) &pCoeffs);
/* Read the a1 and a2 coefficients using SIMD */
- a1 = *__SIMD32(pCoeffs)++;
+ a1 = read_q15x2_ia ((q15_t **) &pCoeffs);
/* Read the input state values from the state buffer: x[n-1], x[n-2] */
- state_in = *__SIMD32(pState)++;
+ state_in = read_q15x2_ia (&pState);
/* Read the output state values from the state buffer: y[n-1], y[n-2] */
- state_out = *__SIMD32(pState)--;
+ state_out = read_q15x2_da (&pState);
+
+#if defined (ARM_MATH_LOOPUNROLL)
/* Apply loop unrolling and compute 2 output values simultaneously. */
- /* The variable acc hold output values that are being computed:
+ /* Variable acc hold output values that are being computed:
*
- * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
- * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
*/
+
+ /* Loop unrolling: Compute 2 outputs at a time */
sample = blockSize >> 1U;
- /* First part of the processing with loop unrolling. Compute 2 outputs at a time.
- ** a second loop below computes the remaining 1 sample. */
while (sample > 0U)
{
/* Read the input */
- in = *__SIMD32(pIn)++;
+ in = read_q15x2_ia ((q15_t **) &pIn);
/* out = b0 * x[n] + 0 * 0 */
out = __SMUAD(b0, in);
@@ -126,24 +123,20 @@ void arm_biquad_cascade_df1_fast_q15(
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
/* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
/* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
#ifndef ARM_MATH_BIG_ENDIAN
-
- state_in = __PKHBT(in, state_in, 16);
+ state_in = __PKHBT(in, state_in, 16);
state_out = __PKHBT(out, state_out, 16);
-
#else
-
- state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
+ state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
state_out = __PKHBT(state_out >> 16, (out), 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* out = b0 * x[n] + 0 * 0 */
out = __SMUADX(b0, in);
@@ -155,64 +148,53 @@ void arm_biquad_cascade_df1_fast_q15(
/* The result is converted from 3.29 to 1.31 and then saturation is applied */
out = __SSAT((acc >> shift), 16);
-
/* Store the output in the destination buffer. */
-
#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16);
-
+ write_q15x2_ia (&pOut, __PKHBT(state_out, out, 16));
#else
-
- *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16);
-
+ write_q15x2_ia (&pOut, __PKHBT(out, state_out >> 16, 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
/* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
/* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
-
#ifndef ARM_MATH_BIG_ENDIAN
-
- state_in = __PKHBT(in >> 16, state_in, 16);
+ state_in = __PKHBT(in >> 16, state_in, 16);
state_out = __PKHBT(out, state_out, 16);
+#else
+ state_in = __PKHBT(state_in >> 16, in, 16);
+ state_out = __PKHBT(state_out >> 16, out, 16);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Decrement loop counter */
+ sample--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ sample = (blockSize & 0x1U);
#else
- state_in = __PKHBT(state_in >> 16, in, 16);
- state_out = __PKHBT(state_out >> 16, out, 16);
+ /* Initialize blkCnt with number of samples */
+ sample = blockSize;
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
-
- /* Decrement the loop counter */
- sample--;
-
- }
-
- /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
- ** No loop unrolling is used. */
-
- if ((blockSize & 0x1U) != 0U)
+ while (sample > 0U)
{
/* Read the input */
in = *pIn++;
/* out = b0 * x[n] + 0 * 0 */
-
#ifndef ARM_MATH_BIG_ENDIAN
-
out = __SMUAD(b0, in);
-
#else
-
out = __SMUADX(b0, in);
-
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* acc = b1 * x[n-1], acc += b2 * x[n-2] + out */
@@ -228,46 +210,41 @@ void arm_biquad_cascade_df1_fast_q15(
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
/* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
/* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
-
#ifndef ARM_MATH_BIG_ENDIAN
-
state_in = __PKHBT(in, state_in, 16);
state_out = __PKHBT(out, state_out, 16);
-
#else
-
state_in = __PKHBT(state_in >> 16, in, 16);
state_out = __PKHBT(state_out >> 16, out, 16);
-
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ /* decrement loop counter */
+ sample--;
}
- /* The first stage goes from the input buffer to the output buffer. */
- /* Subsequent (numStages - 1) occur in-place in the output buffer */
+ /* The first stage goes from the input buffer to the output buffer. */
+ /* Subsequent (numStages - 1) occur in-place in the output buffer */
pIn = pDst;
/* Reset the output pointer */
pOut = pDst;
- /* Store the updated state variables back into the state array */
- *__SIMD32(pState)++ = state_in;
- *__SIMD32(pState)++ = state_out;
+ /* Store the updated state variables back into the state array */
+ write_q15x2_ia(&pState, state_in);
+ write_q15x2_ia(&pState, state_out);
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
stage--;
} while (stage > 0U);
}
-
/**
- * @} end of BiquadCascadeDF1 group
+ @} end of BiquadCascadeDF1 group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c
index 00dbae1..586296b 100644
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c
@@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_fast_q31.c
* Description: Processing function for the Q31 Fast Biquad cascade DirectFormI(DF1) filter
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,55 +29,52 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup BiquadCascadeDF1
- * @{
+ @addtogroup BiquadCascadeDF1
+ @{
*/
/**
- * @details
- *
- * @param[in] *S points to an instance of the Q31 Biquad cascade structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in] blockSize number of samples to process per call.
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
- * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
- * These intermediate results are added to a 2.30 accumulator.
- * Finally, the accumulator is saturated and converted to a 1.31 result.
- * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
- * In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25). Use the intialization function
- * arm_biquad_cascade_df1_init_q31() to initialize filter structure.
- *
- * \par
- * Refer to the function arm_biquad_cascade_df1_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision. Both the slow and the fast versions use the same instance structure.
- * Use the function arm_biquad_cascade_df1_init_q31() to initialize the filter structure.
+ @brief Processing function for the Q31 Biquad cascade filter (fast variant).
+ @param[in] S points to an instance of the Q31 Biquad cascade structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process per call
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This function is optimized for speed at the expense of fixed-point precision and overflow protection.
+ The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
+ These intermediate results are added to a 2.30 accumulator.
+ Finally, the accumulator is saturated and converted to a 1.31 result.
+ The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
+ In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25). Use the intialization function
+ arm_biquad_cascade_df1_init_q31() to initialize filter structure.
+ @remark
+ Refer to \ref arm_biquad_cascade_df1_q31() for a slower implementation of this function
+ which uses 64-bit accumulation to provide higher precision. Both the slow and the fast versions use the same instance structure.
+ Use the function \ref arm_biquad_cascade_df1_init_q31() to initialize the filter structure.
*/
void arm_biquad_cascade_df1_fast_q31(
const arm_biquad_casd_df1_inst_q31 * S,
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
{
- q31_t acc = 0; /* accumulator */
- q31_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */
- q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
- q31_t *pIn = pSrc; /* input pointer initialization */
- q31_t *pOut = pDst; /* output pointer initialization */
- q31_t *pState = S->pState; /* pState pointer initialization */
- q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
- q31_t Xn; /* temporary input */
- int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
- uint32_t sample, stage = S->numStages; /* loop counters */
-
+ const q31_t *pIn = pSrc; /* Source pointer */
+ q31_t *pOut = pDst; /* Destination pointer */
+ q31_t *pState = S->pState; /* pState pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t acc = 0; /* Accumulator */
+ q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
+ q31_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
+ q31_t Xn; /* Temporary input */
+ int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
+ uint32_t sample, stage = S->numStages; /* Loop counters */
do
{
@@ -88,22 +85,23 @@ void arm_biquad_cascade_df1_fast_q31(
a1 = *pCoeffs++;
a2 = *pCoeffs++;
- /* Reading the state values */
+ /* Reading the pState values */
Xn1 = pState[0];
Xn2 = pState[1];
Yn1 = pState[2];
Yn2 = pState[3];
+#if defined (ARM_MATH_LOOPUNROLL)
+
/* Apply loop unrolling and compute 4 output values simultaneously. */
- /* The variables acc ... acc3 hold output values that are being computed:
+ /* Variables acc ... acc3 hold output values that are being computed:
*
- * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
*/
+ /* Loop unrolling: Compute 4 outputs at a time */
sample = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (sample > 0U)
{
/* Read the input */
@@ -111,19 +109,19 @@ void arm_biquad_cascade_df1_fast_q31(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
- /*acc = (q31_t) (((q63_t) b1 * Xn1) >> 32);*/
+ /* acc = (q31_t) (((q63_t) b1 * Xn1) >> 32);*/
mult_32x32_keep32_R(acc, b1, Xn1);
/* acc += b1 * x[n-1] */
- /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b0 * (Xn))) >> 32);*/
+ /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b0 * (Xn))) >> 32);*/
multAcc_32x32_keep32_R(acc, b0, Xn);
/* acc += b[2] * x[n-2] */
- /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
+ /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
multAcc_32x32_keep32_R(acc, b2, Xn2);
/* acc += a1 * y[n-1] */
- /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
+ /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
multAcc_32x32_keep32_R(acc, a1, Yn1);
/* acc += a2 * y[n-2] */
- /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
+ /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
multAcc_32x32_keep32_R(acc, a2, Yn2);
/* The result is converted to 1.31 , Yn2 variable is reused */
@@ -137,19 +135,19 @@ void arm_biquad_cascade_df1_fast_q31(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
- /*acc = (q31_t) (((q63_t) b0 * (Xn2)) >> 32);*/
+ /* acc = (q31_t) (((q63_t) b0 * (Xn2)) >> 32);*/
mult_32x32_keep32_R(acc, b0, Xn2);
/* acc += b1 * x[n-1] */
- /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn))) >> 32);*/
+ /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn))) >> 32);*/
multAcc_32x32_keep32_R(acc, b1, Xn);
/* acc += b[2] * x[n-2] */
- /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn1))) >> 32);*/
+ /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn1))) >> 32);*/
multAcc_32x32_keep32_R(acc, b2, Xn1);
/* acc += a1 * y[n-1] */
- /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32);*/
+ /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32);*/
multAcc_32x32_keep32_R(acc, a1, Yn2);
/* acc += a2 * y[n-2] */
- /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32);*/
+ /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32);*/
multAcc_32x32_keep32_R(acc, a2, Yn1);
/* The result is converted to 1.31, Yn1 variable is reused */
@@ -163,19 +161,19 @@ void arm_biquad_cascade_df1_fast_q31(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
- /*acc = (q31_t) (((q63_t) b0 * (Xn1)) >> 32);*/
+ /* acc = (q31_t) (((q63_t) b0 * (Xn1)) >> 32);*/
mult_32x32_keep32_R(acc, b0, Xn1);
/* acc += b1 * x[n-1] */
- /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn2))) >> 32);*/
+ /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn2))) >> 32);*/
multAcc_32x32_keep32_R(acc, b1, Xn2);
/* acc += b[2] * x[n-2] */
- /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn))) >> 32);*/
+ /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn))) >> 32);*/
multAcc_32x32_keep32_R(acc, b2, Xn);
/* acc += a1 * y[n-1] */
- /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
+ /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
multAcc_32x32_keep32_R(acc, a1, Yn1);
/* acc += a2 * y[n-2] */
- /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
+ /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
multAcc_32x32_keep32_R(acc, a2, Yn2);
/* The result is converted to 1.31, Yn2 variable is reused */
@@ -190,7 +188,7 @@ void arm_biquad_cascade_df1_fast_q31(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
- /*acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
+ /* acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
mult_32x32_keep32_R(acc, b0, Xn);
/* acc += b1 * x[n-1] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/
@@ -207,47 +205,53 @@ void arm_biquad_cascade_df1_fast_q31(
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
- /* Xn2 = Xn1 */
+ /* Xn2 = Xn1 */
Xn2 = Xn1;
/* The result is converted to 1.31, Yn1 variable is reused */
Yn1 = acc << shift;
- /* Xn1 = Xn */
+ /* Xn1 = Xn */
Xn1 = Xn;
/* Store the output in the destination buffer. */
*(pOut + 3U) = Yn1;
pOut += 4U;
- /* decrement the loop counter */
+ /* decrement loop counter */
sample--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
sample = (blockSize & 0x3U);
- while (sample > 0U)
- {
+#else
+
+ /* Initialize blkCnt with number of samples */
+ sample = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (sample > 0U)
+ {
/* Read the input */
Xn = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
- /*acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
+ /* acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
mult_32x32_keep32_R(acc, b0, Xn);
/* acc += b1 * x[n-1] */
- /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/
+ /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/
multAcc_32x32_keep32_R(acc, b1, Xn1);
/* acc += b[2] * x[n-2] */
- /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
+ /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
multAcc_32x32_keep32_R(acc, b2, Xn2);
/* acc += a1 * y[n-1] */
- /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
+ /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
multAcc_32x32_keep32_R(acc, a1, Yn1);
/* acc += a2 * y[n-2] */
- /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
+ /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
multAcc_32x32_keep32_R(acc, a2, Yn2);
/* The result is converted to 1.31 */
@@ -255,10 +259,10 @@ void arm_biquad_cascade_df1_fast_q31(
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
Yn2 = Yn1;
@@ -267,18 +271,18 @@ void arm_biquad_cascade_df1_fast_q31(
/* Store the output in the destination buffer. */
*pOut++ = acc;
- /* decrement the loop counter */
+ /* decrement loop counter */
sample--;
- }
+ }
- /* The first stage goes from the input buffer to the output buffer. */
- /* Subsequent stages occur in-place in the output buffer */
+ /* The first stage goes from the input buffer to the output buffer. */
+ /* Subsequent stages occur in-place in the output buffer */
pIn = pDst;
/* Reset to destination pointer */
pOut = pDst;
- /* Store the updated state variables back into the pState array */
+ /* Store the updated state variables back into the pState array */
*pState++ = Xn1;
*pState++ = Xn2;
*pState++ = Yn1;
@@ -288,5 +292,5 @@ void arm_biquad_cascade_df1_fast_q31(
}
/**
- * @} end of BiquadCascadeDF1 group
- */
+ @} end of BiquadCascadeDF1 group
+ */
diff --git a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c
index 35ceed4..f51c262 100644
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c
@@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_init_f32.c
* Description: Floating-point Biquad cascade DirectFormI(DF1) filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,55 +29,49 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup BiquadCascadeDF1
- * @{
+ @addtogroup BiquadCascadeDF1
+ @{
*/
/**
- * @details
- * @brief Initialization function for the floating-point Biquad cascade filter.
- * @param[in,out] *S points to an instance of the floating-point Biquad cascade structure.
- * @param[in] numStages number of 2nd order stages in the filter.
- * @param[in] *pCoeffs points to the filter coefficients array.
- * @param[in] *pState points to the state array.
- * @return none
- *
- *
- * Coefficient and State Ordering:
- *
- * \par
- * The coefficients are stored in the array pCoeffs in the following order:
- *
- *
- * \par
- * where b1x and a1x are the coefficients for the first stage,
- * b2x and a2x are the coefficients for the second stage,
- * and so on. The pCoeffs array contains a total of 5*numStages values.
- *
- * \par
- * The pState is a pointer to state array.
- * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2].
- * The state variables are arranged in the pState array as:
- *
- * {x[n-1], x[n-2], y[n-1], y[n-2]}
- *
- * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
- * The state array has a total length of 4*numStages values.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
- *
+ @brief Initialization function for the floating-point Biquad cascade filter.
+ @param[in,out] S points to an instance of the floating-point Biquad cascade structure.
+ @param[in] numStages number of 2nd order stages in the filter.
+ @param[in] pCoeffs points to the filter coefficients.
+ @param[in] pState points to the state buffer.
+ @return none
+
+ @par Coefficient and State Ordering
+ The coefficients are stored in the array pCoeffs in the following order:
+
+
+ @par
+ where b1x and a1x are the coefficients for the first stage,
+ b2x and a2x are the coefficients for the second stage,
+ and so on. The pCoeffs array contains a total of 5*numStages values.
+ @par
+ The pState is a pointer to state array.
+ Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2].
+ The state variables are arranged in the pState array as:
+
+ {x[n-1], x[n-2], y[n-1], y[n-2]}
+
+ The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+ The state array has a total length of 4*numStages values.
+ The state variables are updated after each block of data is processed; the coefficients are untouched.
*/
void arm_biquad_cascade_df1_init_f32(
- arm_biquad_casd_df1_inst_f32 * S,
- uint8_t numStages,
- float32_t * pCoeffs,
- float32_t * pState)
+ arm_biquad_casd_df1_inst_f32 * S,
+ uint8_t numStages,
+ const float32_t * pCoeffs,
+ float32_t * pState)
{
/* Assign filter stages */
S->numStages = numStages;
@@ -93,5 +87,5 @@ void arm_biquad_cascade_df1_init_f32(
}
/**
- * @} end of BiquadCascadeDF1 group
+ @} end of BiquadCascadeDF1 group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c
index 2b3243d..c2e542c 100644
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c
@@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_init_q15.c
* Description: Q15 Biquad cascade DirectFormI(DF1) filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,54 +29,51 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup BiquadCascadeDF1
- * @{
+ @addtogroup BiquadCascadeDF1
+ @{
*/
/**
- * @details
- *
- * @param[in,out] *S points to an instance of the Q15 Biquad cascade structure.
- * @param[in] numStages number of 2nd order stages in the filter.
- * @param[in] *pCoeffs points to the filter coefficients.
- * @param[in] *pState points to the state buffer.
- * @param[in] postShift Shift to be applied to the accumulator result. Varies according to the coefficients format
- * @return none
- *
- * Coefficient and State Ordering:
- *
- * \par
- * The coefficients are stored in the array pCoeffs in the following order:
- *
- * where b1x and a1x are the coefficients for the first stage,
- * b2x and a2x are the coefficients for the second stage,
- * and so on. The pCoeffs array contains a total of 6*numStages values.
- * The zero coefficient between b1 and b2 facilities use of 16-bit SIMD instructions on the Cortex-M4.
- *
- * \par
- * The state variables are stored in the array pState.
- * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2].
- * The state variables are arranged in the pState array as:
- *
- * {x[n-1], x[n-2], y[n-1], y[n-2]}
- *
- * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
- * The state array has a total length of 4*numStages values.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ @brief Initialization function for the Q15 Biquad cascade filter.
+ @param[in,out] S points to an instance of the Q15 Biquad cascade structure.
+ @param[in] numStages number of 2nd order stages in the filter.
+ @param[in] pCoeffs points to the filter coefficients.
+ @param[in] pState points to the state buffer.
+ @param[in] postShift Shift to be applied to the accumulator result. Varies according to the coefficients format
+ @return none
+
+ @par Coefficient and State Ordering
+ The coefficients are stored in the array pCoeffs in the following order:
+
+ @par
+ where b1x and a1x are the coefficients for the first stage,
+ b2x and a2x are the coefficients for the second stage,
+ and so on. The pCoeffs array contains a total of 6*numStages values.
+ The zero coefficient between b1 and b2 facilities use of 16-bit SIMD instructions on the Cortex-M4.
+ @par
+ The state variables are stored in the array pState.
+ Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2].
+ The state variables are arranged in the pState array as:
+
+ {x[n-1], x[n-2], y[n-1], y[n-2]}
+
+ The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+ The state array has a total length of 4*numStages values.
+ The state variables are updated after each block of data is processed; the coefficients are untouched.
*/
void arm_biquad_cascade_df1_init_q15(
- arm_biquad_casd_df1_inst_q15 * S,
- uint8_t numStages,
- q15_t * pCoeffs,
- q15_t * pState,
- int8_t postShift)
+ arm_biquad_casd_df1_inst_q15 * S,
+ uint8_t numStages,
+ const q15_t * pCoeffs,
+ q15_t * pState,
+ int8_t postShift)
{
/* Assign filter stages */
S->numStages = numStages;
@@ -95,5 +92,5 @@ void arm_biquad_cascade_df1_init_q15(
}
/**
- * @} end of BiquadCascadeDF1 group
+ @} end of BiquadCascadeDF1 group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c
index 5c60e4a..8637889 100644
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c
@@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_init_q31.c
* Description: Q31 Biquad cascade DirectFormI(DF1) filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,53 +29,50 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup BiquadCascadeDF1
- * @{
+ @addtogroup BiquadCascadeDF1
+ @{
*/
/**
- * @details
- *
- * @param[in,out] *S points to an instance of the Q31 Biquad cascade structure.
- * @param[in] numStages number of 2nd order stages in the filter.
- * @param[in] *pCoeffs points to the filter coefficients buffer.
- * @param[in] *pState points to the state buffer.
- * @param[in] postShift Shift to be applied after the accumulator. Varies according to the coefficients format
- * @return none
- *
- * Coefficient and State Ordering:
- *
- * \par
- * The coefficients are stored in the array pCoeffs in the following order:
- *
- * where b1x and a1x are the coefficients for the first stage,
- * b2x and a2x are the coefficients for the second stage,
- * and so on. The pCoeffs array contains a total of 5*numStages values.
- *
- * \par
- * The pState points to state variables array.
- * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2].
- * The state variables are arranged in the pState array as:
- *
- * {x[n-1], x[n-2], y[n-1], y[n-2]}
- *
- * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
- * The state array has a total length of 4*numStages values.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ @brief Initialization function for the Q31 Biquad cascade filter.
+ @param[in,out] S points to an instance of the Q31 Biquad cascade structure.
+ @param[in] numStages number of 2nd order stages in the filter.
+ @param[in] pCoeffs points to the filter coefficients.
+ @param[in] pState points to the state buffer.
+ @param[in] postShift Shift to be applied after the accumulator. Varies according to the coefficients format
+ @return none
+
+ @par Coefficient and State Ordering
+ The coefficients are stored in the array pCoeffs in the following order:
+
+ @par
+ where b1x and a1x are the coefficients for the first stage,
+ b2x and a2x are the coefficients for the second stage,
+ and so on. The pCoeffs array contains a total of 5*numStages values.
+ @par
+ The pState points to state variables array.
+ Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2].
+ The state variables are arranged in the pState array as:
+
+ {x[n-1], x[n-2], y[n-1], y[n-2]}
+
+ The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+ The state array has a total length of 4*numStages values.
+ The state variables are updated after each block of data is processed; the coefficients are untouched.
*/
void arm_biquad_cascade_df1_init_q31(
- arm_biquad_casd_df1_inst_q31 * S,
- uint8_t numStages,
- q31_t * pCoeffs,
- q31_t * pState,
- int8_t postShift)
+ arm_biquad_casd_df1_inst_q31 * S,
+ uint8_t numStages,
+ const q31_t * pCoeffs,
+ q31_t * pState,
+ int8_t postShift)
{
/* Assign filter stages */
S->numStages = numStages;
@@ -94,5 +91,5 @@ void arm_biquad_cascade_df1_init_q31(
}
/**
- * @} end of BiquadCascadeDF1 group
+ @} end of BiquadCascadeDF1 group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
index 382b744..9e23897 100644
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
@@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_q15.c
* Description: Processing function for the Q15 Biquad cascade DirectFormI(DF1) filter
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,79 +29,74 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup BiquadCascadeDF1
- * @{
+ @addtogroup BiquadCascadeDF1
+ @{
*/
/**
- * @brief Processing function for the Q15 Biquad cascade filter.
- * @param[in] *S points to an instance of the Q15 Biquad cascade structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the location where the output result is written.
- * @param[in] blockSize number of samples to process per call.
- * @return none.
- *
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
- * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
- * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
- * The accumulator is then shifted by postShift bits to truncate the result to 1.15 format by discarding the low 16 bits.
- * Finally, the result is saturated to 1.15 format.
- *
- * \par
- * Refer to the function arm_biquad_cascade_df1_fast_q15() for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4.
+ @brief Processing function for the Q15 Biquad cascade filter.
+ @param[in] S points to an instance of the Q15 Biquad cascade structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the location where the output result is written
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ The accumulator is then shifted by postShift bits to truncate the result to 1.15 format by discarding the low 16 bits.
+ Finally, the result is saturated to 1.15 format.
+ @remark
+ Refer to \ref arm_biquad_cascade_df1_fast_q15() for a faster but less precise implementation of this filter.
*/
void arm_biquad_cascade_df1_q15(
const arm_biquad_casd_df1_inst_q15 * S,
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t blockSize)
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
{
#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q15_t *pIn = pSrc; /* Source pointer */
- q15_t *pOut = pDst; /* Destination pointer */
- q31_t in; /* Temporary variable to hold input value */
- q31_t out; /* Temporary variable to hold output value */
- q31_t b0; /* Temporary variable to hold bo value */
- q31_t b1, a1; /* Filter coefficients */
- q31_t state_in, state_out; /* Filter state variables */
- q31_t acc_l, acc_h;
- q63_t acc; /* Accumulator */
- int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */
- q15_t *pState = S->pState; /* State pointer */
- q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- uint32_t sample, stage = (uint32_t) S->numStages; /* Stage loop counter */
- int32_t uShift = (32 - lShift);
+ const q15_t *pIn = pSrc; /* Source pointer */
+ q15_t *pOut = pDst; /* Destination pointer */
+ q31_t in; /* Temporary variable to hold input value */
+ q31_t out; /* Temporary variable to hold output value */
+ q31_t b0; /* Temporary variable to hold bo value */
+ q31_t b1, a1; /* Filter coefficients */
+ q31_t state_in, state_out; /* Filter state variables */
+ q31_t acc_l, acc_h;
+ q63_t acc; /* Accumulator */
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */
+ uint32_t sample, stage = (uint32_t) S->numStages; /* Stage loop counter */
+ int32_t uShift = (32 - lShift);
do
{
/* Read the b0 and 0 coefficients using SIMD */
- b0 = *__SIMD32(pCoeffs)++;
+ b0 = read_q15x2_ia ((q15_t **) &pCoeffs);
/* Read the b1 and b2 coefficients using SIMD */
- b1 = *__SIMD32(pCoeffs)++;
+ b1 = read_q15x2_ia ((q15_t **) &pCoeffs);
/* Read the a1 and a2 coefficients using SIMD */
- a1 = *__SIMD32(pCoeffs)++;
+ a1 = read_q15x2_ia ((q15_t **) &pCoeffs);
/* Read the input state values from the state buffer: x[n-1], x[n-2] */
- state_in = *__SIMD32(pState)++;
+ state_in = read_q15x2_ia (&pState);
/* Read the output state values from the state buffer: y[n-1], y[n-2] */
- state_out = *__SIMD32(pState)--;
+ state_out = read_q15x2_da (&pState);
/* Apply loop unrolling and compute 2 output values simultaneously. */
/* The variable acc hold output values that are being computed:
@@ -117,7 +112,7 @@ void arm_biquad_cascade_df1_q15(
{
/* Read the input */
- in = *__SIMD32(pIn)++;
+ in = read_q15x2_ia ((q15_t **) &pIn);
/* out = b0 * x[n] + 0 * 0 */
out = __SMUAD(b0, in);
@@ -141,23 +136,19 @@ void arm_biquad_cascade_df1_q15(
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
/* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
/* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
#ifndef ARM_MATH_BIG_ENDIAN
-
- state_in = __PKHBT(in, state_in, 16);
+ state_in = __PKHBT(in, state_in, 16);
state_out = __PKHBT(out, state_out, 16);
-
#else
-
- state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
+ state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
state_out = __PKHBT(state_out >> 16, (out), 16);
-
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* out = b0 * x[n] + 0 * 0 */
@@ -180,41 +171,30 @@ void arm_biquad_cascade_df1_q15(
out = __SSAT(out, 16);
/* Store the output in the destination buffer. */
-
#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16);
-
+ write_q15x2_ia (&pOut, __PKHBT(state_out, out, 16));
#else
-
- *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16);
-
+ write_q15x2_ia (&pOut, __PKHBT(out, state_out >> 16, 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
/* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
/* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
#ifndef ARM_MATH_BIG_ENDIAN
-
- state_in = __PKHBT(in >> 16, state_in, 16);
+ state_in = __PKHBT(in >> 16, state_in, 16);
state_out = __PKHBT(out, state_out, 16);
-
#else
-
- state_in = __PKHBT(state_in >> 16, in, 16);
+ state_in = __PKHBT(state_in >> 16, in, 16);
state_out = __PKHBT(state_out >> 16, out, 16);
-
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
sample--;
-
}
/* If the blockSize is not a multiple of 2, compute any remaining output samples here.
@@ -226,15 +206,10 @@ void arm_biquad_cascade_df1_q15(
in = *pIn++;
/* out = b0 * x[n] + 0 * 0 */
-
#ifndef ARM_MATH_BIG_ENDIAN
-
out = __SMUAD(b0, in);
-
#else
-
out = __SMUADX(b0, in);
-
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* acc = b1 * x[n-1] + b2 * x[n-2] + out */
@@ -259,58 +234,49 @@ void arm_biquad_cascade_df1_q15(
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
/* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
/* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
-
#ifndef ARM_MATH_BIG_ENDIAN
-
state_in = __PKHBT(in, state_in, 16);
state_out = __PKHBT(out, state_out, 16);
-
#else
-
state_in = __PKHBT(state_in >> 16, in, 16);
state_out = __PKHBT(state_out >> 16, out, 16);
-
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
}
- /* The first stage goes from the input wire to the output wire. */
- /* Subsequent numStages occur in-place in the output wire */
+ /* The first stage goes from the input wire to the output wire. */
+ /* Subsequent numStages occur in-place in the output wire */
pIn = pDst;
/* Reset the output pointer */
pOut = pDst;
- /* Store the updated state variables back into the state array */
- *__SIMD32(pState)++ = state_in;
- *__SIMD32(pState)++ = state_out;
+ /* Store the updated state variables back into the state array */
+ write_q15x2_ia (&pState, state_in);
+ write_q15x2_ia (&pState, state_out);
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
stage--;
} while (stage > 0U);
#else
- /* Run the below code for Cortex-M0 */
-
- q15_t *pIn = pSrc; /* Source pointer */
- q15_t *pOut = pDst; /* Destination pointer */
- q15_t b0, b1, b2, a1, a2; /* Filter coefficients */
- q15_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */
- q15_t Xn; /* temporary input */
- q63_t acc; /* Accumulator */
- int32_t shift = (15 - (int32_t) S->postShift); /* Post shift */
- q15_t *pState = S->pState; /* State pointer */
- q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- uint32_t sample, stage = (uint32_t) S->numStages; /* Stage loop counter */
+ const q15_t *pIn = pSrc; /* Source pointer */
+ q15_t *pOut = pDst; /* Destination pointer */
+ q15_t b0, b1, b2, a1, a2; /* Filter coefficients */
+ q15_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */
+ q15_t Xn; /* temporary input */
+ q63_t acc; /* Accumulator */
+ int32_t shift = (15 - (int32_t) S->postShift); /* Post shift */
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ uint32_t sample, stage = (uint32_t) S->numStages; /* Stage loop counter */
do
{
@@ -328,7 +294,7 @@ void arm_biquad_cascade_df1_q15(
Yn1 = pState[2];
Yn2 = pState[3];
- /* The variables acc holds the output value that is computed:
+ /* The variables acc holds the output value that is computed:
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
*/
@@ -357,10 +323,10 @@ void arm_biquad_cascade_df1_q15(
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
Yn2 = Yn1;
@@ -392,7 +358,6 @@ void arm_biquad_cascade_df1_q15(
}
-
/**
- * @} end of BiquadCascadeDF1 group
+ @} end of BiquadCascadeDF1 group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
index 4ca3f85..011e21d 100644
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
@@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_q31.c
* Description: Processing function for the Q31 Biquad cascade filter
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,59 +29,54 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup BiquadCascadeDF1
- * @{
+ @addtogroup BiquadCascadeDF1
+ @{
*/
/**
- * @brief Processing function for the Q31 Biquad cascade filter.
- * @param[in] *S points to an instance of the Q31 Biquad cascade structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in] blockSize number of samples to process per call.
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * Thus, if the accumulator result overflows it wraps around rather than clip.
- * In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
- * After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by postShift bits and the result truncated to
- * 1.31 format by discarding the low 32 bits.
- *
- * \par
- * Refer to the function arm_biquad_cascade_df1_fast_q31() for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4.
+ @brief Processing function for the Q31 Biquad cascade filter.
+ @param[in] S points to an instance of the Q31 Biquad cascade structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around rather than clip.
+ In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
+ After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by postShift bits and the result truncated to
+ 1.31 format by discarding the low 32 bits.
+ @remark
+ Refer to \ref arm_biquad_cascade_df1_fast_q31() for a faster but less precise implementation of this filter.
*/
void arm_biquad_cascade_df1_q31(
const arm_biquad_casd_df1_inst_q31 * S,
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
{
- q63_t acc; /* accumulator */
- uint32_t uShift = ((uint32_t) S->postShift + 1U);
- uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
- q31_t *pIn = pSrc; /* input pointer initialization */
- q31_t *pOut = pDst; /* output pointer initialization */
- q31_t *pState = S->pState; /* pState pointer initialization */
- q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
- q31_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */
- q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
- q31_t Xn; /* temporary input */
- uint32_t sample, stage = S->numStages; /* loop counters */
+ const q31_t *pIn = pSrc; /* Source pointer */
+ q31_t *pOut = pDst; /* Destination pointer */
+ q31_t *pState = S->pState; /* pState pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q63_t acc; /* Accumulator */
+ q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
+ q31_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
+ q31_t Xn; /* Temporary input */
+ uint32_t uShift = ((uint32_t) S->postShift + 1U);
+ uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
+ uint32_t sample, stage = S->numStages; /* Loop counters */
-
-#if defined (ARM_MATH_DSP)
-
- q31_t acc_l, acc_h; /* temporary output variables */
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc_l, acc_h; /* temporary output variables */
+#endif
do
{
@@ -92,301 +87,161 @@ void arm_biquad_cascade_df1_q31(
a1 = *pCoeffs++;
a2 = *pCoeffs++;
- /* Reading the state values */
+ /* Reading the pState values */
Xn1 = pState[0];
Xn2 = pState[1];
Yn1 = pState[2];
Yn2 = pState[3];
+#if defined (ARM_MATH_LOOPUNROLL)
+
/* Apply loop unrolling and compute 4 output values simultaneously. */
- /* The variable acc hold output values that are being computed:
+ /* Variable acc hold output values that are being computed:
*
- * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
*/
+ /* Loop unrolling: Compute 4 outputs at a time */
sample = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (sample > 0U)
{
- /* Read the input */
+ /* Read the first input */
Xn = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
- /* acc = b0 * x[n] */
- acc = (q63_t) b0 *Xn;
- /* acc += b1 * x[n-1] */
- acc += (q63_t) b1 *Xn1;
- /* acc += b[2] * x[n-2] */
- acc += (q63_t) b2 *Xn2;
- /* acc += a1 * y[n-1] */
- acc += (q63_t) a1 *Yn1;
- /* acc += a2 * y[n-2] */
- acc += (q63_t) a2 *Yn2;
+ acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);
/* The result is converted to 1.31 , Yn2 variable is reused */
-
- /* Calc lower part of acc */
- acc_l = acc & 0xffffffff;
-
- /* Calc upper part of acc */
- acc_h = (acc >> 32) & 0xffffffff;
+ acc_l = (acc ) & 0xffffffff; /* Calc lower part of acc */
+ acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
/* Apply shift for lower part of acc and upper part of acc */
Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift;
- /* Store the output in the destination buffer. */
+ /* Store output in destination buffer. */
*pOut++ = Yn2;
/* Read the second input */
Xn2 = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
- /* acc = b0 * x[n] */
- acc = (q63_t) b0 *Xn2;
- /* acc += b1 * x[n-1] */
- acc += (q63_t) b1 *Xn;
- /* acc += b[2] * x[n-2] */
- acc += (q63_t) b2 *Xn1;
- /* acc += a1 * y[n-1] */
- acc += (q63_t) a1 *Yn2;
- /* acc += a2 * y[n-2] */
- acc += (q63_t) a2 *Yn1;
-
+ acc = ((q63_t) b0 * Xn2) + ((q63_t) b1 * Xn) + ((q63_t) b2 * Xn1) + ((q63_t) a1 * Yn2) + ((q63_t) a2 * Yn1);
/* The result is converted to 1.31, Yn1 variable is reused */
-
- /* Calc lower part of acc */
- acc_l = acc & 0xffffffff;
-
- /* Calc upper part of acc */
- acc_h = (acc >> 32) & 0xffffffff;
-
+ acc_l = (acc ) & 0xffffffff; /* Calc lower part of acc */
+ acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
/* Apply shift for lower part of acc and upper part of acc */
Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift;
- /* Store the output in the destination buffer. */
+ /* Store output in destination buffer. */
*pOut++ = Yn1;
- /* Read the third input */
+ /* Read the third input */
Xn1 = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
- /* acc = b0 * x[n] */
- acc = (q63_t) b0 *Xn1;
- /* acc += b1 * x[n-1] */
- acc += (q63_t) b1 *Xn2;
- /* acc += b[2] * x[n-2] */
- acc += (q63_t) b2 *Xn;
- /* acc += a1 * y[n-1] */
- acc += (q63_t) a1 *Yn1;
- /* acc += a2 * y[n-2] */
- acc += (q63_t) a2 *Yn2;
+ acc = ((q63_t) b0 * Xn1) + ((q63_t) b1 * Xn2) + ((q63_t) b2 * Xn) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);
/* The result is converted to 1.31, Yn2 variable is reused */
- /* Calc lower part of acc */
- acc_l = acc & 0xffffffff;
-
- /* Calc upper part of acc */
- acc_h = (acc >> 32) & 0xffffffff;
-
+ acc_l = (acc ) & 0xffffffff; /* Calc lower part of acc */
+ acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
/* Apply shift for lower part of acc and upper part of acc */
Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift;
- /* Store the output in the destination buffer. */
+ /* Store output in destination buffer. */
*pOut++ = Yn2;
/* Read the forth input */
Xn = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
- /* acc = b0 * x[n] */
- acc = (q63_t) b0 *Xn;
- /* acc += b1 * x[n-1] */
- acc += (q63_t) b1 *Xn1;
- /* acc += b[2] * x[n-2] */
- acc += (q63_t) b2 *Xn2;
- /* acc += a1 * y[n-1] */
- acc += (q63_t) a1 *Yn2;
- /* acc += a2 * y[n-2] */
- acc += (q63_t) a2 *Yn1;
+ acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn2) + ((q63_t) a2 * Yn1);
/* The result is converted to 1.31, Yn1 variable is reused */
- /* Calc lower part of acc */
- acc_l = acc & 0xffffffff;
-
- /* Calc upper part of acc */
- acc_h = (acc >> 32) & 0xffffffff;
+ acc_l = (acc ) & 0xffffffff; /* Calc lower part of acc */
+ acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
/* Apply shift for lower part of acc and upper part of acc */
Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift;
- /* Every time after the output is computed state should be updated. */
- /* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
- Xn2 = Xn1;
- Xn1 = Xn;
-
- /* Store the output in the destination buffer. */
+ /* Store output in destination buffer. */
*pOut++ = Yn1;
- /* decrement the loop counter */
- sample--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- sample = (blockSize & 0x3U);
-
- while (sample > 0U)
- {
- /* Read the input */
- Xn = *pIn++;
-
- /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
- /* acc = b0 * x[n] */
- acc = (q63_t) b0 *Xn;
- /* acc += b1 * x[n-1] */
- acc += (q63_t) b1 *Xn1;
- /* acc += b[2] * x[n-2] */
- acc += (q63_t) b2 *Xn2;
- /* acc += a1 * y[n-1] */
- acc += (q63_t) a1 *Yn1;
- /* acc += a2 * y[n-2] */
- acc += (q63_t) a2 *Yn2;
-
- /* The result is converted to 1.31 */
- acc = acc >> lShift;
-
/* Every time after the output is computed state should be updated. */
- /* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
- Yn2 = Yn1;
- Yn1 = (q31_t) acc;
- /* Store the output in the destination buffer. */
- *pOut++ = (q31_t) acc;
-
- /* decrement the loop counter */
+ /* decrement loop counter */
sample--;
}
- /* The first stage goes from the input buffer to the output buffer. */
- /* Subsequent stages occur in-place in the output buffer */
- pIn = pDst;
-
- /* Reset to destination pointer */
- pOut = pDst;
-
- /* Store the updated state variables back into the pState array */
- *pState++ = Xn1;
- *pState++ = Xn2;
- *pState++ = Yn1;
- *pState++ = Yn2;
-
- } while (--stage);
+ /* Loop unrolling: Compute remaining outputs */
+ sample = blockSize & 0x3U;
#else
- /* Run the below code for Cortex-M0 */
-
- do
- {
- /* Reading the coefficients */
- b0 = *pCoeffs++;
- b1 = *pCoeffs++;
- b2 = *pCoeffs++;
- a1 = *pCoeffs++;
- a2 = *pCoeffs++;
-
- /* Reading the state values */
- Xn1 = pState[0];
- Xn2 = pState[1];
- Yn1 = pState[2];
- Yn2 = pState[3];
-
- /* The variables acc holds the output value that is computed:
- * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
- */
-
+ /* Initialize blkCnt with number of samples */
sample = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (sample > 0U)
{
/* Read the input */
Xn = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
- /* acc = b0 * x[n] */
- acc = (q63_t) b0 *Xn;
-
- /* acc += b1 * x[n-1] */
- acc += (q63_t) b1 *Xn1;
- /* acc += b[2] * x[n-2] */
- acc += (q63_t) b2 *Xn2;
- /* acc += a1 * y[n-1] */
- acc += (q63_t) a1 *Yn1;
- /* acc += a2 * y[n-2] */
- acc += (q63_t) a2 *Yn2;
+ acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);
/* The result is converted to 1.31 */
acc = acc >> lShift;
+ /* Store output in destination buffer. */
+ *pOut++ = (q31_t) acc;
+
/* Every time after the output is computed state should be updated. */
- /* The states should be updated as: */
- /* Xn2 = Xn1 */
- /* Xn1 = Xn */
- /* Yn2 = Yn1 */
- /* Yn1 = acc */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
Yn2 = Yn1;
Yn1 = (q31_t) acc;
- /* Store the output in the destination buffer. */
- *pOut++ = (q31_t) acc;
-
- /* decrement the loop counter */
+ /* decrement loop counter */
sample--;
}
- /* The first stage goes from the input buffer to the output buffer. */
- /* Subsequent stages occur in-place in the output buffer */
- pIn = pDst;
-
- /* Reset to destination pointer */
- pOut = pDst;
-
- /* Store the updated state variables back into the pState array */
+ /* Store the updated state variables back into the pState array */
*pState++ = Xn1;
*pState++ = Xn2;
*pState++ = Yn1;
*pState++ = Yn2;
- } while (--stage);
+ /* The first stage goes from the input buffer to the output buffer. */
+ /* Subsequent numStages occur in-place in the output buffer */
+ pIn = pDst;
+
+ /* Reset output pointer */
+ pOut = pDst;
+
+ /* decrement loop counter */
+ stage--;
+
+ } while (stage > 0U);
-#endif /* #if defined (ARM_MATH_DSP) */
}
-
-
-
/**
- * @} end of BiquadCascadeDF1 group
- */
+ @} end of BiquadCascadeDF1 group
+ */
diff --git a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c
index c5a81d4..596b434 100644
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c
@@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df2T_f32.c
* Description: Processing function for floating-point transposed direct form II Biquad cascade filter
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,381 +29,141 @@
#include "arm_math.h"
/**
-* @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
-* @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
-*
-* This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
-* The filters are implemented as a cascade of second order Biquad sections.
-* These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
-* Only floating-point data is supported.
-*
-* This function operate on blocks of input and output data and each call to the function
-* processes blockSize samples through the filter.
-* pSrc points to the array of input data and
-* pDst points to the array of output data.
-* Both arrays contain blockSize values.
-*
-* \par Algorithm
-* Each Biquad stage implements a second order filter using the difference equation:
-*
-* where d1 and d2 represent the two state values.
-*
-* \par
-* A Biquad filter using a transposed Direct Form II structure is shown below.
-* \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"
-* Coefficients b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients.
-* Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients.
-* Pay careful attention to the sign of the feedback coefficients.
-* Some design tools flip the sign of the feedback coefficients:
-*
-* In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library.
-*
-* \par
-* Higher order filters are realized as a cascade of second order sections.
-* numStages refers to the number of second order stages used.
-* For example, an 8th order filter would be realized with numStages=4 second order stages.
-* A 9th order filter would be realized with numStages=5 second order stages with the
-* coefficients for one of the stages configured as a first order filter (b2=0 and a2=0).
-*
-* \par
-* pState points to the state variable array.
-* Each Biquad stage has 2 state variables d1 and d2.
-* The state variables are arranged in the pState array as:
-*
-* {d11, d12, d21, d22, ...}
-*
-* where d1x refers to the state variables for the first Biquad and
-* d2x refers to the state variables for the second Biquad.
-* The state array has a total length of 2*numStages values.
-* The state variables are updated after each block of data is processed; the coefficients are untouched.
-*
-* \par
-* The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
-* The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
-* That is why the Direct Form I structure supports Q15 and Q31 data types.
-* The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables d1 and d2.
-* Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
-* The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
-*
-* \par Instance Structure
-* The coefficients and state variables for a filter are stored together in an instance data structure.
-* A separate instance structure must be defined for each filter.
-* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
-*
-* \par Init Functions
-* There is also an associated initialization function.
-* The initialization function performs following operations:
-* - Sets the values of the internal structure fields.
-* - Zeros out the values in the state buffer.
-* To do this manually without calling the init function, assign the follow subfields of the instance structure:
-* numStages, pCoeffs, pState. Also set all of the values in pState to zero.
-*
-* \par
-* Use of the initialization function is optional.
-* However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
-* To place an instance structure into a const data section, the instance structure must be manually initialized.
-* Set the values in the state buffer to zeros before static initialization.
-* For example, to statically initialize the instance structure use
-*
-* where d1 and d2 represent the two state values.
-*
-* \par
-* A Biquad filter using a transposed Direct Form II structure is shown below.
-* \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"
-* Coefficients b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients.
-* Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients.
-* Pay careful attention to the sign of the feedback coefficients.
-* Some design tools flip the sign of the feedback coefficients:
-*
-* In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library.
-*
-* \par
-* Higher order filters are realized as a cascade of second order sections.
-* numStages refers to the number of second order stages used.
-* For example, an 8th order filter would be realized with numStages=4 second order stages.
-* A 9th order filter would be realized with numStages=5 second order stages with the
-* coefficients for one of the stages configured as a first order filter (b2=0 and a2=0).
-*
-* \par
-* pState points to the state variable array.
-* Each Biquad stage has 2 state variables d1 and d2.
-* The state variables are arranged in the pState array as:
-*
-* {d11, d12, d21, d22, ...}
-*
-* where d1x refers to the state variables for the first Biquad and
-* d2x refers to the state variables for the second Biquad.
-* The state array has a total length of 2*numStages values.
-* The state variables are updated after each block of data is processed; the coefficients are untouched.
-*
-* \par
-* The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
-* The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
-* That is why the Direct Form I structure supports Q15 and Q31 data types.
-* The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables d1 and d2.
-* Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
-* The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
-*
-* \par Instance Structure
-* The coefficients and state variables for a filter are stored together in an instance data structure.
-* A separate instance structure must be defined for each filter.
-* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
-*
-* \par Init Functions
-* There is also an associated initialization function.
-* The initialization function performs following operations:
-* - Sets the values of the internal structure fields.
-* - Zeros out the values in the state buffer.
-* To do this manually without calling the init function, assign the follow subfields of the instance structure:
-* numStages, pCoeffs, pState. Also set all of the values in pState to zero.
-*
-* \par
-* Use of the initialization function is optional.
-* However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
-* To place an instance structure into a const data section, the instance structure must be manually initialized.
-* Set the values in the state buffer to zeros before static initialization.
-* For example, to statically initialize the instance structure use
-*
-* where numStages is the number of Biquad stages in the filter; pState is the address of the state buffer.
-* pCoeffs is the address of the coefficient buffer;
-*
+ @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
+
+ This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
+ The filters are implemented as a cascade of second order Biquad sections.
+ These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
+ Only floating-point data is supported.
+
+ This function operate on blocks of input and output data and each call to the function
+ processes blockSize samples through the filter.
+ pSrc points to the array of input data and
+ pDst points to the array of output data.
+ Both arrays contain blockSize values.
+
+ @par Algorithm
+ Each Biquad stage implements a second order filter using the difference equation:
+
+ where d1 and d2 represent the two state values.
+ @par
+ A Biquad filter using a transposed Direct Form II structure is shown below.
+ \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"
+ Coefficients b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients.
+ Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients.
+ Pay careful attention to the sign of the feedback coefficients.
+ Some design tools flip the sign of the feedback coefficients:
+
+ In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library.
+ @par
+ Higher order filters are realized as a cascade of second order sections.
+ numStages refers to the number of second order stages used.
+ For example, an 8th order filter would be realized with numStages=4 second order stages.
+ A 9th order filter would be realized with numStages=5 second order stages with the
+ coefficients for one of the stages configured as a first order filter (b2=0 and a2=0).
+ @par
+ pState points to the state variable array.
+ Each Biquad stage has 2 state variables d1 and d2.
+ The state variables are arranged in the pState array as:
+
+ {d11, d12, d21, d22, ...}
+
+ where d1x refers to the state variables for the first Biquad and
+ d2x refers to the state variables for the second Biquad.
+ The state array has a total length of 2*numStages values.
+ The state variables are updated after each block of data is processed; the coefficients are untouched.
+ @par
+ The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
+ The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
+ That is why the Direct Form I structure supports Q15 and Q31 data types.
+ The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables d1 and d2.
+ Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
+ The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter.
+ Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+
+ @par Init Functions
+ There is also an associated initialization function.
+ The initialization function performs following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ To do this manually without calling the init function, assign the follow subfields of the instance structure:
+ numStages, pCoeffs, pState. Also set all of the values in pState to zero.
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ Set the values in the state buffer to zeros before static initialization.
+ For example, to statically initialize the instance structure use
+
- *
- * \par
- * where b1x and a1x are the coefficients for the first stage,
- * b2x and a2x are the coefficients for the second stage,
- * and so on. The pCoeffs array contains a total of 5*numStages values.
- *
- * \par
- * The pState is a pointer to state array.
- * Each Biquad stage has 2 state variables d1, and d2.
- * The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
- * The state array has a total length of 2*numStages values.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter.
+ @param[in,out] S points to an instance of the filter data structure.
+ @param[in] numStages number of 2nd order stages in the filter.
+ @param[in] pCoeffs points to the filter coefficients.
+ @param[in] pState points to the state buffer.
+ @return none
+
+ @par Coefficient and State Ordering
+ The coefficients are stored in the array pCoeffs in the following order
+ in the not Neon version.
+
+
+ @par
+ where b1x and a1x are the coefficients for the first stage,
+ b2x and a2x are the coefficients for the second stage,
+ and so on. The pCoeffs array contains a total of 5*numStages values.
+
+ For Neon version, this array is bigger. If numstages = 4x + y, then the array has size:
+ 32*x + 5*y
+ and it must be initialized using the function
+ arm_biquad_cascade_df2T_compute_coefs_f32 which is taking the
+ standard array coefficient as parameters.
+
+ But, an array of 8*numstages is a good approximation.
+
+ Then, the initialization can be done with:
+
- *
- * \par
- * where b1x and a1x are the coefficients for the first stage,
- * b2x and a2x are the coefficients for the second stage,
- * and so on. The pCoeffs array contains a total of 5*numStages values.
- *
- * \par
- * The pState is a pointer to state array.
- * Each Biquad stage has 2 state variables d1, and d2.
- * The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
- * The state array has a total length of 2*numStages values.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter.
+ @param[in,out] S points to an instance of the filter data structure
+ @param[in] numStages number of 2nd order stages in the filter
+ @param[in] pCoeffs points to the filter coefficients
+ @param[in] pState points to the state buffer
+ @return none
+
+ @par Coefficient and State Ordering
+ The coefficients are stored in the array pCoeffs in the following order:
+
+ @par
+ where b1x and a1x are the coefficients for the first stage,
+ b2x and a2x are the coefficients for the second stage,
+ and so on. The pCoeffs array contains a total of 5*numStages values.
+ @par
+ The pState is a pointer to state array.
+ Each Biquad stage has 2 state variables d1, and d2.
+ The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
+ The state array has a total length of 2*numStages values.
+ The state variables are updated after each block of data is processed; the coefficients are untouched.
*/
void arm_biquad_cascade_df2T_init_f64(
- arm_biquad_cascade_df2T_instance_f64 * S,
- uint8_t numStages,
- float64_t * pCoeffs,
- float64_t * pState)
+ arm_biquad_cascade_df2T_instance_f64 * S,
+ uint8_t numStages,
+ float64_t * pCoeffs,
+ float64_t * pState)
{
/* Assign filter stages */
S->numStages = numStages;
@@ -85,5 +82,5 @@ void arm_biquad_cascade_df2T_init_f64(
}
/**
- * @} end of BiquadCascadeDF2T group
+ @} end of BiquadCascadeDF2T group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c b/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c
index b7e9359..14ae008 100644
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c
@@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_stereo_df2T_f32.c
* Description: Processing function for floating-point transposed direct form II Biquad cascade filter. 2 channels
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,139 +29,39 @@
#include "arm_math.h"
/**
-* @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
-* @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
-*
-* This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
-* The filters are implemented as a cascade of second order Biquad sections.
-* These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
-* Only floating-point data is supported.
-*
-* This function operate on blocks of input and output data and each call to the function
-* processes blockSize samples through the filter.
-* pSrc points to the array of input data and
-* pDst points to the array of output data.
-* Both arrays contain blockSize values.
-*
-* \par Algorithm
-* Each Biquad stage implements a second order filter using the difference equation:
-*
-* where d1 and d2 represent the two state values.
-*
-* \par
-* A Biquad filter using a transposed Direct Form II structure is shown below.
-* \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"
-* Coefficients b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients.
-* Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients.
-* Pay careful attention to the sign of the feedback coefficients.
-* Some design tools flip the sign of the feedback coefficients:
-*
-* In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library.
-*
-* \par
-* Higher order filters are realized as a cascade of second order sections.
-* numStages refers to the number of second order stages used.
-* For example, an 8th order filter would be realized with numStages=4 second order stages.
-* A 9th order filter would be realized with numStages=5 second order stages with the
-* coefficients for one of the stages configured as a first order filter (b2=0 and a2=0).
-*
-* \par
-* pState points to the state variable array.
-* Each Biquad stage has 2 state variables d1 and d2.
-* The state variables are arranged in the pState array as:
-*
-* {d11, d12, d21, d22, ...}
-*
-* where d1x refers to the state variables for the first Biquad and
-* d2x refers to the state variables for the second Biquad.
-* The state array has a total length of 2*numStages values.
-* The state variables are updated after each block of data is processed; the coefficients are untouched.
-*
-* \par
-* The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
-* The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
-* That is why the Direct Form I structure supports Q15 and Q31 data types.
-* The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables d1 and d2.
-* Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
-* The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
-*
-* \par Instance Structure
-* The coefficients and state variables for a filter are stored together in an instance data structure.
-* A separate instance structure must be defined for each filter.
-* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
-*
-* \par Init Functions
-* There is also an associated initialization function.
-* The initialization function performs following operations:
-* - Sets the values of the internal structure fields.
-* - Zeros out the values in the state buffer.
-* To do this manually without calling the init function, assign the follow subfields of the instance structure:
-* numStages, pCoeffs, pState. Also set all of the values in pState to zero.
-*
-* \par
-* Use of the initialization function is optional.
-* However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
-* To place an instance structure into a const data section, the instance structure must be manually initialized.
-* Set the values in the state buffer to zeros before static initialization.
-* For example, to statically initialize the instance structure use
-*
- *
- * \par
- * where b1x and a1x are the coefficients for the first stage,
- * b2x and a2x are the coefficients for the second stage,
- * and so on. The pCoeffs array contains a total of 5*numStages values.
- *
- * \par
- * The pState is a pointer to state array.
- * Each Biquad stage has 2 state variables d1, and d2 for each channel.
- * The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
- * The state array has a total length of 2*numStages values.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter.
+ @param[in,out] S points to an instance of the filter data structure.
+ @param[in] numStages number of 2nd order stages in the filter.
+ @param[in] pCoeffs points to the filter coefficients.
+ @param[in] pState points to the state buffer.
+ @return none
+
+ @par Coefficient and State Ordering
+ The coefficients are stored in the array pCoeffs in the following order:
+
+ @par
+ where b1x and a1x are the coefficients for the first stage,
+ b2x and a2x are the coefficients for the second stage,
+ and so on. The pCoeffs array contains a total of 5*numStages values.
+ @par
+ The pState is a pointer to state array.
+ Each Biquad stage has 2 state variables d1, and d2 for each channel.
+ The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
+ The state array has a total length of 2*numStages values.
+ The state variables are updated after each block of data is processed; the coefficients are untouched.
*/
void arm_biquad_cascade_stereo_df2T_init_f32(
- arm_biquad_cascade_stereo_df2T_instance_f32 * S,
- uint8_t numStages,
- float32_t * pCoeffs,
- float32_t * pState)
+ arm_biquad_cascade_stereo_df2T_instance_f32 * S,
+ uint8_t numStages,
+ const float32_t * pCoeffs,
+ float32_t * pState)
{
/* Assign filter stages */
S->numStages = numStages;
@@ -85,5 +82,5 @@ void arm_biquad_cascade_stereo_df2T_init_f32(
}
/**
- * @} end of BiquadCascadeDF2T group
+ @} end of BiquadCascadeDF2T group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_f32.c b/DSP/Source/FilteringFunctions/arm_conv_f32.c
index 9ce5bf0..8fa1308 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_f32.c
@@ -3,13 +3,13 @@
* Title: arm_conv_f32.c
* Description: Convolution of floating-point sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,108 +29,97 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @defgroup Conv Convolution
- *
- * Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.
- * Convolution is similar to correlation and is frequently used in filtering and data analysis.
- * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.
- * The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3.
- *
- * \par Algorithm
- * Let a[n] and b[n] be sequences of length srcALen and srcBLen samples respectively.
- * Then the convolution
- *
- *
- * c[n] = a[n] * b[n]
- *
- *
- * \par
- * is defined as
- * \image html ConvolutionEquation.gif
- * \par
- * Note that c[n] is of length srcALen + srcBLen - 1 and is defined over the interval n=0, 1, 2, ..., srcALen + srcBLen - 2.
- * pSrcA points to the first input vector of length srcALen and
- * pSrcB points to the second input vector of length srcBLen.
- * The output result is written to pDst and the calling function must allocate srcALen+srcBLen-1 words for the result.
- *
- * \par
- * Conceptually, when two signals a[n] and b[n] are convolved,
- * the signal b[n] slides over a[n].
- * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
- *
- * \par
- * Note that convolution is a commutative operation:
- *
- *
- * a[n] * b[n] = b[n] * a[n].
- *
- *
- * \par
- * This means that switching the A and B arguments to the convolution functions has no effect.
- *
- * Fixed-Point Behavior
- *
- * \par
- * Convolution requires summing up a large number of intermediate products.
- * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
- * Refer to the function specific documentation below for further details of the particular algorithm used.
- *
- *
- * Fast Versions
- *
- * \par
- * Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of conv and the design requires
- * the input signals should be scaled down to avoid intermediate overflows.
- *
- *
- * Opt Versions
- *
- * \par
- * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
- * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions
+ @defgroup Conv Convolution
+
+ Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.
+ Convolution is similar to correlation and is frequently used in filtering and data analysis.
+ The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.
+ The library also provides fast versions of the Q15 and Q31 functions.
+
+ @par Algorithm
+ Let a[n] and b[n] be sequences of length srcALen and
+ srcBLen samples respectively. Then the convolution
+
+ c[n] = a[n] * b[n]
+
+ @par
+ is defined as
+ \image html ConvolutionEquation.gif
+ @par
+ Note that c[n] is of length srcALen + srcBLen - 1 and is defined over the interval n=0, 1, 2, ..., srcALen + srcBLen - 2.
+ pSrcA points to the first input vector of length srcALen and
+ pSrcB points to the second input vector of length srcBLen.
+ The output result is written to pDst and the calling function must allocate srcALen+srcBLen-1 words for the result.
+ @par
+ Conceptually, when two signals a[n] and b[n] are convolved,
+ the signal b[n] slides over a[n].
+ For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
+ @par
+ Note that convolution is a commutative operation:
+
+ a[n] * b[n] = b[n] * a[n].
+
+ @par
+ This means that switching the A and B arguments to the convolution functions has no effect.
+
+ @par Fixed-Point Behavior
+ Convolution requires summing up a large number of intermediate products.
+ As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
+ Refer to the function specific documentation below for further details of the particular algorithm used.
+
+ @par Fast Versions
+ Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of conv and the design requires
+ the input signals should be scaled down to avoid intermediate overflows.
+
+ @par Opt Versions
+ Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
+ These versions are optimised in cycles and consumes more memory (Scratch memory) compared to Q15 and Q7 versions
*/
/**
- * @addtogroup Conv
- * @{
+ @addtogroup Conv
+ @{
*/
/**
- * @brief Convolution of floating-point sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
- * @return none.
+ @brief Convolution of floating-point sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ @return none
*/
void arm_conv_f32(
- float32_t * pSrcA,
- uint32_t srcALen,
- float32_t * pSrcB,
- uint32_t srcBLen,
- float32_t * pDst)
+ const float32_t * pSrcA,
+ uint32_t srcALen,
+ const float32_t * pSrcB,
+ uint32_t srcBLen,
+ float32_t * pDst)
{
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
-#if defined (ARM_MATH_DSP)
+ const float32_t *pIn1; /* InputA pointer */
+ const float32_t *pIn2; /* InputB pointer */
+ float32_t *pOut = pDst; /* Output pointer */
+ const float32_t *px; /* Intermediate inputA pointer */
+ const float32_t *py; /* Intermediate inputB pointer */
+ const float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ float32_t sum; /* Accumulators */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- float32_t *pIn1; /* inputA pointer */
- float32_t *pIn2; /* inputB pointer */
- float32_t *pOut = pDst; /* output pointer */
- float32_t *px; /* Intermediate inputA pointer */
- float32_t *py; /* Intermediate inputB pointer */
- float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
- float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
- float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
- uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counters */
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+ float32_t acc0, acc1, acc2, acc3; /* Accumulators */
+ float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
+#endif
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -173,7 +162,7 @@ void arm_conv_f32(
blockSize3 = blockSize1;
/* --------------------------
- * initializations of stage1
+ * Initializations of stage1
* -------------------------*/
/* sum = x[0] * y[0]
@@ -196,6 +185,12 @@ void arm_conv_f32(
/* ------------------------
* Stage1 process
* ----------------------*/
+#if defined(ARM_MATH_NEON)
+ float32x4_t vec1;
+ float32x4_t vec2;
+ float32x4_t res = vdupq_n_f32(0) ;
+ float32x2_t accum = vdup_n_f32(0);
+#endif /* #if defined(ARM_MATH_NEON) */
/* The first stage starts here */
while (blockSize1 > 0U)
@@ -203,11 +198,44 @@ void arm_conv_f32(
/* Accumulator is made zero for every iteration */
sum = 0.0f;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+ /* Loop unrolling: Compute 4 outputs at a time */
k = count >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+#if defined(ARM_MATH_NEON)
+ res = vdupq_n_f32(0) ;
+ accum = vdup_n_f32(0);
+
+ /* Compute 4 MACs simultaneously. */
+ k = count >> 2U;
+
+ /* First part of the processing. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */
+
+ while (k > 0U)
+ {
+ vec1 = vld1q_f32(px);
+ vec2 = vld1q_f32(py-3);
+ vec2 = vrev64q_f32(vec2);
+ vec2 = vcombine_f32(vget_high_f32(vec2), vget_low_f32(vec2));
+
+ res = vmlaq_f32(res,vec1, vec2);
+
+ /* Increment pointers */
+ px += 4;
+ py -= 4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
+ sum += accum[0] + accum[1];
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count & 3;
+#else
while (k > 0U)
{
/* x[0] * y[srcBLen - 1] */
@@ -222,20 +250,27 @@ void arm_conv_f32(
/* x[3] * y[srcBLen - 4] */
sum += *px++ * *py--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#endif /* #if defined(ARM_MATH_NEON) */
+
+#else
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
+
while (k > 0U)
{
/* Perform the multiply-accumulate */
sum += *px++ * *py--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -246,10 +281,10 @@ void arm_conv_f32(
py = pIn2 + count;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -258,7 +293,7 @@ void arm_conv_f32(
* ------------------------*/
/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
- * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
* ....
* sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
*/
@@ -282,7 +317,21 @@ void arm_conv_f32(
* srcBLen should be greater than or equal to 4 */
if (srcBLen >= 4U)
{
- /* Loop unroll over blockSize2, by 4 */
+
+#if defined(ARM_MATH_NEON)
+ float32x4_t c;
+ float32x4_t x1v;
+ float32x4_t x2v;
+ uint32x4_t x1v_u;
+ uint32x4_t x2v_u;
+ uint32x4_t x_u;
+ float32x4_t x;
+ float32x4_t res = vdupq_n_f32(0) ;
+#endif /* #if defined(ARM_MATH_NEON) */
+
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize2 >> 2U;
while (blkCnt > 0U)
@@ -293,40 +342,100 @@ void arm_conv_f32(
acc2 = 0.0f;
acc3 = 0.0f;
- /* read x[0], x[1], x[2] samples */
- x0 = *(px++);
- x1 = *(px++);
- x2 = *(px++);
-
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
k = srcBLen >> 2U;
+#if defined(ARM_MATH_NEON)
+ res = vdupq_n_f32(0) ;
+
+ x1v = vld1q_f32(px);
+ x2v = vld1q_f32(px+4);
+
+ do
+ {
+ c = vld1q_f32(py-3);
+
+ px += 4;
+ x = x1v;
+ res = vmlaq_n_f32(res,x,c[3]);
+
+ x = vextq_f32(x1v,x2v,1);
+
+ res = vmlaq_n_f32(res,x,c[2]);
+
+ x = vextq_f32(x1v,x2v,2);
+
+ res = vmlaq_n_f32(res,x,c[1]);
+
+ x = vextq_f32(x1v,x2v,3);
+
+ res = vmlaq_n_f32(res,x,c[0]);
+
+ py -= 4;
+
+ x1v = x2v ;
+ x2v = vld1q_f32(px+4);
+
+ } while (--k);
+
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen & 0x3;
+
+ x1v = vld1q_f32(px);
+ px += 4;
+
+ while (k > 0U)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *(py--);
+
+ res = vmlaq_n_f32(res,x1v,c0);
+
+ /* Reuse the present samples for the next MAC */
+ x1v[0] = x1v[1];
+ x1v[1] = x1v[2];
+ x1v[2] = x1v[3];
+
+ x1v[3] = *(px++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ acc0 = res[0];
+ acc1 = res[1];
+ acc2 = res[2];
+ acc3 = res[3];
+
+#else
+ /* read x[0], x[1], x[2] samples */
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
+
/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */
do
{
/* Read y[srcBLen - 1] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[3] sample */
x3 = *(px);
/* Perform the multiply-accumulate */
/* acc0 += x[0] * y[srcBLen - 1] */
acc0 += x0 * c0;
-
/* acc1 += x[1] * y[srcBLen - 1] */
acc1 += x1 * c0;
-
/* acc2 += x[2] * y[srcBLen - 1] */
acc2 += x2 * c0;
-
/* acc3 += x[3] * y[srcBLen - 1] */
acc3 += x3 * c0;
/* Read y[srcBLen - 2] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[4] sample */
x0 = *(px + 1U);
@@ -341,12 +450,11 @@ void arm_conv_f32(
acc3 += x0 * c0;
/* Read y[srcBLen - 3] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[5] sample */
x1 = *(px + 2U);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* acc0 += x[2] * y[srcBLen - 3] */
acc0 += x2 * c0;
/* acc1 += x[3] * y[srcBLen - 2] */
@@ -357,13 +465,12 @@ void arm_conv_f32(
acc3 += x1 * c0;
/* Read y[srcBLen - 4] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[6] sample */
x2 = *(px + 3U);
px += 4U;
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* acc0 += x[3] * y[srcBLen - 4] */
acc0 += x3 * c0;
/* acc1 += x[4] * y[srcBLen - 4] */
@@ -373,7 +480,6 @@ void arm_conv_f32(
/* acc3 += x[6] * y[srcBLen - 4] */
acc3 += x2 * c0;
-
} while (--k);
/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
@@ -383,12 +489,11 @@ void arm_conv_f32(
while (k > 0U)
{
/* Read y[srcBLen - 5] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[7] sample */
- x3 = *(px++);
+ x3 = *px++;
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* acc0 += x[4] * y[srcBLen - 5] */
acc0 += x0 * c0;
/* acc1 += x[5] * y[srcBLen - 5] */
@@ -406,6 +511,7 @@ void arm_conv_f32(
/* Decrement the loop counter */
k--;
}
+#endif /* #if defined(ARM_MATH_NEON) */
/* Store the result in the accumulator in the destination buffer. */
*pOut++ = acc0;
@@ -420,42 +526,85 @@ void arm_conv_f32(
px = pIn1 + count;
py = pSrc2;
-
/* Decrement the loop counter */
blkCnt--;
}
-
/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize2 % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined (ARM_MATH_NEON)*/
+
while (blkCnt > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.0f;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined(ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL)
+ /* Loop unrolling: Compute 4 outputs at a time */
k = srcBLen >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+#if defined (ARM_MATH_NEON)
+ float32x4_t res = vdupq_n_f32(0) ;
+ float32x4_t x = vdupq_n_f32(0) ;
+ float32x4_t y = vdupq_n_f32(0) ;
+ float32x2_t accum = vdup_n_f32(0) ;
+
+ /* First part of the processing. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
- /* Perform the multiply-accumulates */
- sum += *px++ * *py--;
- sum += *px++ * *py--;
- sum += *px++ * *py--;
- sum += *px++ * *py--;
+ x = vld1q_f32(px);
+ y = vld1q_f32(py-3);
+
+ y = vrev64q_f32(y);
+ y = vcombine_f32(vget_high_f32(y), vget_low_f32(y));
+
+ res = vmlaq_f32(res,x,y);
+
+ px += 4 ;
+ py -= 4 ;
/* Decrement the loop counter */
k--;
}
+ accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
+ sum += accum[0] + accum[1];
+
/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
+ k = srcBLen & 0x3U;
+
+#else
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py--;
+ sum += *px++ * *py--;
+ sum += *px++ * *py--;
+ sum += *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
k = srcBLen % 0x4U;
+#endif /* if defined (ARM_MATH_NEON) */
+#else
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined(ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
/* Perform the multiply-accumulate */
@@ -533,7 +682,7 @@ void arm_conv_f32(
The blockSize3 variable holds the number of MAC operations performed */
/* Working pointer of inputA */
- pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+ pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
px = pSrc1;
/* Working pointer of inputB */
@@ -543,19 +692,45 @@ void arm_conv_f32(
/* -------------------
* Stage3 process
* ------------------*/
-
while (blockSize3 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.0f;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+ /* Loop unrolling: Compute 4 outputs at a time */
k = blockSize3 >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+#if defined(ARM_MATH_NEON)
+ float32x4_t res = vdupq_n_f32(0) ;
+ float32x4_t x = vdupq_n_f32(0) ;
+ float32x4_t y = vdupq_n_f32(0) ;
+ float32x2_t accum = vdup_n_f32(0) ;
+
while (k > 0U)
{
+ x = vld1q_f32(px);
+ y = vld1q_f32(py-3);
+
+ y = vrev64q_f32(y);
+ y = vcombine_f32(vget_high_f32(y), vget_low_f32(y));
+
+ res = vmlaq_f32(res,x,y);
+
+ px += 4 ;
+ py -= 4 ;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
+ sum += accum[0] + accum[1];
+
+#else
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
/* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
sum += *px++ * *py--;
@@ -568,21 +743,27 @@ void arm_conv_f32(
/* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
sum += *px++ * *py--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
+#endif /* #if defined (ARM_MATH_NEON) */
- /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = blockSize3 % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = blockSize3;
+
+#endif /* #if defined (ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL)*/
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* sum += x[srcALen-1] * y[srcBLen-1] */
sum += *px++ * *py--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -598,16 +779,15 @@ void arm_conv_f32(
}
#else
+/* alternate version for CM0_FAMILY */
- /* Run the below code for Cortex-M0 */
-
- float32_t *pIn1 = pSrcA; /* inputA pointer */
- float32_t *pIn2 = pSrcB; /* inputB pointer */
- float32_t sum; /* Accumulator */
- uint32_t i, j; /* loop counters */
+ const float32_t *pIn1 = pSrcA; /* InputA pointer */
+ const float32_t *pIn2 = pSrcB; /* InputB pointer */
+ float32_t sum; /* Accumulator */
+ uint32_t i, j; /* Loop counters */
/* Loop to calculate convolution for output length number of times */
- for (i = 0U; i < ((srcALen + srcBLen) - 1U); i++)
+ for (i = 0U; i < (srcALen + srcBLen - 1U); i++)
{
/* Initialize sum with zero to carry out MAC operations */
sum = 0.0f;
@@ -616,20 +796,21 @@ void arm_conv_f32(
for (j = 0U; j <= i; j++)
{
/* Check the array limitations */
- if ((((i - j) < srcBLen) && (j < srcALen)))
+ if (((i - j) < srcBLen) && (j < srcALen))
{
/* z[i] += x[i-j] * y[j] */
- sum += pIn1[j] * pIn2[i - j];
+ sum += ( pIn1[j] * pIn2[i - j]);
}
}
+
/* Store the output in the destination buffer */
pDst[i] = sum;
}
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
}
/**
- * @} end of Conv group
+ @} end of Conv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_fast_opt_q15.c b/DSP/Source/FilteringFunctions/arm_conv_fast_opt_q15.c
index c6e05b8..ed2aea9 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_fast_opt_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_fast_opt_q15.c
@@ -3,13 +3,13 @@
* Title: arm_conv_fast_opt_q15.c
* Description: Fast Q15 Convolution
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,71 +29,65 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup Conv
- * @{
+ @addtogroup Conv
+ @{
*/
/**
- * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
- * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
- * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
- * @return none.
- *
- * \par Restrictions
- * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
- * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit
- *
- * Scaling and Overflow Behavior:
- *
- * \par
- * This fast version uses a 32-bit accumulator with 2.30 format.
- * The accumulator maintains full precision of the intermediate multiplication results
- * but provides only a single guard bit. There is no saturation on intermediate additions.
- * Thus, if the accumulator overflows it wraps around and distorts the result.
- * The input signals should be scaled down to avoid intermediate overflows.
- * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
- * as maximum of min(srcALen, srcBLen) number of additions are carried internally.
- * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
- *
- * \par
- * See arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
+ @brief Convolution of Q15 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1
+ @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2
+ @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This fast version uses a 32-bit accumulator with 2.30 format.
+ The accumulator maintains full precision of the intermediate multiplication results
+ but provides only a single guard bit. There is no saturation on intermediate additions.
+ Thus, if the accumulator overflows it wraps around and distorts the result.
+ The input signals should be scaled down to avoid intermediate overflows.
+ Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
+ as maximum of min(srcALen, srcBLen) number of additions are carried internally.
+ The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
+
+ @remark
+ Refer to \ref arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
*/
void arm_conv_fast_opt_q15(
- q15_t * pSrcA,
- uint32_t srcALen,
- q15_t * pSrcB,
- uint32_t srcBLen,
- q15_t * pDst,
- q15_t * pScratch1,
- q15_t * pScratch2)
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
{
- q31_t acc0, acc1, acc2, acc3; /* Accumulators */
- q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */
- q31_t y1, y2; /* State variables */
- q15_t *pOut = pDst; /* output pointer */
- q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
- q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q15_t *px; /* Intermediate inputA pointer */
- q15_t *py; /* Intermediate inputB pointer */
- uint32_t j, k, blkCnt; /* loop counter */
- uint32_t tapCnt; /* loop count */
-#ifdef UNALIGNED_SUPPORT_DISABLE
+ q31_t acc0; /* Accumulators */
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ q15_t *pOut = pDst; /* Output pointer */
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
+ const q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ uint32_t j, k, blkCnt; /* Loop counter */
+ uint32_t tapCnt; /* Loop count */
- q15_t a, b;
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc1, acc2, acc3; /* Accumulators */
+ q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */
+ q31_t y1, y2; /* State variables */
+#endif
-#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -126,11 +120,10 @@ void arm_conv_fast_opt_q15(
/* points to smaller length sequence */
px = pIn2;
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- k = srcBLen >> 2U;
+#if defined (ARM_MATH_LOOPUNROLL)
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = srcBLen >> 2U;
/* Copy smaller length input sequence in reverse order into second scratch buffer */
while (k > 0U)
@@ -141,20 +134,26 @@ void arm_conv_fast_opt_q15(
*pScr2-- = *px++;
*pScr2-- = *px++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = srcBLen % 0x4U;
+#else
+
+ /* Initialize k with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
/* copy second buffer in reversal manner for remaining samples */
*pScr2-- = *px++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -170,50 +169,12 @@ void arm_conv_fast_opt_q15(
/* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
/* Copy (srcALen) samples in scratch buffer */
arm_copy_q15(pIn1, pScr1, srcALen);
/* Update pointers */
pScr1 += srcALen;
-#else
-
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- k = srcALen >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr1++ = *pIn1++;
- *pScr1++ = *pIn1++;
- *pScr1++ = *pIn1++;
- *pScr1++ = *pIn1++;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- k = srcALen % 0x4U;
-
- while (k > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr1++ = *pIn1++;
-
- /* Decrement the loop counter */
- k--;
- }
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
-
-#ifndef UNALIGNED_SUPPORT_DISABLE
/* Fill (srcBLen - 1U) zeros at end of scratch buffer */
arm_fill_q15(0, pScr1, (srcBLen - 1U));
@@ -221,40 +182,6 @@ void arm_conv_fast_opt_q15(
/* Update pointer */
pScr1 += (srcBLen - 1U);
-#else
-
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- k = (srcBLen - 1U) >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr1++ = 0;
- *pScr1++ = 0;
- *pScr1++ = 0;
- *pScr1++ = 0;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- k = (srcBLen - 1U) % 0x4U;
-
- while (k > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr1++ = 0;
-
- /* Decrement the loop counter */
- k--;
- }
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
/* Temporary pointer for scratch2 */
py = pScratch2;
@@ -262,10 +189,9 @@ void arm_conv_fast_opt_q15(
/* Initialization of pIn2 pointer */
pIn2 = py;
- /* First part of the processing with loop unrolling process 4 data points at a time.
- ** a second loop below process for the remaining 1 to 3 samples. */
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Actual convolution process starts here */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (srcALen + srcBLen - 1U) >> 2;
while (blkCnt > 0)
@@ -280,21 +206,19 @@ void arm_conv_fast_opt_q15(
acc3 = 0;
/* Read two samples from scratch1 buffer */
- x1 = *__SIMD32(pScr1)++;
+ x1 = read_q15x2_ia (&pScr1);
/* Read next two samples from scratch1 buffer */
- x2 = *__SIMD32(pScr1)++;
+ x2 = read_q15x2_ia (&pScr1);
tapCnt = (srcBLen) >> 2U;
while (tapCnt > 0U)
{
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
/* Read four samples from smaller buffer */
- y1 = _SIMD32_OFFSET(pIn2);
- y2 = _SIMD32_OFFSET(pIn2 + 2U);
+ y1 = read_q15x2_ia ((q15_t **) &pIn2);
+ y2 = read_q15x2_ia ((q15_t **) &pIn2);
/* multiply and accumlate */
acc0 = __SMLAD(x1, y1, acc0);
@@ -311,7 +235,7 @@ void arm_conv_fast_opt_q15(
acc1 = __SMLADX(x3, y1, acc1);
/* Read next two samples from scratch1 buffer */
- x1 = _SIMD32_OFFSET(pScr1);
+ x1 = read_q15x2_ia (&pScr1);
/* multiply and accumlate */
acc0 = __SMLAD(x2, y2, acc0);
@@ -327,7 +251,7 @@ void arm_conv_fast_opt_q15(
acc3 = __SMLADX(x3, y1, acc3);
acc1 = __SMLADX(x3, y2, acc1);
- x2 = _SIMD32_OFFSET(pScr1 + 2U);
+ x2 = read_q15x2_ia (&pScr1);
#ifndef ARM_MATH_BIG_ENDIAN
x3 = __PKHBT(x2, x1, 0);
@@ -337,86 +261,7 @@ void arm_conv_fast_opt_q15(
acc3 = __SMLADX(x3, y2, acc3);
-#else
-
- /* Read four samples from smaller buffer */
- a = *pIn2;
- b = *(pIn2 + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- y1 = __PKHBT(a, b, 16);
-#else
- y1 = __PKHBT(b, a, 16);
-#endif
-
- a = *(pIn2 + 2);
- b = *(pIn2 + 3);
-#ifndef ARM_MATH_BIG_ENDIAN
- y2 = __PKHBT(a, b, 16);
-#else
- y2 = __PKHBT(b, a, 16);
-#endif
-
- acc0 = __SMLAD(x1, y1, acc0);
-
- acc2 = __SMLAD(x2, y1, acc2);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x3 = __PKHBT(x2, x1, 0);
-#else
- x3 = __PKHBT(x1, x2, 0);
-#endif
-
- acc1 = __SMLADX(x3, y1, acc1);
-
- a = *pScr1;
- b = *(pScr1 + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x1 = __PKHBT(a, b, 16);
-#else
- x1 = __PKHBT(b, a, 16);
-#endif
-
- acc0 = __SMLAD(x2, y2, acc0);
-
- acc2 = __SMLAD(x1, y2, acc2);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x3 = __PKHBT(x1, x2, 0);
-#else
- x3 = __PKHBT(x2, x1, 0);
-#endif
-
- acc3 = __SMLADX(x3, y1, acc3);
-
- acc1 = __SMLADX(x3, y2, acc1);
-
- a = *(pScr1 + 2);
- b = *(pScr1 + 3);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x2 = __PKHBT(a, b, 16);
-#else
- x2 = __PKHBT(b, a, 16);
-#endif
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x3 = __PKHBT(x2, x1, 0);
-#else
- x3 = __PKHBT(x1, x2, 0);
-#endif
-
- acc3 = __SMLADX(x3, y2, acc3);
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
- /* update scratch pointers */
- pIn2 += 4U;
- pScr1 += 4U;
-
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -428,7 +273,6 @@ void arm_conv_fast_opt_q15(
while (tapCnt > 0U)
{
-
/* accumlate the results */
acc0 += (*pScr1++ * *pIn2);
acc1 += (*pScr1++ * *pIn2);
@@ -437,46 +281,37 @@ void arm_conv_fast_opt_q15(
pScr1 -= 3U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
blkCnt--;
-
/* Store the results in the accumulators in the destination buffer. */
-
#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
-
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
-
-
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
#else
-
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
-
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
-
-
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Initialization of inputB pointer */
pIn2 = py;
pScratch1 += 4U;
-
}
-
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (srcALen + srcBLen - 1U) & 0x3;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (srcALen + srcBLen - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
/* Calculate convolution for remaining samples of Bigger length sequence */
while (blkCnt > 0)
{
@@ -491,10 +326,11 @@ void arm_conv_fast_opt_q15(
while (tapCnt > 0U)
{
+ /* Read next two samples from scratch1 buffer */
acc0 += (*pScr1++ * *pIn2++);
acc0 += (*pScr1++ * *pIn2++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -507,25 +343,24 @@ void arm_conv_fast_opt_q15(
/* accumlate the results */
acc0 += (*pScr1++ * *pIn2++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
blkCnt--;
/* The result is in 2.30 format. Convert to 1.15 with saturation.
- ** Then store the output in the destination buffer. */
+ Then store the output in the destination buffer. */
*pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
/* Initialization of inputB pointer */
pIn2 = py;
pScratch1 += 1U;
-
}
}
/**
- * @} end of Conv group
+ @} end of Conv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_fast_q15.c b/DSP/Source/FilteringFunctions/arm_conv_fast_q15.c
index 9625ae5..3102a05 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_fast_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_fast_q15.c
@@ -3,13 +3,13 @@
* Title: arm_conv_fast_q15.c
* Description: Fast Q15 Convolution
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,56 +29,54 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup Conv
- * @{
+ @addtogroup Conv
+ @{
*/
/**
- * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
- * @return none.
- *
- * Scaling and Overflow Behavior:
- *
- * \par
- * This fast version uses a 32-bit accumulator with 2.30 format.
- * The accumulator maintains full precision of the intermediate multiplication results
- * but provides only a single guard bit. There is no saturation on intermediate additions.
- * Thus, if the accumulator overflows it wraps around and distorts the result.
- * The input signals should be scaled down to avoid intermediate overflows.
- * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
- * as maximum of min(srcALen, srcBLen) number of additions are carried internally.
- * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
- *
- * \par
- * See arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
+ @brief Convolution of Q15 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This fast version uses a 32-bit accumulator with 2.30 format.
+ The accumulator maintains full precision of the intermediate multiplication results
+ but provides only a single guard bit. There is no saturation on intermediate additions.
+ Thus, if the accumulator overflows it wraps around and distorts the result.
+ The input signals should be scaled down to avoid intermediate overflows.
+ Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
+ as maximum of min(srcALen, srcBLen) number of additions are carried internally.
+ The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
+
+ @remark
+ Refer to \ref arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
*/
void arm_conv_fast_q15(
- q15_t * pSrcA,
- uint32_t srcALen,
- q15_t * pSrcB,
- uint32_t srcBLen,
- q15_t * pDst)
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst)
{
-#ifndef UNALIGNED_SUPPORT_DISABLE
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q15_t *pOut = pDst; /* output pointer */
- q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
- q15_t *px; /* Intermediate inputA pointer */
- q15_t *py; /* Intermediate inputB pointer */
- q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
- q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
- uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ q15_t *pOut = pDst; /* Output pointer */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ const q15_t *px; /* Intermediate inputA pointer */
+ const q15_t *py; /* Intermediate inputB pointer */
+ const q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -175,10 +173,10 @@ void arm_conv_fast_q15(
py = pIn2 + count;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -202,11 +200,11 @@ void arm_conv_fast_q15(
{
/* Perform the multiply-accumulates */
/* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
- sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
/* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
- sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -234,10 +232,10 @@ void arm_conv_fast_q15(
py = pIn2 + (count - 1U);
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -261,7 +259,6 @@ void arm_conv_fast_q15(
/* count is the index by which the pointer pIn1 to be incremented */
count = 0U;
-
/* --------------------
* Stage2 process
* -------------------*/
@@ -284,13 +281,11 @@ void arm_conv_fast_q15(
acc2 = 0;
acc3 = 0;
-
/* read x[0], x[1] samples */
- x0 = *__SIMD32(px);
+ x0 = read_q15x2 ((q15_t *) px);
/* read x[1], x[2] samples */
- x1 = _SIMD32_OFFSET(px+1);
- px+= 2U;
-
+ x1 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
k = srcBLen >> 2U;
@@ -301,7 +296,7 @@ void arm_conv_fast_q15(
{
/* Read the last two inputB samples using SIMD:
* y[srcBLen - 1] and y[srcBLen - 2] */
- c0 = *__SIMD32(py)--;
+ c0 = read_q15x2_da ((q15_t **) &py);
/* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
acc0 = __SMLADX(x0, c0, acc0);
@@ -310,10 +305,10 @@ void arm_conv_fast_q15(
acc1 = __SMLADX(x1, c0, acc1);
/* Read x[2], x[3] */
- x2 = *__SIMD32(px);
+ x2 = read_q15x2 ((q15_t *) px);
/* Read x[3], x[4] */
- x3 = _SIMD32_OFFSET(px+1);
+ x3 = read_q15x2 ((q15_t *) px + 1);
/* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
acc2 = __SMLADX(x2, c0, acc2);
@@ -322,7 +317,7 @@ void arm_conv_fast_q15(
acc3 = __SMLADX(x3, c0, acc3);
/* Read y[srcBLen - 3] and y[srcBLen - 4] */
- c0 = *__SIMD32(py)--;
+ c0 = read_q15x2_da ((q15_t **) &py);
/* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
acc0 = __SMLADX(x2, c0, acc0);
@@ -331,10 +326,10 @@ void arm_conv_fast_q15(
acc1 = __SMLADX(x3, c0, acc1);
/* Read x[4], x[5] */
- x0 = _SIMD32_OFFSET(px+2);
+ x0 = read_q15x2 ((q15_t *) px + 2);
/* Read x[5], x[6] */
- x1 = _SIMD32_OFFSET(px+3);
+ x1 = read_q15x2 ((q15_t *) px + 3);
px += 4U;
/* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
@@ -358,17 +353,13 @@ void arm_conv_fast_q15(
c0 = *(py+1);
#ifdef ARM_MATH_BIG_ENDIAN
-
c0 = c0 << 16U;
-
#else
-
c0 = c0 & 0x0000FFFF;
-
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
/* Read x[7] */
- x3 = *__SIMD32(px);
+ x3 = read_q15x2 ((q15_t *) px);
px++;
/* Perform the multiply-accumulates */
@@ -381,13 +372,13 @@ void arm_conv_fast_q15(
if (k == 2U)
{
/* Read y[srcBLen - 5], y[srcBLen - 6] */
- c0 = _SIMD32_OFFSET(py);
+ c0 = read_q15x2 ((q15_t *) py);
/* Read x[7], x[8] */
- x3 = *__SIMD32(px);
+ x3 = read_q15x2 ((q15_t *) px);
/* Read x[9] */
- x2 = _SIMD32_OFFSET(px+1);
+ x2 = read_q15x2 ((q15_t *) px + 1);
px += 2U;
/* Perform the multiply-accumulates */
@@ -400,13 +391,13 @@ void arm_conv_fast_q15(
if (k == 3U)
{
/* Read y[srcBLen - 5], y[srcBLen - 6] */
- c0 = _SIMD32_OFFSET(py);
+ c0 = read_q15x2 ((q15_t *) py);
/* Read x[7], x[8] */
- x3 = *__SIMD32(px);
+ x3 = read_q15x2 ((q15_t *) px);
/* Read x[9] */
- x2 = _SIMD32_OFFSET(px+1);
+ x2 = read_q15x2 ((q15_t *) px + 1);
/* Perform the multiply-accumulates */
acc0 = __SMLADX(x0, c0, acc0);
@@ -417,15 +408,13 @@ void arm_conv_fast_q15(
/* Read y[srcBLen - 7] */
c0 = *(py-1);
#ifdef ARM_MATH_BIG_ENDIAN
-
c0 = c0 << 16U;
#else
-
c0 = c0 & 0x0000FFFF;
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
/* Read x[10] */
- x3 = _SIMD32_OFFSET(px+2);
+ x3 = read_q15x2 ((q15_t *) px + 2);
px += 3U;
/* Perform the multiply-accumulates */
@@ -435,18 +424,14 @@ void arm_conv_fast_q15(
acc3 = __SMLADX(x3, c0, acc3);
}
- /* Store the results in the accumulators in the destination buffer. */
+ /* Store the result in the accumulator in the destination buffer. */
#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16);
- *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16);
-
+ write_q15x2_ia (&pOut, __PKHBT((acc0 >> 15), (acc1 >> 15), 16));
+ write_q15x2_ia (&pOut, __PKHBT((acc2 >> 15), (acc3 >> 15), 16));
#else
-
- *__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16);
- *__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ write_q15x2_ia (&pOut, __PKHBT((acc1 >> 15), (acc0 >> 15), 16));
+ write_q15x2_ia (&pOut, __PKHBT((acc3 >> 15), (acc2 >> 15), 16));
+#endif /*#ifndef ARM_MATH_BIG_ENDIAN*/
/* Increment the pointer pIn1 index, count by 4 */
count += 4U;
@@ -455,7 +440,7 @@ void arm_conv_fast_q15(
px = pIn1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -476,12 +461,12 @@ void arm_conv_fast_q15(
while (k > 0U)
{
/* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -492,9 +477,9 @@ void arm_conv_fast_q15(
while (k > 0U)
{
/* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -508,7 +493,7 @@ void arm_conv_fast_q15(
px = pIn1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
@@ -529,28 +514,27 @@ void arm_conv_fast_q15(
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
*pOut++ = (q15_t) (sum >> 15);
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
-
/* --------------------------
* Initializations of stage3
* -------------------------*/
@@ -599,12 +583,12 @@ void arm_conv_fast_q15(
{
/* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
* with y[srcBLen - 1], y[srcBLen - 2] respectively */
- sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
/* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
* with y[srcBLen - 3], y[srcBLen - 4] respectively */
- sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -621,7 +605,7 @@ void arm_conv_fast_q15(
/* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
sum = __SMLAD(*px++, *py--, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -632,7 +616,7 @@ void arm_conv_fast_q15(
px = ++pSrc1;
py = pIn2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize3--;
j--;
@@ -657,7 +641,7 @@ void arm_conv_fast_q15(
/* sum += x[srcALen-1] * y[srcBLen-1] */
sum = __SMLAD(*px++, *py--, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -672,727 +656,8 @@ void arm_conv_fast_q15(
blockSize3--;
}
-#else
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q15_t *pOut = pDst; /* output pointer */
- q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
- q15_t *px; /* Intermediate inputA pointer */
- q15_t *py; /* Intermediate inputB pointer */
- q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
- q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
- uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */
- q15_t a, b;
-
- /* The algorithm implementation is based on the lengths of the inputs. */
- /* srcB is always made to slide across srcA. */
- /* So srcBLen is always considered as shorter or equal to srcALen */
- if (srcALen >= srcBLen)
- {
- /* Initialization of inputA pointer */
- pIn1 = pSrcA;
-
- /* Initialization of inputB pointer */
- pIn2 = pSrcB;
- }
- else
- {
- /* Initialization of inputA pointer */
- pIn1 = pSrcB;
-
- /* Initialization of inputB pointer */
- pIn2 = pSrcA;
-
- /* srcBLen is always considered as shorter or equal to srcALen */
- j = srcBLen;
- srcBLen = srcALen;
- srcALen = j;
- }
-
- /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
- /* The function is internally
- * divided into three stages according to the number of multiplications that has to be
- * taken place between inputA samples and inputB samples. In the first stage of the
- * algorithm, the multiplications increase by one for every iteration.
- * In the second stage of the algorithm, srcBLen number of multiplications are done.
- * In the third stage of the algorithm, the multiplications decrease by one
- * for every iteration. */
-
- /* The algorithm is implemented in three stages.
- The loop counters of each stage is initiated here. */
- blockSize1 = srcBLen - 1U;
- blockSize2 = srcALen - (srcBLen - 1U);
- blockSize3 = blockSize1;
-
- /* --------------------------
- * Initializations of stage1
- * -------------------------*/
-
- /* sum = x[0] * y[0]
- * sum = x[0] * y[1] + x[1] * y[0]
- * ....
- * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
- */
-
- /* In this stage the MAC operations are increased by 1 for every iteration.
- The count variable holds the number of MAC operations performed */
- count = 1U;
-
- /* Working pointer of inputA */
- px = pIn1;
-
- /* Working pointer of inputB */
- py = pIn2;
-
-
- /* ------------------------
- * Stage1 process
- * ----------------------*/
-
- /* For loop unrolling by 4, this stage is divided into two. */
- /* First part of this stage computes the MAC operations less than 4 */
- /* Second part of this stage computes the MAC operations greater than or equal to 4 */
-
- /* The first part of the stage starts here */
- while ((count < 4U) && (blockSize1 > 0U))
- {
- /* Accumulator is made zero for every iteration */
- sum = 0;
-
- /* Loop over number of MAC operations between
- * inputA samples and inputB samples */
- k = count;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = (q15_t) (sum >> 15);
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- py = pIn2 + count;
- px = pIn1;
-
- /* Increment the MAC count */
- count++;
-
- /* Decrement the loop counter */
- blockSize1--;
- }
-
- /* The second part of the stage starts here */
- /* The internal loop, over count, is unrolled by 4 */
- /* To, read the last two inputB samples using SIMD:
- * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
- py = py - 1;
-
- while (blockSize1 > 0U)
- {
- /* Accumulator is made zero for every iteration */
- sum = 0;
-
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = count >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
- py++;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
- k = count % 0x4U;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = (q15_t) (sum >> 15);
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- py = pIn2 + (count - 1U);
- px = pIn1;
-
- /* Increment the MAC count */
- count++;
-
- /* Decrement the loop counter */
- blockSize1--;
- }
-
- /* --------------------------
- * Initializations of stage2
- * ------------------------*/
-
- /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
- * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
- * ....
- * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
- */
-
- /* Working pointer of inputA */
- px = pIn1;
-
- /* Working pointer of inputB */
- pSrc2 = pIn2 + (srcBLen - 1U);
- py = pSrc2;
-
- /* count is the index by which the pointer pIn1 to be incremented */
- count = 0U;
-
-
- /* --------------------
- * Stage2 process
- * -------------------*/
-
- /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
- * So, to loop unroll over blockSize2,
- * srcBLen should be greater than or equal to 4 */
- if (srcBLen >= 4U)
- {
- /* Loop unroll over blockSize2, by 4 */
- blkCnt = blockSize2 >> 2U;
-
- while (blkCnt > 0U)
- {
- py = py - 1U;
-
- /* Set all accumulators to zero */
- acc0 = 0;
- acc1 = 0;
- acc2 = 0;
- acc3 = 0;
-
- /* read x[0], x[1] samples */
- a = *px++;
- b = *px++;
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x0 = __PKHBT(a, b, 16);
- a = *px;
- x1 = __PKHBT(b, a, 16);
-
-#else
-
- x0 = __PKHBT(b, a, 16);
- a = *px;
- x1 = __PKHBT(a, b, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = srcBLen >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
- do
- {
- /* Read the last two inputB samples using SIMD:
- * y[srcBLen - 1] and y[srcBLen - 2] */
- a = *py;
- b = *(py+1);
- py -= 2;
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- c0 = __PKHBT(a, b, 16);
-
-#else
-
- c0 = __PKHBT(b, a, 16);;
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
- acc0 = __SMLADX(x0, c0, acc0);
-
- /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
- acc1 = __SMLADX(x1, c0, acc1);
-
- a = *px;
- b = *(px + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x2 = __PKHBT(a, b, 16);
- a = *(px + 2);
- x3 = __PKHBT(b, a, 16);
-
-#else
-
- x2 = __PKHBT(b, a, 16);
- a = *(px + 2);
- x3 = __PKHBT(a, b, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
- acc2 = __SMLADX(x2, c0, acc2);
-
- /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
- acc3 = __SMLADX(x3, c0, acc3);
-
- /* Read y[srcBLen - 3] and y[srcBLen - 4] */
- a = *py;
- b = *(py+1);
- py -= 2;
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- c0 = __PKHBT(a, b, 16);
-
-#else
-
- c0 = __PKHBT(b, a, 16);;
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
- acc0 = __SMLADX(x2, c0, acc0);
-
- /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
- acc1 = __SMLADX(x3, c0, acc1);
-
- /* Read x[4], x[5], x[6] */
- a = *(px + 2);
- b = *(px + 3);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x0 = __PKHBT(a, b, 16);
- a = *(px + 4);
- x1 = __PKHBT(b, a, 16);
-
-#else
-
- x0 = __PKHBT(b, a, 16);
- a = *(px + 4);
- x1 = __PKHBT(a, b, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- px += 4U;
-
- /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
- acc2 = __SMLADX(x0, c0, acc2);
-
- /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
- acc3 = __SMLADX(x1, c0, acc3);
-
- } while (--k);
-
- /* For the next MAC operations, SIMD is not used
- * So, the 16 bit pointer if inputB, py is updated */
-
- /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
- k = srcBLen % 0x4U;
-
- if (k == 1U)
- {
- /* Read y[srcBLen - 5] */
- c0 = *(py+1);
-
-#ifdef ARM_MATH_BIG_ENDIAN
-
- c0 = c0 << 16U;
-
-#else
-
- c0 = c0 & 0x0000FFFF;
-
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
-
- /* Read x[7] */
- a = *px;
- b = *(px+1);
- px++;
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x3 = __PKHBT(a, b, 16);
-
-#else
-
- x3 = __PKHBT(b, a, 16);;
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-
- /* Perform the multiply-accumulates */
- acc0 = __SMLAD(x0, c0, acc0);
- acc1 = __SMLAD(x1, c0, acc1);
- acc2 = __SMLADX(x1, c0, acc2);
- acc3 = __SMLADX(x3, c0, acc3);
- }
-
- if (k == 2U)
- {
- /* Read y[srcBLen - 5], y[srcBLen - 6] */
- a = *py;
- b = *(py+1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- c0 = __PKHBT(a, b, 16);
-
-#else
-
- c0 = __PKHBT(b, a, 16);;
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Read x[7], x[8], x[9] */
- a = *px;
- b = *(px + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x3 = __PKHBT(a, b, 16);
- a = *(px + 2);
- x2 = __PKHBT(b, a, 16);
-
-#else
-
- x3 = __PKHBT(b, a, 16);
- a = *(px + 2);
- x2 = __PKHBT(a, b, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
- px += 2U;
-
- /* Perform the multiply-accumulates */
- acc0 = __SMLADX(x0, c0, acc0);
- acc1 = __SMLADX(x1, c0, acc1);
- acc2 = __SMLADX(x3, c0, acc2);
- acc3 = __SMLADX(x2, c0, acc3);
- }
-
- if (k == 3U)
- {
- /* Read y[srcBLen - 5], y[srcBLen - 6] */
- a = *py;
- b = *(py+1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- c0 = __PKHBT(a, b, 16);
-
-#else
-
- c0 = __PKHBT(b, a, 16);;
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Read x[7], x[8], x[9] */
- a = *px;
- b = *(px + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x3 = __PKHBT(a, b, 16);
- a = *(px + 2);
- x2 = __PKHBT(b, a, 16);
-
-#else
-
- x3 = __PKHBT(b, a, 16);
- a = *(px + 2);
- x2 = __PKHBT(a, b, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Perform the multiply-accumulates */
- acc0 = __SMLADX(x0, c0, acc0);
- acc1 = __SMLADX(x1, c0, acc1);
- acc2 = __SMLADX(x3, c0, acc2);
- acc3 = __SMLADX(x2, c0, acc3);
-
- /* Read y[srcBLen - 7] */
- c0 = *(py-1);
-#ifdef ARM_MATH_BIG_ENDIAN
-
- c0 = c0 << 16U;
-#else
-
- c0 = c0 & 0x0000FFFF;
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
-
- /* Read x[10] */
- a = *(px+2);
- b = *(px+3);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x3 = __PKHBT(a, b, 16);
-
-#else
-
- x3 = __PKHBT(b, a, 16);;
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- px += 3U;
-
- /* Perform the multiply-accumulates */
- acc0 = __SMLADX(x1, c0, acc0);
- acc1 = __SMLAD(x2, c0, acc1);
- acc2 = __SMLADX(x2, c0, acc2);
- acc3 = __SMLADX(x3, c0, acc3);
- }
-
- /* Store the results in the accumulators in the destination buffer. */
- *pOut++ = (q15_t)(acc0 >> 15);
- *pOut++ = (q15_t)(acc1 >> 15);
- *pOut++ = (q15_t)(acc2 >> 15);
- *pOut++ = (q15_t)(acc3 >> 15);
-
- /* Increment the pointer pIn1 index, count by 4 */
- count += 4U;
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = pIn1 + count;
- py = pSrc2;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize2 % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* Accumulator is made zero for every iteration */
- sum = 0;
-
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = srcBLen >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
- k = srcBLen % 0x4U;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = (q15_t) (sum >> 15);
-
- /* Increment the pointer pIn1 index, count by 1 */
- count++;
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = pIn1 + count;
- py = pSrc2;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
- }
- else
- {
- /* If the srcBLen is not a multiple of 4,
- * the blockSize2 loop cannot be unrolled by 4 */
- blkCnt = blockSize2;
-
- while (blkCnt > 0U)
- {
- /* Accumulator is made zero for every iteration */
- sum = 0;
-
- /* srcBLen number of MACS should be performed */
- k = srcBLen;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulate */
- sum += ((q31_t) * px++ * *py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = (q15_t) (sum >> 15);
-
- /* Increment the MAC count */
- count++;
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = pIn1 + count;
- py = pSrc2;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
- }
-
-
- /* --------------------------
- * Initializations of stage3
- * -------------------------*/
-
- /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
- * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
- * ....
- * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
- * sum += x[srcALen-1] * y[srcBLen-1]
- */
-
- /* In this stage the MAC operations are decreased by 1 for every iteration.
- The blockSize3 variable holds the number of MAC operations performed */
-
- /* Working pointer of inputA */
- pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
- px = pSrc1;
-
- /* Working pointer of inputB */
- pSrc2 = pIn2 + (srcBLen - 1U);
- pIn2 = pSrc2 - 1U;
- py = pIn2;
-
- /* -------------------
- * Stage3 process
- * ------------------*/
-
- /* For loop unrolling by 4, this stage is divided into two. */
- /* First part of this stage computes the MAC operations greater than 4 */
- /* Second part of this stage computes the MAC operations less than or equal to 4 */
-
- /* The first part of the stage starts here */
- j = blockSize3 >> 2U;
-
- while ((j > 0U) && (blockSize3 > 0U))
- {
- /* Accumulator is made zero for every iteration */
- sum = 0;
-
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = blockSize3 >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
- py++;
-
- while (k > 0U)
- {
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
- k = blockSize3 % 0x4U;
-
- while (k > 0U)
- {
- /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
- sum += ((q31_t) * px++ * *py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = (q15_t) (sum >> 15);
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = ++pSrc1;
- py = pIn2;
-
- /* Decrement the loop counter */
- blockSize3--;
-
- j--;
- }
-
- /* The second part of the stage starts here */
- /* SIMD is not used for the next MAC operations,
- * so pointer py is updated to read only one sample at a time */
- py = py + 1U;
-
- while (blockSize3 > 0U)
- {
- /* Accumulator is made zero for every iteration */
- sum = 0;
-
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = blockSize3;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- /* sum += x[srcALen-1] * y[srcBLen-1] */
- sum += ((q31_t) * px++ * *py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = (q15_t) (sum >> 15);
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = ++pSrc1;
- py = pSrc2;
-
- /* Decrement the loop counter */
- blockSize3--;
- }
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
}
/**
- * @} end of Conv group
+ @} end of Conv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_fast_q31.c b/DSP/Source/FilteringFunctions/arm_conv_fast_q31.c
index ce3e334..e87eddc 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_fast_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_fast_q31.c
@@ -3,13 +3,13 @@
* Title: arm_conv_fast_q31.c
* Description: Fast Q31 Convolution
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,57 +29,54 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup Conv
- * @{
+ @addtogroup Conv
+ @{
*/
/**
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
- * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
- * These intermediate results are accumulated in a 32-bit register in 2.30 format.
- * Finally, the accumulator is saturated and converted to a 1.31 result.
- *
- * \par
- * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
- * In order to avoid overflows completely the input signals must be scaled down.
- * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
- * as maximum of min(srcALen, srcBLen) number of additions are carried internally.
- *
- * \par
- * See arm_conv_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
+ @brief Convolution of Q31 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence.
+ @param[in] srcALen length of the first input sequence.
+ @param[in] pSrcB points to the second input sequence.
+ @param[in] srcBLen length of the second input sequence.
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This function is optimized for speed at the expense of fixed-point precision and overflow protection.
+ The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
+ These intermediate results are accumulated in a 32-bit register in 2.30 format.
+ Finally, the accumulator is saturated and converted to a 1.31 result.
+ @par
+ The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
+ In order to avoid overflows completely the input signals must be scaled down.
+ Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
+ as maximum of min(srcALen, srcBLen) number of additions are carried internally.
+ @remark
+ Refer to \ref arm_conv_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
*/
void arm_conv_fast_q31(
- q31_t * pSrcA,
- uint32_t srcALen,
- q31_t * pSrcB,
- uint32_t srcBLen,
- q31_t * pDst)
+ const q31_t * pSrcA,
+ uint32_t srcALen,
+ const q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst)
{
- q31_t *pIn1; /* inputA pointer */
- q31_t *pIn2; /* inputB pointer */
- q31_t *pOut = pDst; /* output pointer */
- q31_t *px; /* Intermediate inputA pointer */
- q31_t *py; /* Intermediate inputB pointer */
- q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
- q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
- q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
- uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */
+ const q31_t *pIn1; /* InputA pointer */
+ const q31_t *pIn2; /* InputB pointer */
+ q31_t *pOut = pDst; /* Output pointer */
+ const q31_t *px; /* Intermediate inputA pointer */
+ const q31_t *py; /* Intermediate inputB pointer */
+ const q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -161,21 +158,21 @@ void arm_conv_fast_q31(
{
/* x[0] * y[srcBLen - 1] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
/* x[1] * y[srcBLen - 2] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
/* x[2] * y[srcBLen - 3] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
/* x[3] * y[srcBLen - 4] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -187,9 +184,9 @@ void arm_conv_fast_q31(
{
/* Perform the multiply-accumulate */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -200,10 +197,10 @@ void arm_conv_fast_q31(
py = pIn2 + count;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -248,9 +245,9 @@ void arm_conv_fast_q31(
acc3 = 0;
/* read x[0], x[1], x[2] samples */
- x0 = *(px++);
- x1 = *(px++);
- x2 = *(px++);
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
k = srcBLen >> 2U;
@@ -260,29 +257,25 @@ void arm_conv_fast_q31(
do
{
/* Read y[srcBLen - 1] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[3] sample */
- x3 = *(px++);
+ x3 = *px++;
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* acc0 += x[0] * y[srcBLen - 1] */
acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
-
/* acc1 += x[1] * y[srcBLen - 1] */
acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
-
/* acc2 += x[2] * y[srcBLen - 1] */
acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
-
/* acc3 += x[3] * y[srcBLen - 1] */
acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
- /* Read y[srcBLen - 2] sample */
- c0 = *(py--);
+ /* Read y[srcBLen - 2] sample */
+ c0 = *py--;
/* Read x[4] sample */
- x0 = *(px++);
+ x0 = *px++;
/* Perform the multiply-accumulate */
/* acc0 += x[1] * y[srcBLen - 2] */
@@ -294,11 +287,11 @@ void arm_conv_fast_q31(
/* acc3 += x[4] * y[srcBLen - 2] */
acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
- /* Read y[srcBLen - 3] sample */
- c0 = *(py--);
+ /* Read y[srcBLen - 3] sample */
+ c0 = *py--;
/* Read x[5] sample */
- x1 = *(px++);
+ x1 = *px++;
/* Perform the multiply-accumulates */
/* acc0 += x[2] * y[srcBLen - 3] */
@@ -310,11 +303,11 @@ void arm_conv_fast_q31(
/* acc3 += x[5] * y[srcBLen - 3] */
acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
- /* Read y[srcBLen - 4] sample */
- c0 = *(py--);
+ /* Read y[srcBLen - 4] sample */
+ c0 = *py--;
/* Read x[6] sample */
- x2 = *(px++);
+ x2 = *px++;
/* Perform the multiply-accumulates */
/* acc0 += x[3] * y[srcBLen - 4] */
@@ -336,10 +329,9 @@ void arm_conv_fast_q31(
while (k > 0U)
{
/* Read y[srcBLen - 5] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[7] sample */
- x3 = *(px++);
+ x3 = *px++;
/* Perform the multiply-accumulates */
/* acc0 += x[4] * y[srcBLen - 5] */
@@ -356,11 +348,11 @@ void arm_conv_fast_q31(
x1 = x2;
x2 = x3;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* Store the results in the accumulators in the destination buffer. */
+ /* Store the result in the accumulator in the destination buffer. */
*pOut++ = (q31_t) (acc0 << 1);
*pOut++ = (q31_t) (acc1 << 1);
*pOut++ = (q31_t) (acc2 << 1);
@@ -373,7 +365,7 @@ void arm_conv_fast_q31(
px = pIn1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -395,15 +387,15 @@ void arm_conv_fast_q31(
{
/* Perform the multiply-accumulates */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -415,23 +407,23 @@ void arm_conv_fast_q31(
{
/* Perform the multiply-accumulate */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
*pOut++ = sum << 1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
@@ -453,23 +445,23 @@ void arm_conv_fast_q31(
{
/* Perform the multiply-accumulate */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
*pOut++ = sum << 1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
@@ -513,23 +505,24 @@ void arm_conv_fast_q31(
** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
+ /* Perform the multiply-accumulate */
/* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
/* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
/* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
/* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -541,9 +534,9 @@ void arm_conv_fast_q31(
{
/* Perform the multiply-accumulate */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -554,12 +547,12 @@ void arm_conv_fast_q31(
px = ++pSrc1;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize3--;
}
}
/**
- * @} end of Conv group
+ @} end of Conv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_opt_q15.c b/DSP/Source/FilteringFunctions/arm_conv_opt_q15.c
index 1b20399..6ad34cd 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_opt_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_opt_q15.c
@@ -3,13 +3,13 @@
* Title: arm_conv_opt_q15.c
* Description: Convolution of Q15 sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,73 +29,61 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup Conv
- * @{
+ @addtogroup Conv
+ @{
*/
/**
- * @brief Convolution of Q15 sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
- * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
- * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
- * @return none.
- *
- * \par Restrictions
- * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
- * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit
- *
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
- * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
- * This approach provides 33 guard bits and there is no risk of overflow.
- * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
- *
- *
- * \par
- * Refer to arm_conv_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
- *
- *
+ @brief Convolution of Q15 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ Both inputs are in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ This approach provides 33 guard bits and there is no risk of overflow.
+ The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
+ @remark
+ Refer to \ref arm_conv_fast_q15() for a faster but less precise version of this function.
*/
void arm_conv_opt_q15(
- q15_t * pSrcA,
- uint32_t srcALen,
- q15_t * pSrcB,
- uint32_t srcBLen,
- q15_t * pDst,
- q15_t * pScratch1,
- q15_t * pScratch2)
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
{
- q63_t acc0, acc1, acc2, acc3; /* Accumulator */
- q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */
- q31_t y1, y2; /* State variables */
- q15_t *pOut = pDst; /* output pointer */
- q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
- q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q15_t *px; /* Intermediate inputA pointer */
- q15_t *py; /* Intermediate inputB pointer */
- uint32_t j, k, blkCnt; /* loop counter */
- uint32_t tapCnt; /* loop count */
-#ifdef UNALIGNED_SUPPORT_DISABLE
+ q63_t acc0; /* Accumulators */
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ q15_t *pOut = pDst; /* Output pointer */
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
+ const q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ uint32_t j, k, blkCnt; /* Loop counter */
+ uint32_t tapCnt; /* Loop count */
- q15_t a, b;
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc1, acc2, acc3; /* Accumulators */
+ q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */
+ q31_t y1, y2; /* State variables */
+#endif
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -107,7 +95,6 @@ void arm_conv_opt_q15(
/* Initialization of inputB pointer */
pIn2 = pSrcB;
-
}
else
{
@@ -123,17 +110,17 @@ void arm_conv_opt_q15(
srcALen = j;
}
- /* pointer to take end of scratch2 buffer */
+ /* Pointer to take end of scratch2 buffer */
pScr2 = pScratch2 + srcBLen - 1;
/* points to smaller length sequence */
px = pIn2;
- /* Apply loop unrolling and do 4 Copies simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = srcBLen >> 2U;
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
/* Copy smaller length input sequence in reverse order into second scratch buffer */
while (k > 0U)
{
@@ -143,20 +130,26 @@ void arm_conv_opt_q15(
*pScr2-- = *px++;
*pScr2-- = *px++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = srcBLen % 0x4U;
+#else
+
+ /* Initialize k with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
/* copy second buffer in reversal manner for remaining samples */
*pScr2-- = *px++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -164,7 +157,7 @@ void arm_conv_opt_q15(
pScr1 = pScratch1;
/* Assuming scratch1 buffer is aligned by 32-bit */
- /* Fill (srcBLen - 1U) zeros in scratch buffer */
+ /* Fill (srcBLen - 1U) zeros in scratch1 buffer */
arm_fill_q15(0, pScr1, (srcBLen - 1U));
/* Update temporary scratch pointer */
@@ -172,50 +165,12 @@ void arm_conv_opt_q15(
/* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
/* Copy (srcALen) samples in scratch buffer */
arm_copy_q15(pIn1, pScr1, srcALen);
/* Update pointers */
pScr1 += srcALen;
-#else
-
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- k = srcALen >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr1++ = *pIn1++;
- *pScr1++ = *pIn1++;
- *pScr1++ = *pIn1++;
- *pScr1++ = *pIn1++;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- k = srcALen % 0x4U;
-
- while (k > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr1++ = *pIn1++;
-
- /* Decrement the loop counter */
- k--;
- }
-
-#endif
-
-
-#ifndef UNALIGNED_SUPPORT_DISABLE
/* Fill (srcBLen - 1U) zeros at end of scratch buffer */
arm_fill_q15(0, pScr1, (srcBLen - 1U));
@@ -223,40 +178,6 @@ void arm_conv_opt_q15(
/* Update pointer */
pScr1 += (srcBLen - 1U);
-#else
-
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- k = (srcBLen - 1U) >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr1++ = 0;
- *pScr1++ = 0;
- *pScr1++ = 0;
- *pScr1++ = 0;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- k = (srcBLen - 1U) % 0x4U;
-
- while (k > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr1++ = 0;
-
- /* Decrement the loop counter */
- k--;
- }
-
-#endif
-
/* Temporary pointer for scratch2 */
py = pScratch2;
@@ -264,10 +185,9 @@ void arm_conv_opt_q15(
/* Initialization of pIn2 pointer */
pIn2 = py;
- /* First part of the processing with loop unrolling process 4 data points at a time.
- ** a second loop below process for the remaining 1 to 3 samples. */
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Actual convolution process starts here */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (srcALen + srcBLen - 1U) >> 2;
while (blkCnt > 0)
@@ -282,21 +202,19 @@ void arm_conv_opt_q15(
acc3 = 0;
/* Read two samples from scratch1 buffer */
- x1 = *__SIMD32(pScr1)++;
+ x1 = read_q15x2_ia (&pScr1);
/* Read next two samples from scratch1 buffer */
- x2 = *__SIMD32(pScr1)++;
+ x2 = read_q15x2_ia (&pScr1);
tapCnt = (srcBLen) >> 2U;
while (tapCnt > 0U)
{
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
/* Read four samples from smaller buffer */
- y1 = _SIMD32_OFFSET(pIn2);
- y2 = _SIMD32_OFFSET(pIn2 + 2U);
+ y1 = read_q15x2_ia ((q15_t **) &pIn2);
+ y2 = read_q15x2_ia ((q15_t **) &pIn2);
/* multiply and accumlate */
acc0 = __SMLALD(x1, y1, acc0);
@@ -313,7 +231,7 @@ void arm_conv_opt_q15(
acc1 = __SMLALDX(x3, y1, acc1);
/* Read next two samples from scratch1 buffer */
- x1 = _SIMD32_OFFSET(pScr1);
+ x1 = read_q15x2_ia (&pScr1);
/* multiply and accumlate */
acc0 = __SMLALD(x2, y2, acc0);
@@ -329,7 +247,7 @@ void arm_conv_opt_q15(
acc3 = __SMLALDX(x3, y1, acc3);
acc1 = __SMLALDX(x3, y2, acc1);
- x2 = _SIMD32_OFFSET(pScr1 + 2U);
+ x2 = read_q15x2_ia (&pScr1);
#ifndef ARM_MATH_BIG_ENDIAN
x3 = __PKHBT(x2, x1, 0);
@@ -339,85 +257,7 @@ void arm_conv_opt_q15(
acc3 = __SMLALDX(x3, y2, acc3);
-#else
-
- /* Read four samples from smaller buffer */
- a = *pIn2;
- b = *(pIn2 + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- y1 = __PKHBT(a, b, 16);
-#else
- y1 = __PKHBT(b, a, 16);
-#endif
-
- a = *(pIn2 + 2);
- b = *(pIn2 + 3);
-#ifndef ARM_MATH_BIG_ENDIAN
- y2 = __PKHBT(a, b, 16);
-#else
- y2 = __PKHBT(b, a, 16);
-#endif
-
- acc0 = __SMLALD(x1, y1, acc0);
-
- acc2 = __SMLALD(x2, y1, acc2);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x3 = __PKHBT(x2, x1, 0);
-#else
- x3 = __PKHBT(x1, x2, 0);
-#endif
-
- acc1 = __SMLALDX(x3, y1, acc1);
-
- a = *pScr1;
- b = *(pScr1 + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x1 = __PKHBT(a, b, 16);
-#else
- x1 = __PKHBT(b, a, 16);
-#endif
-
- acc0 = __SMLALD(x2, y2, acc0);
-
- acc2 = __SMLALD(x1, y2, acc2);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x3 = __PKHBT(x1, x2, 0);
-#else
- x3 = __PKHBT(x2, x1, 0);
-#endif
-
- acc3 = __SMLALDX(x3, y1, acc3);
-
- acc1 = __SMLALDX(x3, y2, acc1);
-
- a = *(pScr1 + 2);
- b = *(pScr1 + 3);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x2 = __PKHBT(a, b, 16);
-#else
- x2 = __PKHBT(b, a, 16);
-#endif
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x3 = __PKHBT(x2, x1, 0);
-#else
- x3 = __PKHBT(x1, x2, 0);
-#endif
-
- acc3 = __SMLALDX(x3, y2, acc3);
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
- pIn2 += 4U;
- pScr1 += 4U;
-
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -429,7 +269,6 @@ void arm_conv_opt_q15(
while (tapCnt > 0U)
{
-
/* accumlate the results */
acc0 += (*pScr1++ * *pIn2);
acc1 += (*pScr1++ * *pIn2);
@@ -438,44 +277,37 @@ void arm_conv_opt_q15(
pScr1 -= 3U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
blkCnt--;
-
/* Store the results in the accumulators in the destination buffer. */
-
#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
-
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
-
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
#else
-
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
-
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
-
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Initialization of inputB pointer */
pIn2 = py;
pScratch1 += 4U;
-
}
-
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (srcALen + srcBLen - 1U) & 0x3;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (srcALen + srcBLen - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
/* Calculate convolution for remaining samples of Bigger length sequence */
while (blkCnt > 0)
{
@@ -494,7 +326,7 @@ void arm_conv_opt_q15(
acc0 += (*pScr1++ * *pIn2++);
acc0 += (*pScr1++ * *pIn2++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -507,27 +339,24 @@ void arm_conv_opt_q15(
/* accumlate the results */
acc0 += (*pScr1++ * *pIn2++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
blkCnt--;
/* The result is in 2.30 format. Convert to 1.15 with saturation.
- ** Then store the output in the destination buffer. */
+ Then store the output in the destination buffer. */
*pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
-
/* Initialization of inputB pointer */
pIn2 = py;
pScratch1 += 1U;
-
}
}
-
/**
- * @} end of Conv group
+ @} end of Conv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_opt_q7.c b/DSP/Source/FilteringFunctions/arm_conv_opt_q7.c
index 24d378b..fb9e2ec 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_opt_q7.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_opt_q7.c
@@ -3,13 +3,13 @@
* Title: arm_conv_opt_q7.c
* Description: Convolution of Q7 sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,61 +29,53 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup Conv
- * @{
+ @addtogroup Conv
+ @{
*/
/**
- * @brief Convolution of Q7 sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
- * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
- * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
- * @return none.
- *
- * \par Restrictions
- * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
- * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using a 32-bit internal accumulator.
- * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
- * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
- * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072.
- * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
- *
+ @brief Convolution of Q7 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
+ The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072.
+ The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
*/
void arm_conv_opt_q7(
- q7_t * pSrcA,
- uint32_t srcALen,
- q7_t * pSrcB,
- uint32_t srcBLen,
- q7_t * pDst,
- q15_t * pScratch1,
- q15_t * pScratch2)
+ const q7_t * pSrcA,
+ uint32_t srcALen,
+ const q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
{
-
- q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */
- q15_t x4; /* Temporary input variable */
- q7_t *pIn1, *pIn2; /* inputA and inputB pointer */
- uint32_t j, k, blkCnt, tapCnt; /* loop counter */
- q7_t *px; /* Temporary input1 pointer */
- q15_t *py; /* Temporary input2 pointer */
- q31_t acc0, acc1, acc2, acc3; /* Accumulator */
- q31_t x1, x2, x3, y1; /* Temporary input variables */
- q7_t *pOut = pDst; /* output pointer */
- q7_t out0, out1, out2, out3; /* temporary variables */
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch */
+ q15_t x4; /* Temporary input variable */
+ q15_t *py; /* Temporary input2 pointer */
+ q31_t acc0, acc1, acc2, acc3; /* Accumulators */
+ const q7_t *pIn1, *pIn2; /* InputA and inputB pointer */
+ uint32_t j, k, blkCnt, tapCnt; /* Loop counter */
+ q31_t x1, x2, x3, y1; /* Temporary input variables */
+ const q7_t *px; /* Temporary input1 pointer */
+ q7_t *pOut = pDst; /* Output pointer */
+ q7_t out0, out1, out2, out3; /* Temporary variables */
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -110,9 +102,6 @@ void arm_conv_opt_q7(
srcALen = j;
}
- /* pointer to take end of scratch2 buffer */
- pScr2 = pScratch2;
-
/* points to smaller length sequence */
px = pIn2 + srcBLen - 1;
@@ -124,16 +113,16 @@ void arm_conv_opt_q7(
while (k > 0U)
{
/* copy second buffer in reversal manner */
- x4 = (q15_t) * px--;
+ x4 = (q15_t) *px--;
*pScr2++ = x4;
- x4 = (q15_t) * px--;
+ x4 = (q15_t) *px--;
*pScr2++ = x4;
- x4 = (q15_t) * px--;
+ x4 = (q15_t) *px--;
*pScr2++ = x4;
- x4 = (q15_t) * px--;
+ x4 = (q15_t) *px--;
*pScr2++ = x4;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -144,16 +133,13 @@ void arm_conv_opt_q7(
while (k > 0U)
{
/* copy second buffer in reversal manner for remaining samples */
- x4 = (q15_t) * px--;
+ x4 = (q15_t) *px--;
*pScr2++ = x4;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* Initialze temporary scratch pointer */
- pScr1 = pScratch1;
-
/* Fill (srcBLen - 1U) zeros in scratch buffer */
arm_fill_q15(0, pScr1, (srcBLen - 1U));
@@ -169,16 +155,16 @@ void arm_conv_opt_q7(
while (k > 0U)
{
/* copy second buffer in reversal manner */
- x4 = (q15_t) * pIn1++;
+ x4 = (q15_t) *pIn1++;
*pScr1++ = x4;
- x4 = (q15_t) * pIn1++;
+ x4 = (q15_t) *pIn1++;
*pScr1++ = x4;
- x4 = (q15_t) * pIn1++;
+ x4 = (q15_t) *pIn1++;
*pScr1++ = x4;
- x4 = (q15_t) * pIn1++;
+ x4 = (q15_t) *pIn1++;
*pScr1++ = x4;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -196,48 +182,12 @@ void arm_conv_opt_q7(
k--;
}
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
/* Fill (srcBLen - 1U) zeros at end of scratch buffer */
arm_fill_q15(0, pScr1, (srcBLen - 1U));
/* Update pointer */
pScr1 += (srcBLen - 1U);
-#else
-
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- k = (srcBLen - 1U) >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr1++ = 0;
- *pScr1++ = 0;
- *pScr1++ = 0;
- *pScr1++ = 0;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- k = (srcBLen - 1U) % 0x4U;
-
- while (k > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr1++ = 0;
-
- /* Decrement the loop counter */
- k--;
- }
-
-#endif
-
/* Temporary pointer for scratch2 */
py = pScratch2;
@@ -247,7 +197,7 @@ void arm_conv_opt_q7(
pScr2 = py;
/* Actual convolution process starts here */
- blkCnt = (srcALen + srcBLen - 1U) >> 2;
+ blkCnt = (srcALen + srcBLen - 1U) >> 2U;
while (blkCnt > 0)
{
@@ -261,18 +211,17 @@ void arm_conv_opt_q7(
acc3 = 0;
/* Read two samples from scratch1 buffer */
- x1 = *__SIMD32(pScr1)++;
+ x1 = read_q15x2_ia (&pScr1);
/* Read next two samples from scratch1 buffer */
- x2 = *__SIMD32(pScr1)++;
+ x2 = read_q15x2_ia (&pScr1);
tapCnt = (srcBLen) >> 2U;
while (tapCnt > 0U)
{
-
/* Read four samples from smaller buffer */
- y1 = _SIMD32_OFFSET(pScr2);
+ y1 = read_q15x2_ia (&pScr2);
/* multiply and accumlate */
acc0 = __SMLAD(x1, y1, acc0);
@@ -289,7 +238,7 @@ void arm_conv_opt_q7(
acc1 = __SMLADX(x3, y1, acc1);
/* Read next two samples from scratch1 buffer */
- x1 = *__SIMD32(pScr1)++;
+ x1 = read_q15x2_ia (&pScr1);
/* pack input data */
#ifndef ARM_MATH_BIG_ENDIAN
@@ -301,7 +250,7 @@ void arm_conv_opt_q7(
acc3 = __SMLADX(x3, y1, acc3);
/* Read four samples from smaller buffer */
- y1 = _SIMD32_OFFSET(pScr2 + 2U);
+ y1 = read_q15x2_ia (&pScr2);
acc0 = __SMLAD(x2, y1, acc0);
@@ -309,7 +258,7 @@ void arm_conv_opt_q7(
acc1 = __SMLADX(x3, y1, acc1);
- x2 = *__SIMD32(pScr1)++;
+ x2 = read_q15x2_ia (&pScr1);
#ifndef ARM_MATH_BIG_ENDIAN
x3 = __PKHBT(x2, x1, 0);
@@ -319,25 +268,18 @@ void arm_conv_opt_q7(
acc3 = __SMLADX(x3, y1, acc3);
- pScr2 += 4U;
-
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
-
-
/* Update scratch pointer for remaining samples of smaller length sequence */
pScr1 -= 4U;
-
/* apply same above for remaining samples of smaller length sequence */
tapCnt = (srcBLen) & 3U;
while (tapCnt > 0U)
{
-
/* accumlate the results */
acc0 += (*pScr1++ * *pScr2);
acc1 += (*pScr1++ * *pScr2);
@@ -346,7 +288,7 @@ void arm_conv_opt_q7(
pScr1 -= 3U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -358,16 +300,14 @@ void arm_conv_opt_q7(
out2 = (q7_t) (__SSAT(acc2 >> 7U, 8));
out3 = (q7_t) (__SSAT(acc3 >> 7U, 8));
- *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
+ write_q7x4_ia (&pOut, __PACKq7(out0, out1, out2, out3));
/* Initialization of inputB pointer */
pScr2 = py;
pScratch1 += 4U;
-
}
-
blkCnt = (srcALen + srcBLen - 1U) & 0x3;
/* Calculate convolution for remaining samples of Bigger length sequence */
@@ -386,7 +326,7 @@ void arm_conv_opt_q7(
acc0 += (*pScr1++ * *pScr2++);
acc0 += (*pScr1++ * *pScr2++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -395,11 +335,10 @@ void arm_conv_opt_q7(
/* apply same above for remaining samples of smaller length sequence */
while (tapCnt > 0U)
{
-
/* accumlate the results */
acc0 += (*pScr1++ * *pScr2++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -412,12 +351,10 @@ void arm_conv_opt_q7(
pScr2 = py;
pScratch1 += 1U;
-
}
}
-
/**
- * @} end of Conv group
+ @} end of Conv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_partial_f32.c b/DSP/Source/FilteringFunctions/arm_conv_partial_f32.c
index f3b15b4..e25f9ab 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_partial_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_partial_f32.c
@@ -3,13 +3,13 @@
* Title: arm_conv_partial_f32.c
* Description: Partial convolution of floating-point sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,84 +29,82 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @defgroup PartialConv Partial Convolution
- *
- * Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated.
- * Each function has two additional arguments.
- * firstIndex specifies the starting index of the subset of output samples.
- * numPoints is the number of output samples to compute.
- * The function computes the output in the range
- * [firstIndex, ..., firstIndex+numPoints-1].
- * The output array pDst contains numPoints values.
- *
- * The allowable range of output indices is [0 srcALen+srcBLen-2].
- * If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR.
- * Otherwise the functions return ARM_MATH_SUCCESS.
- * \note Refer arm_conv_f32() for details on fixed point behavior.
- *
- *
- * Fast Versions
- *
- * \par
- * Fast versions are supported for Q31 and Q15 of partial convolution. Cycles for Fast versions are less compared to Q31 and Q15 of partial conv and the design requires
- * the input signals should be scaled down to avoid intermediate overflows.
- *
- *
- * Opt Versions
- *
- * \par
- * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
- * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions of partial convolution
+ @defgroup PartialConv Partial Convolution
+
+ Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated.
+ Each function has two additional arguments.
+ firstIndex specifies the starting index of the subset of output samples.
+ numPoints is the number of output samples to compute.
+ The function computes the output in the range
+ [firstIndex, ..., firstIndex+numPoints-1].
+ The output array pDst contains numPoints values.
+
+ The allowable range of output indices is [0 srcALen+srcBLen-2].
+ If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR.
+ Otherwise the functions return ARM_MATH_SUCCESS.
+ \note Refer to \ref arm_conv_f32() for details on fixed point behavior.
+
+ @par Fast Versions
+ Fast versions are supported for Q31 and Q15 of partial convolution.
+ Cycles for Fast versions are less compared to Q31 and Q15 of partial conv and the design requires
+ the input signals should be scaled down to avoid intermediate overflows.
+
+ @par Opt Versions
+ Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
+ These versions are optimised in cycles and consumes more memory (Scratch memory) compared to Q15 and Q7 versions of partial convolution
*/
/**
- * @addtogroup PartialConv
- * @{
+ @addtogroup PartialConv
+ @{
*/
/**
- * @brief Partial convolution of floating-point sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written.
- * @param[in] firstIndex is the first output sample to start with.
- * @param[in] numPoints is the number of output points to be computed.
- * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+ @brief Partial convolution of floating-point sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
*/
arm_status arm_conv_partial_f32(
- float32_t * pSrcA,
- uint32_t srcALen,
- float32_t * pSrcB,
- uint32_t srcBLen,
- float32_t * pDst,
- uint32_t firstIndex,
- uint32_t numPoints)
+ const float32_t * pSrcA,
+ uint32_t srcALen,
+ const float32_t * pSrcB,
+ uint32_t srcBLen,
+ float32_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
{
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- float32_t *pIn1 = pSrcA; /* inputA pointer */
- float32_t *pIn2 = pSrcB; /* inputB pointer */
- float32_t *pOut = pDst; /* output pointer */
- float32_t *px; /* Intermediate inputA pointer */
- float32_t *py; /* Intermediate inputB pointer */
- float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
- float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
- float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
- uint32_t j, k, count = 0U, blkCnt, check;
- int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
- arm_status status; /* status of Partial convolution */
+ const float32_t *pIn1 = pSrcA; /* InputA pointer */
+ const float32_t *pIn2 = pSrcB; /* InputB pointer */
+ float32_t *pOut = pDst; /* Output pointer */
+ const float32_t *px; /* Intermediate inputA pointer */
+ const float32_t *py; /* Intermediate inputB pointer */
+ const float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ float32_t sum; /* Accumulator */
+ uint32_t j, k, count, blkCnt, check;
+ int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
+#if defined (ARM_MATH_LOOPUNROLL)
+ float32_t acc0, acc1, acc2, acc3; /* Accumulator */
+ float32_t x0, x1, x2, x3, c0; /* Temporary variables */
+#endif
/* Check for range of output samples to be calculated */
if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
@@ -116,7 +114,6 @@ arm_status arm_conv_partial_f32(
}
else
{
-
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
/* So srcBLen is always considered as shorter or equal to srcALen */
@@ -148,10 +145,8 @@ arm_status arm_conv_partial_f32(
blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
- blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
- (int32_t) numPoints) : 0;
- blockSize2 = ((int32_t) check - blockSize3) -
- (blockSize1 + (int32_t) firstIndex);
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0;
+ blockSize2 = ((int32_t) check - blockSize3) - (blockSize1 + (int32_t) firstIndex);
blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
@@ -179,7 +174,7 @@ arm_status arm_conv_partial_f32(
/* In this stage the MAC operations are increased by 1 for every iteration.
The count variable holds the number of MAC operations performed.
- Since the partial convolution starts from from firstIndex
+ Since the partial convolution starts from firstIndex
Number of Macs to be performed is firstIndex + 1 */
count = 1U + firstIndex;
@@ -195,16 +190,16 @@ arm_status arm_conv_partial_f32(
* ----------------------*/
/* The first stage starts here */
- while (blockSize1 > 0)
+ while (blockSize1 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.0f;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = count >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* x[0] * y[srcBLen - 1] */
@@ -219,20 +214,26 @@ arm_status arm_conv_partial_f32(
/* x[3] * y[srcBLen - 4] */
sum += *px++ * *py--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
sum += *px++ * *py--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -243,10 +244,10 @@ arm_status arm_conv_partial_f32(
py = ++pSrc1;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -263,12 +264,13 @@ arm_status arm_conv_partial_f32(
/* Working pointer of inputA */
if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
{
- px = pIn1 + firstIndex - srcBLen + 1;
+ pSrc1 = pIn1 + firstIndex - srcBLen + 1;
}
else
{
- px = pIn1;
+ pSrc1 = pIn1;
}
+ px = pSrc1;
/* Working pointer of inputB */
pSrc2 = pIn2 + (srcBLen - 1U);
@@ -286,7 +288,9 @@ arm_status arm_conv_partial_f32(
* srcBLen should be greater than or equal to 4 */
if (srcBLen >= 4U)
{
- /* Loop unroll over blockSize2, by 4 */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = ((uint32_t) blockSize2 >> 2U);
while (blkCnt > 0U)
@@ -298,9 +302,9 @@ arm_status arm_conv_partial_f32(
acc3 = 0.0f;
/* read x[0], x[1], x[2] samples */
- x0 = *(px++);
- x1 = *(px++);
- x2 = *(px++);
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
k = srcBLen >> 2U;
@@ -310,29 +314,24 @@ arm_status arm_conv_partial_f32(
do
{
/* Read y[srcBLen - 1] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[3] sample */
- x3 = *(px++);
+ x3 = *px++;
/* Perform the multiply-accumulate */
/* acc0 += x[0] * y[srcBLen - 1] */
acc0 += x0 * c0;
-
/* acc1 += x[1] * y[srcBLen - 1] */
acc1 += x1 * c0;
-
/* acc2 += x[2] * y[srcBLen - 1] */
acc2 += x2 * c0;
-
/* acc3 += x[3] * y[srcBLen - 1] */
acc3 += x3 * c0;
/* Read y[srcBLen - 2] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[4] sample */
- x0 = *(px++);
+ x0 = *px++;
/* Perform the multiply-accumulate */
/* acc0 += x[1] * y[srcBLen - 2] */
@@ -345,12 +344,11 @@ arm_status arm_conv_partial_f32(
acc3 += x0 * c0;
/* Read y[srcBLen - 3] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[5] sample */
- x1 = *(px++);
+ x1 = *px++;
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* acc0 += x[2] * y[srcBLen - 3] */
acc0 += x2 * c0;
/* acc1 += x[3] * y[srcBLen - 2] */
@@ -361,12 +359,11 @@ arm_status arm_conv_partial_f32(
acc3 += x1 * c0;
/* Read y[srcBLen - 4] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[6] sample */
- x2 = *(px++);
+ x2 = *px++;
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* acc0 += x[3] * y[srcBLen - 4] */
acc0 += x3 * c0;
/* acc1 += x[4] * y[srcBLen - 4] */
@@ -376,7 +373,6 @@ arm_status arm_conv_partial_f32(
/* acc3 += x[6] * y[srcBLen - 4] */
acc3 += x2 * c0;
-
} while (--k);
/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
@@ -386,10 +382,9 @@ arm_status arm_conv_partial_f32(
while (k > 0U)
{
/* Read y[srcBLen - 5] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[7] sample */
- x3 = *(px++);
+ x3 = *px++;
/* Perform the multiply-accumulates */
/* acc0 += x[4] * y[srcBLen - 5] */
@@ -416,38 +411,37 @@ arm_status arm_conv_partial_f32(
*pOut++ = acc2;
*pOut++ = acc3;
- /* Increment the pointer pIn1 index, count by 1 */
+ /* Increment the pointer pIn1 index, count by 4 */
count += 4U;
/* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
+ px = pSrc1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (uint32_t) blockSize2 % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.0f;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = srcBLen >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* Perform the multiply-accumulates */
@@ -456,41 +450,40 @@ arm_status arm_conv_partial_f32(
sum += *px++ * *py--;
sum += *px++ * *py--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = srcBLen % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
/* Perform the multiply-accumulate */
sum += *px++ * *py--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
*pOut++ = sum;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
+ px = pSrc1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
@@ -513,7 +506,7 @@ arm_status arm_conv_partial_f32(
/* Perform the multiply-accumulate */
sum += *px++ * *py--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -524,14 +517,7 @@ arm_status arm_conv_partial_f32(
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
+ px = pSrc1 + count;
py = pSrc2;
/* Decrement the loop counter */
@@ -552,7 +538,7 @@ arm_status arm_conv_partial_f32(
*/
/* In this stage the MAC operations are decreased by 1 for every iteration.
- The count variable holds the number of MAC operations performed */
+ The blockSize3 variable holds the number of MAC operations performed */
count = srcBLen - 1U;
/* Working pointer of inputA */
@@ -563,16 +549,20 @@ arm_status arm_conv_partial_f32(
pSrc2 = pIn2 + (srcBLen - 1U);
py = pSrc2;
- while (blockSize3 > 0)
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while (blockSize3 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.0f;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = count >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
@@ -587,21 +577,27 @@ arm_status arm_conv_partial_f32(
/* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
sum += *px++ * *py--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* sum += x[srcALen-1] * y[srcBLen-1] */
sum += *px++ * *py--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -612,15 +608,14 @@ arm_status arm_conv_partial_f32(
px = ++pSrc1;
py = pSrc2;
- /* Decrement the MAC count */
+ /* Decrement MAC count */
count--;
/* Decrement the loop counter */
blockSize3--;
-
}
- /* set status as ARM_MATH_SUCCESS */
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -628,19 +623,18 @@ arm_status arm_conv_partial_f32(
return (status);
#else
+/* alternate version for CM0_FAMILY */
- /* Run the below code for Cortex-M0 */
-
- float32_t *pIn1 = pSrcA; /* inputA pointer */
- float32_t *pIn2 = pSrcB; /* inputB pointer */
- float32_t sum; /* Accumulator */
- uint32_t i, j; /* loop counters */
- arm_status status; /* status of Partial convolution */
+ const float32_t *pIn1 = pSrcA; /* InputA pointer */
+ const float32_t *pIn2 = pSrcB; /* InputB pointer */
+ float32_t sum; /* Accumulator */
+ uint32_t i, j; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
/* Check for range of output samples to be calculated */
if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
{
- /* Set status as ARM_ARGUMENT_ERROR */
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
status = ARM_MATH_ARGUMENT_ERROR;
}
else
@@ -654,25 +648,29 @@ arm_status arm_conv_partial_f32(
/* Loop to perform MAC operations according to convolution equation */
for (j = 0U; j <= i; j++)
{
- /* Check the array limitations for inputs */
- if ((((i - j) < srcBLen) && (j < srcALen)))
+ /* Check the array limitations */
+ if (((i - j) < srcBLen) && (j < srcALen))
{
/* z[i] += x[i-j] * y[j] */
- sum += pIn1[j] * pIn2[i - j];
+ sum += ( pIn1[j] * pIn2[i - j]);
}
}
+
/* Store the output in the destination buffer */
pDst[i] = sum;
}
- /* set status as ARM_SUCCESS as there are no argument errors */
+
+ /* Set status as ARM_SUCCESS */
status = ARM_MATH_SUCCESS;
}
+
+ /* Return to application */
return (status);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
}
/**
- * @} end of PartialConv group
+ @} end of PartialConv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c b/DSP/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c
index cf2d711..7166b57 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c
@@ -3,13 +3,13 @@
* Title: arm_conv_partial_fast_opt_q15.c
* Description: Fast Q15 Partial convolution
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,63 +29,63 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup PartialConv
- * @{
+ @addtogroup PartialConv
+ @{
*/
/**
- * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written.
- * @param[in] firstIndex is the first output sample to start with.
- * @param[in] numPoints is the number of output points to be computed.
- * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
- * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
- * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
- *
- * See arm_conv_partial_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
- *
- * \par Restrictions
- * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
- * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit
- *
- */
+ @brief Partial convolution of Q15 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2
+ @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen)
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
-#ifndef UNALIGNED_SUPPORT_DISABLE
+ @remark
+ Refer to \ref arm_conv_partial_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
+ */
arm_status arm_conv_partial_fast_opt_q15(
- q15_t * pSrcA,
- uint32_t srcALen,
- q15_t * pSrcB,
- uint32_t srcBLen,
- q15_t * pDst,
- uint32_t firstIndex,
- uint32_t numPoints,
- q15_t * pScratch1,
- q15_t * pScratch2)
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
{
+ q15_t *pOut = pDst; /* Output pointer */
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
+ q31_t acc0; /* Accumulator */
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ const q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ uint32_t j, k, blkCnt; /* Loop counter */
+ uint32_t tapCnt; /* Loop count */
+ arm_status status; /* Status variable */
+ q31_t x1; /* Temporary variables to hold state and coefficient values */
+ q31_t y1; /* State variables */
- q15_t *pOut = pDst; /* output pointer */
- q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
- q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
- q31_t acc0, acc1, acc2, acc3; /* Accumulator */
- q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */
- q31_t y1, y2; /* State variables */
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q15_t *px; /* Intermediate inputA pointer */
- q15_t *py; /* Intermediate inputB pointer */
- uint32_t j, k, blkCnt; /* loop counter */
- arm_status status;
-
- uint32_t tapCnt; /* loop count */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc1, acc2, acc3; /* Accumulator */
+ q31_t x2, x3; /* Temporary variables to hold state and coefficient values */
+ q31_t y2; /* State variables */
+#endif
/* Check for range of output samples to be calculated */
if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
@@ -95,7 +95,6 @@ arm_status arm_conv_partial_fast_opt_q15(
}
else
{
-
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
/* So srcBLen is always considered as shorter or equal to srcALen */
@@ -130,11 +129,10 @@ arm_status arm_conv_partial_fast_opt_q15(
/* points to smaller length sequence */
px = pIn2;
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- k = srcBLen >> 2U;
+#if defined (ARM_MATH_LOOPUNROLL)
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = srcBLen >> 2U;
/* Copy smaller length input sequence in reverse order into second scratch buffer */
while (k > 0U)
@@ -145,20 +143,26 @@ arm_status arm_conv_partial_fast_opt_q15(
*pScr2-- = *px++;
*pScr2-- = *px++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = srcBLen % 0x4U;
+#else
+
+ /* Initialize k with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
/* copy second buffer in reversal manner for remaining samples */
*pScr2-- = *px++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -193,10 +197,11 @@ arm_status arm_conv_partial_fast_opt_q15(
pOut = pDst + firstIndex;
- /* First part of the processing with loop unrolling process 4 data points at a time.
- ** a second loop below process for the remaining 1 to 3 samples. */
-
/* Actual convolution process starts here */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (numPoints) >> 2;
while (blkCnt > 0)
@@ -211,10 +216,10 @@ arm_status arm_conv_partial_fast_opt_q15(
acc3 = 0;
/* Read two samples from scratch1 buffer */
- x1 = *__SIMD32(pScr1)++;
+ x1 = read_q15x2_ia (&pScr1);
/* Read next two samples from scratch1 buffer */
- x2 = *__SIMD32(pScr1)++;
+ x2 = read_q15x2_ia (&pScr1);
tapCnt = (srcBLen) >> 2U;
@@ -222,8 +227,8 @@ arm_status arm_conv_partial_fast_opt_q15(
{
/* Read four samples from smaller buffer */
- y1 = _SIMD32_OFFSET(pIn2);
- y2 = _SIMD32_OFFSET(pIn2 + 2U);
+ y1 = read_q15x2_ia ((q15_t **) &pIn2);
+ y2 = read_q15x2_ia ((q15_t **) &pIn2);
/* multiply and accumlate */
acc0 = __SMLAD(x1, y1, acc0);
@@ -240,11 +245,10 @@ arm_status arm_conv_partial_fast_opt_q15(
acc1 = __SMLADX(x3, y1, acc1);
/* Read next two samples from scratch1 buffer */
- x1 = _SIMD32_OFFSET(pScr1);
+ x1 = read_q15x2_ia (&pScr1);
/* multiply and accumlate */
acc0 = __SMLAD(x2, y2, acc0);
-
acc2 = __SMLAD(x1, y2, acc2);
/* pack input data */
@@ -257,7 +261,7 @@ arm_status arm_conv_partial_fast_opt_q15(
acc3 = __SMLADX(x3, y1, acc3);
acc1 = __SMLADX(x3, y2, acc1);
- x2 = _SIMD32_OFFSET(pScr1 + 2U);
+ x2 = read_q15x2_ia (&pScr1);
#ifndef ARM_MATH_BIG_ENDIAN
x3 = __PKHBT(x2, x1, 0);
@@ -265,14 +269,10 @@ arm_status arm_conv_partial_fast_opt_q15(
x3 = __PKHBT(x1, x2, 0);
#endif
+ /* multiply and accumlate */
acc3 = __SMLADX(x3, y2, acc3);
- /* update scratch pointers */
- pIn2 += 4U;
- pScr1 += 4U;
-
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -284,7 +284,6 @@ arm_status arm_conv_partial_fast_opt_q15(
while (tapCnt > 0U)
{
-
/* accumlate the results */
acc0 += (*pScr1++ * *pIn2);
acc1 += (*pScr1++ * *pIn2);
@@ -293,41 +292,37 @@ arm_status arm_conv_partial_fast_opt_q15(
pScr1 -= 3U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
blkCnt--;
-
/* Store the results in the accumulators in the destination buffer. */
-
#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
-
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
#else
-
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Initialization of inputB pointer */
pIn2 = py;
pScratch1 += 4U;
-
}
-
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numPoints & 0x3;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numPoints;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
/* Calculate convolution for remaining samples of Bigger length sequence */
while (blkCnt > 0)
{
@@ -341,16 +336,16 @@ arm_status arm_conv_partial_fast_opt_q15(
while (tapCnt > 0U)
{
-
/* Read next two samples from scratch1 buffer */
- x1 = *__SIMD32(pScr1)++;
+ x1 = read_q15x2_ia (&pScr1);
/* Read two samples from smaller buffer */
- y1 = *__SIMD32(pIn2)++;
+ y1 = read_q15x2_ia ((q15_t **) &pIn2);
+ /* multiply and accumlate */
acc0 = __SMLAD(x1, y1, acc0);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -359,11 +354,10 @@ arm_status arm_conv_partial_fast_opt_q15(
/* apply same above for remaining samples of smaller length sequence */
while (tapCnt > 0U)
{
-
/* accumlate the results */
acc0 += (*pScr1++ * *pIn2++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -379,378 +373,15 @@ arm_status arm_conv_partial_fast_opt_q15(
pScratch1 += 1U;
}
- /* set status as ARM_MATH_SUCCESS */
+
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
- /* Return to application */
- return (status);
-}
-
-#else
-
-arm_status arm_conv_partial_fast_opt_q15(
- q15_t * pSrcA,
- uint32_t srcALen,
- q15_t * pSrcB,
- uint32_t srcBLen,
- q15_t * pDst,
- uint32_t firstIndex,
- uint32_t numPoints,
- q15_t * pScratch1,
- q15_t * pScratch2)
-{
-
- q15_t *pOut = pDst; /* output pointer */
- q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
- q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
- q31_t acc0, acc1, acc2, acc3; /* Accumulator */
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q15_t *px; /* Intermediate inputA pointer */
- q15_t *py; /* Intermediate inputB pointer */
- uint32_t j, k, blkCnt; /* loop counter */
- arm_status status; /* Status variable */
- uint32_t tapCnt; /* loop count */
- q15_t x10, x11, x20, x21; /* Temporary variables to hold srcA buffer */
- q15_t y10, y11; /* Temporary variables to hold srcB buffer */
-
-
- /* Check for range of output samples to be calculated */
- if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
- {
- /* Set status as ARM_MATH_ARGUMENT_ERROR */
- status = ARM_MATH_ARGUMENT_ERROR;
- }
- else
- {
-
- /* The algorithm implementation is based on the lengths of the inputs. */
- /* srcB is always made to slide across srcA. */
- /* So srcBLen is always considered as shorter or equal to srcALen */
- if (srcALen >= srcBLen)
- {
- /* Initialization of inputA pointer */
- pIn1 = pSrcA;
-
- /* Initialization of inputB pointer */
- pIn2 = pSrcB;
- }
- else
- {
- /* Initialization of inputA pointer */
- pIn1 = pSrcB;
-
- /* Initialization of inputB pointer */
- pIn2 = pSrcA;
-
- /* srcBLen is always considered as shorter or equal to srcALen */
- j = srcBLen;
- srcBLen = srcALen;
- srcALen = j;
- }
-
- /* Temporary pointer for scratch2 */
- py = pScratch2;
-
- /* pointer to take end of scratch2 buffer */
- pScr2 = pScratch2 + srcBLen - 1;
-
- /* points to smaller length sequence */
- px = pIn2;
-
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- k = srcBLen >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr2-- = *px++;
- *pScr2-- = *px++;
- *pScr2-- = *px++;
- *pScr2-- = *px++;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- k = srcBLen % 0x4U;
-
- while (k > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr2-- = *px++;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Initialze temporary scratch pointer */
- pScr1 = pScratch1;
-
- /* Fill (srcBLen - 1U) zeros in scratch buffer */
- arm_fill_q15(0, pScr1, (srcBLen - 1U));
-
- /* Update temporary scratch pointer */
- pScr1 += (srcBLen - 1U);
-
- /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
-
-
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- k = srcALen >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr1++ = *pIn1++;
- *pScr1++ = *pIn1++;
- *pScr1++ = *pIn1++;
- *pScr1++ = *pIn1++;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- k = srcALen % 0x4U;
-
- while (k > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr1++ = *pIn1++;
-
- /* Decrement the loop counter */
- k--;
- }
-
-
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- k = (srcBLen - 1U) >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr1++ = 0;
- *pScr1++ = 0;
- *pScr1++ = 0;
- *pScr1++ = 0;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- k = (srcBLen - 1U) % 0x4U;
-
- while (k > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr1++ = 0;
-
- /* Decrement the loop counter */
- k--;
- }
-
-
- /* Initialization of pIn2 pointer */
- pIn2 = py;
-
- pScratch1 += firstIndex;
-
- pOut = pDst + firstIndex;
-
- /* Actual convolution process starts here */
- blkCnt = (numPoints) >> 2;
-
- while (blkCnt > 0)
- {
- /* Initialze temporary scratch pointer as scratch1 */
- pScr1 = pScratch1;
-
- /* Clear Accumlators */
- acc0 = 0;
- acc1 = 0;
- acc2 = 0;
- acc3 = 0;
-
- /* Read two samples from scratch1 buffer */
- x10 = *pScr1++;
- x11 = *pScr1++;
-
- /* Read next two samples from scratch1 buffer */
- x20 = *pScr1++;
- x21 = *pScr1++;
-
- tapCnt = (srcBLen) >> 2U;
-
- while (tapCnt > 0U)
- {
-
- /* Read two samples from smaller buffer */
- y10 = *pIn2;
- y11 = *(pIn2 + 1U);
-
- /* multiply and accumlate */
- acc0 += (q31_t) x10 *y10;
- acc0 += (q31_t) x11 *y11;
- acc2 += (q31_t) x20 *y10;
- acc2 += (q31_t) x21 *y11;
-
- /* multiply and accumlate */
- acc1 += (q31_t) x11 *y10;
- acc1 += (q31_t) x20 *y11;
-
- /* Read next two samples from scratch1 buffer */
- x10 = *pScr1;
- x11 = *(pScr1 + 1U);
-
- /* multiply and accumlate */
- acc3 += (q31_t) x21 *y10;
- acc3 += (q31_t) x10 *y11;
-
- /* Read next two samples from scratch2 buffer */
- y10 = *(pIn2 + 2U);
- y11 = *(pIn2 + 3U);
-
- /* multiply and accumlate */
- acc0 += (q31_t) x20 *y10;
- acc0 += (q31_t) x21 *y11;
- acc2 += (q31_t) x10 *y10;
- acc2 += (q31_t) x11 *y11;
- acc1 += (q31_t) x21 *y10;
- acc1 += (q31_t) x10 *y11;
-
- /* Read next two samples from scratch1 buffer */
- x20 = *(pScr1 + 2);
- x21 = *(pScr1 + 3);
-
- /* multiply and accumlate */
- acc3 += (q31_t) x11 *y10;
- acc3 += (q31_t) x20 *y11;
-
- /* update scratch pointers */
- pIn2 += 4U;
- pScr1 += 4U;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Update scratch pointer for remaining samples of smaller length sequence */
- pScr1 -= 4U;
-
- /* apply same above for remaining samples of smaller length sequence */
- tapCnt = (srcBLen) & 3U;
-
- while (tapCnt > 0U)
- {
- /* accumlate the results */
- acc0 += (*pScr1++ * *pIn2);
- acc1 += (*pScr1++ * *pIn2);
- acc2 += (*pScr1++ * *pIn2);
- acc3 += (*pScr1++ * *pIn2++);
-
- pScr1 -= 3U;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- blkCnt--;
-
-
- /* Store the results in the accumulators in the destination buffer. */
- *pOut++ = __SSAT((acc0 >> 15), 16);
- *pOut++ = __SSAT((acc1 >> 15), 16);
- *pOut++ = __SSAT((acc2 >> 15), 16);
- *pOut++ = __SSAT((acc3 >> 15), 16);
-
- /* Initialization of inputB pointer */
- pIn2 = py;
-
- pScratch1 += 4U;
-
- }
-
-
- blkCnt = numPoints & 0x3;
-
- /* Calculate convolution for remaining samples of Bigger length sequence */
- while (blkCnt > 0)
- {
- /* Initialze temporary scratch pointer as scratch1 */
- pScr1 = pScratch1;
-
- /* Clear Accumlators */
- acc0 = 0;
-
- tapCnt = (srcBLen) >> 1U;
-
- while (tapCnt > 0U)
- {
-
- /* Read next two samples from scratch1 buffer */
- x10 = *pScr1++;
- x11 = *pScr1++;
-
- /* Read two samples from smaller buffer */
- y10 = *pIn2++;
- y11 = *pIn2++;
-
- /* multiply and accumlate */
- acc0 += (q31_t) x10 *y10;
- acc0 += (q31_t) x11 *y11;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- tapCnt = (srcBLen) & 1U;
-
- /* apply same above for remaining samples of smaller length sequence */
- while (tapCnt > 0U)
- {
-
- /* accumlate the results */
- acc0 += (*pScr1++ * *pIn2++);
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- blkCnt--;
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
-
- /* Initialization of inputB pointer */
- pIn2 = py;
-
- pScratch1 += 1U;
-
- }
-
- /* set status as ARM_MATH_SUCCESS */
- status = ARM_MATH_SUCCESS;
-
- }
/* Return to application */
return (status);
}
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
/**
- * @} end of PartialConv group
+ @} end of PartialConv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q15.c b/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q15.c
index bd43a98..535fbc7 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q15.c
@@ -3,13 +3,13 @@
* Title: arm_conv_partial_fast_q15.c
* Description: Fast Q15 Partial convolution
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,51 +29,50 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup PartialConv
- * @{
+ @addtogroup PartialConv
+ @{
*/
/**
- * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written.
- * @param[in] firstIndex is the first output sample to start with.
- * @param[in] numPoints is the number of output points to be computed.
- * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
- *
- * See arm_conv_partial_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
+ @brief Partial convolution of Q15 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
+ @remark
+ Refer to \ref arm_conv_partial_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
*/
-
arm_status arm_conv_partial_fast_q15(
- q15_t * pSrcA,
- uint32_t srcALen,
- q15_t * pSrcB,
- uint32_t srcBLen,
- q15_t * pDst,
- uint32_t firstIndex,
- uint32_t numPoints)
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
{
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q15_t *pOut = pDst; /* output pointer */
- q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
- q15_t *px; /* Intermediate inputA pointer */
- q15_t *py; /* Intermediate inputB pointer */
- q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
- q31_t x0, x1, x2, x3, c0;
- uint32_t j, k, count, check, blkCnt;
- int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
- arm_status status; /* status of Partial convolution */
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ q15_t *pOut = pDst; /* Output pointer */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
+ const q15_t *px; /* Intermediate inputA pointer */
+ const q15_t *py; /* Intermediate inputB pointer */
+ const q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* Temporary input variables */
+ uint32_t j, k, count, blkCnt, check;
+ int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
/* Check for range of output samples to be calculated */
if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
@@ -83,11 +82,10 @@ arm_status arm_conv_partial_fast_q15(
}
else
{
-
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
/* So srcBLen is always considered as shorter or equal to srcALen */
- if (srcALen >=srcBLen)
+ if (srcALen >= srcBLen)
{
/* Initialization of inputA pointer */
pIn1 = pSrcA;
@@ -114,11 +112,9 @@ arm_status arm_conv_partial_fast_q15(
check = firstIndex + numPoints;
blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
- blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
- blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
- (int32_t) numPoints) : 0;
- blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
- (int32_t) firstIndex);
+ blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0;
+ blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
@@ -180,7 +176,7 @@ arm_status arm_conv_partial_fast_q15(
/* Perform the multiply-accumulates */
sum = __SMLAD(*px++, *py--, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -191,10 +187,10 @@ arm_status arm_conv_partial_fast_q15(
py = ++pSrc2;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -213,25 +209,25 @@ arm_status arm_conv_partial_fast_q15(
k = count >> 2U;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
- sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
/* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
- sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
/* For the next MAC operations, the pointer py is used without SIMD
- * So, py is incremented by 1 */
+ So, py is incremented by 1 */
py = py + 1U;
/* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ No loop unrolling is used. */
k = count % 0x4U;
while (k > 0U)
@@ -239,7 +235,7 @@ arm_status arm_conv_partial_fast_q15(
/* Perform the multiply-accumulates */
sum = __SMLAD(*px++, *py--, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -250,10 +246,10 @@ arm_status arm_conv_partial_fast_q15(
py = ++pSrc2 - 1U;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -270,12 +266,13 @@ arm_status arm_conv_partial_fast_q15(
/* Working pointer of inputA */
if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
{
- px = pIn1 + firstIndex - srcBLen + 1;
+ pSrc1 = pIn1 + firstIndex - srcBLen + 1;
}
else
{
- px = pIn1;
+ pSrc1 = pIn1;
}
+ px = pSrc1;
/* Working pointer of inputB */
pSrc2 = pIn2 + (srcBLen - 1U);
@@ -284,22 +281,21 @@ arm_status arm_conv_partial_fast_q15(
/* count is the index by which the pointer pIn1 to be incremented */
count = 0U;
-
- /* --------------------
+ /* -------------------
* Stage2 process
- * -------------------*/
+ * ------------------*/
/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
* So, to loop unroll over blockSize2,
* srcBLen should be greater than or equal to 4 */
if (srcBLen >= 4U)
{
- /* Loop unroll over blockSize2, by 4 */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = ((uint32_t) blockSize2 >> 2U);
while (blkCnt > 0U)
{
- py = py - 1U;
+ py = py - 1U;
/* Set all accumulators to zero */
acc0 = 0;
@@ -309,10 +305,10 @@ arm_status arm_conv_partial_fast_q15(
/* read x[0], x[1] samples */
- x0 = *__SIMD32(px);
+ x0 = read_q15x2 ((q15_t *) px);
/* read x[1], x[2] samples */
- x1 = _SIMD32_OFFSET(px+1);
- px+= 2U;
+ x1 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
@@ -324,7 +320,7 @@ arm_status arm_conv_partial_fast_q15(
{
/* Read the last two inputB samples using SIMD:
* y[srcBLen - 1] and y[srcBLen - 2] */
- c0 = *__SIMD32(py)--;
+ c0 = read_q15x2_da ((q15_t **) &py);
/* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
acc0 = __SMLADX(x0, c0, acc0);
@@ -333,10 +329,10 @@ arm_status arm_conv_partial_fast_q15(
acc1 = __SMLADX(x1, c0, acc1);
/* Read x[2], x[3] */
- x2 = *__SIMD32(px);
+ x2 = read_q15x2 ((q15_t *) px);
/* Read x[3], x[4] */
- x3 = _SIMD32_OFFSET(px+1);
+ x3 = read_q15x2 ((q15_t *) px + 1);
/* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
acc2 = __SMLADX(x2, c0, acc2);
@@ -345,7 +341,7 @@ arm_status arm_conv_partial_fast_q15(
acc3 = __SMLADX(x3, c0, acc3);
/* Read y[srcBLen - 3] and y[srcBLen - 4] */
- c0 = *__SIMD32(py)--;
+ c0 = read_q15x2_da ((q15_t **) &py);
/* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
acc0 = __SMLADX(x2, c0, acc0);
@@ -354,11 +350,11 @@ arm_status arm_conv_partial_fast_q15(
acc1 = __SMLADX(x3, c0, acc1);
/* Read x[4], x[5] */
- x0 = _SIMD32_OFFSET(px+2);
+ x0 = read_q15x2 ((q15_t *) px + 2);
/* Read x[5], x[6] */
- x1 = _SIMD32_OFFSET(px+3);
- px += 4U;
+ x1 = read_q15x2 ((q15_t *) px + 3);
+ px += 4U;
/* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
acc2 = __SMLADX(x0, c0, acc2);
@@ -369,33 +365,29 @@ arm_status arm_conv_partial_fast_q15(
} while (--k);
/* For the next MAC operations, SIMD is not used
- * So, the 16 bit pointer if inputB, py is updated */
+ So, the 16 bit pointer if inputB, py is updated */
/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ No loop unrolling is used. */
k = srcBLen % 0x4U;
if (k == 1U)
{
/* Read y[srcBLen - 5] */
- c0 = *(py+1);
+ c0 = *(py + 1);
#ifdef ARM_MATH_BIG_ENDIAN
-
- c0 = c0 << 16U;
-
+ c0 = c0 << 16U;
#else
-
- c0 = c0 & 0x0000FFFF;
-
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
/* Read x[7] */
- x3 = *__SIMD32(px);
- px++;
+ x3 = read_q15x2 ((q15_t *) px);
+ px++;
- /* Perform the multiply-accumulates */
- acc0 = __SMLAD(x0, c0, acc0);
- acc1 = __SMLAD(x1, c0, acc1);
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLAD (x0, c0, acc0);
+ acc1 = __SMLAD (x1, c0, acc1);
acc2 = __SMLADX(x1, c0, acc2);
acc3 = __SMLADX(x3, c0, acc3);
}
@@ -403,16 +395,16 @@ arm_status arm_conv_partial_fast_q15(
if (k == 2U)
{
/* Read y[srcBLen - 5], y[srcBLen - 6] */
- c0 = _SIMD32_OFFSET(py);
+ c0 = read_q15x2 ((q15_t *) py);
/* Read x[7], x[8] */
- x3 = *__SIMD32(px);
+ x3 = read_q15x2 ((q15_t *) px);
- /* Read x[9] */
- x2 = _SIMD32_OFFSET(px+1);
- px += 2U;
+ /* Read x[9] */
+ x2 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
acc0 = __SMLADX(x0, c0, acc0);
acc1 = __SMLADX(x1, c0, acc1);
acc2 = __SMLADX(x3, c0, acc2);
@@ -422,58 +414,52 @@ arm_status arm_conv_partial_fast_q15(
if (k == 3U)
{
/* Read y[srcBLen - 5], y[srcBLen - 6] */
- c0 = _SIMD32_OFFSET(py);
+ c0 = read_q15x2 ((q15_t *) py);
/* Read x[7], x[8] */
- x3 = *__SIMD32(px);
+ x3 = read_q15x2 ((q15_t *) px);
/* Read x[9] */
- x2 = _SIMD32_OFFSET(px+1);
+ x2 = read_q15x2 ((q15_t *) px + 1);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
acc0 = __SMLADX(x0, c0, acc0);
acc1 = __SMLADX(x1, c0, acc1);
acc2 = __SMLADX(x3, c0, acc2);
acc3 = __SMLADX(x2, c0, acc3);
- c0 = *(py-1);
+ c0 = *(py-1);
#ifdef ARM_MATH_BIG_ENDIAN
-
- c0 = c0 << 16U;
+ c0 = c0 << 16U;
#else
-
- c0 = c0 & 0x0000FFFF;
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
/* Read x[10] */
- x3 = _SIMD32_OFFSET(px+2);
- px += 3U;
+ x3 = read_q15x2 ((q15_t *) px + 2);
+ px += 3U;
/* Perform the multiply-accumulates */
acc0 = __SMLADX(x1, c0, acc0);
- acc1 = __SMLAD(x2, c0, acc1);
+ acc1 = __SMLAD (x2, c0, acc1);
acc2 = __SMLADX(x2, c0, acc2);
acc3 = __SMLADX(x3, c0, acc3);
}
/* Store the results in the accumulators in the destination buffer. */
#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16);
- *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16);
-
+ write_q15x2_ia (&pOut, __PKHBT(acc0 >> 15, acc1 >> 15, 16));
+ write_q15x2_ia (&pOut, __PKHBT(acc2 >> 15, acc3 >> 15, 16));
#else
-
- *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16);
- *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ write_q15x2_ia (&pOut, __PKHBT(acc1 >> 15, acc0 >> 15, 16));
+ write_q15x2_ia (&pOut, __PKHBT(acc3 >> 15, acc2 >> 15, 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Increment the pointer pIn1 index, count by 4 */
count += 4U;
/* Update the inputA and inputB pointers for next MAC calculation */
- px = pIn1 + count;
+ px = pSrc1 + count;
py = pSrc2;
/* Decrement the loop counter */
@@ -481,7 +467,7 @@ arm_status arm_conv_partial_fast_q15(
}
/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ No loop unrolling is used. */
blkCnt = (uint32_t) blockSize2 % 0x4U;
while (blkCnt > 0U)
@@ -493,16 +479,16 @@ arm_status arm_conv_partial_fast_q15(
k = srcBLen >> 2U;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -513,7 +499,7 @@ arm_status arm_conv_partial_fast_q15(
while (k > 0U)
{
/* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
/* Decrement the loop counter */
k--;
@@ -526,17 +512,10 @@ arm_status arm_conv_partial_fast_q15(
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
+ px = pSrc1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
@@ -557,7 +536,7 @@ arm_status arm_conv_partial_fast_q15(
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
/* Decrement the loop counter */
k--;
@@ -570,14 +549,7 @@ arm_status arm_conv_partial_fast_q15(
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
+ px = pSrc1 + count;
py = pSrc2;
/* Decrement the loop counter */
@@ -635,21 +607,21 @@ arm_status arm_conv_partial_fast_q15(
{
/* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
* with y[srcBLen - 1], y[srcBLen - 2] respectively */
- sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
/* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
* with y[srcBLen - 3], y[srcBLen - 4] respectively */
- sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
/* For the next MAC operations, the pointer py is used without SIMD
- * So, py is incremented by 1 */
+ So, py is incremented by 1 */
py = py + 1U;
/* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ No loop unrolling is used. */
k = count % 0x4U;
while (k > 0U)
@@ -714,781 +686,15 @@ arm_status arm_conv_partial_fast_q15(
blockSize3--;
}
- /* set status as ARM_MATH_SUCCESS */
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
-#else
-
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q15_t *pOut = pDst; /* output pointer */
- q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
- q15_t *px; /* Intermediate inputA pointer */
- q15_t *py; /* Intermediate inputB pointer */
- q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
- q31_t x0, x1, x2, x3, c0;
- uint32_t j, k, count, check, blkCnt;
- int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
- arm_status status; /* status of Partial convolution */
- q15_t a, b;
-
- /* Check for range of output samples to be calculated */
- if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
- {
- /* Set status as ARM_MATH_ARGUMENT_ERROR */
- status = ARM_MATH_ARGUMENT_ERROR;
- }
- else
- {
-
- /* The algorithm implementation is based on the lengths of the inputs. */
- /* srcB is always made to slide across srcA. */
- /* So srcBLen is always considered as shorter or equal to srcALen */
- if (srcALen >=srcBLen)
- {
- /* Initialization of inputA pointer */
- pIn1 = pSrcA;
-
- /* Initialization of inputB pointer */
- pIn2 = pSrcB;
- }
- else
- {
- /* Initialization of inputA pointer */
- pIn1 = pSrcB;
-
- /* Initialization of inputB pointer */
- pIn2 = pSrcA;
-
- /* srcBLen is always considered as shorter or equal to srcALen */
- j = srcBLen;
- srcBLen = srcALen;
- srcALen = j;
- }
-
- /* Conditions to check which loopCounter holds
- * the first and last indices of the output samples to be calculated. */
- check = firstIndex + numPoints;
- blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
- blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
- blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
- blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
- (int32_t) numPoints) : 0;
- blockSize2 = ((int32_t) check - blockSize3) -
- (blockSize1 + (int32_t) firstIndex);
- blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
-
- /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
- /* The function is internally
- * divided into three stages according to the number of multiplications that has to be
- * taken place between inputA samples and inputB samples. In the first stage of the
- * algorithm, the multiplications increase by one for every iteration.
- * In the second stage of the algorithm, srcBLen number of multiplications are done.
- * In the third stage of the algorithm, the multiplications decrease by one
- * for every iteration. */
-
- /* Set the output pointer to point to the firstIndex
- * of the output sample to be calculated. */
- pOut = pDst + firstIndex;
-
- /* --------------------------
- * Initializations of stage1
- * -------------------------*/
-
- /* sum = x[0] * y[0]
- * sum = x[0] * y[1] + x[1] * y[0]
- * ....
- * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
- */
-
- /* In this stage the MAC operations are increased by 1 for every iteration.
- The count variable holds the number of MAC operations performed.
- Since the partial convolution starts from firstIndex
- Number of Macs to be performed is firstIndex + 1 */
- count = 1U + firstIndex;
-
- /* Working pointer of inputA */
- px = pIn1;
-
- /* Working pointer of inputB */
- pSrc2 = pIn2 + firstIndex;
- py = pSrc2;
-
- /* ------------------------
- * Stage1 process
- * ----------------------*/
-
- /* For loop unrolling by 4, this stage is divided into two. */
- /* First part of this stage computes the MAC operations less than 4 */
- /* Second part of this stage computes the MAC operations greater than or equal to 4 */
-
- /* The first part of the stage starts here */
- while ((count < 4U) && (blockSize1 > 0))
- {
- /* Accumulator is made zero for every iteration */
- sum = 0;
-
- /* Loop over number of MAC operations between
- * inputA samples and inputB samples */
- k = count;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = (q15_t) (sum >> 15);
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- py = ++pSrc2;
- px = pIn1;
-
- /* Increment the MAC count */
- count++;
-
- /* Decrement the loop counter */
- blockSize1--;
- }
-
- /* The second part of the stage starts here */
- /* The internal loop, over count, is unrolled by 4 */
- /* To, read the last two inputB samples using SIMD:
- * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
- py = py - 1;
-
- while (blockSize1 > 0)
- {
- /* Accumulator is made zero for every iteration */
- sum = 0;
-
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = count >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
- py++;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
- k = count % 0x4U;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = (q15_t) (sum >> 15);
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- py = ++pSrc2 - 1U;
- px = pIn1;
-
- /* Increment the MAC count */
- count++;
-
- /* Decrement the loop counter */
- blockSize1--;
- }
-
- /* --------------------------
- * Initializations of stage2
- * ------------------------*/
-
- /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
- * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
- * ....
- * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
- */
-
- /* Working pointer of inputA */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1;
- }
- else
- {
- px = pIn1;
- }
-
- /* Working pointer of inputB */
- pSrc2 = pIn2 + (srcBLen - 1U);
- py = pSrc2;
-
- /* count is the index by which the pointer pIn1 to be incremented */
- count = 0U;
-
-
- /* --------------------
- * Stage2 process
- * -------------------*/
-
- /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
- * So, to loop unroll over blockSize2,
- * srcBLen should be greater than or equal to 4 */
- if (srcBLen >= 4U)
- {
- /* Loop unroll over blockSize2, by 4 */
- blkCnt = ((uint32_t) blockSize2 >> 2U);
-
- while (blkCnt > 0U)
- {
- py = py - 1U;
-
- /* Set all accumulators to zero */
- acc0 = 0;
- acc1 = 0;
- acc2 = 0;
- acc3 = 0;
-
- /* read x[0], x[1] samples */
- a = *px++;
- b = *px++;
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x0 = __PKHBT(a, b, 16);
- a = *px;
- x1 = __PKHBT(b, a, 16);
-
-#else
-
- x0 = __PKHBT(b, a, 16);
- a = *px;
- x1 = __PKHBT(a, b, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = srcBLen >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
- do
- {
- /* Read the last two inputB samples using SIMD:
- * y[srcBLen - 1] and y[srcBLen - 2] */
- a = *py;
- b = *(py+1);
- py -= 2;
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- c0 = __PKHBT(a, b, 16);
-
-#else
-
- c0 = __PKHBT(b, a, 16);;
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
- acc0 = __SMLADX(x0, c0, acc0);
-
- /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
- acc1 = __SMLADX(x1, c0, acc1);
-
- a = *px;
- b = *(px + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x2 = __PKHBT(a, b, 16);
- a = *(px + 2);
- x3 = __PKHBT(b, a, 16);
-
-#else
-
- x2 = __PKHBT(b, a, 16);
- a = *(px + 2);
- x3 = __PKHBT(a, b, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
- acc2 = __SMLADX(x2, c0, acc2);
-
- /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
- acc3 = __SMLADX(x3, c0, acc3);
-
- /* Read y[srcBLen - 3] and y[srcBLen - 4] */
- a = *py;
- b = *(py+1);
- py -= 2;
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- c0 = __PKHBT(a, b, 16);
-
-#else
-
- c0 = __PKHBT(b, a, 16);;
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
- acc0 = __SMLADX(x2, c0, acc0);
-
- /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
- acc1 = __SMLADX(x3, c0, acc1);
-
- /* Read x[4], x[5], x[6] */
- a = *(px + 2);
- b = *(px + 3);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x0 = __PKHBT(a, b, 16);
- a = *(px + 4);
- x1 = __PKHBT(b, a, 16);
-
-#else
-
- x0 = __PKHBT(b, a, 16);
- a = *(px + 4);
- x1 = __PKHBT(a, b, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- px += 4U;
-
- /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
- acc2 = __SMLADX(x0, c0, acc2);
-
- /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
- acc3 = __SMLADX(x1, c0, acc3);
-
- } while (--k);
-
- /* For the next MAC operations, SIMD is not used
- * So, the 16 bit pointer if inputB, py is updated */
-
- /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
- k = srcBLen % 0x4U;
-
- if (k == 1U)
- {
- /* Read y[srcBLen - 5] */
- c0 = *(py+1);
-
-#ifdef ARM_MATH_BIG_ENDIAN
-
- c0 = c0 << 16U;
-
-#else
-
- c0 = c0 & 0x0000FFFF;
-
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
-
- /* Read x[7] */
- a = *px;
- b = *(px+1);
- px++;
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x3 = __PKHBT(a, b, 16);
-
-#else
-
- x3 = __PKHBT(b, a, 16);;
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-
- /* Perform the multiply-accumulates */
- acc0 = __SMLAD(x0, c0, acc0);
- acc1 = __SMLAD(x1, c0, acc1);
- acc2 = __SMLADX(x1, c0, acc2);
- acc3 = __SMLADX(x3, c0, acc3);
- }
-
- if (k == 2U)
- {
- /* Read y[srcBLen - 5], y[srcBLen - 6] */
- a = *py;
- b = *(py+1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- c0 = __PKHBT(a, b, 16);
-
-#else
-
- c0 = __PKHBT(b, a, 16);;
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Read x[7], x[8], x[9] */
- a = *px;
- b = *(px + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x3 = __PKHBT(a, b, 16);
- a = *(px + 2);
- x2 = __PKHBT(b, a, 16);
-
-#else
-
- x3 = __PKHBT(b, a, 16);
- a = *(px + 2);
- x2 = __PKHBT(a, b, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
- px += 2U;
-
- /* Perform the multiply-accumulates */
- acc0 = __SMLADX(x0, c0, acc0);
- acc1 = __SMLADX(x1, c0, acc1);
- acc2 = __SMLADX(x3, c0, acc2);
- acc3 = __SMLADX(x2, c0, acc3);
- }
-
- if (k == 3U)
- {
- /* Read y[srcBLen - 5], y[srcBLen - 6] */
- a = *py;
- b = *(py+1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- c0 = __PKHBT(a, b, 16);
-
-#else
-
- c0 = __PKHBT(b, a, 16);;
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Read x[7], x[8], x[9] */
- a = *px;
- b = *(px + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x3 = __PKHBT(a, b, 16);
- a = *(px + 2);
- x2 = __PKHBT(b, a, 16);
-
-#else
-
- x3 = __PKHBT(b, a, 16);
- a = *(px + 2);
- x2 = __PKHBT(a, b, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Perform the multiply-accumulates */
- acc0 = __SMLADX(x0, c0, acc0);
- acc1 = __SMLADX(x1, c0, acc1);
- acc2 = __SMLADX(x3, c0, acc2);
- acc3 = __SMLADX(x2, c0, acc3);
-
- /* Read y[srcBLen - 7] */
- c0 = *(py-1);
-#ifdef ARM_MATH_BIG_ENDIAN
-
- c0 = c0 << 16U;
-#else
-
- c0 = c0 & 0x0000FFFF;
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
-
- /* Read x[10] */
- a = *(px+2);
- b = *(px+3);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x3 = __PKHBT(a, b, 16);
-
-#else
-
- x3 = __PKHBT(b, a, 16);;
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- px += 3U;
-
- /* Perform the multiply-accumulates */
- acc0 = __SMLADX(x1, c0, acc0);
- acc1 = __SMLAD(x2, c0, acc1);
- acc2 = __SMLADX(x2, c0, acc2);
- acc3 = __SMLADX(x3, c0, acc3);
- }
-
- /* Store the results in the accumulators in the destination buffer. */
- *pOut++ = (q15_t)(acc0 >> 15);
- *pOut++ = (q15_t)(acc1 >> 15);
- *pOut++ = (q15_t)(acc2 >> 15);
- *pOut++ = (q15_t)(acc3 >> 15);
-
- /* Increment the pointer pIn1 index, count by 4 */
- count += 4U;
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = pIn1 + count;
- py = pSrc2;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = (uint32_t) blockSize2 % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* Accumulator is made zero for every iteration */
- sum = 0;
-
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = srcBLen >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
- k = srcBLen % 0x4U;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = (q15_t) (sum >> 15);
-
- /* Increment the pointer pIn1 index, count by 1 */
- count++;
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = pIn1 + count;
- py = pSrc2;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
- }
- else
- {
- /* If the srcBLen is not a multiple of 4,
- * the blockSize2 loop cannot be unrolled by 4 */
- blkCnt = (uint32_t) blockSize2;
-
- while (blkCnt > 0U)
- {
- /* Accumulator is made zero for every iteration */
- sum = 0;
-
- /* srcBLen number of MACS should be performed */
- k = srcBLen;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulate */
- sum += ((q31_t) * px++ * *py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = (q15_t) (sum >> 15);
-
- /* Increment the MAC count */
- count++;
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = pIn1 + count;
- py = pSrc2;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
- }
-
-
- /* --------------------------
- * Initializations of stage3
- * -------------------------*/
-
- /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
- * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
- * ....
- * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
- * sum += x[srcALen-1] * y[srcBLen-1]
- */
-
- /* In this stage the MAC operations are decreased by 1 for every iteration.
- The count variable holds the number of MAC operations performed */
- count = srcBLen - 1U;
-
- /* Working pointer of inputA */
- pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
- px = pSrc1;
-
- /* Working pointer of inputB */
- pSrc2 = pIn2 + (srcBLen - 1U);
- pIn2 = pSrc2 - 1U;
- py = pIn2;
-
- /* -------------------
- * Stage3 process
- * ------------------*/
-
- /* For loop unrolling by 4, this stage is divided into two. */
- /* First part of this stage computes the MAC operations greater than 4 */
- /* Second part of this stage computes the MAC operations less than or equal to 4 */
-
- /* The first part of the stage starts here */
- j = count >> 2U;
-
- while ((j > 0U) && (blockSize3 > 0))
- {
- /* Accumulator is made zero for every iteration */
- sum = 0;
-
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = count >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
- py++;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- sum += ((q31_t) * px++ * *py--);
- /* Decrement the loop counter */
- k--;
- }
-
-
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
- k = count % 0x4U;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = (q15_t) (sum >> 15);
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = ++pSrc1;
- py = pIn2;
-
- /* Decrement the MAC count */
- count--;
-
- /* Decrement the loop counter */
- blockSize3--;
-
- j--;
- }
-
- /* The second part of the stage starts here */
- /* SIMD is not used for the next MAC operations,
- * so pointer py is updated to read only one sample at a time */
- py = py + 1U;
-
- while (blockSize3 > 0)
- {
- /* Accumulator is made zero for every iteration */
- sum = 0;
-
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = count;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- /* sum += x[srcALen-1] * y[srcBLen-1] */
- sum += ((q31_t) * px++ * *py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = (q15_t) (sum >> 15);
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = ++pSrc1;
- py = pSrc2;
-
- /* Decrement the MAC count */
- count--;
-
- /* Decrement the loop counter */
- blockSize3--;
- }
-
- /* set status as ARM_MATH_SUCCESS */
- status = ARM_MATH_SUCCESS;
- }
-
- /* Return to application */
- return (status);
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
}
/**
- * @} end of PartialConv group
+ @} end of PartialConv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q31.c b/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q31.c
index af3724d..f232d51 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q31.c
@@ -3,13 +3,13 @@
* Title: arm_conv_partial_fast_q31.c
* Description: Fast Q31 Partial convolution
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,50 +29,55 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup PartialConv
- * @{
+ @addtogroup PartialConv
+ @{
*/
/**
- * @brief Partial convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written.
- * @param[in] firstIndex is the first output sample to start with.
- * @param[in] numPoints is the number of output points to be computed.
- * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
- *
- * \par
- * See arm_conv_partial_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.
+ @brief Partial convolution of Q31 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
+
+ @remark
+ Refer to \ref arm_conv_partial_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.
*/
arm_status arm_conv_partial_fast_q31(
- q31_t * pSrcA,
- uint32_t srcALen,
- q31_t * pSrcB,
- uint32_t srcBLen,
- q31_t * pDst,
- uint32_t firstIndex,
- uint32_t numPoints)
+ const q31_t * pSrcA,
+ uint32_t srcALen,
+ const q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
{
- q31_t *pIn1; /* inputA pointer */
- q31_t *pIn2; /* inputB pointer */
- q31_t *pOut = pDst; /* output pointer */
- q31_t *px; /* Intermediate inputA pointer */
- q31_t *py; /* Intermediate inputB pointer */
- q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
- q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
- q31_t x0, x1, x2, x3, c0;
- uint32_t j, k, count, check, blkCnt;
- int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
- arm_status status; /* status of Partial convolution */
+ const q31_t *pIn1; /* InputA pointer */
+ const q31_t *pIn2; /* InputB pointer */
+ q31_t *pOut = pDst; /* Output pointer */
+ const q31_t *px; /* Intermediate inputA pointer */
+ const q31_t *py; /* Intermediate inputB pointer */
+ const q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t sum; /* Accumulators */
+ uint32_t j, k, count, check, blkCnt;
+ int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc0, acc1, acc2, acc3; /* Accumulators */
+ q31_t x0, x1, x2, x3, c0;
+#endif
/* Check for range of output samples to be calculated */
if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
@@ -82,7 +87,6 @@ arm_status arm_conv_partial_fast_q31(
}
else
{
-
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
/* So srcBLen is always considered as shorter or equal to srcALen */
@@ -113,11 +117,9 @@ arm_status arm_conv_partial_fast_q31(
check = firstIndex + numPoints;
blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
- blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
- blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
- (int32_t) numPoints) : 0;
- blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
- (int32_t) firstIndex);
+ blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0;
+ blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
@@ -160,50 +162,56 @@ arm_status arm_conv_partial_fast_q31(
* Stage1 process
* ----------------------*/
- /* The first loop starts here */
- while (blockSize1 > 0)
+ /* The first stage starts here */
+ while (blockSize1 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = count >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* x[0] * y[srcBLen - 1] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
/* x[1] * y[srcBLen - 2] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
/* x[2] * y[srcBLen - 3] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
/* x[3] * y[srcBLen - 4] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -214,10 +222,10 @@ arm_status arm_conv_partial_fast_q31(
py = ++pSrc2;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -234,12 +242,13 @@ arm_status arm_conv_partial_fast_q31(
/* Working pointer of inputA */
if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
{
- px = pIn1 + firstIndex - srcBLen + 1;
+ pSrc1 = pIn1 + firstIndex - srcBLen + 1;
}
else
{
- px = pIn1;
+ pSrc1 = pIn1;
}
+ px = pSrc1;
/* Working pointer of inputB */
pSrc2 = pIn2 + (srcBLen - 1U);
@@ -257,7 +266,9 @@ arm_status arm_conv_partial_fast_q31(
* srcBLen should be greater than or equal to 4 */
if (srcBLen >= 4U)
{
- /* Loop unroll over blockSize2 */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = ((uint32_t) blockSize2 >> 2U);
while (blkCnt > 0U)
@@ -269,9 +280,9 @@ arm_status arm_conv_partial_fast_q31(
acc3 = 0;
/* read x[0], x[1], x[2] samples */
- x0 = *(px++);
- x1 = *(px++);
- x2 = *(px++);
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
k = srcBLen >> 2U;
@@ -281,29 +292,24 @@ arm_status arm_conv_partial_fast_q31(
do
{
/* Read y[srcBLen - 1] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[3] sample */
- x3 = *(px++);
+ x3 = *px++;
/* Perform the multiply-accumulate */
/* acc0 += x[0] * y[srcBLen - 1] */
acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
-
/* acc1 += x[1] * y[srcBLen - 1] */
acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
-
/* acc2 += x[2] * y[srcBLen - 1] */
acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
-
/* acc3 += x[3] * y[srcBLen - 1] */
acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
/* Read y[srcBLen - 2] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[4] sample */
- x0 = *(px++);
+ x0 = *px++;
/* Perform the multiply-accumulate */
/* acc0 += x[1] * y[srcBLen - 2] */
@@ -316,10 +322,9 @@ arm_status arm_conv_partial_fast_q31(
acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
/* Read y[srcBLen - 3] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[5] sample */
- x1 = *(px++);
+ x1 = *px++;
/* Perform the multiply-accumulates */
/* acc0 += x[2] * y[srcBLen - 3] */
@@ -332,10 +337,9 @@ arm_status arm_conv_partial_fast_q31(
acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
/* Read y[srcBLen - 4] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[6] sample */
- x2 = *(px++);
+ x2 = *px++;
/* Perform the multiply-accumulates */
/* acc0 += x[3] * y[srcBLen - 4] */
@@ -347,7 +351,6 @@ arm_status arm_conv_partial_fast_q31(
/* acc3 += x[6] * y[srcBLen - 4] */
acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
-
} while (--k);
/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
@@ -357,10 +360,9 @@ arm_status arm_conv_partial_fast_q31(
while (k > 0U)
{
/* Read y[srcBLen - 5] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[7] sample */
- x3 = *(px++);
+ x3 = *px++;
/* Perform the multiply-accumulates */
/* acc0 += x[4] * y[srcBLen - 5] */
@@ -391,34 +393,33 @@ arm_status arm_conv_partial_fast_q31(
count += 4U;
/* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
+ px = pSrc1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (uint32_t) blockSize2 % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = srcBLen >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* Perform the multiply-accumulates */
@@ -431,42 +432,41 @@ arm_status arm_conv_partial_fast_q31(
sum = (q31_t) ((((q63_t) sum << 32) +
((q63_t) * px++ * (*py--))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = srcBLen % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
/* Perform the multiply-accumulate */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
*pOut++ = sum << 1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
+ px = pSrc1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
@@ -488,9 +488,9 @@ arm_status arm_conv_partial_fast_q31(
{
/* Perform the multiply-accumulate */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -501,14 +501,7 @@ arm_status arm_conv_partial_fast_q31(
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
+ px = pSrc1 + count;
py = pSrc2;
/* Decrement the loop counter */
@@ -544,50 +537,56 @@ arm_status arm_conv_partial_fast_q31(
* Stage3 process
* ------------------*/
- while (blockSize3 > 0)
+ while (blockSize3 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = count >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
/* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
/* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
/* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
/* Perform the multiply-accumulates */
/* sum += x[srcALen-1] * y[srcBLen-1] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py--))) >> 32);
+ ((q63_t) *px++ * (*py--))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -598,15 +597,14 @@ arm_status arm_conv_partial_fast_q31(
px = ++pSrc1;
py = pSrc2;
- /* Decrement the MAC count */
+ /* Decrement MAC count */
count--;
/* Decrement the loop counter */
blockSize3--;
-
}
- /* set status as ARM_MATH_SUCCESS */
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -616,5 +614,5 @@ arm_status arm_conv_partial_fast_q31(
}
/**
- * @} end of PartialConv group
+ @} end of PartialConv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q15.c b/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q15.c
index 44e368e..21999d2 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q15.c
@@ -3,13 +3,13 @@
* Title: arm_conv_partial_opt_q15.c
* Description: Partial convolution of Q15 sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,63 +29,64 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup PartialConv
- * @{
+ @addtogroup PartialConv
+ @{
*/
/**
- * @brief Partial convolution of Q15 sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written.
- * @param[in] firstIndex is the first output sample to start with.
- * @param[in] numPoints is the number of output points to be computed.
- * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
- * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
- * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
- *
- * \par Restrictions
- * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
- * In this case input, output, state buffers should be aligned by 32-bit
- *
- * Refer to arm_conv_partial_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
- *
- *
- */
+ @brief Partial convolution of Q15 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
-#ifndef UNALIGNED_SUPPORT_DISABLE
+ @remark
+ Refer to \ref arm_conv_partial_fast_q15() for a faster but less precise version of this function.
+ */
arm_status arm_conv_partial_opt_q15(
- q15_t * pSrcA,
- uint32_t srcALen,
- q15_t * pSrcB,
- uint32_t srcBLen,
- q15_t * pDst,
- uint32_t firstIndex,
- uint32_t numPoints,
- q15_t * pScratch1,
- q15_t * pScratch2)
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
{
- q15_t *pOut = pDst; /* output pointer */
- q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
- q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
- q63_t acc0, acc1, acc2, acc3; /* Accumulator */
- q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */
- q31_t y1, y2; /* State variables */
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q15_t *px; /* Intermediate inputA pointer */
- q15_t *py; /* Intermediate inputB pointer */
- uint32_t j, k, blkCnt; /* loop counter */
- arm_status status; /* Status variable */
- uint32_t tapCnt; /* loop count */
+ q15_t *pOut = pDst; /* Output pointer */
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
+ q63_t acc0; /* Accumulator */
+ q31_t x1; /* Temporary variables to hold state and coefficient values */
+ q31_t y1; /* State variables */
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ const q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ uint32_t j, k, blkCnt; /* Loop counter */
+ uint32_t tapCnt; /* Loop count */
+ arm_status status; /* Status variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc1, acc2, acc3; /* Accumulator */
+ q31_t x2, x3; /* Temporary variables to hold state and coefficient values */
+ q31_t y2; /* State variables */
+#endif
/* Check for range of output samples to be calculated */
if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
@@ -95,7 +96,6 @@ arm_status arm_conv_partial_opt_q15(
}
else
{
-
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
/* So srcBLen is always considered as shorter or equal to srcALen */
@@ -130,11 +130,12 @@ arm_status arm_conv_partial_opt_q15(
/* points to smaller length sequence */
px = pIn2;
- /* Apply loop unrolling and do 4 Copies simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = srcBLen >> 2U;
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
+ /* Copy smaller length input sequence in reverse order into second scratch buffer */
while (k > 0U)
{
/* copy second buffer in reversal manner */
@@ -143,26 +144,33 @@ arm_status arm_conv_partial_opt_q15(
*pScr2-- = *px++;
*pScr2-- = *px++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = srcBLen % 0x4U;
+#else
+
+ /* Initialize k with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
/* copy second buffer in reversal manner for remaining samples */
*pScr2-- = *px++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
/* Initialze temporary scratch pointer */
pScr1 = pScratch1;
+ /* Assuming scratch1 buffer is aligned by 32-bit */
/* Fill (srcBLen - 1U) zeros in scratch buffer */
arm_fill_q15(0, pScr1, (srcBLen - 1U));
@@ -191,6 +199,10 @@ arm_status arm_conv_partial_opt_q15(
pOut = pDst + firstIndex;
/* Actual convolution process starts here */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (numPoints) >> 2;
while (blkCnt > 0)
@@ -205,10 +217,10 @@ arm_status arm_conv_partial_opt_q15(
acc3 = 0;
/* Read two samples from scratch1 buffer */
- x1 = *__SIMD32(pScr1)++;
+ x1 = read_q15x2_ia (&pScr1);
/* Read next two samples from scratch1 buffer */
- x2 = *__SIMD32(pScr1)++;
+ x2 = read_q15x2_ia (&pScr1);
tapCnt = (srcBLen) >> 2U;
@@ -216,8 +228,8 @@ arm_status arm_conv_partial_opt_q15(
{
/* Read four samples from smaller buffer */
- y1 = _SIMD32_OFFSET(pIn2);
- y2 = _SIMD32_OFFSET(pIn2 + 2U);
+ y1 = read_q15x2_ia ((q15_t **) &pIn2);
+ y2 = read_q15x2_ia ((q15_t **) &pIn2);
/* multiply and accumlate */
acc0 = __SMLALD(x1, y1, acc0);
@@ -234,7 +246,7 @@ arm_status arm_conv_partial_opt_q15(
acc1 = __SMLALDX(x3, y1, acc1);
/* Read next two samples from scratch1 buffer */
- x1 = _SIMD32_OFFSET(pScr1);
+ x1 = read_q15x2_ia (&pScr1);
/* multiply and accumlate */
acc0 = __SMLALD(x2, y2, acc0);
@@ -250,7 +262,7 @@ arm_status arm_conv_partial_opt_q15(
acc3 = __SMLALDX(x3, y1, acc3);
acc1 = __SMLALDX(x3, y2, acc1);
- x2 = _SIMD32_OFFSET(pScr1 + 2U);
+ x2 = read_q15x2_ia (&pScr1);
#ifndef ARM_MATH_BIG_ENDIAN
x3 = __PKHBT(x2, x1, 0);
@@ -260,12 +272,7 @@ arm_status arm_conv_partial_opt_q15(
acc3 = __SMLALDX(x3, y2, acc3);
- /* update scratch pointers */
- pIn2 += 4U;
- pScr1 += 4U;
-
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -285,41 +292,37 @@ arm_status arm_conv_partial_opt_q15(
pScr1 -= 3U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
blkCnt--;
-
/* Store the results in the accumulators in the destination buffer. */
-
#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
-
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
#else
-
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Initialization of inputB pointer */
pIn2 = py;
pScratch1 += 4U;
-
}
-
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numPoints & 0x3;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numPoints;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
/* Calculate convolution for remaining samples of Bigger length sequence */
while (blkCnt > 0)
{
@@ -333,12 +336,11 @@ arm_status arm_conv_partial_opt_q15(
while (tapCnt > 0U)
{
-
/* Read next two samples from scratch1 buffer */
- x1 = *__SIMD32(pScr1)++;
+ x1 = read_q15x2_ia (&pScr1);
/* Read two samples from smaller buffer */
- y1 = *__SIMD32(pIn2)++;
+ y1 = read_q15x2_ia ((q15_t **) &pIn2);
acc0 = __SMLALD(x1, y1, acc0);
@@ -351,17 +353,17 @@ arm_status arm_conv_partial_opt_q15(
/* apply same above for remaining samples of smaller length sequence */
while (tapCnt > 0U)
{
-
/* accumlate the results */
acc0 += (*pScr1++ * *pIn2++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
blkCnt--;
- /* Store the result in the accumulator in the destination buffer. */
+ /* The result is in 2.30 format. Convert to 1.15 with saturation.
+ ** Then store the output in the destination buffer. */
*pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
/* Initialization of inputB pointer */
@@ -371,383 +373,14 @@ arm_status arm_conv_partial_opt_q15(
}
- /* set status as ARM_MATH_SUCCESS */
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
-
}
/* Return to application */
return (status);
}
-#else
-
-arm_status arm_conv_partial_opt_q15(
- q15_t * pSrcA,
- uint32_t srcALen,
- q15_t * pSrcB,
- uint32_t srcBLen,
- q15_t * pDst,
- uint32_t firstIndex,
- uint32_t numPoints,
- q15_t * pScratch1,
- q15_t * pScratch2)
-{
-
- q15_t *pOut = pDst; /* output pointer */
- q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
- q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
- q63_t acc0, acc1, acc2, acc3; /* Accumulator */
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q15_t *px; /* Intermediate inputA pointer */
- q15_t *py; /* Intermediate inputB pointer */
- uint32_t j, k, blkCnt; /* loop counter */
- arm_status status; /* Status variable */
- uint32_t tapCnt; /* loop count */
- q15_t x10, x11, x20, x21; /* Temporary variables to hold srcA buffer */
- q15_t y10, y11; /* Temporary variables to hold srcB buffer */
-
-
- /* Check for range of output samples to be calculated */
- if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
- {
- /* Set status as ARM_MATH_ARGUMENT_ERROR */
- status = ARM_MATH_ARGUMENT_ERROR;
- }
- else
- {
-
- /* The algorithm implementation is based on the lengths of the inputs. */
- /* srcB is always made to slide across srcA. */
- /* So srcBLen is always considered as shorter or equal to srcALen */
- if (srcALen >= srcBLen)
- {
- /* Initialization of inputA pointer */
- pIn1 = pSrcA;
-
- /* Initialization of inputB pointer */
- pIn2 = pSrcB;
- }
- else
- {
- /* Initialization of inputA pointer */
- pIn1 = pSrcB;
-
- /* Initialization of inputB pointer */
- pIn2 = pSrcA;
-
- /* srcBLen is always considered as shorter or equal to srcALen */
- j = srcBLen;
- srcBLen = srcALen;
- srcALen = j;
- }
-
- /* Temporary pointer for scratch2 */
- py = pScratch2;
-
- /* pointer to take end of scratch2 buffer */
- pScr2 = pScratch2 + srcBLen - 1;
-
- /* points to smaller length sequence */
- px = pIn2;
-
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- k = srcBLen >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr2-- = *px++;
- *pScr2-- = *px++;
- *pScr2-- = *px++;
- *pScr2-- = *px++;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- k = srcBLen % 0x4U;
-
- while (k > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr2-- = *px++;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Initialze temporary scratch pointer */
- pScr1 = pScratch1;
-
- /* Fill (srcBLen - 1U) zeros in scratch buffer */
- arm_fill_q15(0, pScr1, (srcBLen - 1U));
-
- /* Update temporary scratch pointer */
- pScr1 += (srcBLen - 1U);
-
- /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
-
-
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- k = srcALen >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr1++ = *pIn1++;
- *pScr1++ = *pIn1++;
- *pScr1++ = *pIn1++;
- *pScr1++ = *pIn1++;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- k = srcALen % 0x4U;
-
- while (k > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr1++ = *pIn1++;
-
- /* Decrement the loop counter */
- k--;
- }
-
-
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- k = (srcBLen - 1U) >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr1++ = 0;
- *pScr1++ = 0;
- *pScr1++ = 0;
- *pScr1++ = 0;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- k = (srcBLen - 1U) % 0x4U;
-
- while (k > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr1++ = 0;
-
- /* Decrement the loop counter */
- k--;
- }
-
-
- /* Initialization of pIn2 pointer */
- pIn2 = py;
-
- pScratch1 += firstIndex;
-
- pOut = pDst + firstIndex;
-
- /* Actual convolution process starts here */
- blkCnt = (numPoints) >> 2;
-
- while (blkCnt > 0)
- {
- /* Initialze temporary scratch pointer as scratch1 */
- pScr1 = pScratch1;
-
- /* Clear Accumlators */
- acc0 = 0;
- acc1 = 0;
- acc2 = 0;
- acc3 = 0;
-
- /* Read two samples from scratch1 buffer */
- x10 = *pScr1++;
- x11 = *pScr1++;
-
- /* Read next two samples from scratch1 buffer */
- x20 = *pScr1++;
- x21 = *pScr1++;
-
- tapCnt = (srcBLen) >> 2U;
-
- while (tapCnt > 0U)
- {
-
- /* Read two samples from smaller buffer */
- y10 = *pIn2;
- y11 = *(pIn2 + 1U);
-
- /* multiply and accumlate */
- acc0 += (q63_t) x10 *y10;
- acc0 += (q63_t) x11 *y11;
- acc2 += (q63_t) x20 *y10;
- acc2 += (q63_t) x21 *y11;
-
- /* multiply and accumlate */
- acc1 += (q63_t) x11 *y10;
- acc1 += (q63_t) x20 *y11;
-
- /* Read next two samples from scratch1 buffer */
- x10 = *pScr1;
- x11 = *(pScr1 + 1U);
-
- /* multiply and accumlate */
- acc3 += (q63_t) x21 *y10;
- acc3 += (q63_t) x10 *y11;
-
- /* Read next two samples from scratch2 buffer */
- y10 = *(pIn2 + 2U);
- y11 = *(pIn2 + 3U);
-
- /* multiply and accumlate */
- acc0 += (q63_t) x20 *y10;
- acc0 += (q63_t) x21 *y11;
- acc2 += (q63_t) x10 *y10;
- acc2 += (q63_t) x11 *y11;
- acc1 += (q63_t) x21 *y10;
- acc1 += (q63_t) x10 *y11;
-
- /* Read next two samples from scratch1 buffer */
- x20 = *(pScr1 + 2);
- x21 = *(pScr1 + 3);
-
- /* multiply and accumlate */
- acc3 += (q63_t) x11 *y10;
- acc3 += (q63_t) x20 *y11;
-
- /* update scratch pointers */
- pIn2 += 4U;
- pScr1 += 4U;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Update scratch pointer for remaining samples of smaller length sequence */
- pScr1 -= 4U;
-
- /* apply same above for remaining samples of smaller length sequence */
- tapCnt = (srcBLen) & 3U;
-
- while (tapCnt > 0U)
- {
- /* accumlate the results */
- acc0 += (*pScr1++ * *pIn2);
- acc1 += (*pScr1++ * *pIn2);
- acc2 += (*pScr1++ * *pIn2);
- acc3 += (*pScr1++ * *pIn2++);
-
- pScr1 -= 3U;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- blkCnt--;
-
-
- /* Store the results in the accumulators in the destination buffer. */
- *pOut++ = __SSAT((acc0 >> 15), 16);
- *pOut++ = __SSAT((acc1 >> 15), 16);
- *pOut++ = __SSAT((acc2 >> 15), 16);
- *pOut++ = __SSAT((acc3 >> 15), 16);
-
-
- /* Initialization of inputB pointer */
- pIn2 = py;
-
- pScratch1 += 4U;
-
- }
-
-
- blkCnt = numPoints & 0x3;
-
- /* Calculate convolution for remaining samples of Bigger length sequence */
- while (blkCnt > 0)
- {
- /* Initialze temporary scratch pointer as scratch1 */
- pScr1 = pScratch1;
-
- /* Clear Accumlators */
- acc0 = 0;
-
- tapCnt = (srcBLen) >> 1U;
-
- while (tapCnt > 0U)
- {
-
- /* Read next two samples from scratch1 buffer */
- x10 = *pScr1++;
- x11 = *pScr1++;
-
- /* Read two samples from smaller buffer */
- y10 = *pIn2++;
- y11 = *pIn2++;
-
- /* multiply and accumlate */
- acc0 += (q63_t) x10 *y10;
- acc0 += (q63_t) x11 *y11;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- tapCnt = (srcBLen) & 1U;
-
- /* apply same above for remaining samples of smaller length sequence */
- while (tapCnt > 0U)
- {
-
- /* accumlate the results */
- acc0 += (*pScr1++ * *pIn2++);
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- blkCnt--;
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
-
-
- /* Initialization of inputB pointer */
- pIn2 = py;
-
- pScratch1 += 1U;
-
- }
-
- /* set status as ARM_MATH_SUCCESS */
- status = ARM_MATH_SUCCESS;
-
- }
-
- /* Return to application */
- return (status);
-}
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
-
/**
- * @} end of PartialConv group
+ @} end of PartialConv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q7.c b/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q7.c
index 00dbef1..811f386 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q7.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q7.c
@@ -3,13 +3,13 @@
* Title: arm_conv_partial_opt_q7.c
* Description: Partial convolution of Q7 sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,61 +29,52 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup PartialConv
- * @{
+ @addtogroup PartialConv
+ @{
*/
/**
- * @brief Partial convolution of Q7 sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written.
- * @param[in] firstIndex is the first output sample to start with.
- * @param[in] numPoints is the number of output points to be computed.
- * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
- * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
- * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
- *
- * \par Restrictions
- * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
- * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit
- *
- *
- *
+ @brief Partial convolution of Q7 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
*/
-
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
arm_status arm_conv_partial_opt_q7(
- q7_t * pSrcA,
- uint32_t srcALen,
- q7_t * pSrcB,
- uint32_t srcBLen,
- q7_t * pDst,
- uint32_t firstIndex,
- uint32_t numPoints,
- q15_t * pScratch1,
- q15_t * pScratch2)
+ const q7_t * pSrcA,
+ uint32_t srcALen,
+ const q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
{
-
- q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */
- q15_t x4; /* Temporary input variable */
- q7_t *pIn1, *pIn2; /* inputA and inputB pointer */
- uint32_t j, k, blkCnt, tapCnt; /* loop counter */
- q7_t *px; /* Temporary input1 pointer */
- q15_t *py; /* Temporary input2 pointer */
- q31_t acc0, acc1, acc2, acc3; /* Accumulator */
- q31_t x1, x2, x3, y1; /* Temporary input variables */
- arm_status status;
- q7_t *pOut = pDst; /* output pointer */
- q7_t out0, out1, out2, out3; /* temporary variables */
+ q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */
+ q15_t x4; /* Temporary input variable */
+ const q7_t *pIn1, *pIn2; /* InputA and inputB pointer */
+ uint32_t j, k, blkCnt, tapCnt; /* Loop counter */
+ const q7_t *px; /* Temporary input1 pointer */
+ q15_t *py; /* Temporary input2 pointer */
+ q31_t acc0, acc1, acc2, acc3; /* Accumulator */
+ q31_t x1, x2, x3, y1; /* Temporary input variables */
+ arm_status status;
+ q7_t *pOut = pDst; /* Output pointer */
+ q7_t out0, out1, out2, out3; /* Temporary variables */
/* Check for range of output samples to be calculated */
if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
@@ -93,7 +84,6 @@ arm_status arm_conv_partial_opt_q7(
}
else
{
-
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
/* So srcBLen is always considered as shorter or equal to srcALen */
@@ -133,16 +123,16 @@ arm_status arm_conv_partial_opt_q7(
while (k > 0U)
{
/* copy second buffer in reversal manner */
- x4 = (q15_t) * px--;
+ x4 = (q15_t) *px--;
*pScr2++ = x4;
- x4 = (q15_t) * px--;
+ x4 = (q15_t) *px--;
*pScr2++ = x4;
- x4 = (q15_t) * px--;
+ x4 = (q15_t) *px--;
*pScr2++ = x4;
- x4 = (q15_t) * px--;
+ x4 = (q15_t) *px--;
*pScr2++ = x4;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -153,10 +143,10 @@ arm_status arm_conv_partial_opt_q7(
while (k > 0U)
{
/* copy second buffer in reversal manner for remaining samples */
- x4 = (q15_t) * px--;
+ x4 = (q15_t) *px--;
*pScr2++ = x4;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -178,16 +168,16 @@ arm_status arm_conv_partial_opt_q7(
while (k > 0U)
{
/* copy second buffer in reversal manner */
- x4 = (q15_t) * pIn1++;
+ x4 = (q15_t) *pIn1++;
*pScr1++ = x4;
- x4 = (q15_t) * pIn1++;
+ x4 = (q15_t) *pIn1++;
*pScr1++ = x4;
- x4 = (q15_t) * pIn1++;
+ x4 = (q15_t) *pIn1++;
*pScr1++ = x4;
- x4 = (q15_t) * pIn1++;
+ x4 = (q15_t) *pIn1++;
*pScr1++ = x4;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -198,7 +188,7 @@ arm_status arm_conv_partial_opt_q7(
while (k > 0U)
{
/* copy second buffer in reversal manner for remaining samples */
- x4 = (q15_t) * pIn1++;
+ x4 = (q15_t) *pIn1++;
*pScr1++ = x4;
/* Decrement the loop counter */
@@ -227,31 +217,29 @@ arm_status arm_conv_partial_opt_q7(
/* Actual convolution process starts here */
blkCnt = (numPoints) >> 2;
-
while (blkCnt > 0)
{
- /* Initialze temporary scratch pointer as scratch1 */
+ /* Initialize temporary scratch pointer as scratch1 */
pScr1 = pScratch1;
- /* Clear Accumlators */
+ /* Clear Accumulators */
acc0 = 0;
acc1 = 0;
acc2 = 0;
acc3 = 0;
/* Read two samples from scratch1 buffer */
- x1 = *__SIMD32(pScr1)++;
+ x1 = read_q15x2_ia (&pScr1);
/* Read next two samples from scratch1 buffer */
- x2 = *__SIMD32(pScr1)++;
+ x2 = read_q15x2_ia (&pScr1);
tapCnt = (srcBLen) >> 2U;
while (tapCnt > 0U)
{
-
/* Read four samples from smaller buffer */
- y1 = _SIMD32_OFFSET(pScr2);
+ y1 = read_q15x2_ia (&pScr2);
/* multiply and accumlate */
acc0 = __SMLAD(x1, y1, acc0);
@@ -268,7 +256,7 @@ arm_status arm_conv_partial_opt_q7(
acc1 = __SMLADX(x3, y1, acc1);
/* Read next two samples from scratch1 buffer */
- x1 = *__SIMD32(pScr1)++;
+ x1 = read_q15x2_ia (&pScr1);
/* pack input data */
#ifndef ARM_MATH_BIG_ENDIAN
@@ -280,7 +268,7 @@ arm_status arm_conv_partial_opt_q7(
acc3 = __SMLADX(x3, y1, acc3);
/* Read four samples from smaller buffer */
- y1 = _SIMD32_OFFSET(pScr2 + 2U);
+ y1 = read_q15x2_ia (&pScr2);
acc0 = __SMLAD(x2, y1, acc0);
@@ -288,7 +276,7 @@ arm_status arm_conv_partial_opt_q7(
acc1 = __SMLADX(x3, y1, acc1);
- x2 = *__SIMD32(pScr1)++;
+ x2 = read_q15x2_ia (&pScr1);
#ifndef ARM_MATH_BIG_ENDIAN
x3 = __PKHBT(x2, x1, 0);
@@ -298,25 +286,18 @@ arm_status arm_conv_partial_opt_q7(
acc3 = __SMLADX(x3, y1, acc3);
- pScr2 += 4U;
-
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
-
-
/* Update scratch pointer for remaining samples of smaller length sequence */
pScr1 -= 4U;
-
/* apply same above for remaining samples of smaller length sequence */
tapCnt = (srcBLen) & 3U;
while (tapCnt > 0U)
{
-
/* accumlate the results */
acc0 += (*pScr1++ * *pScr2);
acc1 += (*pScr1++ * *pScr2);
@@ -325,7 +306,7 @@ arm_status arm_conv_partial_opt_q7(
pScr1 -= 3U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -337,13 +318,12 @@ arm_status arm_conv_partial_opt_q7(
out2 = (q7_t) (__SSAT(acc2 >> 7U, 8));
out3 = (q7_t) (__SSAT(acc3 >> 7U, 8));
- *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
+ write_q7x4_ia (&pOut, __PACKq7(out0, out1, out2, out3));
/* Initialization of inputB pointer */
pScr2 = py;
pScratch1 += 4U;
-
}
blkCnt = (numPoints) & 0x3;
@@ -363,10 +343,10 @@ arm_status arm_conv_partial_opt_q7(
{
/* Read next two samples from scratch1 buffer */
- x1 = *__SIMD32(pScr1)++;
+ x1 = read_q15x2_ia (&pScr1);
/* Read two samples from smaller buffer */
- y1 = *__SIMD32(pScr2)++;
+ y1 = read_q15x2_ia (&pScr2);
acc0 = __SMLAD(x1, y1, acc0);
@@ -383,7 +363,7 @@ arm_status arm_conv_partial_opt_q7(
/* accumlate the results */
acc0 += (*pScr1++ * *pScr2++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -396,396 +376,15 @@ arm_status arm_conv_partial_opt_q7(
pScr2 = py;
pScratch1 += 1U;
-
}
- /* set status as ARM_MATH_SUCCESS */
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
-
-
}
return (status);
-
}
-#else
-
-arm_status arm_conv_partial_opt_q7(
- q7_t * pSrcA,
- uint32_t srcALen,
- q7_t * pSrcB,
- uint32_t srcBLen,
- q7_t * pDst,
- uint32_t firstIndex,
- uint32_t numPoints,
- q15_t * pScratch1,
- q15_t * pScratch2)
-{
-
- q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */
- q15_t x4; /* Temporary input variable */
- q7_t *pIn1, *pIn2; /* inputA and inputB pointer */
- uint32_t j, k, blkCnt, tapCnt; /* loop counter */
- q7_t *px; /* Temporary input1 pointer */
- q15_t *py; /* Temporary input2 pointer */
- q31_t acc0, acc1, acc2, acc3; /* Accumulator */
- arm_status status;
- q7_t *pOut = pDst; /* output pointer */
- q15_t x10, x11, x20, x21; /* Temporary input variables */
- q15_t y10, y11; /* Temporary input variables */
-
- /* Check for range of output samples to be calculated */
- if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
- {
- /* Set status as ARM_MATH_ARGUMENT_ERROR */
- status = ARM_MATH_ARGUMENT_ERROR;
- }
- else
- {
-
- /* The algorithm implementation is based on the lengths of the inputs. */
- /* srcB is always made to slide across srcA. */
- /* So srcBLen is always considered as shorter or equal to srcALen */
- if (srcALen >= srcBLen)
- {
- /* Initialization of inputA pointer */
- pIn1 = pSrcA;
-
- /* Initialization of inputB pointer */
- pIn2 = pSrcB;
- }
- else
- {
- /* Initialization of inputA pointer */
- pIn1 = pSrcB;
-
- /* Initialization of inputB pointer */
- pIn2 = pSrcA;
-
- /* srcBLen is always considered as shorter or equal to srcALen */
- j = srcBLen;
- srcBLen = srcALen;
- srcALen = j;
- }
-
- /* pointer to take end of scratch2 buffer */
- pScr2 = pScratch2;
-
- /* points to smaller length sequence */
- px = pIn2 + srcBLen - 1;
-
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- k = srcBLen >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* copy second buffer in reversal manner */
- x4 = (q15_t) * px--;
- *pScr2++ = x4;
- x4 = (q15_t) * px--;
- *pScr2++ = x4;
- x4 = (q15_t) * px--;
- *pScr2++ = x4;
- x4 = (q15_t) * px--;
- *pScr2++ = x4;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- k = srcBLen % 0x4U;
-
- while (k > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- x4 = (q15_t) * px--;
- *pScr2++ = x4;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Initialze temporary scratch pointer */
- pScr1 = pScratch1;
-
- /* Fill (srcBLen - 1U) zeros in scratch buffer */
- arm_fill_q15(0, pScr1, (srcBLen - 1U));
-
- /* Update temporary scratch pointer */
- pScr1 += (srcBLen - 1U);
-
- /* Copy (srcALen) samples in scratch buffer */
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- k = srcALen >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* copy second buffer in reversal manner */
- x4 = (q15_t) * pIn1++;
- *pScr1++ = x4;
- x4 = (q15_t) * pIn1++;
- *pScr1++ = x4;
- x4 = (q15_t) * pIn1++;
- *pScr1++ = x4;
- x4 = (q15_t) * pIn1++;
- *pScr1++ = x4;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- k = srcALen % 0x4U;
-
- while (k > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- x4 = (q15_t) * pIn1++;
- *pScr1++ = x4;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- k = (srcBLen - 1U) >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr1++ = 0;
- *pScr1++ = 0;
- *pScr1++ = 0;
- *pScr1++ = 0;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- k = (srcBLen - 1U) % 0x4U;
-
- while (k > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr1++ = 0;
-
- /* Decrement the loop counter */
- k--;
- }
-
-
- /* Temporary pointer for scratch2 */
- py = pScratch2;
-
- /* Initialization of pIn2 pointer */
- pIn2 = (q7_t *) py;
-
- pScr2 = py;
-
- pOut = pDst + firstIndex;
-
- pScratch1 += firstIndex;
-
- /* Actual convolution process starts here */
- blkCnt = (numPoints) >> 2;
-
-
- while (blkCnt > 0)
- {
- /* Initialze temporary scratch pointer as scratch1 */
- pScr1 = pScratch1;
-
- /* Clear Accumlators */
- acc0 = 0;
- acc1 = 0;
- acc2 = 0;
- acc3 = 0;
-
- /* Read two samples from scratch1 buffer */
- x10 = *pScr1++;
- x11 = *pScr1++;
-
- /* Read next two samples from scratch1 buffer */
- x20 = *pScr1++;
- x21 = *pScr1++;
-
- tapCnt = (srcBLen) >> 2U;
-
- while (tapCnt > 0U)
- {
-
- /* Read four samples from smaller buffer */
- y10 = *pScr2;
- y11 = *(pScr2 + 1U);
-
- /* multiply and accumlate */
- acc0 += (q31_t) x10 *y10;
- acc0 += (q31_t) x11 *y11;
- acc2 += (q31_t) x20 *y10;
- acc2 += (q31_t) x21 *y11;
-
-
- acc1 += (q31_t) x11 *y10;
- acc1 += (q31_t) x20 *y11;
-
- /* Read next two samples from scratch1 buffer */
- x10 = *pScr1;
- x11 = *(pScr1 + 1U);
-
- /* multiply and accumlate */
- acc3 += (q31_t) x21 *y10;
- acc3 += (q31_t) x10 *y11;
-
- /* Read next two samples from scratch2 buffer */
- y10 = *(pScr2 + 2U);
- y11 = *(pScr2 + 3U);
-
- /* multiply and accumlate */
- acc0 += (q31_t) x20 *y10;
- acc0 += (q31_t) x21 *y11;
- acc2 += (q31_t) x10 *y10;
- acc2 += (q31_t) x11 *y11;
- acc1 += (q31_t) x21 *y10;
- acc1 += (q31_t) x10 *y11;
-
- /* Read next two samples from scratch1 buffer */
- x20 = *(pScr1 + 2);
- x21 = *(pScr1 + 3);
-
- /* multiply and accumlate */
- acc3 += (q31_t) x11 *y10;
- acc3 += (q31_t) x20 *y11;
-
- /* update scratch pointers */
-
- pScr1 += 4U;
- pScr2 += 4U;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
-
-
- /* Update scratch pointer for remaining samples of smaller length sequence */
- pScr1 -= 4U;
-
-
- /* apply same above for remaining samples of smaller length sequence */
- tapCnt = (srcBLen) & 3U;
-
- while (tapCnt > 0U)
- {
-
- /* accumlate the results */
- acc0 += (*pScr1++ * *pScr2);
- acc1 += (*pScr1++ * *pScr2);
- acc2 += (*pScr1++ * *pScr2);
- acc3 += (*pScr1++ * *pScr2++);
-
- pScr1 -= 3U;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- blkCnt--;
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
- *pOut++ = (q7_t) (__SSAT(acc1 >> 7U, 8));
- *pOut++ = (q7_t) (__SSAT(acc2 >> 7U, 8));
- *pOut++ = (q7_t) (__SSAT(acc3 >> 7U, 8));
-
- /* Initialization of inputB pointer */
- pScr2 = py;
-
- pScratch1 += 4U;
-
- }
-
- blkCnt = (numPoints) & 0x3;
-
- /* Calculate convolution for remaining samples of Bigger length sequence */
- while (blkCnt > 0)
- {
- /* Initialze temporary scratch pointer as scratch1 */
- pScr1 = pScratch1;
-
- /* Clear Accumlators */
- acc0 = 0;
-
- tapCnt = (srcBLen) >> 1U;
-
- while (tapCnt > 0U)
- {
-
- /* Read next two samples from scratch1 buffer */
- x10 = *pScr1++;
- x11 = *pScr1++;
-
- /* Read two samples from smaller buffer */
- y10 = *pScr2++;
- y11 = *pScr2++;
-
- /* multiply and accumlate */
- acc0 += (q31_t) x10 *y10;
- acc0 += (q31_t) x11 *y11;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- tapCnt = (srcBLen) & 1U;
-
- /* apply same above for remaining samples of smaller length sequence */
- while (tapCnt > 0U)
- {
-
- /* accumlate the results */
- acc0 += (*pScr1++ * *pScr2++);
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- blkCnt--;
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
-
- /* Initialization of inputB pointer */
- pScr2 = py;
-
- pScratch1 += 1U;
-
- }
-
- /* set status as ARM_MATH_SUCCESS */
- status = ARM_MATH_SUCCESS;
-
- }
-
- return (status);
-
-}
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
-
-
/**
- * @} end of PartialConv group
+ @} end of PartialConv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_partial_q15.c b/DSP/Source/FilteringFunctions/arm_conv_partial_q15.c
index 93864b7..55272ea 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_partial_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_partial_q15.c
@@ -3,13 +3,13 @@
* Title: arm_conv_partial_q15.c
* Description: Partial convolution of Q15 sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,58 +29,56 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup PartialConv
- * @{
+ @addtogroup PartialConv
+ @{
*/
/**
- * @brief Partial convolution of Q15 sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written.
- * @param[in] firstIndex is the first output sample to start with.
- * @param[in] numPoints is the number of output points to be computed.
- * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
- *
- * Refer to arm_conv_partial_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
- *
- * \par
- * Refer the function arm_conv_partial_opt_q15() for a faster implementation of this function using scratch buffers.
- *
+ @brief Partial convolution of Q15 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
+
+ @remark
+ Refer to \ref arm_conv_partial_fast_q15() for a faster but less precise version of this function.
+ @remark
+ Refer to \ref arm_conv_partial_opt_q15() for a faster implementation of this function using scratch buffers.
*/
arm_status arm_conv_partial_q15(
- q15_t * pSrcA,
- uint32_t srcALen,
- q15_t * pSrcB,
- uint32_t srcBLen,
- q15_t * pDst,
- uint32_t firstIndex,
- uint32_t numPoints)
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
{
+#if defined (ARM_MATH_DSP)
-#if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q15_t *pOut = pDst; /* output pointer */
- q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
- q15_t *px; /* Intermediate inputA pointer */
- q15_t *py; /* Intermediate inputB pointer */
- q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
- q31_t x0, x1, x2, x3, c0; /* Temporary input variables */
- uint32_t j, k, count, check, blkCnt;
- int32_t blockSize1, blockSize2, blockSize3; /* loop counter */
- arm_status status; /* status of Partial convolution */
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ q15_t *pOut = pDst; /* Output pointer */
+ q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
+ const q15_t *px; /* Intermediate inputA pointer */
+ const q15_t *py; /* Intermediate inputB pointer */
+ const q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* Temporary input variables to hold state and coefficient values */
+ int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt, check;
+ arm_status status; /* Status of Partial convolution */
/* Check for range of output samples to be calculated */
if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
@@ -90,7 +88,6 @@ arm_status arm_conv_partial_q15(
}
else
{
-
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
/* So srcBLen is always considered as shorter or equal to srcALen */
@@ -121,11 +118,9 @@ arm_status arm_conv_partial_q15(
check = firstIndex + numPoints;
blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
- blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
- blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
- (int32_t) numPoints) : 0;
- blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
- (int32_t) firstIndex);
+ blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0;
+ blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
@@ -173,7 +168,7 @@ arm_status arm_conv_partial_q15(
/* Second part of this stage computes the MAC operations greater than or equal to 4 */
/* The first part of the stage starts here */
- while ((count < 4U) && (blockSize1 > 0))
+ while ((count < 4U) && (blockSize1 > 0U))
{
/* Accumulator is made zero for every iteration */
sum = 0;
@@ -187,7 +182,7 @@ arm_status arm_conv_partial_q15(
/* Perform the multiply-accumulates */
sum = __SMLALD(*px++, *py--, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -198,10 +193,10 @@ arm_status arm_conv_partial_q15(
py = ++pSrc2;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -211,7 +206,7 @@ arm_status arm_conv_partial_q15(
* y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
py = py - 1;
- while (blockSize1 > 0)
+ while (blockSize1 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0;
@@ -220,16 +215,16 @@ arm_status arm_conv_partial_q15(
k = count >> 2U;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
- sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
/* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
- sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -238,7 +233,7 @@ arm_status arm_conv_partial_q15(
py = py + 1U;
/* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ No loop unrolling is used. */
k = count % 0x4U;
while (k > 0U)
@@ -246,7 +241,7 @@ arm_status arm_conv_partial_q15(
/* Perform the multiply-accumulates */
sum = __SMLALD(*px++, *py--, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -257,10 +252,10 @@ arm_status arm_conv_partial_q15(
py = ++pSrc2 - 1U;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -277,12 +272,13 @@ arm_status arm_conv_partial_q15(
/* Working pointer of inputA */
if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
{
- px = pIn1 + firstIndex - srcBLen + 1;
+ pSrc1 = pIn1 + firstIndex - srcBLen + 1;
}
else
{
- px = pIn1;
+ pSrc1 = pIn1;
}
+ px = pSrc1;
/* Working pointer of inputB */
pSrc2 = pIn2 + (srcBLen - 1U);
@@ -291,219 +287,193 @@ arm_status arm_conv_partial_q15(
/* count is the index by which the pointer pIn1 to be incremented */
count = 0U;
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
- /* --------------------
- * Stage2 process
- * -------------------*/
-
- /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
- * So, to loop unroll over blockSize2,
- * srcBLen should be greater than or equal to 4 */
- if (srcBLen >= 4U)
- {
- /* Loop unroll over blockSize2, by 4 */
- blkCnt = blockSize2 >> 2U;
-
- while (blkCnt > 0U)
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
{
- py = py - 1U;
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = ((uint32_t) blockSize2 >> 2U);
- /* Set all accumulators to zero */
- acc0 = 0;
- acc1 = 0;
- acc2 = 0;
- acc3 = 0;
-
-
- /* read x[0], x[1] samples */
- x0 = *__SIMD32(px);
- /* read x[1], x[2] samples */
- x1 = _SIMD32_OFFSET(px+1);
- px+= 2U;
-
-
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = srcBLen >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
- do
+ while (blkCnt > 0U)
{
- /* Read the last two inputB samples using SIMD:
- * y[srcBLen - 1] and y[srcBLen - 2] */
- c0 = *__SIMD32(py)--;
+ py = py - 1U;
- /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
- acc0 = __SMLALDX(x0, c0, acc0);
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
- /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
- acc1 = __SMLALDX(x1, c0, acc1);
- /* Read x[2], x[3] */
- x2 = *__SIMD32(px);
-
- /* Read x[3], x[4] */
- x3 = _SIMD32_OFFSET(px+1);
-
- /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
- acc2 = __SMLALDX(x2, c0, acc2);
-
- /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
- acc3 = __SMLALDX(x3, c0, acc3);
-
- /* Read y[srcBLen - 3] and y[srcBLen - 4] */
- c0 = *__SIMD32(py)--;
-
- /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
- acc0 = __SMLALDX(x2, c0, acc0);
-
- /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
- acc1 = __SMLALDX(x3, c0, acc1);
-
- /* Read x[4], x[5] */
- x0 = _SIMD32_OFFSET(px+2);
-
- /* Read x[5], x[6] */
- x1 = _SIMD32_OFFSET(px+3);
- px += 4U;
-
- /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
- acc2 = __SMLALDX(x0, c0, acc2);
-
- /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
- acc3 = __SMLALDX(x1, c0, acc3);
-
- } while (--k);
-
- /* For the next MAC operations, SIMD is not used
- * So, the 16 bit pointer if inputB, py is updated */
-
- /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
- k = srcBLen % 0x4U;
-
- if (k == 1U)
- {
- /* Read y[srcBLen - 5] */
- c0 = *(py+1);
-
-#ifdef ARM_MATH_BIG_ENDIAN
-
- c0 = c0 << 16U;
-
-#else
-
- c0 = c0 & 0x0000FFFF;
-
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
-
- /* Read x[7] */
- x3 = *__SIMD32(px);
- px++;
-
- /* Perform the multiply-accumulates */
- acc0 = __SMLALD(x0, c0, acc0);
- acc1 = __SMLALD(x1, c0, acc1);
- acc2 = __SMLALDX(x1, c0, acc2);
- acc3 = __SMLALDX(x3, c0, acc3);
- }
-
- if (k == 2U)
- {
- /* Read y[srcBLen - 5], y[srcBLen - 6] */
- c0 = _SIMD32_OFFSET(py);
-
- /* Read x[7], x[8] */
- x3 = *__SIMD32(px);
-
- /* Read x[9] */
- x2 = _SIMD32_OFFSET(px+1);
+ /* read x[0], x[1] samples */
+ x0 = read_q15x2 ((q15_t *) px);
+ /* read x[1], x[2] samples */
+ x1 = read_q15x2 ((q15_t *) px + 1);
px += 2U;
- /* Perform the multiply-accumulates */
- acc0 = __SMLALDX(x0, c0, acc0);
- acc1 = __SMLALDX(x1, c0, acc1);
- acc2 = __SMLALDX(x3, c0, acc2);
- acc3 = __SMLALDX(x2, c0, acc3);
- }
- if (k == 3U)
- {
- /* Read y[srcBLen - 5], y[srcBLen - 6] */
- c0 = _SIMD32_OFFSET(py);
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
- /* Read x[7], x[8] */
- x3 = *__SIMD32(px);
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read the last two inputB samples using SIMD:
+ * y[srcBLen - 1] and y[srcBLen - 2] */
+ c0 = read_q15x2_da ((q15_t **) &py);
- /* Read x[9] */
- x2 = _SIMD32_OFFSET(px+1);
+ /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
+ acc0 = __SMLALDX(x0, c0, acc0);
- /* Perform the multiply-accumulates */
- acc0 = __SMLALDX(x0, c0, acc0);
- acc1 = __SMLALDX(x1, c0, acc1);
- acc2 = __SMLALDX(x3, c0, acc2);
- acc3 = __SMLALDX(x2, c0, acc3);
+ /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
+ acc1 = __SMLALDX(x1, c0, acc1);
- c0 = *(py-1);
+ /* Read x[2], x[3] */
+ x2 = read_q15x2 ((q15_t *) px);
+ /* Read x[3], x[4] */
+ x3 = read_q15x2 ((q15_t *) px + 1);
+
+ /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
+ acc2 = __SMLALDX(x2, c0, acc2);
+
+ /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
+ acc3 = __SMLALDX(x3, c0, acc3);
+
+ /* Read y[srcBLen - 3] and y[srcBLen - 4] */
+ c0 = read_q15x2_da ((q15_t **) &py);
+
+ /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
+ acc0 = __SMLALDX(x2, c0, acc0);
+
+ /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
+ acc1 = __SMLALDX(x3, c0, acc1);
+
+ /* Read x[4], x[5] */
+ x0 = read_q15x2 ((q15_t *) px + 2);
+
+ /* Read x[5], x[6] */
+ x1 = read_q15x2 ((q15_t *) px + 3);
+ px += 4U;
+
+ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
+ acc2 = __SMLALDX(x0, c0, acc2);
+
+ /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
+ acc3 = __SMLALDX(x1, c0, acc3);
+
+ } while (--k);
+
+ /* For the next MAC operations, SIMD is not used
+ * So, the 16 bit pointer if inputB, py is updated */
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ if (k == 1U)
+ {
+ /* Read y[srcBLen - 5] */
+ c0 = *(py+1);
#ifdef ARM_MATH_BIG_ENDIAN
-
- c0 = c0 << 16U;
+ c0 = c0 << 16U;
#else
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
- c0 = c0 & 0x0000FFFF;
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+ /* Read x[7] */
+ x3 = read_q15x2 ((q15_t *) px);
+ px++;
- /* Read x[10] */
- x3 = _SIMD32_OFFSET(px+2);
- px += 3U;
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLALD (x0, c0, acc0);
+ acc1 = __SMLALD (x1, c0, acc1);
+ acc2 = __SMLALDX(x1, c0, acc2);
+ acc3 = __SMLALDX(x3, c0, acc3);
+ }
- /* Perform the multiply-accumulates */
- acc0 = __SMLALDX(x1, c0, acc0);
- acc1 = __SMLALD(x2, c0, acc1);
- acc2 = __SMLALDX(x2, c0, acc2);
- acc3 = __SMLALDX(x3, c0, acc3);
- }
+ if (k == 2U)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = read_q15x2 ((q15_t *) py);
+ /* Read x[7], x[8] */
+ x3 = read_q15x2 ((q15_t *) px);
- /* Store the results in the accumulators in the destination buffer. */
+ /* Read x[9] */
+ x2 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
-#ifndef ARM_MATH_BIG_ENDIAN
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLALDX(x0, c0, acc0);
+ acc1 = __SMLALDX(x1, c0, acc1);
+ acc2 = __SMLALDX(x3, c0, acc2);
+ acc3 = __SMLALDX(x2, c0, acc3);
+ }
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
+ if (k == 3U)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = read_q15x2 ((q15_t *) py);
+ /* Read x[7], x[8] */
+ x3 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[9] */
+ x2 = read_q15x2 ((q15_t *) px + 1);
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLALDX(x0, c0, acc0);
+ acc1 = __SMLALDX(x1, c0, acc1);
+ acc2 = __SMLALDX(x3, c0, acc2);
+ acc3 = __SMLALDX(x2, c0, acc3);
+
+ c0 = *(py-1);
+#ifdef ARM_MATH_BIG_ENDIAN
+ c0 = c0 << 16U;
#else
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
+ /* Read x[10] */
+ x3 = read_q15x2 ((q15_t *) px + 2);
+ px += 3U;
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALDX(x1, c0, acc0);
+ acc1 = __SMLALD (x2, c0, acc1);
+ acc2 = __SMLALDX(x2, c0, acc2);
+ acc3 = __SMLALDX(x3, c0, acc3);
+ }
- /* Increment the pointer pIn1 index, count by 4 */
- count += 4U;
+ /* Store the results in the accumulators in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
+#else
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
- /* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
- py = pSrc2;
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4U;
- /* Decrement the loop counter */
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
blkCnt--;
}
/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ No loop unrolling is used. */
blkCnt = (uint32_t) blockSize2 % 0x4U;
while (blkCnt > 0U)
@@ -515,16 +485,16 @@ arm_status arm_conv_partial_q15(
k = srcBLen >> 2U;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* Perform the multiply-accumulates */
- sum += (q63_t) ((q31_t) * px++ * *py--);
- sum += (q63_t) ((q31_t) * px++ * *py--);
- sum += (q63_t) ((q31_t) * px++ * *py--);
- sum += (q63_t) ((q31_t) * px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -534,10 +504,10 @@ arm_status arm_conv_partial_q15(
while (k > 0U)
{
- /* Perform the multiply-accumulates */
- sum += (q63_t) ((q31_t) * px++ * *py--);
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) ((q31_t) *px++ * *py--);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -548,17 +518,10 @@ arm_status arm_conv_partial_q15(
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
+ px = pSrc1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
@@ -579,7 +542,7 @@ arm_status arm_conv_partial_q15(
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += (q63_t) ((q31_t) * px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
/* Decrement the loop counter */
k--;
@@ -592,14 +555,7 @@ arm_status arm_conv_partial_q15(
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
+ px = pSrc1 + count;
py = pSrc2;
/* Decrement the loop counter */
@@ -643,7 +599,7 @@ arm_status arm_conv_partial_q15(
/* The first part of the stage starts here */
j = count >> 2U;
- while ((j > 0U) && (blockSize3 > 0))
+ while ((j > 0U) && (blockSize3 > 0U))
{
/* Accumulator is made zero for every iteration */
sum = 0;
@@ -657,12 +613,12 @@ arm_status arm_conv_partial_q15(
{
/* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
* with y[srcBLen - 1], y[srcBLen - 2] respectively */
- sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
/* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
* with y[srcBLen - 3], y[srcBLen - 4] respectively */
- sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -679,7 +635,7 @@ arm_status arm_conv_partial_q15(
/* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
sum = __SMLALD(*px++, *py--, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -690,10 +646,10 @@ arm_status arm_conv_partial_q15(
px = ++pSrc1;
py = pIn2;
- /* Decrement the MAC count */
+ /* Decrement MAC count */
count--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize3--;
j--;
@@ -704,7 +660,7 @@ arm_status arm_conv_partial_q15(
* so pointer py is updated to read only one sample at a time */
py = py + 1U;
- while (blockSize3 > 0)
+ while (blockSize3 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0;
@@ -718,7 +674,7 @@ arm_status arm_conv_partial_q15(
/* sum += x[srcALen-1] * y[srcBLen-1] */
sum = __SMLALD(*px++, *py--, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -729,34 +685,32 @@ arm_status arm_conv_partial_q15(
px = ++pSrc1;
py = pSrc2;
- /* Decrement the MAC count */
+ /* Decrement MAC count */
count--;
/* Decrement the loop counter */
blockSize3--;
}
- /* set status as ARM_MATH_SUCCESS */
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
-#else
+#else /* #if defined (ARM_MATH_DSP) */
- /* Run the below code for Cortex-M0 */
-
- q15_t *pIn1 = pSrcA; /* inputA pointer */
- q15_t *pIn2 = pSrcB; /* inputB pointer */
- q63_t sum; /* Accumulator */
- uint32_t i, j; /* loop counters */
- arm_status status; /* status of Partial convolution */
+ const q15_t *pIn1 = pSrcA; /* InputA pointer */
+ const q15_t *pIn2 = pSrcB; /* InputB pointer */
+ q63_t sum; /* Accumulator */
+ uint32_t i, j; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
/* Check for range of output samples to be calculated */
if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
{
- /* Set status as ARM_ARGUMENT_ERROR */
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
status = ARM_MATH_ARGUMENT_ERROR;
}
else
@@ -768,28 +722,31 @@ arm_status arm_conv_partial_q15(
sum = 0;
/* Loop to perform MAC operations according to convolution equation */
- for (j = 0; j <= i; j++)
+ for (j = 0U; j <= i; j++)
{
/* Check the array limitations */
if (((i - j) < srcBLen) && (j < srcALen))
{
/* z[i] += x[i-j] * y[j] */
- sum += ((q31_t) pIn1[j] * (pIn2[i - j]));
+ sum += ((q31_t) pIn1[j] * pIn2[i - j]);
}
}
/* Store the output in the destination buffer */
pDst[i] = (q15_t) __SSAT((sum >> 15U), 16U);
}
- /* set status as ARM_SUCCESS as there are no argument errors */
+
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
+
+ /* Return to application */
return (status);
-#endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
+#endif /* #if defined (ARM_MATH_DSP) */
}
/**
- * @} end of PartialConv group
+ @} end of PartialConv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_partial_q31.c b/DSP/Source/FilteringFunctions/arm_conv_partial_q31.c
index 94999b9..d0f0122 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_partial_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_partial_q31.c
@@ -3,13 +3,13 @@
* Title: arm_conv_partial_q31.c
* Description: Partial convolution of Q31 sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,55 +29,59 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup PartialConv
- * @{
+ @addtogroup PartialConv
+ @{
*/
/**
- * @brief Partial convolution of Q31 sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written.
- * @param[in] firstIndex is the first output sample to start with.
- * @param[in] numPoints is the number of output points to be computed.
- * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
- *
- * See arm_conv_partial_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
+ @brief Partial convolution of Q31 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
+
+ @remark
+ Refer to \ref arm_conv_partial_fast_q31() for a faster but less precise implementation of this function.
*/
arm_status arm_conv_partial_q31(
- q31_t * pSrcA,
- uint32_t srcALen,
- q31_t * pSrcB,
- uint32_t srcBLen,
- q31_t * pDst,
- uint32_t firstIndex,
- uint32_t numPoints)
+ const q31_t * pSrcA,
+ uint32_t srcALen,
+ const q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
{
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q31_t *pIn1; /* inputA pointer */
- q31_t *pIn2; /* inputB pointer */
- q31_t *pOut = pDst; /* output pointer */
- q31_t *px; /* Intermediate inputA pointer */
- q31_t *py; /* Intermediate inputB pointer */
- q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
- q63_t sum, acc0, acc1, acc2; /* Accumulator */
- q31_t x0, x1, x2, c0;
- uint32_t j, k, count, check, blkCnt;
- int32_t blockSize1, blockSize2, blockSize3; /* loop counter */
- arm_status status; /* status of Partial convolution */
+ const q31_t *pIn1; /* InputA pointer */
+ const q31_t *pIn2; /* InputB pointer */
+ q31_t *pOut = pDst; /* Output pointer */
+ const q31_t *px; /* Intermediate inputA pointer */
+ const q31_t *py; /* Intermediate inputB pointer */
+ const q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q63_t sum; /* Accumulator */
+ uint32_t j, k, count, blkCnt, check;
+ int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc0, acc1, acc2; /* Accumulator */
+ q31_t x0, x1, x2, c0; /* Temporary variables */
+#endif
/* Check for range of output samples to be calculated */
if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
@@ -87,7 +91,6 @@ arm_status arm_conv_partial_q31(
}
else
{
-
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
/* So srcBLen is always considered as shorter or equal to srcALen */
@@ -118,11 +121,9 @@ arm_status arm_conv_partial_q31(
check = firstIndex + numPoints;
blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
- blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
- blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
- (int32_t) numPoints) : 0;
- blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
- (int32_t) firstIndex);
+ blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0;
+ blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
@@ -165,42 +166,51 @@ arm_status arm_conv_partial_q31(
* Stage1 process
* ----------------------*/
- /* The first loop starts here */
- while (blockSize1 > 0)
+ /* The first stage starts here */
+ while (blockSize1 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = count >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* x[0] * y[srcBLen - 1] */
- sum += (q63_t) * px++ * (*py--);
- /* x[1] * y[srcBLen - 2] */
- sum += (q63_t) * px++ * (*py--);
- /* x[2] * y[srcBLen - 3] */
- sum += (q63_t) * px++ * (*py--);
- /* x[3] * y[srcBLen - 4] */
- sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) *px++ * (*py--);
- /* Decrement the loop counter */
+ /* x[1] * y[srcBLen - 2] */
+ sum += (q63_t) *px++ * (*py--);
+
+ /* x[2] * y[srcBLen - 3] */
+ sum += (q63_t) *px++ * (*py--);
+
+ /* x[3] * y[srcBLen - 4] */
+ sum += (q63_t) *px++ * (*py--);
+
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) *px++ * (*py--);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -211,10 +221,10 @@ arm_status arm_conv_partial_q31(
py = ++pSrc2;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -231,12 +241,13 @@ arm_status arm_conv_partial_q31(
/* Working pointer of inputA */
if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
{
- px = pIn1 + firstIndex - srcBLen + 1;
+ pSrc1 = pIn1 + firstIndex - srcBLen + 1;
}
else
{
- px = pIn1;
+ pSrc1 = pIn1;
}
+ px = pSrc1;
/* Working pointer of inputB */
pSrc2 = pIn2 + (srcBLen - 1U);
@@ -254,9 +265,11 @@ arm_status arm_conv_partial_q31(
* srcBLen should be greater than or equal to 4 */
if (srcBLen >= 4U)
{
- /* Loop unroll over blkCnt */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Loop unroll over blkCnt */
blkCnt = blockSize2 / 3;
+
while (blkCnt > 0U)
{
/* Set all accumulators to zero */
@@ -265,8 +278,8 @@ arm_status arm_conv_partial_q31(
acc2 = 0;
/* read x[0], x[1] samples */
- x0 = *(px++);
- x1 = *(px++);
+ x0 = *px++;
+ x1 = *px++;
/* Apply loop unrolling and compute 3 MACs simultaneously. */
k = srcBLen / 3;
@@ -281,13 +294,13 @@ arm_status arm_conv_partial_q31(
/* Read x[2] sample */
x2 = *(px);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* acc0 += x[0] * y[srcBLen - 1] */
- acc0 += (q63_t) x0 *c0;
+ acc0 += (q63_t) x0 * c0;
/* acc1 += x[1] * y[srcBLen - 1] */
- acc1 += (q63_t) x1 *c0;
+ acc1 += (q63_t) x1 * c0;
/* acc2 += x[2] * y[srcBLen - 1] */
- acc2 += (q63_t) x2 *c0;
+ acc2 += (q63_t) x2 * c0;
/* Read y[srcBLen - 2] sample */
c0 = *(py - 1U);
@@ -297,11 +310,11 @@ arm_status arm_conv_partial_q31(
/* Perform the multiply-accumulate */
/* acc0 += x[1] * y[srcBLen - 2] */
- acc0 += (q63_t) x1 *c0;
+ acc0 += (q63_t) x1 * c0;
/* acc1 += x[2] * y[srcBLen - 2] */
- acc1 += (q63_t) x2 *c0;
+ acc1 += (q63_t) x2 * c0;
/* acc2 += x[3] * y[srcBLen - 2] */
- acc2 += (q63_t) x0 *c0;
+ acc2 += (q63_t) x0 * c0;
/* Read y[srcBLen - 3] sample */
c0 = *(py - 2U);
@@ -309,13 +322,13 @@ arm_status arm_conv_partial_q31(
/* Read x[4] sample */
x1 = *(px + 2U);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* acc0 += x[2] * y[srcBLen - 3] */
- acc0 += (q63_t) x2 *c0;
+ acc0 += (q63_t) x2 * c0;
/* acc1 += x[3] * y[srcBLen - 2] */
- acc1 += (q63_t) x0 *c0;
+ acc1 += (q63_t) x0 * c0;
/* acc2 += x[4] * y[srcBLen - 2] */
- acc2 += (q63_t) x1 *c0;
+ acc2 += (q63_t) x1 * c0;
px += 3U;
@@ -331,18 +344,17 @@ arm_status arm_conv_partial_q31(
while (k > 0U)
{
/* Read y[srcBLen - 5] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[7] sample */
- x2 = *(px++);
+ x2 = *px++;
/* Perform the multiply-accumulates */
/* acc0 += x[4] * y[srcBLen - 5] */
- acc0 += (q63_t) x0 *c0;
+ acc0 += (q63_t) x0 * c0;
/* acc1 += x[5] * y[srcBLen - 5] */
- acc1 += (q63_t) x1 *c0;
+ acc1 += (q63_t) x1 * c0;
/* acc2 += x[6] * y[srcBLen - 5] */
- acc2 += (q63_t) x2 *c0;
+ acc2 += (q63_t) x2 * c0;
/* Reuse the present samples for the next MAC */
x0 = x1;
@@ -361,77 +373,75 @@ arm_status arm_conv_partial_q31(
count += 3U;
/* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
+ px = pSrc1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize2 - 3 * (blockSize2 / 3);
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = srcBLen >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* Perform the multiply-accumulates */
- sum += (q63_t) * px++ * (*py--);
- sum += (q63_t) * px++ * (*py--);
- sum += (q63_t) * px++ * (*py--);
- sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) *px++ * (*py--);
+ sum += (q63_t) *px++ * (*py--);
+ sum += (q63_t) *px++ * (*py--);
+ sum += (q63_t) *px++ * (*py--);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = srcBLen % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) *px++ * *py--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
*pOut++ = (q31_t) (sum >> 31);
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
+ px = pSrc1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
@@ -452,9 +462,9 @@ arm_status arm_conv_partial_q31(
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) *px++ * *py--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -465,14 +475,7 @@ arm_status arm_conv_partial_q31(
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
+ px = pSrc1 + count;
py = pSrc2;
/* Decrement the loop counter */
@@ -508,37 +511,51 @@ arm_status arm_conv_partial_q31(
* Stage3 process
* ------------------*/
- while (blockSize3 > 0)
+ while (blockSize3 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = count >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
- sum += (q63_t) * px++ * (*py--);
- sum += (q63_t) * px++ * (*py--);
- sum += (q63_t) * px++ * (*py--);
- sum += (q63_t) * px++ * (*py--);
+ /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+ sum += (q63_t) *px++ * *py--;
- /* Decrement the loop counter */
+ /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* Decrement loop counter */
k--;
}
- /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += (q63_t) * px++ * (*py--);
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum += (q63_t) *px++ * *py--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -549,15 +566,14 @@ arm_status arm_conv_partial_q31(
px = ++pSrc1;
py = pSrc2;
- /* Decrement the MAC count */
+ /* Decrement MAC count */
count--;
/* Decrement the loop counter */
blockSize3--;
-
}
- /* set status as ARM_MATH_SUCCESS */
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -565,19 +581,18 @@ arm_status arm_conv_partial_q31(
return (status);
#else
+/* alternate version for CM0_FAMILY */
- /* Run the below code for Cortex-M0 */
-
- q31_t *pIn1 = pSrcA; /* inputA pointer */
- q31_t *pIn2 = pSrcB; /* inputB pointer */
- q63_t sum; /* Accumulator */
- uint32_t i, j; /* loop counters */
- arm_status status; /* status of Partial convolution */
+ const q31_t *pIn1 = pSrcA; /* InputA pointer */
+ const q31_t *pIn2 = pSrcB; /* InputB pointer */
+ q63_t sum; /* Accumulator */
+ uint32_t i, j; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
/* Check for range of output samples to be calculated */
if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
{
- /* Set status as ARM_ARGUMENT_ERROR */
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
status = ARM_MATH_ARGUMENT_ERROR;
}
else
@@ -589,28 +604,31 @@ arm_status arm_conv_partial_q31(
sum = 0;
/* Loop to perform MAC operations according to convolution equation */
- for (j = 0; j <= i; j++)
+ for (j = 0U; j <= i; j++)
{
/* Check the array limitations */
if (((i - j) < srcBLen) && (j < srcALen))
{
/* z[i] += x[i-j] * y[j] */
- sum += ((q63_t) pIn1[j] * (pIn2[i - j]));
+ sum += ((q63_t) pIn1[j] * pIn2[i - j]);
}
}
/* Store the output in the destination buffer */
pDst[i] = (q31_t) (sum >> 31U);
}
- /* set status as ARM_SUCCESS as there are no argument errors */
+
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
+
+ /* Return to application */
return (status);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
}
/**
- * @} end of PartialConv group
+ @} end of PartialConv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_partial_q7.c b/DSP/Source/FilteringFunctions/arm_conv_partial_q7.c
index d4e0679..9b0228c 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_partial_q7.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_partial_q7.c
@@ -3,13 +3,13 @@
* Title: arm_conv_partial_q7.c
* Description: Partial convolution of Q7 sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,59 +29,61 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup PartialConv
- * @{
+ @addtogroup PartialConv
+ @{
*/
/**
- * @brief Partial convolution of Q7 sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written.
- * @param[in] firstIndex is the first output sample to start with.
- * @param[in] numPoints is the number of output points to be computed.
- * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
- *
- * \par
- * Refer the function arm_conv_partial_opt_q7() for a faster implementation of this function.
- *
+ @brief Partial convolution of Q7 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
+
+ @remark
+ Refer to \ref arm_conv_partial_opt_q7() for a faster implementation of this function.
*/
arm_status arm_conv_partial_q7(
- q7_t * pSrcA,
- uint32_t srcALen,
- q7_t * pSrcB,
- uint32_t srcBLen,
- q7_t * pDst,
- uint32_t firstIndex,
- uint32_t numPoints)
+ const q7_t * pSrcA,
+ uint32_t srcALen,
+ const q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
{
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q7_t *pIn1; /* inputA pointer */
- q7_t *pIn2; /* inputB pointer */
- q7_t *pOut = pDst; /* output pointer */
- q7_t *px; /* Intermediate inputA pointer */
- q7_t *py; /* Intermediate inputB pointer */
- q7_t *pSrc1, *pSrc2; /* Intermediate pointers */
- q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
- q31_t input1, input2;
- q15_t in1, in2;
- q7_t x0, x1, x2, x3, c0, c1;
- uint32_t j, k, count, check, blkCnt;
- int32_t blockSize1, blockSize2, blockSize3; /* loop counter */
- arm_status status;
+ const q7_t *pIn1; /* InputA pointer */
+ const q7_t *pIn2; /* InputB pointer */
+ q7_t *pOut = pDst; /* Output pointer */
+ const q7_t *px; /* Intermediate inputA pointer */
+ const q7_t *py; /* Intermediate inputB pointer */
+ const q7_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t sum; /* Accumulator */
+ uint32_t j, k, count, blkCnt, check; /* Loop counters */
+ int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc0, acc1, acc2, acc3; /* Accumulator */
+ q31_t input1, input2; /* Temporary input variables */
+ q15_t in1, in2; /* Temporary input variables */
+ q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */
+#endif
/* Check for range of output samples to be calculated */
if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
@@ -91,7 +93,6 @@ arm_status arm_conv_partial_q7(
}
else
{
-
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
/* So srcBLen is always considered as shorter or equal to srcALen */
@@ -122,11 +123,9 @@ arm_status arm_conv_partial_q7(
check = firstIndex + numPoints;
blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
- blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
- blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
- (int32_t) numPoints) : 0;
- blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
- (int32_t) firstIndex);
+ blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0;
+ blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
@@ -154,7 +153,7 @@ arm_status arm_conv_partial_q7(
/* In this stage the MAC operations are increased by 1 for every iteration.
The count variable holds the number of MAC operations performed.
- Since the partial convolution starts from from firstIndex
+ Since the partial convolution starts from firstIndex
Number of Macs to be performed is firstIndex + 1 */
count = 1U + firstIndex;
@@ -170,26 +169,26 @@ arm_status arm_conv_partial_q7(
* ----------------------*/
/* The first stage starts here */
- while (blockSize1 > 0)
+ while (blockSize1 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = count >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* x[0] , x[1] */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
/* y[srcBLen - 1] , y[srcBLen - 2] */
- in1 = (q15_t) * py--;
- in2 = (q15_t) * py--;
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
/* x[0] * y[srcBLen - 1] */
@@ -197,33 +196,39 @@ arm_status arm_conv_partial_q7(
sum = __SMLAD(input1, input2, sum);
/* x[2] , x[3] */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
/* y[srcBLen - 3] , y[srcBLen - 4] */
- in1 = (q15_t) * py--;
- in2 = (q15_t) * py--;
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
/* x[2] * y[srcBLen - 3] */
/* x[3] * y[srcBLen - 4] */
sum = __SMLAD(input1, input2, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
sum += ((q31_t) * px++ * *py--);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -234,10 +239,10 @@ arm_status arm_conv_partial_q7(
py = ++pSrc2;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -254,18 +259,19 @@ arm_status arm_conv_partial_q7(
/* Working pointer of inputA */
if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
{
- px = pIn1 + firstIndex - srcBLen + 1;
+ pSrc1 = pIn1 + firstIndex - srcBLen + 1;
}
else
{
- px = pIn1;
+ pSrc1 = pIn1;
}
+ px = pSrc1;
/* Working pointer of inputB */
pSrc2 = pIn2 + (srcBLen - 1U);
py = pSrc2;
- /* count is index by which the pointer pIn1 to be incremented */
+ /* count is the index by which the pointer pIn1 to be incremented */
count = 0U;
/* -------------------
@@ -277,7 +283,9 @@ arm_status arm_conv_partial_q7(
* srcBLen should be greater than or equal to 4 */
if (srcBLen >= 4U)
{
- /* Loop unroll over blockSize2, by 4 */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = ((uint32_t) blockSize2 >> 2U);
while (blkCnt > 0U)
@@ -289,9 +297,9 @@ arm_status arm_conv_partial_q7(
acc3 = 0;
/* read x[0], x[1], x[2] samples */
- x0 = *(px++);
- x1 = *(px++);
- x2 = *(px++);
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
k = srcBLen >> 2U;
@@ -301,12 +309,12 @@ arm_status arm_conv_partial_q7(
do
{
/* Read y[srcBLen - 1] sample */
- c0 = *(py--);
+ c0 = *py--;
/* Read y[srcBLen - 2] sample */
- c1 = *(py--);
+ c1 = *py--;
/* Read x[3] sample */
- x3 = *(px++);
+ x3 = *px++;
/* x[0] and x[1] are packed */
in1 = (q15_t) x0;
@@ -342,7 +350,7 @@ arm_status arm_conv_partial_q7(
acc2 = __SMLAD(input1, input2, acc2);
/* Read x[4] sample */
- x0 = *(px++);
+ x0 = *px++;
/* x[3] and x[4] are packed */
in1 = (q15_t) x3;
@@ -354,12 +362,12 @@ arm_status arm_conv_partial_q7(
acc3 = __SMLAD(input1, input2, acc3);
/* Read y[srcBLen - 3] sample */
- c0 = *(py--);
+ c0 = *py--;
/* Read y[srcBLen - 4] sample */
- c1 = *(py--);
+ c1 = *py--;
/* Read x[5] sample */
- x1 = *(px++);
+ x1 = *px++;
/* x[2] and x[3] are packed */
in1 = (q15_t) x2;
@@ -395,7 +403,7 @@ arm_status arm_conv_partial_q7(
acc2 = __SMLAD(input1, input2, acc2);
/* Read x[6] sample */
- x2 = *(px++);
+ x2 = *px++;
/* x[5] and x[6] are packed */
in1 = (q15_t) x1;
@@ -415,10 +423,9 @@ arm_status arm_conv_partial_q7(
while (k > 0U)
{
/* Read y[srcBLen - 5] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[7] sample */
- x3 = *(px++);
+ x3 = *px++;
/* Perform the multiply-accumulates */
/* acc0 += x[4] * y[srcBLen - 5] */
@@ -449,78 +456,81 @@ arm_status arm_conv_partial_q7(
count += 4U;
/* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
+ px = pSrc1 + count;
py = pSrc2;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (uint32_t) blockSize2 % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = srcBLen >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
-
/* Reading two inputs of SrcA buffer and packing */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
/* Reading two inputs of SrcB buffer and packing */
- in1 = (q15_t) * py--;
- in2 = (q15_t) * py--;
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
sum = __SMLAD(input1, input2, sum);
/* Reading two inputs of SrcA buffer and packing */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
/* Reading two inputs of SrcB buffer and packing */
- in1 = (q15_t) * py--;
- in2 = (q15_t) * py--;
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
sum = __SMLAD(input1, input2, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = srcBLen % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
sum += ((q31_t) * px++ * *py--);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -528,20 +538,13 @@ arm_status arm_conv_partial_q7(
*pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
/* Increment the pointer pIn1 index, count by 1 */
- count++;
+ count++;
/* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
+ px = pSrc1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
@@ -564,7 +567,7 @@ arm_status arm_conv_partial_q7(
/* Perform the multiply-accumulate */
sum += ((q31_t) * px++ * *py--);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -575,14 +578,7 @@ arm_status arm_conv_partial_q7(
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
- if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
- {
- px = pIn1 + firstIndex - srcBLen + 1 + count;
- }
- else
- {
- px = pIn1 + count;
- }
+ px = pSrc1 + count;
py = pSrc2;
/* Decrement the loop counter */
@@ -618,26 +614,26 @@ arm_status arm_conv_partial_q7(
* Stage3 process
* ------------------*/
- while (blockSize3 > 0)
+ while (blockSize3 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = count >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
/* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
- in1 = (q15_t) * py--;
- in2 = (q15_t) * py--;
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
/* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
@@ -645,34 +641,40 @@ arm_status arm_conv_partial_q7(
sum = __SMLAD(input1, input2, sum);
/* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
/* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
- in1 = (q15_t) * py--;
- in2 = (q15_t) * py--;
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
/* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
/* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
sum = __SMLAD(input1, input2, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
/* Perform the multiply-accumulates */
/* sum += x[srcALen-1] * y[srcBLen-1] */
sum += ((q31_t) * px++ * *py--);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -683,15 +685,14 @@ arm_status arm_conv_partial_q7(
px = ++pSrc1;
py = pSrc2;
- /* Decrement the MAC count */
+ /* Decrement MAC count */
count--;
/* Decrement the loop counter */
blockSize3--;
-
}
- /* set status as ARM_MATH_SUCCESS */
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -699,19 +700,18 @@ arm_status arm_conv_partial_q7(
return (status);
#else
+/* alternate version for CM0_FAMILY */
- /* Run the below code for Cortex-M0 */
-
- q7_t *pIn1 = pSrcA; /* inputA pointer */
- q7_t *pIn2 = pSrcB; /* inputB pointer */
- q31_t sum; /* Accumulator */
- uint32_t i, j; /* loop counters */
- arm_status status; /* status of Partial convolution */
+ const q7_t *pIn1 = pSrcA; /* InputA pointer */
+ const q7_t *pIn2 = pSrcB; /* InputB pointer */
+ q31_t sum; /* Accumulator */
+ uint32_t i, j; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
/* Check for range of output samples to be calculated */
if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
{
- /* Set status as ARM_ARGUMENT_ERROR */
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
status = ARM_MATH_ARGUMENT_ERROR;
}
else
@@ -723,7 +723,7 @@ arm_status arm_conv_partial_q7(
sum = 0;
/* Loop to perform MAC operations according to convolution equation */
- for (j = 0; j <= i; j++)
+ for (j = 0U; j <= i; j++)
{
/* Check the array limitations */
if (((i - j) < srcBLen) && (j < srcALen))
@@ -736,15 +736,18 @@ arm_status arm_conv_partial_q7(
/* Store the output in the destination buffer */
pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U);
}
- /* set status as ARM_SUCCESS as there are no argument errors */
+
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
+
+ /* Return to application */
return (status);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
}
/**
- * @} end of PartialConv group
+ @} end of PartialConv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_q15.c b/DSP/Source/FilteringFunctions/arm_conv_q15.c
index 29513fd..ad2b629 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_q15.c
@@ -3,13 +3,13 @@
* Title: arm_conv_q15.c
* Description: Convolution of Q15 sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,62 +29,56 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup Conv
- * @{
+ @addtogroup Conv
+ @{
*/
/**
- * @brief Convolution of Q15 sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
- * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
- * This approach provides 33 guard bits and there is no risk of overflow.
- * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
- *
- * \par
- * Refer to arm_conv_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
- *
- * \par
- * Refer the function arm_conv_opt_q15() for a faster implementation of this function using scratch buffers.
- *
+ @brief Convolution of Q15 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ Both inputs are in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ This approach provides 33 guard bits and there is no risk of overflow.
+ The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
+
+ @remark
+ Refer to \ref arm_conv_fast_q15() for a faster but less precise version of this function.
+ @remark
+ Refer to \ref arm_conv_opt_q15() for a faster implementation of this function using scratch buffers.
*/
void arm_conv_q15(
- q15_t * pSrcA,
- uint32_t srcALen,
- q15_t * pSrcB,
- uint32_t srcBLen,
- q15_t * pDst)
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst)
{
-#if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
+#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q15_t *pOut = pDst; /* output pointer */
- q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
- q15_t *px; /* Intermediate inputA pointer */
- q15_t *py; /* Intermediate inputB pointer */
- q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
- q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
- uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ q15_t *pOut = pDst; /* Output pointer */
+ q63_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ const q15_t *px; /* Intermediate inputA pointer */
+ const q15_t *py; /* Intermediate inputB pointer */
+ const q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* Temporary input variables to hold state and coefficient values */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -145,7 +139,6 @@ void arm_conv_q15(
/* Working pointer of inputB */
py = pIn2;
-
/* ------------------------
* Stage1 process
* ----------------------*/
@@ -169,7 +162,7 @@ void arm_conv_q15(
/* Perform the multiply-accumulates */
sum = __SMLALD(*px++, *py--, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -180,10 +173,10 @@ void arm_conv_q15(
py = pIn2 + count;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -205,13 +198,13 @@ void arm_conv_q15(
** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
- sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
/* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
- sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -225,10 +218,10 @@ void arm_conv_q15(
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
sum = __SMLALD(*px++, *py--, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -239,10 +232,10 @@ void arm_conv_q15(
py = pIn2 + (count - 1U);
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -266,17 +259,16 @@ void arm_conv_q15(
/* count is the index by which the pointer pIn1 to be incremented */
count = 0U;
-
- /* --------------------
+ /* -------------------
* Stage2 process
- * -------------------*/
+ * ------------------*/
/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
* So, to loop unroll over blockSize2,
* srcBLen should be greater than or equal to 4 */
if (srcBLen >= 4U)
{
- /* Loop unroll over blockSize2, by 4 */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize2 >> 2U;
while (blkCnt > 0U)
@@ -289,13 +281,12 @@ void arm_conv_q15(
acc2 = 0;
acc3 = 0;
-
/* read x[0], x[1] samples */
- x0 = *__SIMD32(px);
- /* read x[1], x[2] samples */
- x1 = _SIMD32_OFFSET(px+1);
- px+= 2U;
+ x0 = read_q15x2 ((q15_t *) px);
+ /* read x[1], x[2] samples */
+ x1 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
k = srcBLen >> 2U;
@@ -306,7 +297,7 @@ void arm_conv_q15(
{
/* Read the last two inputB samples using SIMD:
* y[srcBLen - 1] and y[srcBLen - 2] */
- c0 = *__SIMD32(py)--;
+ c0 = read_q15x2_da ((q15_t **) &py);
/* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
acc0 = __SMLALDX(x0, c0, acc0);
@@ -315,10 +306,10 @@ void arm_conv_q15(
acc1 = __SMLALDX(x1, c0, acc1);
/* Read x[2], x[3] */
- x2 = *__SIMD32(px);
+ x2 = read_q15x2 ((q15_t *) px);
/* Read x[3], x[4] */
- x3 = _SIMD32_OFFSET(px+1);
+ x3 = read_q15x2 ((q15_t *) px + 1);
/* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
acc2 = __SMLALDX(x2, c0, acc2);
@@ -327,7 +318,7 @@ void arm_conv_q15(
acc3 = __SMLALDX(x3, c0, acc3);
/* Read y[srcBLen - 3] and y[srcBLen - 4] */
- c0 = *__SIMD32(py)--;
+ c0 = read_q15x2_da ((q15_t **) &py);
/* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
acc0 = __SMLALDX(x2, c0, acc0);
@@ -336,10 +327,11 @@ void arm_conv_q15(
acc1 = __SMLALDX(x3, c0, acc1);
/* Read x[4], x[5] */
- x0 = _SIMD32_OFFSET(px+2);
+ x0 = read_q15x2 ((q15_t *) px + 2);
/* Read x[5], x[6] */
- x1 = _SIMD32_OFFSET(px+3);
+ x1 = read_q15x2 ((q15_t *) px + 3);
+
px += 4U;
/* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
@@ -360,22 +352,18 @@ void arm_conv_q15(
if (k == 1U)
{
/* Read y[srcBLen - 5] */
- c0 = *(py+1);
-
+ c0 = *(py + 1);
#ifdef ARM_MATH_BIG_ENDIAN
-
c0 = c0 << 16U;
-
#else
-
c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
/* Read x[7] */
- x3 = *__SIMD32(px);
+ x3 = read_q15x2 ((q15_t *) px);
px++;
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
acc0 = __SMLALD(x0, c0, acc0);
acc1 = __SMLALD(x1, c0, acc1);
acc2 = __SMLALDX(x1, c0, acc2);
@@ -385,16 +373,16 @@ void arm_conv_q15(
if (k == 2U)
{
/* Read y[srcBLen - 5], y[srcBLen - 6] */
- c0 = _SIMD32_OFFSET(py);
+ c0 = read_q15x2 ((q15_t *) py);
/* Read x[7], x[8] */
- x3 = *__SIMD32(px);
+ x3 = read_q15x2 ((q15_t *) px);
/* Read x[9] */
- x2 = _SIMD32_OFFSET(px+1);
+ x2 = read_q15x2 ((q15_t *) px + 1);
px += 2U;
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
acc0 = __SMLALDX(x0, c0, acc0);
acc1 = __SMLALDX(x1, c0, acc1);
acc2 = __SMLALDX(x3, c0, acc2);
@@ -404,31 +392,29 @@ void arm_conv_q15(
if (k == 3U)
{
/* Read y[srcBLen - 5], y[srcBLen - 6] */
- c0 = _SIMD32_OFFSET(py);
+ c0 = read_q15x2 ((q15_t *) py);
/* Read x[7], x[8] */
- x3 = *__SIMD32(px);
+ x3 = read_q15x2 ((q15_t *) px);
/* Read x[9] */
- x2 = _SIMD32_OFFSET(px+1);
+ x2 = read_q15x2 ((q15_t *) px + 1);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
acc0 = __SMLALDX(x0, c0, acc0);
acc1 = __SMLALDX(x1, c0, acc1);
acc2 = __SMLALDX(x3, c0, acc2);
acc3 = __SMLALDX(x2, c0, acc3);
c0 = *(py-1);
-
#ifdef ARM_MATH_BIG_ENDIAN
-
c0 = c0 << 16U;
#else
-
c0 = c0 & 0x0000FFFF;
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
/* Read x[10] */
- x3 = _SIMD32_OFFSET(px+2);
+ x3 = read_q15x2 ((q15_t *) px + 2);
px += 3U;
/* Perform the multiply-accumulates */
@@ -438,23 +424,13 @@ void arm_conv_q15(
acc3 = __SMLALDX(x3, c0, acc3);
}
-
- /* Store the results in the accumulators in the destination buffer. */
-
+ /* Store the result in the accumulator in the destination buffer. */
#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
-
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
#else
-
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
- *__SIMD32(pOut)++ =
- __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
-
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Increment the pointer pIn1 index, count by 4 */
@@ -464,7 +440,7 @@ void arm_conv_q15(
px = pIn1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -485,12 +461,12 @@ void arm_conv_q15(
while (k > 0U)
{
/* Perform the multiply-accumulates */
- sum += (q63_t) ((q31_t) * px++ * *py--);
- sum += (q63_t) ((q31_t) * px++ * *py--);
- sum += (q63_t) ((q31_t) * px++ * *py--);
- sum += (q63_t) ((q31_t) * px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -501,7 +477,7 @@ void arm_conv_q15(
while (k > 0U)
{
/* Perform the multiply-accumulates */
- sum += (q63_t) ((q31_t) * px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
/* Decrement the loop counter */
k--;
@@ -538,7 +514,7 @@ void arm_conv_q15(
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += (q63_t) ((q31_t) * px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
/* Decrement the loop counter */
k--;
@@ -573,7 +549,6 @@ void arm_conv_q15(
/* In this stage the MAC operations are decreased by 1 for every iteration.
The blockSize3 variable holds the number of MAC operations performed */
-
blockSize3 = srcBLen - 1U;
/* Working pointer of inputA */
@@ -608,14 +583,15 @@ void arm_conv_q15(
** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
+ /* Perform the multiply-accumulate */
/* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
* with y[srcBLen - 1], y[srcBLen - 2] respectively */
- sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
/* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
* with y[srcBLen - 3], y[srcBLen - 4] respectively */
- sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -632,7 +608,7 @@ void arm_conv_q15(
/* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
sum = __SMLALD(*px++, *py--, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -643,7 +619,7 @@ void arm_conv_q15(
px = ++pSrc1;
py = pIn2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize3--;
j--;
@@ -668,7 +644,7 @@ void arm_conv_q15(
/* sum += x[srcALen-1] * y[srcBLen-1] */
sum = __SMLALD(*px++, *py--, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -679,33 +655,31 @@ void arm_conv_q15(
px = ++pSrc1;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize3--;
}
-#else
+#else /* #if defined (ARM_MATH_DSP) */
-/* Run the below code for Cortex-M0 */
+ const q15_t *pIn1 = pSrcA; /* InputA pointer */
+ const q15_t *pIn2 = pSrcB; /* InputB pointer */
+ q63_t sum; /* Accumulator */
+ uint32_t i, j; /* Loop counters */
- q15_t *pIn1 = pSrcA; /* input pointer */
- q15_t *pIn2 = pSrcB; /* coefficient pointer */
- q63_t sum; /* Accumulator */
- uint32_t i, j; /* loop counter */
-
- /* Loop to calculate output of convolution for output length number of times */
+ /* Loop to calculate convolution for output length number of values */
for (i = 0; i < (srcALen + srcBLen - 1); i++)
{
/* Initialize sum with zero to carry on MAC operations */
sum = 0;
/* Loop to perform MAC operations according to convolution equation */
- for (j = 0; j <= i; j++)
+ for (j = 0U; j <= i; j++)
{
/* Check the array limitations */
if (((i - j) < srcBLen) && (j < srcALen))
{
/* z[i] += x[i-j] * y[j] */
- sum += (q31_t) pIn1[j] * (pIn2[i - j]);
+ sum += ((q31_t) pIn1[j] * pIn2[i - j]);
}
}
@@ -713,10 +687,10 @@ void arm_conv_q15(
pDst[i] = (q15_t) __SSAT((sum >> 15U), 16U);
}
-#endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
+#endif /* #if defined (ARM_MATH_DSP) */
}
/**
- * @} end of Conv group
+ @} end of Conv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_q31.c b/DSP/Source/FilteringFunctions/arm_conv_q31.c
index 78e50f0..39550ec 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_q31.c
@@ -3,13 +3,13 @@
* Title: arm_conv_q31.c
* Description: Convolution of Q31 sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,63 +29,62 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup Conv
- * @{
+ @addtogroup Conv
+ @{
*/
/**
- * @brief Convolution of Q31 sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * There is no saturation on intermediate additions.
- * Thus, if the accumulator overflows it wraps around and distorts the result.
- * The input signals should be scaled down to avoid intermediate overflows.
- * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
- * as maximum of min(srcALen, srcBLen) number of additions are carried internally.
- * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
- *
- * \par
- * See arm_conv_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
+ @brief Convolution of Q31 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ There is no saturation on intermediate additions.
+ Thus, if the accumulator overflows it wraps around and distorts the result.
+ The input signals should be scaled down to avoid intermediate overflows.
+ Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
+ as maximum of min(srcALen, srcBLen) number of additions are carried internally.
+ The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+
+ @remark
+ Refer to \ref arm_conv_fast_q31() for a faster but less precise implementation of this function.
*/
void arm_conv_q31(
- q31_t * pSrcA,
- uint32_t srcALen,
- q31_t * pSrcB,
- uint32_t srcBLen,
- q31_t * pDst)
+ const q31_t * pSrcA,
+ uint32_t srcALen,
+ const q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst)
{
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
-#if defined (ARM_MATH_DSP)
+ const q31_t *pIn1; /* InputA pointer */
+ const q31_t *pIn2; /* InputB pointer */
+ q31_t *pOut = pDst; /* Output pointer */
+ const q31_t *px; /* Intermediate inputA pointer */
+ const q31_t *py; /* Intermediate inputB pointer */
+ const q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q63_t sum; /* Accumulators */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q31_t *pIn1; /* inputA pointer */
- q31_t *pIn2; /* inputB pointer */
- q31_t *pOut = pDst; /* output pointer */
- q31_t *px; /* Intermediate inputA pointer */
- q31_t *py; /* Intermediate inputB pointer */
- q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
- q63_t sum; /* Accumulator */
- q63_t acc0, acc1, acc2; /* Accumulator */
- q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */
- uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc0, acc1, acc2; /* Accumulators */
+ q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */
+#endif
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -101,10 +100,10 @@ void arm_conv_q31(
else
{
/* Initialization of inputA pointer */
- pIn1 = (q31_t *) pSrcB;
+ pIn1 = pSrcB;
/* Initialization of inputB pointer */
- pIn2 = (q31_t *) pSrcA;
+ pIn2 = pSrcA;
/* srcBLen is always considered as shorter or equal to srcALen */
j = srcBLen;
@@ -158,36 +157,45 @@ void arm_conv_q31(
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = count >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* x[0] * y[srcBLen - 1] */
- sum += (q63_t) * px++ * (*py--);
- /* x[1] * y[srcBLen - 2] */
- sum += (q63_t) * px++ * (*py--);
- /* x[2] * y[srcBLen - 3] */
- sum += (q63_t) * px++ * (*py--);
- /* x[3] * y[srcBLen - 4] */
- sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) *px++ * (*py--);
- /* Decrement the loop counter */
+ /* x[1] * y[srcBLen - 2] */
+ sum += (q63_t) *px++ * (*py--);
+
+ /* x[2] * y[srcBLen - 3] */
+ sum += (q63_t) *px++ * (*py--);
+
+ /* x[3] * y[srcBLen - 4] */
+ sum += (q63_t) *px++ * (*py--);
+
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) *px++ * *py--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -198,10 +206,10 @@ void arm_conv_q31(
py = pIn2 + count;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -210,7 +218,7 @@ void arm_conv_q31(
* ------------------------*/
/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
- * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
* ....
* sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
*/
@@ -234,6 +242,8 @@ void arm_conv_q31(
* srcBLen should be greater than or equal to 4 */
if (srcBLen >= 4U)
{
+#if defined (ARM_MATH_LOOPUNROLL)
+
/* Loop unroll by 3 */
blkCnt = blockSize2 / 3;
@@ -245,8 +255,8 @@ void arm_conv_q31(
acc2 = 0;
/* read x[0], x[1], x[2] samples */
- x0 = *(px++);
- x1 = *(px++);
+ x0 = *px++;
+ x1 = *px++;
/* Apply loop unrolling and compute 3 MACs simultaneously. */
k = srcBLen / 3;
@@ -257,11 +267,10 @@ void arm_conv_q31(
{
/* Read y[srcBLen - 1] sample */
c0 = *(py);
-
/* Read x[3] sample */
x2 = *(px);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* acc0 += x[0] * y[srcBLen - 1] */
acc0 += ((q63_t) x0 * c0);
/* acc1 += x[1] * y[srcBLen - 1] */
@@ -271,7 +280,6 @@ void arm_conv_q31(
/* Read y[srcBLen - 2] sample */
c0 = *(py - 1U);
-
/* Read x[4] sample */
x0 = *(px + 1U);
@@ -285,11 +293,10 @@ void arm_conv_q31(
/* Read y[srcBLen - 3] sample */
c0 = *(py - 2U);
-
/* Read x[5] sample */
x1 = *(px + 2U);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* acc0 += x[2] * y[srcBLen - 3] */
acc0 += ((q63_t) x2 * c0);
/* acc1 += x[3] * y[srcBLen - 2] */
@@ -310,10 +317,9 @@ void arm_conv_q31(
while (k > 0U)
{
/* Read y[srcBLen - 5] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[7] sample */
- x2 = *(px++);
+ x2 = *px++;
/* Perform the multiply-accumulates */
/* acc0 += x[4] * y[srcBLen - 5] */
@@ -327,11 +333,11 @@ void arm_conv_q31(
x0 = x1;
x1 = x2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* Store the results in the accumulators in the destination buffer. */
+ /* Store the result in the accumulator in the destination buffer. */
*pOut++ = (q31_t) (acc0 >> 31);
*pOut++ = (q31_t) (acc1 >> 31);
*pOut++ = (q31_t) (acc2 >> 31);
@@ -343,44 +349,56 @@ void arm_conv_q31(
px = pIn1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize2 - 3 * (blockSize2 / 3);
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = srcBLen >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* Perform the multiply-accumulates */
- sum += (q63_t) * px++ * (*py--);
- sum += (q63_t) * px++ * (*py--);
- sum += (q63_t) * px++ * (*py--);
- sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) *px++ * *py--;
+ sum += (q63_t) *px++ * *py--;
+ sum += (q63_t) *px++ * *py--;
+ sum += (q63_t) *px++ * *py--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = srcBLen % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) *px++ * *py--;
/* Decrement the loop counter */
k--;
@@ -389,14 +407,14 @@ void arm_conv_q31(
/* Store the result in the accumulator in the destination buffer. */
*pOut++ = (q31_t) (sum >> 31);
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
@@ -417,7 +435,7 @@ void arm_conv_q31(
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) *px++ * *py--;
/* Decrement the loop counter */
k--;
@@ -426,14 +444,14 @@ void arm_conv_q31(
/* Store the result in the accumulator in the destination buffer. */
*pOut++ = (q31_t) (sum >> 31);
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
@@ -470,36 +488,47 @@ void arm_conv_q31(
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = blockSize3 >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
- sum += (q63_t) * px++ * (*py--);
- /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
- sum += (q63_t) * px++ * (*py--);
- /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
- sum += (q63_t) * px++ * (*py--);
- /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
- sum += (q63_t) * px++ * (*py--);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
- k = blockSize3 % 0x4U;
-
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += (q63_t) * px++ * (*py--);
+ /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+ sum += (q63_t) *px++ * *py--;
- /* Decrement the loop counter */
+ /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = blockSize3 % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = blockSize3;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* Decrement loop counter */
k--;
}
@@ -510,33 +539,32 @@ void arm_conv_q31(
px = ++pSrc1;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize3--;
}
#else
+/* alternate version for CM0_FAMILY */
- /* Run the below code for Cortex-M0 */
+ const q31_t *pIn1 = pSrcA; /* InputA pointer */
+ const q31_t *pIn2 = pSrcB; /* InputB pointer */
+ q63_t sum; /* Accumulators */
+ uint32_t i, j; /* Loop counters */
- q31_t *pIn1 = pSrcA; /* input pointer */
- q31_t *pIn2 = pSrcB; /* coefficient pointer */
- q63_t sum; /* Accumulator */
- uint32_t i, j; /* loop counter */
-
- /* Loop to calculate output of convolution for output length number of times */
- for (i = 0; i < (srcALen + srcBLen - 1); i++)
+ /* Loop to calculate convolution for output length number of times */
+ for (i = 0U; i < (srcALen + srcBLen - 1U); i++)
{
- /* Initialize sum with zero to carry on MAC operations */
+ /* Initialize sum with zero to carry out MAC operations */
sum = 0;
/* Loop to perform MAC operations according to convolution equation */
- for (j = 0; j <= i; j++)
+ for (j = 0U; j <= i; j++)
{
/* Check the array limitations */
if (((i - j) < srcBLen) && (j < srcALen))
{
/* z[i] += x[i-j] * y[j] */
- sum += ((q63_t) pIn1[j] * (pIn2[i - j]));
+ sum += ((q63_t) pIn1[j] * pIn2[i - j]);
}
}
@@ -544,10 +572,10 @@ void arm_conv_q31(
pDst[i] = (q31_t) (sum >> 31U);
}
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
}
/**
- * @} end of Conv group
+ @} end of Conv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_conv_q7.c b/DSP/Source/FilteringFunctions/arm_conv_q7.c
index 9e5a79b..bdd1cab 100644
--- a/DSP/Source/FilteringFunctions/arm_conv_q7.c
+++ b/DSP/Source/FilteringFunctions/arm_conv_q7.c
@@ -3,13 +3,13 @@
* Title: arm_conv_q7.c
* Description: Convolution of Q7 sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,62 +29,60 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup Conv
- * @{
+ @addtogroup Conv
+ @{
*/
/**
- * @brief Convolution of Q7 sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using a 32-bit internal accumulator.
- * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
- * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
- * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072.
- * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
- *
- * \par
- * Refer the function arm_conv_opt_q7() for a faster implementation of this function.
- *
+ @brief Convolution of Q7 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
+ The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072.
+ The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
+ @remark
+ Refer to \ref arm_conv_opt_q7() for a faster implementation of this function.
*/
void arm_conv_q7(
- q7_t * pSrcA,
- uint32_t srcALen,
- q7_t * pSrcB,
- uint32_t srcBLen,
- q7_t * pDst)
+ const q7_t * pSrcA,
+ uint32_t srcALen,
+ const q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst)
{
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
-#if defined (ARM_MATH_DSP)
+ const q7_t *pIn1; /* InputA pointer */
+ const q7_t *pIn2; /* InputB pointer */
+ q7_t *pOut = pDst; /* Output pointer */
+ const q7_t *px; /* Intermediate inputA pointer */
+ const q7_t *py; /* Intermediate inputB pointer */
+ const q7_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t sum; /* Accumulators */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q7_t *pIn1; /* inputA pointer */
- q7_t *pIn2; /* inputB pointer */
- q7_t *pOut = pDst; /* output pointer */
- q7_t *px; /* Intermediate inputA pointer */
- q7_t *py; /* Intermediate inputB pointer */
- q7_t *pSrc1, *pSrc2; /* Intermediate pointers */
- q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */
- q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
- q31_t input1, input2; /* Temporary input variables */
- q15_t in1, in2; /* Temporary input variables */
- uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc0, acc1, acc2, acc3; /* Accumulators */
+ q31_t input1, input2; /* Temporary input variables */
+ q15_t in1, in2; /* Temporary input variables */
+ q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */
+#endif
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -123,7 +121,7 @@ void arm_conv_q7(
/* The algorithm is implemented in three stages.
The loop counters of each stage is initiated here. */
blockSize1 = srcBLen - 1U;
- blockSize2 = (srcALen - srcBLen) + 1U;
+ blockSize2 = srcALen - (srcBLen - 1U);
blockSize3 = blockSize1;
/* --------------------------
@@ -157,21 +155,21 @@ void arm_conv_q7(
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = count >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* x[0] , x[1] */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* y[srcBLen - 1] , y[srcBLen - 2] */
- in1 = (q15_t) * py--;
- in2 = (q15_t) * py--;
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* x[0] * y[srcBLen - 1] */
@@ -179,33 +177,39 @@ void arm_conv_q7(
sum = __SMLAD(input1, input2, sum);
/* x[2] , x[3] */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* y[srcBLen - 3] , y[srcBLen - 4] */
- in1 = (q15_t) * py--;
- in2 = (q15_t) * py--;
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* x[2] * y[srcBLen - 3] */
/* x[3] * y[srcBLen - 4] */
sum = __SMLAD(input1, input2, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
- /* Perform the multiply-accumulates */
- sum += ((q15_t) * px++ * *py--);
+ /* Perform the multiply-accumulate */
+ sum += ((q15_t) *px++ * *py--);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -216,10 +220,10 @@ void arm_conv_q7(
py = pIn2 + count;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -228,7 +232,7 @@ void arm_conv_q7(
* ------------------------*/
/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
- * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
* ....
* sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
*/
@@ -252,7 +256,9 @@ void arm_conv_q7(
* srcBLen should be greater than or equal to 4 */
if (srcBLen >= 4U)
{
- /* Loop unroll over blockSize2, by 4 */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize2 >> 2U;
while (blkCnt > 0U)
@@ -264,9 +270,9 @@ void arm_conv_q7(
acc3 = 0;
/* read x[0], x[1], x[2] samples */
- x0 = *(px++);
- x1 = *(px++);
- x2 = *(px++);
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
k = srcBLen >> 2U;
@@ -276,12 +282,12 @@ void arm_conv_q7(
do
{
/* Read y[srcBLen - 1] sample */
- c0 = *(py--);
+ c0 = *py--;
/* Read y[srcBLen - 2] sample */
- c1 = *(py--);
+ c1 = *py--;
/* Read x[3] sample */
- x3 = *(px++);
+ x3 = *px++;
/* x[0] and x[1] are packed */
in1 = (q15_t) x0;
@@ -317,7 +323,7 @@ void arm_conv_q7(
acc2 = __SMLAD(input1, input2, acc2);
/* Read x[4] sample */
- x0 = *(px++);
+ x0 = *px++;
/* x[3] and x[4] are packed */
in1 = (q15_t) x3;
@@ -329,12 +335,12 @@ void arm_conv_q7(
acc3 = __SMLAD(input1, input2, acc3);
/* Read y[srcBLen - 3] sample */
- c0 = *(py--);
+ c0 = *py--;
/* Read y[srcBLen - 4] sample */
- c1 = *(py--);
+ c1 = *py--;
/* Read x[5] sample */
- x1 = *(px++);
+ x1 = *px++;
/* x[2] and x[3] are packed */
in1 = (q15_t) x2;
@@ -370,7 +376,7 @@ void arm_conv_q7(
acc2 = __SMLAD(input1, input2, acc2);
/* Read x[6] sample */
- x2 = *(px++);
+ x2 = *px++;
/* x[5] and x[6] are packed */
in1 = (q15_t) x1;
@@ -390,10 +396,9 @@ void arm_conv_q7(
while (k > 0U)
{
/* Read y[srcBLen - 5] sample */
- c0 = *(py--);
-
+ c0 = *py--;
/* Read x[7] sample */
- x3 = *(px++);
+ x3 = *px++;
/* Perform the multiply-accumulates */
/* acc0 += x[4] * y[srcBLen - 5] */
@@ -410,11 +415,10 @@ void arm_conv_q7(
x1 = x2;
x2 = x3;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
-
/* Store the result in the accumulator in the destination buffer. */
*pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
*pOut++ = (q7_t) (__SSAT(acc1 >> 7U, 8));
@@ -428,65 +432,77 @@ void arm_conv_q7(
px = pIn1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize2 % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = srcBLen >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* Reading two inputs of SrcA buffer and packing */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* Reading two inputs of SrcB buffer and packing */
- in1 = (q15_t) * py--;
- in2 = (q15_t) * py--;
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
sum = __SMLAD(input1, input2, sum);
/* Reading two inputs of SrcA buffer and packing */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* Reading two inputs of SrcB buffer and packing */
- in1 = (q15_t) * py--;
- in2 = (q15_t) * py--;
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
sum = __SMLAD(input1, input2, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = srcBLen % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
- /* Perform the multiply-accumulates */
- sum += ((q15_t) * px++ * *py--);
+ /* Perform the multiply-accumulate */
+ sum += ((q15_t) *px++ * *py--);
/* Decrement the loop counter */
k--;
@@ -523,7 +539,7 @@ void arm_conv_q7(
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += ((q15_t) * px++ * *py--);
+ sum += ((q15_t) *px++ * *py--);
/* Decrement the loop counter */
k--;
@@ -539,7 +555,7 @@ void arm_conv_q7(
px = pIn1 + count;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
@@ -576,21 +592,21 @@ void arm_conv_q7(
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = blockSize3 >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
- in1 = (q15_t) * py--;
- in2 = (q15_t) * py--;
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
@@ -598,33 +614,40 @@ void arm_conv_q7(
sum = __SMLAD(input1, input2, sum);
/* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
- in1 = (q15_t) * py--;
- in2 = (q15_t) * py--;
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
/* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
sum = __SMLAD(input1, input2, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = blockSize3 % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = blockSize3;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
- /* Perform the multiply-accumulates */
- sum += ((q15_t) * px++ * *py--);
+ /* Perform the multiply-accumulate */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum += ((q15_t) *px++ * *py--);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -635,33 +658,32 @@ void arm_conv_q7(
px = ++pSrc1;
py = pSrc2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize3--;
}
#else
+/* alternate version for CM0_FAMILY */
- /* Run the below code for Cortex-M0 */
+ const q7_t *pIn1 = pSrcA; /* InputA pointer */
+ const q7_t *pIn2 = pSrcB; /* InputB pointer */
+ q31_t sum; /* Accumulator */
+ uint32_t i, j; /* Loop counters */
- q7_t *pIn1 = pSrcA; /* input pointer */
- q7_t *pIn2 = pSrcB; /* coefficient pointer */
- q31_t sum; /* Accumulator */
- uint32_t i, j; /* loop counter */
-
- /* Loop to calculate output of convolution for output length number of times */
- for (i = 0; i < (srcALen + srcBLen - 1); i++)
+ /* Loop to calculate convolution for output length number of times */
+ for (i = 0U; i < (srcALen + srcBLen - 1U); i++)
{
- /* Initialize sum with zero to carry on MAC operations */
+ /* Initialize sum with zero to carry out MAC operations */
sum = 0;
/* Loop to perform MAC operations according to convolution equation */
- for (j = 0; j <= i; j++)
+ for (j = 0U; j <= i; j++)
{
/* Check the array limitations */
if (((i - j) < srcBLen) && (j < srcALen))
{
/* z[i] += x[i-j] * y[j] */
- sum += (q15_t) pIn1[j] * (pIn2[i - j]);
+ sum += ((q15_t) pIn1[j] * pIn2[i - j]);
}
}
@@ -669,10 +691,10 @@ void arm_conv_q7(
pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U);
}
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
}
/**
- * @} end of Conv group
+ @} end of Conv group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_correlate_f32.c b/DSP/Source/FilteringFunctions/arm_correlate_f32.c
index 12031f1..1096526 100644
--- a/DSP/Source/FilteringFunctions/arm_correlate_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_correlate_f32.c
@@ -3,13 +3,13 @@
* Title: arm_correlate_f32.c
* Description: Correlation of floating-point sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,102 +29,97 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @defgroup Corr Correlation
- *
- * Correlation is a mathematical operation that is similar to convolution.
- * As with convolution, correlation uses two signals to produce a third signal.
- * The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
- * Correlation is commonly used to measure the similarity between two signals.
- * It has applications in pattern recognition, cryptanalysis, and searching.
- * The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
- * Fast versions of the Q15 and Q31 functions are also provided.
- *
- * \par Algorithm
- * Let a[n] and b[n] be sequences of length srcALen and srcBLen samples respectively.
- * The convolution of the two signals is denoted by
- *
- * c[n] = a[n] * b[n]
- *
- * In correlation, one of the signals is flipped in time
- *
- * c[n] = a[n] * b[-n]
- *
- *
- * \par
- * and this is mathematically defined as
- * \image html CorrelateEquation.gif
- * \par
- * The pSrcA points to the first input vector of length srcALen and pSrcB points to the second input vector of length srcBLen.
- * The result c[n] is of length 2 * max(srcALen, srcBLen) - 1 and is defined over the interval n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2).
- * The output result is written to pDst and the calling function must allocate 2 * max(srcALen, srcBLen) - 1 words for the result.
- *
- * Note
- * \par
- * The pDst should be initialized to all zeros before being used.
- *
- * Fixed-Point Behavior
- * \par
- * Correlation requires summing up a large number of intermediate products.
- * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
- * Refer to the function specific documentation below for further details of the particular algorithm used.
- *
- *
- * Fast Versions
- *
- * \par
- * Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of correlate and the design requires
- * the input signals should be scaled down to avoid intermediate overflows.
- *
- *
- * Opt Versions
- *
- * \par
- * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
- * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions of correlate
+ @defgroup Corr Correlation
+
+ Correlation is a mathematical operation that is similar to convolution.
+ As with convolution, correlation uses two signals to produce a third signal.
+ The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
+ Correlation is commonly used to measure the similarity between two signals.
+ It has applications in pattern recognition, cryptanalysis, and searching.
+ The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
+ Fast versions of the Q15 and Q31 functions are also provided.
+
+ @par Algorithm
+ Let a[n] and b[n] be sequences of length srcALen and srcBLen samples respectively.
+ The convolution of the two signals is denoted by
+
+ c[n] = a[n] * b[n]
+
+ In correlation, one of the signals is flipped in time
+
+ c[n] = a[n] * b[-n]
+
+ @par
+ and this is mathematically defined as
+ \image html CorrelateEquation.gif
+ @par
+ The pSrcA points to the first input vector of length srcALen and pSrcB points to the second input vector of length srcBLen.
+ The result c[n] is of length 2 * max(srcALen, srcBLen) - 1 and is defined over the interval n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2).
+ The output result is written to pDst and the calling function must allocate 2 * max(srcALen, srcBLen) - 1 words for the result.
+
+ @note
+ The pDst should be initialized to all zeros before being used.
+
+ @par Fixed-Point Behavior
+ Correlation requires summing up a large number of intermediate products.
+ As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
+ Refer to the function specific documentation below for further details of the particular algorithm used.
+
+ @par Fast Versions
+ Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of correlate and the design requires
+ the input signals should be scaled down to avoid intermediate overflows.
+
+ @par Opt Versions
+ Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
+ These versions are optimised in cycles and consumes more memory (Scratch memory) compared to Q15 and Q7 versions of correlate
*/
/**
- * @addtogroup Corr
- * @{
+ @addtogroup Corr
+ @{
*/
+
/**
- * @brief Correlation of floating-point sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
- * @return none.
+ @brief Correlation of floating-point sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @return none
*/
void arm_correlate_f32(
- float32_t * pSrcA,
- uint32_t srcALen,
- float32_t * pSrcB,
- uint32_t srcBLen,
- float32_t * pDst)
+ const float32_t * pSrcA,
+ uint32_t srcALen,
+ const float32_t * pSrcB,
+ uint32_t srcBLen,
+ float32_t * pDst)
{
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
+
+ const float32_t *pIn1; /* InputA pointer */
+ const float32_t *pIn2; /* InputB pointer */
+ float32_t *pOut = pDst; /* Output pointer */
+ const float32_t *px; /* Intermediate inputA pointer */
+ const float32_t *py; /* Intermediate inputB pointer */
+ const float32_t *pSrc1;
+ float32_t sum;
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+ uint32_t outBlockSize; /* Loop counter */
+ int32_t inc = 1; /* Destination address modifier */
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- float32_t *pIn1; /* inputA pointer */
- float32_t *pIn2; /* inputB pointer */
- float32_t *pOut = pDst; /* output pointer */
- float32_t *px; /* Intermediate inputA pointer */
- float32_t *py; /* Intermediate inputB pointer */
- float32_t *pSrc1; /* Intermediate pointers */
- float32_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
- float32_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
- uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counters */
- int32_t inc = 1; /* Destination address modifier */
-
+#if defined (ARM_MATH_LOOPUNROLL) || defined (ARM_MATH_NEON)
+ float32_t acc0, acc1, acc2, acc3; /* Accumulators */
+ float32_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
+#endif
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -158,16 +153,6 @@ void arm_correlate_f32(
/* Updating the pointer position to non zero value */
pOut += j;
-
- //while (j > 0U)
- //{
- // /* Zero is stored in the destination buffer */
- // *pOut++ = 0.0f;
-
- // /* Decrement the loop counter */
- // j--;
- //}
-
}
else
{
@@ -188,18 +173,18 @@ void arm_correlate_f32(
/* Destination address modifier is set to -1 */
inc = -1;
-
}
/* The function is internally
- * divided into three parts according to the number of multiplications that has to be
- * taken place between inputA samples and inputB samples. In the first part of the
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
* algorithm, the multiplications increase by one for every iteration.
- * In the second part of the algorithm, srcBLen number of multiplications are done.
- * In the third part of the algorithm, the multiplications decrease by one
- * for every iteration.*/
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
/* The algorithm is implemented in three stages.
- * The loop counters of each stage is initiated here. */
+ The loop counters of each stage is initiated here. */
blockSize1 = srcBLen - 1U;
blockSize2 = srcALen - (srcBLen - 1U);
blockSize3 = blockSize1;
@@ -235,37 +220,73 @@ void arm_correlate_f32(
/* Accumulator is made zero for every iteration */
sum = 0.0f;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = count >> 2U;
+#if defined(ARM_MATH_NEON)
+ float32x4_t x,y;
+ float32x4_t res = vdupq_n_f32(0) ;
+ float32x2_t accum = vdup_n_f32(0);
+
+ while (k > 0U)
+ {
+ x = vld1q_f32(px);
+ y = vld1q_f32(py);
+
+ res = vmlaq_f32(res,x, y);
+
+ px += 4;
+ py += 4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
+ sum += accum[0] + accum[1];
+
+ k = count & 0x3;
+#else
/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* x[0] * y[srcBLen - 4] */
sum += *px++ * *py++;
+
/* x[1] * y[srcBLen - 3] */
sum += *px++ * *py++;
+
/* x[2] * y[srcBLen - 2] */
sum += *px++ * *py++;
+
/* x[3] * y[srcBLen - 1] */
sum += *px++ * *py++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#endif /* #if defined(ARM_MATH_NEON) */
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
+
while (k > 0U)
{
/* Perform the multiply-accumulate */
/* x[0] * y[srcBLen - 1] */
sum += *px++ * *py++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -278,10 +299,10 @@ void arm_correlate_f32(
py = pSrc1 - count;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -290,7 +311,7 @@ void arm_correlate_f32(
* ------------------------*/
/* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
- * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
+ * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
* ....
* sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
*/
@@ -310,12 +331,25 @@ void arm_correlate_f32(
/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
* So, to loop unroll over blockSize2,
- * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
+ * srcBLen should be greater than or equal to 4 */
if (srcBLen >= 4U)
{
- /* Loop unroll over blockSize2, by 4 */
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize2 >> 2U;
+#if defined(ARM_MATH_NEON)
+ float32x4_t c;
+ float32x4_t x1v;
+ float32x4_t x2v;
+ uint32x4_t x1v_u;
+ uint32x4_t x2v_u;
+ float32x4_t x;
+ uint32x4_t x_u;
+ float32x4_t res = vdupq_n_f32(0) ;
+#endif /* #if defined(ARM_MATH_NEON) */
+
while (blkCnt > 0U)
{
/* Set all accumulators to zero */
@@ -324,10 +358,75 @@ void arm_correlate_f32(
acc2 = 0.0f;
acc3 = 0.0f;
+#if defined(ARM_MATH_NEON)
+ /* Compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ res = vdupq_n_f32(0) ;
+
+ x1v = vld1q_f32(px);
+ px += 4;
+ do
+ {
+ x2v = vld1q_f32(px);
+ c = vld1q_f32(py);
+
+ py += 4;
+
+ x = x1v;
+ res = vmlaq_n_f32(res,x,c[0]);
+
+ x = vextq_f32(x1v,x2v,1);
+
+ res = vmlaq_n_f32(res,x,c[1]);
+
+ x = vextq_f32(x1v,x2v,2);
+
+ res = vmlaq_n_f32(res,x,c[2]);
+
+ x = vextq_f32(x1v,x2v,3);
+
+ res = vmlaq_n_f32(res,x,c[3]);
+
+ x1v = x2v;
+ px+=4;
+ x2v = vld1q_f32(px);
+
+ } while (--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen & 0x3;
+
+ while (k > 0U)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *(py++);
+
+ res = vmlaq_n_f32(res,x1v,c0);
+
+ /* Reuse the present samples for the next MAC */
+ x1v[0] = x1v[1];
+ x1v[1] = x1v[2];
+ x1v[2] = x1v[3];
+
+ x1v[3] = *(px++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ px-=1;
+
+ acc0 = res[0];
+ acc1 = res[1];
+ acc2 = res[2];
+ acc3 = res[3];
+#else
/* read x[0], x[1], x[2] samples */
- x0 = *(px++);
- x1 = *(px++);
- x2 = *(px++);
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
k = srcBLen >> 2U;
@@ -338,7 +437,6 @@ void arm_correlate_f32(
{
/* Read y[0] sample */
c0 = *(py++);
-
/* Read x[3] sample */
x3 = *(px++);
@@ -354,7 +452,6 @@ void arm_correlate_f32(
/* Read y[1] sample */
c0 = *(py++);
-
/* Read x[4] sample */
x0 = *(px++);
@@ -370,11 +467,10 @@ void arm_correlate_f32(
/* Read y[2] sample */
c0 = *(py++);
-
/* Read x[5] sample */
x1 = *(px++);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* acc0 += x[2] * y[2] */
acc0 += x2 * c0;
/* acc1 += x[3] * y[2] */
@@ -386,11 +482,10 @@ void arm_correlate_f32(
/* Read y[3] sample */
c0 = *(py++);
-
/* Read x[6] sample */
x2 = *(px++);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* acc0 += x[3] * y[3] */
acc0 += x3 * c0;
/* acc1 += x[4] * y[3] */
@@ -400,7 +495,6 @@ void arm_correlate_f32(
/* acc3 += x[6] * y[3] */
acc3 += x2 * c0;
-
} while (--k);
/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
@@ -411,11 +505,10 @@ void arm_correlate_f32(
{
/* Read y[4] sample */
c0 = *(py++);
-
/* Read x[7] sample */
x3 = *(px++);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* acc0 += x[4] * y[4] */
acc0 += x0 * c0;
/* acc1 += x[5] * y[4] */
@@ -434,6 +527,8 @@ void arm_correlate_f32(
k--;
}
+#endif /* #if defined(ARM_MATH_NEON) */
+
/* Store the result in the accumulator in the destination buffer. */
*pOut = acc0;
/* Destination pointer is updated according to the address modifier, inc */
@@ -455,39 +550,74 @@ void arm_correlate_f32(
px = pIn1 + count;
py = pIn2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize2 % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
+
while (blkCnt > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.0f;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = srcBLen >> 2U;
+#if defined(ARM_MATH_NEON)
+ float32x4_t x,y;
+ float32x4_t res = vdupq_n_f32(0) ;
+ float32x2_t accum = vdup_n_f32(0);
+
+ while (k > 0U)
+ {
+ x = vld1q_f32(px);
+ y = vld1q_f32(py);
+
+ res = vmlaq_f32(res,x, y);
+
+ px += 4;
+ py += 4;
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
+ sum += accum[0] + accum[1];
+#else
/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
sum += *px++ * *py++;
sum += *px++ * *py++;
sum += *px++ * *py++;
sum += *px++ * *py++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
-
+#endif /* #if defined(ARM_MATH_NEON) */
/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
k = srcBLen % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
while (k > 0U)
{
@@ -500,6 +630,7 @@ void arm_correlate_f32(
/* Store the result in the accumulator in the destination buffer. */
*pOut = sum;
+
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
@@ -554,6 +685,7 @@ void arm_correlate_f32(
}
}
+
/* --------------------------
* Initializations of stage3
* -------------------------*/
@@ -585,37 +717,71 @@ void arm_correlate_f32(
/* Accumulator is made zero for every iteration */
sum = 0.0f;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = count >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+#if defined(ARM_MATH_NEON)
+ float32x4_t x,y;
+ float32x4_t res = vdupq_n_f32(0) ;
+ float32x2_t accum = vdup_n_f32(0);
+
while (k > 0U)
{
- /* Perform the multiply-accumulates */
- /* sum += x[srcALen - srcBLen + 4] * y[3] */
- sum += *px++ * *py++;
- /* sum += x[srcALen - srcBLen + 3] * y[2] */
- sum += *px++ * *py++;
- /* sum += x[srcALen - srcBLen + 2] * y[1] */
- sum += *px++ * *py++;
- /* sum += x[srcALen - srcBLen + 1] * y[0] */
- sum += *px++ * *py++;
+ x = vld1q_f32(px);
+ y = vld1q_f32(py);
+
+ res = vmlaq_f32(res,x, y);
+
+ px += 4;
+ py += 4;
/* Decrement the loop counter */
k--;
}
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
+ sum += accum[0] + accum[1];
+#else
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* sum += x[srcALen - srcBLen + 4] * y[3] */
+ sum += *px++ * *py++;
+
+ /* sum += x[srcALen - srcBLen + 3] * y[2] */
+ sum += *px++ * *py++;
+
+ /* sum += x[srcALen - srcBLen + 2] * y[1] */
+ sum += *px++ * *py++;
+
+ /* sum += x[srcALen - srcBLen + 1] * y[0] */
+ sum += *px++ * *py++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+#endif /* #if defined (ARM_MATH_NEON) */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
+
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
sum += *px++ * *py++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -628,7 +794,7 @@ void arm_correlate_f32(
px = ++pSrc1;
py = pIn2;
- /* Decrement the MAC count */
+ /* Decrement MAC count */
count--;
/* Decrement the loop counter */
@@ -636,15 +802,14 @@ void arm_correlate_f32(
}
#else
+/* alternate version for CM0_FAMILY */
- /* Run the below code for Cortex-M0 */
-
- float32_t *pIn1 = pSrcA; /* inputA pointer */
- float32_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
- float32_t sum; /* Accumulator */
- uint32_t i = 0U, j; /* loop counters */
- uint32_t inv = 0U; /* Reverse order flag */
- uint32_t tot = 0U; /* Length */
+ const float32_t *pIn1 = pSrcA; /* inputA pointer */
+ const float32_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
+ float32_t sum; /* Accumulator */
+ uint32_t i = 0U, j; /* Loop counters */
+ uint32_t inv = 0U; /* Reverse order flag */
+ uint32_t tot = 0U; /* Length */
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -698,7 +863,7 @@ void arm_correlate_f32(
/* Loop to calculate convolution for output length number of times */
for (i = 0U; i <= tot; i++)
{
- /* Initialize sum with zero to carry on MAC operations */
+ /* Initialize sum with zero to carry out MAC operations */
sum = 0.0f;
/* Loop to perform MAC operations according to convolution equation */
@@ -711,6 +876,7 @@ void arm_correlate_f32(
sum += pIn1[j] * pIn2[-((int32_t) i - j)];
}
}
+
/* Store the output in the destination buffer */
if (inv == 1)
*pDst-- = sum;
@@ -718,10 +884,10 @@ void arm_correlate_f32(
*pDst++ = sum;
}
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
}
/**
- * @} end of Corr group
+ @} end of Corr group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c b/DSP/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c
index a1b0dbd..13661cb 100644
--- a/DSP/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c
@@ -3,13 +3,13 @@
* Title: arm_correlate_fast_opt_q15.c
* Description: Fast Q15 Correlation
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,70 +29,61 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup Corr
- * @{
+ @addtogroup Corr
+ @{
*/
/**
- * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
- * @param[in] *pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
- * @return none.
- *
- *
- * \par Restrictions
- * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
- * In this case input, output, scratch buffers should be aligned by 32-bit
- *
- *
- * Scaling and Overflow Behavior:
- *
- * \par
- * This fast version uses a 32-bit accumulator with 2.30 format.
- * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * There is no saturation on intermediate additions.
- * Thus, if the accumulator overflows it wraps around and distorts the result.
- * The input signals should be scaled down to avoid intermediate overflows.
- * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a
- * maximum of min(srcALen, srcBLen) number of additions is carried internally.
- * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
- *
- * \par
- * See arm_correlate_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
+ @brief Correlation of Q15 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence.
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @param[in] pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This fast version uses a 32-bit accumulator with 2.30 format.
+ The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ There is no saturation on intermediate additions.
+ Thus, if the accumulator overflows it wraps around and distorts the result.
+ The input signals should be scaled down to avoid intermediate overflows.
+ Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a
+ maximum of min(srcALen, srcBLen) number of additions is carried internally.
+ The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
+
+ @remark
+ Refer to \ref arm_correlate_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
*/
void arm_correlate_fast_opt_q15(
- q15_t * pSrcA,
- uint32_t srcALen,
- q15_t * pSrcB,
- uint32_t srcBLen,
- q15_t * pDst,
- q15_t * pScratch)
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ q15_t * pScratch)
{
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q31_t acc0, acc1, acc2, acc3; /* Accumulators */
- q15_t *py; /* Intermediate inputB pointer */
- q31_t x1, x2, x3; /* temporary variables for holding input and coefficient values */
- uint32_t j, blkCnt, outBlockSize; /* loop counter */
- int32_t inc = 1; /* Destination address modifier */
- uint32_t tapCnt;
- q31_t y1, y2;
- q15_t *pScr; /* Intermediate pointers */
- q15_t *pOut = pDst; /* output pointer */
-#ifdef UNALIGNED_SUPPORT_DISABLE
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ q31_t acc0; /* Accumulators */
+ q15_t *pOut = pDst; /* Output pointer */
+ q15_t *pScr1 = pScratch; /* Temporary pointer for scratch */
+ const q15_t *py; /* Intermediate inputB pointer */
+ uint32_t j, blkCnt, outBlockSize; /* Loop counter */
+ int32_t inc = 1; /* Destination address modifier */
+ uint32_t tapCnt; /* Loop count */
- q15_t a, b;
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc1, acc2, acc3; /* Accumulators */
+ q31_t x1, x2, x3; /* Temporary variables for holding input and coefficient values */
+ q31_t y1, y2; /* State variables */
+#endif
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -110,10 +101,10 @@ void arm_correlate_fast_opt_q15(
if (srcALen >= srcBLen)
{
/* Initialization of inputA pointer */
- pIn1 = (pSrcA);
+ pIn1 = pSrcA;
/* Initialization of inputB pointer */
- pIn2 = (pSrcB);
+ pIn2 = pSrcB;
/* Number of output samples is calculated */
outBlockSize = (2U * srcALen) - 1U;
@@ -126,15 +117,14 @@ void arm_correlate_fast_opt_q15(
/* Updating the pointer position to non zero value */
pOut += j;
-
}
else
{
/* Initialization of inputA pointer */
- pIn1 = (pSrcB);
+ pIn1 = pSrcB;
/* Initialization of inputB pointer */
- pIn2 = (pSrcA);
+ pIn2 = pSrcA;
/* srcBLen is always considered as shorter or equal to srcALen */
j = srcBLen;
@@ -147,112 +137,45 @@ void arm_correlate_fast_opt_q15(
/* Destination address modifier is set to -1 */
inc = -1;
-
}
- pScr = pScratch;
+ pScr1 = pScratch;
/* Fill (srcBLen - 1U) zeros in scratch buffer */
- arm_fill_q15(0, pScr, (srcBLen - 1U));
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
/* Update temporary scratch pointer */
- pScr += (srcBLen - 1U);
+ pScr1 += (srcBLen - 1U);
-#ifndef UNALIGNED_SUPPORT_DISABLE
/* Copy (srcALen) samples in scratch buffer */
- arm_copy_q15(pIn1, pScr, srcALen);
+ arm_copy_q15(pIn1, pScr1, srcALen);
/* Update pointers */
- pScr += srcALen;
+ pScr1 += srcALen;
-#else
-
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- j = srcALen >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (j > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr++ = *pIn1++;
- *pScr++ = *pIn1++;
- *pScr++ = *pIn1++;
- *pScr++ = *pIn1++;
-
- /* Decrement the loop counter */
- j--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- j = srcALen % 0x4U;
-
- while (j > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr++ = *pIn1++;
-
- /* Decrement the loop counter */
- j--;
- }
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
-#ifndef UNALIGNED_SUPPORT_DISABLE
/* Fill (srcBLen - 1U) zeros at end of scratch buffer */
- arm_fill_q15(0, pScr, (srcBLen - 1U));
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
/* Update pointer */
- pScr += (srcBLen - 1U);
-
-#else
-
-/* Apply loop unrolling and do 4 Copies simultaneously. */
- j = (srcBLen - 1U) >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (j > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr++ = 0;
- *pScr++ = 0;
- *pScr++ = 0;
- *pScr++ = 0;
-
- /* Decrement the loop counter */
- j--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- j = (srcBLen - 1U) % 0x4U;
-
- while (j > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr++ = 0;
-
- /* Decrement the loop counter */
- j--;
- }
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+ pScr1 += (srcBLen - 1U);
/* Temporary pointer for scratch2 */
py = pIn2;
/* Actual correlation process starts here */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (srcALen + srcBLen - 1U) >> 2;
while (blkCnt > 0)
{
/* Initialze temporary scratch pointer as scratch1 */
- pScr = pScratch;
+ pScr1 = pScratch;
/* Clear Accumlators */
acc0 = 0;
@@ -260,41 +183,42 @@ void arm_correlate_fast_opt_q15(
acc2 = 0;
acc3 = 0;
- /* Read four samples from scratch1 buffer */
- x1 = *__SIMD32(pScr)++;
+ /* Read two samples from scratch buffer */
+ x1 = read_q15x2_ia (&pScr1);
- /* Read next four samples from scratch1 buffer */
- x2 = *__SIMD32(pScr)++;
+ /* Read next two samples from scratch buffer */
+ x2 = read_q15x2_ia (&pScr1);
tapCnt = (srcBLen) >> 2U;
while (tapCnt > 0U)
{
-
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
/* Read four samples from smaller buffer */
- y1 = _SIMD32_OFFSET(pIn2);
- y2 = _SIMD32_OFFSET(pIn2 + 2U);
+ y1 = read_q15x2_ia ((q15_t **) &pIn2);
+ y2 = read_q15x2_ia ((q15_t **) &pIn2);
+ /* multiply and accumlate */
acc0 = __SMLAD(x1, y1, acc0);
-
acc2 = __SMLAD(x2, y1, acc2);
+ /* pack input data */
#ifndef ARM_MATH_BIG_ENDIAN
x3 = __PKHBT(x2, x1, 0);
#else
x3 = __PKHBT(x1, x2, 0);
#endif
+ /* multiply and accumlate */
acc1 = __SMLADX(x3, y1, acc1);
- x1 = _SIMD32_OFFSET(pScr);
+ /* Read next two samples from scratch buffer */
+ x1 = read_q15x2_ia (&pScr1);
+ /* multiply and accumlate */
acc0 = __SMLAD(x2, y2, acc0);
-
acc2 = __SMLAD(x1, y2, acc2);
+ /* pack input data */
#ifndef ARM_MATH_BIG_ENDIAN
x3 = __PKHBT(x1, x2, 0);
#else
@@ -302,81 +226,9 @@ void arm_correlate_fast_opt_q15(
#endif
acc3 = __SMLADX(x3, y1, acc3);
-
acc1 = __SMLADX(x3, y2, acc1);
- x2 = _SIMD32_OFFSET(pScr + 2U);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x3 = __PKHBT(x2, x1, 0);
-#else
- x3 = __PKHBT(x1, x2, 0);
-#endif
-
- acc3 = __SMLADX(x3, y2, acc3);
-#else
-
- /* Read four samples from smaller buffer */
- a = *pIn2;
- b = *(pIn2 + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- y1 = __PKHBT(a, b, 16);
-#else
- y1 = __PKHBT(b, a, 16);
-#endif
-
- a = *(pIn2 + 2);
- b = *(pIn2 + 3);
-#ifndef ARM_MATH_BIG_ENDIAN
- y2 = __PKHBT(a, b, 16);
-#else
- y2 = __PKHBT(b, a, 16);
-#endif
-
- acc0 = __SMLAD(x1, y1, acc0);
-
- acc2 = __SMLAD(x2, y1, acc2);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x3 = __PKHBT(x2, x1, 0);
-#else
- x3 = __PKHBT(x1, x2, 0);
-#endif
-
- acc1 = __SMLADX(x3, y1, acc1);
-
- a = *pScr;
- b = *(pScr + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x1 = __PKHBT(a, b, 16);
-#else
- x1 = __PKHBT(b, a, 16);
-#endif
-
- acc0 = __SMLAD(x2, y2, acc0);
-
- acc2 = __SMLAD(x1, y2, acc2);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x3 = __PKHBT(x1, x2, 0);
-#else
- x3 = __PKHBT(x2, x1, 0);
-#endif
-
- acc3 = __SMLADX(x3, y1, acc3);
-
- acc1 = __SMLADX(x3, y2, acc1);
-
- a = *(pScr + 2);
- b = *(pScr + 3);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x2 = __PKHBT(a, b, 16);
-#else
- x2 = __PKHBT(b, a, 16);
-#endif
+ x2 = read_q15x2_ia (&pScr1);
#ifndef ARM_MATH_BIG_ENDIAN
x3 = __PKHBT(x2, x1, 0);
@@ -386,44 +238,32 @@ void arm_correlate_fast_opt_q15(
acc3 = __SMLADX(x3, y2, acc3);
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
- pIn2 += 4U;
-
- pScr += 4U;
-
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
-
-
/* Update scratch pointer for remaining samples of smaller length sequence */
- pScr -= 4U;
-
+ pScr1 -= 4U;
/* apply same above for remaining samples of smaller length sequence */
tapCnt = (srcBLen) & 3U;
while (tapCnt > 0U)
{
-
/* accumlate the results */
- acc0 += (*pScr++ * *pIn2);
- acc1 += (*pScr++ * *pIn2);
- acc2 += (*pScr++ * *pIn2);
- acc3 += (*pScr++ * *pIn2++);
+ acc0 += (*pScr1++ * *pIn2);
+ acc1 += (*pScr1++ * *pIn2);
+ acc2 += (*pScr1++ * *pIn2);
+ acc3 += (*pScr1++ * *pIn2++);
- pScr -= 3U;
+ pScr1 -= 3U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
blkCnt--;
-
/* Store the results in the accumulators in the destination buffer. */
*pOut = (__SSAT(acc0 >> 15U, 16));
pOut += inc;
@@ -434,22 +274,27 @@ void arm_correlate_fast_opt_q15(
*pOut = (__SSAT(acc3 >> 15U, 16));
pOut += inc;
-
/* Initialization of inputB pointer */
pIn2 = py;
pScratch += 4U;
-
}
-
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (srcALen + srcBLen - 1U) & 0x3;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (srcALen + srcBLen - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
/* Calculate correlation for remaining samples of Bigger length sequence */
while (blkCnt > 0)
{
/* Initialze temporary scratch pointer as scratch1 */
- pScr = pScratch;
+ pScr1 = pScratch;
/* Clear Accumlators */
acc0 = 0;
@@ -459,10 +304,11 @@ void arm_correlate_fast_opt_q15(
while (tapCnt > 0U)
{
- acc0 += (*pScr++ * *pIn2++);
- acc0 += (*pScr++ * *pIn2++);
+ /* Read next two samples from scratch buffer */
+ acc0 += (*pScr1++ * *pIn2++);
+ acc0 += (*pScr1++ * *pIn2++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -473,28 +319,27 @@ void arm_correlate_fast_opt_q15(
{
/* accumlate the results */
- acc0 += (*pScr++ * *pIn2++);
+ acc0 += (*pScr1++ * *pIn2++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
blkCnt--;
- /* Store the result in the accumulator in the destination buffer. */
-
+ /* The result is in 2.30 format. Convert to 1.15 with saturation.
+ ** Then store the output in the destination buffer. */
*pOut = (q15_t) (__SSAT((acc0 >> 15), 16));
-
pOut += inc;
/* Initialization of inputB pointer */
pIn2 = py;
pScratch += 1U;
-
}
+
}
/**
- * @} end of Corr group
+ @} end of Corr group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_correlate_fast_q15.c b/DSP/Source/FilteringFunctions/arm_correlate_fast_q15.c
index 383949d..6898618 100644
--- a/DSP/Source/FilteringFunctions/arm_correlate_fast_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_correlate_fast_q15.c
@@ -3,13 +3,13 @@
* Title: arm_correlate_fast_q15.c
* Description: Fast Q15 Correlation
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,58 +29,56 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup Corr
- * @{
+ @addtogroup Corr
+ @{
*/
/**
- * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
- * @return none.
- *
- * Scaling and Overflow Behavior:
- *
- * \par
- * This fast version uses a 32-bit accumulator with 2.30 format.
- * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * There is no saturation on intermediate additions.
- * Thus, if the accumulator overflows it wraps around and distorts the result.
- * The input signals should be scaled down to avoid intermediate overflows.
- * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a
- * maximum of min(srcALen, srcBLen) number of additions is carried internally.
- * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
- *
- * \par
- * See arm_correlate_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
+ @brief Correlation of Q15 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This fast version uses a 32-bit accumulator with 2.30 format.
+ The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ There is no saturation on intermediate additions.
+ Thus, if the accumulator overflows it wraps around and distorts the result.
+ The input signals should be scaled down to avoid intermediate overflows.
+ Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a
+ maximum of min(srcALen, srcBLen) number of additions is carried internally.
+ The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
+
+ @remark
+ Refer to \ref arm_correlate_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
*/
void arm_correlate_fast_q15(
- q15_t * pSrcA,
- uint32_t srcALen,
- q15_t * pSrcB,
- uint32_t srcBLen,
- q15_t * pDst)
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst)
{
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q15_t *pOut = pDst; /* output pointer */
- q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
- q15_t *px; /* Intermediate inputA pointer */
- q15_t *py; /* Intermediate inputB pointer */
- q15_t *pSrc1; /* Intermediate pointers */
- q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
- uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
- int32_t inc = 1; /* Destination address modifier */
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ q15_t *pOut = pDst; /* Output pointer */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ const q15_t *px; /* Intermediate inputA pointer */
+ const q15_t *py; /* Intermediate inputB pointer */
+ const q15_t *pSrc1; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* Temporary variables for holding input and coefficient values */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+ uint32_t outBlockSize;
+ int32_t inc = 1; /* Destination address modifier */
/* The algorithm implementation is based on the lengths of the inputs. */
@@ -99,10 +97,10 @@ void arm_correlate_fast_q15(
if (srcALen >= srcBLen)
{
/* Initialization of inputA pointer */
- pIn1 = (pSrcA);
+ pIn1 = pSrcA;
/* Initialization of inputB pointer */
- pIn2 = (pSrcB);
+ pIn2 = pSrcB;
/* Number of output samples is calculated */
outBlockSize = (2U * srcALen) - 1U;
@@ -120,10 +118,10 @@ void arm_correlate_fast_q15(
else
{
/* Initialization of inputA pointer */
- pIn1 = (pSrcB);
+ pIn1 = pSrcB;
/* Initialization of inputB pointer */
- pIn2 = (pSrcA);
+ pIn2 = pSrcA;
/* srcBLen is always considered as shorter or equal to srcALen */
j = srcBLen;
@@ -140,14 +138,15 @@ void arm_correlate_fast_q15(
}
/* The function is internally
- * divided into three parts according to the number of multiplications that has to be
- * taken place between inputA samples and inputB samples. In the first part of the
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
* algorithm, the multiplications increase by one for every iteration.
- * In the second part of the algorithm, srcBLen number of multiplications are done.
- * In the third part of the algorithm, the multiplications decrease by one
- * for every iteration.*/
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
/* The algorithm is implemented in three stages.
- * The loop counters of each stage is initiated here. */
+ The loop counters of each stage is initiated here. */
blockSize1 = srcBLen - 1U;
blockSize2 = srcALen - (srcBLen - 1U);
blockSize3 = blockSize1;
@@ -184,23 +183,23 @@ void arm_correlate_fast_q15(
sum = 0;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = count >> 2;
+ k = count >> 2U;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
- sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
+ sum = __SMLAD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
/* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
- sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
+ sum = __SMLAD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
/* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ No loop unrolling is used. */
k = count % 0x4U;
while (k > 0U)
@@ -222,10 +221,10 @@ void arm_correlate_fast_q15(
py = pSrc1 - count;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -245,16 +244,16 @@ void arm_correlate_fast_q15(
/* Working pointer of inputB */
py = pIn2;
- /* count is index by which the pointer pIn1 to be incremented */
+ /* count is the index by which the pointer pIn1 to be incremented */
count = 0U;
- /* -------------------
+ /* --------------------
* Stage2 process
- * ------------------*/
+ * -------------------*/
/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
* So, to loop unroll over blockSize2,
- * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
+ * srcBLen should be greater than or equal to 4 */
if (srcBLen >= 4U)
{
/* Loop unroll over blockSize2, by 4 */
@@ -269,9 +268,9 @@ void arm_correlate_fast_q15(
acc3 = 0;
/* read x[0], x[1] samples */
- x0 = *__SIMD32(px);
+ x0 = read_q15x2 ((q15_t *) px);
/* read x[1], x[2] samples */
- x1 = _SIMD32_OFFSET(px + 1);
+ x1 = read_q15x2 ((q15_t *) px + 1);
px += 2U;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
@@ -283,7 +282,7 @@ void arm_correlate_fast_q15(
{
/* Read the first two inputB samples using SIMD:
* y[0] and y[1] */
- c0 = *__SIMD32(py)++;
+ c0 = read_q15x2_ia ((q15_t **) &py);
/* acc0 += x[0] * y[0] + x[1] * y[1] */
acc0 = __SMLAD(x0, c0, acc0);
@@ -292,10 +291,10 @@ void arm_correlate_fast_q15(
acc1 = __SMLAD(x1, c0, acc1);
/* Read x[2], x[3] */
- x2 = *__SIMD32(px);
+ x2 = read_q15x2 ((q15_t *) px);
/* Read x[3], x[4] */
- x3 = _SIMD32_OFFSET(px + 1);
+ x3 = read_q15x2 ((q15_t *) px + 1);
/* acc2 += x[2] * y[0] + x[3] * y[1] */
acc2 = __SMLAD(x2, c0, acc2);
@@ -304,7 +303,7 @@ void arm_correlate_fast_q15(
acc3 = __SMLAD(x3, c0, acc3);
/* Read y[2] and y[3] */
- c0 = *__SIMD32(py)++;
+ c0 = read_q15x2_ia ((q15_t **) &py);
/* acc0 += x[2] * y[2] + x[3] * y[3] */
acc0 = __SMLAD(x2, c0, acc0);
@@ -313,10 +312,10 @@ void arm_correlate_fast_q15(
acc1 = __SMLAD(x3, c0, acc1);
/* Read x[4], x[5] */
- x0 = _SIMD32_OFFSET(px + 2);
+ x0 = read_q15x2 ((q15_t *) px + 2);
/* Read x[5], x[6] */
- x1 = _SIMD32_OFFSET(px + 3);
+ x1 = read_q15x2 ((q15_t *) px + 3);
px += 4U;
/* acc2 += x[4] * y[2] + x[5] * y[3] */
@@ -338,23 +337,20 @@ void arm_correlate_fast_q15(
{
/* Read y[4] */
c0 = *py;
+
#ifdef ARM_MATH_BIG_ENDIAN
-
c0 = c0 << 16U;
-
#else
-
c0 = c0 & 0x0000FFFF;
-
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
/* Read x[7] */
- x3 = *__SIMD32(px);
+ x3 = read_q15x2 ((q15_t *) px);
px++;
/* Perform the multiply-accumulates */
- acc0 = __SMLAD(x0, c0, acc0);
- acc1 = __SMLAD(x1, c0, acc1);
+ acc0 = __SMLAD (x0, c0, acc0);
+ acc1 = __SMLAD (x1, c0, acc1);
acc2 = __SMLADX(x1, c0, acc2);
acc3 = __SMLADX(x3, c0, acc3);
}
@@ -362,13 +358,13 @@ void arm_correlate_fast_q15(
if (k == 2U)
{
/* Read y[4], y[5] */
- c0 = *__SIMD32(py);
+ c0 = read_q15x2 ((q15_t *) py);
/* Read x[7], x[8] */
- x3 = *__SIMD32(px);
+ x3 = read_q15x2 ((q15_t *) px);
/* Read x[9] */
- x2 = _SIMD32_OFFSET(px + 1);
+ x2 = read_q15x2 ((q15_t *) px + 1);
px += 2U;
/* Perform the multiply-accumulates */
@@ -381,13 +377,13 @@ void arm_correlate_fast_q15(
if (k == 3U)
{
/* Read y[4], y[5] */
- c0 = *__SIMD32(py)++;
+ c0 = read_q15x2_ia ((q15_t **) &py);
/* Read x[7], x[8] */
- x3 = *__SIMD32(px);
+ x3 = read_q15x2 ((q15_t *) px);
/* Read x[9] */
- x2 = _SIMD32_OFFSET(px + 1);
+ x2 = read_q15x2 ((q15_t *) px + 1);
/* Perform the multiply-accumulates */
acc0 = __SMLAD(x0, c0, acc0);
@@ -398,20 +394,18 @@ void arm_correlate_fast_q15(
c0 = (*py);
/* Read y[6] */
#ifdef ARM_MATH_BIG_ENDIAN
-
c0 = c0 << 16U;
#else
-
c0 = c0 & 0x0000FFFF;
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
/* Read x[10] */
- x3 = _SIMD32_OFFSET(px + 2);
+ x3 = read_q15x2 ((q15_t *) px + 2);
px += 3U;
/* Perform the multiply-accumulates */
acc0 = __SMLADX(x1, c0, acc0);
- acc1 = __SMLAD(x2, c0, acc1);
+ acc1 = __SMLAD (x2, c0, acc1);
acc2 = __SMLADX(x2, c0, acc2);
acc3 = __SMLADX(x3, c0, acc3);
}
@@ -430,15 +424,14 @@ void arm_correlate_fast_q15(
*pOut = (q15_t) (acc3 >> 15);
pOut += inc;
- /* Increment the pointer pIn1 index, count by 1 */
+ /* Increment the pointer pIn1 index, count by 4 */
count += 4U;
/* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count;
py = pIn2;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -459,12 +452,12 @@ void arm_correlate_fast_q15(
while (k > 0U)
{
/* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py++);
- sum += ((q31_t) * px++ * *py++);
- sum += ((q31_t) * px++ * *py++);
- sum += ((q31_t) * px++ * *py++);
+ sum += ((q31_t) *px++ * *py++);
+ sum += ((q31_t) *px++ * *py++);
+ sum += ((q31_t) *px++ * *py++);
+ sum += ((q31_t) *px++ * *py++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -477,7 +470,7 @@ void arm_correlate_fast_q15(
/* Perform the multiply-accumulates */
sum += ((q31_t) * px++ * *py++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -493,7 +486,7 @@ void arm_correlate_fast_q15(
px = pIn1 + count;
py = pIn2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
@@ -508,15 +501,15 @@ void arm_correlate_fast_q15(
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Loop over srcBLen */
+ /* srcBLen number of MACS should be performed */
k = srcBLen;
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += ((q31_t) * px++ * *py++);
+ sum += ((q31_t) *px++ * *py++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -525,14 +518,14 @@ void arm_correlate_fast_q15(
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count;
py = pIn2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
@@ -577,11 +570,11 @@ void arm_correlate_fast_q15(
{
/* Perform the multiply-accumulates */
/* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
- sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
+ sum = __SMLAD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
/* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
- sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
+ sum = __SMLAD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -594,7 +587,7 @@ void arm_correlate_fast_q15(
/* Perform the multiply-accumulates */
sum = __SMLAD(*px++, *py++, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -614,694 +607,8 @@ void arm_correlate_fast_q15(
blockSize3--;
}
-#else
-
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q15_t *pOut = pDst; /* output pointer */
- q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
- q15_t *px; /* Intermediate inputA pointer */
- q15_t *py; /* Intermediate inputB pointer */
- q15_t *pSrc1; /* Intermediate pointers */
- q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
- uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
- int32_t inc = 1; /* Destination address modifier */
- q15_t a, b;
-
-
- /* The algorithm implementation is based on the lengths of the inputs. */
- /* srcB is always made to slide across srcA. */
- /* So srcBLen is always considered as shorter or equal to srcALen */
- /* But CORR(x, y) is reverse of CORR(y, x) */
- /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
- /* and the destination pointer modifier, inc is set to -1 */
- /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
- /* But to improve the performance,
- * we include zeroes in the output instead of zero padding either of the the inputs*/
- /* If srcALen > srcBLen,
- * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
- /* If srcALen < srcBLen,
- * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
- if (srcALen >= srcBLen)
- {
- /* Initialization of inputA pointer */
- pIn1 = (pSrcA);
-
- /* Initialization of inputB pointer */
- pIn2 = (pSrcB);
-
- /* Number of output samples is calculated */
- outBlockSize = (2U * srcALen) - 1U;
-
- /* When srcALen > srcBLen, zero padding is done to srcB
- * to make their lengths equal.
- * Instead, (outBlockSize - (srcALen + srcBLen - 1))
- * number of output samples are made zero */
- j = outBlockSize - (srcALen + (srcBLen - 1U));
-
- /* Updating the pointer position to non zero value */
- pOut += j;
-
- }
- else
- {
- /* Initialization of inputA pointer */
- pIn1 = (pSrcB);
-
- /* Initialization of inputB pointer */
- pIn2 = (pSrcA);
-
- /* srcBLen is always considered as shorter or equal to srcALen */
- j = srcBLen;
- srcBLen = srcALen;
- srcALen = j;
-
- /* CORR(x, y) = Reverse order(CORR(y, x)) */
- /* Hence set the destination pointer to point to the last output sample */
- pOut = pDst + ((srcALen + srcBLen) - 2U);
-
- /* Destination address modifier is set to -1 */
- inc = -1;
-
- }
-
- /* The function is internally
- * divided into three parts according to the number of multiplications that has to be
- * taken place between inputA samples and inputB samples. In the first part of the
- * algorithm, the multiplications increase by one for every iteration.
- * In the second part of the algorithm, srcBLen number of multiplications are done.
- * In the third part of the algorithm, the multiplications decrease by one
- * for every iteration.*/
- /* The algorithm is implemented in three stages.
- * The loop counters of each stage is initiated here. */
- blockSize1 = srcBLen - 1U;
- blockSize2 = srcALen - (srcBLen - 1U);
- blockSize3 = blockSize1;
-
- /* --------------------------
- * Initializations of stage1
- * -------------------------*/
-
- /* sum = x[0] * y[srcBlen - 1]
- * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
- * ....
- * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
- */
-
- /* In this stage the MAC operations are increased by 1 for every iteration.
- The count variable holds the number of MAC operations performed */
- count = 1U;
-
- /* Working pointer of inputA */
- px = pIn1;
-
- /* Working pointer of inputB */
- pSrc1 = pIn2 + (srcBLen - 1U);
- py = pSrc1;
-
- /* ------------------------
- * Stage1 process
- * ----------------------*/
-
- /* The first loop starts here */
- while (blockSize1 > 0U)
- {
- /* Accumulator is made zero for every iteration */
- sum = 0;
-
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = count >> 2;
-
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
- sum += ((q31_t) * px++ * *py++);
- sum += ((q31_t) * px++ * *py++);
- sum += ((q31_t) * px++ * *py++);
- sum += ((q31_t) * px++ * *py++);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
- k = count % 0x4U;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- /* x[0] * y[srcBLen - 1] */
- sum += ((q31_t) * px++ * *py++);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut = (q15_t) (sum >> 15);
- /* Destination pointer is updated according to the address modifier, inc */
- pOut += inc;
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- py = pSrc1 - count;
- px = pIn1;
-
- /* Increment the MAC count */
- count++;
-
- /* Decrement the loop counter */
- blockSize1--;
- }
-
- /* --------------------------
- * Initializations of stage2
- * ------------------------*/
-
- /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
- * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
- * ....
- * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
- */
-
- /* Working pointer of inputA */
- px = pIn1;
-
- /* Working pointer of inputB */
- py = pIn2;
-
- /* count is index by which the pointer pIn1 to be incremented */
- count = 0U;
-
- /* -------------------
- * Stage2 process
- * ------------------*/
-
- /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
- * So, to loop unroll over blockSize2,
- * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
- if (srcBLen >= 4U)
- {
- /* Loop unroll over blockSize2, by 4 */
- blkCnt = blockSize2 >> 2U;
-
- while (blkCnt > 0U)
- {
- /* Set all accumulators to zero */
- acc0 = 0;
- acc1 = 0;
- acc2 = 0;
- acc3 = 0;
-
- /* read x[0], x[1], x[2] samples */
- a = *px;
- b = *(px + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x0 = __PKHBT(a, b, 16);
- a = *(px + 2);
- x1 = __PKHBT(b, a, 16);
-
-#else
-
- x0 = __PKHBT(b, a, 16);
- a = *(px + 2);
- x1 = __PKHBT(a, b, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- px += 2U;
-
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = srcBLen >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
- do
- {
- /* Read the first two inputB samples using SIMD:
- * y[0] and y[1] */
- a = *py;
- b = *(py + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- c0 = __PKHBT(a, b, 16);
-
-#else
-
- c0 = __PKHBT(b, a, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* acc0 += x[0] * y[0] + x[1] * y[1] */
- acc0 = __SMLAD(x0, c0, acc0);
-
- /* acc1 += x[1] * y[0] + x[2] * y[1] */
- acc1 = __SMLAD(x1, c0, acc1);
-
- /* Read x[2], x[3], x[4] */
- a = *px;
- b = *(px + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x2 = __PKHBT(a, b, 16);
- a = *(px + 2);
- x3 = __PKHBT(b, a, 16);
-
-#else
-
- x2 = __PKHBT(b, a, 16);
- a = *(px + 2);
- x3 = __PKHBT(a, b, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* acc2 += x[2] * y[0] + x[3] * y[1] */
- acc2 = __SMLAD(x2, c0, acc2);
-
- /* acc3 += x[3] * y[0] + x[4] * y[1] */
- acc3 = __SMLAD(x3, c0, acc3);
-
- /* Read y[2] and y[3] */
- a = *(py + 2);
- b = *(py + 3);
-
- py += 4U;
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- c0 = __PKHBT(a, b, 16);
-
-#else
-
- c0 = __PKHBT(b, a, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* acc0 += x[2] * y[2] + x[3] * y[3] */
- acc0 = __SMLAD(x2, c0, acc0);
-
- /* acc1 += x[3] * y[2] + x[4] * y[3] */
- acc1 = __SMLAD(x3, c0, acc1);
-
- /* Read x[4], x[5], x[6] */
- a = *(px + 2);
- b = *(px + 3);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x0 = __PKHBT(a, b, 16);
- a = *(px + 4);
- x1 = __PKHBT(b, a, 16);
-
-#else
-
- x0 = __PKHBT(b, a, 16);
- a = *(px + 4);
- x1 = __PKHBT(a, b, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- px += 4U;
-
- /* acc2 += x[4] * y[2] + x[5] * y[3] */
- acc2 = __SMLAD(x0, c0, acc2);
-
- /* acc3 += x[5] * y[2] + x[6] * y[3] */
- acc3 = __SMLAD(x1, c0, acc3);
-
- } while (--k);
-
- /* For the next MAC operations, SIMD is not used
- * So, the 16 bit pointer if inputB, py is updated */
-
- /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
- k = srcBLen % 0x4U;
-
- if (k == 1U)
- {
- /* Read y[4] */
- c0 = *py;
-#ifdef ARM_MATH_BIG_ENDIAN
-
- c0 = c0 << 16U;
-
-#else
-
- c0 = c0 & 0x0000FFFF;
-
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
-
- /* Read x[7] */
- a = *px;
- b = *(px + 1);
-
- px++;;
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x3 = __PKHBT(a, b, 16);
-
-#else
-
- x3 = __PKHBT(b, a, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- px++;
-
- /* Perform the multiply-accumulates */
- acc0 = __SMLAD(x0, c0, acc0);
- acc1 = __SMLAD(x1, c0, acc1);
- acc2 = __SMLADX(x1, c0, acc2);
- acc3 = __SMLADX(x3, c0, acc3);
- }
-
- if (k == 2U)
- {
- /* Read y[4], y[5] */
- a = *py;
- b = *(py + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- c0 = __PKHBT(a, b, 16);
-
-#else
-
- c0 = __PKHBT(b, a, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Read x[7], x[8], x[9] */
- a = *px;
- b = *(px + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x3 = __PKHBT(a, b, 16);
- a = *(px + 2);
- x2 = __PKHBT(b, a, 16);
-
-#else
-
- x3 = __PKHBT(b, a, 16);
- a = *(px + 2);
- x2 = __PKHBT(a, b, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- px += 2U;
-
- /* Perform the multiply-accumulates */
- acc0 = __SMLAD(x0, c0, acc0);
- acc1 = __SMLAD(x1, c0, acc1);
- acc2 = __SMLAD(x3, c0, acc2);
- acc3 = __SMLAD(x2, c0, acc3);
- }
-
- if (k == 3U)
- {
- /* Read y[4], y[5] */
- a = *py;
- b = *(py + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- c0 = __PKHBT(a, b, 16);
-
-#else
-
- c0 = __PKHBT(b, a, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- py += 2U;
-
- /* Read x[7], x[8], x[9] */
- a = *px;
- b = *(px + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x3 = __PKHBT(a, b, 16);
- a = *(px + 2);
- x2 = __PKHBT(b, a, 16);
-
-#else
-
- x3 = __PKHBT(b, a, 16);
- a = *(px + 2);
- x2 = __PKHBT(a, b, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Perform the multiply-accumulates */
- acc0 = __SMLAD(x0, c0, acc0);
- acc1 = __SMLAD(x1, c0, acc1);
- acc2 = __SMLAD(x3, c0, acc2);
- acc3 = __SMLAD(x2, c0, acc3);
-
- c0 = (*py);
- /* Read y[6] */
-#ifdef ARM_MATH_BIG_ENDIAN
-
- c0 = c0 << 16U;
-#else
-
- c0 = c0 & 0x0000FFFF;
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
-
- /* Read x[10] */
- b = *(px + 3);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- x3 = __PKHBT(a, b, 16);
-
-#else
-
- x3 = __PKHBT(b, a, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- px += 3U;
-
- /* Perform the multiply-accumulates */
- acc0 = __SMLADX(x1, c0, acc0);
- acc1 = __SMLAD(x2, c0, acc1);
- acc2 = __SMLADX(x2, c0, acc2);
- acc3 = __SMLADX(x3, c0, acc3);
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut = (q15_t) (acc0 >> 15);
- /* Destination pointer is updated according to the address modifier, inc */
- pOut += inc;
-
- *pOut = (q15_t) (acc1 >> 15);
- pOut += inc;
-
- *pOut = (q15_t) (acc2 >> 15);
- pOut += inc;
-
- *pOut = (q15_t) (acc3 >> 15);
- pOut += inc;
-
- /* Increment the pointer pIn1 index, count by 1 */
- count += 4U;
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = pIn1 + count;
- py = pIn2;
-
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize2 % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* Accumulator is made zero for every iteration */
- sum = 0;
-
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = srcBLen >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py++);
- sum += ((q31_t) * px++ * *py++);
- sum += ((q31_t) * px++ * *py++);
- sum += ((q31_t) * px++ * *py++);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
- k = srcBLen % 0x4U;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py++);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut = (q15_t) (sum >> 15);
- /* Destination pointer is updated according to the address modifier, inc */
- pOut += inc;
-
- /* Increment the pointer pIn1 index, count by 1 */
- count++;
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = pIn1 + count;
- py = pIn2;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
- }
- else
- {
- /* If the srcBLen is not a multiple of 4,
- * the blockSize2 loop cannot be unrolled by 4 */
- blkCnt = blockSize2;
-
- while (blkCnt > 0U)
- {
- /* Accumulator is made zero for every iteration */
- sum = 0;
-
- /* Loop over srcBLen */
- k = srcBLen;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulate */
- sum += ((q31_t) * px++ * *py++);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut = (q15_t) (sum >> 15);
- /* Destination pointer is updated according to the address modifier, inc */
- pOut += inc;
-
- /* Increment the MAC count */
- count++;
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = pIn1 + count;
- py = pIn2;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
- }
-
- /* --------------------------
- * Initializations of stage3
- * -------------------------*/
-
- /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
- * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
- * ....
- * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
- * sum += x[srcALen-1] * y[0]
- */
-
- /* In this stage the MAC operations are decreased by 1 for every iteration.
- The count variable holds the number of MAC operations performed */
- count = srcBLen - 1U;
-
- /* Working pointer of inputA */
- pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
- px = pSrc1;
-
- /* Working pointer of inputB */
- py = pIn2;
-
- /* -------------------
- * Stage3 process
- * ------------------*/
-
- while (blockSize3 > 0U)
- {
- /* Accumulator is made zero for every iteration */
- sum = 0;
-
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = count >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py++);
- sum += ((q31_t) * px++ * *py++);
- sum += ((q31_t) * px++ * *py++);
- sum += ((q31_t) * px++ * *py++);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
- k = count % 0x4U;
-
- while (k > 0U)
- {
- /* Perform the multiply-accumulates */
- sum += ((q31_t) * px++ * *py++);
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut = (q15_t) (sum >> 15);
- /* Destination pointer is updated according to the address modifier, inc */
- pOut += inc;
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = ++pSrc1;
- py = pIn2;
-
- /* Decrement the MAC count */
- count--;
-
- /* Decrement the loop counter */
- blockSize3--;
- }
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
}
/**
- * @} end of Corr group
+ @} end of Corr group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_correlate_fast_q31.c b/DSP/Source/FilteringFunctions/arm_correlate_fast_q31.c
index 4a006aa..a5840b7 100644
--- a/DSP/Source/FilteringFunctions/arm_correlate_fast_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_correlate_fast_q31.c
@@ -3,13 +3,13 @@
* Title: arm_correlate_fast_q31.c
* Description: Fast Q31 Correlation
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,61 +29,58 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup Corr
- * @{
+ @addtogroup Corr
+ @{
*/
/**
- * @brief Correlation of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
- * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
- * These intermediate results are accumulated in a 32-bit register in 2.30 format.
- * Finally, the accumulator is saturated and converted to a 1.31 result.
- *
- * \par
- * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
- * In order to avoid overflows completely the input signals must be scaled down.
- * The input signals should be scaled down to avoid intermediate overflows.
- * Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a
- * maximum of min(srcALen, srcBLen) number of additions is carried internally.
- *
- * \par
- * See arm_correlate_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
+ @brief Correlation of Q31 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This function is optimized for speed at the expense of fixed-point precision and overflow protection.
+ The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
+ These intermediate results are accumulated in a 32-bit register in 2.30 format.
+ Finally, the accumulator is saturated and converted to a 1.31 result.
+ @par
+ The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
+ In order to avoid overflows completely the input signals must be scaled down.
+ The input signals should be scaled down to avoid intermediate overflows.
+ Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a
+ maximum of min(srcALen, srcBLen) number of additions is carried internally.
+
+ @remark
+ Refer to \ref arm_correlate_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
*/
void arm_correlate_fast_q31(
- q31_t * pSrcA,
- uint32_t srcALen,
- q31_t * pSrcB,
- uint32_t srcBLen,
- q31_t * pDst)
+ const q31_t * pSrcA,
+ uint32_t srcALen,
+ const q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst)
{
- q31_t *pIn1; /* inputA pointer */
- q31_t *pIn2; /* inputB pointer */
- q31_t *pOut = pDst; /* output pointer */
- q31_t *px; /* Intermediate inputA pointer */
- q31_t *py; /* Intermediate inputB pointer */
- q31_t *pSrc1; /* Intermediate pointers */
- q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
- q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
- uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
- int32_t inc = 1; /* Destination address modifier */
-
+ const q31_t *pIn1; /* InputA pointer */
+ const q31_t *pIn2; /* InputB pointer */
+ q31_t *pOut = pDst; /* Output pointer */
+ const q31_t *px; /* Intermediate inputA pointer */
+ const q31_t *py; /* Intermediate inputB pointer */
+ const q31_t *pSrc1; /* Intermediate pointers */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ q31_t x0, x1, x2, x3, c0; /* Temporary variables for holding input and coefficient values */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+ uint32_t outBlockSize;
+ int32_t inc = 1; /* Destination address modifier */
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -91,10 +88,10 @@ void arm_correlate_fast_q31(
if (srcALen >= srcBLen)
{
/* Initialization of inputA pointer */
- pIn1 = (pSrcA);
+ pIn1 = pSrcA;
/* Initialization of inputB pointer */
- pIn2 = (pSrcB);
+ pIn2 = pSrcB;
/* Number of output samples is calculated */
outBlockSize = (2U * srcALen) - 1U;
@@ -112,10 +109,10 @@ void arm_correlate_fast_q31(
else
{
/* Initialization of inputA pointer */
- pIn1 = (pSrcB);
+ pIn1 = pSrcB;
/* Initialization of inputB pointer */
- pIn2 = (pSrcA);
+ pIn2 = pSrcA;
/* srcBLen is always considered as shorter or equal to srcALen */
j = srcBLen;
@@ -132,14 +129,15 @@ void arm_correlate_fast_q31(
}
/* The function is internally
- * divided into three parts according to the number of multiplications that has to be
- * taken place between inputA samples and inputB samples. In the first part of the
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
* algorithm, the multiplications increase by one for every iteration.
- * In the second part of the algorithm, srcBLen number of multiplications are done.
- * In the third part of the algorithm, the multiplications decrease by one
- * for every iteration.*/
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
/* The algorithm is implemented in three stages.
- * The loop counters of each stage is initiated here. */
+ The loop counters of each stage is initiated here. */
blockSize1 = srcBLen - 1U;
blockSize2 = srcALen - (srcBLen - 1U);
blockSize3 = blockSize1;
@@ -176,7 +174,7 @@ void arm_correlate_fast_q31(
sum = 0;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = count >> 2;
+ k = count >> 2U;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */
@@ -184,18 +182,21 @@ void arm_correlate_fast_q31(
{
/* x[0] * y[srcBLen - 4] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py++))) >> 32);
+ ((q63_t) *px++ * (*py++))) >> 32);
+
/* x[1] * y[srcBLen - 3] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py++))) >> 32);
+ ((q63_t) *px++ * (*py++))) >> 32);
+
/* x[2] * y[srcBLen - 2] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py++))) >> 32);
+ ((q63_t) *px++ * (*py++))) >> 32);
+
/* x[3] * y[srcBLen - 1] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py++))) >> 32);
+ ((q63_t) *px++ * (*py++))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -205,12 +206,12 @@ void arm_correlate_fast_q31(
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* x[0] * y[srcBLen - 1] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py++))) >> 32);
+ ((q63_t) *px++ * (*py++))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -223,10 +224,10 @@ void arm_correlate_fast_q31(
py = pSrc1 - count;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -270,9 +271,9 @@ void arm_correlate_fast_q31(
acc3 = 0;
/* read x[0], x[1], x[2] samples */
- x0 = *(px++);
- x1 = *(px++);
- x2 = *(px++);
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
k = srcBLen >> 2U;
@@ -282,10 +283,9 @@ void arm_correlate_fast_q31(
do
{
/* Read y[0] sample */
- c0 = *(py++);
-
+ c0 = *py++;
/* Read x[3] sample */
- x3 = *(px++);
+ x3 = *px++;
/* Perform the multiply-accumulate */
/* acc0 += x[0] * y[0] */
@@ -297,13 +297,13 @@ void arm_correlate_fast_q31(
/* acc3 += x[3] * y[0] */
acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
/* Read y[1] sample */
- c0 = *(py++);
-
+ c0 = *py++;
/* Read x[4] sample */
- x0 = *(px++);
+ x0 = *px++;
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* acc0 += x[1] * y[1] */
acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
/* acc1 += x[2] * y[1] */
@@ -313,11 +313,11 @@ void arm_correlate_fast_q31(
/* acc3 += x[4] * y[1] */
acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
- /* Read y[2] sample */
- c0 = *(py++);
+ /* Read y[2] sample */
+ c0 = *py++;
/* Read x[5] sample */
- x1 = *(px++);
+ x1 = *px++;
/* Perform the multiply-accumulates */
/* acc0 += x[2] * y[2] */
@@ -329,11 +329,11 @@ void arm_correlate_fast_q31(
/* acc3 += x[5] * y[2] */
acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
- /* Read y[3] sample */
- c0 = *(py++);
+ /* Read y[3] sample */
+ c0 = *py++;
/* Read x[6] sample */
- x2 = *(px++);
+ x2 = *px++;
/* Perform the multiply-accumulates */
/* acc0 += x[3] * y[3] */
@@ -355,10 +355,9 @@ void arm_correlate_fast_q31(
while (k > 0U)
{
/* Read y[4] sample */
- c0 = *(py++);
-
+ c0 = *py++;
/* Read x[7] sample */
- x3 = *(px++);
+ x3 = *px++;
/* Perform the multiply-accumulates */
/* acc0 += x[4] * y[4] */
@@ -375,7 +374,7 @@ void arm_correlate_fast_q31(
x1 = x2;
x2 = x3;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -400,8 +399,7 @@ void arm_correlate_fast_q31(
px = pIn1 + count;
py = pIn2;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -423,15 +421,15 @@ void arm_correlate_fast_q31(
{
/* Perform the multiply-accumulates */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py++))) >> 32);
+ ((q63_t) *px++ * (*py++))) >> 32);
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py++))) >> 32);
+ ((q63_t) *px++ * (*py++))) >> 32);
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py++))) >> 32);
+ ((q63_t) *px++ * (*py++))) >> 32);
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py++))) >> 32);
+ ((q63_t) *px++ * (*py++))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -443,9 +441,9 @@ void arm_correlate_fast_q31(
{
/* Perform the multiply-accumulate */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py++))) >> 32);
+ ((q63_t) *px++ * (*py++))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -454,15 +452,14 @@ void arm_correlate_fast_q31(
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count;
py = pIn2;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
@@ -477,16 +474,16 @@ void arm_correlate_fast_q31(
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Loop over srcBLen */
+ /* srcBLen number of MACS should be performed */
k = srcBLen;
while (k > 0U)
{
/* Perform the multiply-accumulate */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py++))) >> 32);
+ ((q63_t) *px++ * (*py++))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -495,18 +492,19 @@ void arm_correlate_fast_q31(
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count;
py = pIn2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
+
/* --------------------------
* Initializations of stage3
* -------------------------*/
@@ -545,21 +543,24 @@ void arm_correlate_fast_q31(
** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* sum += x[srcALen - srcBLen + 4] * y[3] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py++))) >> 32);
+ ((q63_t) *px++ * (*py++))) >> 32);
+
/* sum += x[srcALen - srcBLen + 3] * y[2] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py++))) >> 32);
+ ((q63_t) *px++ * (*py++))) >> 32);
+
/* sum += x[srcALen - srcBLen + 2] * y[1] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py++))) >> 32);
+ ((q63_t) *px++ * (*py++))) >> 32);
+
/* sum += x[srcALen - srcBLen + 1] * y[0] */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py++))) >> 32);
+ ((q63_t) *px++ * (*py++))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -569,11 +570,11 @@ void arm_correlate_fast_q31(
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
sum = (q31_t) ((((q63_t) sum << 32) +
- ((q63_t) * px++ * (*py++))) >> 32);
+ ((q63_t) *px++ * (*py++))) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -586,15 +587,15 @@ void arm_correlate_fast_q31(
px = ++pSrc1;
py = pIn2;
- /* Decrement the MAC count */
+ /* Decrement MAC count */
count--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize3--;
}
}
/**
- * @} end of Corr group
+ @} end of Corr group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_correlate_opt_q15.c b/DSP/Source/FilteringFunctions/arm_correlate_opt_q15.c
index 1eda719..d46d9a0 100644
--- a/DSP/Source/FilteringFunctions/arm_correlate_opt_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_correlate_opt_q15.c
@@ -3,13 +3,13 @@
* Title: arm_correlate_opt_q15.c
* Description: Correlation of Q15 sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,69 +29,58 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup Corr
- * @{
+ @addtogroup Corr
+ @{
*/
/**
- * @brief Correlation of Q15 sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
- * @param[in] *pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
- * @return none.
- *
- * \par Restrictions
- * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
- * In this case input, output, scratch buffers should be aligned by 32-bit
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
- * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
- * This approach provides 33 guard bits and there is no risk of overflow.
- * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
- *
- * \par
- * Refer to arm_correlate_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
- *
- *
- */
+ @brief Correlation of Q15 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @param[in] pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ @return none
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ Both inputs are in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ This approach provides 33 guard bits and there is no risk of overflow.
+ The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
+
+ @remark
+ Refer to \ref arm_correlate_fast_q15() for a faster but less precise version of this function.
+ */
void arm_correlate_opt_q15(
- q15_t * pSrcA,
- uint32_t srcALen,
- q15_t * pSrcB,
- uint32_t srcBLen,
- q15_t * pDst,
- q15_t * pScratch)
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ q15_t * pScratch)
{
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q63_t acc0, acc1, acc2, acc3; /* Accumulators */
- q15_t *py; /* Intermediate inputB pointer */
- q31_t x1, x2, x3; /* temporary variables for holding input1 and input2 values */
- uint32_t j, blkCnt, outBlockSize; /* loop counter */
- int32_t inc = 1; /* output pointer increment */
- uint32_t tapCnt;
- q31_t y1, y2;
- q15_t *pScr; /* Intermediate pointers */
- q15_t *pOut = pDst; /* output pointer */
-#ifdef UNALIGNED_SUPPORT_DISABLE
+ q63_t acc0; /* Accumulators */
+ q15_t *pOut = pDst; /* Output pointer */
+ q15_t *pScr1; /* Temporary pointer for scratch1 */
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ const q15_t *py; /* Intermediate inputB pointer */
+ uint32_t j, blkCnt, outBlockSize; /* Loop counter */
+ int32_t inc = 1; /* Output pointer increment */
+ uint32_t tapCnt;
- q15_t a, b;
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc1, acc2, acc3; /* Accumulators */
+ q31_t x1, x2, x3; /* Temporary variables for holding input1 and input2 values */
+ q31_t y1, y2; /* State variables */
+#endif
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -109,13 +98,13 @@ void arm_correlate_opt_q15(
if (srcALen >= srcBLen)
{
/* Initialization of inputA pointer */
- pIn1 = (pSrcA);
+ pIn1 = pSrcA;
/* Initialization of inputB pointer */
- pIn2 = (pSrcB);
+ pIn2 = pSrcB;
/* Number of output samples is calculated */
- outBlockSize = (2U * srcALen) - 1U;
+ outBlockSize = (srcALen * 2U) - 1U;
/* When srcALen > srcBLen, zero padding is done to srcB
* to make their lengths equal.
@@ -125,15 +114,14 @@ void arm_correlate_opt_q15(
/* Updating the pointer position to non zero value */
pOut += j;
-
}
else
{
/* Initialization of inputA pointer */
- pIn1 = (pSrcB);
+ pIn1 = pSrcB;
/* Initialization of inputB pointer */
- pIn2 = (pSrcA);
+ pIn2 = pSrcA;
/* srcBLen is always considered as shorter or equal to srcALen */
j = srcBLen;
@@ -146,113 +134,43 @@ void arm_correlate_opt_q15(
/* Destination address modifier is set to -1 */
inc = -1;
-
}
- pScr = pScratch;
+ pScr1 = pScratch;
/* Fill (srcBLen - 1U) zeros in scratch buffer */
- arm_fill_q15(0, pScr, (srcBLen - 1U));
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
/* Update temporary scratch pointer */
- pScr += (srcBLen - 1U);
-
-#ifndef UNALIGNED_SUPPORT_DISABLE
+ pScr1 += (srcBLen - 1U);
/* Copy (srcALen) samples in scratch buffer */
- arm_copy_q15(pIn1, pScr, srcALen);
+ arm_copy_q15(pIn1, pScr1, srcALen);
/* Update pointers */
- //pIn1 += srcALen;
- pScr += srcALen;
+ pScr1 += srcALen;
-#else
-
- /* Apply loop unrolling and do 4 Copies simultaneously. */
- j = srcALen >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (j > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr++ = *pIn1++;
- *pScr++ = *pIn1++;
- *pScr++ = *pIn1++;
- *pScr++ = *pIn1++;
-
- /* Decrement the loop counter */
- j--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- j = srcALen % 0x4U;
-
- while (j > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr++ = *pIn1++;
-
- /* Decrement the loop counter */
- j--;
- }
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
-#ifndef UNALIGNED_SUPPORT_DISABLE
/* Fill (srcBLen - 1U) zeros at end of scratch buffer */
- arm_fill_q15(0, pScr, (srcBLen - 1U));
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
/* Update pointer */
- pScr += (srcBLen - 1U);
-
-#else
-
-/* Apply loop unrolling and do 4 Copies simultaneously. */
- j = (srcBLen - 1U) >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (j > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr++ = 0;
- *pScr++ = 0;
- *pScr++ = 0;
- *pScr++ = 0;
-
- /* Decrement the loop counter */
- j--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- j = (srcBLen - 1U) % 0x4U;
-
- while (j > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr++ = 0;
-
- /* Decrement the loop counter */
- j--;
- }
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+ pScr1 += (srcBLen - 1U);
/* Temporary pointer for scratch2 */
py = pIn2;
/* Actual correlation process starts here */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (srcALen + srcBLen - 1U) >> 2;
while (blkCnt > 0)
{
/* Initialze temporary scratch pointer as scratch1 */
- pScr = pScratch;
+ pScr1 = pScratch;
/* Clear Accumlators */
acc0 = 0;
@@ -260,41 +178,42 @@ void arm_correlate_opt_q15(
acc2 = 0;
acc3 = 0;
- /* Read four samples from scratch1 buffer */
- x1 = *__SIMD32(pScr)++;
+ /* Read two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
- /* Read next four samples from scratch1 buffer */
- x2 = *__SIMD32(pScr)++;
+ /* Read next two samples from scratch1 buffer */
+ x2 = read_q15x2_ia (&pScr1);
tapCnt = (srcBLen) >> 2U;
while (tapCnt > 0U)
{
-
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
/* Read four samples from smaller buffer */
- y1 = _SIMD32_OFFSET(pIn2);
- y2 = _SIMD32_OFFSET(pIn2 + 2U);
+ y1 = read_q15x2_ia ((q15_t **) &pIn2);
+ y2 = read_q15x2_ia ((q15_t **) &pIn2);
+ /* multiply and accumlate */
acc0 = __SMLALD(x1, y1, acc0);
-
acc2 = __SMLALD(x2, y1, acc2);
+ /* pack input data */
#ifndef ARM_MATH_BIG_ENDIAN
x3 = __PKHBT(x2, x1, 0);
#else
x3 = __PKHBT(x1, x2, 0);
#endif
+ /* multiply and accumlate */
acc1 = __SMLALDX(x3, y1, acc1);
- x1 = _SIMD32_OFFSET(pScr);
+ /* Read next two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
+ /* multiply and accumlate */
acc0 = __SMLALD(x2, y2, acc0);
-
acc2 = __SMLALD(x1, y2, acc2);
+ /* pack input data */
#ifndef ARM_MATH_BIG_ENDIAN
x3 = __PKHBT(x1, x2, 0);
#else
@@ -302,10 +221,9 @@ void arm_correlate_opt_q15(
#endif
acc3 = __SMLALDX(x3, y1, acc3);
-
acc1 = __SMLALDX(x3, y2, acc1);
- x2 = _SIMD32_OFFSET(pScr + 2U);
+ x2 = read_q15x2_ia (&pScr1);
#ifndef ARM_MATH_BIG_ENDIAN
x3 = __PKHBT(x2, x1, 0);
@@ -315,110 +233,27 @@ void arm_correlate_opt_q15(
acc3 = __SMLALDX(x3, y2, acc3);
-#else
-
- /* Read four samples from smaller buffer */
- a = *pIn2;
- b = *(pIn2 + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- y1 = __PKHBT(a, b, 16);
-#else
- y1 = __PKHBT(b, a, 16);
-#endif
-
- a = *(pIn2 + 2);
- b = *(pIn2 + 3);
-#ifndef ARM_MATH_BIG_ENDIAN
- y2 = __PKHBT(a, b, 16);
-#else
- y2 = __PKHBT(b, a, 16);
-#endif
-
- acc0 = __SMLALD(x1, y1, acc0);
-
- acc2 = __SMLALD(x2, y1, acc2);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x3 = __PKHBT(x2, x1, 0);
-#else
- x3 = __PKHBT(x1, x2, 0);
-#endif
-
- acc1 = __SMLALDX(x3, y1, acc1);
-
- a = *pScr;
- b = *(pScr + 1);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x1 = __PKHBT(a, b, 16);
-#else
- x1 = __PKHBT(b, a, 16);
-#endif
-
- acc0 = __SMLALD(x2, y2, acc0);
-
- acc2 = __SMLALD(x1, y2, acc2);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x3 = __PKHBT(x1, x2, 0);
-#else
- x3 = __PKHBT(x2, x1, 0);
-#endif
-
- acc3 = __SMLALDX(x3, y1, acc3);
-
- acc1 = __SMLALDX(x3, y2, acc1);
-
- a = *(pScr + 2);
- b = *(pScr + 3);
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x2 = __PKHBT(a, b, 16);
-#else
- x2 = __PKHBT(b, a, 16);
-#endif
-
-#ifndef ARM_MATH_BIG_ENDIAN
- x3 = __PKHBT(x2, x1, 0);
-#else
- x3 = __PKHBT(x1, x2, 0);
-#endif
-
- acc3 = __SMLALDX(x3, y2, acc3);
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
- pIn2 += 4U;
-
- pScr += 4U;
-
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
-
-
/* Update scratch pointer for remaining samples of smaller length sequence */
- pScr -= 4U;
-
+ pScr1 -= 4U;
/* apply same above for remaining samples of smaller length sequence */
tapCnt = (srcBLen) & 3U;
while (tapCnt > 0U)
{
-
/* accumlate the results */
- acc0 += (*pScr++ * *pIn2);
- acc1 += (*pScr++ * *pIn2);
- acc2 += (*pScr++ * *pIn2);
- acc3 += (*pScr++ * *pIn2++);
+ acc0 += (*pScr1++ * *pIn2);
+ acc1 += (*pScr1++ * *pIn2);
+ acc2 += (*pScr1++ * *pIn2);
+ acc3 += (*pScr1++ * *pIn2++);
- pScr -= 3U;
+ pScr1 -= 3U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -439,17 +274,24 @@ void arm_correlate_opt_q15(
pIn2 = py;
pScratch += 4U;
-
}
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (srcALen + srcBLen - 1U) & 0x3;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (srcALen + srcBLen - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
/* Calculate correlation for remaining samples of Bigger length sequence */
while (blkCnt > 0)
{
/* Initialze temporary scratch pointer as scratch1 */
- pScr = pScratch;
+ pScr1 = pScratch;
/* Clear Accumlators */
acc0 = 0;
@@ -459,10 +301,11 @@ void arm_correlate_opt_q15(
while (tapCnt > 0U)
{
- acc0 += (*pScr++ * *pIn2++);
- acc0 += (*pScr++ * *pIn2++);
+ /* Read next two samples from scratch1 buffer */
+ acc0 += (*pScr1++ * *pIn2++);
+ acc0 += (*pScr1++ * *pIn2++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -471,31 +314,28 @@ void arm_correlate_opt_q15(
/* apply same above for remaining samples of smaller length sequence */
while (tapCnt > 0U)
{
-
/* accumlate the results */
- acc0 += (*pScr++ * *pIn2++);
+ acc0 += (*pScr1++ * *pIn2++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
blkCnt--;
- /* Store the result in the accumulator in the destination buffer. */
+ /* The result is in 2.30 format. Convert to 1.15 with saturation.
+ Then store the output in the destination buffer. */
*pOut = (q15_t) (__SSAT((acc0 >> 15), 16));
-
pOut += inc;
/* Initialization of inputB pointer */
pIn2 = py;
pScratch += 1U;
-
}
-
}
/**
- * @} end of Corr group
+ @} end of Corr group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_correlate_opt_q7.c b/DSP/Source/FilteringFunctions/arm_correlate_opt_q7.c
index d4ff45e..035bfba 100644
--- a/DSP/Source/FilteringFunctions/arm_correlate_opt_q7.c
+++ b/DSP/Source/FilteringFunctions/arm_correlate_opt_q7.c
@@ -3,13 +3,13 @@
* Title: arm_correlate_opt_q7.c
* Description: Correlation of Q7 sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,67 +29,53 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup Corr
- * @{
+ @addtogroup Corr
+ @{
*/
/**
- * @brief Correlation of Q7 sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
- * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
- * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
- * @return none.
- *
- *
- * \par Restrictions
- * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
- * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using a 32-bit internal accumulator.
- * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
- * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
- * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072.
- * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
- *
- *
+ @brief Correlation of Q7 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
+ The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072.
+ The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
*/
-
-
void arm_correlate_opt_q7(
- q7_t * pSrcA,
- uint32_t srcALen,
- q7_t * pSrcB,
- uint32_t srcBLen,
- q7_t * pDst,
- q15_t * pScratch1,
- q15_t * pScratch2)
+ const q7_t * pSrcA,
+ uint32_t srcALen,
+ const q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
{
- q7_t *pOut = pDst; /* output pointer */
- q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch */
- q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch */
- q7_t *pIn1; /* inputA pointer */
- q7_t *pIn2; /* inputB pointer */
- q15_t *py; /* Intermediate inputB pointer */
- q31_t acc0, acc1, acc2, acc3; /* Accumulators */
- uint32_t j, k = 0U, blkCnt; /* loop counter */
- int32_t inc = 1; /* output pointer increment */
- uint32_t outBlockSize; /* loop counter */
- q15_t x4; /* Temporary input variable */
- uint32_t tapCnt; /* loop counter */
- q31_t x1, x2, x3, y1; /* Temporary input variables */
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch */
+ q15_t x4; /* Temporary input variable */
+ q15_t *py; /* Temporary input2 pointer */
+ q31_t acc0, acc1, acc2, acc3; /* Accumulators */
+ const q7_t *pIn1, *pIn2; /* InputA and inputB pointer */
+ uint32_t j, k, blkCnt, tapCnt; /* Loop counter */
+ int32_t inc = 1; /* Output pointer increment */
+ uint32_t outBlockSize; /* Loop counter */
+ q31_t x1, x2, x3, y1; /* Temporary input variables */
+ q7_t *pOut = pDst; /* Output pointer */
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -107,13 +93,13 @@ void arm_correlate_opt_q7(
if (srcALen >= srcBLen)
{
/* Initialization of inputA pointer */
- pIn1 = (pSrcA);
+ pIn1 = pSrcA;
/* Initialization of inputB pointer */
- pIn2 = (pSrcB);
+ pIn2 = pSrcB;
/* Number of output samples is calculated */
- outBlockSize = (2U * srcALen) - 1U;
+ outBlockSize = (srcALen * 2U) - 1U;
/* When srcALen > srcBLen, zero padding is done to srcB
* to make their lengths equal.
@@ -123,15 +109,14 @@ void arm_correlate_opt_q7(
/* Updating the pointer position to non zero value */
pOut += j;
-
}
else
{
/* Initialization of inputA pointer */
- pIn1 = (pSrcB);
+ pIn1 = pSrcB;
/* Initialization of inputB pointer */
- pIn2 = (pSrcA);
+ pIn2 = pSrcA;
/* srcBLen is always considered as shorter or equal to srcALen */
j = srcBLen;
@@ -144,7 +129,6 @@ void arm_correlate_opt_q7(
/* Destination address modifier is set to -1 */
inc = -1;
-
}
@@ -152,34 +136,34 @@ void arm_correlate_opt_q7(
k = srcBLen >> 2U;
/* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
+ a second loop below copies for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* copy second buffer in reversal manner */
- x4 = (q15_t) * pIn2++;
+ x4 = (q15_t) *pIn2++;
*pScr2++ = x4;
- x4 = (q15_t) * pIn2++;
+ x4 = (q15_t) *pIn2++;
*pScr2++ = x4;
- x4 = (q15_t) * pIn2++;
+ x4 = (q15_t) *pIn2++;
*pScr2++ = x4;
- x4 = (q15_t) * pIn2++;
+ x4 = (q15_t) *pIn2++;
*pScr2++ = x4;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
/* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
+ No loop unrolling is used. */
k = srcBLen % 0x4U;
while (k > 0U)
{
/* copy second buffer in reversal manner for remaining samples */
- x4 = (q15_t) * pIn2++;
+ x4 = (q15_t) *pIn2++;
*pScr2++ = x4;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -190,28 +174,29 @@ void arm_correlate_opt_q7(
pScr1 += (srcBLen - 1U);
/* Copy (srcALen) samples in scratch buffer */
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
k = srcALen >> 2U;
/* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
+ a second loop below copies for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* copy second buffer in reversal manner */
- x4 = (q15_t) * pIn1++;
+ x4 = (q15_t) *pIn1++;
*pScr1++ = x4;
- x4 = (q15_t) * pIn1++;
+ x4 = (q15_t) *pIn1++;
*pScr1++ = x4;
- x4 = (q15_t) * pIn1++;
+ x4 = (q15_t) *pIn1++;
*pScr1++ = x4;
- x4 = (q15_t) * pIn1++;
+ x4 = (q15_t) *pIn1++;
*pScr1++ = x4;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
/* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
+ No loop unrolling is used. */
k = srcALen % 0x4U;
while (k > 0U)
@@ -224,49 +209,13 @@ void arm_correlate_opt_q7(
k--;
}
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
/* Fill (srcBLen - 1U) zeros at end of scratch buffer */
arm_fill_q15(0, pScr1, (srcBLen - 1U));
/* Update pointer */
pScr1 += (srcBLen - 1U);
-#else
-
-/* Apply loop unrolling and do 4 Copies simultaneously. */
- k = (srcBLen - 1U) >> 2U;
-
- /* First part of the processing with loop unrolling copies 4 data points at a time.
- ** a second loop below copies for the remaining 1 to 3 samples. */
- while (k > 0U)
- {
- /* copy second buffer in reversal manner */
- *pScr1++ = 0;
- *pScr1++ = 0;
- *pScr1++ = 0;
- *pScr1++ = 0;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* If the count is not a multiple of 4, copy remaining samples here.
- ** No loop unrolling is used. */
- k = (srcBLen - 1U) % 0x4U;
-
- while (k > 0U)
- {
- /* copy second buffer in reversal manner for remaining samples */
- *pScr1++ = 0;
-
- /* Decrement the loop counter */
- k--;
- }
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
- /* Temporary pointer for second sequence */
+ /* Temporary pointer for scratch2 */
py = pScratch2;
/* Initialization of pScr2 pointer */
@@ -287,18 +236,17 @@ void arm_correlate_opt_q7(
acc3 = 0;
/* Read two samples from scratch1 buffer */
- x1 = *__SIMD32(pScr1)++;
+ x1 = read_q15x2_ia (&pScr1);
/* Read next two samples from scratch1 buffer */
- x2 = *__SIMD32(pScr1)++;
+ x2 = read_q15x2_ia (&pScr1);
tapCnt = (srcBLen) >> 2U;
while (tapCnt > 0U)
{
-
/* Read four samples from smaller buffer */
- y1 = _SIMD32_OFFSET(pScr2);
+ y1 = read_q15x2_ia (&pScr2);
/* multiply and accumlate */
acc0 = __SMLAD(x1, y1, acc0);
@@ -315,7 +263,7 @@ void arm_correlate_opt_q7(
acc1 = __SMLADX(x3, y1, acc1);
/* Read next two samples from scratch1 buffer */
- x1 = *__SIMD32(pScr1)++;
+ x1 = read_q15x2_ia (&pScr1);
/* pack input data */
#ifndef ARM_MATH_BIG_ENDIAN
@@ -327,7 +275,7 @@ void arm_correlate_opt_q7(
acc3 = __SMLADX(x3, y1, acc3);
/* Read four samples from smaller buffer */
- y1 = _SIMD32_OFFSET(pScr2 + 2U);
+ y1 = read_q15x2_ia (&pScr2);
acc0 = __SMLAD(x2, y1, acc0);
@@ -335,7 +283,7 @@ void arm_correlate_opt_q7(
acc1 = __SMLADX(x3, y1, acc1);
- x2 = *__SIMD32(pScr1)++;
+ x2 = read_q15x2_ia (&pScr1);
#ifndef ARM_MATH_BIG_ENDIAN
x3 = __PKHBT(x2, x1, 0);
@@ -345,25 +293,18 @@ void arm_correlate_opt_q7(
acc3 = __SMLADX(x3, y1, acc3);
- pScr2 += 4U;
-
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
-
-
/* Update scratch pointer for remaining samples of smaller length sequence */
pScr1 -= 4U;
-
/* apply same above for remaining samples of smaller length sequence */
tapCnt = (srcBLen) & 3U;
while (tapCnt > 0U)
{
-
/* accumlate the results */
acc0 += (*pScr1++ * *pScr2);
acc1 += (*pScr1++ * *pScr2);
@@ -372,7 +313,7 @@ void arm_correlate_opt_q7(
pScr1 -= 3U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -392,10 +333,8 @@ void arm_correlate_opt_q7(
pScr2 = py;
pScratch1 += 4U;
-
}
-
blkCnt = (srcALen + srcBLen - 1U) & 0x3;
/* Calculate correlation for remaining samples of Bigger length sequence */
@@ -414,7 +353,7 @@ void arm_correlate_opt_q7(
acc0 += (*pScr1++ * *pScr2++);
acc0 += (*pScr1++ * *pScr2++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -423,11 +362,10 @@ void arm_correlate_opt_q7(
/* apply same above for remaining samples of smaller length sequence */
while (tapCnt > 0U)
{
-
/* accumlate the results */
acc0 += (*pScr1++ * *pScr2++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -435,18 +373,16 @@ void arm_correlate_opt_q7(
/* Store the result in the accumulator in the destination buffer. */
*pOut = (q7_t) (__SSAT(acc0 >> 7U, 8));
-
pOut += inc;
/* Initialization of inputB pointer */
pScr2 = py;
pScratch1 += 1U;
-
}
}
/**
- * @} end of Corr group
+ @} end of Corr group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_correlate_q15.c b/DSP/Source/FilteringFunctions/arm_correlate_q15.c
index ce86db4..9837875 100644
--- a/DSP/Source/FilteringFunctions/arm_correlate_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_correlate_q15.c
@@ -3,13 +3,13 @@
* Title: arm_correlate_q15.c
* Description: Correlation of Q15 sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,64 +29,58 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup Corr
- * @{
+ @addtogroup Corr
+ @{
*/
/**
- * @brief Correlation of Q15 sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
- * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
- * This approach provides 33 guard bits and there is no risk of overflow.
- * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
- *
- * \par
- * Refer to arm_correlate_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
- *
- * \par
- * Refer the function arm_correlate_opt_q15() for a faster implementation of this function using scratch buffers.
- *
+ @brief Correlation of Q15 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ Both inputs are in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ This approach provides 33 guard bits and there is no risk of overflow.
+ The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
+
+ @remark
+ Refer to \ref arm_correlate_fast_q15() for a faster but less precise version of this function.
+ @remark
+ Refer to \ref arm_correlate_opt_q15() for a faster implementation of this function using scratch buffers.
*/
void arm_correlate_q15(
- q15_t * pSrcA,
- uint32_t srcALen,
- q15_t * pSrcB,
- uint32_t srcBLen,
- q15_t * pDst)
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst)
{
-#if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q15_t *pIn1; /* inputA pointer */
- q15_t *pIn2; /* inputB pointer */
- q15_t *pOut = pDst; /* output pointer */
- q63_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
- q15_t *px; /* Intermediate inputA pointer */
- q15_t *py; /* Intermediate inputB pointer */
- q15_t *pSrc1; /* Intermediate pointers */
- q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
- uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
- int32_t inc = 1; /* Destination address modifier */
+#if defined (ARM_MATH_DSP)
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ q15_t *pOut = pDst; /* Output pointer */
+ q63_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ const q15_t *px; /* Intermediate inputA pointer */
+ const q15_t *py; /* Intermediate inputB pointer */
+ const q15_t *pSrc1; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* Temporary input variables for holding input and coefficient values */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+ uint32_t outBlockSize;
+ int32_t inc = 1; /* Destination address modifier */
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -104,13 +98,13 @@ void arm_correlate_q15(
if (srcALen >= srcBLen)
{
/* Initialization of inputA pointer */
- pIn1 = (pSrcA);
+ pIn1 = pSrcA;
/* Initialization of inputB pointer */
- pIn2 = (pSrcB);
+ pIn2 = pSrcB;
/* Number of output samples is calculated */
- outBlockSize = (2U * srcALen) - 1U;
+ outBlockSize = (srcALen * 2U) - 1U;
/* When srcALen > srcBLen, zero padding is done to srcB
* to make their lengths equal.
@@ -120,15 +114,14 @@ void arm_correlate_q15(
/* Updating the pointer position to non zero value */
pOut += j;
-
}
else
{
/* Initialization of inputA pointer */
- pIn1 = (pSrcB);
+ pIn1 = pSrcB;
/* Initialization of inputB pointer */
- pIn2 = (pSrcA);
+ pIn2 = pSrcA;
/* srcBLen is always considered as shorter or equal to srcALen */
j = srcBLen;
@@ -141,18 +134,18 @@ void arm_correlate_q15(
/* Destination address modifier is set to -1 */
inc = -1;
-
}
/* The function is internally
- * divided into three parts according to the number of multiplications that has to be
- * taken place between inputA samples and inputB samples. In the first part of the
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
* algorithm, the multiplications increase by one for every iteration.
- * In the second part of the algorithm, srcBLen number of multiplications are done.
- * In the third part of the algorithm, the multiplications decrease by one
- * for every iteration.*/
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
/* The algorithm is implemented in three stages.
- * The loop counters of each stage is initiated here. */
+ The loop counters of each stage is initiated here. */
blockSize1 = srcBLen - 1U;
blockSize2 = srcALen - (srcBLen - 1U);
blockSize3 = blockSize1;
@@ -189,18 +182,19 @@ void arm_correlate_q15(
sum = 0;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = count >> 2;
+ k = count >> 2U;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
+ /* Perform the multiply-accumulate */
/* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
- sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
+ sum = __SMLALD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
/* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
- sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
+ sum = __SMLALD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -210,11 +204,11 @@ void arm_correlate_q15(
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* x[0] * y[srcBLen - 1] */
sum = __SMLALD(*px++, *py++, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -227,10 +221,10 @@ void arm_correlate_q15(
py = pSrc1 - count;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -250,7 +244,7 @@ void arm_correlate_q15(
/* Working pointer of inputB */
py = pIn2;
- /* count is index by which the pointer pIn1 to be incremented */
+ /* count is the index by which the pointer pIn1 to be incremented */
count = 0U;
/* -------------------
@@ -259,10 +253,10 @@ void arm_correlate_q15(
/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
* So, to loop unroll over blockSize2,
- * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
+ * srcBLen should be greater than or equal to 4 */
if (srcBLen >= 4U)
{
- /* Loop unroll over blockSize2, by 4 */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize2 >> 2U;
while (blkCnt > 0U)
@@ -274,9 +268,10 @@ void arm_correlate_q15(
acc3 = 0;
/* read x[0], x[1] samples */
- x0 = *__SIMD32(px);
+ x0 = read_q15x2 ((q15_t *) px);
+
/* read x[1], x[2] samples */
- x1 = _SIMD32_OFFSET(px + 1);
+ x1 = read_q15x2 ((q15_t *) px + 1);
px += 2U;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
@@ -288,7 +283,7 @@ void arm_correlate_q15(
{
/* Read the first two inputB samples using SIMD:
* y[0] and y[1] */
- c0 = *__SIMD32(py)++;
+ c0 = read_q15x2_ia ((q15_t **) &py);
/* acc0 += x[0] * y[0] + x[1] * y[1] */
acc0 = __SMLALD(x0, c0, acc0);
@@ -297,10 +292,10 @@ void arm_correlate_q15(
acc1 = __SMLALD(x1, c0, acc1);
/* Read x[2], x[3] */
- x2 = *__SIMD32(px);
+ x2 = read_q15x2 ((q15_t *) px);
/* Read x[3], x[4] */
- x3 = _SIMD32_OFFSET(px + 1);
+ x3 = read_q15x2 ((q15_t *) px + 1);
/* acc2 += x[2] * y[0] + x[3] * y[1] */
acc2 = __SMLALD(x2, c0, acc2);
@@ -309,7 +304,7 @@ void arm_correlate_q15(
acc3 = __SMLALD(x3, c0, acc3);
/* Read y[2] and y[3] */
- c0 = *__SIMD32(py)++;
+ c0 = read_q15x2_ia ((q15_t **) &py);
/* acc0 += x[2] * y[2] + x[3] * y[3] */
acc0 = __SMLALD(x2, c0, acc0);
@@ -318,11 +313,10 @@ void arm_correlate_q15(
acc1 = __SMLALD(x3, c0, acc1);
/* Read x[4], x[5] */
- x0 = _SIMD32_OFFSET(px + 2);
+ x0 = read_q15x2 ((q15_t *) px + 2);
/* Read x[5], x[6] */
- x1 = _SIMD32_OFFSET(px + 3);
-
+ x1 = read_q15x2 ((q15_t *) px + 3);
px += 4U;
/* acc2 += x[4] * y[2] + x[5] * y[3] */
@@ -342,21 +336,18 @@ void arm_correlate_q15(
/* Read y[4] */
c0 = *py;
#ifdef ARM_MATH_BIG_ENDIAN
-
c0 = c0 << 16U;
-
#else
-
c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
/* Read x[7] */
- x3 = *__SIMD32(px);
+ x3 = read_q15x2 ((q15_t *) px);
px++;
- /* Perform the multiply-accumulates */
- acc0 = __SMLALD(x0, c0, acc0);
- acc1 = __SMLALD(x1, c0, acc1);
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLALD (x0, c0, acc0);
+ acc1 = __SMLALD (x1, c0, acc1);
acc2 = __SMLALDX(x1, c0, acc2);
acc3 = __SMLALDX(x3, c0, acc3);
}
@@ -364,16 +355,16 @@ void arm_correlate_q15(
if (k == 2U)
{
/* Read y[4], y[5] */
- c0 = *__SIMD32(py);
+ c0 = read_q15x2 ((q15_t *) py);
/* Read x[7], x[8] */
- x3 = *__SIMD32(px);
+ x3 = read_q15x2 ((q15_t *) px);
/* Read x[9] */
- x2 = _SIMD32_OFFSET(px + 1);
+ x2 = read_q15x2 ((q15_t *) px + 1);
px += 2U;
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
acc0 = __SMLALD(x0, c0, acc0);
acc1 = __SMLALD(x1, c0, acc1);
acc2 = __SMLALD(x3, c0, acc2);
@@ -383,15 +374,15 @@ void arm_correlate_q15(
if (k == 3U)
{
/* Read y[4], y[5] */
- c0 = *__SIMD32(py)++;
+ c0 = read_q15x2_ia ((q15_t **) &py);
/* Read x[7], x[8] */
- x3 = *__SIMD32(px);
+ x3 = read_q15x2 ((q15_t *) px);
/* Read x[9] */
- x2 = _SIMD32_OFFSET(px + 1);
+ x2 = read_q15x2 ((q15_t *) px + 1);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
acc0 = __SMLALD(x0, c0, acc0);
acc1 = __SMLALD(x1, c0, acc1);
acc2 = __SMLALD(x3, c0, acc2);
@@ -401,19 +392,18 @@ void arm_correlate_q15(
/* Read y[6] */
#ifdef ARM_MATH_BIG_ENDIAN
-
c0 = c0 << 16U;
#else
-
c0 = c0 & 0x0000FFFF;
-#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
/* Read x[10] */
- x3 = _SIMD32_OFFSET(px + 2);
+ x3 = read_q15x2 ((q15_t *) px + 2);
px += 3U;
/* Perform the multiply-accumulates */
acc0 = __SMLALDX(x1, c0, acc0);
- acc1 = __SMLALD(x2, c0, acc1);
+ acc1 = __SMLALD (x2, c0, acc1);
acc2 = __SMLALDX(x2, c0, acc2);
acc3 = __SMLALDX(x3, c0, acc3);
}
@@ -439,7 +429,7 @@ void arm_correlate_q15(
px = pIn1 + count;
py = pIn2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -460,12 +450,12 @@ void arm_correlate_q15(
while (k > 0U)
{
/* Perform the multiply-accumulates */
- sum += ((q63_t) * px++ * *py++);
- sum += ((q63_t) * px++ * *py++);
- sum += ((q63_t) * px++ * *py++);
- sum += ((q63_t) * px++ * *py++);
+ sum += ((q63_t) *px++ * *py++);
+ sum += ((q63_t) *px++ * *py++);
+ sum += ((q63_t) *px++ * *py++);
+ sum += ((q63_t) *px++ * *py++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -476,7 +466,7 @@ void arm_correlate_q15(
while (k > 0U)
{
/* Perform the multiply-accumulates */
- sum += ((q63_t) * px++ * *py++);
+ sum += ((q63_t) *px++ * *py++);
/* Decrement the loop counter */
k--;
@@ -509,13 +499,13 @@ void arm_correlate_q15(
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Loop over srcBLen */
+ /* srcBLen number of MACS should be performed */
k = srcBLen;
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += ((q63_t) * px++ * *py++);
+ sum += ((q63_t) *px++ * *py++);
/* Decrement the loop counter */
k--;
@@ -538,6 +528,7 @@ void arm_correlate_q15(
}
}
+
/* --------------------------
* Initializations of stage3
* -------------------------*/
@@ -576,13 +567,13 @@ void arm_correlate_q15(
** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
- sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
+ sum = __SMLALD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
/* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
- sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
+ sum = __SMLALD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -592,10 +583,10 @@ void arm_correlate_q15(
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
sum = __SMLALD(*px++, *py++, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -608,23 +599,21 @@ void arm_correlate_q15(
px = ++pSrc1;
py = pIn2;
- /* Decrement the MAC count */
+ /* Decrement MAC count */
count--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize3--;
}
-#else
+#else /* #if defined (ARM_MATH_DSP) */
-/* Run the below code for Cortex-M0 */
-
- q15_t *pIn1 = pSrcA; /* inputA pointer */
- q15_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
- q63_t sum; /* Accumulators */
- uint32_t i = 0U, j; /* loop counters */
- uint32_t inv = 0U; /* Reverse order flag */
- uint32_t tot = 0U; /* Length */
+ const q15_t *pIn1 = pSrcA; /* InputA pointer */
+ const q15_t *pIn2 = pSrcB + (srcBLen - 1U); /* InputB pointer */
+ q63_t sum; /* Accumulators */
+ uint32_t i = 0U, j; /* Loop counters */
+ uint32_t inv = 0U; /* Reverse order flag */
+ uint32_t tot = 0U; /* Length */
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -672,10 +661,9 @@ void arm_correlate_q15(
/* Setting the reverse flag */
inv = 1;
-
}
- /* Loop to calculate convolution for output length number of times */
+ /* Loop to calculate convolution for output length number of values */
for (i = 0U; i <= tot; i++)
{
/* Initialize sum with zero to carry on MAC operations */
@@ -685,12 +673,13 @@ void arm_correlate_q15(
for (j = 0U; j <= i; j++)
{
/* Check the array limitations */
- if ((((i - j) < srcBLen) && (j < srcALen)))
+ if (((i - j) < srcBLen) && (j < srcALen))
{
/* z[i] += x[i-j] * y[j] */
sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
}
}
+
/* Store the output in the destination buffer */
if (inv == 1)
*pDst-- = (q15_t) __SSAT((sum >> 15U), 16U);
@@ -698,10 +687,10 @@ void arm_correlate_q15(
*pDst++ = (q15_t) __SSAT((sum >> 15U), 16U);
}
-#endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
+#endif /* #if defined (ARM_MATH_DSP) */
}
/**
- * @} end of Corr group
+ @} end of Corr group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_correlate_q31.c b/DSP/Source/FilteringFunctions/arm_correlate_q31.c
index 3d7d3d0..caa2f51 100644
--- a/DSP/Source/FilteringFunctions/arm_correlate_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_correlate_q31.c
@@ -3,13 +3,13 @@
* Title: arm_correlate_q31.c
* Description: Correlation of Q31 sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,63 +29,64 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup Corr
- * @{
+ @addtogroup Corr
+ @{
*/
/**
- * @brief Correlation of Q31 sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * There is no saturation on intermediate additions.
- * Thus, if the accumulator overflows it wraps around and distorts the result.
- * The input signals should be scaled down to avoid intermediate overflows.
- * Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a
- * maximum of min(srcALen, srcBLen) number of additions is carried internally.
- * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
- *
- * \par
- * See arm_correlate_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
+ @brief Correlation of Q31 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ There is no saturation on intermediate additions.
+ Thus, if the accumulator overflows it wraps around and distorts the result.
+ The input signals should be scaled down to avoid intermediate overflows.
+ Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a
+ maximum of min(srcALen, srcBLen) number of additions is carried internally.
+ The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+
+ @remark
+ Refer to \ref arm_correlate_fast_q31() for a faster but less precise implementation of this function.
*/
void arm_correlate_q31(
- q31_t * pSrcA,
- uint32_t srcALen,
- q31_t * pSrcB,
- uint32_t srcBLen,
- q31_t * pDst)
+ const q31_t * pSrcA,
+ uint32_t srcALen,
+ const q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst)
{
-#if defined (ARM_MATH_DSP)
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q31_t *pIn1; /* inputA pointer */
- q31_t *pIn2; /* inputB pointer */
- q31_t *pOut = pDst; /* output pointer */
- q31_t *px; /* Intermediate inputA pointer */
- q31_t *py; /* Intermediate inputB pointer */
- q31_t *pSrc1; /* Intermediate pointers */
- q63_t sum, acc0, acc1, acc2; /* Accumulators */
- q31_t x0, x1, x2, c0; /* temporary variables for holding input and coefficient values */
- uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
- int32_t inc = 1; /* Destination address modifier */
+ const q31_t *pIn1; /* InputA pointer */
+ const q31_t *pIn2; /* InputB pointer */
+ q31_t *pOut = pDst; /* Output pointer */
+ const q31_t *px; /* Intermediate inputA pointer */
+ const q31_t *py; /* Intermediate inputB pointer */
+ const q31_t *pSrc1; /* Intermediate pointers */
+ q63_t sum; /* Accumulators */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+ uint32_t outBlockSize;
+ int32_t inc = 1; /* Destination address modifier */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc0, acc1, acc2; /* Accumulators */
+ q31_t x0, x1, x2, c0; /* Temporary variables for holding input and coefficient values */
+#endif
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -103,10 +104,10 @@ void arm_correlate_q31(
if (srcALen >= srcBLen)
{
/* Initialization of inputA pointer */
- pIn1 = (pSrcA);
+ pIn1 = pSrcA;
/* Initialization of inputB pointer */
- pIn2 = (pSrcB);
+ pIn2 = pSrcB;
/* Number of output samples is calculated */
outBlockSize = (2U * srcALen) - 1U;
@@ -119,15 +120,14 @@ void arm_correlate_q31(
/* Updating the pointer position to non zero value */
pOut += j;
-
}
else
{
/* Initialization of inputA pointer */
- pIn1 = (pSrcB);
+ pIn1 = pSrcB;
/* Initialization of inputB pointer */
- pIn2 = (pSrcA);
+ pIn2 = pSrcA;
/* srcBLen is always considered as shorter or equal to srcALen */
j = srcBLen;
@@ -140,18 +140,18 @@ void arm_correlate_q31(
/* Destination address modifier is set to -1 */
inc = -1;
-
}
/* The function is internally
- * divided into three parts according to the number of multiplications that has to be
- * taken place between inputA samples and inputB samples. In the first part of the
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
* algorithm, the multiplications increase by one for every iteration.
- * In the second part of the algorithm, srcBLen number of multiplications are done.
- * In the third part of the algorithm, the multiplications decrease by one
- * for every iteration.*/
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
/* The algorithm is implemented in three stages.
- * The loop counters of each stage is initiated here. */
+ The loop counters of each stage is initiated here. */
blockSize1 = srcBLen - 1U;
blockSize2 = srcALen - (srcBLen - 1U);
blockSize3 = blockSize1;
@@ -177,6 +177,7 @@ void arm_correlate_q31(
pSrc1 = pIn2 + (srcBLen - 1U);
py = pSrc1;
+
/* ------------------------
* Stage1 process
* ----------------------*/
@@ -187,37 +188,46 @@ void arm_correlate_q31(
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = count >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* x[0] * y[srcBLen - 4] */
- sum += (q63_t) * px++ * (*py++);
- /* x[1] * y[srcBLen - 3] */
- sum += (q63_t) * px++ * (*py++);
- /* x[2] * y[srcBLen - 2] */
- sum += (q63_t) * px++ * (*py++);
- /* x[3] * y[srcBLen - 1] */
- sum += (q63_t) * px++ * (*py++);
+ sum += (q63_t) *px++ * (*py++);
- /* Decrement the loop counter */
+ /* x[1] * y[srcBLen - 3] */
+ sum += (q63_t) *px++ * (*py++);
+
+ /* x[2] * y[srcBLen - 2] */
+ sum += (q63_t) *px++ * (*py++);
+
+ /* x[3] * y[srcBLen - 1] */
+ sum += (q63_t) *px++ * (*py++);
+
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* x[0] * y[srcBLen - 1] */
- sum += (q63_t) * px++ * (*py++);
+ sum += (q63_t) *px++ * (*py++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -230,10 +240,10 @@ void arm_correlate_q31(
py = pSrc1 - count;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -265,6 +275,8 @@ void arm_correlate_q31(
* srcBLen should be greater than or equal to 4 */
if (srcBLen >= 4U)
{
+#if defined (ARM_MATH_LOOPUNROLL)
+
/* Loop unroll by 3 */
blkCnt = blockSize2 / 3;
@@ -276,8 +288,8 @@ void arm_correlate_q31(
acc2 = 0;
/* read x[0], x[1] samples */
- x0 = *(px++);
- x1 = *(px++);
+ x0 = *px++;
+ x1 = *px++;
/* Apply loop unrolling and compute 3 MACs simultaneously. */
k = srcBLen / 3;
@@ -288,7 +300,6 @@ void arm_correlate_q31(
{
/* Read y[0] sample */
c0 = *(py);
-
/* Read x[2] sample */
x2 = *(px);
@@ -302,11 +313,10 @@ void arm_correlate_q31(
/* Read y[1] sample */
c0 = *(py + 1U);
-
/* Read x[3] sample */
x0 = *(px + 1U);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* acc0 += x[1] * y[1] */
acc0 += ((q63_t) x1 * c0);
/* acc1 += x[2] * y[1] */
@@ -316,11 +326,10 @@ void arm_correlate_q31(
/* Read y[2] sample */
c0 = *(py + 2U);
-
/* Read x[4] sample */
x1 = *(px + 2U);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* acc0 += x[2] * y[2] */
acc0 += ((q63_t) x2 * c0);
/* acc1 += x[3] * y[2] */
@@ -358,7 +367,7 @@ void arm_correlate_q31(
x0 = x1;
x1 = x2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -380,45 +389,56 @@ void arm_correlate_q31(
px = pIn1 + count;
py = pIn2;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize2 - 3 * (blockSize2 / 3);
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = srcBLen >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* Perform the multiply-accumulates */
- sum += (q63_t) * px++ * (*py++);
- sum += (q63_t) * px++ * (*py++);
- sum += (q63_t) * px++ * (*py++);
- sum += (q63_t) * px++ * (*py++);
+ sum += (q63_t) *px++ * *py++;
+ sum += (q63_t) *px++ * *py++;
+ sum += (q63_t) *px++ * *py++;
+ sum += (q63_t) *px++ * *py++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = srcBLen % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += (q63_t) * px++ * (*py++);
+ sum += (q63_t) *px++ * *py++;
/* Decrement the loop counter */
k--;
@@ -429,14 +449,14 @@ void arm_correlate_q31(
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count;
py = pIn2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
@@ -451,13 +471,13 @@ void arm_correlate_q31(
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Loop over srcBLen */
+ /* srcBLen number of MACS should be performed */
k = srcBLen;
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += (q63_t) * px++ * (*py++);
+ sum += (q63_t) *px++ * *py++;
/* Decrement the loop counter */
k--;
@@ -468,18 +488,19 @@ void arm_correlate_q31(
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count;
py = pIn2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
+
/* --------------------------
* Initializations of stage3
* -------------------------*/
@@ -511,37 +532,46 @@ void arm_correlate_q31(
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = count >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* sum += x[srcALen - srcBLen + 4] * y[3] */
- sum += (q63_t) * px++ * (*py++);
- /* sum += x[srcALen - srcBLen + 3] * y[2] */
- sum += (q63_t) * px++ * (*py++);
- /* sum += x[srcALen - srcBLen + 2] * y[1] */
- sum += (q63_t) * px++ * (*py++);
- /* sum += x[srcALen - srcBLen + 1] * y[0] */
- sum += (q63_t) * px++ * (*py++);
+ sum += (q63_t) *px++ * *py++;
- /* Decrement the loop counter */
+ /* sum += x[srcALen - srcBLen + 3] * y[2] */
+ sum += (q63_t) *px++ * *py++;
+
+ /* sum += x[srcALen - srcBLen + 2] * y[1] */
+ sum += (q63_t) *px++ * *py++;
+
+ /* sum += x[srcALen - srcBLen + 1] * y[0] */
+ sum += (q63_t) *px++ * *py++;
+
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
- /* Perform the multiply-accumulates */
- sum += (q63_t) * px++ * (*py++);
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) *px++ * *py++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -554,23 +584,22 @@ void arm_correlate_q31(
px = ++pSrc1;
py = pIn2;
- /* Decrement the MAC count */
+ /* Decrement MAC count */
count--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize3--;
}
#else
+/* alternate version for CM0_FAMILY */
- /* Run the below code for Cortex-M0 */
-
- q31_t *pIn1 = pSrcA; /* inputA pointer */
- q31_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
- q63_t sum; /* Accumulators */
- uint32_t i = 0U, j; /* loop counters */
- uint32_t inv = 0U; /* Reverse order flag */
- uint32_t tot = 0U; /* Length */
+ const q31_t *pIn1 = pSrcA; /* InputA pointer */
+ const q31_t *pIn2 = pSrcB + (srcBLen - 1U); /* InputB pointer */
+ q63_t sum; /* Accumulators */
+ uint32_t i = 0U, j; /* Loop counters */
+ uint32_t inv = 0U; /* Reverse order flag */
+ uint32_t tot = 0U; /* Length */
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -618,25 +647,25 @@ void arm_correlate_q31(
/* Setting the reverse flag */
inv = 1;
-
}
/* Loop to calculate correlation for output length number of times */
for (i = 0U; i <= tot; i++)
{
- /* Initialize sum with zero to carry on MAC operations */
+ /* Initialize sum with zero to carry out MAC operations */
sum = 0;
/* Loop to perform MAC operations according to correlation equation */
for (j = 0U; j <= i; j++)
{
/* Check the array limitations */
- if ((((i - j) < srcBLen) && (j < srcALen)))
+ if (((i - j) < srcBLen) && (j < srcALen))
{
/* z[i] += x[i-j] * y[j] */
sum += ((q63_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
}
}
+
/* Store the output in the destination buffer */
if (inv == 1)
*pDst-- = (q31_t) (sum >> 31U);
@@ -644,10 +673,10 @@ void arm_correlate_q31(
*pDst++ = (q31_t) (sum >> 31U);
}
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
}
/**
- * @} end of Corr group
+ @} end of Corr group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_correlate_q7.c b/DSP/Source/FilteringFunctions/arm_correlate_q7.c
index dc5247f..e5881ac 100644
--- a/DSP/Source/FilteringFunctions/arm_correlate_q7.c
+++ b/DSP/Source/FilteringFunctions/arm_correlate_q7.c
@@ -3,13 +3,13 @@
* Title: arm_correlate_q7.c
* Description: Correlation of Q7 sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,64 +29,63 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup Corr
- * @{
+ @addtogroup Corr
+ @{
*/
/**
- * @brief Correlation of Q7 sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using a 32-bit internal accumulator.
- * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
- * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
- * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072.
- * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
- *
- * \par
- * Refer the function arm_correlate_opt_q7() for a faster implementation of this function.
- *
+ @brief Correlation of Q7 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
+ The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072.
+ The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
+
+ @remark
+ Refer to \ref arm_correlate_opt_q7() for a faster implementation of this function.
*/
void arm_correlate_q7(
- q7_t * pSrcA,
- uint32_t srcALen,
- q7_t * pSrcB,
- uint32_t srcBLen,
- q7_t * pDst)
+ const q7_t * pSrcA,
+ uint32_t srcALen,
+ const q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst)
{
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q7_t *pIn1; /* inputA pointer */
- q7_t *pIn2; /* inputB pointer */
- q7_t *pOut = pDst; /* output pointer */
- q7_t *px; /* Intermediate inputA pointer */
- q7_t *py; /* Intermediate inputB pointer */
- q7_t *pSrc1; /* Intermediate pointers */
- q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
- q31_t input1, input2; /* temporary variables */
- q15_t in1, in2; /* temporary variables */
- q7_t x0, x1, x2, x3, c0, c1; /* temporary variables for holding input and coefficient values */
- uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
- int32_t inc = 1;
+ const q7_t *pIn1; /* InputA pointer */
+ const q7_t *pIn2; /* InputB pointer */
+ q7_t *pOut = pDst; /* Output pointer */
+ const q7_t *px; /* Intermediate inputA pointer */
+ const q7_t *py; /* Intermediate inputB pointer */
+ const q7_t *pSrc1; /* Intermediate pointers */
+ q31_t sum; /* Accumulators */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+ uint32_t outBlockSize;
+ int32_t inc = 1;
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc0, acc1, acc2, acc3; /* Accumulators */
+ q31_t input1, input2; /* Temporary input variables */
+ q15_t in1, in2; /* Temporary input variables */
+ q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables for holding input and coefficient values */
+#endif
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -104,10 +103,10 @@ void arm_correlate_q7(
if (srcALen >= srcBLen)
{
/* Initialization of inputA pointer */
- pIn1 = (pSrcA);
+ pIn1 = pSrcA;
/* Initialization of inputB pointer */
- pIn2 = (pSrcB);
+ pIn2 = pSrcB;
/* Number of output samples is calculated */
outBlockSize = (2U * srcALen) - 1U;
@@ -120,15 +119,14 @@ void arm_correlate_q7(
/* Updating the pointer position to non zero value */
pOut += j;
-
}
else
{
/* Initialization of inputA pointer */
- pIn1 = (pSrcB);
+ pIn1 = pSrcB;
/* Initialization of inputB pointer */
- pIn2 = (pSrcA);
+ pIn2 = pSrcA;
/* srcBLen is always considered as shorter or equal to srcALen */
j = srcBLen;
@@ -141,18 +139,18 @@ void arm_correlate_q7(
/* Destination address modifier is set to -1 */
inc = -1;
-
}
/* The function is internally
- * divided into three parts according to the number of multiplications that has to be
- * taken place between inputA samples and inputB samples. In the first part of the
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
* algorithm, the multiplications increase by one for every iteration.
- * In the second part of the algorithm, srcBLen number of multiplications are done.
- * In the third part of the algorithm, the multiplications decrease by one
- * for every iteration.*/
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
/* The algorithm is implemented in three stages.
- * The loop counters of each stage is initiated here. */
+ The loop counters of each stage is initiated here. */
blockSize1 = srcBLen - 1U;
blockSize2 = srcALen - (srcBLen - 1U);
blockSize3 = blockSize1;
@@ -188,21 +186,21 @@ void arm_correlate_q7(
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- k = count >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* x[0] , x[1] */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
/* y[srcBLen - 4] , y[srcBLen - 3] */
- in1 = (q15_t) * py++;
- in2 = (q15_t) * py++;
+ in1 = (q15_t) *py++;
+ in2 = (q15_t) *py++;
input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
/* x[0] * y[srcBLen - 4] */
@@ -210,40 +208,45 @@ void arm_correlate_q7(
sum = __SMLAD(input1, input2, sum);
/* x[2] , x[3] */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
/* y[srcBLen - 2] , y[srcBLen - 1] */
- in1 = (q15_t) * py++;
- in2 = (q15_t) * py++;
+ in1 = (q15_t) *py++;
+ in2 = (q15_t) *py++;
input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
/* x[2] * y[srcBLen - 2] */
/* x[3] * y[srcBLen - 1] */
sum = __SMLAD(input1, input2, sum);
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
/* x[0] * y[srcBLen - 1] */
- sum += (q31_t) ((q15_t) * px++ * *py++);
+ sum += (q31_t) ((q15_t) *px++ * *py++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
- *pOut = (q7_t) (__SSAT(sum >> 7, 8));
+ *pOut = (q7_t) (__SSAT(sum >> 7U, 8));
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
@@ -251,10 +254,10 @@ void arm_correlate_q7(
py = pSrc1 - count;
px = pIn1;
- /* Increment the MAC count */
+ /* Increment MAC count */
count++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize1--;
}
@@ -286,7 +289,9 @@ void arm_correlate_q7(
* srcBLen should be greater than or equal to 4 */
if (srcBLen >= 4U)
{
- /* Loop unroll over blockSize2, by 4 */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize2 >> 2U;
while (blkCnt > 0U)
@@ -321,13 +326,13 @@ void arm_correlate_q7(
in1 = (q15_t) x0;
in2 = (q15_t) x1;
- input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* y[0] and y[1] are packed */
in1 = (q15_t) c0;
in2 = (q15_t) c1;
- input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* acc0 += x[0] * y[0] + x[1] * y[1] */
acc0 = __SMLAD(input1, input2, acc0);
@@ -336,7 +341,7 @@ void arm_correlate_q7(
in1 = (q15_t) x1;
in2 = (q15_t) x2;
- input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* acc1 += x[1] * y[0] + x[2] * y[1] */
acc1 = __SMLAD(input1, input2, acc1);
@@ -345,19 +350,19 @@ void arm_correlate_q7(
in1 = (q15_t) x2;
in2 = (q15_t) x3;
- input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* acc2 += x[2] * y[0] + x[3] * y[1] */
acc2 = __SMLAD(input1, input2, acc2);
/* Read x[4] sample */
- x0 = *(px++);
+ x0 = *px++;
/* x[3] and x[4] are packed */
in1 = (q15_t) x3;
in2 = (q15_t) x0;
- input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* acc3 += x[3] * y[0] + x[4] * y[1] */
acc3 = __SMLAD(input1, input2, acc3);
@@ -374,13 +379,13 @@ void arm_correlate_q7(
in1 = (q15_t) x2;
in2 = (q15_t) x3;
- input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* y[2] and y[3] are packed */
in1 = (q15_t) c0;
in2 = (q15_t) c1;
- input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* acc0 += x[2] * y[2] + x[3] * y[3] */
acc0 = __SMLAD(input1, input2, acc0);
@@ -389,7 +394,7 @@ void arm_correlate_q7(
in1 = (q15_t) x3;
in2 = (q15_t) x0;
- input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* acc1 += x[3] * y[2] + x[4] * y[3] */
acc1 = __SMLAD(input1, input2, acc1);
@@ -398,7 +403,7 @@ void arm_correlate_q7(
in1 = (q15_t) x0;
in2 = (q15_t) x1;
- input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* acc2 += x[4] * y[2] + x[5] * y[3] */
acc2 = __SMLAD(input1, input2, acc2);
@@ -410,7 +415,7 @@ void arm_correlate_q7(
in1 = (q15_t) x1;
in2 = (q15_t) x2;
- input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* acc3 += x[5] * y[2] + x[6] * y[3] */
acc3 = __SMLAD(input1, input2, acc3);
@@ -425,7 +430,6 @@ void arm_correlate_q7(
{
/* Read y[4] sample */
c0 = *py++;
-
/* Read x[7] sample */
x3 = *px++;
@@ -444,7 +448,7 @@ void arm_correlate_q7(
x1 = x2;
x2 = x3;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
@@ -467,76 +471,89 @@ void arm_correlate_q7(
px = pIn1 + count;
py = pIn2;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize2 % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = srcBLen >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
+
/* Reading two inputs of SrcA buffer and packing */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
- input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* Reading two inputs of SrcB buffer and packing */
- in1 = (q15_t) * py++;
- in2 = (q15_t) * py++;
- input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ in1 = (q15_t) *py++;
+ in2 = (q15_t) *py++;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
sum = __SMLAD(input1, input2, sum);
/* Reading two inputs of SrcA buffer and packing */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
- input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* Reading two inputs of SrcB buffer and packing */
- in1 = (q15_t) * py++;
- in2 = (q15_t) * py++;
- input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ in1 = (q15_t) *py++;
+ in2 = (q15_t) *py++;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
- /* Perform the multiply-accumulates */
+ /* Perform the multiply-accumulate */
sum = __SMLAD(input1, input2, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = srcBLen % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
- /* Perform the multiply-accumulates */
- sum += ((q15_t) * px++ * *py++);
+ /* Perform the multiply-accumulate */
+ sum += ((q15_t) *px++ * *py++);
/* Decrement the loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
- *pOut = (q7_t) (__SSAT(sum >> 7, 8));
+ *pOut = (q7_t) (__SSAT(sum >> 7U, 8));
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
/* Increment the pointer pIn1 index, count by 1 */
- count++;
+ count++;
/* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count;
@@ -557,20 +574,20 @@ void arm_correlate_q7(
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Loop over srcBLen */
+ /* srcBLen number of MACS should be performed */
k = srcBLen;
while (k > 0U)
{
/* Perform the multiply-accumulate */
- sum += ((q15_t) * px++ * *py++);
+ sum += ((q15_t) *px++ * *py++);
/* Decrement the loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
- *pOut = (q7_t) (__SSAT(sum >> 7, 8));
+ *pOut = (q7_t) (__SSAT(sum >> 7U, 8));
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
@@ -581,12 +598,12 @@ void arm_correlate_q7(
px = pIn1 + count;
py = pIn2;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
+
/* --------------------------
* Initializations of stage3
* -------------------------*/
@@ -618,60 +635,66 @@ void arm_correlate_q7(
/* Accumulator is made zero for every iteration */
sum = 0;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
k = count >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
- ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{
/* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2] */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
- input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* y[0] , y[1] */
- in1 = (q15_t) * py++;
- in2 = (q15_t) * py++;
- input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ in1 = (q15_t) *py++;
+ in2 = (q15_t) *py++;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* sum += x[srcALen - srcBLen + 1] * y[0] */
/* sum += x[srcALen - srcBLen + 2] * y[1] */
sum = __SMLAD(input1, input2, sum);
/* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */
- in1 = (q15_t) * px++;
- in2 = (q15_t) * px++;
- input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* y[2] , y[3] */
- in1 = (q15_t) * py++;
- in2 = (q15_t) * py++;
- input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+ in1 = (q15_t) *py++;
+ in2 = (q15_t) *py++;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
/* sum += x[srcALen - srcBLen + 3] * y[2] */
/* sum += x[srcALen - srcBLen + 4] * y[3] */
sum = __SMLAD(input1, input2, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
- /* If the count is not a multiple of 4, compute any remaining MACs here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
k = count % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (k > 0U)
{
- /* Perform the multiply-accumulates */
- sum += ((q15_t) * px++ * *py++);
+ /* Perform the multiply-accumulate */
+ sum += ((q15_t) *px++ * *py++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
- *pOut = (q7_t) (__SSAT(sum >> 7, 8));
+ *pOut = (q7_t) (__SSAT(sum >> 7U, 8));
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
@@ -679,23 +702,22 @@ void arm_correlate_q7(
px = ++pSrc1;
py = pIn2;
- /* Decrement the MAC count */
+ /* Decrement MAC count */
count--;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blockSize3--;
}
#else
+/* alternate version for CM0_FAMILY */
-/* Run the below code for Cortex-M0 */
-
- q7_t *pIn1 = pSrcA; /* inputA pointer */
- q7_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
- q31_t sum; /* Accumulator */
- uint32_t i = 0U, j; /* loop counters */
- uint32_t inv = 0U; /* Reverse order flag */
- uint32_t tot = 0U; /* Length */
+ const q7_t *pIn1 = pSrcA; /* InputA pointer */
+ const q7_t *pIn2 = pSrcB + (srcBLen - 1U); /* InputB pointer */
+ q31_t sum; /* Accumulator */
+ uint32_t i = 0U, j; /* Loop counters */
+ uint32_t inv = 0U; /* Reverse order flag */
+ uint32_t tot = 0U; /* Length */
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -743,25 +765,25 @@ void arm_correlate_q7(
/* Setting the reverse flag */
inv = 1;
-
}
/* Loop to calculate convolution for output length number of times */
for (i = 0U; i <= tot; i++)
{
- /* Initialize sum with zero to carry on MAC operations */
+ /* Initialize sum with zero to carry out MAC operations */
sum = 0;
/* Loop to perform MAC operations according to convolution equation */
for (j = 0U; j <= i; j++)
{
/* Check the array limitations */
- if ((((i - j) < srcBLen) && (j < srcALen)))
+ if (((i - j) < srcBLen) && (j < srcALen))
{
/* z[i] += x[i-j] * y[j] */
sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
}
}
+
/* Store the output in the destination buffer */
if (inv == 1)
*pDst-- = (q7_t) __SSAT((sum >> 7U), 8U);
@@ -769,10 +791,10 @@ void arm_correlate_q7(
*pDst++ = (q7_t) __SSAT((sum >> 7U), 8U);
}
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
}
/**
- * @} end of Corr group
+ @} end of Corr group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_decimate_f32.c b/DSP/Source/FilteringFunctions/arm_fir_decimate_f32.c
index 160dc2a..218ca34 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_decimate_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_decimate_f32.c
@@ -3,13 +3,13 @@
* Title: arm_fir_decimate_f32.c
* Description: FIR decimation for floating-point sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,127 +29,130 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @defgroup FIR_decimate Finite Impulse Response (FIR) Decimator
- *
- * These functions combine an FIR filter together with a decimator.
- * They are used in multirate systems for reducing the sample rate of a signal without introducing aliasing distortion.
- * Conceptually, the functions are equivalent to the block diagram below:
- * \image html FIRDecimator.gif "Components included in the FIR Decimator functions"
- * When decimating by a factor of M, the signal should be prefiltered by a lowpass filter with a normalized
- * cutoff frequency of 1/M in order to prevent aliasing distortion.
- * The user of the function is responsible for providing the filter coefficients.
- *
- * The FIR decimator functions provided in the CMSIS DSP Library combine the FIR filter and the decimator in an efficient manner.
- * Instead of calculating all of the FIR filter outputs and discarding M-1 out of every M, only the
- * samples output by the decimator are computed.
- * The functions operate on blocks of input and output data.
- * pSrc points to an array of blockSize input values and
- * pDst points to an array of blockSize/M output values.
- * In order to have an integer number of output samples blockSize
- * must always be a multiple of the decimation factor M.
- *
- * The library provides separate functions for Q15, Q31 and floating-point data types.
- *
- * \par Algorithm:
- * The FIR portion of the algorithm uses the standard form filter:
- *
- * where, b[n] are the filter coefficients.
- * \par
- * The pCoeffs points to a coefficient array of size numTaps.
- * Coefficients are stored in time reversed order.
- * \par
- *
- * The state variables are updated after each block of data is processed, the coefficients are untouched.
- *
- * \par Instance Structure
- * The coefficients and state variables for a filter are stored together in an instance data structure.
- * A separate instance structure must be defined for each filter.
- * Coefficient arrays may be shared among several instances while state variable array should be allocated separately.
- * There are separate instance structure declarations for each of the 3 supported data types.
- *
- * \par Initialization Functions
- * There is also an associated initialization function for each data type.
- * The initialization function performs the following operations:
- * - Sets the values of the internal structure fields.
- * - Zeros out the values in the state buffer.
- * - Checks to make sure that the size of the input is a multiple of the decimation factor.
- * To do this manually without calling the init function, assign the follow subfields of the instance structure:
- * numTaps, pCoeffs, M (decimation factor), pState. Also set all of the values in pState to zero.
- *
- * \par
- * Use of the initialization function is optional.
- * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
- * To place an instance structure into a const data section, the instance structure must be manually initialized.
- * The code below statically initializes each of the 3 different data type filter instance structures
- *
- *arm_fir_decimate_instance_f32 S = {M, numTaps, pCoeffs, pState};
- *arm_fir_decimate_instance_q31 S = {M, numTaps, pCoeffs, pState};
- *arm_fir_decimate_instance_q15 S = {M, numTaps, pCoeffs, pState};
- *
- * where M is the decimation factor; numTaps is the number of filter coefficients in the filter;
- * pCoeffs is the address of the coefficient buffer;
- * pState is the address of the state buffer.
- * Be sure to set the values in the state buffer to zeros when doing static initialization.
- *
- * \par Fixed-Point Behavior
- * Care must be taken when using the fixed-point versions of the FIR decimate filter functions.
- * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
- * Refer to the function specific documentation below for usage guidelines.
+ @defgroup FIR_decimate Finite Impulse Response (FIR) Decimator
+
+ These functions combine an FIR filter together with a decimator.
+ They are used in multirate systems for reducing the sample rate of a signal without introducing aliasing distortion.
+ Conceptually, the functions are equivalent to the block diagram below:
+ \image html FIRDecimator.gif "Components included in the FIR Decimator functions"
+ When decimating by a factor of M, the signal should be prefiltered by a lowpass filter with a normalized
+ cutoff frequency of 1/M in order to prevent aliasing distortion.
+ The user of the function is responsible for providing the filter coefficients.
+
+ The FIR decimator functions provided in the CMSIS DSP Library combine the FIR filter and the decimator in an efficient manner.
+ Instead of calculating all of the FIR filter outputs and discarding M-1 out of every M, only the
+ samples output by the decimator are computed.
+ The functions operate on blocks of input and output data.
+ pSrc points to an array of blockSize input values and
+ pDst points to an array of blockSize/M output values.
+ In order to have an integer number of output samples blockSize
+ must always be a multiple of the decimation factor M.
+
+ The library provides separate functions for Q15, Q31 and floating-point data types.
+
+ @par Algorithm:
+ The FIR portion of the algorithm uses the standard form filter:
+
+ where, b[n] are the filter coefficients.
+ @par
+ The pCoeffs points to a coefficient array of size numTaps.
+ Coefficients are stored in time reversed order.
+ @par
+
+ The state variables are updated after each block of data is processed, the coefficients are untouched.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter.
+ Coefficient arrays may be shared among several instances while state variable array should be allocated separately.
+ There are separate instance structure declarations for each of the 3 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ - Checks to make sure that the size of the input is a multiple of the decimation factor.
+ To do this manually without calling the init function, assign the follow subfields of the instance structure:
+ numTaps, pCoeffs, M (decimation factor), pState. Also set all of the values in pState to zero.
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ The code below statically initializes each of the 3 different data type filter instance structures
+
+ arm_fir_decimate_instance_f32 S = {M, numTaps, pCoeffs, pState};
+ arm_fir_decimate_instance_q31 S = {M, numTaps, pCoeffs, pState};
+ arm_fir_decimate_instance_q15 S = {M, numTaps, pCoeffs, pState};
+
+ where M is the decimation factor; numTaps is the number of filter coefficients in the filter;
+ pCoeffs is the address of the coefficient buffer;
+ pState is the address of the state buffer.
+ Be sure to set the values in the state buffer to zeros when doing static initialization.
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the fixed-point versions of the FIR decimate filter functions.
+ In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ Refer to the function specific documentation below for usage guidelines.
*/
/**
- * @addtogroup FIR_decimate
- * @{
+ @addtogroup FIR_decimate
+ @{
*/
- /**
- * @brief Processing function for the floating-point FIR decimator.
- * @param[in] *S points to an instance of the floating-point FIR decimator structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in] blockSize number of input samples to process per call.
- * @return none.
- */
+/**
+ @brief Processing function for floating-point FIR decimator.
+ @param[in] S points to an instance of the floating-point FIR decimator structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+ */
+#if defined(ARM_MATH_NEON)
void arm_fir_decimate_f32(
const arm_fir_decimate_instance_f32 * S,
- float32_t * pSrc,
+ const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
float32_t *pState = S->pState; /* State pointer */
- float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float32_t *pStateCurnt; /* Points to the current sample of the state */
- float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ float32_t *px; /* Temporary pointer for state buffer */
+ const float32_t *pb; /* Temporary pointer for coefficient buffer */
float32_t sum0; /* Accumulator */
float32_t x0, c0; /* Temporary variables to hold state and coefficient values */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
-#if defined (ARM_MATH_DSP)
-
uint32_t blkCntN4;
float32_t *px0, *px1, *px2, *px3;
float32_t acc0, acc1, acc2, acc3;
float32_t x1, x2, x3;
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
+ float32x4_t accv,acc0v,acc1v,acc2v,acc3v;
+ float32x4_t x0v, x1v, x2v, x3v;
+ float32x4_t c0v;
+ float32x2_t temp;
+ float32x4_t sum0v;
+
/* S->pState buffer contains previous frame (numTaps - 1) samples */
/* pStateCurnt points to the location where the new input data should be written */
pStateCurnt = S->pState + (numTaps - 1U);
@@ -169,6 +172,248 @@ void arm_fir_decimate_f32(
} while (--i);
+ /* Set accumulators to zero */
+ acc0v = vdupq_n_f32(0.0);
+ acc1v = vdupq_n_f32(0.0);
+ acc2v = vdupq_n_f32(0.0);
+ acc3v = vdupq_n_f32(0.0);
+
+ /* Initialize state pointer for all the samples */
+ px0 = pState;
+ px1 = pState + S->M;
+ px2 = pState + 2 * S->M;
+ px3 = pState + 3 * S->M;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+ /* Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Loop over the number of taps.
+ ** Repeat until we've computed numTaps-4 coefficients. */
+
+ while (tapCnt > 0U)
+ {
+ /* Read the b[numTaps-1] coefficient */
+ c0v = vld1q_f32(pb);
+ pb += 4;
+
+ /* Read x[n-numTaps-1] sample for acc0 */
+ x0v = vld1q_f32(px0);
+ x1v = vld1q_f32(px1);
+ x2v = vld1q_f32(px2);
+ x3v = vld1q_f32(px3);
+
+ px0 += 4;
+ px1 += 4;
+ px2 += 4;
+ px3 += 4;
+
+ acc0v = vmlaq_f32(acc0v, x0v, c0v);
+ acc1v = vmlaq_f32(acc1v, x1v, c0v);
+ acc2v = vmlaq_f32(acc2v, x2v, c0v);
+ acc3v = vmlaq_f32(acc3v, x3v, c0v);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ temp = vpadd_f32(vget_low_f32(acc0v),vget_high_f32(acc0v));
+ accv[0] = temp[0] + temp[1];
+
+ temp = vpadd_f32(vget_low_f32(acc1v),vget_high_f32(acc1v));
+ accv[1] = temp[0] + temp[1];
+
+ temp = vpadd_f32(vget_low_f32(acc2v),vget_high_f32(acc2v));
+ accv[2] = temp[0] + temp[1];
+
+ temp = vpadd_f32(vget_low_f32(acc3v),vget_high_f32(acc3v));
+ accv[3] = temp[0] + temp[1];
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read coefficients */
+ c0 = *(pb++);
+
+ /* Fetch state variables for acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
+
+ /* Perform the multiply-accumulate */
+ accv[0] += x0 * c0;
+ accv[1] += x1 * c0;
+ accv[2] += x2 * c0;
+ accv[3] += x3 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + 4 * S->M;
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ vst1q_f32(pDst,accv);
+ pDst += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ while (blkCntN4 > 0U)
+ {
+ /* Copy decimation factor number of new input samples into the state buffer */
+ i = S->M;
+
+ do
+ {
+ *pStateCurnt++ = *pSrc++;
+
+ } while (--i);
+
+ /* Set accumulator to zero */
+ sum0v = vdupq_n_f32(0.0);
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+ /* Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Loop over the number of taps.
+ ** Repeat until we've computed numTaps-4 coefficients. */
+ while (tapCnt > 0U)
+ {
+ c0v = vld1q_f32(pb);
+ pb += 4;
+
+ x0v = vld1q_f32(px);
+ px += 4;
+
+ sum0v = vmlaq_f32(sum0v, x0v, c0v);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ temp = vpadd_f32(vget_low_f32(sum0v),vget_high_f32(sum0v));
+ sum0 = temp[0] + temp[1];
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read coefficients */
+ c0 = *(pb++);
+
+ /* Fetch 1 state variable */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M;
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = sum0;
+
+ /* Decrement the loop counter */
+ blkCntN4--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ i = (numTaps - 1U) >> 2;
+
+ /* Copy data */
+ while (i > 0U)
+ {
+ sum0v = vld1q_f32(pState);
+ vst1q_f32(pStateCurnt,sum0v);
+ pState += 4;
+ pStateCurnt += 4;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ i = (numTaps - 1U) % 0x04U;
+
+ /* Copy data */
+ while (i > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+}
+#else
+void arm_fir_decimate_f32(
+ const arm_fir_decimate_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCur; /* Points to the current sample of the state */
+ float32_t *px0; /* Temporary pointer for state buffer */
+ const float32_t *pb; /* Temporary pointer for coefficient buffer */
+ float32_t x0, c0; /* Temporary variables to hold state and coefficient values */
+ float32_t acc0; /* Accumulator */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ float32_t *px1, *px2, *px3;
+ float32_t x1, x2, x3;
+ float32_t acc1, acc2, acc3;
+#endif
+
+ /* S->pState buffer contains previous frame (numTaps - 1) samples */
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (numTaps - 1U);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 samples at a time */
+ blkCnt = outBlockSize >> 2U;
+
+ /* Samples loop unrolled by 4 */
+ while (blkCnt > 0U)
+ {
+ /* Copy 4 * decimation factor number of new input samples into the state buffer */
+ i = S->M * 4;
+
+ do
+ {
+ *pStateCur++ = *pSrc++;
+
+ } while (--i);
+
/* Set accumulators to zero */
acc0 = 0.0f;
acc1 = 0.0f;
@@ -184,11 +429,8 @@ void arm_fir_decimate_f32(
/* Initialize coeff pointer */
pb = pCoeffs;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
-
- /* Loop over the number of taps. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-4 coefficients. */
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = numTaps >> 2U;
while (tapCnt > 0U)
{
@@ -255,11 +497,11 @@ void arm_fir_decimate_f32(
acc2 += x2 * c0;
acc3 += x3 * c0;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining taps */
tapCnt = numTaps % 0x4U;
while (tapCnt > 0U)
@@ -267,7 +509,7 @@ void arm_fir_decimate_f32(
/* Read coefficients */
c0 = *(pb++);
- /* Fetch state variables for acc0, acc1, acc2, acc3 */
+ /* Fetch state variables for acc0, acc1, acc2, acc3 */
x0 = *(px0++);
x1 = *(px1++);
x2 = *(px2++);
@@ -279,13 +521,13 @@ void arm_fir_decimate_f32(
acc2 += x2 * c0;
acc3 += x3 * c0;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
/* Advance the state pointer by the decimation factor
* to process the next group of decimation factor number samples */
- pState = pState + 4 * S->M;
+ pState = pState + S->M * 4;
/* The result is in the accumulator, store in the destination buffer. */
*pDst++ = acc0;
@@ -293,149 +535,20 @@ void arm_fir_decimate_f32(
*pDst++ = acc2;
*pDst++ = acc3;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- while (blkCntN4 > 0U)
- {
- /* Copy decimation factor number of new input samples into the state buffer */
- i = S->M;
-
- do
- {
- *pStateCurnt++ = *pSrc++;
-
- } while (--i);
-
- /* Set accumulator to zero */
- sum0 = 0.0f;
-
- /* Initialize state pointer */
- px = pState;
-
- /* Initialize coeff pointer */
- pb = pCoeffs;
-
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
-
- /* Loop over the number of taps. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-4 coefficients. */
- while (tapCnt > 0U)
- {
- /* Read the b[numTaps-1] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-1] sample */
- x0 = *(px++);
-
- /* Perform the multiply-accumulate */
- sum0 += x0 * c0;
-
- /* Read the b[numTaps-2] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-2] sample */
- x0 = *(px++);
-
- /* Perform the multiply-accumulate */
- sum0 += x0 * c0;
-
- /* Read the b[numTaps-3] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-3] sample */
- x0 = *(px++);
-
- /* Perform the multiply-accumulate */
- sum0 += x0 * c0;
-
- /* Read the b[numTaps-4] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-4] sample */
- x0 = *(px++);
-
- /* Perform the multiply-accumulate */
- sum0 += x0 * c0;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
- tapCnt = numTaps % 0x4U;
-
- while (tapCnt > 0U)
- {
- /* Read coefficients */
- c0 = *(pb++);
-
- /* Fetch 1 state variable */
- x0 = *(px++);
-
- /* Perform the multiply-accumulate */
- sum0 += x0 * c0;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Advance the state pointer by the decimation factor
- * to process the next group of decimation factor number samples */
- pState = pState + S->M;
-
- /* The result is in the accumulator, store in the destination buffer. */
- *pDst++ = sum0;
-
- /* Decrement the loop counter */
- blkCntN4--;
- }
-
- /* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
- ** This prepares the state buffer for the next function call. */
-
- /* Points to the start of the state buffer */
- pStateCurnt = S->pState;
-
- i = (numTaps - 1U) >> 2;
-
- /* copy data */
- while (i > 0U)
- {
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- i--;
- }
-
- i = (numTaps - 1U) % 0x04U;
-
- /* copy data */
- while (i > 0U)
- {
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- i--;
- }
+ /* Loop unrolling: Compute remaining samples */
+ blkCnt = outBlockSize % 0x4U;
#else
-/* Run the below code for Cortex-M0 */
-
- /* S->pState buffer contains previous frame (numTaps - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = S->pState + (numTaps - 1U);
-
- /* Total number of output samples to be computed */
+ /* Initialize blkCnt with number of samples */
blkCnt = outBlockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* Copy decimation factor number of new input samples into the state buffer */
@@ -443,33 +556,88 @@ void arm_fir_decimate_f32(
do
{
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
} while (--i);
/* Set accumulator to zero */
- sum0 = 0.0f;
+ acc0 = 0.0f;
/* Initialize state pointer */
- px = pState;
+ px0 = pState;
/* Initialize coeff pointer */
pb = pCoeffs;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = numTaps >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read the b[numTaps-1] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-1] sample */
+ x0 = *px0++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-2] sample */
+ x0 = *px0++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+
+ /* Read the b[numTaps-3] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-3] sample */
+ x0 = *px0++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-4] sample */
+ x0 = *px0++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of taps */
tapCnt = numTaps;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
/* Read coefficients */
c0 = *pb++;
/* Fetch 1 state variable */
- x0 = *px++;
+ x0 = *px0++;
/* Perform the multiply-accumulate */
- sum0 += x0 * c0;
+ acc0 += x0 * c0;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -478,35 +646,58 @@ void arm_fir_decimate_f32(
pState = pState + S->M;
/* The result is in the accumulator, store in the destination buffer. */
- *pDst++ = sum0;
+ *pDst++ = acc0;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
/* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the start of the state buffer.
- ** This prepares the state buffer for the next function call. */
+ Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
- pStateCurnt = S->pState;
+ pStateCur = S->pState;
- /* Copy numTaps number of values */
- i = (numTaps - 1U);
+#if defined (ARM_MATH_LOOPUNROLL)
- /* copy data */
- while (i > 0U)
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ /* Copy data */
+ while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
- i--;
+ /* Decrement loop counter */
+ tapCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numTaps - 1U) % 0x04U;
+
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of FIR_decimate group
+ @} end of FIR_decimate group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c b/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c
index 00b2790..948b15c 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c
@@ -3,13 +3,13 @@
* Title: arm_fir_decimate_fast_q15.c
* Description: Fast Q15 FIR Decimator
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,80 +29,75 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_decimate
- * @{
+ @addtogroup FIR_decimate
+ @{
*/
/**
- * @brief Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.
- * @param[in] *S points to an instance of the Q15 FIR decimator structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data
- * @param[in] blockSize number of input samples to process per call.
- * @return none
- *
- * \par Restrictions
- * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
- * In this case input, output, state buffers should be aligned by 32-bit
- *
- * Scaling and Overflow Behavior:
- * \par
- * This fast version uses a 32-bit accumulator with 2.30 format.
- * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * Thus, if the accumulator result overflows it wraps around and distorts the result.
- * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (log2 is read as log to the base 2).
- * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
- *
- * \par
- * Refer to the function arm_fir_decimate_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
- * Both the slow and the fast versions use the same instance structure.
- * Use the function arm_fir_decimate_init_q15() to initialize the filter structure.
+ @brief Processing function for the Q15 FIR decimator (fast variant).
+ @param[in] S points to an instance of the Q15 FIR decimator structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of input samples to process per call
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This fast version uses a 32-bit accumulator with 2.30 format.
+ The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around and distorts the result.
+ In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (log2 is read as log to the base 2).
+ The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
+ @remark
+ Refer to \ref arm_fir_decimate_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
+ Both the slow and the fast versions use the same instance structure.
+ Use function \ref arm_fir_decimate_init_q15() to initialize the filter structure.
*/
-#ifndef UNALIGNED_SUPPORT_DISABLE
+#if defined (ARM_MATH_DSP)
void arm_fir_decimate_fast_q15(
const arm_fir_decimate_instance_q15 * S,
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t blockSize)
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
{
- q15_t *pState = S->pState; /* State pointer */
- q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q15_t *pStateCurnt; /* Points to the current sample of the state */
- q15_t *px; /* Temporary pointer for state buffer */
- q15_t *pb; /* Temporary pointer coefficient buffer */
- q31_t x0, x1, c0, c1; /* Temporary variables to hold state and coefficient values */
- q31_t sum0; /* Accumulators */
- q31_t acc0, acc1;
- q15_t *px0, *px1;
- uint32_t blkCntN3;
- uint32_t numTaps = S->numTaps; /* Number of taps */
- uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCur; /* Points to the current sample of the state */
+ q15_t *px; /* Temporary pointer for state buffer */
+ const q15_t *pb; /* Temporary pointer for coefficient buffer */
+ q31_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */
+ q31_t sum0; /* Accumulators */
+ q31_t acc0, acc1;
+ q15_t *px0, *px1;
+ uint32_t blkCntN3;
+ uint32_t numTaps = S->numTaps; /* Number of taps */
+ uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t c1; /* Temporary variables to hold state and coefficient values */
+#endif
/* S->pState buffer contains previous frame (numTaps - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = S->pState + (numTaps - 1U);
-
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (numTaps - 1U);
/* Total number of output samples to be computed */
blkCnt = outBlockSize / 2;
blkCntN3 = outBlockSize - (2 * blkCnt);
-
while (blkCnt > 0U)
{
- /* Copy decimation factor number of new input samples into the state buffer */
- i = 2 * S->M;
+ /* Copy 2 * decimation factor number of new input samples into the state buffer */
+ i = S->M * 2;
do
{
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
} while (--i);
@@ -110,70 +105,70 @@ void arm_fir_decimate_fast_q15(
acc0 = 0;
acc1 = 0;
- /* Initialize state pointer */
+ /* Initialize state pointer for all the samples */
px0 = pState;
-
px1 = pState + S->M;
-
/* Initialize coeff pointer */
pb = pCoeffs;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = numTaps >> 2U;
- /* Loop over the number of taps. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-4 coefficients. */
while (tapCnt > 0U)
{
- /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */
- c0 = *__SIMD32(pb)++;
+ /* Read the b[numTaps-1] and b[numTaps-2] coefficients */
+ c0 = read_q15x2_ia ((q15_t **) &pb);
/* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
- x0 = *__SIMD32(px0)++;
-
- x1 = *__SIMD32(px1)++;
+ x0 = read_q15x2_ia (&px0);
+ x1 = read_q15x2_ia (&px1);
/* Perform the multiply-accumulate */
acc0 = __SMLAD(x0, c0, acc0);
-
acc1 = __SMLAD(x1, c0, acc1);
/* Read the b[numTaps-3] and b[numTaps-4] coefficient */
- c0 = *__SIMD32(pb)++;
+ c0 = read_q15x2_ia ((q15_t **) &pb);
/* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
- x0 = *__SIMD32(px0)++;
-
- x1 = *__SIMD32(px1)++;
+ x0 = read_q15x2_ia (&px0);
+ x1 = read_q15x2_ia (&px1);
/* Perform the multiply-accumulate */
acc0 = __SMLAD(x0, c0, acc0);
-
acc1 = __SMLAD(x1, c0, acc1);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining taps */
tapCnt = numTaps % 0x4U;
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
/* Read coefficients */
c0 = *pb++;
- /* Fetch 1 state variable */
+ /* Fetch state variables for acc0, acc1 */
x0 = *px0++;
-
x1 = *px1++;
/* Perform the multiply-accumulate */
acc0 = __SMLAD(x0, c0, acc0);
acc1 = __SMLAD(x1, c0, acc1);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -186,12 +181,10 @@ void arm_fir_decimate_fast_q15(
*pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
*pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
-
-
while (blkCntN3 > 0U)
{
/* Copy decimation factor number of new input samples into the state buffer */
@@ -199,11 +192,11 @@ void arm_fir_decimate_fast_q15(
do
{
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
} while (--i);
- /*Set sum to zero */
+ /* Set accumulator to zero */
sum0 = 0;
/* Initialize state pointer */
@@ -212,38 +205,45 @@ void arm_fir_decimate_fast_q15(
/* Initialize coeff pointer */
pb = pCoeffs;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = numTaps >> 2U;
- /* Loop over the number of taps. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-4 coefficients. */
while (tapCnt > 0U)
{
- /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */
- c0 = *__SIMD32(pb)++;
+ /* Read the b[numTaps-1] and b[numTaps-2] coefficients */
+ c0 = read_q15x2_ia ((q15_t **) &pb);
- /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
- x0 = *__SIMD32(px)++;
+ /* Read x[n-numTaps-1] and x[n-numTaps-2] sample */
+ x0 = read_q15x2_ia (&px);
- /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
- c1 = *__SIMD32(pb)++;
+ /* Read the b[numTaps-3] and b[numTaps-4] coefficients */
+ c1 = read_q15x2_ia ((q15_t **) &pb);
/* Perform the multiply-accumulate */
sum0 = __SMLAD(x0, c0, sum0);
/* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
- x0 = *__SIMD32(px)++;
+ x0 = read_q15x2_ia (&px);
/* Perform the multiply-accumulate */
sum0 = __SMLAD(x0, c1, sum0);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining taps */
tapCnt = numTaps % 0x4U;
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
/* Read coefficients */
@@ -255,7 +255,7 @@ void arm_fir_decimate_fast_q15(
/* Perform the multiply-accumulate */
sum0 = __SMLAD(x0, c0, sum0);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -267,68 +267,67 @@ void arm_fir_decimate_fast_q15(
/* so downsacle by 15 to get output in 1.15 */
*pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCntN3--;
}
/* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
- ** This prepares the state buffer for the next function call. */
+ Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
- pStateCurnt = S->pState;
+ pStateCur = S->pState;
i = (numTaps - 1U) >> 2U;
/* copy data */
while (i > 0U)
{
- *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
- *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
+ write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
+ write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
- /* Decrement the loop counter */
+ /* Decrement loop counter */
i--;
}
i = (numTaps - 1U) % 0x04U;
- /* copy data */
+ /* Copy data */
while (i > 0U)
{
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
i--;
}
+
}
-#else
-
+#else /* #if defined (ARM_MATH_DSP) */
void arm_fir_decimate_fast_q15(
const arm_fir_decimate_instance_q15 * S,
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t blockSize)
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
{
- q15_t *pState = S->pState; /* State pointer */
- q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q15_t *pStateCurnt; /* Points to the current sample of the state */
- q15_t *px; /* Temporary pointer for state buffer */
- q15_t *pb; /* Temporary pointer coefficient buffer */
- q15_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */
- q31_t sum0; /* Accumulators */
- q31_t acc0, acc1;
- q15_t *px0, *px1;
- uint32_t blkCntN3;
- uint32_t numTaps = S->numTaps; /* Number of taps */
- uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCur; /* Points to the current sample of the state */
+ q15_t *px; /* Temporary pointer for state buffer */
+ const q15_t *pb; /* Temporary pointer for coefficient buffer */
+ q15_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */
+ q31_t sum0; /* Accumulators */
+ q31_t acc0, acc1;
+ q15_t *px0, *px1;
+ uint32_t blkCntN3;
+ uint32_t numTaps = S->numTaps; /* Number of taps */
+ uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */
/* S->pState buffer contains previous frame (numTaps - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = S->pState + (numTaps - 1U);
-
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (numTaps - 1U);
/* Total number of output samples to be computed */
blkCnt = outBlockSize / 2;
@@ -336,12 +335,12 @@ void arm_fir_decimate_fast_q15(
while (blkCnt > 0U)
{
- /* Copy decimation factor number of new input samples into the state buffer */
- i = 2 * S->M;
+ /* Copy 2 * decimation factor number of new input samples into the state buffer */
+ i = S->M * 2;
do
{
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
} while (--i);
@@ -351,18 +350,16 @@ void arm_fir_decimate_fast_q15(
/* Initialize state pointer */
px0 = pState;
-
px1 = pState + S->M;
-
/* Initialize coeff pointer */
pb = pCoeffs;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = numTaps >> 2U;
- /* Loop over the number of taps. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-4 coefficients. */
while (tapCnt > 0U)
{
/* Read the Read b[numTaps-1] coefficients */
@@ -387,7 +384,7 @@ void arm_fir_decimate_fast_q15(
acc0 += x0 * c0;
acc1 += x1 * c0;
- /* Read the b[numTaps-3] coefficients */
+ /* Read the b[numTaps-3] coefficients */
c0 = *pb++;
/* Read x[n-numTaps-3] for sample 0 and sample 1 */
@@ -413,9 +410,16 @@ void arm_fir_decimate_fast_q15(
tapCnt--;
}
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining taps */
tapCnt = numTaps % 0x4U;
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
/* Read coefficients */
@@ -443,8 +447,7 @@ void arm_fir_decimate_fast_q15(
*pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
*pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -455,11 +458,11 @@ void arm_fir_decimate_fast_q15(
do
{
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
} while (--i);
- /*Set sum to zero */
+ /* Set accumulator to zero */
sum0 = 0;
/* Initialize state pointer */
@@ -468,17 +471,17 @@ void arm_fir_decimate_fast_q15(
/* Initialize coeff pointer */
pb = pCoeffs;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = numTaps >> 2U;
- /* Loop over the number of taps. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-4 coefficients. */
while (tapCnt > 0U)
{
- /* Read the Read b[numTaps-1] coefficients */
+ /* Read the b[numTaps-1] coefficient */
c0 = *pb++;
- /* Read x[n-numTaps-1] and sample */
+ /* Read x[n-numTaps-1] sample */
x0 = *px++;
/* Perform the multiply-accumulate */
@@ -487,13 +490,13 @@ void arm_fir_decimate_fast_q15(
/* Read the b[numTaps-2] coefficient */
c0 = *pb++;
- /* Read x[n-numTaps-2] and sample */
+ /* Read x[n-numTaps-2] sample */
x0 = *px++;
/* Perform the multiply-accumulate */
sum0 += x0 * c0;
- /* Read the b[numTaps-3] coefficients */
+ /* Read the b[numTaps-3] coefficient */
c0 = *pb++;
/* Read x[n-numTaps-3] sample */
@@ -511,13 +514,20 @@ void arm_fir_decimate_fast_q15(
/* Perform the multiply-accumulate */
sum0 += x0 * c0;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining taps */
tapCnt = numTaps % 0x4U;
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
/* Read coefficients */
@@ -541,7 +551,7 @@ void arm_fir_decimate_fast_q15(
/* so downsacle by 15 to get output in 1.15 */
*pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCntN3--;
}
@@ -550,19 +560,19 @@ void arm_fir_decimate_fast_q15(
** This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
- pStateCurnt = S->pState;
+ pStateCur = S->pState;
i = (numTaps - 1U) >> 2U;
/* copy data */
while (i > 0U)
{
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
i--;
}
@@ -571,16 +581,15 @@ void arm_fir_decimate_fast_q15(
/* copy data */
while (i > 0U)
{
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
i--;
}
}
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+#endif /* #if defined (ARM_MATH_DSP) */
/**
- * @} end of FIR_decimate group
+ @} end of FIR_decimate group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c b/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c
index 3b3d817..2c3a28a 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c
@@ -3,13 +3,13 @@
* Title: arm_fir_decimate_fast_q31.c
* Description: Fast Q31 FIR Decimator
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,151 +29,167 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_decimate
- * @{
+ @addtogroup FIR_decimate
+ @{
*/
/**
- * @brief Processing function for the Q31 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.
- * @param[in] *S points to an instance of the Q31 FIR decimator structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data
- * @param[in] blockSize number of input samples to process per call.
- * @return none
- *
- * Scaling and Overflow Behavior:
- *
- * \par
- * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
- * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
- * These intermediate results are added to a 2.30 accumulator.
- * Finally, the accumulator is saturated and converted to a 1.31 result.
- * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
- * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2).
- *
- * \par
- * Refer to the function arm_fir_decimate_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.
- * Both the slow and the fast versions use the same instance structure.
- * Use the function arm_fir_decimate_init_q31() to initialize the filter structure.
+ @brief Processing function for the Q31 FIR decimator (fast variant).
+ @param[in] S points to an instance of the Q31 FIR decimator structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This function is optimized for speed at the expense of fixed-point precision and overflow protection.
+ The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
+ These intermediate results are added to a 2.30 accumulator.
+ Finally, the accumulator is saturated and converted to a 1.31 result.
+ The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
+ In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2).
+
+ @remark
+ Refer to \ref arm_fir_decimate_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.
+ Both the slow and the fast versions use the same instance structure.
+ Use function \ref arm_fir_decimate_init_q31() to initialize the filter structure.
*/
void arm_fir_decimate_fast_q31(
- arm_fir_decimate_instance_q31 * S,
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
+ const arm_fir_decimate_instance_q31 * S,
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
{
- q31_t *pState = S->pState; /* State pointer */
- q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q31_t *pStateCurnt; /* Points to the current sample of the state */
- q31_t x0, c0; /* Temporary variables to hold state and coefficient values */
- q31_t *px; /* Temporary pointers for state buffer */
- q31_t *pb; /* Temporary pointers for coefficient buffer */
- q31_t sum0; /* Accumulator */
- uint32_t numTaps = S->numTaps; /* Number of taps */
- uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
- uint32_t blkCntN2;
- q31_t x1;
- q31_t acc0, acc1;
- q31_t *px0, *px1;
+ q31_t *pState = S->pState; /* State pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCur; /* Points to the current sample of the state */
+ q31_t *px0; /* Temporary pointer for state buffer */
+ const q31_t *pb; /* Temporary pointer for coefficient buffer */
+ q31_t x0, c0; /* Temporary variables to hold state and coefficient values */
+ q63_t acc0; /* Accumulator */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t *px1, *px2, *px3;
+ q31_t x1, x2, x3;
+ q63_t acc1, acc2, acc3;
+#endif
/* S->pState buffer contains previous frame (numTaps - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = S->pState + (numTaps - 1U);
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (numTaps - 1U);
- /* Total number of output samples to be computed */
+#if defined (ARM_MATH_LOOPUNROLL)
- blkCnt = outBlockSize / 2;
- blkCntN2 = outBlockSize - (2 * blkCnt);
+ /* Loop unrolling: Compute 4 samples at a time */
+ blkCnt = outBlockSize >> 2U;
+ /* Samples loop unrolled by 4 */
while (blkCnt > 0U)
{
- /* Copy decimation factor number of new input samples into the state buffer */
- i = 2 * S->M;
+ /* Copy 4 * decimation factor number of new input samples into the state buffer */
+ i = S->M * 4;
do
{
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
} while (--i);
- /* Set accumulator to zero */
+ /* Set accumulators to zero */
acc0 = 0;
acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
- /* Initialize state pointer */
+ /* Initialize state pointer for all the samples */
px0 = pState;
px1 = pState + S->M;
+ px2 = pState + 2 * S->M;
+ px3 = pState + 3 * S->M;
/* Initialize coeff pointer */
pb = pCoeffs;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = numTaps >> 2U;
- /* Loop over the number of taps. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-4 coefficients. */
while (tapCnt > 0U)
{
/* Read the b[numTaps-1] coefficient */
- c0 = *(pb);
+ c0 = *(pb++);
- /* Read x[n-numTaps-1] for sample 0 sample 1 */
- x0 = *(px0);
- x1 = *(px1);
+ /* Read x[n-numTaps-1] sample for acc0 */
+ x0 = *(px0++);
+ /* Read x[n-numTaps-1] sample for acc1 */
+ x1 = *(px1++);
+ /* Read x[n-numTaps-1] sample for acc2 */
+ x2 = *(px2++);
+ /* Read x[n-numTaps-1] sample for acc3 */
+ x3 = *(px3++);
/* Perform the multiply-accumulate */
acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
/* Read the b[numTaps-2] coefficient */
- c0 = *(pb + 1U);
+ c0 = *(pb++);
- /* Read x[n-numTaps-2] for sample 0 sample 1 */
- x0 = *(px0 + 1U);
- x1 = *(px1 + 1U);
+ /* Read x[n-numTaps-2] sample for acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
/* Perform the multiply-accumulate */
acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
/* Read the b[numTaps-3] coefficient */
- c0 = *(pb + 2U);
+ c0 = *(pb++);
- /* Read x[n-numTaps-3] for sample 0 sample 1 */
- x0 = *(px0 + 2U);
- x1 = *(px1 + 2U);
- pb += 4U;
+ /* Read x[n-numTaps-3] sample acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
/* Perform the multiply-accumulate */
acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
/* Read the b[numTaps-4] coefficient */
- c0 = *(pb - 1U);
-
- /* Read x[n-numTaps-4] for sample 0 sample 1 */
- x0 = *(px0 + 3U);
- x1 = *(px1 + 3U);
+ c0 = *(pb++);
+ /* Read x[n-numTaps-4] sample acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
/* Perform the multiply-accumulate */
acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
- /* update state pointers */
- px0 += 4U;
- px1 += 4U;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining taps */
tapCnt = numTaps % 0x4U;
while (tapCnt > 0U)
@@ -181,112 +197,135 @@ void arm_fir_decimate_fast_q31(
/* Read coefficients */
c0 = *(pb++);
- /* Fetch 1 state variable */
+ /* Fetch state variables for acc0, acc1, acc2, acc3 */
x0 = *(px0++);
x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
/* Perform the multiply-accumulate */
acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
/* Advance the state pointer by the decimation factor
* to process the next group of decimation factor number samples */
- pState = pState + S->M * 2;
+ pState = pState + S->M * 4;
/* The result is in the accumulator, store in the destination buffer. */
*pDst++ = (q31_t) (acc0 << 1);
*pDst++ = (q31_t) (acc1 << 1);
+ *pDst++ = (q31_t) (acc2 << 1);
+ *pDst++ = (q31_t) (acc3 << 1);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- while (blkCntN2 > 0U)
+ /* Loop unrolling: Compute remaining samples */
+ blkCnt = outBlockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = outBlockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
{
/* Copy decimation factor number of new input samples into the state buffer */
i = S->M;
do
{
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
} while (--i);
/* Set accumulator to zero */
- sum0 = 0;
+ acc0 = 0;
/* Initialize state pointer */
- px = pState;
+ px0 = pState;
/* Initialize coeff pointer */
pb = pCoeffs;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = numTaps >> 2U;
- /* Loop over the number of taps. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-4 coefficients. */
while (tapCnt > 0U)
{
/* Read the b[numTaps-1] coefficient */
- c0 = *(pb++);
+ c0 = *pb++;
/* Read x[n-numTaps-1] sample */
- x0 = *(px++);
+ x0 = *px0++;
/* Perform the multiply-accumulate */
- sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
/* Read the b[numTaps-2] coefficient */
- c0 = *(pb++);
+ c0 = *pb++;
/* Read x[n-numTaps-2] sample */
- x0 = *(px++);
+ x0 = *px0++;
/* Perform the multiply-accumulate */
- sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
/* Read the b[numTaps-3] coefficient */
- c0 = *(pb++);
+ c0 = *pb++;
/* Read x[n-numTaps-3] sample */
- x0 = *(px++);
+ x0 = *px0++;
/* Perform the multiply-accumulate */
- sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
/* Read the b[numTaps-4] coefficient */
- c0 = *(pb++);
+ c0 = *pb++;
/* Read x[n-numTaps-4] sample */
- x0 = *(px++);
+ x0 = *px0++;
/* Perform the multiply-accumulate */
- sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining taps */
tapCnt = numTaps % 0x4U;
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
/* Read coefficients */
- c0 = *(pb++);
+ c0 = *pb++;
/* Fetch 1 state variable */
- x0 = *(px++);
+ x0 = *px0++;
/* Perform the multiply-accumulate */
- sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -295,45 +334,57 @@ void arm_fir_decimate_fast_q31(
pState = pState + S->M;
/* The result is in the accumulator, store in the destination buffer. */
- *pDst++ = (q31_t) (sum0 << 1);
+ *pDst++ = (q31_t) (acc0 << 1);
- /* Decrement the loop counter */
- blkCntN2--;
+ /* Decrement loop counter */
+ blkCnt--;
}
/* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
- ** This prepares the state buffer for the next function call. */
+ Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
- pStateCurnt = S->pState;
+ pStateCur = S->pState;
- i = (numTaps - 1U) >> 2U;
+#if defined (ARM_MATH_LOOPUNROLL)
- /* copy data */
- while (i > 0U)
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ /* Copy data */
+ while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
- i--;
+ /* Decrement loop counter */
+ tapCnt--;
}
- i = (numTaps - 1U) % 0x04U;
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numTaps - 1U) % 0x04U;
- /* copy data */
- while (i > 0U)
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Copy data */
+ while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
- i--;
+ /* Decrement loop counter */
+ tapCnt--;
}
+
}
/**
- * @} end of FIR_decimate group
+ @} end of FIR_decimate group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_decimate_init_f32.c b/DSP/Source/FilteringFunctions/arm_fir_decimate_init_f32.c
index 20eb959..9382f09 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_decimate_init_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_decimate_init_f32.c
@@ -3,13 +3,13 @@
* Title: arm_fir_decimate_init_f32.c
* Description: Floating-point FIR Decimator initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,44 +29,44 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_decimate
- * @{
+ @addtogroup FIR_decimate
+ @{
*/
/**
- * @brief Initialization function for the floating-point FIR decimator.
- * @param[in,out] *S points to an instance of the floating-point FIR decimator structure.
- * @param[in] numTaps number of coefficients in the filter.
- * @param[in] M decimation factor.
- * @param[in] *pCoeffs points to the filter coefficients.
- * @param[in] *pState points to the state buffer.
- * @param[in] blockSize number of input samples to process per call.
- * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if
- * blockSize is not a multiple of M.
- *
- * Description:
- * \par
- * pCoeffs points to the array of filter coefficients stored in time reversed order:
- *
- * \par
- * pState points to the array of state variables.
- * pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples passed to arm_fir_decimate_f32().
- * M is the decimation factor.
+ @brief Initialization function for the floating-point FIR decimator.
+ @param[in,out] S points to an instance of the floating-point FIR decimator structure
+ @param[in] numTaps number of coefficients in the filter
+ @param[in] M decimation factor
+ @param[in] pCoeffs points to the filter coefficients
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of input samples to process per call
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_LENGTH_ERROR : blockSize is not a multiple of M
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ @par
+ pState points to the array of state variables.
+ pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples passed to arm_fir_decimate_f32().
+ M is the decimation factor.
*/
arm_status arm_fir_decimate_init_f32(
- arm_fir_decimate_instance_f32 * S,
- uint16_t numTaps,
- uint8_t M,
- float32_t * pCoeffs,
- float32_t * pState,
- uint32_t blockSize)
+ arm_fir_decimate_instance_f32 * S,
+ uint16_t numTaps,
+ uint8_t M,
+ const float32_t * pCoeffs,
+ float32_t * pState,
+ uint32_t blockSize)
{
arm_status status;
@@ -84,7 +84,7 @@ arm_status arm_fir_decimate_init_f32(
/* Assign coefficient pointer */
S->pCoeffs = pCoeffs;
- /* Clear state buffer and size is always (blockSize + numTaps - 1) */
+ /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */
memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float32_t));
/* Assign state pointer */
@@ -101,5 +101,5 @@ arm_status arm_fir_decimate_init_f32(
}
/**
- * @} end of FIR_decimate group
+ @} end of FIR_decimate group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q15.c b/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q15.c
index 9094de5..f583a03 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q15.c
@@ -3,13 +3,13 @@
* Title: arm_fir_decimate_init_q15.c
* Description: Initialization function for the Q15 FIR Decimator
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,47 +29,46 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_decimate
- * @{
+ @addtogroup FIR_decimate
+ @{
*/
/**
- * @brief Initialization function for the Q15 FIR decimator.
- * @param[in,out] *S points to an instance of the Q15 FIR decimator structure.
- * @param[in] numTaps number of coefficients in the filter.
- * @param[in] M decimation factor.
- * @param[in] *pCoeffs points to the filter coefficients.
- * @param[in] *pState points to the state buffer.
- * @param[in] blockSize number of input samples to process per call.
- * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if
- * blockSize is not a multiple of M.
- *
- * Description:
- * \par
- * pCoeffs points to the array of filter coefficients stored in time reversed order:
- *
- * \par
- * pState points to the array of state variables.
- * pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples
- * to the call arm_fir_decimate_q15().
- * M is the decimation factor.
+ @brief Initialization function for the Q15 FIR decimator.
+ @param[in,out] S points to an instance of the Q15 FIR decimator structure
+ @param[in] numTaps number of coefficients in the filter
+ @param[in] M decimation factor
+ @param[in] pCoeffs points to the filter coefficients
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of input samples to process
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_LENGTH_ERROR : blockSize is not a multiple of M
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ @par
+ pState points to the array of state variables.
+ pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples
+ to the call arm_fir_decimate_q15().
+ M is the decimation factor.
*/
arm_status arm_fir_decimate_init_q15(
- arm_fir_decimate_instance_q15 * S,
- uint16_t numTaps,
- uint8_t M,
- q15_t * pCoeffs,
- q15_t * pState,
- uint32_t blockSize)
+ arm_fir_decimate_instance_q15 * S,
+ uint16_t numTaps,
+ uint8_t M,
+ const q15_t * pCoeffs,
+ q15_t * pState,
+ uint32_t blockSize)
{
-
arm_status status;
/* The size of the input block must be a multiple of the decimation factor */
@@ -86,13 +85,13 @@ arm_status arm_fir_decimate_init_q15(
/* Assign coefficient pointer */
S->pCoeffs = pCoeffs;
- /* Clear the state buffer. The size of buffer is always (blockSize + numTaps - 1) */
+ /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */
memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q15_t));
/* Assign state pointer */
S->pState = pState;
- /* Assign Decimation factor */
+ /* Assign Decimation Factor */
S->M = M;
status = ARM_MATH_SUCCESS;
@@ -103,5 +102,5 @@ arm_status arm_fir_decimate_init_q15(
}
/**
- * @} end of FIR_decimate group
+ @} end of FIR_decimate group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q31.c b/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q31.c
index a223a8e..5ee69c6 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q31.c
@@ -3,13 +3,13 @@
* Title: arm_fir_decimate_init_q31.c
* Description: Initialization function for Q31 FIR Decimation filter
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,44 +29,44 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_decimate
- * @{
+ @addtogroup FIR_decimate
+ @{
*/
/**
- * @brief Initialization function for the Q31 FIR decimator.
- * @param[in,out] *S points to an instance of the Q31 FIR decimator structure.
- * @param[in] numTaps number of coefficients in the filter.
- * @param[in] M decimation factor.
- * @param[in] *pCoeffs points to the filter coefficients.
- * @param[in] *pState points to the state buffer.
- * @param[in] blockSize number of input samples to process per call.
- * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if
- * blockSize is not a multiple of M.
- *
- * Description:
- * \par
- * pCoeffs points to the array of filter coefficients stored in time reversed order:
- *
- * \par
- * pState points to the array of state variables.
- * pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples passed to arm_fir_decimate_q31().
- * M is the decimation factor.
+ @brief Initialization function for the Q31 FIR decimator.
+ @param[in,out] S points to an instance of the Q31 FIR decimator structure
+ @param[in] numTaps number of coefficients in the filter
+ @param[in] M decimation factor
+ @param[in] pCoeffs points to the filter coefficients
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of input samples to process
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_LENGTH_ERROR : blockSize is not a multiple of M
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
-* \par
-* \image html FIR.gif "Finite Impulse Response filter"
-* \par
-* pCoeffs points to a coefficient array of size numTaps.
-* Coefficients are stored in time reversed order.
-* \par
-*
-* \par
-* pState points to a state array of size numTaps + blockSize - 1.
-* Samples in the state buffer are stored in the following order.
-* \par
-*
-* \par
-* Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1.
-* The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters,
-* to be avoided and yields a significant speed improvement.
-* The state variables are updated after each block of data is processed; the coefficients are untouched.
-* \par Instance Structure
-* The coefficients and state variables for a filter are stored together in an instance data structure.
-* A separate instance structure must be defined for each filter.
-* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
-* There are separate instance structure declarations for each of the 4 supported data types.
-*
-* \par Initialization Functions
-* There is also an associated initialization function for each data type.
-* The initialization function performs the following operations:
-* - Sets the values of the internal structure fields.
-* - Zeros out the values in the state buffer.
-* To do this manually without calling the init function, assign the follow subfields of the instance structure:
-* numTaps, pCoeffs, pState. Also set all of the values in pState to zero.
-*
-* \par
-* Use of the initialization function is optional.
-* However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
-* To place an instance structure into a const data section, the instance structure must be manually initialized.
-* Set the values in the state buffer to zeros before static initialization.
-* The code below statically initializes each of the 4 different data type filter instance structures
-*
-*arm_fir_instance_f32 S = {numTaps, pState, pCoeffs};
-*arm_fir_instance_q31 S = {numTaps, pState, pCoeffs};
-*arm_fir_instance_q15 S = {numTaps, pState, pCoeffs};
-*arm_fir_instance_q7 S = {numTaps, pState, pCoeffs};
-*
-*
-* where numTaps is the number of filter coefficients in the filter; pState is the address of the state buffer;
-* pCoeffs is the address of the coefficient buffer.
-*
-* \par Fixed-Point Behavior
-* Care must be taken when using the fixed-point versions of the FIR filter functions.
-* In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
-* Refer to the function specific documentation below for usage guidelines.
-*/
+ @defgroup FIR Finite Impulse Response (FIR) Filters
+
+ This set of functions implements Finite Impulse Response (FIR) filters
+ for Q7, Q15, Q31, and floating-point data types. Fast versions of Q15 and Q31 are also provided.
+ The functions operate on blocks of input and output data and each call to the function processes
+ blockSize samples through the filter. pSrc and
+ pDst points to input and output arrays containing blockSize values.
+
+ @par Algorithm
+ The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations.
+ Each filter coefficient b[n] is multiplied by a state variable which equals a previous input sample x[n].
+
+ @par
+ \image html FIR.GIF "Finite Impulse Response filter"
+ @par
+ pCoeffs points to a coefficient array of size numTaps.
+ Coefficients are stored in time reversed order.
+ @par
+
+ @par
+ Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1.
+ The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters,
+ to be avoided and yields a significant speed improvement.
+ The state variables are updated after each block of data is processed; the coefficients are untouched.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter.
+ Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ There are separate instance structure declarations for each of the 4 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ To do this manually without calling the init function, assign the follow subfields of the instance structure:
+ numTaps, pCoeffs, pState. Also set all of the values in pState to zero.
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ Set the values in the state buffer to zeros before static initialization.
+ The code below statically initializes each of the 4 different data type filter instance structures
+
+ arm_fir_instance_f32 S = {numTaps, pState, pCoeffs};
+ arm_fir_instance_q31 S = {numTaps, pState, pCoeffs};
+ arm_fir_instance_q15 S = {numTaps, pState, pCoeffs};
+ arm_fir_instance_q7 S = {numTaps, pState, pCoeffs};
+
+ where numTaps is the number of filter coefficients in the filter; pState is the address of the state buffer;
+ pCoeffs is the address of the coefficient buffer.
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the fixed-point versions of the FIR filter functions.
+ In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ Refer to the function specific documentation below for usage guidelines.
+ */
/**
-* @addtogroup FIR
-* @{
-*/
+ @addtogroup FIR
+ @{
+ */
/**
-*
-* @param[in] *S points to an instance of the floating-point FIR filter structure.
-* @param[in] *pSrc points to the block of input data.
-* @param[out] *pDst points to the block of output data.
-* @param[in] blockSize number of samples to process per call.
-* @return none.
-*
-*/
-
-#if defined(ARM_MATH_CM7)
+ @brief Processing function for floating-point FIR filter.
+ @param[in] S points to an instance of the floating-point FIR filter structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+ */
+#if defined(ARM_MATH_NEON)
void arm_fir_f32(
const arm_fir_instance_f32 * S,
-float32_t * pSrc,
+const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
float32_t *pState = S->pState; /* State pointer */
- float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float32_t *pStateCurnt; /* Points to the current sample of the state */
- float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
- float32_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* Accumulators */
- float32_t x0, x1, x2, x3, x4, x5, x6, x7, c0; /* Temporary variables to hold state and coefficient values */
+ float32_t *px; /* Temporary pointers for state buffer */
+ const float32_t *pb; /* Temporary pointers for coefficient buffer */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t i, tapCnt, blkCnt; /* Loop counters */
- /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = &(S->pState[(numTaps - 1U)]);
-
- /* Apply loop unrolling and compute 8 output values simultaneously.
- * The variables acc0 ... acc7 hold output values that are being computed:
- *
- * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
- * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
- * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
- * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
- */
- blkCnt = blockSize >> 3;
-
- /* First part of the processing with loop unrolling. Compute 8 outputs at a time.
- ** a second loop below computes the remaining 1 to 7 samples. */
- while (blkCnt > 0U)
- {
- /* Copy four new input samples into the state buffer */
- *pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
-
- /* Set all accumulators to zero */
- acc0 = 0.0f;
- acc1 = 0.0f;
- acc2 = 0.0f;
- acc3 = 0.0f;
- acc4 = 0.0f;
- acc5 = 0.0f;
- acc6 = 0.0f;
- acc7 = 0.0f;
-
- /* Initialize state pointer */
- px = pState;
-
- /* Initialize coeff pointer */
- pb = (pCoeffs);
-
- /* This is separated from the others to avoid
- * a call to __aeabi_memmove which would be slower
- */
- *pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
-
- /* Read the first seven samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
- x0 = *px++;
- x1 = *px++;
- x2 = *px++;
- x3 = *px++;
- x4 = *px++;
- x5 = *px++;
- x6 = *px++;
-
- /* Loop unrolling. Process 8 taps at a time. */
- tapCnt = numTaps >> 3U;
-
- /* Loop over the number of taps. Unroll by a factor of 8.
- ** Repeat until we've computed numTaps-8 coefficients. */
- while (tapCnt > 0U)
- {
- /* Read the b[numTaps-1] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-3] sample */
- x7 = *(px++);
-
- /* acc0 += b[numTaps-1] * x[n-numTaps] */
- acc0 += x0 * c0;
-
- /* acc1 += b[numTaps-1] * x[n-numTaps-1] */
- acc1 += x1 * c0;
-
- /* acc2 += b[numTaps-1] * x[n-numTaps-2] */
- acc2 += x2 * c0;
-
- /* acc3 += b[numTaps-1] * x[n-numTaps-3] */
- acc3 += x3 * c0;
-
- /* acc4 += b[numTaps-1] * x[n-numTaps-4] */
- acc4 += x4 * c0;
-
- /* acc1 += b[numTaps-1] * x[n-numTaps-5] */
- acc5 += x5 * c0;
-
- /* acc2 += b[numTaps-1] * x[n-numTaps-6] */
- acc6 += x6 * c0;
-
- /* acc3 += b[numTaps-1] * x[n-numTaps-7] */
- acc7 += x7 * c0;
-
- /* Read the b[numTaps-2] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-4] sample */
- x0 = *(px++);
-
- /* Perform the multiply-accumulate */
- acc0 += x1 * c0;
- acc1 += x2 * c0;
- acc2 += x3 * c0;
- acc3 += x4 * c0;
- acc4 += x5 * c0;
- acc5 += x6 * c0;
- acc6 += x7 * c0;
- acc7 += x0 * c0;
-
- /* Read the b[numTaps-3] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-5] sample */
- x1 = *(px++);
-
- /* Perform the multiply-accumulates */
- acc0 += x2 * c0;
- acc1 += x3 * c0;
- acc2 += x4 * c0;
- acc3 += x5 * c0;
- acc4 += x6 * c0;
- acc5 += x7 * c0;
- acc6 += x0 * c0;
- acc7 += x1 * c0;
-
- /* Read the b[numTaps-4] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-6] sample */
- x2 = *(px++);
-
- /* Perform the multiply-accumulates */
- acc0 += x3 * c0;
- acc1 += x4 * c0;
- acc2 += x5 * c0;
- acc3 += x6 * c0;
- acc4 += x7 * c0;
- acc5 += x0 * c0;
- acc6 += x1 * c0;
- acc7 += x2 * c0;
-
- /* Read the b[numTaps-4] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-6] sample */
- x3 = *(px++);
- /* Perform the multiply-accumulates */
- acc0 += x4 * c0;
- acc1 += x5 * c0;
- acc2 += x6 * c0;
- acc3 += x7 * c0;
- acc4 += x0 * c0;
- acc5 += x1 * c0;
- acc6 += x2 * c0;
- acc7 += x3 * c0;
-
- /* Read the b[numTaps-4] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-6] sample */
- x4 = *(px++);
-
- /* Perform the multiply-accumulates */
- acc0 += x5 * c0;
- acc1 += x6 * c0;
- acc2 += x7 * c0;
- acc3 += x0 * c0;
- acc4 += x1 * c0;
- acc5 += x2 * c0;
- acc6 += x3 * c0;
- acc7 += x4 * c0;
-
- /* Read the b[numTaps-4] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-6] sample */
- x5 = *(px++);
-
- /* Perform the multiply-accumulates */
- acc0 += x6 * c0;
- acc1 += x7 * c0;
- acc2 += x0 * c0;
- acc3 += x1 * c0;
- acc4 += x2 * c0;
- acc5 += x3 * c0;
- acc6 += x4 * c0;
- acc7 += x5 * c0;
-
- /* Read the b[numTaps-4] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-6] sample */
- x6 = *(px++);
-
- /* Perform the multiply-accumulates */
- acc0 += x7 * c0;
- acc1 += x0 * c0;
- acc2 += x1 * c0;
- acc3 += x2 * c0;
- acc4 += x3 * c0;
- acc5 += x4 * c0;
- acc6 += x5 * c0;
- acc7 += x6 * c0;
-
- tapCnt--;
- }
-
- /* If the filter length is not a multiple of 8, compute the remaining filter taps */
- tapCnt = numTaps % 0x8U;
-
- while (tapCnt > 0U)
- {
- /* Read coefficients */
- c0 = *(pb++);
-
- /* Fetch 1 state variable */
- x7 = *(px++);
-
- /* Perform the multiply-accumulates */
- acc0 += x0 * c0;
- acc1 += x1 * c0;
- acc2 += x2 * c0;
- acc3 += x3 * c0;
- acc4 += x4 * c0;
- acc5 += x5 * c0;
- acc6 += x6 * c0;
- acc7 += x7 * c0;
-
- /* Reuse the present sample states for next sample */
- x0 = x1;
- x1 = x2;
- x2 = x3;
- x3 = x4;
- x4 = x5;
- x5 = x6;
- x6 = x7;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Advance the state pointer by 8 to process the next group of 8 samples */
- pState = pState + 8;
-
- /* The results in the 8 accumulators, store in the destination buffer. */
- *pDst++ = acc0;
- *pDst++ = acc1;
- *pDst++ = acc2;
- *pDst++ = acc3;
- *pDst++ = acc4;
- *pDst++ = acc5;
- *pDst++ = acc6;
- *pDst++ = acc7;
-
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x8U;
-
- while (blkCnt > 0U)
- {
- /* Copy one sample at a time into state buffer */
- *pStateCurnt++ = *pSrc++;
-
- /* Set the accumulator to zero */
- acc0 = 0.0f;
-
- /* Initialize state pointer */
- px = pState;
-
- /* Initialize Coefficient pointer */
- pb = (pCoeffs);
-
- i = numTaps;
-
- /* Perform the multiply-accumulates */
- do
- {
- acc0 += *px++ * *pb++;
- i--;
-
- } while (i > 0U);
-
- /* The result is store in the destination buffer. */
- *pDst++ = acc0;
-
- /* Advance state pointer by 1 for the next sample */
- pState = pState + 1;
-
- blkCnt--;
- }
-
- /* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the start of the state buffer.
- ** This prepares the state buffer for the next function call. */
-
- /* Points to the start of the state buffer */
- pStateCurnt = S->pState;
-
- tapCnt = (numTaps - 1U) >> 2U;
-
- /* copy data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Calculate remaining number of copies */
- tapCnt = (numTaps - 1U) % 0x4U;
-
- /* Copy the remaining q31_t data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-}
-
-#elif defined(ARM_MATH_CM0_FAMILY)
-
-void arm_fir_f32(
-const arm_fir_instance_f32 * S,
-float32_t * pSrc,
-float32_t * pDst,
-uint32_t blockSize)
-{
- float32_t *pState = S->pState; /* State pointer */
- float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- float32_t *pStateCurnt; /* Points to the current sample of the state */
- float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
- uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
- uint32_t i, tapCnt, blkCnt; /* Loop counters */
-
- /* Run the below code for Cortex-M0 */
-
+ float32x4_t accv0,accv1,samples0,samples1,x0,x1,x2,xa,xb,x,b,accv;
+ uint32x4_t x0_u,x1_u,x2_u,xa_u,xb_u;
float32_t acc;
/* S->pState points to state array which contains previous frame (numTaps - 1) samples */
/* pStateCurnt points to the location where the new input data should be written */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
- /* Initialize blkCnt with blockSize */
- blkCnt = blockSize;
+ /* Loop unrolling */
+ blkCnt = blockSize >> 3;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy 8 samples at a time into state buffers */
+ samples0 = vld1q_f32(pSrc);
+ vst1q_f32(pStateCurnt,samples0);
+
+ pStateCurnt += 4;
+ pSrc += 4 ;
+
+ samples1 = vld1q_f32(pSrc);
+ vst1q_f32(pStateCurnt,samples1);
+
+ pStateCurnt += 4;
+ pSrc += 4 ;
+
+ /* Set the accumulators to zero */
+ accv0 = vdupq_n_f32(0);
+ accv1 = vdupq_n_f32(0);
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* Loop unroling */
+ i = numTaps >> 2;
+
+ /* Perform the multiply-accumulates */
+ x0 = vld1q_f32(px);
+ x1 = vld1q_f32(px + 4);
+
+ while(i > 0)
+ {
+ /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
+ x2 = vld1q_f32(px + 8);
+ b = vld1q_f32(pb);
+ xa = x0;
+ xb = x1;
+ accv0 = vmlaq_n_f32(accv0,xa,b[0]);
+ accv1 = vmlaq_n_f32(accv1,xb,b[0]);
+
+ xa = vextq_f32(x0,x1,1);
+ xb = vextq_f32(x1,x2,1);
+
+ accv0 = vmlaq_n_f32(accv0,xa,b[1]);
+ accv1 = vmlaq_n_f32(accv1,xb,b[1]);
+
+ xa = vextq_f32(x0,x1,2);
+ xb = vextq_f32(x1,x2,2);
+
+ accv0 = vmlaq_n_f32(accv0,xa,b[2]);
+ accv1 = vmlaq_n_f32(accv1,xb,b[2]);
+
+ xa = vextq_f32(x0,x1,3);
+ xb = vextq_f32(x1,x2,3);
+
+ accv0 = vmlaq_n_f32(accv0,xa,b[3]);
+ accv1 = vmlaq_n_f32(accv1,xb,b[3]);
+
+ pb += 4;
+ x0 = x1;
+ x1 = x2;
+ px += 4;
+ i--;
+
+ }
+
+ /* Tail */
+ i = numTaps & 3;
+ x2 = vld1q_f32(px + 8);
+
+ /* Perform the multiply-accumulates */
+ switch(i)
+ {
+ case 3:
+ {
+ accv0 = vmlaq_n_f32(accv0,x0,*pb);
+ accv1 = vmlaq_n_f32(accv1,x1,*pb);
+
+ pb++;
+
+ xa = vextq_f32(x0,x1,1);
+ xb = vextq_f32(x1,x2,1);
+
+ accv0 = vmlaq_n_f32(accv0,xa,*pb);
+ accv1 = vmlaq_n_f32(accv1,xb,*pb);
+
+ pb++;
+
+ xa = vextq_f32(x0,x1,2);
+ xb = vextq_f32(x1,x2,2);
+
+ accv0 = vmlaq_n_f32(accv0,xa,*pb);
+ accv1 = vmlaq_n_f32(accv1,xb,*pb);
+
+ }
+ break;
+ case 2:
+ {
+ accv0 = vmlaq_n_f32(accv0,x0,*pb);
+ accv1 = vmlaq_n_f32(accv1,x1,*pb);
+
+ pb++;
+
+ xa = vextq_f32(x0,x1,1);
+ xb = vextq_f32(x1,x2,1);
+
+ accv0 = vmlaq_n_f32(accv0,xa,*pb);
+ accv1 = vmlaq_n_f32(accv1,xb,*pb);
+
+ }
+ break;
+ case 1:
+ {
+
+ accv0 = vmlaq_n_f32(accv0,x0,*pb);
+ accv1 = vmlaq_n_f32(accv1,x1,*pb);
+
+ }
+ break;
+ default:
+ break;
+ }
+
+ /* The result is stored in the destination buffer. */
+ vst1q_f32(pDst,accv0);
+ pDst += 4;
+ vst1q_f32(pDst,accv1);
+ pDst += 4;
+
+ /* Advance state pointer by 8 for the next 8 samples */
+ pState = pState + 8;
+
+ blkCnt--;
+ }
+
+ /* Tail */
+ blkCnt = blockSize & 0x7;
while (blkCnt > 0U)
{
@@ -516,7 +308,7 @@ uint32_t blockSize)
} while (i > 0U);
- /* The result is store in the destination buffer. */
+ /* The result is stored in the destination buffer. */
*pDst++ = acc;
/* Advance state pointer by 1 for the next sample */
@@ -545,441 +337,379 @@ uint32_t blockSize)
}
}
+#else
+void arm_fir_f32(
+ const arm_fir_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *px; /* Temporary pointer for state buffer */
+ const float32_t *pb; /* Temporary pointer for coefficient buffer */
+ float32_t acc0; /* Accumulator */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt; /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ float32_t acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* Accumulators */
+ float32_t x0, x1, x2, x3, x4, x5, x6, x7; /* Temporary variables to hold state values */
+ float32_t c0; /* Temporary variable to hold coefficient value */
+#endif
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 8 output values simultaneously.
+ * The variables acc0 ... acc7 hold output values that are being computed:
+ *
+ * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
+ */
+
+ blkCnt = blockSize >> 3U;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy 4 new input samples into the state buffer. */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set all accumulators to zero */
+ acc0 = 0.0f;
+ acc1 = 0.0f;
+ acc2 = 0.0f;
+ acc3 = 0.0f;
+ acc4 = 0.0f;
+ acc5 = 0.0f;
+ acc6 = 0.0f;
+ acc7 = 0.0f;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* This is separated from the others to avoid
+ * a call to __aeabi_memmove which would be slower
+ */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+ /* Read the first 7 samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
+ x3 = *px++;
+ x4 = *px++;
+ x5 = *px++;
+ x6 = *px++;
+
+ /* Loop unrolling: process 8 taps at a time. */
+ tapCnt = numTaps >> 3U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read the b[numTaps-1] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-3] sample */
+ x7 = *(px++);
+
+ /* acc0 += b[numTaps-1] * x[n-numTaps] */
+ acc0 += x0 * c0;
+
+ /* acc1 += b[numTaps-1] * x[n-numTaps-1] */
+ acc1 += x1 * c0;
+
+ /* acc2 += b[numTaps-1] * x[n-numTaps-2] */
+ acc2 += x2 * c0;
+
+ /* acc3 += b[numTaps-1] * x[n-numTaps-3] */
+ acc3 += x3 * c0;
+
+ /* acc4 += b[numTaps-1] * x[n-numTaps-4] */
+ acc4 += x4 * c0;
+
+ /* acc1 += b[numTaps-1] * x[n-numTaps-5] */
+ acc5 += x5 * c0;
+
+ /* acc2 += b[numTaps-1] * x[n-numTaps-6] */
+ acc6 += x6 * c0;
+
+ /* acc3 += b[numTaps-1] * x[n-numTaps-7] */
+ acc7 += x7 * c0;
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-4] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x1 * c0;
+ acc1 += x2 * c0;
+ acc2 += x3 * c0;
+ acc3 += x4 * c0;
+ acc4 += x5 * c0;
+ acc5 += x6 * c0;
+ acc6 += x7 * c0;
+ acc7 += x0 * c0;
+
+ /* Read the b[numTaps-3] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-5] sample */
+ x1 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x2 * c0;
+ acc1 += x3 * c0;
+ acc2 += x4 * c0;
+ acc3 += x5 * c0;
+ acc4 += x6 * c0;
+ acc5 += x7 * c0;
+ acc6 += x0 * c0;
+ acc7 += x1 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-6] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x3 * c0;
+ acc1 += x4 * c0;
+ acc2 += x5 * c0;
+ acc3 += x6 * c0;
+ acc4 += x7 * c0;
+ acc5 += x0 * c0;
+ acc6 += x1 * c0;
+ acc7 += x2 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-6] sample */
+ x3 = *(px++);
+ /* Perform the multiply-accumulates */
+ acc0 += x4 * c0;
+ acc1 += x5 * c0;
+ acc2 += x6 * c0;
+ acc3 += x7 * c0;
+ acc4 += x0 * c0;
+ acc5 += x1 * c0;
+ acc6 += x2 * c0;
+ acc7 += x3 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-6] sample */
+ x4 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x5 * c0;
+ acc1 += x6 * c0;
+ acc2 += x7 * c0;
+ acc3 += x0 * c0;
+ acc4 += x1 * c0;
+ acc5 += x2 * c0;
+ acc6 += x3 * c0;
+ acc7 += x4 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-6] sample */
+ x5 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x6 * c0;
+ acc1 += x7 * c0;
+ acc2 += x0 * c0;
+ acc3 += x1 * c0;
+ acc4 += x2 * c0;
+ acc5 += x3 * c0;
+ acc6 += x4 * c0;
+ acc7 += x5 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-6] sample */
+ x6 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x7 * c0;
+ acc1 += x0 * c0;
+ acc2 += x1 * c0;
+ acc3 += x2 * c0;
+ acc4 += x3 * c0;
+ acc5 += x4 * c0;
+ acc6 += x5 * c0;
+ acc7 += x6 * c0;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ tapCnt = numTaps % 0x8U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read coefficients */
+ c0 = *(pb++);
+
+ /* Fetch 1 state variable */
+ x7 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+ acc2 += x2 * c0;
+ acc3 += x3 * c0;
+ acc4 += x4 * c0;
+ acc5 += x5 * c0;
+ acc6 += x6 * c0;
+ acc7 += x7 * c0;
+
+ /* Reuse the present sample states for next sample */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+ x3 = x4;
+ x4 = x5;
+ x5 = x6;
+ x6 = x7;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by 8 to process the next group of 8 samples */
+ pState = pState + 8;
+
+ /* The results in the 8 accumulators, store in the destination buffer. */
+ *pDst++ = acc0;
+ *pDst++ = acc1;
+ *pDst++ = acc2;
+ *pDst++ = acc3;
+ *pDst++ = acc4;
+ *pDst++ = acc5;
+ *pDst++ = acc6;
+ *pDst++ = acc7;
+
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining output samples */
+ blkCnt = blockSize % 0x8U;
#else
-/* Run the below code for Cortex-M4 and Cortex-M3 */
+ /* Initialize blkCnt with number of taps */
+ blkCnt = blockSize;
-void arm_fir_f32(
-const arm_fir_instance_f32 * S,
-float32_t * pSrc,
-float32_t * pDst,
-uint32_t blockSize)
-{
- float32_t *pState = S->pState; /* State pointer */
- float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- float32_t *pStateCurnt; /* Points to the current sample of the state */
- float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
- float32_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* Accumulators */
- float32_t x0, x1, x2, x3, x4, x5, x6, x7, c0; /* Temporary variables to hold state and coefficient values */
- uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
- uint32_t i, tapCnt, blkCnt; /* Loop counters */
- float32_t p0,p1,p2,p3,p4,p5,p6,p7; /* Temporary product values */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
- /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = &(S->pState[(numTaps - 1U)]);
+ while (blkCnt > 0U)
+ {
+ /* Copy one sample at a time into state buffer */
+ *pStateCurnt++ = *pSrc++;
- /* Apply loop unrolling and compute 8 output values simultaneously.
- * The variables acc0 ... acc7 hold output values that are being computed:
- *
- * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
- * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
- * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
- * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
- */
- blkCnt = blockSize >> 3;
+ /* Set the accumulator to zero */
+ acc0 = 0.0f;
- /* First part of the processing with loop unrolling. Compute 8 outputs at a time.
- ** a second loop below computes the remaining 1 to 7 samples. */
- while (blkCnt > 0U)
- {
- /* Copy four new input samples into the state buffer */
- *pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
+ /* Initialize state pointer */
+ px = pState;
- /* Set all accumulators to zero */
- acc0 = 0.0f;
- acc1 = 0.0f;
- acc2 = 0.0f;
- acc3 = 0.0f;
- acc4 = 0.0f;
- acc5 = 0.0f;
- acc6 = 0.0f;
- acc7 = 0.0f;
+ /* Initialize Coefficient pointer */
+ pb = pCoeffs;
- /* Initialize state pointer */
- px = pState;
+ i = numTaps;
- /* Initialize coeff pointer */
- pb = (pCoeffs);
+ /* Perform the multiply-accumulates */
+ do
+ {
+ /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
+ acc0 += *px++ * *pb++;
- /* This is separated from the others to avoid
- * a call to __aeabi_memmove which would be slower
- */
- *pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
+ i--;
+ } while (i > 0U);
- /* Read the first seven samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
- x0 = *px++;
- x1 = *px++;
- x2 = *px++;
- x3 = *px++;
- x4 = *px++;
- x5 = *px++;
- x6 = *px++;
+ /* Store result in destination buffer. */
+ *pDst++ = acc0;
- /* Loop unrolling. Process 8 taps at a time. */
- tapCnt = numTaps >> 3U;
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1U;
- /* Loop over the number of taps. Unroll by a factor of 8.
- ** Repeat until we've computed numTaps-8 coefficients. */
- while (tapCnt > 0U)
- {
- /* Read the b[numTaps-1] coefficient */
- c0 = *(pb++);
+ /* Decrement loop counter */
+ blkCnt--;
+ }
- /* Read x[n-numTaps-3] sample */
- x7 = *(px++);
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
- /* acc0 += b[numTaps-1] * x[n-numTaps] */
- p0 = x0 * c0;
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
- /* acc1 += b[numTaps-1] * x[n-numTaps-1] */
- p1 = x1 * c0;
+#if defined (ARM_MATH_LOOPUNROLL)
- /* acc2 += b[numTaps-1] * x[n-numTaps-2] */
- p2 = x2 * c0;
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = (numTaps - 1U) >> 2U;
- /* acc3 += b[numTaps-1] * x[n-numTaps-3] */
- p3 = x3 * c0;
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
- /* acc4 += b[numTaps-1] * x[n-numTaps-4] */
- p4 = x4 * c0;
+ /* Decrement loop counter */
+ tapCnt--;
+ }
- /* acc1 += b[numTaps-1] * x[n-numTaps-5] */
- p5 = x5 * c0;
+ /* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1U) % 0x4U;
- /* acc2 += b[numTaps-1] * x[n-numTaps-6] */
- p6 = x6 * c0;
+#else
- /* acc3 += b[numTaps-1] * x[n-numTaps-7] */
- p7 = x7 * c0;
+ /* Initialize tapCnt with number of taps */
+ tapCnt = (numTaps - 1U);
- /* Read the b[numTaps-2] coefficient */
- c0 = *(pb++);
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
- /* Read x[n-numTaps-4] sample */
- x0 = *(px++);
+ /* Copy remaining data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
- acc0 += p0;
- acc1 += p1;
- acc2 += p2;
- acc3 += p3;
- acc4 += p4;
- acc5 += p5;
- acc6 += p6;
- acc7 += p7;
+ /* Decrement loop counter */
+ tapCnt--;
+ }
-
- /* Perform the multiply-accumulate */
- p0 = x1 * c0;
- p1 = x2 * c0;
- p2 = x3 * c0;
- p3 = x4 * c0;
- p4 = x5 * c0;
- p5 = x6 * c0;
- p6 = x7 * c0;
- p7 = x0 * c0;
-
- /* Read the b[numTaps-3] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-5] sample */
- x1 = *(px++);
-
- acc0 += p0;
- acc1 += p1;
- acc2 += p2;
- acc3 += p3;
- acc4 += p4;
- acc5 += p5;
- acc6 += p6;
- acc7 += p7;
-
- /* Perform the multiply-accumulates */
- p0 = x2 * c0;
- p1 = x3 * c0;
- p2 = x4 * c0;
- p3 = x5 * c0;
- p4 = x6 * c0;
- p5 = x7 * c0;
- p6 = x0 * c0;
- p7 = x1 * c0;
-
- /* Read the b[numTaps-4] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-6] sample */
- x2 = *(px++);
-
- acc0 += p0;
- acc1 += p1;
- acc2 += p2;
- acc3 += p3;
- acc4 += p4;
- acc5 += p5;
- acc6 += p6;
- acc7 += p7;
-
- /* Perform the multiply-accumulates */
- p0 = x3 * c0;
- p1 = x4 * c0;
- p2 = x5 * c0;
- p3 = x6 * c0;
- p4 = x7 * c0;
- p5 = x0 * c0;
- p6 = x1 * c0;
- p7 = x2 * c0;
-
- /* Read the b[numTaps-4] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-6] sample */
- x3 = *(px++);
-
- acc0 += p0;
- acc1 += p1;
- acc2 += p2;
- acc3 += p3;
- acc4 += p4;
- acc5 += p5;
- acc6 += p6;
- acc7 += p7;
-
- /* Perform the multiply-accumulates */
- p0 = x4 * c0;
- p1 = x5 * c0;
- p2 = x6 * c0;
- p3 = x7 * c0;
- p4 = x0 * c0;
- p5 = x1 * c0;
- p6 = x2 * c0;
- p7 = x3 * c0;
-
- /* Read the b[numTaps-4] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-6] sample */
- x4 = *(px++);
-
- acc0 += p0;
- acc1 += p1;
- acc2 += p2;
- acc3 += p3;
- acc4 += p4;
- acc5 += p5;
- acc6 += p6;
- acc7 += p7;
-
- /* Perform the multiply-accumulates */
- p0 = x5 * c0;
- p1 = x6 * c0;
- p2 = x7 * c0;
- p3 = x0 * c0;
- p4 = x1 * c0;
- p5 = x2 * c0;
- p6 = x3 * c0;
- p7 = x4 * c0;
-
- /* Read the b[numTaps-4] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-6] sample */
- x5 = *(px++);
-
- acc0 += p0;
- acc1 += p1;
- acc2 += p2;
- acc3 += p3;
- acc4 += p4;
- acc5 += p5;
- acc6 += p6;
- acc7 += p7;
-
- /* Perform the multiply-accumulates */
- p0 = x6 * c0;
- p1 = x7 * c0;
- p2 = x0 * c0;
- p3 = x1 * c0;
- p4 = x2 * c0;
- p5 = x3 * c0;
- p6 = x4 * c0;
- p7 = x5 * c0;
-
- /* Read the b[numTaps-4] coefficient */
- c0 = *(pb++);
-
- /* Read x[n-numTaps-6] sample */
- x6 = *(px++);
-
- acc0 += p0;
- acc1 += p1;
- acc2 += p2;
- acc3 += p3;
- acc4 += p4;
- acc5 += p5;
- acc6 += p6;
- acc7 += p7;
-
- /* Perform the multiply-accumulates */
- p0 = x7 * c0;
- p1 = x0 * c0;
- p2 = x1 * c0;
- p3 = x2 * c0;
- p4 = x3 * c0;
- p5 = x4 * c0;
- p6 = x5 * c0;
- p7 = x6 * c0;
-
- tapCnt--;
-
- acc0 += p0;
- acc1 += p1;
- acc2 += p2;
- acc3 += p3;
- acc4 += p4;
- acc5 += p5;
- acc6 += p6;
- acc7 += p7;
- }
-
- /* If the filter length is not a multiple of 8, compute the remaining filter taps */
- tapCnt = numTaps % 0x8U;
-
- while (tapCnt > 0U)
- {
- /* Read coefficients */
- c0 = *(pb++);
-
- /* Fetch 1 state variable */
- x7 = *(px++);
-
- /* Perform the multiply-accumulates */
- p0 = x0 * c0;
- p1 = x1 * c0;
- p2 = x2 * c0;
- p3 = x3 * c0;
- p4 = x4 * c0;
- p5 = x5 * c0;
- p6 = x6 * c0;
- p7 = x7 * c0;
-
- /* Reuse the present sample states for next sample */
- x0 = x1;
- x1 = x2;
- x2 = x3;
- x3 = x4;
- x4 = x5;
- x5 = x6;
- x6 = x7;
-
- acc0 += p0;
- acc1 += p1;
- acc2 += p2;
- acc3 += p3;
- acc4 += p4;
- acc5 += p5;
- acc6 += p6;
- acc7 += p7;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Advance the state pointer by 8 to process the next group of 8 samples */
- pState = pState + 8;
-
- /* The results in the 8 accumulators, store in the destination buffer. */
- *pDst++ = acc0;
- *pDst++ = acc1;
- *pDst++ = acc2;
- *pDst++ = acc3;
- *pDst++ = acc4;
- *pDst++ = acc5;
- *pDst++ = acc6;
- *pDst++ = acc7;
-
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x8U;
-
- while (blkCnt > 0U)
- {
- /* Copy one sample at a time into state buffer */
- *pStateCurnt++ = *pSrc++;
-
- /* Set the accumulator to zero */
- acc0 = 0.0f;
-
- /* Initialize state pointer */
- px = pState;
-
- /* Initialize Coefficient pointer */
- pb = (pCoeffs);
-
- i = numTaps;
-
- /* Perform the multiply-accumulates */
- do
- {
- acc0 += *px++ * *pb++;
- i--;
-
- } while (i > 0U);
-
- /* The result is store in the destination buffer. */
- *pDst++ = acc0;
-
- /* Advance state pointer by 1 for the next sample */
- pState = pState + 1;
-
- blkCnt--;
- }
-
- /* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the start of the state buffer.
- ** This prepares the state buffer for the next function call. */
-
- /* Points to the start of the state buffer */
- pStateCurnt = S->pState;
-
- tapCnt = (numTaps - 1U) >> 2U;
-
- /* copy data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Calculate remaining number of copies */
- tapCnt = (numTaps - 1U) % 0x4U;
-
- /* Copy the remaining q31_t data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
}
-#endif
-
+#endif /* #if defined(ARM_MATH_NEON) */
/**
* @} end of FIR group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_fast_q15.c b/DSP/Source/FilteringFunctions/arm_fir_fast_q15.c
index 212990c..5f8df95 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_fast_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_fast_q15.c
@@ -3,13 +3,13 @@
* Title: arm_fir_fast_q15.c
* Description: Q15 Fast FIR filter processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,78 +29,78 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR
- * @{
+ @addtogroup FIR
+ @{
*/
/**
- * @param[in] *S points to an instance of the Q15 FIR filter structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in] blockSize number of samples to process per call.
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * This fast version uses a 32-bit accumulator with 2.30 format.
- * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * Thus, if the accumulator result overflows it wraps around and distorts the result.
- * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
- * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
- *
- * \par
- * Refer to the function arm_fir_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure.
- * Use the function arm_fir_init_q15() to initialize the filter structure.
+ @brief Processing function for the Q15 FIR filter (fast version).
+ @param[in] S points to an instance of the Q15 FIR filter structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This fast version uses a 32-bit accumulator with 2.30 format.
+ The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around and distorts the result.
+ In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
+ The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
+
+ @remark
+ Refer to \ref arm_fir_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure.
+ Use function \ref arm_fir_init_q15() to initialize the filter structure.
*/
void arm_fir_fast_q15(
const arm_fir_instance_q15 * S,
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t blockSize)
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
{
- q15_t *pState = S->pState; /* State pointer */
- q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q15_t *pStateCurnt; /* Points to the current sample of the state */
- q31_t acc0, acc1, acc2, acc3; /* Accumulators */
- q15_t *pb; /* Temporary pointer for coefficient buffer */
- q15_t *px; /* Temporary q31 pointer for SIMD state buffer accesses */
- q31_t x0, x1, x2, c0; /* Temporary variables to hold SIMD state and coefficient values */
- uint32_t numTaps = S->numTaps; /* Number of taps in the filter */
- uint32_t tapCnt, blkCnt; /* Loop counters */
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q15_t *px; /* Temporary pointer for state buffer */
+ const q15_t *pb; /* Temporary pointer for coefficient buffer */
+ q31_t acc0; /* Accumulators */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc1, acc2, acc3; /* Accumulators */
+ q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */
+#endif
/* S->pState points to state array which contains previous frame (numTaps - 1) samples */
/* pStateCurnt points to the location where the new input data should be written */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
- /* Apply loop unrolling and compute 4 output values simultaneously.
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 output values simultaneously.
* The variables acc0 ... acc3 hold output values that are being computed:
*
* acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
- * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
- * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
* acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
*/
+ blkCnt = blockSize >> 2U;
- blkCnt = blockSize >> 2;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* Copy four new input samples into the state buffer.
- ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */
+ /* Copy 4 new input samples into the state buffer. */
*pStateCurnt++ = *pSrc++;
*pStateCurnt++ = *pSrc++;
*pStateCurnt++ = *pSrc++;
*pStateCurnt++ = *pSrc++;
-
/* Set all accumulators to zero */
acc0 = 0;
acc1 = 0;
@@ -114,19 +114,19 @@ void arm_fir_fast_q15(
pb = pCoeffs;
/* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */
- x0 = *__SIMD32(px)++;
+ x0 = read_q15x2_ia (&px);
/* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */
- x2 = *__SIMD32(px)++;
+ x2 = read_q15x2_ia (&px);
/* Loop over the number of taps. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */
- tapCnt = numTaps >> 2;
+ Repeat until we've computed numTaps-(numTaps%4) coefficients. */
+ tapCnt = numTaps >> 2U;
- while (tapCnt > 0)
+ while (tapCnt > 0U)
{
/* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */
- c0 = *__SIMD32(pb)++;
+ c0 = read_q15x2_ia ((q15_t **) &pb);
/* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
acc0 = __SMLAD(x0, c0, acc0);
@@ -142,7 +142,7 @@ void arm_fir_fast_q15(
#endif
/* Read state x[n-N-4], x[n-N-5] */
- x0 = _SIMD32_OFFSET(px);
+ x0 = read_q15x2_ia (&px);
/* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
acc1 = __SMLADX(x1, c0, acc1);
@@ -158,13 +158,13 @@ void arm_fir_fast_q15(
acc3 = __SMLADX(x1, c0, acc3);
/* Read coefficients b[N-2], b[N-3] */
- c0 = *__SIMD32(pb)++;
+ c0 = read_q15x2_ia ((q15_t **) &pb);
/* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
acc0 = __SMLAD(x2, c0, acc0);
/* Read state x[n-N-6], x[n-N-7] with offset */
- x2 = _SIMD32_OFFSET(px + 2U);
+ x2 = read_q15x2_ia (&px);
/* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
acc2 = __SMLAD(x0, c0, acc2);
@@ -182,21 +182,16 @@ void arm_fir_fast_q15(
/* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
acc3 = __SMLADX(x1, c0, acc3);
- /* Update state pointer for next state reading */
- px += 4U;
-
/* Decrement tap count */
tapCnt--;
-
}
/* If the filter length is not a multiple of 4, compute the remaining filter taps.
- ** This is always be 2 taps since the filter length is even. */
+ This is always be 2 taps since the filter length is even. */
if ((numTaps & 0x3U) != 0U)
{
-
/* Read last two coefficients */
- c0 = *__SIMD32(pb)++;
+ c0 = read_q15x2_ia ((q15_t **) &pb);
/* Perform the multiply-accumulates */
acc0 = __SMLAD(x0, c0, acc0);
@@ -210,7 +205,7 @@ void arm_fir_fast_q15(
#endif
/* Read last state variables */
- x0 = *__SIMD32(px);
+ x0 = read_q15x2 (px);
/* Perform the multiply-accumulates */
acc1 = __SMLADX(x1, c0, acc1);
@@ -226,38 +221,33 @@ void arm_fir_fast_q15(
acc3 = __SMLADX(x1, c0, acc3);
}
- /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
- ** Then store the 4 outputs in the destination buffer. */
-
+ /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
+ Then store the 4 outputs in the destination buffer. */
#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pDst)++ =
- __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
-
- *__SIMD32(pDst)++ =
- __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
-
+ write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+ write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
#else
-
- *__SIMD32(pDst)++ =
- __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
-
- *__SIMD32(pDst)++ =
- __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
-
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+ write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Advance the state pointer by 4 to process the next group of 4 samples */
pState = pState + 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining output samples */
blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of taps */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* Copy two samples into state buffer */
@@ -274,35 +264,37 @@ void arm_fir_fast_q15(
do
{
-
- acc0 += (q31_t) * px++ * *pb++;
- acc0 += (q31_t) * px++ * *pb++;
+ acc0 += (q31_t) *px++ * *pb++;
+ acc0 += (q31_t) *px++ * *pb++;
tapCnt--;
}
while (tapCnt > 0U);
- /* The result is in 2.30 format. Convert to 1.15 with saturation.
- ** Then store the output in the destination buffer. */
+ /* The result is in 2.30 format. Convert to 1.15 with saturation.
+ Then store the output in the destination buffer. */
*pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
/* Advance state pointer by 1 for the next sample */
pState = pState + 1U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
/* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
- ** This prepares the state buffer for the next function call. */
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
pStateCurnt = S->pState;
- /* Calculation of count for copying integer writes */
- tapCnt = (numTaps - 1U) >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ /* Copy data */
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
@@ -310,24 +302,31 @@ void arm_fir_fast_q15(
*pStateCurnt++ = *pState++;
*pStateCurnt++ = *pState++;
+ /* Decrement loop counter */
tapCnt--;
-
}
- /* Calculation of count for remaining q15_t data */
+ /* Calculate remaining number of copies */
tapCnt = (numTaps - 1U) % 0x4U;
- /* copy remaining data */
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Copy remaining data */
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
}
/**
- * @} end of FIR group
+ @} end of FIR group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_fast_q31.c b/DSP/Source/FilteringFunctions/arm_fir_fast_q31.c
index d37e13c..513cb72 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_fast_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_fast_q31.c
@@ -3,13 +3,13 @@
* Title: arm_fir_fast_q31.c
* Description: Processing function for the Q31 Fast FIR filter
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,73 +29,75 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR
- * @{
+ @addtogroup FIR
+ @{
*/
/**
- * @param[in] *S points to an instance of the Q31 structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block output data.
- * @param[in] blockSize number of samples to process per call.
- * @return none.
- *
- * Scaling and Overflow Behavior:
- *
- * \par
- * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
- * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
- * These intermediate results are added to a 2.30 accumulator.
- * Finally, the accumulator is saturated and converted to a 1.31 result.
- * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
- * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
- *
- * \par
- * Refer to the function arm_fir_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision. Both the slow and the fast versions use the same instance structure.
- * Use the function arm_fir_init_q31() to initialize the filter structure.
+ @brief Processing function for the Q31 FIR filter (fast version).
+ @param[in] S points to an instance of the Q31 structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This function is optimized for speed at the expense of fixed-point precision and overflow protection.
+ The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
+ These intermediate results are added to a 2.30 accumulator.
+ Finally, the accumulator is saturated and converted to a 1.31 result.
+ The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
+ In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
+
+ @remark
+ Refer to \ref arm_fir_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision. Both the slow and the fast versions use the same instance structure.
+ Use function \ref arm_fir_init_q31() to initialize the filter structure.
*/
IAR_ONLY_LOW_OPTIMIZATION_ENTER
void arm_fir_fast_q31(
const arm_fir_instance_q31 * S,
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
{
- q31_t *pState = S->pState; /* State pointer */
- q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q31_t *pStateCurnt; /* Points to the current sample of the state */
- q31_t x0, x1, x2, x3; /* Temporary variables to hold state */
- q31_t c0; /* Temporary variable to hold coefficient value */
- q31_t *px; /* Temporary pointer for state */
- q31_t *pb; /* Temporary pointer for coefficient buffer */
- q31_t acc0, acc1, acc2, acc3; /* Accumulators */
- uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
- uint32_t i, tapCnt, blkCnt; /* Loop counters */
+ q31_t *pState = S->pState; /* State pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCurnt; /* Points to the current sample of the state */
+ q31_t *px; /* Temporary pointer for state buffer */
+ const q31_t *pb; /* Temporary pointer for coefficient buffer */
+ q31_t acc0; /* Accumulators */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt; /* Loop counters */
- /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc1, acc2, acc3; /* Accumulators */
+ q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
+#endif
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
/* pStateCurnt points to the location where the new input data should be written */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
- /* Apply loop unrolling and compute 4 output values simultaneously.
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 output values simultaneously.
* The variables acc0 ... acc3 hold output values that are being computed:
*
* acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
- * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
- * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
* acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
*/
- blkCnt = blockSize >> 2;
+ blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* Copy four new input samples into the state buffer */
+ /* Copy 4 new input samples into the state buffer. */
*pStateCurnt++ = *pSrc++;
*pStateCurnt++ = *pSrc++;
*pStateCurnt++ = *pSrc++;
@@ -113,17 +115,18 @@ void arm_fir_fast_q31(
/* Initialize coefficient pointer */
pb = pCoeffs;
- /* Read the first three samples from the state buffer:
+ /* Read the first 3 samples from the state buffer:
* x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
- x0 = *(px++);
- x1 = *(px++);
- x2 = *(px++);
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
- i = tapCnt;
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
- while (i > 0U)
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ Repeat until we've computed numTaps-4 coefficients. */
+ while (tapCnt > 0U)
{
/* Read the b[numTaps] coefficient */
c0 = *pb;
@@ -183,14 +186,14 @@ void arm_fir_fast_q31(
pb += 4U;
px += 4U;
- /* Decrement the loop counter */
- i--;
+ /* Decrement loop counter */
+ tapCnt--;
}
/* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4U;
- i = numTaps - (tapCnt * 4U);
- while (i > 0U)
+ while (tapCnt > 0U)
{
/* Read coefficients */
c0 = *(pb++);
@@ -209,28 +212,33 @@ void arm_fir_fast_q31(
x1 = x2;
x2 = x3;
- /* Decrement the loop counter */
- i--;
+ /* Decrement loop counter */
+ tapCnt--;
}
- /* Advance the state pointer by 4 to process the next group of 4 samples */
- pState = pState + 4;
-
- /* The results in the 4 accumulators are in 2.30 format. Convert to 1.31
- ** Then store the 4 outputs in the destination buffer. */
+ /* The results in the 4 accumulators are in 2.30 format. Convert to 1.31
+ Then store the 4 outputs in the destination buffer. */
*pDst++ = (q31_t) (acc0 << 1);
*pDst++ = (q31_t) (acc1 << 1);
*pDst++ = (q31_t) (acc2 << 1);
*pDst++ = (q31_t) (acc3 << 1);
- /* Decrement the samples loop counter */
+ /* Advance the state pointer by 4 to process the next group of 4 samples */
+ pState = pState + 4U;
+
+ /* Decrement loop counter */
blkCnt--;
}
+ /* Loop unrolling: Compute remaining output samples */
+ blkCnt = blockSize % 0x4U;
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 4U;
+#else
+
+ /* Initialize blkCnt with number of taps */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
@@ -244,39 +252,63 @@ void arm_fir_fast_q31(
px = pState;
/* Initialize Coefficient pointer */
- pb = (pCoeffs);
+ pb = pCoeffs;
i = numTaps;
/* Perform the multiply-accumulates */
do
{
- multAcc_32x32_keep32_R(acc0, (*px++), (*(pb++)));
+ multAcc_32x32_keep32_R(acc0, (*px++), (*pb++));
i--;
} while (i > 0U);
- /* The result is in 2.30 format. Convert to 1.31
- ** Then store the output in the destination buffer. */
+ /* The result is in 2.30 format. Convert to 1.31
+ Then store the output in the destination buffer. */
*pDst++ = (q31_t) (acc0 << 1);
/* Advance state pointer by 1 for the next sample */
- pState = pState + 1;
+ pState = pState + 1U;
- /* Decrement the samples loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
/* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the start of the state buffer.
- ** This prepares the state buffer for the next function call. */
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
pStateCurnt = S->pState;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
/* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of taps */
tapCnt = (numTaps - 1U);
- /* Copy the remaining q31_t data */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Copy remaining data */
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
@@ -285,9 +317,8 @@ void arm_fir_fast_q31(
tapCnt--;
}
-
}
IAR_ONLY_LOW_OPTIMIZATION_EXIT
/**
- * @} end of FIR group
+ @} end of FIR group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_init_f32.c b/DSP/Source/FilteringFunctions/arm_fir_init_f32.c
index 8bcb736..02e82ad 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_init_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_init_f32.c
@@ -3,13 +3,13 @@
* Title: arm_fir_init_f32.c
* Description: Floating-point FIR filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,41 +29,39 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR
- * @{
+ @addtogroup FIR
+ @{
*/
/**
- * @details
- *
- * @param[in,out] *S points to an instance of the floating-point FIR filter structure.
- * @param[in] numTaps Number of filter coefficients in the filter.
- * @param[in] *pCoeffs points to the filter coefficients buffer.
- * @param[in] *pState points to the state buffer.
- * @param[in] blockSize number of samples that are processed per call.
- * @return none.
- *
- * Description:
- * \par
- * pCoeffs points to the array of filter coefficients stored in time reversed order:
- *
- * \par
- * pState points to the array of state variables.
- * pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_f32().
+ @brief Initialization function for the floating-point FIR filter.
+ @param[in,out] S points to an instance of the floating-point FIR filter structure
+ @param[in] numTaps number of filter coefficients in the filter
+ @param[in] pCoeffs points to the filter coefficients buffer
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of samples processed per call
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ @par
+ pState points to the array of state variables.
+ pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_f32().
*/
void arm_fir_init_f32(
- arm_fir_instance_f32 * S,
- uint16_t numTaps,
- float32_t * pCoeffs,
- float32_t * pState,
- uint32_t blockSize)
+ arm_fir_instance_f32 * S,
+ uint16_t numTaps,
+ const float32_t * pCoeffs,
+ float32_t * pState,
+ uint32_t blockSize)
{
/* Assign filter taps */
S->numTaps = numTaps;
@@ -71,14 +69,13 @@ void arm_fir_init_f32(
/* Assign coefficient pointer */
S->pCoeffs = pCoeffs;
- /* Clear state buffer and the size of state buffer is (blockSize + numTaps - 1) */
+ /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float32_t));
/* Assign state pointer */
S->pState = pState;
-
}
/**
- * @} end of FIR group
+ @} end of FIR group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_init_q15.c b/DSP/Source/FilteringFunctions/arm_fir_init_q15.c
index e4d6ef8..a5b2d06 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_init_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_init_q15.c
@@ -3,13 +3,13 @@
* Title: arm_fir_init_q15.c
* Description: Q15 FIR filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,66 +29,63 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR
- * @{
+ @addtogroup FIR
+ @{
*/
/**
- * @param[in,out] *S points to an instance of the Q15 FIR filter structure.
- * @param[in] numTaps Number of filter coefficients in the filter. Must be even and greater than or equal to 4.
- * @param[in] *pCoeffs points to the filter coefficients buffer.
- * @param[in] *pState points to the state buffer.
- * @param[in] blockSize is number of samples processed per call.
- * @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if
- * numTaps is not greater than or equal to 4 and even.
- *
- * Description:
- * \par
- * pCoeffs points to the array of filter coefficients stored in time reversed order:
- *
- * Note that numTaps must be even and greater than or equal to 4.
- * To implement an odd length filter simply increase numTaps by 1 and set the last coefficient to zero.
- * For example, to implement a filter with numTaps=3 and coefficients
- *
- * {0.3, -0.8, 0.3}
- *
- * set numTaps=4 and use the coefficients:
- *
- * {0.3, -0.8, 0.3, 0}.
- *
- * Similarly, to implement a two point filter
- *
- * {0.3, -0.3}
- *
- * set numTaps=4 and use the coefficients:
- *
- * {0.3, -0.3, 0, 0}.
- *
- * \par
- * pState points to the array of state variables.
- * pState is of length numTaps+blockSize, when running on Cortex-M4 and Cortex-M3 and is of length numTaps+blockSize-1, when running on Cortex-M0 where blockSize is the number of input samples processed by each call to arm_fir_q15().
+ @brief Initialization function for the Q15 FIR filter.
+ @param[in,out] S points to an instance of the Q15 FIR filter structure.
+ @param[in] numTaps number of filter coefficients in the filter. Must be even and greater than or equal to 4.
+ @param[in] pCoeffs points to the filter coefficients buffer.
+ @param[in] pState points to the state buffer.
+ @param[in] blockSize number of samples processed per call.
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : numTaps is not greater than or equal to 4 and even
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ Note that numTaps must be even and greater than or equal to 4.
+ To implement an odd length filter simply increase numTaps by 1 and set the last coefficient to zero.
+ For example, to implement a filter with numTaps=3 and coefficients
+
+ {0.3, -0.8, 0.3}
+
+ set numTaps=4 and use the coefficients:
+
+ {0.3, -0.8, 0.3, 0}.
+
+ Similarly, to implement a two point filter
+
+ {0.3, -0.3}
+
+ set numTaps=4 and use the coefficients:
+
+ {0.3, -0.3, 0, 0}.
+
+ pState points to the array of state variables.
+ pState is of length numTaps+blockSize, when running on Cortex-M4 and Cortex-M3 and is of length numTaps+blockSize-1, when running on Cortex-M0 where blockSize is the number of input samples processed by each call to arm_fir_q15().
*/
arm_status arm_fir_init_q15(
- arm_fir_instance_q15 * S,
- uint16_t numTaps,
- q15_t * pCoeffs,
- q15_t * pState,
- uint32_t blockSize)
+ arm_fir_instance_q15 * S,
+ uint16_t numTaps,
+ const q15_t * pCoeffs,
+ q15_t * pState,
+ uint32_t blockSize)
{
arm_status status;
-
#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
/* The Number of filter coefficients in the filter must be even and at least 4 */
if (numTaps & 0x1U)
{
@@ -115,15 +112,13 @@ arm_status arm_fir_init_q15(
#else
- /* Run the below code for Cortex-M0 */
-
/* Assign filter taps */
S->numTaps = numTaps;
/* Assign coefficient pointer */
S->pCoeffs = pCoeffs;
- /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */
+ /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q15_t));
/* Assign state pointer */
@@ -133,10 +128,10 @@ arm_status arm_fir_init_q15(
return (status);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_DSP) */
}
/**
- * @} end of FIR group
+ @} end of FIR group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_init_q31.c b/DSP/Source/FilteringFunctions/arm_fir_init_q31.c
index 3308438..7d8376f 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_init_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_init_q31.c
@@ -3,13 +3,13 @@
* Title: arm_fir_init_q31.c
* Description: Q31 FIR filter initialization function.
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,41 +29,38 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR
- * @{
+ @addtogroup FIR
+ @{
*/
/**
- * @details
- *
- * @param[in,out] *S points to an instance of the Q31 FIR filter structure.
- * @param[in] numTaps Number of filter coefficients in the filter.
- * @param[in] *pCoeffs points to the filter coefficients buffer.
- * @param[in] *pState points to the state buffer.
- * @param[in] blockSize number of samples that are processed per call.
- * @return none.
- *
- * Description:
- * \par
- * pCoeffs points to the array of filter coefficients stored in time reversed order:
- *
- * \par
- * pState points to the array of state variables.
- * pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_q31().
+ @brief Initialization function for the Q31 FIR filter.
+ @param[in,out] S points to an instance of the Q31 FIR filter structure
+ @param[in] numTaps number of filter coefficients in the filter
+ @param[in] pCoeffs points to the filter coefficients buffer
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of samples processed
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
- * \par
- * pState points to the array of state variables.
- * pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_q7().
+ @brief Initialization function for the Q7 FIR filter.
+ @param[in,out] S points to an instance of the Q7 FIR filter structure
+ @param[in] numTaps number of filter coefficients in the filter
+ @param[in] pCoeffs points to the filter coefficients buffer
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of samples processed
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ @par
+ pState points to the array of state variables.
+ pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_q7().
*/
void arm_fir_init_q7(
- arm_fir_instance_q7 * S,
- uint16_t numTaps,
- q7_t * pCoeffs,
- q7_t * pState,
- uint32_t blockSize)
+ arm_fir_instance_q7 * S,
+ uint16_t numTaps,
+ const q7_t * pCoeffs,
+ q7_t * pState,
+ uint32_t blockSize)
{
-
/* Assign filter taps */
S->numTaps = numTaps;
/* Assign coefficient pointer */
S->pCoeffs = pCoeffs;
- /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */
+ /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q7_t));
/* Assign state pointer */
S->pState = pState;
-
}
/**
- * @} end of FIR group
+ @} end of FIR group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_interpolate_f32.c b/DSP/Source/FilteringFunctions/arm_fir_interpolate_f32.c
index 66cfcf8..ee0ed27 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_interpolate_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_interpolate_f32.c
@@ -3,13 +3,13 @@
* Title: arm_fir_interpolate_f32.c
* Description: Floating-point FIR interpolation sequences
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,120 +29,117 @@
#include "arm_math.h"
/**
- * @defgroup FIR_Interpolate Finite Impulse Response (FIR) Interpolator
- *
- * These functions combine an upsampler (zero stuffer) and an FIR filter.
- * They are used in multirate systems for increasing the sample rate of a signal without introducing high frequency images.
- * Conceptually, the functions are equivalent to the block diagram below:
- * \image html FIRInterpolator.gif "Components included in the FIR Interpolator functions"
- * After upsampling by a factor of L, the signal should be filtered by a lowpass filter with a normalized
- * cutoff frequency of 1/L in order to eliminate high frequency copies of the spectrum.
- * The user of the function is responsible for providing the filter coefficients.
- *
- * The FIR interpolator functions provided in the CMSIS DSP Library combine the upsampler and FIR filter in an efficient manner.
- * The upsampler inserts L-1 zeros between each sample.
- * Instead of multiplying by these zero values, the FIR filter is designed to skip them.
- * This leads to an efficient implementation without any wasted effort.
- * The functions operate on blocks of input and output data.
- * pSrc points to an array of blockSize input values and
- * pDst points to an array of blockSize*L output values.
- *
- * The library provides separate functions for Q15, Q31, and floating-point data types.
- *
- * \par Algorithm:
- * The functions use a polyphase filter structure:
- *
- * This approach is more efficient than straightforward upsample-then-filter algorithms.
- * With this method the computation is reduced by a factor of 1/L when compared to using a standard FIR filter.
- * \par
- * pCoeffs points to a coefficient array of size numTaps.
- * numTaps must be a multiple of the interpolation factor L and this is checked by the
- * initialization functions.
- * Internally, the function divides the FIR filter's impulse response into shorter filters of length
- * phaseLength=numTaps/L.
- * Coefficients are stored in time reversed order.
- * \par
- *
- * The state variables are updated after each block of data is processed, the coefficients are untouched.
- *
- * \par Instance Structure
- * The coefficients and state variables for a filter are stored together in an instance data structure.
- * A separate instance structure must be defined for each filter.
- * Coefficient arrays may be shared among several instances while state variable array should be allocated separately.
- * There are separate instance structure declarations for each of the 3 supported data types.
- *
- * \par Initialization Functions
- * There is also an associated initialization function for each data type.
- * The initialization function performs the following operations:
- * - Sets the values of the internal structure fields.
- * - Zeros out the values in the state buffer.
- * - Checks to make sure that the length of the filter is a multiple of the interpolation factor.
- * To do this manually without calling the init function, assign the follow subfields of the instance structure:
- * L (interpolation factor), pCoeffs, phaseLength (numTaps / L), pState. Also set all of the values in pState to zero.
- *
- * \par
- * Use of the initialization function is optional.
- * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
- * To place an instance structure into a const data section, the instance structure must be manually initialized.
- * The code below statically initializes each of the 3 different data type filter instance structures
- *
- * arm_fir_interpolate_instance_f32 S = {L, phaseLength, pCoeffs, pState};
- * arm_fir_interpolate_instance_q31 S = {L, phaseLength, pCoeffs, pState};
- * arm_fir_interpolate_instance_q15 S = {L, phaseLength, pCoeffs, pState};
- *
- * where L is the interpolation factor; phaseLength=numTaps/L is the
- * length of each of the shorter FIR filters used internally,
- * pCoeffs is the address of the coefficient buffer;
- * pState is the address of the state buffer.
- * Be sure to set the values in the state buffer to zeros when doing static initialization.
- *
- * \par Fixed-Point Behavior
- * Care must be taken when using the fixed-point versions of the FIR interpolate filter functions.
- * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
- * Refer to the function specific documentation below for usage guidelines.
+ @defgroup FIR_Interpolate Finite Impulse Response (FIR) Interpolator
+
+ These functions combine an upsampler (zero stuffer) and an FIR filter.
+ They are used in multirate systems for increasing the sample rate of a signal without introducing high frequency images.
+ Conceptually, the functions are equivalent to the block diagram below:
+ \image html FIRInterpolator.gif "Components included in the FIR Interpolator functions"
+ After upsampling by a factor of L, the signal should be filtered by a lowpass filter with a normalized
+ cutoff frequency of 1/L in order to eliminate high frequency copies of the spectrum.
+ The user of the function is responsible for providing the filter coefficients.
+
+ The FIR interpolator functions provided in the CMSIS DSP Library combine the upsampler and FIR filter in an efficient manner.
+ The upsampler inserts L-1 zeros between each sample.
+ Instead of multiplying by these zero values, the FIR filter is designed to skip them.
+ This leads to an efficient implementation without any wasted effort.
+ The functions operate on blocks of input and output data.
+ pSrc points to an array of blockSize input values and
+ pDst points to an array of blockSize*L output values.
+
+ The library provides separate functions for Q15, Q31, and floating-point data types.
+
+ @par Algorithm
+ The functions use a polyphase filter structure:
+
+ This approach is more efficient than straightforward upsample-then-filter algorithms.
+ With this method the computation is reduced by a factor of 1/L when compared to using a standard FIR filter.
+ @par
+ pCoeffs points to a coefficient array of size numTaps.
+ numTaps must be a multiple of the interpolation factor L and this is checked by the
+ initialization functions.
+ Internally, the function divides the FIR filter's impulse response into shorter filters of length
+ phaseLength=numTaps/L.
+ Coefficients are stored in time reversed order.
+
+ @par
+ The state variables are updated after each block of data is processed, the coefficients are untouched.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter.
+ Coefficient arrays may be shared among several instances while state variable array should be allocated separately.
+ There are separate instance structure declarations for each of the 3 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ - Checks to make sure that the length of the filter is a multiple of the interpolation factor.
+ To do this manually without calling the init function, assign the follow subfields of the instance structure:
+ L (interpolation factor), pCoeffs, phaseLength (numTaps / L), pState. Also set all of the values in pState to zero.
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ The code below statically initializes each of the 3 different data type filter instance structures
+
+ arm_fir_interpolate_instance_f32 S = {L, phaseLength, pCoeffs, pState};
+ arm_fir_interpolate_instance_q31 S = {L, phaseLength, pCoeffs, pState};
+ arm_fir_interpolate_instance_q15 S = {L, phaseLength, pCoeffs, pState};
+
+ @par
+ where L is the interpolation factor; phaseLength=numTaps/L is the
+ length of each of the shorter FIR filters used internally,
+ pCoeffs is the address of the coefficient buffer;
+ pState is the address of the state buffer.
+ Be sure to set the values in the state buffer to zeros when doing static initialization.
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the fixed-point versions of the FIR interpolate filter functions.
+ In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ Refer to the function specific documentation below for usage guidelines.
*/
/**
- * @addtogroup FIR_Interpolate
- * @{
+ @addtogroup FIR_Interpolate
+ @{
*/
/**
- * @brief Processing function for the floating-point FIR interpolator.
- * @param[in] *S points to an instance of the floating-point FIR interpolator structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in] blockSize number of input samples to process per call.
- * @return none.
+ @brief Processing function for floating-point FIR interpolator.
+ @param[in] S points to an instance of the floating-point FIR interpolator structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
*/
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
+#if defined(ARM_MATH_NEON)
void arm_fir_interpolate_f32(
const arm_fir_interpolate_instance_f32 * S,
- float32_t * pSrc,
+ const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
float32_t *pState = S->pState; /* State pointer */
- float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float32_t *pStateCurnt; /* Points to the current sample of the state */
- float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
+ float32_t *ptr1; /* Temporary pointers for state buffer */
+ const float32_t *ptr2; /* Temporary pointers for coefficient buffer */
float32_t sum0; /* Accumulators */
float32_t x0, c0; /* Temporary variables to hold state and coefficient values */
uint32_t i, blkCnt, j; /* Loop counters */
@@ -152,22 +149,382 @@ void arm_fir_interpolate_f32(
uint32_t blkCntN4;
float32_t c1, c2, c3;
+ float32x4_t sum0v;
+ float32x4_t accV,accV0,accV1;
+ float32x4_t x0v,x1v,x2v,xa,xb;
+ uint32x4_t x0v_u,x1v_u,x2v_u,xa_u,xb_u;
+ float32x2_t tempV;
+
/* S->pState buffer contains previous frame (phaseLen - 1) samples */
/* pStateCurnt points to the location where the new input data should be written */
pStateCurnt = S->pState + (phaseLen - 1U);
/* Initialise blkCnt */
- blkCnt = blockSize / 4;
- blkCntN4 = blockSize - (4 * blkCnt);
+ blkCnt = blockSize >> 3;
+ blkCntN4 = blockSize & 7;
- /* Samples loop unrolled by 4 */
+ /* Loop unrolling */
while (blkCnt > 0U)
+ {
+ /* Copy new input samples into the state buffer */
+ sum0v = vld1q_f32(pSrc);
+ vst1q_f32(pStateCurnt,sum0v);
+ pSrc += 4;
+ pStateCurnt += 4;
+
+ sum0v = vld1q_f32(pSrc);
+ vst1q_f32(pStateCurnt,sum0v);
+ pSrc += 4;
+ pStateCurnt += 4;
+
+ /* Address modifier index of coefficient buffer */
+ j = 1U;
+
+ /* Loop over the Interpolation factor. */
+ i = (S->L);
+
+ while (i > 0U)
+ {
+ /* Set accumulator to zero */
+ accV0 = vdupq_n_f32(0.0);
+ accV1 = vdupq_n_f32(0.0);
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (S->L - j);
+
+ /* Loop over the polyPhase length. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ tapCnt = phaseLen >> 2U;
+
+ x0v = vld1q_f32(ptr1);
+ x1v = vld1q_f32(ptr1 + 4);
+
+ while (tapCnt > 0U)
+ {
+ /* Read the input samples */
+ x2v = vld1q_f32(ptr1 + 8);
+
+ /* Read the coefficients */
+ c0 = *(ptr2);
+
+ /* Perform the multiply-accumulate */
+ accV0 = vmlaq_n_f32(accV0,x0v,c0);
+ accV1 = vmlaq_n_f32(accV1,x1v,c0);
+
+ /* Read the coefficients, inputs and perform multiply-accumulate */
+ c1 = *(ptr2 + S->L);
+
+ xa = vextq_f32(x0v,x1v,1);
+ xb = vextq_f32(x1v,x2v,1);
+
+ accV0 = vmlaq_n_f32(accV0,xa,c1);
+ accV1 = vmlaq_n_f32(accV1,xb,c1);
+
+ /* Read the coefficients, inputs and perform multiply-accumulate */
+ c2 = *(ptr2 + S->L * 2);
+
+ xa = vextq_f32(x0v,x1v,2);
+ xb = vextq_f32(x1v,x2v,2);
+
+ accV0 = vmlaq_n_f32(accV0,xa,c2);
+ accV1 = vmlaq_n_f32(accV1,xb,c2);
+
+ /* Read the coefficients, inputs and perform multiply-accumulate */
+ c3 = *(ptr2 + S->L * 3);
+
+ xa = vextq_f32(x0v,x1v,3);
+ xb = vextq_f32(x1v,x2v,3);
+
+ accV0 = vmlaq_n_f32(accV0,xa,c3);
+ accV1 = vmlaq_n_f32(accV1,xb,c3);
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += 4 * S->L;
+ ptr1 += 4;
+ x0v = x1v;
+ x1v = x2v;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = phaseLen % 0x4U;
+
+ x2v = vld1q_f32(ptr1 + 8);
+
+ switch (tapCnt)
+ {
+ case 3:
+ c0 = *(ptr2);
+ accV0 = vmlaq_n_f32(accV0,x0v,c0);
+ accV1 = vmlaq_n_f32(accV1,x1v,c0);
+ ptr2 += S->L;
+
+ c0 = *(ptr2);
+
+ xa = vextq_f32(x0v,x1v,1);
+ xb = vextq_f32(x1v,x2v,1);
+
+ accV0 = vmlaq_n_f32(accV0,xa,c0);
+ accV1 = vmlaq_n_f32(accV1,xb,c0);
+ ptr2 += S->L;
+
+ c0 = *(ptr2);
+
+ xa = vextq_f32(x0v,x1v,2);
+ xb = vextq_f32(x1v,x2v,2);
+
+ accV0 = vmlaq_n_f32(accV0,xa,c0);
+ accV1 = vmlaq_n_f32(accV1,xb,c0);
+ ptr2 += S->L;
+
+ break;
+
+ case 2:
+ c0 = *(ptr2);
+ accV0 = vmlaq_n_f32(accV0,x0v,c0);
+ accV1 = vmlaq_n_f32(accV1,x1v,c0);
+ ptr2 += S->L;
+
+ c0 = *(ptr2);
+
+ xa = vextq_f32(x0v,x1v,1);
+ xb = vextq_f32(x1v,x2v,1);
+
+ accV0 = vmlaq_n_f32(accV0,xa,c0);
+ accV1 = vmlaq_n_f32(accV1,xb,c0);
+ ptr2 += S->L;
+
+ break;
+
+ case 1:
+ c0 = *(ptr2);
+ accV0 = vmlaq_n_f32(accV0,x0v,c0);
+ accV1 = vmlaq_n_f32(accV1,x1v,c0);
+ ptr2 += S->L;
+
+ break;
+
+ default:
+ break;
+
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst = accV0[0];
+ *(pDst + S->L) = accV0[1];
+ *(pDst + 2 * S->L) = accV0[2];
+ *(pDst + 3 * S->L) = accV0[3];
+
+ *(pDst + 4 * S->L) = accV1[0];
+ *(pDst + 5 * S->L) = accV1[1];
+ *(pDst + 6 * S->L) = accV1[2];
+ *(pDst + 7 * S->L) = accV1[3];
+
+ pDst++;
+
+ /* Increment the address modifier index of coefficient buffer */
+ j++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 8;
+
+ pDst += S->L * 7;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+
+ while (blkCntN4 > 0U)
{
/* Copy new input sample into the state buffer */
*pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
+
+ /* Address modifier index of coefficient buffer */
+ j = 1U;
+
+ /* Loop over the Interpolation factor. */
+ i = S->L;
+
+ while (i > 0U)
+ {
+ /* Set accumulator to zero */
+ sum0v = vdupq_n_f32(0.0);
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (S->L - j);
+
+ /* Loop over the polyPhase length. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ tapCnt = phaseLen >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read the coefficient */
+ x1v[0] = *(ptr2);
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0v = vld1q_f32(ptr1);
+ ptr1 += 4;
+
+ /* Read the coefficient */
+ x1v[1] = *(ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the coefficient */
+ x1v[2] = *(ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the coefficient */
+ x1v[3] = *(ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ sum0v = vmlaq_f32(sum0v,x0v,x1v);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tempV = vpadd_f32(vget_low_f32(sum0v),vget_high_f32(sum0v));
+ sum0 = tempV[0] + tempV[1];
+
+ /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = phaseLen % 0x4U;
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum0 += *(ptr1++) * (*ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = sum0;
+
+ /* Increment the address modifier index of coefficient buffer */
+ j++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCntN4--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ tapCnt = (phaseLen - 1U) >> 2U;
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ sum0v = vld1q_f32(pState);
+ vst1q_f32(pStateCurnt,sum0v);
+ pState += 4;
+ pStateCurnt += 4;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (phaseLen - 1U) % 0x04U;
+
+ /* copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+}
+#else
+
+void arm_fir_interpolate_f32(
+ const arm_fir_interpolate_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
+
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCur; /* Points to the current sample of the state */
+ float32_t *ptr1; /* Temporary pointer for state buffer */
+ const float32_t *ptr2; /* Temporary pointer for coefficient buffer */
+ float32_t sum0; /* Accumulators */
+ uint32_t i, blkCnt, tapCnt; /* Loop counters */
+ uint32_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
+ uint32_t j;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ float32_t acc0, acc1, acc2, acc3;
+ float32_t x0, x1, x2, x3;
+ float32_t c0, c1, c2, c3;
+#endif
+
+ /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (phaseLen - 1U);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCur++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
/* Address modifier index of coefficient buffer */
j = 1U;
@@ -190,7 +547,7 @@ void arm_fir_interpolate_f32(
ptr2 = pCoeffs + (S->L - j);
/* Loop over the polyPhase length. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ Repeat until we've computed numTaps-(4*S->L) coefficients. */
tapCnt = phaseLen >> 2U;
x0 = *(ptr1++);
@@ -199,7 +556,6 @@ void arm_fir_interpolate_f32(
while (tapCnt > 0U)
{
-
/* Read the input sample */
x3 = *(ptr1++);
@@ -254,7 +610,7 @@ void arm_fir_interpolate_f32(
* Increment the coefficient pointer by interpolation factor times. */
ptr2 += 4 * S->L;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -263,7 +619,6 @@ void arm_fir_interpolate_f32(
while (tapCnt > 0U)
{
-
/* Read the input sample */
x3 = *(ptr1++);
@@ -284,13 +639,13 @@ void arm_fir_interpolate_f32(
x1 = x2;
x2 = x3;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
/* The result is in the accumulator, store in the destination buffer. */
- *pDst = acc0;
- *(pDst + S->L) = acc1;
+ *(pDst ) = acc0;
+ *(pDst + S->L) = acc1;
*(pDst + 2 * S->L) = acc2;
*(pDst + 3 * S->L) = acc3;
@@ -299,7 +654,7 @@ void arm_fir_interpolate_f32(
/* Increment the address modifier index of coefficient buffer */
j++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
i--;
}
@@ -309,23 +664,31 @@ void arm_fir_interpolate_f32(
pDst += S->L * 3;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
- while (blkCntN4 > 0U)
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
{
/* Copy new input sample into the state buffer */
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
/* Address modifier index of coefficient buffer */
j = 1U;
/* Loop over the Interpolation factor. */
i = S->L;
+
while (i > 0U)
{
/* Set accumulator to zero */
@@ -337,78 +700,58 @@ void arm_fir_interpolate_f32(
/* Initialize coefficient pointer */
ptr2 = pCoeffs + (S->L - j);
- /* Loop over the polyPhase length. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ /* Loop over the polyPhase length.
+ Repeat until we've computed numTaps-(4*S->L) coefficients. */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
tapCnt = phaseLen >> 2U;
+
while (tapCnt > 0U)
{
-
- /* Read the coefficient */
- c0 = *(ptr2);
+ /* Perform the multiply-accumulate */
+ sum0 += *ptr1++ * *ptr2;
/* Upsampling is done by stuffing L-1 zeros between each sample.
* So instead of multiplying zeros with coefficients,
* Increment the coefficient pointer by interpolation factor times. */
ptr2 += S->L;
- /* Read the input sample */
- x0 = *(ptr1++);
-
- /* Perform the multiply-accumulate */
- sum0 += x0 * c0;
-
- /* Read the coefficient */
- c0 = *(ptr2);
-
- /* Increment the coefficient pointer by interpolation factor times. */
+ sum0 += *ptr1++ * *ptr2;
ptr2 += S->L;
- /* Read the input sample */
- x0 = *(ptr1++);
-
- /* Perform the multiply-accumulate */
- sum0 += x0 * c0;
-
- /* Read the coefficient */
- c0 = *(ptr2);
-
- /* Increment the coefficient pointer by interpolation factor times. */
+ sum0 += *ptr1++ * *ptr2;
ptr2 += S->L;
- /* Read the input sample */
- x0 = *(ptr1++);
-
- /* Perform the multiply-accumulate */
- sum0 += x0 * c0;
-
- /* Read the coefficient */
- c0 = *(ptr2);
-
- /* Increment the coefficient pointer by interpolation factor times. */
+ sum0 += *ptr1++ * *ptr2;
ptr2 += S->L;
- /* Read the input sample */
- x0 = *(ptr1++);
-
- /* Perform the multiply-accumulate */
- sum0 += x0 * c0;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining outputs */
tapCnt = phaseLen % 0x4U;
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = phaseLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
- sum0 += *(ptr1++) * (*ptr2);
+ sum0 += *ptr1++ * *ptr2;
- /* Increment the coefficient pointer by interpolation factor times. */
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
ptr2 += S->L;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -427,66 +770,67 @@ void arm_fir_interpolate_f32(
pState = pState + 1;
/* Decrement the loop counter */
- blkCntN4--;
+ blkCnt--;
}
/* Processing is complete.
- ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
- ** This prepares the state buffer for the next function call. */
+ Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
- pStateCurnt = S->pState;
+ pStateCur = S->pState;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
tapCnt = (phaseLen - 1U) >> 2U;
/* copy data */
while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
+ /* Loop unrolling: Compute remaining outputs */
tapCnt = (phaseLen - 1U) % 0x04U;
- /* copy data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-}
-
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize tapCnt with number of samples */
+ tapCnt = (phaseLen - 1U);
-void arm_fir_interpolate_f32(
- const arm_fir_interpolate_instance_f32 * S,
- float32_t * pSrc,
- float32_t * pDst,
- uint32_t blockSize)
-{
- float32_t *pState = S->pState; /* State pointer */
- float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- float32_t *pStateCurnt; /* Points to the current sample of the state */
- float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
- float32_t sum; /* Accumulator */
- uint32_t i, blkCnt; /* Loop counters */
- uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+#else
+/* alternate version for CM0_FAMILY */
+
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCur; /* Points to the current sample of the state */
+ float32_t *ptr1; /* Temporary pointer for state buffer */
+ const float32_t *ptr2; /* Temporary pointer for coefficient buffer */
+ float32_t sum0; /* Accumulators */
+ uint32_t i, blkCnt, tapCnt; /* Loop counters */
+ uint32_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
/* S->pState buffer contains previous frame (phaseLen - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = S->pState + (phaseLen - 1U);
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (phaseLen - 1U);
/* Total number of intput samples */
blkCnt = blockSize;
@@ -495,7 +839,7 @@ void arm_fir_interpolate_f32(
while (blkCnt > 0U)
{
/* Copy new input sample into the state buffer */
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
/* Loop over the Interpolation factor. */
i = S->L;
@@ -503,7 +847,7 @@ void arm_fir_interpolate_f32(
while (i > 0U)
{
/* Set accumulator to zero */
- sum = 0.0f;
+ sum0 = 0.0f;
/* Initialize state pointer */
ptr1 = pState;
@@ -517,7 +861,7 @@ void arm_fir_interpolate_f32(
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
- sum += *ptr1++ * *ptr2;
+ sum0 += *ptr1++ * *ptr2;
/* Increment the coefficient pointer by interpolation factor times. */
ptr2 += S->L;
@@ -527,9 +871,9 @@ void arm_fir_interpolate_f32(
}
/* The result is in the accumulator, store in the destination buffer. */
- *pDst++ = sum;
+ *pDst++ = sum0;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
i--;
}
@@ -537,7 +881,7 @@ void arm_fir_interpolate_f32(
* to process the next group of interpolation factor number samples */
pState = pState + 1;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -546,24 +890,25 @@ void arm_fir_interpolate_f32(
** This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
- pStateCurnt = S->pState;
+ pStateCur = S->pState;
tapCnt = phaseLen - 1U;
+ /* Copy data */
while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
}
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined(ARM_MATH_NEON) */
-
-
- /**
- * @} end of FIR_Interpolate group
- */
+/**
+ @} end of FIR_Interpolate group
+ */
diff --git a/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c b/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c
index 05fc370..287d347 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c
@@ -3,13 +3,13 @@
* Title: arm_fir_interpolate_init_f32.c
* Description: Floating-point FIR interpolator initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,45 +29,46 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_Interpolate
- * @{
+ @addtogroup FIR_Interpolate
+ @{
*/
/**
- * @brief Initialization function for the floating-point FIR interpolator.
- * @param[in,out] *S points to an instance of the floating-point FIR interpolator structure.
- * @param[in] L upsample factor.
- * @param[in] numTaps number of filter coefficients in the filter.
- * @param[in] *pCoeffs points to the filter coefficient buffer.
- * @param[in] *pState points to the state buffer.
- * @param[in] blockSize number of input samples to process per call.
- * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if
- * the filter length numTaps is not a multiple of the interpolation factor L.
- *
- * Description:
- * \par
- * pCoeffs points to the array of filter coefficients stored in time reversed order:
- *
- * The length of the filter numTaps must be a multiple of the interpolation factor L.
- * \par
- * pState points to the array of state variables.
- * pState is of length (numTaps/L)+blockSize-1 words
- * where blockSize is the number of input samples processed by each call to arm_fir_interpolate_f32().
+ @brief Initialization function for the floating-point FIR interpolator.
+ @param[in,out] S points to an instance of the floating-point FIR interpolator structure
+ @param[in] L upsample factor
+ @param[in] numTaps number of filter coefficients in the filter
+ @param[in] pCoeffs points to the filter coefficient buffer
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of input samples to process per call
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : filter length numTaps is not a multiple of the interpolation factor L
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ @par
+ The length of the filter numTaps must be a multiple of the interpolation factor L.
+ @par
+ pState points to the array of state variables.
+ pState is of length (numTaps/L)+blockSize-1 words
+ where blockSize is the number of input samples processed by each call to arm_fir_interpolate_f32().
*/
arm_status arm_fir_interpolate_init_f32(
- arm_fir_interpolate_instance_f32 * S,
- uint8_t L,
- uint16_t numTaps,
- float32_t * pCoeffs,
- float32_t * pState,
- uint32_t blockSize)
+ arm_fir_interpolate_instance_f32 * S,
+ uint8_t L,
+ uint16_t numTaps,
+ const float32_t * pCoeffs,
+ float32_t * pState,
+ uint32_t blockSize)
{
arm_status status;
@@ -79,7 +80,6 @@ arm_status arm_fir_interpolate_init_f32(
}
else
{
-
/* Assign coefficient pointer */
S->pCoeffs = pCoeffs;
@@ -89,10 +89,8 @@ arm_status arm_fir_interpolate_init_f32(
/* Assign polyPhaseLength */
S->phaseLength = numTaps / L;
- /* Clear state buffer and size of state array is always phaseLength + blockSize - 1 */
- memset(pState, 0,
- (blockSize +
- ((uint32_t) S->phaseLength - 1U)) * sizeof(float32_t));
+ /* Clear state buffer and size of buffer is always phaseLength + blockSize - 1 */
+ memset(pState, 0, (blockSize + ((uint32_t) S->phaseLength - 1U)) * sizeof(float32_t));
/* Assign state pointer */
S->pState = pState;
@@ -101,9 +99,8 @@ arm_status arm_fir_interpolate_init_f32(
}
return (status);
-
}
- /**
- * @} end of FIR_Interpolate group
- */
+/**
+ @} end of FIR_Interpolate group
+ */
diff --git a/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c b/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c
index 3b3fb79..7f43bbf 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c
@@ -3,13 +3,13 @@
* Title: arm_fir_interpolate_init_q15.c
* Description: Q15 FIR interpolator initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,45 +29,46 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_Interpolate
- * @{
+ @addtogroup FIR_Interpolate
+ @{
*/
/**
- * @brief Initialization function for the Q15 FIR interpolator.
- * @param[in,out] *S points to an instance of the Q15 FIR interpolator structure.
- * @param[in] L upsample factor.
- * @param[in] numTaps number of filter coefficients in the filter.
- * @param[in] *pCoeffs points to the filter coefficient buffer.
- * @param[in] *pState points to the state buffer.
- * @param[in] blockSize number of input samples to process per call.
- * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if
- * the filter length numTaps is not a multiple of the interpolation factor L.
- *
- * Description:
- * \par
- * pCoeffs points to the array of filter coefficients stored in time reversed order:
- *
- * The length of the filter numTaps must be a multiple of the interpolation factor L.
- * \par
- * pState points to the array of state variables.
- * pState is of length (numTaps/L)+blockSize-1 words
- * where blockSize is the number of input samples processed by each call to arm_fir_interpolate_q15().
+ @brief Initialization function for the Q15 FIR interpolator.
+ @param[in,out] S points to an instance of the Q15 FIR interpolator structure
+ @param[in] L upsample factor
+ @param[in] numTaps number of filter coefficients in the filter
+ @param[in] pCoeffs points to the filter coefficient buffer
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of input samples to process per call
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : filter length numTaps is not a multiple of the interpolation factor L
+
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ The length of the filter numTaps must be a multiple of the interpolation factor L.
+ @par
+ pState points to the array of state variables.
+ pState is of length (numTaps/L)+blockSize-1 words
+ where blockSize is the number of input samples processed by each call to arm_fir_interpolate_q15().
*/
arm_status arm_fir_interpolate_init_q15(
- arm_fir_interpolate_instance_q15 * S,
- uint8_t L,
- uint16_t numTaps,
- q15_t * pCoeffs,
- q15_t * pState,
- uint32_t blockSize)
+ arm_fir_interpolate_instance_q15 * S,
+ uint8_t L,
+ uint16_t numTaps,
+ const q15_t * pCoeffs,
+ q15_t * pState,
+ uint32_t blockSize)
{
arm_status status;
@@ -79,7 +80,6 @@ arm_status arm_fir_interpolate_init_q15(
}
else
{
-
/* Assign coefficient pointer */
S->pCoeffs = pCoeffs;
@@ -90,8 +90,7 @@ arm_status arm_fir_interpolate_init_q15(
S->phaseLength = numTaps / L;
/* Clear state buffer and size of buffer is always phaseLength + blockSize - 1 */
- memset(pState, 0,
- (blockSize + ((uint32_t) S->phaseLength - 1U)) * sizeof(q15_t));
+ memset(pState, 0, (blockSize + ((uint32_t) S->phaseLength - 1U)) * sizeof(q15_t));
/* Assign state pointer */
S->pState = pState;
@@ -100,9 +99,8 @@ arm_status arm_fir_interpolate_init_q15(
}
return (status);
-
}
- /**
- * @} end of FIR_Interpolate group
- */
+/**
+ @} end of FIR_Interpolate group
+ */
diff --git a/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c b/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c
index 03959c0..973e715 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c
@@ -3,13 +3,13 @@
* Title: arm_fir_interpolate_init_q31.c
* Description: Q31 FIR interpolator initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,46 +29,45 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_Interpolate
- * @{
+ @addtogroup FIR_Interpolate
+ @{
*/
-
/**
- * @brief Initialization function for the Q31 FIR interpolator.
- * @param[in,out] *S points to an instance of the Q31 FIR interpolator structure.
- * @param[in] L upsample factor.
- * @param[in] numTaps number of filter coefficients in the filter.
- * @param[in] *pCoeffs points to the filter coefficient buffer.
- * @param[in] *pState points to the state buffer.
- * @param[in] blockSize number of input samples to process per call.
- * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if
- * the filter length numTaps is not a multiple of the interpolation factor L.
- *
- * Description:
- * \par
- * pCoeffs points to the array of filter coefficients stored in time reversed order:
- *
- * The length of the filter numTaps must be a multiple of the interpolation factor L.
- * \par
- * pState points to the array of state variables.
- * pState is of length (numTaps/L)+blockSize-1 words
- * where blockSize is the number of input samples processed by each call to arm_fir_interpolate_q31().
+ @brief Initialization function for the Q31 FIR interpolator.
+ @param[in,out] S points to an instance of the Q31 FIR interpolator structure
+ @param[in] L upsample factor
+ @param[in] numTaps number of filter coefficients in the filter
+ @param[in] pCoeffs points to the filter coefficient buffer
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of input samples to process per call
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : filter length numTaps is not a multiple of the interpolation factor L
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ The length of the filter numTaps must be a multiple of the interpolation factor L.
+ @par
+ pState points to the array of state variables.
+ pState is of length (numTaps/L)+blockSize-1 words
+ where blockSize is the number of input samples processed by each call to arm_fir_interpolate_q31().
*/
arm_status arm_fir_interpolate_init_q31(
- arm_fir_interpolate_instance_q31 * S,
- uint8_t L,
- uint16_t numTaps,
- q31_t * pCoeffs,
- q31_t * pState,
- uint32_t blockSize)
+ arm_fir_interpolate_instance_q31 * S,
+ uint8_t L,
+ uint16_t numTaps,
+ const q31_t * pCoeffs,
+ q31_t * pState,
+ uint32_t blockSize)
{
arm_status status;
@@ -80,7 +79,6 @@ arm_status arm_fir_interpolate_init_q31(
}
else
{
-
/* Assign coefficient pointer */
S->pCoeffs = pCoeffs;
@@ -91,8 +89,7 @@ arm_status arm_fir_interpolate_init_q31(
S->phaseLength = numTaps / L;
/* Clear state buffer and size of buffer is always phaseLength + blockSize - 1 */
- memset(pState, 0,
- (blockSize + ((uint32_t) S->phaseLength - 1U)) * sizeof(q31_t));
+ memset(pState, 0, (blockSize + ((uint32_t) S->phaseLength - 1U)) * sizeof(q31_t));
/* Assign state pointer */
S->pState = pState;
@@ -101,9 +98,8 @@ arm_status arm_fir_interpolate_init_q31(
}
return (status);
-
}
- /**
- * @} end of FIR_Interpolate group
- */
+/**
+ @} end of FIR_Interpolate group
+ */
diff --git a/DSP/Source/FilteringFunctions/arm_fir_interpolate_q15.c b/DSP/Source/FilteringFunctions/arm_fir_interpolate_q15.c
index dc0cb4b..7efec94 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_interpolate_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_interpolate_q15.c
@@ -3,13 +3,13 @@
* Title: arm_fir_interpolate_q15.c
* Description: Q15 FIR interpolation
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,68 +29,72 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_Interpolate
- * @{
+ @addtogroup FIR_Interpolate
+ @{
*/
/**
- * @brief Processing function for the Q15 FIR interpolator.
- * @param[in] *S points to an instance of the Q15 FIR interpolator structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in] blockSize number of input samples to process per call.
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
- * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
- * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
- * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
- * Lastly, the accumulator is saturated to yield a result in 1.15 format.
+ @brief Processing function for the Q15 FIR interpolator.
+ @param[in] S points to an instance of the Q15 FIR interpolator structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+ Lastly, the accumulator is saturated to yield a result in 1.15 format.
*/
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
void arm_fir_interpolate_q15(
const arm_fir_interpolate_instance_q15 * S,
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t blockSize)
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
{
- q15_t *pState = S->pState; /* State pointer */
- q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q15_t *pStateCurnt; /* Points to the current sample of the state */
- q15_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
- q63_t sum0; /* Accumulators */
- q15_t x0, c0; /* Temporary variables to hold state and coefficient values */
- uint32_t i, blkCnt, j, tapCnt; /* Loop counters */
- uint16_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
- uint32_t blkCntN2;
- q63_t acc0, acc1;
- q15_t x1;
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
+
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCur; /* Points to the current sample of the state */
+ q15_t *ptr1; /* Temporary pointer for state buffer */
+ const q15_t *ptr2; /* Temporary pointer for coefficient buffer */
+ q63_t sum0; /* Accumulators */
+ uint32_t i, blkCnt, tapCnt; /* Loop counters */
+ uint32_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
+ uint32_t j;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc0, acc1, acc2, acc3;
+ q15_t x0, x1, x2, x3;
+ q15_t c0, c1, c2, c3;
+#endif
/* S->pState buffer contains previous frame (phaseLen - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = S->pState + ((q31_t) phaseLen - 1);
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (phaseLen - 1U);
- /* Initialise blkCnt */
- blkCnt = blockSize / 2;
- blkCntN2 = blockSize - (2 * blkCnt);
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
- /* Samples loop unrolled by 2 */
while (blkCnt > 0U)
{
/* Copy new input sample into the state buffer */
- *pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
/* Address modifier index of coefficient buffer */
j = 1U;
@@ -103,6 +107,8 @@ void arm_fir_interpolate_q15(
/* Set accumulator to zero */
acc0 = 0;
acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
/* Initialize state pointer */
ptr1 = pState;
@@ -111,55 +117,62 @@ void arm_fir_interpolate_q15(
ptr2 = pCoeffs + (S->L - j);
/* Loop over the polyPhase length. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ Repeat until we've computed numTaps-(4*S->L) coefficients. */
tapCnt = phaseLen >> 2U;
x0 = *(ptr1++);
+ x1 = *(ptr1++);
+ x2 = *(ptr1++);
while (tapCnt > 0U)
{
-
/* Read the input sample */
- x1 = *(ptr1++);
+ x3 = *(ptr1++);
/* Read the coefficient */
c0 = *(ptr2);
/* Perform the multiply-accumulate */
- acc0 += (q63_t) x0 *c0;
- acc1 += (q63_t) x1 *c0;
-
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+ acc3 += (q63_t) x3 * c0;
/* Read the coefficient */
- c0 = *(ptr2 + S->L);
+ c1 = *(ptr2 + S->L);
/* Read the input sample */
x0 = *(ptr1++);
/* Perform the multiply-accumulate */
- acc0 += (q63_t) x1 *c0;
- acc1 += (q63_t) x0 *c0;
-
+ acc0 += (q63_t) x1 * c1;
+ acc1 += (q63_t) x2 * c1;
+ acc2 += (q63_t) x3 * c1;
+ acc3 += (q63_t) x0 * c1;
/* Read the coefficient */
- c0 = *(ptr2 + S->L * 2);
+ c2 = *(ptr2 + S->L * 2);
/* Read the input sample */
x1 = *(ptr1++);
/* Perform the multiply-accumulate */
- acc0 += (q63_t) x0 *c0;
- acc1 += (q63_t) x1 *c0;
+ acc0 += (q63_t) x2 * c2;
+ acc1 += (q63_t) x3 * c2;
+ acc2 += (q63_t) x0 * c2;
+ acc3 += (q63_t) x1 * c2;
/* Read the coefficient */
- c0 = *(ptr2 + S->L * 3);
+ c3 = *(ptr2 + S->L * 3);
/* Read the input sample */
- x0 = *(ptr1++);
+ x2 = *(ptr1++);
/* Perform the multiply-accumulate */
- acc0 += (q63_t) x1 *c0;
- acc1 += (q63_t) x0 *c0;
+ acc0 += (q63_t) x3 * c3;
+ acc1 += (q63_t) x0 * c3;
+ acc2 += (q63_t) x1 * c3;
+ acc3 += (q63_t) x2 * c3;
/* Upsampling is done by stuffing L-1 zeros between each sample.
@@ -167,7 +180,7 @@ void arm_fir_interpolate_q15(
* Increment the coefficient pointer by interpolation factor times. */
ptr2 += 4 * S->L;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -176,59 +189,69 @@ void arm_fir_interpolate_q15(
while (tapCnt > 0U)
{
-
/* Read the input sample */
- x1 = *(ptr1++);
+ x3 = *(ptr1++);
/* Read the coefficient */
c0 = *(ptr2);
/* Perform the multiply-accumulate */
- acc0 += (q63_t) x0 *c0;
- acc1 += (q63_t) x1 *c0;
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+ acc3 += (q63_t) x3 * c0;
/* Increment the coefficient pointer by interpolation factor times. */
ptr2 += S->L;
/* update states for next sample processing */
x0 = x1;
+ x1 = x2;
+ x2 = x3;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
/* The result is in the accumulator, store in the destination buffer. */
- *pDst = (q15_t) (__SSAT((acc0 >> 15), 16));
- *(pDst + S->L) = (q15_t) (__SSAT((acc1 >> 15), 16));
+ *(pDst ) = (q15_t) (__SSAT((acc0 >> 15), 16));
+ *(pDst + S->L) = (q15_t) (__SSAT((acc1 >> 15), 16));
+ *(pDst + 2 * S->L) = (q15_t) (__SSAT((acc2 >> 15), 16));
+ *(pDst + 3 * S->L) = (q15_t) (__SSAT((acc3 >> 15), 16));
pDst++;
/* Increment the address modifier index of coefficient buffer */
j++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
i--;
}
/* Advance the state pointer by 1
* to process the next group of interpolation factor number samples */
- pState = pState + 2;
+ pState = pState + 4;
- pDst += S->L;
+ pDst += S->L * 3;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blkCntN2;
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
- /* Loop over the blockSize. */
while (blkCnt > 0U)
{
/* Copy new input sample into the state buffer */
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
/* Address modifier index of coefficient buffer */
j = 1U;
@@ -246,90 +269,65 @@ void arm_fir_interpolate_q15(
/* Initialize coefficient pointer */
ptr2 = pCoeffs + (S->L - j);
- /* Loop over the polyPhase length. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
- tapCnt = phaseLen >> 2;
+ /* Loop over the polyPhase length.
+ Repeat until we've computed numTaps-(4*S->L) coefficients. */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ tapCnt = phaseLen >> 2U;
+
while (tapCnt > 0U)
{
-
- /* Read the coefficient */
- c0 = *(ptr2);
+ /* Perform the multiply-accumulate */
+ sum0 += (q63_t) *ptr1++ * *ptr2;
/* Upsampling is done by stuffing L-1 zeros between each sample.
* So instead of multiplying zeros with coefficients,
* Increment the coefficient pointer by interpolation factor times. */
ptr2 += S->L;
- /* Read the input sample */
- x0 = *(ptr1++);
-
- /* Perform the multiply-accumulate */
- sum0 += (q63_t) x0 *c0;
-
- /* Read the coefficient */
- c0 = *(ptr2);
-
- /* Increment the coefficient pointer by interpolation factor times. */
+ sum0 += (q63_t) *ptr1++ * *ptr2;
ptr2 += S->L;
- /* Read the input sample */
- x0 = *(ptr1++);
-
- /* Perform the multiply-accumulate */
- sum0 += (q63_t) x0 *c0;
-
- /* Read the coefficient */
- c0 = *(ptr2);
-
- /* Increment the coefficient pointer by interpolation factor times. */
+ sum0 += (q63_t) *ptr1++ * *ptr2;
ptr2 += S->L;
- /* Read the input sample */
- x0 = *(ptr1++);
-
- /* Perform the multiply-accumulate */
- sum0 += (q63_t) x0 *c0;
-
- /* Read the coefficient */
- c0 = *(ptr2);
-
- /* Increment the coefficient pointer by interpolation factor times. */
+ sum0 += (q63_t) *ptr1++ * *ptr2;
ptr2 += S->L;
- /* Read the input sample */
- x0 = *(ptr1++);
-
- /* Perform the multiply-accumulate */
- sum0 += (q63_t) x0 *c0;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
- tapCnt = phaseLen & 0x3U;
+ /* Loop unrolling: Compute remaining outputs */
+ tapCnt = phaseLen % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = phaseLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (tapCnt > 0U)
{
- /* Read the coefficient */
- c0 = *(ptr2);
+ /* Perform the multiply-accumulate */
+ sum0 += (q63_t) *ptr1++ * *ptr2;
- /* Increment the coefficient pointer by interpolation factor times. */
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
ptr2 += S->L;
- /* Read the input sample */
- x0 = *(ptr1++);
-
- /* Perform the multiply-accumulate */
- sum0 += (q63_t) x0 *c0;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
/* The result is in the accumulator, store in the destination buffer. */
*pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
+ /* Increment the address modifier index of coefficient buffer */
j++;
/* Decrement the loop counter */
@@ -344,71 +342,62 @@ void arm_fir_interpolate_q15(
blkCnt--;
}
-
/* Processing is complete.
- ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
- ** This prepares the state buffer for the next function call. */
+ Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
- pStateCurnt = S->pState;
+ pStateCur = S->pState;
- i = ((uint32_t) phaseLen - 1U) >> 2U;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ tapCnt = (phaseLen - 1U) >> 2U;
/* copy data */
- while (i > 0U)
+ while (tapCnt > 0U)
{
-#ifndef UNALIGNED_SUPPORT_DISABLE
+ write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
+ write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
- *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
- *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ tapCnt = (phaseLen - 1U) % 0x04U;
#else
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
+ /* Initialize tapCnt with number of samples */
+ tapCnt = (phaseLen - 1U);
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
- /* Decrement the loop counter */
- i--;
- }
-
- i = ((uint32_t) phaseLen - 1U) % 0x04U;
-
- while (i > 0U)
+ /* Copy data */
+ while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
- i--;
+ /* Decrement loop counter */
+ tapCnt--;
}
-}
#else
+/* alternate version for CM0_FAMILY */
- /* Run the below code for Cortex-M0 */
-
-void arm_fir_interpolate_q15(
- const arm_fir_interpolate_instance_q15 * S,
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t blockSize)
-{
- q15_t *pState = S->pState; /* State pointer */
- q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q15_t *pStateCurnt; /* Points to the current sample of the state */
- q15_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
- q63_t sum; /* Accumulator */
- q15_t x0, c0; /* Temporary variables to hold state and coefficient values */
- uint32_t i, blkCnt, tapCnt; /* Loop counters */
- uint16_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
-
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCur; /* Points to the current sample of the state */
+ q15_t *ptr1; /* Temporary pointer for state buffer */
+ const q15_t *ptr2; /* Temporary pointer for coefficient buffer */
+ q63_t sum0; /* Accumulators */
+ uint32_t i, blkCnt, tapCnt; /* Loop counters */
+ uint32_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
/* S->pState buffer contains previous frame (phaseLen - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = S->pState + (phaseLen - 1U);
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (phaseLen - 1U);
/* Total number of intput samples */
blkCnt = blockSize;
@@ -417,7 +406,7 @@ void arm_fir_interpolate_q15(
while (blkCnt > 0U)
{
/* Copy new input sample into the state buffer */
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
/* Loop over the Interpolation factor. */
i = S->L;
@@ -425,7 +414,7 @@ void arm_fir_interpolate_q15(
while (i > 0U)
{
/* Set accumulator to zero */
- sum = 0;
+ sum0 = 0;
/* Initialize state pointer */
ptr1 = pState;
@@ -434,30 +423,24 @@ void arm_fir_interpolate_q15(
ptr2 = pCoeffs + (i - 1U);
/* Loop over the polyPhase length */
- tapCnt = (uint32_t) phaseLen;
+ tapCnt = phaseLen;
while (tapCnt > 0U)
{
- /* Read the coefficient */
- c0 = *ptr2;
+ /* Perform the multiply-accumulate */
+ sum0 += ((q63_t) *ptr1++ * *ptr2);
/* Increment the coefficient pointer by interpolation factor times. */
ptr2 += S->L;
- /* Read the input sample */
- x0 = *ptr1++;
-
- /* Perform the multiply-accumulate */
- sum += ((q31_t) x0 * c0);
-
/* Decrement the loop counter */
tapCnt--;
}
- /* Store the result after converting to 1.15 format in the destination buffer */
- *pDst++ = (q15_t) (__SSAT((sum >> 15), 16));
+ /* Store the result after converting to 1.15 format in the destination buffer. */
+ *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
- /* Decrement the loop counter */
+ /* Decrement loop counter */
i--;
}
@@ -465,7 +448,7 @@ void arm_fir_interpolate_q15(
* to process the next group of interpolation factor number samples */
pState = pState + 1;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -474,23 +457,23 @@ void arm_fir_interpolate_q15(
** This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
- pStateCurnt = S->pState;
+ pStateCur = S->pState;
- i = (uint32_t) phaseLen - 1U;
+ tapCnt = phaseLen - 1U;
- while (i > 0U)
+ /* Copy data */
+ while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
- i--;
+ /* Decrement loop counter */
+ tapCnt--;
}
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
-
- /**
- * @} end of FIR_Interpolate group
- */
+/**
+ @} end of FIR_Interpolate group
+ */
diff --git a/DSP/Source/FilteringFunctions/arm_fir_interpolate_q31.c b/DSP/Source/FilteringFunctions/arm_fir_interpolate_q31.c
index 2d23b37..d6a8ca3 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_interpolate_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_interpolate_q31.c
@@ -3,13 +3,13 @@
* Title: arm_fir_interpolate_q31.c
* Description: Q31 FIR interpolation
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,69 +29,72 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_Interpolate
- * @{
+ @addtogroup FIR_Interpolate
+ @{
*/
/**
- * @brief Processing function for the Q31 FIR interpolator.
- * @param[in] *S points to an instance of the Q31 FIR interpolator structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in] blockSize number of input samples to process per call.
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * Thus, if the accumulator result overflows it wraps around rather than clip.
- * In order to avoid overflows completely the input signal must be scaled down by 1/(numTaps/L).
- * since numTaps/L additions occur per output sample.
- * After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format.
+ @brief Processing function for the Q31 FIR interpolator.
+ @param[in] S points to an instance of the Q31 FIR interpolator structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around rather than clip.
+ In order to avoid overflows completely the input signal must be scaled down by 1/(numTaps/L).
+ since numTaps/L additions occur per output sample.
+ After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format.
*/
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
void arm_fir_interpolate_q31(
const arm_fir_interpolate_instance_q31 * S,
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
{
- q31_t *pState = S->pState; /* State pointer */
- q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q31_t *pStateCurnt; /* Points to the current sample of the state */
- q31_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
- q63_t sum0; /* Accumulators */
- q31_t x0, c0; /* Temporary variables to hold state and coefficient values */
- uint32_t i, blkCnt, j; /* Loop counters */
- uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
- uint32_t blkCntN2;
- q63_t acc0, acc1;
- q31_t x1;
+ q31_t *pState = S->pState; /* State pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCur; /* Points to the current sample of the state */
+ q31_t *ptr1; /* Temporary pointer for state buffer */
+ const q31_t *ptr2; /* Temporary pointer for coefficient buffer */
+ q63_t sum0; /* Accumulators */
+ uint32_t i, blkCnt, tapCnt; /* Loop counters */
+ uint32_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
+ uint32_t j;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc0, acc1, acc2, acc3;
+ q31_t x0, x1, x2, x3;
+ q31_t c0, c1, c2, c3;
+#endif
/* S->pState buffer contains previous frame (phaseLen - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = S->pState + ((q31_t) phaseLen - 1);
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (phaseLen - 1U);
- /* Initialise blkCnt */
- blkCnt = blockSize / 2;
- blkCntN2 = blockSize - (2 * blkCnt);
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
- /* Samples loop unrolled by 2 */
while (blkCnt > 0U)
{
/* Copy new input sample into the state buffer */
- *pStateCurnt++ = *pSrc++;
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
/* Address modifier index of coefficient buffer */
j = 1U;
@@ -104,6 +107,8 @@ void arm_fir_interpolate_q31(
/* Set accumulator to zero */
acc0 = 0;
acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
/* Initialize state pointer */
ptr1 = pState;
@@ -112,55 +117,62 @@ void arm_fir_interpolate_q31(
ptr2 = pCoeffs + (S->L - j);
/* Loop over the polyPhase length. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ Repeat until we've computed numTaps-(4*S->L) coefficients. */
tapCnt = phaseLen >> 2U;
x0 = *(ptr1++);
+ x1 = *(ptr1++);
+ x2 = *(ptr1++);
while (tapCnt > 0U)
{
-
/* Read the input sample */
- x1 = *(ptr1++);
+ x3 = *(ptr1++);
/* Read the coefficient */
c0 = *(ptr2);
/* Perform the multiply-accumulate */
- acc0 += (q63_t) x0 *c0;
- acc1 += (q63_t) x1 *c0;
-
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+ acc3 += (q63_t) x3 * c0;
/* Read the coefficient */
- c0 = *(ptr2 + S->L);
+ c1 = *(ptr2 + S->L);
/* Read the input sample */
x0 = *(ptr1++);
/* Perform the multiply-accumulate */
- acc0 += (q63_t) x1 *c0;
- acc1 += (q63_t) x0 *c0;
-
+ acc0 += (q63_t) x1 * c1;
+ acc1 += (q63_t) x2 * c1;
+ acc2 += (q63_t) x3 * c1;
+ acc3 += (q63_t) x0 * c1;
/* Read the coefficient */
- c0 = *(ptr2 + S->L * 2);
+ c2 = *(ptr2 + S->L * 2);
/* Read the input sample */
x1 = *(ptr1++);
/* Perform the multiply-accumulate */
- acc0 += (q63_t) x0 *c0;
- acc1 += (q63_t) x1 *c0;
+ acc0 += (q63_t) x2 * c2;
+ acc1 += (q63_t) x3 * c2;
+ acc2 += (q63_t) x0 * c2;
+ acc3 += (q63_t) x1 * c2;
/* Read the coefficient */
- c0 = *(ptr2 + S->L * 3);
+ c3 = *(ptr2 + S->L * 3);
/* Read the input sample */
- x0 = *(ptr1++);
+ x2 = *(ptr1++);
/* Perform the multiply-accumulate */
- acc0 += (q63_t) x1 *c0;
- acc1 += (q63_t) x0 *c0;
+ acc0 += (q63_t) x3 * c3;
+ acc1 += (q63_t) x0 * c3;
+ acc2 += (q63_t) x1 * c3;
+ acc3 += (q63_t) x2 * c3;
/* Upsampling is done by stuffing L-1 zeros between each sample.
@@ -168,7 +180,7 @@ void arm_fir_interpolate_q31(
* Increment the coefficient pointer by interpolation factor times. */
ptr2 += 4 * S->L;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -177,60 +189,69 @@ void arm_fir_interpolate_q31(
while (tapCnt > 0U)
{
-
/* Read the input sample */
- x1 = *(ptr1++);
+ x3 = *(ptr1++);
/* Read the coefficient */
c0 = *(ptr2);
/* Perform the multiply-accumulate */
- acc0 += (q63_t) x0 *c0;
- acc1 += (q63_t) x1 *c0;
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+ acc3 += (q63_t) x3 * c0;
/* Increment the coefficient pointer by interpolation factor times. */
ptr2 += S->L;
/* update states for next sample processing */
x0 = x1;
+ x1 = x2;
+ x2 = x3;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
/* The result is in the accumulator, store in the destination buffer. */
- *pDst = (q31_t) (acc0 >> 31);
- *(pDst + S->L) = (q31_t) (acc1 >> 31);
-
+ *(pDst ) = (q31_t) (acc0 >> 31);
+ *(pDst + S->L) = (q31_t) (acc1 >> 31);
+ *(pDst + 2 * S->L) = (q31_t) (acc2 >> 31);
+ *(pDst + 3 * S->L) = (q31_t) (acc3 >> 31);
pDst++;
/* Increment the address modifier index of coefficient buffer */
j++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
i--;
}
/* Advance the state pointer by 1
* to process the next group of interpolation factor number samples */
- pState = pState + 2;
+ pState = pState + 4;
- pDst += S->L;
+ pDst += S->L * 3;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blkCntN2;
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
- /* Loop over the blockSize. */
while (blkCnt > 0U)
{
/* Copy new input sample into the state buffer */
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
/* Address modifier index of coefficient buffer */
j = 1U;
@@ -248,84 +269,58 @@ void arm_fir_interpolate_q31(
/* Initialize coefficient pointer */
ptr2 = pCoeffs + (S->L - j);
- /* Loop over the polyPhase length. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
- tapCnt = phaseLen >> 2;
+ /* Loop over the polyPhase length.
+ Repeat until we've computed numTaps-(4*S->L) coefficients. */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ tapCnt = phaseLen >> 2U;
+
while (tapCnt > 0U)
{
-
- /* Read the coefficient */
- c0 = *(ptr2);
+ /* Perform the multiply-accumulate */
+ sum0 += (q63_t) *ptr1++ * *ptr2;
/* Upsampling is done by stuffing L-1 zeros between each sample.
* So instead of multiplying zeros with coefficients,
* Increment the coefficient pointer by interpolation factor times. */
ptr2 += S->L;
- /* Read the input sample */
- x0 = *(ptr1++);
-
- /* Perform the multiply-accumulate */
- sum0 += (q63_t) x0 *c0;
-
- /* Read the coefficient */
- c0 = *(ptr2);
-
- /* Increment the coefficient pointer by interpolation factor times. */
+ sum0 += (q63_t) *ptr1++ * *ptr2;
ptr2 += S->L;
- /* Read the input sample */
- x0 = *(ptr1++);
-
- /* Perform the multiply-accumulate */
- sum0 += (q63_t) x0 *c0;
-
- /* Read the coefficient */
- c0 = *(ptr2);
-
- /* Increment the coefficient pointer by interpolation factor times. */
+ sum0 += (q63_t) *ptr1++ * *ptr2;
ptr2 += S->L;
- /* Read the input sample */
- x0 = *(ptr1++);
-
- /* Perform the multiply-accumulate */
- sum0 += (q63_t) x0 *c0;
-
- /* Read the coefficient */
- c0 = *(ptr2);
-
- /* Increment the coefficient pointer by interpolation factor times. */
+ sum0 += (q63_t) *ptr1++ * *ptr2;
ptr2 += S->L;
- /* Read the input sample */
- x0 = *(ptr1++);
-
- /* Perform the multiply-accumulate */
- sum0 += (q63_t) x0 *c0;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
- tapCnt = phaseLen & 0x3U;
+ /* Loop unrolling: Compute remaining outputs */
+ tapCnt = phaseLen % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = phaseLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (tapCnt > 0U)
{
- /* Read the coefficient */
- c0 = *(ptr2);
+ /* Perform the multiply-accumulate */
+ sum0 += (q63_t) *ptr1++ * *ptr2;
- /* Increment the coefficient pointer by interpolation factor times. */
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
ptr2 += S->L;
- /* Read the input sample */
- x0 = *(ptr1++);
-
- /* Perform the multiply-accumulate */
- sum0 += (q63_t) x0 *c0;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -348,64 +343,63 @@ void arm_fir_interpolate_q31(
}
/* Processing is complete.
- ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
- ** This prepares the state buffer for the next function call. */
+ Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
- pStateCurnt = S->pState;
+ pStateCur = S->pState;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
tapCnt = (phaseLen - 1U) >> 2U;
/* copy data */
while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
+ /* Loop unrolling: Compute remaining outputs */
tapCnt = (phaseLen - 1U) % 0x04U;
- /* copy data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
-}
-
-
#else
-void arm_fir_interpolate_q31(
- const arm_fir_interpolate_instance_q31 * S,
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
-{
- q31_t *pState = S->pState; /* State pointer */
- q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q31_t *pStateCurnt; /* Points to the current sample of the state */
- q31_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
+ /* Initialize tapCnt with number of samples */
+ tapCnt = (phaseLen - 1U);
- /* Run the below code for Cortex-M0 */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
- q63_t sum; /* Accumulator */
- q31_t x0, c0; /* Temporary variables to hold state and coefficient values */
- uint32_t i, blkCnt; /* Loop counters */
- uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+#else
+/* alternate version for CM0_FAMILY */
+
+ q31_t *pState = S->pState; /* State pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCur; /* Points to the current sample of the state */
+ q31_t *ptr1; /* Temporary pointer for state buffer */
+ const q31_t *ptr2; /* Temporary pointer for coefficient buffer */
+ q63_t sum0; /* Accumulators */
+ uint32_t i, blkCnt, tapCnt; /* Loop counters */
+ uint32_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
/* S->pState buffer contains previous frame (phaseLen - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = S->pState + ((q31_t) phaseLen - 1);
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (phaseLen - 1U);
/* Total number of intput samples */
blkCnt = blockSize;
@@ -414,7 +408,7 @@ void arm_fir_interpolate_q31(
while (blkCnt > 0U)
{
/* Copy new input sample into the state buffer */
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
/* Loop over the Interpolation factor. */
i = S->L;
@@ -422,7 +416,7 @@ void arm_fir_interpolate_q31(
while (i > 0U)
{
/* Set accumulator to zero */
- sum = 0;
+ sum0 = 0;
/* Initialize state pointer */
ptr1 = pState;
@@ -430,30 +424,25 @@ void arm_fir_interpolate_q31(
/* Initialize coefficient pointer */
ptr2 = pCoeffs + (i - 1U);
+ /* Loop over the polyPhase length */
tapCnt = phaseLen;
while (tapCnt > 0U)
{
- /* Read the coefficient */
- c0 = *(ptr2);
+ /* Perform the multiply-accumulate */
+ sum0 += ((q63_t) *ptr1++ * *ptr2);
/* Increment the coefficient pointer by interpolation factor times. */
ptr2 += S->L;
- /* Read the input sample */
- x0 = *ptr1++;
-
- /* Perform the multiply-accumulate */
- sum += (q63_t) x0 *c0;
-
/* Decrement the loop counter */
tapCnt--;
}
/* The result is in the accumulator, store in the destination buffer. */
- *pDst++ = (q31_t) (sum >> 31);
+ *pDst++ = (q31_t) (sum0 >> 31);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
i--;
}
@@ -461,32 +450,32 @@ void arm_fir_interpolate_q31(
* to process the next group of interpolation factor number samples */
pState = pState + 1;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
/* Processing is complete.
- ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
+ ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
** This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
- pStateCurnt = S->pState;
+ pStateCur = S->pState;
tapCnt = phaseLen - 1U;
- /* copy data */
+ /* Copy data */
while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
- /**
- * @} end of FIR_Interpolate group
- */
+/**
+ @} end of FIR_Interpolate group
+ */
diff --git a/DSP/Source/FilteringFunctions/arm_fir_lattice_f32.c b/DSP/Source/FilteringFunctions/arm_fir_lattice_f32.c
index 369c9e4..a3d95c1 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_lattice_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_lattice_f32.c
@@ -1,15 +1,15 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_fir_lattice_f32.c
- * Description: Processing function for the floating-point FIR Lattice filter
+ * Description: Processing function for floating-point FIR Lattice filter
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,282 +29,305 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @defgroup FIR_Lattice Finite Impulse Response (FIR) Lattice Filters
- *
- * This set of functions implements Finite Impulse Response (FIR) lattice filters
- * for Q15, Q31 and floating-point data types. Lattice filters are used in a
- * variety of adaptive filter applications. The filter structure is feedforward and
- * the net impulse response is finite length.
- * The functions operate on blocks
- * of input and output data and each call to the function processes
- * blockSize samples through the filter. pSrc and
- * pDst point to input and output arrays containing blockSize values.
- *
- * \par Algorithm:
- * \image html FIRLattice.gif "Finite Impulse Response Lattice filter"
- * The following difference equation is implemented:
- *
- * f0[n] = g0[n] = x[n]
- * fm[n] = fm-1[n] + km * gm-1[n-1] for m = 1, 2, ...M
- * gm[n] = km * fm-1[n] + gm-1[n-1] for m = 1, 2, ...M
- * y[n] = fM[n]
- *
- * \par
- * pCoeffs points to tha array of reflection coefficients of size numStages.
- * Reflection Coefficients are stored in the following order.
- * \par
- *
- * {k1, k2, ..., kM}
- *
- * where M is number of stages
- * \par
- * pState points to a state array of size numStages.
- * The state variables (g values) hold previous inputs and are stored in the following order.
- *
- * {g0[n], g1[n], g2[n] ...gM-1[n]}
- *
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
- * \par Instance Structure
- * The coefficients and state variables for a filter are stored together in an instance data structure.
- * A separate instance structure must be defined for each filter.
- * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
- * There are separate instance structure declarations for each of the 3 supported data types.
- *
- * \par Initialization Functions
- * There is also an associated initialization function for each data type.
- * The initialization function performs the following operations:
- * - Sets the values of the internal structure fields.
- * - Zeros out the values in the state buffer.
- * To do this manually without calling the init function, assign the follow subfields of the instance structure:
- * numStages, pCoeffs, pState. Also set all of the values in pState to zero.
- *
- * \par
- * Use of the initialization function is optional.
- * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
- * To place an instance structure into a const data section, the instance structure must be manually initialized.
- * Set the values in the state buffer to zeros and then manually initialize the instance structure as follows:
- *
- *arm_fir_lattice_instance_f32 S = {numStages, pState, pCoeffs};
- *arm_fir_lattice_instance_q31 S = {numStages, pState, pCoeffs};
- *arm_fir_lattice_instance_q15 S = {numStages, pState, pCoeffs};
- *
- * \par
- * where numStages is the number of stages in the filter; pState is the address of the state buffer;
- * pCoeffs is the address of the coefficient buffer.
- * \par Fixed-Point Behavior
- * Care must be taken when using the fixed-point versions of the FIR Lattice filter functions.
- * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
- * Refer to the function specific documentation below for usage guidelines.
+ @defgroup FIR_Lattice Finite Impulse Response (FIR) Lattice Filters
+
+ This set of functions implements Finite Impulse Response (FIR) lattice filters
+ for Q15, Q31 and floating-point data types. Lattice filters are used in a
+ variety of adaptive filter applications. The filter structure is feedforward and
+ the net impulse response is finite length.
+ The functions operate on blocks
+ of input and output data and each call to the function processes
+ blockSize samples through the filter. pSrc and
+ pDst point to input and output arrays containing blockSize values.
+
+ @par Algorithm
+ \image html FIRLattice.gif "Finite Impulse Response Lattice filter"
+ The following difference equation is implemented:
+ @par
+
+ f0[n] = g0[n] = x[n]
+ fm[n] = fm-1[n] + km * gm-1[n-1] for m = 1, 2, ...M
+ gm[n] = km * fm-1[n] + gm-1[n-1] for m = 1, 2, ...M
+ y[n] = fM[n]
+
+ @par
+ pCoeffs points to tha array of reflection coefficients of size numStages.
+ Reflection Coefficients are stored in the following order.
+ @par
+
+ {k1, k2, ..., kM}
+
+ where M is number of stages
+ @par
+ pState points to a state array of size numStages.
+ The state variables (g values) hold previous inputs and are stored in the following order.
+
+ {g0[n], g1[n], g2[n] ...gM-1[n]}
+
+ The state variables are updated after each block of data is processed; the coefficients are untouched.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter.
+ Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ There are separate instance structure declarations for each of the 3 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ To do this manually without calling the init function, assign the follow subfields of the instance structure:
+ numStages, pCoeffs, pState. Also set all of the values in pState to zero.
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ Set the values in the state buffer to zeros and then manually initialize the instance structure as follows:
+
+ arm_fir_lattice_instance_f32 S = {numStages, pState, pCoeffs};
+ arm_fir_lattice_instance_q31 S = {numStages, pState, pCoeffs};
+ arm_fir_lattice_instance_q15 S = {numStages, pState, pCoeffs};
+
+ @par
+ where numStages is the number of stages in the filter;
+ pState is the address of the state buffer;
+ pCoeffs is the address of the coefficient buffer.
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the fixed-point versions of the FIR Lattice filter functions.
+ In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ Refer to the function specific documentation below for usage guidelines.
*/
/**
- * @addtogroup FIR_Lattice
- * @{
+ @addtogroup FIR_Lattice
+ @{
*/
-
- /**
- * @brief Processing function for the floating-point FIR lattice filter.
- * @param[in] *S points to an instance of the floating-point FIR lattice structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data
- * @param[in] blockSize number of samples to process.
- * @return none.
- */
+/**
+ @brief Processing function for the floating-point FIR lattice filter.
+ @param[in] S points to an instance of the floating-point FIR lattice structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+ */
void arm_fir_lattice_f32(
const arm_fir_lattice_instance_f32 * S,
- float32_t * pSrc,
- float32_t * pDst,
- uint32_t blockSize)
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
{
- float32_t *pState; /* State pointer */
- float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- float32_t *px; /* temporary state pointer */
- float32_t *pk; /* temporary coefficient pointer */
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *px; /* Temporary state pointer */
+ const float32_t *pk; /* Temporary coefficient pointer */
+ uint32_t numStages = S->numStages; /* Number of stages in the filter */
+ uint32_t blkCnt, stageCnt; /* Loop counters */
+ float32_t fcurr0, fnext0, gnext0, gcurr0; /* Temporary variables */
+#if defined (ARM_MATH_LOOPUNROLL)
+ float32_t fcurr1, fnext1, gnext1; /* Temporary variables for second sample in loop unrolling */
+ float32_t fcurr2, fnext2, gnext2; /* Temporary variables for third sample in loop unrolling */
+ float32_t fcurr3, fnext3, gnext3; /* Temporary variables for fourth sample in loop unrolling */
+#endif
-#if defined (ARM_MATH_DSP)
+ gcurr0 = 0.0f;
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
- float32_t fcurr1, fnext1, gcurr1, gnext1; /* temporary variables for first sample in loop unrolling */
- float32_t fcurr2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */
- float32_t fcurr3, fnext3, gnext3; /* temporary variables for third sample in loop unrolling */
- float32_t fcurr4, fnext4, gnext4; /* temporary variables for fourth sample in loop unrolling */
- uint32_t numStages = S->numStages; /* Number of stages in the filter */
- uint32_t blkCnt, stageCnt; /* temporary variables for counts */
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
- gcurr1 = 0.0f;
- pState = &S->pState[0];
-
- blkCnt = blockSize >> 2;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
-
/* Read two samples from input buffer */
/* f0(n) = x(n) */
+ fcurr0 = *pSrc++;
fcurr1 = *pSrc++;
- fcurr2 = *pSrc++;
-
- /* Initialize coeff pointer */
- pk = (pCoeffs);
/* Initialize state pointer */
px = pState;
- /* Read g0(n-1) from state */
- gcurr1 = *px;
+ /* Initialize coeff pointer */
+ pk = pCoeffs;
+
+ /* Read g0(n-1) from state buffer */
+ gcurr0 = *px;
/* Process first sample for first tap */
/* f1(n) = f0(n) + K1 * g0(n-1) */
- fnext1 = fcurr1 + ((*pk) * gcurr1);
+ fnext0 = (gcurr0 * (*pk)) + fcurr0;
+
/* g1(n) = f0(n) * K1 + g0(n-1) */
- gnext1 = (fcurr1 * (*pk)) + gcurr1;
+ gnext0 = (fcurr0 * (*pk)) + gcurr0;
/* Process second sample for first tap */
- /* for sample 2 processing */
- fnext2 = fcurr2 + ((*pk) * fcurr1);
- gnext2 = (fcurr2 * (*pk)) + fcurr1;
+ fnext1 = (fcurr0 * (*pk)) + fcurr1;
+ gnext1 = (fcurr1 * (*pk)) + fcurr0;
/* Read next two samples from input buffer */
/* f0(n+2) = x(n+2) */
+ fcurr2 = *pSrc++;
fcurr3 = *pSrc++;
- fcurr4 = *pSrc++;
-
- /* Copy only last input samples into the state buffer
- which will be used for next four samples processing */
- *px++ = fcurr4;
/* Process third sample for first tap */
- fnext3 = fcurr3 + ((*pk) * fcurr2);
- gnext3 = (fcurr3 * (*pk)) + fcurr2;
+ fnext2 = (fcurr1 * (*pk)) + fcurr2;
+ gnext2 = (fcurr2 * (*pk)) + fcurr1;
/* Process fourth sample for first tap */
- fnext4 = fcurr4 + ((*pk) * fcurr3);
- gnext4 = (fcurr4 * (*pk++)) + fcurr3;
+ fnext3 = (fcurr2 * (*pk )) + fcurr3;
+ gnext3 = (fcurr3 * (*pk++)) + fcurr2;
+
+ /* Copy only last input sample into the state buffer
+ which will be used for next samples processing */
+ *px++ = fcurr3;
/* Update of f values for next coefficient set processing */
+ fcurr0 = fnext0;
fcurr1 = fnext1;
fcurr2 = fnext2;
fcurr3 = fnext3;
- fcurr4 = fnext4;
/* Loop unrolling. Process 4 taps at a time . */
stageCnt = (numStages - 1U) >> 2U;
/* Loop over the number of taps. Unroll by a factor of 4.
- ** Repeat until we've computed numStages-3 coefficients. */
+ Repeat until we've computed numStages-3 coefficients. */
/* Process 2nd, 3rd, 4th and 5th taps ... here */
while (stageCnt > 0U)
{
/* Read g1(n-1), g3(n-1) .... from state */
- gcurr1 = *px;
+ gcurr0 = *px;
/* save g1(n) in state buffer */
- *px++ = gnext4;
+ *px++ = gnext3;
/* Process first sample for 2nd, 6th .. tap */
/* Sample processing for K2, K6.... */
/* f2(n) = f1(n) + K2 * g1(n-1) */
- fnext1 = fcurr1 + ((*pk) * gcurr1);
+ fnext0 = (gcurr0 * (*pk)) + fcurr0;
+
/* Process second sample for 2nd, 6th .. tap */
/* for sample 2 processing */
- fnext2 = fcurr2 + ((*pk) * gnext1);
+ fnext1 = (gnext0 * (*pk)) + fcurr1;
+
/* Process third sample for 2nd, 6th .. tap */
- fnext3 = fcurr3 + ((*pk) * gnext2);
+ fnext2 = (gnext1 * (*pk)) + fcurr2;
+
/* Process fourth sample for 2nd, 6th .. tap */
- fnext4 = fcurr4 + ((*pk) * gnext3);
+ fnext3 = (gnext2 * (*pk)) + fcurr3;
/* g2(n) = f1(n) * K2 + g1(n-1) */
/* Calculation of state values for next stage */
- gnext4 = (fcurr4 * (*pk)) + gnext3;
gnext3 = (fcurr3 * (*pk)) + gnext2;
+
gnext2 = (fcurr2 * (*pk)) + gnext1;
- gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+ gnext1 = (fcurr1 * (*pk)) + gnext0;
+
+ gnext0 = (fcurr0 * (*pk++)) + gcurr0;
/* Read g2(n-1), g4(n-1) .... from state */
- gcurr1 = *px;
+ gcurr0 = *px;
/* save g2(n) in state buffer */
- *px++ = gnext4;
+ *px++ = gnext3;
/* Sample processing for K3, K7.... */
/* Process first sample for 3rd, 7th .. tap */
/* f3(n) = f2(n) + K3 * g2(n-1) */
- fcurr1 = fnext1 + ((*pk) * gcurr1);
+ fcurr0 = (gcurr0 * (*pk)) + fnext0;
+
/* Process second sample for 3rd, 7th .. tap */
- fcurr2 = fnext2 + ((*pk) * gnext1);
+ fcurr1 = (gnext0 * (*pk)) + fnext1;
+
/* Process third sample for 3rd, 7th .. tap */
- fcurr3 = fnext3 + ((*pk) * gnext2);
+ fcurr2 = (gnext1 * (*pk)) + fnext2;
+
/* Process fourth sample for 3rd, 7th .. tap */
- fcurr4 = fnext4 + ((*pk) * gnext3);
+ fcurr3 = (gnext2 * (*pk)) + fnext3;
/* Calculation of state values for next stage */
/* g3(n) = f2(n) * K3 + g2(n-1) */
- gnext4 = (fnext4 * (*pk)) + gnext3;
gnext3 = (fnext3 * (*pk)) + gnext2;
+
gnext2 = (fnext2 * (*pk)) + gnext1;
- gnext1 = (fnext1 * (*pk++)) + gcurr1;
+
+ gnext1 = (fnext1 * (*pk)) + gnext0;
+
+ gnext0 = (fnext0 * (*pk++)) + gcurr0;
/* Read g1(n-1), g3(n-1) .... from state */
- gcurr1 = *px;
+ gcurr0 = *px;
/* save g3(n) in state buffer */
- *px++ = gnext4;
+ *px++ = gnext3;
/* Sample processing for K4, K8.... */
/* Process first sample for 4th, 8th .. tap */
/* f4(n) = f3(n) + K4 * g3(n-1) */
- fnext1 = fcurr1 + ((*pk) * gcurr1);
+ fnext0 = (gcurr0 * (*pk)) + fcurr0;
+
/* Process second sample for 4th, 8th .. tap */
/* for sample 2 processing */
- fnext2 = fcurr2 + ((*pk) * gnext1);
+ fnext1 = (gnext0 * (*pk)) + fcurr1;
+
/* Process third sample for 4th, 8th .. tap */
- fnext3 = fcurr3 + ((*pk) * gnext2);
+ fnext2 = (gnext1 * (*pk)) + fcurr2;
+
/* Process fourth sample for 4th, 8th .. tap */
- fnext4 = fcurr4 + ((*pk) * gnext3);
+ fnext3 = (gnext2 * (*pk)) + fcurr3;
/* g4(n) = f3(n) * K4 + g3(n-1) */
/* Calculation of state values for next stage */
- gnext4 = (fcurr4 * (*pk)) + gnext3;
gnext3 = (fcurr3 * (*pk)) + gnext2;
+
gnext2 = (fcurr2 * (*pk)) + gnext1;
- gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+ gnext1 = (fcurr1 * (*pk)) + gnext0;
+
+ gnext0 = (fcurr0 * (*pk++)) + gcurr0;
+
/* Read g2(n-1), g4(n-1) .... from state */
- gcurr1 = *px;
+ gcurr0 = *px;
/* save g4(n) in state buffer */
- *px++ = gnext4;
+ *px++ = gnext3;
/* Sample processing for K5, K9.... */
/* Process first sample for 5th, 9th .. tap */
/* f5(n) = f4(n) + K5 * g4(n-1) */
- fcurr1 = fnext1 + ((*pk) * gcurr1);
+ fcurr0 = (gcurr0 * (*pk)) + fnext0;
+
/* Process second sample for 5th, 9th .. tap */
- fcurr2 = fnext2 + ((*pk) * gnext1);
+ fcurr1 = (gnext0 * (*pk)) + fnext1;
+
/* Process third sample for 5th, 9th .. tap */
- fcurr3 = fnext3 + ((*pk) * gnext2);
+ fcurr2 = (gnext1 * (*pk)) + fnext2;
+
/* Process fourth sample for 5th, 9th .. tap */
- fcurr4 = fnext4 + ((*pk) * gnext3);
+ fcurr3 = (gnext2 * (*pk)) + fnext3;
/* Calculation of state values for next stage */
/* g5(n) = f4(n) * K5 + g4(n-1) */
- gnext4 = (fnext4 * (*pk)) + gnext3;
gnext3 = (fnext3 * (*pk)) + gnext2;
+
gnext2 = (fnext2 * (*pk)) + gnext1;
- gnext1 = (fnext1 * (*pk++)) + gcurr1;
+
+ gnext1 = (fnext1 * (*pk)) + gnext0;
+
+ gnext0 = (fnext0 * (*pk++)) + gcurr0;
stageCnt--;
}
@@ -314,144 +337,84 @@ void arm_fir_lattice_f32(
while (stageCnt > 0U)
{
- gcurr1 = *px;
+ gcurr0 = *px;
/* save g value in state buffer */
- *px++ = gnext4;
+ *px++ = gnext3;
/* Process four samples for last three taps here */
- fnext1 = fcurr1 + ((*pk) * gcurr1);
- fnext2 = fcurr2 + ((*pk) * gnext1);
- fnext3 = fcurr3 + ((*pk) * gnext2);
- fnext4 = fcurr4 + ((*pk) * gnext3);
+ fnext0 = (gcurr0 * (*pk)) + fcurr0;
+
+ fnext1 = (gnext0 * (*pk)) + fcurr1;
+
+ fnext2 = (gnext1 * (*pk)) + fcurr2;
+
+ fnext3 = (gnext2 * (*pk)) + fcurr3;
/* g1(n) = f0(n) * K1 + g0(n-1) */
- gnext4 = (fcurr4 * (*pk)) + gnext3;
gnext3 = (fcurr3 * (*pk)) + gnext2;
+
gnext2 = (fcurr2 * (*pk)) + gnext1;
- gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+ gnext1 = (fcurr1 * (*pk)) + gnext0;
+
+ gnext0 = (fcurr0 * (*pk++)) + gcurr0;
/* Update of f values for next coefficient set processing */
+ fcurr0 = fnext0;
fcurr1 = fnext1;
fcurr2 = fnext2;
fcurr3 = fnext3;
- fcurr4 = fnext4;
stageCnt--;
-
}
/* The results in the 4 accumulators, store in the destination buffer. */
/* y(n) = fN(n) */
+ *pDst++ = fcurr0;
*pDst++ = fcurr1;
*pDst++ = fcurr2;
*pDst++ = fcurr3;
- *pDst++ = fcurr4;
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
- while (blkCnt > 0U)
- {
- /* f0(n) = x(n) */
- fcurr1 = *pSrc++;
-
- /* Initialize coeff pointer */
- pk = (pCoeffs);
-
- /* Initialize state pointer */
- px = pState;
-
- /* read g2(n) from state buffer */
- gcurr1 = *px;
-
- /* for sample 1 processing */
- /* f1(n) = f0(n) + K1 * g0(n-1) */
- fnext1 = fcurr1 + ((*pk) * gcurr1);
- /* g1(n) = f0(n) * K1 + g0(n-1) */
- gnext1 = (fcurr1 * (*pk++)) + gcurr1;
-
- /* save g1(n) in state buffer */
- *px++ = fcurr1;
-
- /* f1(n) is saved in fcurr1
- for next stage processing */
- fcurr1 = fnext1;
-
- stageCnt = (numStages - 1U);
-
- /* stage loop */
- while (stageCnt > 0U)
- {
- /* read g2(n) from state buffer */
- gcurr1 = *px;
-
- /* save g1(n) in state buffer */
- *px++ = gnext1;
-
- /* Sample processing for K2, K3.... */
- /* f2(n) = f1(n) + K2 * g1(n-1) */
- fnext1 = fcurr1 + ((*pk) * gcurr1);
- /* g2(n) = f1(n) * K2 + g1(n-1) */
- gnext1 = (fcurr1 * (*pk++)) + gcurr1;
-
- /* f1(n) is saved in fcurr1
- for next stage processing */
- fcurr1 = fnext1;
-
- stageCnt--;
-
- }
-
- /* y(n) = fN(n) */
- *pDst++ = fcurr1;
-
- blkCnt--;
-
- }
-
#else
- /* Run the below code for Cortex-M0 */
-
- float32_t fcurr, fnext, gcurr, gnext; /* temporary variables */
- uint32_t numStages = S->numStages; /* Length of the filter */
- uint32_t blkCnt, stageCnt; /* temporary variables for counts */
-
- pState = &S->pState[0];
-
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* f0(n) = x(n) */
- fcurr = *pSrc++;
+ fcurr0 = *pSrc++;
+
+ /* Initialize state pointer */
+ px = pState;
/* Initialize coeff pointer */
pk = pCoeffs;
- /* Initialize state pointer */
- px = pState;
-
- /* read g0(n-1) from state buffer */
- gcurr = *px;
+ /* read g2(n) from state buffer */
+ gcurr0 = *px;
/* for sample 1 processing */
/* f1(n) = f0(n) + K1 * g0(n-1) */
- fnext = fcurr + ((*pk) * gcurr);
+ fnext0 = (gcurr0 * (*pk)) + fcurr0;
+
/* g1(n) = f0(n) * K1 + g0(n-1) */
- gnext = (fcurr * (*pk++)) + gcurr;
+ gnext0 = (fcurr0 * (*pk++)) + gcurr0;
- /* save f0(n) in state buffer */
- *px++ = fcurr;
+ /* save g1(n) in state buffer */
+ *px++ = fcurr0;
- /* f1(n) is saved in fcurr
- for next stage processing */
- fcurr = fnext;
+ /* f1(n) is saved in fcurr0 for next stage processing */
+ fcurr0 = fnext0;
stageCnt = (numStages - 1U);
@@ -459,36 +422,32 @@ void arm_fir_lattice_f32(
while (stageCnt > 0U)
{
/* read g2(n) from state buffer */
- gcurr = *px;
+ gcurr0 = *px;
/* save g1(n) in state buffer */
- *px++ = gnext;
+ *px++ = gnext0;
/* Sample processing for K2, K3.... */
/* f2(n) = f1(n) + K2 * g1(n-1) */
- fnext = fcurr + ((*pk) * gcurr);
- /* g2(n) = f1(n) * K2 + g1(n-1) */
- gnext = (fcurr * (*pk++)) + gcurr;
+ fnext0 = (gcurr0 * (*pk)) + fcurr0;
- /* f1(n) is saved in fcurr1
- for next stage processing */
- fcurr = fnext;
+ /* g2(n) = f1(n) * K2 + g1(n-1) */
+ gnext0 = (fcurr0 * (*pk++)) + gcurr0;
+
+ /* f1(n) is saved in fcurr0 for next stage processing */
+ fcurr0 = fnext0;
stageCnt--;
-
}
/* y(n) = fN(n) */
- *pDst++ = fcurr;
+ *pDst++ = fcurr0;
blkCnt--;
-
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of FIR_Lattice group
+ @} end of FIR_Lattice group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_lattice_init_f32.c b/DSP/Source/FilteringFunctions/arm_fir_lattice_init_f32.c
index 2e31a15..7929629 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_lattice_init_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_lattice_init_f32.c
@@ -3,13 +3,13 @@
* Title: arm_fir_lattice_init_f32.c
* Description: Floating-point FIR Lattice filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,28 +29,28 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_Lattice
- * @{
+ @addtogroup FIR_Lattice
+ @{
*/
/**
- * @brief Initialization function for the floating-point FIR lattice filter.
- * @param[in] *S points to an instance of the floating-point FIR lattice structure.
- * @param[in] numStages number of filter stages.
- * @param[in] *pCoeffs points to the coefficient buffer. The array is of length numStages.
- * @param[in] *pState points to the state buffer. The array is of length numStages.
- * @return none.
+ @brief Initialization function for the floating-point FIR lattice filter.
+ @param[in] S points to an instance of the floating-point FIR lattice structure
+ @param[in] numStages number of filter stages
+ @param[in] pCoeffs points to the coefficient buffer. The array is of length numStages
+ @param[in] pState points to the state buffer. The array is of length numStages
+ @return none
*/
void arm_fir_lattice_init_f32(
- arm_fir_lattice_instance_f32 * S,
- uint16_t numStages,
- float32_t * pCoeffs,
- float32_t * pState)
+ arm_fir_lattice_instance_f32 * S,
+ uint16_t numStages,
+ const float32_t * pCoeffs,
+ float32_t * pState)
{
/* Assign filter taps */
S->numStages = numStages;
@@ -63,9 +63,8 @@ void arm_fir_lattice_init_f32(
/* Assign state pointer */
S->pState = pState;
-
}
/**
- * @} end of FIR_Lattice group
+ @} end of FIR_Lattice group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q15.c b/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q15.c
index ab5afd6..5c80dff 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q15.c
@@ -3,13 +3,13 @@
* Title: arm_fir_lattice_init_q15.c
* Description: Q15 FIR Lattice filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,28 +29,28 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_Lattice
- * @{
+ @addtogroup FIR_Lattice
+ @{
*/
- /**
- * @brief Initialization function for the Q15 FIR lattice filter.
- * @param[in] *S points to an instance of the Q15 FIR lattice structure.
- * @param[in] numStages number of filter stages.
- * @param[in] *pCoeffs points to the coefficient buffer. The array is of length numStages.
- * @param[in] *pState points to the state buffer. The array is of length numStages.
- * @return none.
- */
+/**
+ @brief Initialization function for the Q15 FIR lattice filter.
+ @param[in] S points to an instance of the Q15 FIR lattice structure
+ @param[in] numStages number of filter stages
+ @param[in] pCoeffs points to the coefficient buffer. The array is of length numStages
+ @param[in] pState points to the state buffer. The array is of length numStages
+ @return none
+ */
void arm_fir_lattice_init_q15(
- arm_fir_lattice_instance_q15 * S,
- uint16_t numStages,
- q15_t * pCoeffs,
- q15_t * pState)
+ arm_fir_lattice_instance_q15 * S,
+ uint16_t numStages,
+ const q15_t * pCoeffs,
+ q15_t * pState)
{
/* Assign filter taps */
S->numStages = numStages;
@@ -63,9 +63,8 @@ void arm_fir_lattice_init_q15(
/* Assign state pointer */
S->pState = pState;
-
}
/**
- * @} end of FIR_Lattice group
+ @} end of FIR_Lattice group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q31.c b/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q31.c
index 4dc30cc..476296d 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q31.c
@@ -3,13 +3,13 @@
* Title: arm_fir_lattice_init_q31.c
* Description: Q31 FIR lattice filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,28 +29,28 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_Lattice
- * @{
+ @addtogroup FIR_Lattice
+ @{
*/
- /**
- * @brief Initialization function for the Q31 FIR lattice filter.
- * @param[in] *S points to an instance of the Q31 FIR lattice structure.
- * @param[in] numStages number of filter stages.
- * @param[in] *pCoeffs points to the coefficient buffer. The array is of length numStages.
- * @param[in] *pState points to the state buffer. The array is of length numStages.
- * @return none.
- */
+/**
+ @brief Initialization function for the Q31 FIR lattice filter.
+ @param[in] S points to an instance of the Q31 FIR lattice structure
+ @param[in] numStages number of filter stages
+ @param[in] pCoeffs points to the coefficient buffer. The array is of length numStages
+ @param[in] pState points to the state buffer. The array is of length numStages
+ @return none
+ */
void arm_fir_lattice_init_q31(
- arm_fir_lattice_instance_q31 * S,
- uint16_t numStages,
- q31_t * pCoeffs,
- q31_t * pState)
+ arm_fir_lattice_instance_q31 * S,
+ uint16_t numStages,
+ const q31_t * pCoeffs,
+ q31_t * pState)
{
/* Assign filter taps */
S->numStages = numStages;
@@ -63,9 +63,8 @@ void arm_fir_lattice_init_q31(
/* Assign state pointer */
S->pState = pState;
-
}
/**
- * @} end of FIR_Lattice group
+ @} end of FIR_Lattice group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_lattice_q15.c b/DSP/Source/FilteringFunctions/arm_fir_lattice_q15.c
index 4c4e849..42e7c0d 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_lattice_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_lattice_q15.c
@@ -3,13 +3,13 @@
* Title: arm_fir_lattice_q15.c
* Description: Q15 FIR lattice filter processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,277 +29,275 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_Lattice
- * @{
+ @addtogroup FIR_Lattice
+ @{
*/
-
/**
- * @brief Processing function for the Q15 FIR lattice filter.
- * @param[in] *S points to an instance of the Q15 FIR lattice structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data
- * @param[in] blockSize number of samples to process.
- * @return none.
+ @brief Processing function for Q15 FIR lattice filter.
+ @param[in] S points to an instance of the Q15 FIR lattice structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
*/
void arm_fir_lattice_q15(
const arm_fir_lattice_instance_q15 * S,
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t blockSize)
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
{
- q15_t *pState; /* State pointer */
- q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q15_t *px; /* temporary state pointer */
- q15_t *pk; /* temporary coefficient pointer */
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *px; /* Temporary state pointer */
+ const q15_t *pk; /* Temporary coefficient pointer */
+ uint32_t numStages = S->numStages; /* Number of stages in the filter */
+ uint32_t blkCnt, stageCnt; /* Loop counters */
+ q31_t fcurr0, fnext0, gnext0, gcurr0; /* Temporary variables */
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t fcurr1, fnext1, gnext1; /* Temporary variables for second sample in loop unrolling */
+ q31_t fcurr2, fnext2, gnext2; /* Temporary variables for third sample in loop unrolling */
+ q31_t fcurr3, fnext3, gnext3; /* Temporary variables for fourth sample in loop unrolling */
+#endif
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ gcurr0 = 0;
- q31_t fcurnt1, fnext1, gcurnt1 = 0, gnext1; /* temporary variables for first sample in loop unrolling */
- q31_t fcurnt2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */
- q31_t fcurnt3, fnext3, gnext3; /* temporary variables for third sample in loop unrolling */
- q31_t fcurnt4, fnext4, gnext4; /* temporary variables for fourth sample in loop unrolling */
- uint32_t numStages = S->numStages; /* Number of stages in the filter */
- uint32_t blkCnt, stageCnt; /* temporary variables for counts */
-
- pState = &S->pState[0];
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
-
/* Read two samples from input buffer */
/* f0(n) = x(n) */
- fcurnt1 = *pSrc++;
- fcurnt2 = *pSrc++;
-
- /* Initialize coeff pointer */
- pk = (pCoeffs);
+ fcurr0 = *pSrc++;
+ fcurr1 = *pSrc++;
/* Initialize state pointer */
px = pState;
- /* Read g0(n-1) from state */
- gcurnt1 = *px;
+ /* Initialize coeff pointer */
+ pk = pCoeffs;
+
+ /* Read g0(n-1) from state buffer */
+ gcurr0 = *px;
/* Process first sample for first tap */
/* f1(n) = f0(n) + K1 * g0(n-1) */
- fnext1 = (q31_t) ((gcurnt1 * (*pk)) >> 15U) + fcurnt1;
- fnext1 = __SSAT(fnext1, 16);
+ fnext0 = (q31_t) ((gcurr0 * (*pk)) >> 15U) + fcurr0;
+ fnext0 = __SSAT(fnext0, 16);
/* g1(n) = f0(n) * K1 + g0(n-1) */
- gnext1 = (q31_t) ((fcurnt1 * (*pk)) >> 15U) + gcurnt1;
- gnext1 = __SSAT(gnext1, 16);
+ gnext0 = (q31_t) ((fcurr0 * (*pk)) >> 15U) + gcurr0;
+ gnext0 = __SSAT(gnext0, 16);
/* Process second sample for first tap */
- /* for sample 2 processing */
- fnext2 = (q31_t) ((fcurnt1 * (*pk)) >> 15U) + fcurnt2;
- fnext2 = __SSAT(fnext2, 16);
-
- gnext2 = (q31_t) ((fcurnt2 * (*pk)) >> 15U) + fcurnt1;
- gnext2 = __SSAT(gnext2, 16);
-
+ fnext1 = (q31_t) ((fcurr0 * (*pk)) >> 15U) + fcurr1;
+ fnext1 = __SSAT(fnext1, 16);
+ gnext1 = (q31_t) ((fcurr1 * (*pk)) >> 15U) + fcurr0;
+ gnext1 = __SSAT(gnext1, 16);
/* Read next two samples from input buffer */
/* f0(n+2) = x(n+2) */
- fcurnt3 = *pSrc++;
- fcurnt4 = *pSrc++;
-
- /* Copy only last input samples into the state buffer
- which is used for next four samples processing */
- *px++ = (q15_t) fcurnt4;
+ fcurr2 = *pSrc++;
+ fcurr3 = *pSrc++;
/* Process third sample for first tap */
- fnext3 = (q31_t) ((fcurnt2 * (*pk)) >> 15U) + fcurnt3;
- fnext3 = __SSAT(fnext3, 16);
- gnext3 = (q31_t) ((fcurnt3 * (*pk)) >> 15U) + fcurnt2;
- gnext3 = __SSAT(gnext3, 16);
+ fnext2 = (q31_t) ((fcurr1 * (*pk)) >> 15U) + fcurr2;
+ fnext2 = __SSAT(fnext2, 16);
+ gnext2 = (q31_t) ((fcurr2 * (*pk)) >> 15U) + fcurr1;
+ gnext2 = __SSAT(gnext2, 16);
/* Process fourth sample for first tap */
- fnext4 = (q31_t) ((fcurnt3 * (*pk)) >> 15U) + fcurnt4;
- fnext4 = __SSAT(fnext4, 16);
- gnext4 = (q31_t) ((fcurnt4 * (*pk++)) >> 15U) + fcurnt3;
- gnext4 = __SSAT(gnext4, 16);
+ fnext3 = (q31_t) ((fcurr2 * (*pk )) >> 15U) + fcurr3;
+ fnext3 = __SSAT(fnext3, 16);
+ gnext3 = (q31_t) ((fcurr3 * (*pk++)) >> 15U) + fcurr2;
+ gnext3 = __SSAT(gnext3, 16);
+
+ /* Copy only last input sample into the state buffer
+ which will be used for next samples processing */
+ *px++ = (q15_t) fcurr3;
/* Update of f values for next coefficient set processing */
- fcurnt1 = fnext1;
- fcurnt2 = fnext2;
- fcurnt3 = fnext3;
- fcurnt4 = fnext4;
-
+ fcurr0 = fnext0;
+ fcurr1 = fnext1;
+ fcurr2 = fnext2;
+ fcurr3 = fnext3;
/* Loop unrolling. Process 4 taps at a time . */
- stageCnt = (numStages - 1U) >> 2;
-
+ stageCnt = (numStages - 1U) >> 2U;
/* Loop over the number of taps. Unroll by a factor of 4.
- ** Repeat until we've computed numStages-3 coefficients. */
+ Repeat until we've computed numStages-3 coefficients. */
/* Process 2nd, 3rd, 4th and 5th taps ... here */
while (stageCnt > 0U)
{
/* Read g1(n-1), g3(n-1) .... from state */
- gcurnt1 = *px;
+ gcurr0 = *px;
/* save g1(n) in state buffer */
- *px++ = (q15_t) gnext4;
+ *px++ = (q15_t) gnext3;
/* Process first sample for 2nd, 6th .. tap */
/* Sample processing for K2, K6.... */
/* f1(n) = f0(n) + K1 * g0(n-1) */
- fnext1 = (q31_t) ((gcurnt1 * (*pk)) >> 15U) + fcurnt1;
- fnext1 = __SSAT(fnext1, 16);
-
+ fnext0 = (q31_t) ((gcurr0 * (*pk)) >> 15U) + fcurr0;
+ fnext0 = __SSAT(fnext0, 16);
/* Process second sample for 2nd, 6th .. tap */
/* for sample 2 processing */
- fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fcurnt2;
- fnext2 = __SSAT(fnext2, 16);
+ fnext1 = (q31_t) ((gnext0 * (*pk)) >> 15U) + fcurr1;
+ fnext1 = __SSAT(fnext1, 16);
+
/* Process third sample for 2nd, 6th .. tap */
- fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fcurnt3;
- fnext3 = __SSAT(fnext3, 16);
+ fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fcurr2;
+ fnext2 = __SSAT(fnext2, 16);
+
/* Process fourth sample for 2nd, 6th .. tap */
- /* fnext4 = fcurnt4 + (*pk) * gnext3; */
- fnext4 = (q31_t) ((gnext3 * (*pk)) >> 15U) + fcurnt4;
- fnext4 = __SSAT(fnext4, 16);
+ fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fcurr3;
+ fnext3 = __SSAT(fnext3, 16);
/* g1(n) = f0(n) * K1 + g0(n-1) */
/* Calculation of state values for next stage */
- gnext4 = (q31_t) ((fcurnt4 * (*pk)) >> 15U) + gnext3;
- gnext4 = __SSAT(gnext4, 16);
- gnext3 = (q31_t) ((fcurnt3 * (*pk)) >> 15U) + gnext2;
+ gnext3 = (q31_t) ((fcurr3 * (*pk)) >> 15U) + gnext2;
gnext3 = __SSAT(gnext3, 16);
- gnext2 = (q31_t) ((fcurnt2 * (*pk)) >> 15U) + gnext1;
+ gnext2 = (q31_t) ((fcurr2 * (*pk)) >> 15U) + gnext1;
gnext2 = __SSAT(gnext2, 16);
- gnext1 = (q31_t) ((fcurnt1 * (*pk++)) >> 15U) + gcurnt1;
+ gnext1 = (q31_t) ((fcurr1 * (*pk)) >> 15U) + gnext0;
gnext1 = __SSAT(gnext1, 16);
+ gnext0 = (q31_t) ((fcurr0 * (*pk++)) >> 15U) + gcurr0;
+ gnext0 = __SSAT(gnext0, 16);
+
/* Read g2(n-1), g4(n-1) .... from state */
- gcurnt1 = *px;
+ gcurr0 = *px;
/* save g1(n) in state buffer */
- *px++ = (q15_t) gnext4;
+ *px++ = (q15_t) gnext3;
/* Sample processing for K3, K7.... */
/* Process first sample for 3rd, 7th .. tap */
/* f3(n) = f2(n) + K3 * g2(n-1) */
- fcurnt1 = (q31_t) ((gcurnt1 * (*pk)) >> 15U) + fnext1;
- fcurnt1 = __SSAT(fcurnt1, 16);
+ fcurr0 = (q31_t) ((gcurr0 * (*pk)) >> 15U) + fnext0;
+ fcurr0 = __SSAT(fcurr0, 16);
/* Process second sample for 3rd, 7th .. tap */
- fcurnt2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fnext2;
- fcurnt2 = __SSAT(fcurnt2, 16);
+ fcurr1 = (q31_t) ((gnext0 * (*pk)) >> 15U) + fnext1;
+ fcurr1 = __SSAT(fcurr1, 16);
/* Process third sample for 3rd, 7th .. tap */
- fcurnt3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fnext3;
- fcurnt3 = __SSAT(fcurnt3, 16);
+ fcurr2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fnext2;
+ fcurr2 = __SSAT(fcurr2, 16);
/* Process fourth sample for 3rd, 7th .. tap */
- fcurnt4 = (q31_t) ((gnext3 * (*pk)) >> 15U) + fnext4;
- fcurnt4 = __SSAT(fcurnt4, 16);
+ fcurr3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fnext3;
+ fcurr3 = __SSAT(fcurr3, 16);
/* Calculation of state values for next stage */
/* g3(n) = f2(n) * K3 + g2(n-1) */
- gnext4 = (q31_t) ((fnext4 * (*pk)) >> 15U) + gnext3;
- gnext4 = __SSAT(gnext4, 16);
-
gnext3 = (q31_t) ((fnext3 * (*pk)) >> 15U) + gnext2;
gnext3 = __SSAT(gnext3, 16);
gnext2 = (q31_t) ((fnext2 * (*pk)) >> 15U) + gnext1;
gnext2 = __SSAT(gnext2, 16);
- gnext1 = (q31_t) ((fnext1 * (*pk++)) >> 15U) + gcurnt1;
+ gnext1 = (q31_t) ((fnext1 * (*pk)) >> 15U) + gnext0;
gnext1 = __SSAT(gnext1, 16);
+ gnext0 = (q31_t) ((fnext0 * (*pk++)) >> 15U) + gcurr0;
+ gnext0 = __SSAT(gnext0, 16);
+
/* Read g1(n-1), g3(n-1) .... from state */
- gcurnt1 = *px;
+ gcurr0 = *px;
/* save g1(n) in state buffer */
- *px++ = (q15_t) gnext4;
+ *px++ = (q15_t) gnext3;
/* Sample processing for K4, K8.... */
/* Process first sample for 4th, 8th .. tap */
/* f4(n) = f3(n) + K4 * g3(n-1) */
- fnext1 = (q31_t) ((gcurnt1 * (*pk)) >> 15U) + fcurnt1;
- fnext1 = __SSAT(fnext1, 16);
+ fnext0 = (q31_t) ((gcurr0 * (*pk)) >> 15U) + fcurr0;
+ fnext0 = __SSAT(fnext0, 16);
/* Process second sample for 4th, 8th .. tap */
/* for sample 2 processing */
- fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fcurnt2;
- fnext2 = __SSAT(fnext2, 16);
+ fnext1 = (q31_t) ((gnext0 * (*pk)) >> 15U) + fcurr1;
+ fnext1 = __SSAT(fnext1, 16);
/* Process third sample for 4th, 8th .. tap */
- fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fcurnt3;
- fnext3 = __SSAT(fnext3, 16);
+ fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fcurr2;
+ fnext2 = __SSAT(fnext2, 16);
/* Process fourth sample for 4th, 8th .. tap */
- fnext4 = (q31_t) ((gnext3 * (*pk)) >> 15U) + fcurnt4;
- fnext4 = __SSAT(fnext4, 16);
+ fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fcurr3;
+ fnext3 = __SSAT(fnext3, 16);
/* g4(n) = f3(n) * K4 + g3(n-1) */
/* Calculation of state values for next stage */
- gnext4 = (q31_t) ((fcurnt4 * (*pk)) >> 15U) + gnext3;
- gnext4 = __SSAT(gnext4, 16);
-
- gnext3 = (q31_t) ((fcurnt3 * (*pk)) >> 15U) + gnext2;
+ gnext3 = (q31_t) ((fcurr3 * (*pk)) >> 15U) + gnext2;
gnext3 = __SSAT(gnext3, 16);
- gnext2 = (q31_t) ((fcurnt2 * (*pk)) >> 15U) + gnext1;
+ gnext2 = (q31_t) ((fcurr2 * (*pk)) >> 15U) + gnext1;
gnext2 = __SSAT(gnext2, 16);
- gnext1 = (q31_t) ((fcurnt1 * (*pk++)) >> 15U) + gcurnt1;
+
+ gnext1 = (q31_t) ((fcurr1 * (*pk)) >> 15U) + gnext0;
gnext1 = __SSAT(gnext1, 16);
+ gnext0 = (q31_t) ((fcurr0 * (*pk++)) >> 15U) + gcurr0;
+ gnext0 = __SSAT(gnext0, 16);
/* Read g2(n-1), g4(n-1) .... from state */
- gcurnt1 = *px;
+ gcurr0 = *px;
/* save g4(n) in state buffer */
- *px++ = (q15_t) gnext4;
+ *px++ = (q15_t) gnext3;
/* Sample processing for K5, K9.... */
/* Process first sample for 5th, 9th .. tap */
/* f5(n) = f4(n) + K5 * g4(n-1) */
- fcurnt1 = (q31_t) ((gcurnt1 * (*pk)) >> 15U) + fnext1;
- fcurnt1 = __SSAT(fcurnt1, 16);
+ fcurr0 = (q31_t) ((gcurr0 * (*pk)) >> 15U) + fnext0;
+ fcurr0 = __SSAT(fcurr0, 16);
/* Process second sample for 5th, 9th .. tap */
- fcurnt2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fnext2;
- fcurnt2 = __SSAT(fcurnt2, 16);
+ fcurr1 = (q31_t) ((gnext0 * (*pk)) >> 15U) + fnext1;
+ fcurr1 = __SSAT(fcurr1, 16);
/* Process third sample for 5th, 9th .. tap */
- fcurnt3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fnext3;
- fcurnt3 = __SSAT(fcurnt3, 16);
+ fcurr2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fnext2;
+ fcurr2 = __SSAT(fcurr2, 16);
/* Process fourth sample for 5th, 9th .. tap */
- fcurnt4 = (q31_t) ((gnext3 * (*pk)) >> 15U) + fnext4;
- fcurnt4 = __SSAT(fcurnt4, 16);
+ fcurr3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fnext3;
+ fcurr3 = __SSAT(fcurr3, 16);
/* Calculation of state values for next stage */
/* g5(n) = f4(n) * K5 + g4(n-1) */
- gnext4 = (q31_t) ((fnext4 * (*pk)) >> 15U) + gnext3;
- gnext4 = __SSAT(gnext4, 16);
gnext3 = (q31_t) ((fnext3 * (*pk)) >> 15U) + gnext2;
gnext3 = __SSAT(gnext3, 16);
+
gnext2 = (q31_t) ((fnext2 * (*pk)) >> 15U) + gnext1;
gnext2 = __SSAT(gnext2, 16);
- gnext1 = (q31_t) ((fnext1 * (*pk++)) >> 15U) + gcurnt1;
+
+ gnext1 = (q31_t) ((fnext1 * (*pk)) >> 15U) + gnext0;
gnext1 = __SSAT(gnext1, 16);
+ gnext0 = (q31_t) ((fnext0 * (*pk++)) >> 15U) + gcurr0;
+ gnext0 = __SSAT(gnext0, 16);
+
stageCnt--;
}
@@ -308,95 +306,98 @@ void arm_fir_lattice_q15(
while (stageCnt > 0U)
{
- gcurnt1 = *px;
+ gcurr0 = *px;
/* save g value in state buffer */
- *px++ = (q15_t) gnext4;
+ *px++ = (q15_t) gnext3;
/* Process four samples for last three taps here */
- fnext1 = (q31_t) ((gcurnt1 * (*pk)) >> 15U) + fcurnt1;
+ fnext0 = (q31_t) ((gcurr0 * (*pk)) >> 15U) + fcurr0;
+ fnext0 = __SSAT(fnext0, 16);
+
+ fnext1 = (q31_t) ((gnext0 * (*pk)) >> 15U) + fcurr1;
fnext1 = __SSAT(fnext1, 16);
- fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fcurnt2;
+
+ fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fcurr2;
fnext2 = __SSAT(fnext2, 16);
- fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fcurnt3;
+ fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fcurr3;
fnext3 = __SSAT(fnext3, 16);
- fnext4 = (q31_t) ((gnext3 * (*pk)) >> 15U) + fcurnt4;
- fnext4 = __SSAT(fnext4, 16);
-
/* g1(n) = f0(n) * K1 + g0(n-1) */
- gnext4 = (q31_t) ((fcurnt4 * (*pk)) >> 15U) + gnext3;
- gnext4 = __SSAT(gnext4, 16);
- gnext3 = (q31_t) ((fcurnt3 * (*pk)) >> 15U) + gnext2;
+ gnext3 = (q31_t) ((fcurr3 * (*pk)) >> 15U) + gnext2;
gnext3 = __SSAT(gnext3, 16);
- gnext2 = (q31_t) ((fcurnt2 * (*pk)) >> 15U) + gnext1;
+
+ gnext2 = (q31_t) ((fcurr2 * (*pk)) >> 15U) + gnext1;
gnext2 = __SSAT(gnext2, 16);
- gnext1 = (q31_t) ((fcurnt1 * (*pk++)) >> 15U) + gcurnt1;
+
+ gnext1 = (q31_t) ((fcurr1 * (*pk)) >> 15U) + gnext0;
gnext1 = __SSAT(gnext1, 16);
+ gnext0 = (q31_t) ((fcurr0 * (*pk++)) >> 15U) + gcurr0;
+ gnext0 = __SSAT(gnext0, 16);
+
/* Update of f values for next coefficient set processing */
- fcurnt1 = fnext1;
- fcurnt2 = fnext2;
- fcurnt3 = fnext3;
- fcurnt4 = fnext4;
+ fcurr0 = fnext0;
+ fcurr1 = fnext1;
+ fcurr2 = fnext2;
+ fcurr3 = fnext3;
stageCnt--;
-
}
/* The results in the 4 accumulators, store in the destination buffer. */
/* y(n) = fN(n) */
#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pDst)++ = __PKHBT(fcurnt1, fcurnt2, 16);
- *__SIMD32(pDst)++ = __PKHBT(fcurnt3, fcurnt4, 16);
-
+ write_q15x2_ia (&pDst, __PKHBT(fcurr0, fcurr1, 16));
+ write_q15x2_ia (&pDst, __PKHBT(fcurr2, fcurr3, 16));
#else
-
- *__SIMD32(pDst)++ = __PKHBT(fcurnt2, fcurnt1, 16);
- *__SIMD32(pDst)++ = __PKHBT(fcurnt4, fcurnt3, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ write_q15x2_ia (&pDst, __PKHBT(fcurr1, fcurr0, 16));
+ write_q15x2_ia (&pDst, __PKHBT(fcurr3, fcurr2, 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* f0(n) = x(n) */
- fcurnt1 = *pSrc++;
-
- /* Initialize coeff pointer */
- pk = (pCoeffs);
+ fcurr0 = *pSrc++;
/* Initialize state pointer */
px = pState;
+ /* Initialize coeff pointer */
+ pk = pCoeffs;
+
/* read g2(n) from state buffer */
- gcurnt1 = *px;
+ gcurr0 = *px;
/* for sample 1 processing */
/* f1(n) = f0(n) + K1 * g0(n-1) */
- fnext1 = (((q31_t) gcurnt1 * (*pk)) >> 15U) + fcurnt1;
- fnext1 = __SSAT(fnext1, 16);
-
+ fnext0 = (((q31_t) gcurr0 * (*pk)) >> 15U) + fcurr0;
+ fnext0 = __SSAT(fnext0, 16);
/* g1(n) = f0(n) * K1 + g0(n-1) */
- gnext1 = (((q31_t) fcurnt1 * (*pk++)) >> 15U) + gcurnt1;
- gnext1 = __SSAT(gnext1, 16);
+ gnext0 = (((q31_t) fcurr0 * (*pk++)) >> 15U) + gcurr0;
+ gnext0 = __SSAT(gnext0, 16);
/* save g1(n) in state buffer */
- *px++ = (q15_t) fcurnt1;
+ *px++ = (q15_t) fcurr0;
- /* f1(n) is saved in fcurnt1
- for next stage processing */
- fcurnt1 = fnext1;
+ /* f1(n) is saved in fcurr0 for next stage processing */
+ fcurr0 = fnext0;
stageCnt = (numStages - 1U);
@@ -404,79 +405,65 @@ void arm_fir_lattice_q15(
while (stageCnt > 0U)
{
/* read g2(n) from state buffer */
- gcurnt1 = *px;
+ gcurr0 = *px;
/* save g1(n) in state buffer */
- *px++ = (q15_t) gnext1;
+ *px++ = (q15_t) gnext0;
/* Sample processing for K2, K3.... */
/* f2(n) = f1(n) + K2 * g1(n-1) */
- fnext1 = (((q31_t) gcurnt1 * (*pk)) >> 15U) + fcurnt1;
- fnext1 = __SSAT(fnext1, 16);
+ fnext0 = (((q31_t) gcurr0 * (*pk)) >> 15U) + fcurr0;
+ fnext0 = __SSAT(fnext0, 16);
/* g2(n) = f1(n) * K2 + g1(n-1) */
- gnext1 = (((q31_t) fcurnt1 * (*pk++)) >> 15U) + gcurnt1;
- gnext1 = __SSAT(gnext1, 16);
+ gnext0 = (((q31_t) fcurr0 * (*pk++)) >> 15U) + gcurr0;
+ gnext0 = __SSAT(gnext0, 16);
-
- /* f1(n) is saved in fcurnt1
- for next stage processing */
- fcurnt1 = fnext1;
+ /* f1(n) is saved in fcurr0 for next stage processing */
+ fcurr0 = fnext0;
stageCnt--;
-
}
/* y(n) = fN(n) */
- *pDst++ = __SSAT(fcurnt1, 16);
-
+ *pDst++ = __SSAT(fcurr0, 16);
blkCnt--;
-
}
#else
-
- /* Run the below code for Cortex-M0 */
-
- q31_t fcurnt, fnext, gcurnt, gnext; /* temporary variables */
- uint32_t numStages = S->numStages; /* Length of the filter */
- uint32_t blkCnt, stageCnt; /* temporary variables for counts */
-
- pState = &S->pState[0];
+/* alternate version for CM0_FAMILY */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* f0(n) = x(n) */
- fcurnt = *pSrc++;
-
- /* Initialize coeff pointer */
- pk = (pCoeffs);
+ fcurr0 = *pSrc++;
/* Initialize state pointer */
px = pState;
+ /* Initialize coeff pointer */
+ pk = pCoeffs;
+
/* read g0(n-1) from state buffer */
- gcurnt = *px;
+ gcurr0 = *px;
/* for sample 1 processing */
/* f1(n) = f0(n) + K1 * g0(n-1) */
- fnext = ((gcurnt * (*pk)) >> 15U) + fcurnt;
- fnext = __SSAT(fnext, 16);
-
+ fnext0 = ((gcurr0 * (*pk)) >> 15U) + fcurr0;
+ fnext0 = __SSAT(fnext, 16);
/* g1(n) = f0(n) * K1 + g0(n-1) */
- gnext = ((fcurnt * (*pk++)) >> 15U) + gcurnt;
- gnext = __SSAT(gnext, 16);
+ gnext0 = ((fcurr0 * (*pk++)) >> 15U) + gcurr0;
+ gnext0 = __SSAT(gnext0, 16);
/* save f0(n) in state buffer */
- *px++ = (q15_t) fcurnt;
+ *px++ = (q15_t) fcurr0;
- /* f1(n) is saved in fcurnt
- for next stage processing */
- fcurnt = fnext;
+ /* f1(n) is saved in fcurr for next stage processing */
+ fcurr0 = fnext0;
stageCnt = (numStages - 1U);
@@ -484,41 +471,36 @@ void arm_fir_lattice_q15(
while (stageCnt > 0U)
{
/* read g1(n-1) from state buffer */
- gcurnt = *px;
+ gcurr0 = *px;
/* save g0(n-1) in state buffer */
- *px++ = (q15_t) gnext;
+ *px++ = (q15_t) gnext0;
/* Sample processing for K2, K3.... */
/* f2(n) = f1(n) + K2 * g1(n-1) */
- fnext = ((gcurnt * (*pk)) >> 15U) + fcurnt;
- fnext = __SSAT(fnext, 16);
+ fnext0 = ((gcurr0 * (*pk)) >> 15U) + fcurr0;
+ fnext0 = __SSAT(fnext0, 16);
/* g2(n) = f1(n) * K2 + g1(n-1) */
- gnext = ((fcurnt * (*pk++)) >> 15U) + gcurnt;
- gnext = __SSAT(gnext, 16);
+ gnext0 = ((fcurr0 * (*pk++)) >> 15U) + gcurr0;
+ gnext0 = __SSAT(gnext0, 16);
-
- /* f1(n) is saved in fcurnt
- for next stage processing */
- fcurnt = fnext;
+ /* f1(n) is saved in fcurr0 for next stage processing */
+ fcurr0 = fnext0;
stageCnt--;
-
}
/* y(n) = fN(n) */
- *pDst++ = __SSAT(fcurnt, 16);
-
+ *pDst++ = __SSAT(fcurr0, 16);
blkCnt--;
-
}
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
}
/**
- * @} end of FIR_Lattice group
+ @} end of FIR_Lattice group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_lattice_q31.c b/DSP/Source/FilteringFunctions/arm_fir_lattice_q31.c
index 8acfd34..c8d28d7 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_lattice_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_lattice_q31.c
@@ -3,13 +3,13 @@
* Title: arm_fir_lattice_q31.c
* Description: Q31 FIR lattice filter processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,313 +29,477 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_Lattice
- * @{
+ @addtogroup FIR_Lattice
+ @{
*/
-
/**
- * @brief Processing function for the Q31 FIR lattice filter.
- * @param[in] *S points to an instance of the Q31 FIR lattice structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data
- * @param[in] blockSize number of samples to process.
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- * In order to avoid overflows the input signal must be scaled down by 2*log2(numStages) bits.
+ @brief Processing function for the Q31 FIR lattice filter.
+ @param[in] S points to an instance of the Q31 FIR lattice structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ In order to avoid overflows the input signal must be scaled down by 2*log2(numStages) bits.
*/
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
void arm_fir_lattice_q31(
const arm_fir_lattice_instance_q31 * S,
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
{
- q31_t *pState; /* State pointer */
- q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q31_t *px; /* temporary state pointer */
- q31_t *pk; /* temporary coefficient pointer */
- q31_t fcurr1, fnext1, gcurr1 = 0, gnext1; /* temporary variables for first sample in loop unrolling */
- q31_t fcurr2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */
- uint32_t numStages = S->numStages; /* Length of the filter */
- uint32_t blkCnt, stageCnt; /* temporary variables for counts */
- q31_t k;
+ q31_t *pState = S->pState; /* State pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *px; /* Temporary state pointer */
+ const q31_t *pk; /* Temporary coefficient pointer */
+ uint32_t numStages = S->numStages; /* Number of stages in the filter */
+ uint32_t blkCnt, stageCnt; /* Loop counters */
+ q31_t fcurr0, fnext0, gnext0, gcurr0; /* Temporary variables */
- pState = &S->pState[0];
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
- blkCnt = blockSize >> 1U;
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t fcurr1, fnext1, gnext1; /* Temporary variables for second sample in loop unrolling */
+ q31_t fcurr2, fnext2, gnext2; /* Temporary variables for third sample in loop unrolling */
+ q31_t fcurr3, fnext3, gnext3; /* Temporary variables for fourth sample in loop unrolling */
+#endif
+
+ gcurr0 = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 2 outputs at a time.
- a second loop below computes the remaining 1 sample. */
while (blkCnt > 0U)
{
+ /* Read two samples from input buffer */
/* f0(n) = x(n) */
+ fcurr0 = *pSrc++;
fcurr1 = *pSrc++;
- /* f0(n) = x(n) */
- fcurr2 = *pSrc++;
-
- /* Initialize coeff pointer */
- pk = (pCoeffs);
-
/* Initialize state pointer */
px = pState;
- /* read g0(n - 1) from state buffer */
- gcurr1 = *px;
+ /* Initialize coeff pointer */
+ pk = pCoeffs;
- /* Read the reflection coefficient */
- k = *pk++;
+ /* Read g0(n-1) from state buffer */
+ gcurr0 = *px;
- /* for sample 1 processing */
+ /* Process first sample for first tap */
/* f1(n) = f0(n) + K1 * g0(n-1) */
- fnext1 = (q31_t) (((q63_t) gcurr1 * k) >> 32);
+ fnext0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+ fnext0 = (fnext0 << 1U) + fcurr0;
/* g1(n) = f0(n) * K1 + g0(n-1) */
- gnext1 = (q31_t) (((q63_t) fcurr1 * (k)) >> 32);
- fnext1 = fcurr1 + (fnext1 << 1U);
- gnext1 = gcurr1 + (gnext1 << 1U);
+ gnext0 = (q31_t) (((q63_t) fcurr0 * (*pk)) >> 32U);
+ gnext0 = (gnext0 << 1U) + gcurr0;
- /* for sample 1 processing */
- /* f1(n) = f0(n) + K1 * g0(n-1) */
- fnext2 = (q31_t) (((q63_t) fcurr1 * k) >> 32);
+ /* Process second sample for first tap */
+ fnext1 = (q31_t) (((q63_t) fcurr0 * (*pk)) >> 32U);
+ fnext1 = (fnext1 << 1U) + fcurr1;
+ gnext1 = (q31_t) (((q63_t) fcurr1 * (*pk)) >> 32U);
+ gnext1 = (gnext1 << 1U) + fcurr0;
- /* g1(n) = f0(n) * K1 + g0(n-1) */
- gnext2 = (q31_t) (((q63_t) fcurr2 * (k)) >> 32);
- fnext2 = fcurr2 + (fnext2 << 1U);
- gnext2 = fcurr1 + (gnext2 << 1U);
+ /* Read next two samples from input buffer */
+ /* f0(n+2) = x(n+2) */
+ fcurr2 = *pSrc++;
+ fcurr3 = *pSrc++;
- /* save g1(n) in state buffer */
- *px++ = fcurr2;
+ /* Process third sample for first tap */
+ fnext2 = (q31_t) (((q63_t) fcurr1 * (*pk)) >> 32U);
+ fnext2 = (fnext2 << 1U) + fcurr2;
+ gnext2 = (q31_t) (((q63_t) fcurr2 * (*pk)) >> 32U);
+ gnext2 = (gnext2 << 1U) + fcurr1;
- /* f1(n) is saved in fcurr1
- for next stage processing */
+ /* Process fourth sample for first tap */
+ fnext3 = (q31_t) (((q63_t) fcurr2 * (*pk )) >> 32U);
+ fnext3 = (fnext3 << 1U) + fcurr3;
+ gnext3 = (q31_t) (((q63_t) fcurr3 * (*pk++)) >> 32U);
+ gnext3 = (gnext3 << 1U) + fcurr2;
+
+ /* Copy only last input sample into the state buffer
+ which will be used for next samples processing */
+ *px++ = fcurr3;
+
+ /* Update of f values for next coefficient set processing */
+ fcurr0 = fnext0;
fcurr1 = fnext1;
fcurr2 = fnext2;
+ fcurr3 = fnext3;
- stageCnt = (numStages - 1U);
+ /* Loop unrolling. Process 4 taps at a time . */
+ stageCnt = (numStages - 1U) >> 2U;
- /* stage loop */
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ Repeat until we've computed numStages-3 coefficients. */
+
+ /* Process 2nd, 3rd, 4th and 5th taps ... here */
while (stageCnt > 0U)
{
-
- /* Read the reflection coefficient */
- k = *pk++;
-
- /* read g2(n) from state buffer */
- gcurr1 = *px;
+ /* Read g1(n-1), g3(n-1) .... from state */
+ gcurr0 = *px;
/* save g1(n) in state buffer */
- *px++ = gnext2;
+ *px++ = gnext3;
- /* Sample processing for K2, K3.... */
- /* f2(n) = f1(n) + K2 * g1(n-1) */
- fnext1 = (q31_t) (((q63_t) gcurr1 * k) >> 32);
- fnext2 = (q31_t) (((q63_t) gnext1 * k) >> 32);
+ /* Process first sample for 2nd, 6th .. tap */
+ /* Sample processing for K2, K6.... */
+ /* f1(n) = f0(n) + K1 * g0(n-1) */
+ fnext0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+ fnext0 = (fnext0 << 1U) + fcurr0;
- fnext1 = fcurr1 + (fnext1 << 1U);
- fnext2 = fcurr2 + (fnext2 << 1U);
+ /* Process second sample for 2nd, 6th .. tap */
+ /* for sample 2 processing */
+ fnext1 = (q31_t) (((q63_t) gnext0 * (*pk)) >> 32U);
+ fnext1 = (fnext1 << 1U) + fcurr1;
- /* g2(n) = f1(n) * K2 + g1(n-1) */
- gnext2 = (q31_t) (((q63_t) fcurr2 * (k)) >> 32);
- gnext2 = gnext1 + (gnext2 << 1U);
+ /* Process third sample for 2nd, 6th .. tap */
+ fnext2 = (q31_t) (((q63_t) gnext1 * (*pk)) >> 32U);
+ fnext2 = (fnext2 << 1U) + fcurr2;
- /* g2(n) = f1(n) * K2 + g1(n-1) */
- gnext1 = (q31_t) (((q63_t) fcurr1 * (k)) >> 32);
- gnext1 = gcurr1 + (gnext1 << 1U);
+ /* Process fourth sample for 2nd, 6th .. tap */
+ fnext3 = (q31_t) (((q63_t) gnext2 * (*pk)) >> 32U);
+ fnext3 = (fnext3 << 1U) + fcurr3;
- /* f1(n) is saved in fcurr1
- for next stage processing */
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ /* Calculation of state values for next stage */
+ gnext3 = (q31_t) (((q63_t) fcurr3 * (*pk)) >> 32U);
+ gnext3 = (gnext3 << 1U) + gnext2;
+
+ gnext2 = (q31_t) (((q63_t) fcurr2 * (*pk)) >> 32U);
+ gnext2 = (gnext2 << 1U) + gnext1;
+
+ gnext1 = (q31_t) (((q63_t) fcurr1 * (*pk)) >> 32U);
+ gnext1 = (gnext1 << 1U) + gnext0;
+
+ gnext0 = (q31_t) (((q63_t) fcurr0 * (*pk++)) >> 32U);
+ gnext0 = (gnext0 << 1U) + gcurr0;
+
+
+ /* Read g2(n-1), g4(n-1) .... from state */
+ gcurr0 = *px;
+
+ /* save g1(n) in state buffer */
+ *px++ = gnext3;
+
+ /* Sample processing for K3, K7.... */
+ /* Process first sample for 3rd, 7th .. tap */
+ /* f3(n) = f2(n) + K3 * g2(n-1) */
+ fcurr0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+ fcurr0 = (fcurr0 << 1U) + fnext0;
+
+ /* Process second sample for 3rd, 7th .. tap */
+ fcurr1 = (q31_t) (((q63_t) gnext0 * (*pk)) >> 32U);
+ fcurr1 = (fcurr1 << 1U) + fnext1;
+
+ /* Process third sample for 3rd, 7th .. tap */
+ fcurr2 = (q31_t) (((q63_t) gnext1 * (*pk)) >> 32U);
+ fcurr2 = (fcurr2 << 1U) + fnext2;
+
+ /* Process fourth sample for 3rd, 7th .. tap */
+ fcurr3 = (q31_t) (((q63_t) gnext2 * (*pk)) >> 32U);
+ fcurr3 = (fcurr3 << 1U) + fnext3;
+
+ /* Calculation of state values for next stage */
+ /* g3(n) = f2(n) * K3 + g2(n-1) */
+ gnext3 = (q31_t) (((q63_t) fnext3 * (*pk)) >> 32U);
+ gnext3 = (gnext3 << 1U) + gnext2;
+
+ gnext2 = (q31_t) (((q63_t) fnext2 * (*pk)) >> 32U);
+ gnext2 = (gnext2 << 1U) + gnext1;
+
+ gnext1 = (q31_t) (((q63_t) fnext1 * (*pk)) >> 32U);
+ gnext1 = (gnext1 << 1U) + gnext0;
+
+ gnext0 = (q31_t) (((q63_t) fnext0 * (*pk++)) >> 32U);
+ gnext0 = (gnext0 << 1U) + gcurr0;
+
+ /* Read g1(n-1), g3(n-1) .... from state */
+ gcurr0 = *px;
+
+ /* save g1(n) in state buffer */
+ *px++ = gnext3;
+
+ /* Sample processing for K4, K8.... */
+ /* Process first sample for 4th, 8th .. tap */
+ /* f4(n) = f3(n) + K4 * g3(n-1) */
+ fnext0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+ fnext0 = (fnext0 << 1U) + fcurr0;
+
+ /* Process second sample for 4th, 8th .. tap */
+ /* for sample 2 processing */
+ fnext1 = (q31_t) (((q63_t) gnext0 * (*pk)) >> 32U);
+ fnext1 = (fnext1 << 1U) + fcurr1;
+
+ /* Process third sample for 4th, 8th .. tap */
+ fnext2 = (q31_t) (((q63_t) gnext1 * (*pk)) >> 32U);
+ fnext2 = (fnext2 << 1U) + fcurr2;
+
+ /* Process fourth sample for 4th, 8th .. tap */
+ fnext3 = (q31_t) (((q63_t) gnext2 * (*pk)) >> 32U);
+ fnext3 = (fnext3 << 1U) + fcurr3;
+
+ /* g4(n) = f3(n) * K4 + g3(n-1) */
+ /* Calculation of state values for next stage */
+ gnext3 = (q31_t) (((q63_t) fcurr3 * (*pk)) >> 32U);
+ gnext3 = (gnext3 << 1U) + gnext2;
+
+ gnext2 = (q31_t) (((q63_t) fcurr2 * (*pk)) >> 32U);
+ gnext2 = (gnext2 << 1U) + gnext1;
+
+ gnext1 = (q31_t) (((q63_t) fcurr1 * (*pk)) >> 32U);
+ gnext1 = (gnext1 << 1U) + gnext0;
+
+ gnext0 = (q31_t) (((q63_t) fcurr0 * (*pk++)) >> 32U);
+ gnext0 = (gnext0 << 1U) + gcurr0;
+
+ /* Read g2(n-1), g4(n-1) .... from state */
+ gcurr0 = *px;
+
+ /* save g4(n) in state buffer */
+ *px++ = gnext3;
+
+ /* Sample processing for K5, K9.... */
+ /* Process first sample for 5th, 9th .. tap */
+ /* f5(n) = f4(n) + K5 * g4(n-1) */
+ fcurr0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+ fcurr0 = (fcurr0 << 1U) + fnext0;
+
+ /* Process second sample for 5th, 9th .. tap */
+ fcurr1 = (q31_t) (((q63_t) gnext0 * (*pk)) >> 32U);
+ fcurr1 = (fcurr1 << 1U) + fnext1;
+
+ /* Process third sample for 5th, 9th .. tap */
+ fcurr2 = (q31_t) (((q63_t) gnext1 * (*pk)) >> 32U);
+ fcurr2 = (fcurr2 << 1U) + fnext2;
+
+ /* Process fourth sample for 5th, 9th .. tap */
+ fcurr3 = (q31_t) (((q63_t) gnext2 * (*pk)) >> 32U);
+ fcurr3 = (fcurr3 << 1U) + fnext3;
+
+ /* Calculation of state values for next stage */
+ /* g5(n) = f4(n) * K5 + g4(n-1) */
+ gnext3 = (q31_t) (((q63_t) fnext3 * (*pk)) >> 32U);
+ gnext3 = (gnext3 << 1U) + gnext2;
+
+ gnext2 = (q31_t) (((q63_t) fnext2 * (*pk)) >> 32U);
+ gnext2 = (gnext2 << 1U) + gnext1;
+
+ gnext1 = (q31_t) (((q63_t) fnext1 * (*pk)) >> 32U);
+ gnext1 = (gnext1 << 1U) + gnext0;
+
+ gnext0 = (q31_t) (((q63_t) fnext0 * (*pk++)) >> 32U);
+ gnext0 = (gnext0 << 1U) + gcurr0;
+
+ stageCnt--;
+ }
+
+ /* If the (filter length -1) is not a multiple of 4, compute the remaining filter taps */
+ stageCnt = (numStages - 1U) % 0x4U;
+
+ while (stageCnt > 0U)
+ {
+ gcurr0 = *px;
+
+ /* save g value in state buffer */
+ *px++ = gnext3;
+
+ /* Process four samples for last three taps here */
+ fnext0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+ fnext0 = (fnext0 << 1U) + fcurr0;
+
+ fnext1 = (q31_t) (((q63_t) gnext0 * (*pk)) >> 32U);
+ fnext1 = (fnext1 << 1U) + fcurr1;
+
+ fnext2 = (q31_t) (((q63_t) gnext1 * (*pk)) >> 32U);
+ fnext2 = (fnext2 << 1U) + fcurr2;
+
+ fnext3 = (q31_t) (((q63_t) gnext2 * (*pk)) >> 32U);
+ fnext3 = (fnext3 << 1U) + fcurr3;
+
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext3 = (q31_t) (((q63_t) fcurr3 * (*pk)) >> 32U);
+ gnext3 = (gnext3 << 1U) + gnext2;
+
+ gnext2 = (q31_t) (((q63_t) fcurr2 * (*pk)) >> 32U);
+ gnext2 = (gnext2 << 1U) + gnext1;
+
+ gnext1 = (q31_t) (((q63_t) fcurr1 * (*pk)) >> 32U);
+ gnext1 = (gnext1 << 1U) + gnext0;
+
+ gnext0 = (q31_t) (((q63_t) fcurr0 * (*pk++)) >> 32U);
+ gnext0 = (gnext0 << 1U) + gcurr0;
+
+ /* Update of f values for next coefficient set processing */
+ fcurr0 = fnext0;
fcurr1 = fnext1;
fcurr2 = fnext2;
+ fcurr3 = fnext3;
stageCnt--;
-
}
+ /* The results in the 4 accumulators, store in the destination buffer. */
/* y(n) = fN(n) */
+ *pDst++ = fcurr0;
*pDst++ = fcurr1;
*pDst++ = fcurr2;
+ *pDst++ = fcurr3;
blkCnt--;
-
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x2U;
-
- while (blkCnt > 0U)
- {
- /* f0(n) = x(n) */
- fcurr1 = *pSrc++;
-
- /* Initialize coeff pointer */
- pk = (pCoeffs);
-
- /* Initialize state pointer */
- px = pState;
-
- /* read g0(n - 1) from state buffer */
- gcurr1 = *px;
-
- /* Read the reflection coefficient */
- k = *pk++;
-
- /* for sample 1 processing */
- /* f1(n) = f0(n) + K1 * g0(n-1) */
- fnext1 = (q31_t) (((q63_t) gcurr1 * k) >> 32);
- fnext1 = fcurr1 + (fnext1 << 1U);
-
- /* g1(n) = f0(n) * K1 + g0(n-1) */
- gnext1 = (q31_t) (((q63_t) fcurr1 * (k)) >> 32);
- gnext1 = gcurr1 + (gnext1 << 1U);
-
- /* save g1(n) in state buffer */
- *px++ = fcurr1;
-
- /* f1(n) is saved in fcurr1
- for next stage processing */
- fcurr1 = fnext1;
-
- stageCnt = (numStages - 1U);
-
- /* stage loop */
- while (stageCnt > 0U)
- {
- /* Read the reflection coefficient */
- k = *pk++;
-
- /* read g2(n) from state buffer */
- gcurr1 = *px;
-
- /* save g1(n) in state buffer */
- *px++ = gnext1;
-
- /* Sample processing for K2, K3.... */
- /* f2(n) = f1(n) + K2 * g1(n-1) */
- fnext1 = (q31_t) (((q63_t) gcurr1 * k) >> 32);
- fnext1 = fcurr1 + (fnext1 << 1U);
-
- /* g2(n) = f1(n) * K2 + g1(n-1) */
- gnext1 = (q31_t) (((q63_t) fcurr1 * (k)) >> 32);
- gnext1 = gcurr1 + (gnext1 << 1U);
-
- /* f1(n) is saved in fcurr1
- for next stage processing */
- fcurr1 = fnext1;
-
- stageCnt--;
-
- }
-
-
- /* y(n) = fN(n) */
- *pDst++ = fcurr1;
-
- blkCnt--;
-
- }
-
-
-}
-
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
#else
-/* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
-void arm_fir_lattice_q31(
- const arm_fir_lattice_instance_q31 * S,
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
-{
- q31_t *pState; /* State pointer */
- q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q31_t *px; /* temporary state pointer */
- q31_t *pk; /* temporary coefficient pointer */
- q31_t fcurr, fnext, gcurr, gnext; /* temporary variables */
- uint32_t numStages = S->numStages; /* Length of the filter */
- uint32_t blkCnt, stageCnt; /* temporary variables for counts */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
- pState = &S->pState[0];
+ while (blkCnt > 0U)
+ {
+ /* f0(n) = x(n) */
+ fcurr0 = *pSrc++;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pk = pCoeffs;
+
+ /* read g2(n) from state buffer */
+ gcurr0 = *px;
+
+ /* for sample 1 processing */
+ /* f1(n) = f0(n) + K1 * g0(n-1) */
+ fnext0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+ fnext0 = (fnext0 << 1U) + fcurr0;
+
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext0 = (q31_t) (((q63_t) fcurr0 * (*pk++)) >> 32U);
+ gnext0 = (gnext0 << 1U) + gcurr0;
+
+ /* save g1(n) in state buffer */
+ *px++ = fcurr0;
+
+ /* f1(n) is saved in fcurr0 for next stage processing */
+ fcurr0 = fnext0;
+
+ stageCnt = (numStages - 1U);
+
+ /* stage loop */
+ while (stageCnt > 0U)
+ {
+ /* read g2(n) from state buffer */
+ gcurr0 = *px;
+
+ /* save g1(n) in state buffer */
+ *px++ = gnext0;
+
+ /* Sample processing for K2, K3.... */
+ /* f2(n) = f1(n) + K2 * g1(n-1) */
+ fnext0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+ fnext0 = (fnext0 << 1U) + fcurr0;
+
+ /* g2(n) = f1(n) * K2 + g1(n-1) */
+ gnext0 = (q31_t) (((q63_t) fcurr0 * (*pk++)) >> 32U);
+ gnext0 = (gnext0 << 1U) + gcurr0;
+
+ /* f1(n) is saved in fcurr0 for next stage processing */
+ fcurr0 = fnext0;
+
+ stageCnt--;
+ }
+
+ /* y(n) = fN(n) */
+ *pDst++ = fcurr0;
+
+ blkCnt--;
+ }
+
+#else
+/* alternate version for CM0_FAMILY */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* f0(n) = x(n) */
- fcurr = *pSrc++;
-
- /* Initialize coeff pointer */
- pk = (pCoeffs);
+ fcurr0 = *pSrc++;
/* Initialize state pointer */
px = pState;
+ /* Initialize coeff pointer */
+ pk = pCoeffs;
+
/* read g0(n-1) from state buffer */
- gcurr = *px;
+ gcurr0 = *px;
/* for sample 1 processing */
/* f1(n) = f0(n) + K1 * g0(n-1) */
- fnext = (q31_t) (((q63_t) gcurr * (*pk)) >> 31) + fcurr;
- /* g1(n) = f0(n) * K1 + g0(n-1) */
- gnext = (q31_t) (((q63_t) fcurr * (*pk++)) >> 31) + gcurr;
- /* save g1(n) in state buffer */
- *px++ = fcurr;
+ fnext0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+ fnext0 = (fnext << 1U) + fcurr0;
- /* f1(n) is saved in fcurr1
- for next stage processing */
- fcurr = fnext;
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext0 = (q31_t) (((q63_t) fcurr0 * (*pk++)) >> 32U);
+ gnext0 = (gnext0 << 1U) + gcurr0;
+
+ /* save f0(n) in state buffer */
+ *px++ = fcurr0;
+
+ /* f1(n) is saved in fcurr for next stage processing */
+ fcurr0 = fnext0;
stageCnt = (numStages - 1U);
/* stage loop */
while (stageCnt > 0U)
{
- /* read g2(n) from state buffer */
- gcurr = *px;
+ /* read g1(n-1) from state buffer */
+ gcurr0 = *px;
- /* save g1(n) in state buffer */
- *px++ = gnext;
+ /* save g0(n-1) in state buffer */
+ *px++ = gnext0;
/* Sample processing for K2, K3.... */
/* f2(n) = f1(n) + K2 * g1(n-1) */
- fnext = (q31_t) (((q63_t) gcurr * (*pk)) >> 31) + fcurr;
- /* g2(n) = f1(n) * K2 + g1(n-1) */
- gnext = (q31_t) (((q63_t) fcurr * (*pk++)) >> 31) + gcurr;
+ fnext0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+ fnext0 = (fnext0 << 1U) + fcurr0;
- /* f1(n) is saved in fcurr1
- for next stage processing */
- fcurr = fnext;
+ /* g2(n) = f1(n) * K2 + g1(n-1) */
+ gnext0 = (q31_t) (((q63_t) fcurr0 * (*pk++)) >> 32U);
+ gnext0 = (gnext0 << 1U) + gcurr0;
+
+ /* f1(n) is saved in fcurr0 for next stage processing */
+ fcurr0 = fnext0;
stageCnt--;
-
}
/* y(n) = fN(n) */
- *pDst++ = fcurr;
+ *pDst++ = fcurr0;
blkCnt--;
-
}
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
-
/**
- * @} end of FIR_Lattice group
+ @} end of FIR_Lattice group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_q15.c b/DSP/Source/FilteringFunctions/arm_fir_q15.c
index e970a10..e20798e 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_q15.c
@@ -3,13 +3,13 @@
* Title: arm_fir_q15.c
* Description: Q15 FIR filter processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,334 +29,78 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR
- * @{
+ @addtogroup FIR
+ @{
*/
/**
- * @brief Processing function for the Q15 FIR filter.
- * @param[in] *S points to an instance of the Q15 FIR structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in] blockSize number of samples to process per call.
- * @return none.
- *
- *
- * \par Restrictions
- * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
- * In this case input, output, state buffers should be aligned by 32-bit
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
- * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
- * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
- * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
- * Lastly, the accumulator is saturated to yield a result in 1.15 format.
- *
- * \par
- * Refer to the function arm_fir_fast_q15() for a faster but less precise implementation of this function.
+ @brief Processing function for the Q15 FIR filter.
+ @param[in] S points to an instance of the Q15 FIR filter structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+ Lastly, the accumulator is saturated to yield a result in 1.15 format.
+
+ @remark
+ Refer to \ref arm_fir_fast_q15() for a faster but less precise implementation of this function.
*/
-#if defined (ARM_MATH_DSP)
-
-/* Run the below code for Cortex-M4 and Cortex-M3 */
-
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
-
void arm_fir_q15(
const arm_fir_instance_q15 * S,
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t blockSize)
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
{
- q15_t *pState = S->pState; /* State pointer */
- q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q15_t *pStateCurnt; /* Points to the current sample of the state */
- q15_t *px1; /* Temporary q15 pointer for state buffer */
- q15_t *pb; /* Temporary pointer for coefficient buffer */
- q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold SIMD state and coefficient values */
- q63_t acc0, acc1, acc2, acc3; /* Accumulators */
- uint32_t numTaps = S->numTaps; /* Number of taps in the filter */
- uint32_t tapCnt, blkCnt; /* Loop counters */
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q15_t *px; /* Temporary pointer for state buffer */
+ const q15_t *pb; /* Temporary pointer for coefficient buffer */
+ q63_t acc0; /* Accumulators */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc1, acc2, acc3; /* Accumulators */
+ q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */
+#endif
/* S->pState points to state array which contains previous frame (numTaps - 1) samples */
/* pStateCurnt points to the location where the new input data should be written */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
- /* Apply loop unrolling and compute 4 output values simultaneously.
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 output values simultaneously.
* The variables acc0 ... acc3 hold output values that are being computed:
*
* acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
- * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
- * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
* acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
*/
+ blkCnt = blockSize >> 2U;
- blkCnt = blockSize >> 2;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* Copy four new input samples into the state buffer.
- ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */
- *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++;
- *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++;
-
- /* Set all accumulators to zero */
- acc0 = 0;
- acc1 = 0;
- acc2 = 0;
- acc3 = 0;
-
- /* Initialize state pointer of type q15 */
- px1 = pState;
-
- /* Initialize coeff pointer of type q31 */
- pb = pCoeffs;
-
- /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */
- x0 = _SIMD32_OFFSET(px1);
-
- /* Read the third and forth samples from the state buffer: x[n-N-1], x[n-N-2] */
- x1 = _SIMD32_OFFSET(px1 + 1U);
-
- px1 += 2U;
-
- /* Loop over the number of taps. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-4 coefficients. */
- tapCnt = numTaps >> 2;
-
- while (tapCnt > 0U)
- {
- /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */
- c0 = *__SIMD32(pb)++;
-
- /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
- acc0 = __SMLALD(x0, c0, acc0);
-
- /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
- acc1 = __SMLALD(x1, c0, acc1);
-
- /* Read state x[n-N-2], x[n-N-3] */
- x2 = _SIMD32_OFFSET(px1);
-
- /* Read state x[n-N-3], x[n-N-4] */
- x3 = _SIMD32_OFFSET(px1 + 1U);
-
- /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
- acc2 = __SMLALD(x2, c0, acc2);
-
- /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
- acc3 = __SMLALD(x3, c0, acc3);
-
- /* Read coefficients b[N-2], b[N-3] */
- c0 = *__SIMD32(pb)++;
-
- /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
- acc0 = __SMLALD(x2, c0, acc0);
-
- /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
- acc1 = __SMLALD(x3, c0, acc1);
-
- /* Read state x[n-N-4], x[n-N-5] */
- x0 = _SIMD32_OFFSET(px1 + 2U);
-
- /* Read state x[n-N-5], x[n-N-6] */
- x1 = _SIMD32_OFFSET(px1 + 3U);
-
- /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
- acc2 = __SMLALD(x0, c0, acc2);
-
- /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
- acc3 = __SMLALD(x1, c0, acc3);
-
- px1 += 4U;
-
- tapCnt--;
-
- }
-
-
- /* If the filter length is not a multiple of 4, compute the remaining filter taps.
- ** This is always be 2 taps since the filter length is even. */
- if ((numTaps & 0x3U) != 0U)
- {
- /* Read 2 coefficients */
- c0 = *__SIMD32(pb)++;
-
- /* Fetch 4 state variables */
- x2 = _SIMD32_OFFSET(px1);
-
- x3 = _SIMD32_OFFSET(px1 + 1U);
-
- /* Perform the multiply-accumulates */
- acc0 = __SMLALD(x0, c0, acc0);
-
- px1 += 2U;
-
- acc1 = __SMLALD(x1, c0, acc1);
- acc2 = __SMLALD(x2, c0, acc2);
- acc3 = __SMLALD(x3, c0, acc3);
- }
-
- /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
- ** Then store the 4 outputs in the destination buffer. */
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pDst)++ =
- __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
- *__SIMD32(pDst)++ =
- __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
-
-#else
-
- *__SIMD32(pDst)++ =
- __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
- *__SIMD32(pDst)++ =
- __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-
-
- /* Advance the state pointer by 4 to process the next group of 4 samples */
- pState = pState + 4;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
- while (blkCnt > 0U)
- {
- /* Copy two samples into state buffer */
- *pStateCurnt++ = *pSrc++;
-
- /* Set the accumulator to zero */
- acc0 = 0;
-
- /* Initialize state pointer of type q15 */
- px1 = pState;
-
- /* Initialize coeff pointer of type q31 */
- pb = pCoeffs;
-
- tapCnt = numTaps >> 1;
-
- do
- {
-
- c0 = *__SIMD32(pb)++;
- x0 = *__SIMD32(px1)++;
-
- acc0 = __SMLALD(x0, c0, acc0);
- tapCnt--;
- }
- while (tapCnt > 0U);
-
- /* The result is in 2.30 format. Convert to 1.15 with saturation.
- ** Then store the output in the destination buffer. */
- *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
-
- /* Advance state pointer by 1 for the next sample */
- pState = pState + 1;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
- ** This prepares the state buffer for the next function call. */
-
- /* Points to the start of the state buffer */
- pStateCurnt = S->pState;
-
- /* Calculation of count for copying integer writes */
- tapCnt = (numTaps - 1U) >> 2;
-
- while (tapCnt > 0U)
- {
-
- /* Copy state values to start of state buffer */
- *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
- *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
-
- tapCnt--;
-
- }
-
- /* Calculation of count for remaining q15_t data */
- tapCnt = (numTaps - 1U) % 0x4U;
-
- /* copy remaining data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-}
-
-#else /* UNALIGNED_SUPPORT_DISABLE */
-
-void arm_fir_q15(
- const arm_fir_instance_q15 * S,
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t blockSize)
-{
- q15_t *pState = S->pState; /* State pointer */
- q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q15_t *pStateCurnt; /* Points to the current sample of the state */
- q63_t acc0, acc1, acc2, acc3; /* Accumulators */
- q15_t *pb; /* Temporary pointer for coefficient buffer */
- q15_t *px; /* Temporary q31 pointer for SIMD state buffer accesses */
- q31_t x0, x1, x2, c0; /* Temporary variables to hold SIMD state and coefficient values */
- uint32_t numTaps = S->numTaps; /* Number of taps in the filter */
- uint32_t tapCnt, blkCnt; /* Loop counters */
-
-
- /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = &(S->pState[(numTaps - 1U)]);
-
- /* Apply loop unrolling and compute 4 output values simultaneously.
- * The variables acc0 ... acc3 hold output values that are being computed:
- *
- * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
- * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
- * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
- * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
- */
-
- blkCnt = blockSize >> 2;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while (blkCnt > 0U)
- {
- /* Copy four new input samples into the state buffer.
- ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */
+ /* Copy 4 new input samples into the state buffer. */
*pStateCurnt++ = *pSrc++;
*pStateCurnt++ = *pSrc++;
*pStateCurnt++ = *pSrc++;
*pStateCurnt++ = *pSrc++;
-
/* Set all accumulators to zero */
acc0 = 0;
acc1 = 0;
@@ -370,19 +114,19 @@ void arm_fir_q15(
pb = pCoeffs;
/* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */
- x0 = *__SIMD32(px)++;
+ x0 = read_q15x2_ia (&px);
/* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */
- x2 = *__SIMD32(px)++;
+ x2 = read_q15x2_ia (&px);
/* Loop over the number of taps. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */
- tapCnt = numTaps >> 2;
+ Repeat until we've computed numTaps-(numTaps%4) coefficients. */
+ tapCnt = numTaps >> 2U;
- while (tapCnt > 0)
+ while (tapCnt > 0U)
{
/* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */
- c0 = *__SIMD32(pb)++;
+ c0 = read_q15x2_ia ((q15_t **) &pb);
/* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
acc0 = __SMLALD(x0, c0, acc0);
@@ -398,7 +142,7 @@ void arm_fir_q15(
#endif
/* Read state x[n-N-4], x[n-N-5] */
- x0 = _SIMD32_OFFSET(px);
+ x0 = read_q15x2_ia (&px);
/* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
acc1 = __SMLALDX(x1, c0, acc1);
@@ -414,13 +158,13 @@ void arm_fir_q15(
acc3 = __SMLALDX(x1, c0, acc3);
/* Read coefficients b[N-2], b[N-3] */
- c0 = *__SIMD32(pb)++;
+ c0 = read_q15x2_ia ((q15_t **) &pb);
/* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
acc0 = __SMLALD(x2, c0, acc0);
/* Read state x[n-N-6], x[n-N-7] with offset */
- x2 = _SIMD32_OFFSET(px + 2U);
+ x2 = read_q15x2_ia (&px);
/* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
acc2 = __SMLALD(x0, c0, acc2);
@@ -438,21 +182,16 @@ void arm_fir_q15(
/* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
acc3 = __SMLALDX(x1, c0, acc3);
- /* Update state pointer for next state reading */
- px += 4U;
-
/* Decrement tap count */
tapCnt--;
-
}
/* If the filter length is not a multiple of 4, compute the remaining filter taps.
- ** This is always be 2 taps since the filter length is even. */
+ This is always be 2 taps since the filter length is even. */
if ((numTaps & 0x3U) != 0U)
{
-
/* Read last two coefficients */
- c0 = *__SIMD32(pb)++;
+ c0 = read_q15x2_ia ((q15_t **) &pb);
/* Perform the multiply-accumulates */
acc0 = __SMLALD(x0, c0, acc0);
@@ -466,7 +205,7 @@ void arm_fir_q15(
#endif
/* Read last state variables */
- x0 = *__SIMD32(px);
+ x0 = read_q15x2 (px);
/* Perform the multiply-accumulates */
acc1 = __SMLALDX(x1, c0, acc1);
@@ -482,37 +221,33 @@ void arm_fir_q15(
acc3 = __SMLALDX(x1, c0, acc3);
}
- /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
- ** Then store the 4 outputs in the destination buffer. */
-
+ /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
+ Then store the 4 outputs in the destination buffer. */
#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pDst)++ =
- __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
-
- *__SIMD32(pDst)++ =
- __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
-
+ write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+ write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
#else
-
- *__SIMD32(pDst)++ =
- __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
-
- *__SIMD32(pDst)++ =
- __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+ write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Advance the state pointer by 4 to process the next group of 4 samples */
- pState = pState + 4;
+ pState = pState + 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining output samples */
blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of taps */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* Copy two samples into state buffer */
@@ -529,33 +264,37 @@ void arm_fir_q15(
do
{
- acc0 += (q31_t) * px++ * *pb++;
- acc0 += (q31_t) * px++ * *pb++;
+ acc0 += (q31_t) *px++ * *pb++;
+ acc0 += (q31_t) *px++ * *pb++;
+
tapCnt--;
}
while (tapCnt > 0U);
- /* The result is in 2.30 format. Convert to 1.15 with saturation.
- ** Then store the output in the destination buffer. */
+ /* The result is in 2.30 format. Convert to 1.15 with saturation.
+ Then store the output in the destination buffer. */
*pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
/* Advance state pointer by 1 for the next sample */
pState = pState + 1U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
/* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
- ** This prepares the state buffer for the next function call. */
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
pStateCurnt = S->pState;
- /* Calculation of count for copying integer writes */
- tapCnt = (numTaps - 1U) >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ /* Copy data */
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
@@ -563,117 +302,31 @@ void arm_fir_q15(
*pStateCurnt++ = *pState++;
*pStateCurnt++ = *pState++;
+ /* Decrement loop counter */
tapCnt--;
-
}
- /* Calculation of count for remaining q15_t data */
+ /* Calculate remaining number of copies */
tapCnt = (numTaps - 1U) % 0x4U;
- /* copy remaining data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
+#else
- /* Decrement the loop counter */
- tapCnt--;
- }
-}
-
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
-#else /* ARM_MATH_CM0_FAMILY */
-
-
-/* Run the below code for Cortex-M0 */
-
-void arm_fir_q15(
- const arm_fir_instance_q15 * S,
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t blockSize)
-{
- q15_t *pState = S->pState; /* State pointer */
- q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q15_t *pStateCurnt; /* Points to the current sample of the state */
-
-
-
- q15_t *px; /* Temporary pointer for state buffer */
- q15_t *pb; /* Temporary pointer for coefficient buffer */
- q63_t acc; /* Accumulator */
- uint32_t numTaps = S->numTaps; /* Number of nTaps in the filter */
- uint32_t tapCnt, blkCnt; /* Loop counters */
-
- /* S->pState buffer contains previous frame (numTaps - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = &(S->pState[(numTaps - 1U)]);
-
- /* Initialize blkCnt with blockSize */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* Copy one sample at a time into state buffer */
- *pStateCurnt++ = *pSrc++;
-
- /* Set the accumulator to zero */
- acc = 0;
-
- /* Initialize state pointer */
- px = pState;
-
- /* Initialize Coefficient pointer */
- pb = pCoeffs;
-
- tapCnt = numTaps;
-
- /* Perform the multiply-accumulates */
- do
- {
- /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
- acc += (q31_t) * px++ * *pb++;
- tapCnt--;
- } while (tapCnt > 0U);
-
- /* The result is in 2.30 format. Convert to 1.15
- ** Then store the output in the destination buffer. */
- *pDst++ = (q15_t) __SSAT((acc >> 15U), 16);
-
- /* Advance state pointer by 1 for the next sample */
- pState = pState + 1;
-
- /* Decrement the samples loop counter */
- blkCnt--;
- }
-
- /* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
- ** This prepares the state buffer for the next function call. */
-
- /* Points to the start of the state buffer */
- pStateCurnt = S->pState;
-
- /* Copy numTaps number of values */
+ /* Initialize tapCnt with number of taps */
tapCnt = (numTaps - 1U);
- /* copy data */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Copy remaining data */
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
-
-
-
/**
- * @} end of FIR group
+ @} end of FIR group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_q31.c b/DSP/Source/FilteringFunctions/arm_fir_q31.c
index 4ca8295..c57371b 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_q31.c
@@ -3,13 +3,13 @@
* Title: arm_fir_q31.c
* Description: Q31 FIR filter processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,80 +29,74 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR
- * @{
+ @addtogroup FIR
+ @{
*/
/**
- * @param[in] *S points to an instance of the Q31 FIR filter structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in] blockSize number of samples to process per call.
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * Thus, if the accumulator result overflows it wraps around rather than clip.
- * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
- * After all multiply-accumulates are performed, the 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
- *
- * \par
- * Refer to the function arm_fir_fast_q31() for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4.
+ @brief Processing function for Q31 FIR filter.
+ @param[in] S points to an instance of the Q31 FIR filter structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around rather than clip.
+ In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
+ After all multiply-accumulates are performed, the 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+
+ @remark
+ Refer to \ref arm_fir_fast_q31() for a faster but less precise implementation of this filter.
*/
void arm_fir_q31(
const arm_fir_instance_q31 * S,
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
{
- q31_t *pState = S->pState; /* State pointer */
- q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q31_t *pStateCurnt; /* Points to the current sample of the state */
+ q31_t *pState = S->pState; /* State pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCurnt; /* Points to the current sample of the state */
+ q31_t *px; /* Temporary pointer for state buffer */
+ const q31_t *pb; /* Temporary pointer for coefficient buffer */
+ q63_t acc0; /* Accumulator */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt; /* Loop counters */
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q31_t x0, x1, x2; /* Temporary variables to hold state */
- q31_t c0; /* Temporary variable to hold coefficient value */
- q31_t *px; /* Temporary pointer for state */
- q31_t *pb; /* Temporary pointer for coefficient buffer */
- q63_t acc0, acc1, acc2; /* Accumulators */
- uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
- uint32_t i, tapCnt, blkCnt, tapCntN3; /* Loop counters */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc1, acc2; /* Accumulators */
+ q31_t x0, x1, x2; /* Temporary variables to hold state values */
+ q31_t c0; /* Temporary variable to hold coefficient value */
+#endif
/* S->pState points to state array which contains previous frame (numTaps - 1) samples */
/* pStateCurnt points to the location where the new input data should be written */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
- /* Apply loop unrolling and compute 4 output values simultaneously.
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 output values simultaneously.
* The variables acc0 ... acc3 hold output values that are being computed:
*
* acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
- * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
- * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
* acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
*/
+
blkCnt = blockSize / 3;
- blockSize = blockSize - (3 * blkCnt);
- tapCnt = numTaps / 3;
- tapCntN3 = numTaps - (3 * tapCnt);
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* Copy three new input samples into the state buffer */
+ /* Copy 3 new input samples into the state buffer. */
*pStateCurnt++ = *pSrc++;
*pStateCurnt++ = *pSrc++;
*pStateCurnt++ = *pSrc++;
@@ -118,15 +112,14 @@ void arm_fir_q31(
/* Initialize coefficient pointer */
pb = pCoeffs;
- /* Read the first two samples from the state buffer:
- * x[n-numTaps], x[n-numTaps-1] */
- x0 = *(px++);
- x1 = *(px++);
+ /* Read the first 2 samples from the state buffer: x[n-numTaps], x[n-numTaps-1] */
+ x0 = *px++;
+ x1 = *px++;
- /* Loop unrolling. Process 3 taps at a time. */
- i = tapCnt;
+ /* Loop unrolling: process 3 taps at a time. */
+ tapCnt = numTaps / 3;
- while (i > 0U)
+ while (tapCnt > 0U)
{
/* Read the b[numTaps] coefficient */
c0 = *pb;
@@ -160,15 +153,14 @@ void arm_fir_q31(
acc1 += ((q63_t) x0 * c0);
acc2 += ((q63_t) x1 * c0);
- /* Decrement the loop counter */
- i--;
+ /* Decrement loop counter */
+ tapCnt--;
}
- /* If the filter length is not a multiple of 3, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining outputs */
+ tapCnt = numTaps % 0x3U;
- i = tapCntN3;
-
- while (i > 0U)
+ while (tapCnt > 0U)
{
/* Read coefficients */
c0 = *(pb++);
@@ -185,117 +177,39 @@ void arm_fir_q31(
x0 = x1;
x1 = x2;
- /* Decrement the loop counter */
- i--;
+ /* Decrement loop counter */
+ tapCnt--;
}
/* Advance the state pointer by 3 to process the next group of 3 samples */
pState = pState + 3;
- /* The results in the 3 accumulators are in 2.30 format. Convert to 1.31
- ** Then store the 3 outputs in the destination buffer. */
+ /* The result is in 2.30 format. Convert to 1.31 and store in destination buffer. */
*pDst++ = (q31_t) (acc0 >> 31U);
*pDst++ = (q31_t) (acc1 >> 31U);
*pDst++ = (q31_t) (acc2 >> 31U);
- /* Decrement the samples loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 3, compute any remaining output samples here.
- ** No loop unrolling is used. */
-
- while (blockSize > 0U)
- {
- /* Copy one sample at a time into state buffer */
- *pStateCurnt++ = *pSrc++;
-
- /* Set the accumulator to zero */
- acc0 = 0;
-
- /* Initialize state pointer */
- px = pState;
-
- /* Initialize Coefficient pointer */
- pb = (pCoeffs);
-
- i = numTaps;
-
- /* Perform the multiply-accumulates */
- do
- {
- acc0 += (q63_t) * (px++) * (*(pb++));
- i--;
- } while (i > 0U);
-
- /* The result is in 2.62 format. Convert to 1.31
- ** Then store the output in the destination buffer. */
- *pDst++ = (q31_t) (acc0 >> 31U);
-
- /* Advance state pointer by 1 for the next sample */
- pState = pState + 1;
-
- /* Decrement the samples loop counter */
- blockSize--;
- }
-
- /* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
- ** This prepares the state buffer for the next function call. */
-
- /* Points to the start of the state buffer */
- pStateCurnt = S->pState;
-
- tapCnt = (numTaps - 1U) >> 2U;
-
- /* copy data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Calculate remaining number of copies */
- tapCnt = (numTaps - 1U) % 0x4U;
-
- /* Copy the remaining q31_t data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
+ /* Loop unrolling: Compute remaining output samples */
+ blkCnt = blockSize % 0x3U;
#else
-/* Run the below code for Cortex-M0 */
-
- q31_t *px; /* Temporary pointer for state */
- q31_t *pb; /* Temporary pointer for coefficient buffer */
- q63_t acc; /* Accumulator */
- uint32_t numTaps = S->numTaps; /* Length of the filter */
- uint32_t i, tapCnt, blkCnt; /* Loop counters */
-
- /* S->pState buffer contains previous frame (numTaps - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = &(S->pState[(numTaps - 1U)]);
-
- /* Initialize blkCnt with blockSize */
+ /* Initialize blkCnt with number of taps */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* Copy one sample at a time into state buffer */
*pStateCurnt++ = *pSrc++;
/* Set the accumulator to zero */
- acc = 0;
+ acc0 = 0;
/* Initialize state pointer */
px = pState;
@@ -309,45 +223,66 @@ void arm_fir_q31(
do
{
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
- acc += (q63_t) * px++ * *pb++;
+ acc0 += (q63_t) *px++ * *pb++;
+
i--;
} while (i > 0U);
- /* The result is in 2.62 format. Convert to 1.31
- ** Then store the output in the destination buffer. */
- *pDst++ = (q31_t) (acc >> 31U);
+ /* Result is in 2.62 format. Convert to 1.31 and store in destination buffer. */
+ *pDst++ = (q31_t) (acc0 >> 31U);
/* Advance state pointer by 1 for the next sample */
- pState = pState + 1;
+ pState = pState + 1U;
- /* Decrement the samples loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
/* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the starting of the state buffer.
- ** This prepares the state buffer for the next function call. */
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
pStateCurnt = S->pState;
- /* Copy numTaps number of values */
- tapCnt = numTaps - 1U;
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Copy the data */
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Copy remaining data */
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of FIR group
+ @} end of FIR group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_q7.c b/DSP/Source/FilteringFunctions/arm_fir_q7.c
index 23e60ad..5f6d354 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_q7.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_q7.c
@@ -3,13 +3,13 @@
* Title: arm_fir_q7.c
* Description: Q7 FIR filter processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,72 +29,70 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR
- * @{
+ @addtogroup FIR
+ @{
*/
/**
- * @param[in] *S points to an instance of the Q7 FIR filter structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in] blockSize number of samples to process per call.
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function is implemented using a 32-bit internal accumulator.
- * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.
- * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
- * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
- * The accumulator is converted to 18.7 format by discarding the low 7 bits.
- * Finally, the result is truncated to 1.7 format.
+ @brief Processing function for Q7 FIR filter.
+ @param[in] S points to an instance of the Q7 FIR filter structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.
+ The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ The accumulator is converted to 18.7 format by discarding the low 7 bits.
+ Finally, the result is truncated to 1.7 format.
*/
void arm_fir_q7(
const arm_fir_instance_q7 * S,
- q7_t * pSrc,
- q7_t * pDst,
- uint32_t blockSize)
+ const q7_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
{
+ q7_t *pState = S->pState; /* State pointer */
+ const q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q7_t *pStateCurnt; /* Points to the current sample of the state */
+ q7_t *px; /* Temporary pointer for state buffer */
+ const q7_t *pb; /* Temporary pointer for coefficient buffer */
+ q31_t acc0; /* Accumulators */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt; /* Loop counters */
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q7_t *pState = S->pState; /* State pointer */
- q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q7_t *pStateCurnt; /* Points to the current sample of the state */
- q7_t x0, x1, x2, x3; /* Temporary variables to hold state */
- q7_t c0; /* Temporary variable to hold coefficient value */
- q7_t *px; /* Temporary pointer for state */
- q7_t *pb; /* Temporary pointer for coefficient buffer */
- q31_t acc0, acc1, acc2, acc3; /* Accumulators */
- uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
- uint32_t i, tapCnt, blkCnt; /* Loop counters */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc1, acc2, acc3; /* Accumulators */
+ q7_t x0, x1, x2, x3, c0; /* Temporary variables to hold state */
+#endif
/* S->pState points to state array which contains previous frame (numTaps - 1) samples */
/* pStateCurnt points to the location where the new input data should be written */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
- /* Apply loop unrolling and compute 4 output values simultaneously.
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 output values simultaneously.
* The variables acc0 ... acc3 hold output values that are being computed:
*
* acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
- * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
- * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
* acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
*/
- blkCnt = blockSize >> 2;
+ blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* Copy four new input samples into the state buffer */
+ /* Copy 4 new input samples into the state buffer. */
*pStateCurnt++ = *pSrc++;
*pStateCurnt++ = *pSrc++;
*pStateCurnt++ = *pSrc++;
@@ -112,17 +110,18 @@ void arm_fir_q7(
/* Initialize coefficient pointer */
pb = pCoeffs;
- /* Read the first three samples from the state buffer:
+ /* Read the first 3 samples from the state buffer:
* x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
- x0 = *(px++);
- x1 = *(px++);
- x2 = *(px++);
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
- i = tapCnt;
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
- while (i > 0U)
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ Repeat until we've computed numTaps-4 coefficients. */
+ while (tapCnt > 0U)
{
/* Read the b[numTaps] coefficient */
c0 = *pb;
@@ -182,14 +181,14 @@ void arm_fir_q7(
pb += 4U;
px += 4U;
- /* Decrement the loop counter */
- i--;
+ /* Decrement loop counter */
+ tapCnt--;
}
/* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4U;
- i = numTaps - (tapCnt * 4U);
- while (i > 0U)
+ while (tapCnt > 0U)
{
/* Read coefficients */
c0 = *(pb++);
@@ -208,15 +207,12 @@ void arm_fir_q7(
x1 = x2;
x2 = x3;
- /* Decrement the loop counter */
- i--;
+ /* Decrement loop counter */
+ tapCnt--;
}
- /* Advance the state pointer by 4 to process the next group of 4 samples */
- pState = pState + 4;
-
- /* The results in the 4 accumulators are in 2.62 format. Convert to 1.31
- ** Then store the 4 outputs in the destination buffer. */
+ /* The results in the 4 accumulators are in 2.62 format. Convert to 1.31
+ Then store the 4 outputs in the destination buffer. */
acc0 = __SSAT((acc0 >> 7U), 8);
*pDst++ = acc0;
acc1 = __SSAT((acc1 >> 7U), 8);
@@ -226,14 +222,22 @@ void arm_fir_q7(
acc3 = __SSAT((acc3 >> 7U), 8);
*pDst++ = acc3;
- /* Decrement the samples loop counter */
+ /* Advance the state pointer by 4 to process the next group of 4 samples */
+ pState = pState + 4U;
+
+ /* Decrement loop counter */
blkCnt--;
}
+ /* Loop unrolling: Compute remaining output samples */
+ blkCnt = blockSize % 0x4U;
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 4U;
+#else
+
+ /* Initialize blkCnt with number of taps */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
@@ -247,7 +251,7 @@ void arm_fir_q7(
px = pState;
/* Initialize Coefficient pointer */
- pb = (pCoeffs);
+ pb = pCoeffs;
i = numTaps;
@@ -258,27 +262,30 @@ void arm_fir_q7(
i--;
} while (i > 0U);
- /* The result is in 2.14 format. Convert to 1.7
- ** Then store the output in the destination buffer. */
+ /* The result is in 2.14 format. Convert to 1.7
+ Then store the output in the destination buffer. */
*pDst++ = __SSAT((acc0 >> 7U), 8);
/* Advance state pointer by 1 for the next sample */
- pState = pState + 1;
+ pState = pState + 1U;
- /* Decrement the samples loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
/* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
- ** This prepares the state buffer for the next function call. */
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
pStateCurnt = S->pState;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
tapCnt = (numTaps - 1U) >> 2U;
- /* copy data */
+ /* Copy data */
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
@@ -286,14 +293,21 @@ void arm_fir_q7(
*pStateCurnt++ = *pState++;
*pStateCurnt++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
/* Calculate remaining number of copies */
tapCnt = (numTaps - 1U) % 0x4U;
- /* Copy the remaining q31_t data */
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Copy remaining data */
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
@@ -302,84 +316,8 @@ void arm_fir_q7(
tapCnt--;
}
-#else
-
-/* Run the below code for Cortex-M0 */
-
- uint32_t numTaps = S->numTaps; /* Number of taps in the filter */
- uint32_t i, blkCnt; /* Loop counters */
- q7_t *pState = S->pState; /* State pointer */
- q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q7_t *px, *pb; /* Temporary pointers to state and coeff */
- q31_t acc = 0; /* Accumlator */
- q7_t *pStateCurnt; /* Points to the current sample of the state */
-
-
- /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = S->pState + (numTaps - 1U);
-
- /* Initialize blkCnt with blockSize */
- blkCnt = blockSize;
-
- /* Perform filtering upto BlockSize - BlockSize%4 */
- while (blkCnt > 0U)
- {
- /* Copy one sample at a time into state buffer */
- *pStateCurnt++ = *pSrc++;
-
- /* Set accumulator to zero */
- acc = 0;
-
- /* Initialize state pointer of type q7 */
- px = pState;
-
- /* Initialize coeff pointer of type q7 */
- pb = pCoeffs;
-
-
- i = numTaps;
-
- while (i > 0U)
- {
- /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
- acc += (q15_t) * px++ * *pb++;
- i--;
- }
-
- /* Store the 1.7 format filter output in destination buffer */
- *pDst++ = (q7_t) __SSAT((acc >> 7), 8);
-
- /* Advance the state pointer by 1 to process the next sample */
- pState = pState + 1;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
- ** This prepares the state buffer for the next function call. */
-
-
- /* Points to the start of the state buffer */
- pStateCurnt = S->pState;
-
-
- /* Copy numTaps number of values */
- i = (numTaps - 1U);
-
- /* Copy q7_t data */
- while (i > 0U)
- {
- *pStateCurnt++ = *pState++;
- i--;
- }
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of FIR group
+ @} end of FIR group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_sparse_f32.c b/DSP/Source/FilteringFunctions/arm_fir_sparse_f32.c
index bba2936..f44f037 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_sparse_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_sparse_f32.c
@@ -3,13 +3,13 @@
* Title: arm_fir_sparse_f32.c
* Description: Floating-point sparse FIR filter processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,117 +29,111 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @defgroup FIR_Sparse Finite Impulse Response (FIR) Sparse Filters
- *
- * This group of functions implements sparse FIR filters.
- * Sparse FIR filters are equivalent to standard FIR filters except that most of the coefficients are equal to zero.
- * Sparse filters are used for simulating reflections in communications and audio applications.
- *
- * There are separate functions for Q7, Q15, Q31, and floating-point data types.
- * The functions operate on blocks of input and output data and each call to the function processes
- * blockSize samples through the filter. pSrc and
- * pDst points to input and output arrays respectively containing blockSize values.
- *
- * \par Algorithm:
- * The sparse filter instant structure contains an array of tap indices pTapDelay which specifies the locations of the non-zero coefficients.
- * This is in addition to the coefficient array b.
- * The implementation essentially skips the multiplications by zero and leads to an efficient realization.
- *
- * \par
- * \image html FIRSparse.gif "Sparse FIR filter. b[n] represents the filter coefficients"
- * \par
- * pCoeffs points to a coefficient array of size numTaps;
- * pTapDelay points to an array of nonzero indices and is also of size numTaps;
- * pState points to a state array of size maxDelay + blockSize, where
- * maxDelay is the largest offset value that is ever used in the pTapDelay array.
- * Some of the processing functions also require temporary working buffers.
- *
- * \par Instance Structure
- * The coefficients and state variables for a filter are stored together in an instance data structure.
- * A separate instance structure must be defined for each filter.
- * Coefficient and offset arrays may be shared among several instances while state variable arrays cannot be shared.
- * There are separate instance structure declarations for each of the 4 supported data types.
- *
- * \par Initialization Functions
- * There is also an associated initialization function for each data type.
- * The initialization function performs the following operations:
- * - Sets the values of the internal structure fields.
- * - Zeros out the values in the state buffer.
- * To do this manually without calling the init function, assign the follow subfields of the instance structure:
- * numTaps, pCoeffs, pTapDelay, maxDelay, stateIndex, pState. Also set all of the values in pState to zero.
- *
- * \par
- * Use of the initialization function is optional.
- * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
- * To place an instance structure into a const data section, the instance structure must be manually initialized.
- * Set the values in the state buffer to zeros before static initialization.
- * The code below statically initializes each of the 4 different data type filter instance structures
- *
- * \par
- *
- * \par Fixed-Point Behavior
- * Care must be taken when using the fixed-point versions of the sparse FIR filter functions.
- * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
- * Refer to the function specific documentation below for usage guidelines.
+ @defgroup FIR_Sparse Finite Impulse Response (FIR) Sparse Filters
+
+ This group of functions implements sparse FIR filters.
+ Sparse FIR filters are equivalent to standard FIR filters except that most of the coefficients are equal to zero.
+ Sparse filters are used for simulating reflections in communications and audio applications.
+
+ There are separate functions for Q7, Q15, Q31, and floating-point data types.
+ The functions operate on blocks of input and output data and each call to the function processes
+ blockSize samples through the filter. pSrc and
+ pDst points to input and output arrays respectively containing blockSize values.
+
+ @par Algorithm
+ The sparse filter instant structure contains an array of tap indices pTapDelay which specifies the locations of the non-zero coefficients.
+ This is in addition to the coefficient array b.
+ The implementation essentially skips the multiplications by zero and leads to an efficient realization.
+
+ @par
+ \image html FIRSparse.gif "Sparse FIR filter. b[n] represents the filter coefficients"
+ @par
+ pCoeffs points to a coefficient array of size numTaps;
+ pTapDelay points to an array of nonzero indices and is also of size numTaps;
+ pState points to a state array of size maxDelay + blockSize, where
+ maxDelay is the largest offset value that is ever used in the pTapDelay array.
+ Some of the processing functions also require temporary working buffers.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter.
+ Coefficient and offset arrays may be shared among several instances while state variable arrays cannot be shared.
+ There are separate instance structure declarations for each of the 4 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ To do this manually without calling the init function, assign the follow subfields of the instance structure:
+ numTaps, pCoeffs, pTapDelay, maxDelay, stateIndex, pState. Also set all of the values in pState to zero.
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ Set the values in the state buffer to zeros before static initialization.
+ The code below statically initializes each of the 4 different data type filter instance structures
+
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the fixed-point versions of the sparse FIR filter functions.
+ In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ Refer to the function specific documentation below for usage guidelines.
*/
/**
- * @addtogroup FIR_Sparse
- * @{
+ @addtogroup FIR_Sparse
+ @{
*/
/**
- * @brief Processing function for the floating-point sparse FIR filter.
- * @param[in] *S points to an instance of the floating-point sparse FIR structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data
- * @param[in] *pScratchIn points to a temporary buffer of size blockSize.
- * @param[in] blockSize number of input samples to process per call.
- * @return none.
+ @brief Processing function for the floating-point sparse FIR filter.
+ @param[in] S points to an instance of the floating-point sparse FIR structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] pScratchIn points to a temporary buffer of size blockSize
+ @param[in] blockSize number of input samples to process
+ @return none
*/
void arm_fir_sparse_f32(
- arm_fir_sparse_instance_f32 * S,
- float32_t * pSrc,
- float32_t * pDst,
- float32_t * pScratchIn,
- uint32_t blockSize)
+ arm_fir_sparse_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ float32_t * pScratchIn,
+ uint32_t blockSize)
{
-
- float32_t *pState = S->pState; /* State pointer */
- float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- float32_t *px; /* Scratch buffer pointer */
- float32_t *py = pState; /* Temporary pointers for state buffer */
- float32_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
- float32_t *pOut; /* Destination pointer */
- int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
- uint32_t delaySize = S->maxDelay + blockSize; /* state length */
- uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
- int32_t readIndex; /* Read index of the state buffer */
- uint32_t tapCnt, blkCnt; /* loop counters */
- float32_t coeff = *pCoeffs++; /* Read the first coefficient value */
-
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *px; /* Scratch buffer pointer */
+ float32_t *py = pState; /* Temporary pointers for state buffer */
+ float32_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
+ float32_t *pOut; /* Destination pointer */
+ int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
+ uint32_t delaySize = S->maxDelay + blockSize; /* state length */
+ uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ int32_t readIndex; /* Read index of the state buffer */
+ uint32_t tapCnt, blkCnt; /* loop counters */
+ float32_t coeff = *pCoeffs++; /* Read the first coefficient value */
/* BlockSize of Input samples are copied into the state buffer */
/* StateIndex points to the starting position to write in the state buffer */
- arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1,
- (int32_t *) pSrc, 1, blockSize);
-
+ arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1, (int32_t *) pSrc, 1, blockSize);
/* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
/* Wraparound of readIndex */
if (readIndex < 0)
@@ -152,46 +146,51 @@ void arm_fir_sparse_f32(
/* blockSize samples are read from the state buffer */
arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
- (int32_t *) pb, (int32_t *) pb, blockSize, 1,
- blockSize);
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
- /* Working pointer for the scratch buffer */
+ /* Working pointer for the scratch buffer of state values */
px = pb;
- /* Working pointer for destination buffer */
+ /* Working pointer for scratch buffer of output values */
pOut = pDst;
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- /* Loop over the blockSize. Unroll by a factor of 4.
- * Compute 4 Multiplications at a time. */
+ /* Loop unrolling: Compute 4 outputs at a time. */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* Perform Multiplications and store in destination buffer */
*pOut++ = *px++ * coeff;
- *pOut++ = *px++ * coeff;
- *pOut++ = *px++ * coeff;
+
*pOut++ = *px++ * coeff;
- /* Decrement the loop counter */
+ *pOut++ = *px++ * coeff;
+
+ *pOut++ = *px++ * coeff;
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4,
- * compute the remaining samples */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
- /* Perform Multiplications and store in destination buffer */
+ /* Perform Multiplication and store in destination buffer */
*pOut++ = *px++ * coeff;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -200,7 +199,7 @@ void arm_fir_sparse_f32(
coeff = *pCoeffs++;
/* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
/* Wraparound of readIndex */
if (readIndex < 0)
@@ -213,169 +212,56 @@ void arm_fir_sparse_f32(
while (tapCnt > 0U)
{
-
/* Working pointer for state buffer is updated */
py = pState;
/* blockSize samples are read from the state buffer */
arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
- (int32_t *) pb, (int32_t *) pb, blockSize, 1,
- blockSize);
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
- /* Working pointer for the scratch buffer */
+ /* Working pointer for the scratch buffer of state values */
px = pb;
- /* Working pointer for destination buffer */
+ /* Working pointer for scratch buffer of output values */
pOut = pDst;
- /* Loop over the blockSize. Unroll by a factor of 4.
- * Compute 4 MACS at a time. */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* Perform Multiply-Accumulate */
*pOut++ += *px++ * coeff;
- *pOut++ += *px++ * coeff;
- *pOut++ += *px++ * coeff;
+
*pOut++ += *px++ * coeff;
- /* Decrement the loop counter */
+ *pOut++ += *px++ * coeff;
+
+ *pOut++ += *px++ * coeff;
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4,
- * compute the remaining samples */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
- while (blkCnt > 0U)
- {
- /* Perform Multiply-Accumulate */
- *pOut++ += *px++ * coeff;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Load the coefficient value and
- * increment the coefficient buffer for the next set of state values */
- coeff = *pCoeffs++;
-
- /* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = ((int32_t) S->stateIndex -
- (int32_t) blockSize) - *pTapDelay++;
-
- /* Wraparound of readIndex */
- if (readIndex < 0)
- {
- readIndex += (int32_t) delaySize;
- }
-
- /* Decrement the tap loop counter */
- tapCnt--;
- }
-
- /* Compute last tap without the final read of pTapDelay */
-
- /* Working pointer for state buffer is updated */
- py = pState;
-
- /* blockSize samples are read from the state buffer */
- arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
- (int32_t *) pb, (int32_t *) pb, blockSize, 1,
- blockSize);
-
- /* Working pointer for the scratch buffer */
- px = pb;
-
- /* Working pointer for destination buffer */
- pOut = pDst;
-
- /* Loop over the blockSize. Unroll by a factor of 4.
- * Compute 4 MACS at a time. */
- blkCnt = blockSize >> 2U;
-
- while (blkCnt > 0U)
- {
- /* Perform Multiply-Accumulate */
- *pOut++ += *px++ * coeff;
- *pOut++ += *px++ * coeff;
- *pOut++ += *px++ * coeff;
- *pOut++ += *px++ * coeff;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4,
- * compute the remaining samples */
- blkCnt = blockSize % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* Perform Multiply-Accumulate */
- *pOut++ += *px++ * coeff;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
#else
-/* Run the below code for Cortex-M0 */
-
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* Perform Multiplications and store in destination buffer */
- *pOut++ = *px++ * coeff;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Load the coefficient value and
- * increment the coefficient buffer for the next set of state values */
- coeff = *pCoeffs++;
-
- /* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
-
- /* Wraparound of readIndex */
- if (readIndex < 0)
- {
- readIndex += (int32_t) delaySize;
- }
-
- /* Loop over the number of taps. */
- tapCnt = (uint32_t) numTaps - 2U;
-
- while (tapCnt > 0U)
- {
-
- /* Working pointer for state buffer is updated */
- py = pState;
-
- /* blockSize samples are read from the state buffer */
- arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
- (int32_t *) pb, (int32_t *) pb, blockSize, 1,
- blockSize);
-
- /* Working pointer for the scratch buffer */
- px = pb;
-
- /* Working pointer for destination buffer */
- pOut = pDst;
-
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* Perform Multiply-Accumulate */
*pOut++ += *px++ * coeff;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -384,8 +270,7 @@ void arm_fir_sparse_f32(
coeff = *pCoeffs++;
/* Read Index, from where the state buffer should be read, is calculated. */
- readIndex =
- ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
/* Wraparound of readIndex */
if (readIndex < 0)
@@ -393,41 +278,64 @@ void arm_fir_sparse_f32(
readIndex += (int32_t) delaySize;
}
- /* Decrement the tap loop counter */
+ /* Decrement tap loop counter */
tapCnt--;
}
- /* Compute last tap without the final read of pTapDelay */
+ /* Compute last tap without the final read of pTapDelay */
- /* Working pointer for state buffer is updated */
- py = pState;
+ /* Working pointer for state buffer is updated */
+ py = pState;
- /* blockSize samples are read from the state buffer */
- arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
- (int32_t *) pb, (int32_t *) pb, blockSize, 1,
- blockSize);
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
- /* Working pointer for the scratch buffer */
- px = pb;
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
- /* Working pointer for destination buffer */
- pOut = pDst;
+ /* Working pointer for scratch buffer of output values */
+ pOut = pDst;
- blkCnt = blockSize;
- while (blkCnt > 0U)
- {
- /* Perform Multiply-Accumulate */
- *pOut++ += *px++ * coeff;
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Decrement the loop counter */
- blkCnt--;
- }
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
-#endif /* #if defined (ARM_MATH_DSP) */
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ *pOut++ += *px++ * coeff;
+ *pOut++ += *px++ * coeff;
+ *pOut++ += *px++ * coeff;
+ *pOut++ += *px++ * coeff;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ *pOut++ += *px++ * coeff;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
}
/**
- * @} end of FIR_Sparse group
+ @} end of FIR_Sparse group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_sparse_init_f32.c b/DSP/Source/FilteringFunctions/arm_fir_sparse_init_f32.c
index d663679..7745e71 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_sparse_init_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_sparse_init_f32.c
@@ -3,13 +3,13 @@
* Title: arm_fir_sparse_init_f32.c
* Description: Floating-point sparse FIR filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,43 +29,42 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_Sparse
- * @{
+ @addtogroup FIR_Sparse
+ @{
*/
/**
- * @brief Initialization function for the floating-point sparse FIR filter.
- * @param[in,out] *S points to an instance of the floating-point sparse FIR structure.
- * @param[in] numTaps number of nonzero coefficients in the filter.
- * @param[in] *pCoeffs points to the array of filter coefficients.
- * @param[in] *pState points to the state buffer.
- * @param[in] *pTapDelay points to the array of offset times.
- * @param[in] maxDelay maximum offset time supported.
- * @param[in] blockSize number of samples that will be processed per block.
- * @return none
- *
- * Description:
- * \par
- * pCoeffs holds the filter coefficients and has length numTaps.
- * pState holds the filter's state variables and must be of length
- * maxDelay + blockSize, where maxDelay
- * is the maximum number of delay line values.
- * blockSize is the
- * number of samples processed by the arm_fir_sparse_f32() function.
+ @brief Initialization function for the floating-point sparse FIR filter.
+ @param[in,out] S points to an instance of the floating-point sparse FIR structure
+ @param[in] numTaps number of nonzero coefficients in the filter
+ @param[in] pCoeffs points to the array of filter coefficients
+ @param[in] pState points to the state buffer
+ @param[in] pTapDelay points to the array of offset times
+ @param[in] maxDelay maximum offset time supported
+ @param[in] blockSize number of samples that will be processed per block
+ @return none
+
+ @par Details
+ pCoeffs holds the filter coefficients and has length numTaps.
+ pState holds the filter's state variables and must be of length
+ maxDelay + blockSize, where maxDelay
+ is the maximum number of delay line values.
+ blockSize is the
+ number of samples processed by the arm_fir_sparse_f32() function.
*/
void arm_fir_sparse_init_f32(
- arm_fir_sparse_instance_f32 * S,
- uint16_t numTaps,
- float32_t * pCoeffs,
- float32_t * pState,
- int32_t * pTapDelay,
- uint16_t maxDelay,
- uint32_t blockSize)
+ arm_fir_sparse_instance_f32 * S,
+ uint16_t numTaps,
+ const float32_t * pCoeffs,
+ float32_t * pState,
+ int32_t * pTapDelay,
+ uint16_t maxDelay,
+ uint32_t blockSize)
{
/* Assign filter taps */
S->numTaps = numTaps;
@@ -87,9 +86,8 @@ void arm_fir_sparse_init_f32(
/* Assign state pointer */
S->pState = pState;
-
}
/**
- * @} end of FIR_Sparse group
+ @} end of FIR_Sparse group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q15.c b/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q15.c
index 08c2d0e..d07d611 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q15.c
@@ -3,13 +3,13 @@
* Title: arm_fir_sparse_init_q15.c
* Description: Q15 sparse FIR filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,43 +29,42 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_Sparse
- * @{
+ @addtogroup FIR_Sparse
+ @{
*/
/**
- * @brief Initialization function for the Q15 sparse FIR filter.
- * @param[in,out] *S points to an instance of the Q15 sparse FIR structure.
- * @param[in] numTaps number of nonzero coefficients in the filter.
- * @param[in] *pCoeffs points to the array of filter coefficients.
- * @param[in] *pState points to the state buffer.
- * @param[in] *pTapDelay points to the array of offset times.
- * @param[in] maxDelay maximum offset time supported.
- * @param[in] blockSize number of samples that will be processed per block.
- * @return none
- *
- * Description:
- * \par
- * pCoeffs holds the filter coefficients and has length numTaps.
- * pState holds the filter's state variables and must be of length
- * maxDelay + blockSize, where maxDelay
- * is the maximum number of delay line values.
- * blockSize is the
- * number of words processed by arm_fir_sparse_q15() function.
+ @brief Initialization function for the Q15 sparse FIR filter.
+ @param[in,out] S points to an instance of the Q15 sparse FIR structure
+ @param[in] numTaps number of nonzero coefficients in the filter
+ @param[in] pCoeffs points to the array of filter coefficients
+ @param[in] pState points to the state buffer
+ @param[in] pTapDelay points to the array of offset times
+ @param[in] maxDelay maximum offset time supported
+ @param[in] blockSize number of samples that will be processed per block
+ @return none
+
+ @par Details
+ pCoeffs holds the filter coefficients and has length numTaps.
+ pState holds the filter's state variables and must be of length
+ maxDelay + blockSize, where maxDelay
+ is the maximum number of delay line values.
+ blockSize is the
+ number of words processed by arm_fir_sparse_q15() function.
*/
void arm_fir_sparse_init_q15(
- arm_fir_sparse_instance_q15 * S,
- uint16_t numTaps,
- q15_t * pCoeffs,
- q15_t * pState,
- int32_t * pTapDelay,
- uint16_t maxDelay,
- uint32_t blockSize)
+ arm_fir_sparse_instance_q15 * S,
+ uint16_t numTaps,
+ const q15_t * pCoeffs,
+ q15_t * pState,
+ int32_t * pTapDelay,
+ uint16_t maxDelay,
+ uint32_t blockSize)
{
/* Assign filter taps */
S->numTaps = numTaps;
@@ -87,9 +86,8 @@ void arm_fir_sparse_init_q15(
/* Assign state pointer */
S->pState = pState;
-
}
/**
- * @} end of FIR_Sparse group
+ @} end of FIR_Sparse group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q31.c b/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q31.c
index 4a94232..7c32cea 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q31.c
@@ -3,13 +3,13 @@
* Title: arm_fir_sparse_init_q31.c
* Description: Q31 sparse FIR filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,42 +29,41 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_Sparse
- * @{
+ @addtogroup FIR_Sparse
+ @{
*/
/**
- * @brief Initialization function for the Q31 sparse FIR filter.
- * @param[in,out] *S points to an instance of the Q31 sparse FIR structure.
- * @param[in] numTaps number of nonzero coefficients in the filter.
- * @param[in] *pCoeffs points to the array of filter coefficients.
- * @param[in] *pState points to the state buffer.
- * @param[in] *pTapDelay points to the array of offset times.
- * @param[in] maxDelay maximum offset time supported.
- * @param[in] blockSize number of samples that will be processed per block.
- * @return none
- *
- * Description:
- * \par
- * pCoeffs holds the filter coefficients and has length numTaps.
- * pState holds the filter's state variables and must be of length
- * maxDelay + blockSize, where maxDelay
- * is the maximum number of delay line values.
- * blockSize is the number of words processed by arm_fir_sparse_q31() function.
+ @brief Initialization function for the Q31 sparse FIR filter.
+ @param[in,out] S points to an instance of the Q31 sparse FIR structure
+ @param[in] numTaps number of nonzero coefficients in the filter
+ @param[in] pCoeffs points to the array of filter coefficients
+ @param[in] pState points to the state buffer
+ @param[in] pTapDelay points to the array of offset times
+ @param[in] maxDelay maximum offset time supported
+ @param[in] blockSize number of samples that will be processed per block
+ @return none
+
+ @par Details
+ pCoeffs holds the filter coefficients and has length numTaps.
+ pState holds the filter's state variables and must be of length
+ maxDelay + blockSize, where maxDelay
+ is the maximum number of delay line values.
+ blockSize is the number of words processed by arm_fir_sparse_q31() function.
*/
void arm_fir_sparse_init_q31(
- arm_fir_sparse_instance_q31 * S,
- uint16_t numTaps,
- q31_t * pCoeffs,
- q31_t * pState,
- int32_t * pTapDelay,
- uint16_t maxDelay,
- uint32_t blockSize)
+ arm_fir_sparse_instance_q31 * S,
+ uint16_t numTaps,
+ const q31_t * pCoeffs,
+ q31_t * pState,
+ int32_t * pTapDelay,
+ uint16_t maxDelay,
+ uint32_t blockSize)
{
/* Assign filter taps */
S->numTaps = numTaps;
@@ -86,9 +85,8 @@ void arm_fir_sparse_init_q31(
/* Assign state pointer */
S->pState = pState;
-
}
/**
- * @} end of FIR_Sparse group
+ @} end of FIR_Sparse group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q7.c b/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q7.c
index 58d6705..98153f3 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q7.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q7.c
@@ -3,13 +3,13 @@
* Title: arm_fir_sparse_init_q7.c
* Description: Q7 sparse FIR filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,43 +29,42 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_Sparse
- * @{
+ @addtogroup FIR_Sparse
+ @{
*/
/**
- * @brief Initialization function for the Q7 sparse FIR filter.
- * @param[in,out] *S points to an instance of the Q7 sparse FIR structure.
- * @param[in] numTaps number of nonzero coefficients in the filter.
- * @param[in] *pCoeffs points to the array of filter coefficients.
- * @param[in] *pState points to the state buffer.
- * @param[in] *pTapDelay points to the array of offset times.
- * @param[in] maxDelay maximum offset time supported.
- * @param[in] blockSize number of samples that will be processed per block.
- * @return none
- *
- * Description:
- * \par
- * pCoeffs holds the filter coefficients and has length numTaps.
- * pState holds the filter's state variables and must be of length
- * maxDelay + blockSize, where maxDelay
- * is the maximum number of delay line values.
- * blockSize is the
- * number of samples processed by the arm_fir_sparse_q7() function.
+ @brief Initialization function for the Q7 sparse FIR filter.
+ @param[in,out] S points to an instance of the Q7 sparse FIR structure
+ @param[in] numTaps number of nonzero coefficients in the filter
+ @param[in] pCoeffs points to the array of filter coefficients
+ @param[in] pState points to the state buffer
+ @param[in] pTapDelay points to the array of offset times
+ @param[in] maxDelay maximum offset time supported
+ @param[in] blockSize number of samples that will be processed per block
+ @return none
+
+ @par Details
+ pCoeffs holds the filter coefficients and has length numTaps.
+ pState holds the filter's state variables and must be of length
+ maxDelay + blockSize, where maxDelay
+ is the maximum number of delay line values.
+ blockSize is the
+ number of samples processed by the arm_fir_sparse_q7() function.
*/
void arm_fir_sparse_init_q7(
- arm_fir_sparse_instance_q7 * S,
- uint16_t numTaps,
- q7_t * pCoeffs,
- q7_t * pState,
- int32_t * pTapDelay,
- uint16_t maxDelay,
- uint32_t blockSize)
+ arm_fir_sparse_instance_q7 * S,
+ uint16_t numTaps,
+ const q7_t * pCoeffs,
+ q7_t * pState,
+ int32_t * pTapDelay,
+ uint16_t maxDelay,
+ uint32_t blockSize)
{
/* Assign filter taps */
S->numTaps = numTaps;
@@ -87,9 +86,8 @@ void arm_fir_sparse_init_q7(
/* Assign state pointer */
S->pState = pState;
-
}
/**
- * @} end of FIR_Sparse group
+ @} end of FIR_Sparse group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c b/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c
index e17f2bd..9cea93e 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c
@@ -3,13 +3,13 @@
* Title: arm_fir_sparse_q15.c
* Description: Q15 sparse FIR filter processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,72 +29,68 @@
#include "arm_math.h"
/**
- * @addtogroup FIR_Sparse
- * @{
+ @ingroup groupFilters
*/
/**
- * @brief Processing function for the Q15 sparse FIR filter.
- * @param[in] *S points to an instance of the Q15 sparse FIR structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data
- * @param[in] *pScratchIn points to a temporary buffer of size blockSize.
- * @param[in] *pScratchOut points to a temporary buffer of size blockSize.
- * @param[in] blockSize number of input samples to process per call.
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function is implemented using an internal 32-bit accumulator.
- * The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator.
- * Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator.
- * If the accumulator result overflows it will wrap around rather than saturate.
- * After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format.
- * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
+ @addtogroup FIR_Sparse
+ @{
*/
+/**
+ @brief Processing function for the Q15 sparse FIR filter.
+ @param[in] S points to an instance of the Q15 sparse FIR structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] pScratchIn points to a temporary buffer of size blockSize
+ @param[in] pScratchOut points to a temporary buffer of size blockSize
+ @param[in] blockSize number of input samples to process per call
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 32-bit accumulator.
+ The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator.
+ Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator.
+ If the accumulator result overflows it will wrap around rather than saturate.
+ After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format.
+ In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
+ */
void arm_fir_sparse_q15(
- arm_fir_sparse_instance_q15 * S,
- q15_t * pSrc,
- q15_t * pDst,
- q15_t * pScratchIn,
- q31_t * pScratchOut,
- uint32_t blockSize)
+ arm_fir_sparse_instance_q15 * S,
+ const q15_t * pSrc,
+ q15_t * pDst,
+ q15_t * pScratchIn,
+ q31_t * pScratchOut,
+ uint32_t blockSize)
{
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *px; /* Temporary pointers for scratch buffer */
+ q15_t *py = pState; /* Temporary pointers for state buffer */
+ q15_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
+ q15_t *pOut = pDst; /* Working pointer for output */
+ int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
+ uint32_t delaySize = S->maxDelay + blockSize; /* state length */
+ uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ int32_t readIndex; /* Read index of the state buffer */
+ uint32_t tapCnt, blkCnt; /* loop counters */
+ q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */
+ q15_t coeff = *pCoeffs++; /* Read the first coefficient value */
- q15_t *pState = S->pState; /* State pointer */
- q15_t *pIn = pSrc; /* Working pointer for input */
- q15_t *pOut = pDst; /* Working pointer for output */
- q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q15_t *px; /* Temporary pointers for scratch buffer */
- q15_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
- q15_t *py = pState; /* Temporary pointers for state buffer */
- int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
- uint32_t delaySize = S->maxDelay + blockSize; /* state length */
- uint16_t numTaps = S->numTaps; /* Filter order */
- int32_t readIndex; /* Read index of the state buffer */
- uint32_t tapCnt, blkCnt; /* loop counters */
- q15_t coeff = *pCoeffs++; /* Read the first coefficient value */
- q31_t *pScr2 = pScratchOut; /* Working pointer for pScratchOut */
-
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q31_t in1, in2; /* Temporary variables */
-
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t in1, in2; /* Temporary variables */
+#endif
/* BlockSize of Input samples are copied into the state buffer */
/* StateIndex points to the starting position to write in the state buffer */
- arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize);
+ arm_circularWrite_q15(py, (int32_t) delaySize, &S->stateIndex, 1,pSrc, 1, blockSize);
/* Loop over the number of taps. */
tapCnt = numTaps;
/* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
/* Wraparound of readIndex */
if (readIndex < 0)
@@ -106,8 +102,8 @@ void arm_fir_sparse_q15(
py = pState;
/* blockSize samples are read from the state buffer */
- arm_circularRead_q15(py, delaySize, &readIndex, 1,
- pb, pb, blockSize, 1, blockSize);
+ arm_circularRead_q15(py, (int32_t) delaySize, &readIndex, 1,
+ pb, pb, (int32_t) blockSize, 1, blockSize);
/* Working pointer for the scratch buffer of state values */
px = pb;
@@ -115,32 +111,40 @@ void arm_fir_sparse_q15(
/* Working pointer for scratch buffer of output values */
pScratchOut = pScr2;
- /* Loop over the blockSize. Unroll by a factor of 4.
- * Compute 4 multiplications at a time. */
- blkCnt = blockSize >> 2;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* Perform multiplication and store in the scratch buffer */
- *pScratchOut++ = ((q31_t) * px++ * coeff);
- *pScratchOut++ = ((q31_t) * px++ * coeff);
- *pScratchOut++ = ((q31_t) * px++ * coeff);
- *pScratchOut++ = ((q31_t) * px++ * coeff);
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4,
- * compute the remaining samples */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
- /* Perform multiplication and store in the scratch buffer */
- *pScratchOut++ = ((q31_t) * px++ * coeff);
+ /* Perform Multiplication and store in the scratch buffer */
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -149,7 +153,7 @@ void arm_fir_sparse_q15(
coeff = *pCoeffs++;
/* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
/* Wraparound of readIndex */
if (readIndex < 0)
@@ -166,8 +170,8 @@ void arm_fir_sparse_q15(
py = pState;
/* blockSize samples are read from the state buffer */
- arm_circularRead_q15(py, delaySize, &readIndex, 1,
- pb, pb, blockSize, 1, blockSize);
+ arm_circularRead_q15(py, (int32_t) delaySize, &readIndex, 1,
+ pb, pb, (int32_t) blockSize, 1, blockSize);
/* Working pointer for the scratch buffer of state values */
px = pb;
@@ -175,32 +179,40 @@ void arm_fir_sparse_q15(
/* Working pointer for scratch buffer of output values */
pScratchOut = pScr2;
- /* Loop over the blockSize. Unroll by a factor of 4.
- * Compute 4 MACS at a time. */
- blkCnt = blockSize >> 2;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* Perform Multiply-Accumulate */
- *pScratchOut++ += (q31_t) * px++ * coeff;
- *pScratchOut++ += (q31_t) * px++ * coeff;
- *pScratchOut++ += (q31_t) * px++ * coeff;
- *pScratchOut++ += (q31_t) * px++ * coeff;
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+ *pScratchOut++ += (q31_t) *px++ * coeff;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4,
- * compute the remaining samples */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* Perform Multiply-Accumulate */
- *pScratchOut++ += (q31_t) * px++ * coeff;
+ *pScratchOut++ += (q31_t) *px++ * coeff;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -209,7 +221,7 @@ void arm_fir_sparse_q15(
coeff = *pCoeffs++;
/* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
/* Wraparound of readIndex */
if (readIndex < 0)
@@ -217,136 +229,18 @@ void arm_fir_sparse_q15(
readIndex += (int32_t) delaySize;
}
- /* Decrement the tap loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* Compute last tap without the final read of pTapDelay */
-
- /* Working pointer for state buffer is updated */
- py = pState;
-
- /* blockSize samples are read from the state buffer */
- arm_circularRead_q15(py, delaySize, &readIndex, 1,
- pb, pb, blockSize, 1, blockSize);
-
- /* Working pointer for the scratch buffer of state values */
- px = pb;
-
- /* Working pointer for scratch buffer of output values */
- pScratchOut = pScr2;
-
- /* Loop over the blockSize. Unroll by a factor of 4.
- * Compute 4 MACS at a time. */
- blkCnt = blockSize >> 2;
-
- while (blkCnt > 0U)
- {
- /* Perform Multiply-Accumulate */
- *pScratchOut++ += (q31_t) * px++ * coeff;
- *pScratchOut++ += (q31_t) * px++ * coeff;
- *pScratchOut++ += (q31_t) * px++ * coeff;
- *pScratchOut++ += (q31_t) * px++ * coeff;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4,
- * compute the remaining samples */
- blkCnt = blockSize % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* Perform Multiply-Accumulate */
- *pScratchOut++ += (q31_t) * px++ * coeff;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* All the output values are in pScratchOut buffer.
- Convert them into 1.15 format, saturate and store in the destination buffer. */
- /* Loop over the blockSize. */
- blkCnt = blockSize >> 2;
-
- while (blkCnt > 0U)
- {
- in1 = *pScr2++;
- in2 = *pScr2++;
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pOut)++ =
- __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16),
- 16);
-
-#else
- *__SIMD32(pOut)++ =
- __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16),
- 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- in1 = *pScr2++;
-
- in2 = *pScr2++;
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pOut)++ =
- __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16),
- 16);
-
-#else
-
- *__SIMD32(pOut)++ =
- __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16),
- 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-
- blkCnt--;
-
- }
-
- /* If the blockSize is not a multiple of 4,
- remaining samples are processed in the below loop */
- blkCnt = blockSize % 0x4U;
-
- while (blkCnt > 0U)
- {
- *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16);
- blkCnt--;
- }
-
-#else
-
- /* Run the below code for Cortex-M0 */
-
- /* BlockSize of Input samples are copied into the state buffer */
- /* StateIndex points to the starting position to write in the state buffer */
- arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize);
-
- /* Loop over the number of taps. */
- tapCnt = numTaps;
-
- /* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
-
- /* Wraparound of readIndex */
- if (readIndex < 0)
- {
- readIndex += (int32_t) delaySize;
- }
+ /* Compute last tap without the final read of pTapDelay */
/* Working pointer for state buffer is updated */
py = pState;
/* blockSize samples are read from the state buffer */
- arm_circularRead_q15(py, delaySize, &readIndex, 1,
- pb, pb, blockSize, 1, blockSize);
+ arm_circularRead_q15(py, (int32_t) delaySize, &readIndex, 1,
+ pb, pb, (int32_t) blockSize, 1, blockSize);
/* Working pointer for the scratch buffer of state values */
px = pb;
@@ -354,117 +248,94 @@ void arm_fir_sparse_q15(
/* Working pointer for scratch buffer of output values */
pScratchOut = pScr2;
- blkCnt = blockSize;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
- /* Perform multiplication and store in the scratch buffer */
- *pScratchOut++ = ((q31_t) * px++ * coeff);
+ /* Perform Multiply-Accumulate */
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+ *pScratchOut++ += (q31_t) *px++ * coeff;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Load the coefficient value and
- * increment the coefficient buffer for the next set of state values */
- coeff = *pCoeffs++;
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
- /* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
+#else
- /* Wraparound of readIndex */
- if (readIndex < 0)
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
{
- readIndex += (int32_t) delaySize;
+ /* Perform Multiply-Accumulate */
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+
+ /* Decrement loop counter */
+ blkCnt--;
}
- /* Loop over the number of taps. */
- tapCnt = (uint32_t) numTaps - 2U;
-
- while (tapCnt > 0U)
- {
- /* Working pointer for state buffer is updated */
- py = pState;
-
- /* blockSize samples are read from the state buffer */
- arm_circularRead_q15(py, delaySize, &readIndex, 1,
- pb, pb, blockSize, 1, blockSize);
-
- /* Working pointer for the scratch buffer of state values */
- px = pb;
-
- /* Working pointer for scratch buffer of output values */
- pScratchOut = pScr2;
-
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* Perform Multiply-Accumulate */
- *pScratchOut++ += (q31_t) * px++ * coeff;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Load the coefficient value and
- * increment the coefficient buffer for the next set of state values */
- coeff = *pCoeffs++;
-
- /* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
-
- /* Wraparound of readIndex */
- if (readIndex < 0)
- {
- readIndex += (int32_t) delaySize;
- }
-
- /* Decrement the tap loop counter */
- tapCnt--;
- }
-
- /* Compute last tap without the final read of pTapDelay */
-
- /* Working pointer for state buffer is updated */
- py = pState;
-
- /* blockSize samples are read from the state buffer */
- arm_circularRead_q15(py, delaySize, &readIndex, 1,
- pb, pb, blockSize, 1, blockSize);
-
- /* Working pointer for the scratch buffer of state values */
- px = pb;
-
- /* Working pointer for scratch buffer of output values */
- pScratchOut = pScr2;
-
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* Perform Multiply-Accumulate */
- *pScratchOut++ += (q31_t) * px++ * coeff;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
/* All the output values are in pScratchOut buffer.
Convert them into 1.15 format, saturate and store in the destination buffer. */
- /* Loop over the blockSize. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ in1 = *pScr2++;
+ in2 = *pScr2++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 16));
+#else
+ write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ in1 = *pScr2++;
+ in2 = *pScr2++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 16));
+#else
+ write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
*pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16);
+
+ /* Decrement loop counter */
blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of FIR_Sparse group
+ @} end of FIR_Sparse group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_sparse_q31.c b/DSP/Source/FilteringFunctions/arm_fir_sparse_q31.c
index e441716..86d3e1d 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_sparse_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_sparse_q31.c
@@ -3,13 +3,13 @@
* Title: arm_fir_sparse_q31.c
* Description: Q31 sparse FIR filter processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -28,52 +28,53 @@
#include "arm_math.h"
-
/**
- * @addtogroup FIR_Sparse
- * @{
+ @ingroup groupFilters
*/
/**
- * @brief Processing function for the Q31 sparse FIR filter.
- * @param[in] *S points to an instance of the Q31 sparse FIR structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data
- * @param[in] *pScratchIn points to a temporary buffer of size blockSize.
- * @param[in] blockSize number of input samples to process per call.
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function is implemented using an internal 32-bit accumulator.
- * The 1.31 x 1.31 multiplications are truncated to 2.30 format.
- * This leads to loss of precision on the intermediate multiplications and provides only a single guard bit.
- * If the accumulator result overflows, it wraps around rather than saturate.
- * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
+ @addtogroup FIR_Sparse
+ @{
+ */
+
+/**
+ @brief Processing function for the Q31 sparse FIR filter.
+ @param[in] S points to an instance of the Q31 sparse FIR structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] pScratchIn points to a temporary buffer of size blockSize
+ @param[in] blockSize number of input samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 32-bit accumulator.
+ The 1.31 x 1.31 multiplications are truncated to 2.30 format.
+ This leads to loss of precision on the intermediate multiplications and provides only a single guard bit.
+ If the accumulator result overflows, it wraps around rather than saturate.
+ In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
*/
void arm_fir_sparse_q31(
- arm_fir_sparse_instance_q31 * S,
- q31_t * pSrc,
- q31_t * pDst,
- q31_t * pScratchIn,
- uint32_t blockSize)
+ arm_fir_sparse_instance_q31 * S,
+ const q31_t * pSrc,
+ q31_t * pDst,
+ q31_t * pScratchIn,
+ uint32_t blockSize)
{
-
- q31_t *pState = S->pState; /* State pointer */
- q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q31_t *px; /* Scratch buffer pointer */
- q31_t *py = pState; /* Temporary pointers for state buffer */
- q31_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
- q31_t *pOut; /* Destination pointer */
- q63_t out; /* Temporary output variable */
- int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
- uint32_t delaySize = S->maxDelay + blockSize; /* state length */
- uint16_t numTaps = S->numTaps; /* Filter order */
- int32_t readIndex; /* Read index of the state buffer */
- uint32_t tapCnt, blkCnt; /* loop counters */
- q31_t coeff = *pCoeffs++; /* Read the first coefficient value */
- q31_t in;
+ q31_t *pState = S->pState; /* State pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *px; /* Scratch buffer pointer */
+ q31_t *py = pState; /* Temporary pointers for state buffer */
+ q31_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
+ q31_t *pOut; /* Destination pointer */
+ int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
+ uint32_t delaySize = S->maxDelay + blockSize; /* state length */
+ uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ int32_t readIndex; /* Read index of the state buffer */
+ uint32_t tapCnt, blkCnt; /* loop counters */
+ q31_t coeff = *pCoeffs++; /* Read the first coefficient value */
+ q31_t in;
+ q63_t out; /* Temporary output variable */
/* BlockSize of Input samples are copied into the state buffer */
@@ -95,8 +96,7 @@ void arm_fir_sparse_q31(
/* blockSize samples are read from the state buffer */
arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
- (int32_t *) pb, (int32_t *) pb, blockSize, 1,
- blockSize);
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
/* Working pointer for the scratch buffer of state values */
px = pb;
@@ -105,233 +105,42 @@ void arm_fir_sparse_q31(
pOut = pDst;
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- /* Loop over the blockSize. Unroll by a factor of 4.
- * Compute 4 Multiplications at a time. */
- blkCnt = blockSize >> 2;
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
- /* Perform Multiplications and store in the destination buffer */
- *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
- *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
- *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
- *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
+ /* Perform Multiplications and store in destination buffer */
+ *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
- /* Decrement the loop counter */
+ *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
+
+ *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
+
+ *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4,
- * compute the remaining samples */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
- while (blkCnt > 0U)
- {
- /* Perform Multiplications and store in the destination buffer */
- *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Load the coefficient value and
- * increment the coefficient buffer for the next set of state values */
- coeff = *pCoeffs++;
-
- /* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
-
- /* Wraparound of readIndex */
- if (readIndex < 0)
- {
- readIndex += (int32_t) delaySize;
- }
-
- /* Loop over the number of taps. */
- tapCnt = (uint32_t) numTaps - 2U;
-
- while (tapCnt > 0U)
- {
- /* Working pointer for state buffer is updated */
- py = pState;
-
- /* blockSize samples are read from the state buffer */
- arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
- (int32_t *) pb, (int32_t *) pb, blockSize, 1,
- blockSize);
-
- /* Working pointer for the scratch buffer of state values */
- px = pb;
-
- /* Working pointer for scratch buffer of output values */
- pOut = pDst;
-
- /* Loop over the blockSize. Unroll by a factor of 4.
- * Compute 4 MACS at a time. */
- blkCnt = blockSize >> 2;
-
- while (blkCnt > 0U)
- {
- out = *pOut;
- out += ((q63_t) * px++ * coeff) >> 32;
- *pOut++ = (q31_t) (out);
-
- out = *pOut;
- out += ((q63_t) * px++ * coeff) >> 32;
- *pOut++ = (q31_t) (out);
-
- out = *pOut;
- out += ((q63_t) * px++ * coeff) >> 32;
- *pOut++ = (q31_t) (out);
-
- out = *pOut;
- out += ((q63_t) * px++ * coeff) >> 32;
- *pOut++ = (q31_t) (out);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4,
- * compute the remaining samples */
- blkCnt = blockSize % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* Perform Multiply-Accumulate */
- out = *pOut;
- out += ((q63_t) * px++ * coeff) >> 32;
- *pOut++ = (q31_t) (out);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Load the coefficient value and
- * increment the coefficient buffer for the next set of state values */
- coeff = *pCoeffs++;
-
- /* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
-
- /* Wraparound of readIndex */
- if (readIndex < 0)
- {
- readIndex += (int32_t) delaySize;
- }
-
- /* Decrement the tap loop counter */
- tapCnt--;
- }
-
- /* Compute last tap without the final read of pTapDelay */
-
- /* Working pointer for state buffer is updated */
- py = pState;
-
- /* blockSize samples are read from the state buffer */
- arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
- (int32_t *) pb, (int32_t *) pb, blockSize, 1,
- blockSize);
-
- /* Working pointer for the scratch buffer of state values */
- px = pb;
-
- /* Working pointer for scratch buffer of output values */
- pOut = pDst;
-
- /* Loop over the blockSize. Unroll by a factor of 4.
- * Compute 4 MACS at a time. */
- blkCnt = blockSize >> 2;
-
- while (blkCnt > 0U)
- {
- out = *pOut;
- out += ((q63_t) * px++ * coeff) >> 32;
- *pOut++ = (q31_t) (out);
-
- out = *pOut;
- out += ((q63_t) * px++ * coeff) >> 32;
- *pOut++ = (q31_t) (out);
-
- out = *pOut;
- out += ((q63_t) * px++ * coeff) >> 32;
- *pOut++ = (q31_t) (out);
-
- out = *pOut;
- out += ((q63_t) * px++ * coeff) >> 32;
- *pOut++ = (q31_t) (out);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4,
- * compute the remaining samples */
- blkCnt = blockSize % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* Perform Multiply-Accumulate */
- out = *pOut;
- out += ((q63_t) * px++ * coeff) >> 32;
- *pOut++ = (q31_t) (out);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Working output pointer is updated */
- pOut = pDst;
-
- /* Output is converted into 1.31 format. */
- /* Loop over the blockSize. Unroll by a factor of 4.
- * process 4 output samples at a time. */
- blkCnt = blockSize >> 2;
-
- while (blkCnt > 0U)
- {
- in = *pOut << 1;
- *pOut++ = in;
- in = *pOut << 1;
- *pOut++ = in;
- in = *pOut << 1;
- *pOut++ = in;
- in = *pOut << 1;
- *pOut++ = in;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4,
- * process the remaining output samples */
- blkCnt = blockSize % 0x4U;
-
- while (blkCnt > 0U)
- {
- in = *pOut << 1;
- *pOut++ = in;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
- /* Perform Multiplications and store in the destination buffer */
- *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
+ /* Perform Multiplication and store in destination buffer */
+ *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -358,8 +167,7 @@ void arm_fir_sparse_q31(
/* blockSize samples are read from the state buffer */
arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
- (int32_t *) pb, (int32_t *) pb, blockSize, 1,
- blockSize);
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
/* Working pointer for the scratch buffer of state values */
px = pb;
@@ -367,16 +175,53 @@ void arm_fir_sparse_q31(
/* Working pointer for scratch buffer of output values */
pOut = pDst;
- blkCnt = blockSize;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* Perform Multiply-Accumulate */
out = *pOut;
- out += ((q63_t) * px++ * coeff) >> 32;
+ out += ((q63_t) *px++ * coeff) >> 32;
*pOut++ = (q31_t) (out);
- /* Decrement the loop counter */
+ out = *pOut;
+ out += ((q63_t) *px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ out = *pOut;
+ out += ((q63_t) *px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ out = *pOut;
+ out += ((q63_t) *px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ out = *pOut;
+ out += ((q63_t) *px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ /* Decrement loop counter */
blkCnt--;
}
@@ -393,58 +238,120 @@ void arm_fir_sparse_q31(
readIndex += (int32_t) delaySize;
}
- /* Decrement the tap loop counter */
+ /* Decrement tap loop counter */
tapCnt--;
}
- /* Compute last tap without the final read of pTapDelay */
+ /* Compute last tap without the final read of pTapDelay */
- /* Working pointer for state buffer is updated */
- py = pState;
+ /* Working pointer for state buffer is updated */
+ py = pState;
- /* blockSize samples are read from the state buffer */
- arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
- (int32_t *) pb, (int32_t *) pb, blockSize, 1,
- blockSize);
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
- /* Working pointer for the scratch buffer of state values */
- px = pb;
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
- /* Working pointer for scratch buffer of output values */
- pOut = pDst;
+ /* Working pointer for scratch buffer of output values */
+ pOut = pDst;
- blkCnt = blockSize;
- while (blkCnt > 0U)
- {
- /* Perform Multiply-Accumulate */
- out = *pOut;
- out += ((q63_t) * px++ * coeff) >> 32;
- *pOut++ = (q31_t) (out);
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Decrement the loop counter */
- blkCnt--;
- }
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ out = *pOut;
+ out += ((q63_t) * px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ out = *pOut;
+ out += ((q63_t) * px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ out = *pOut;
+ out += ((q63_t) * px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ out = *pOut;
+ out += ((q63_t) * px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ out = *pOut;
+ out += ((q63_t) *px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
/* Working output pointer is updated */
pOut = pDst;
/* Output is converted into 1.31 format. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ in = *pOut << 1;
+ *pOut++ = in;
+ in = *pOut << 1;
+ *pOut++ = in;
+ in = *pOut << 1;
+ *pOut++ = in;
+ in = *pOut << 1;
+ *pOut++ = in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
in = *pOut << 1;
*pOut++ = in;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of FIR_Sparse group
+ @} end of FIR_Sparse group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c b/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c
index c1b4ce3..7a2b57f 100644
--- a/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c
+++ b/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c
@@ -3,13 +3,13 @@
* Title: arm_fir_sparse_q7.c
* Description: Q7 sparse FIR filter processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -28,78 +28,70 @@
#include "arm_math.h"
-
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_Sparse
- * @{
+ @addtogroup FIR_Sparse
+ @{
*/
-
/**
- * @brief Processing function for the Q7 sparse FIR filter.
- * @param[in] *S points to an instance of the Q7 sparse FIR structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data
- * @param[in] *pScratchIn points to a temporary buffer of size blockSize.
- * @param[in] *pScratchOut points to a temporary buffer of size blockSize.
- * @param[in] blockSize number of input samples to process per call.
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function is implemented using a 32-bit internal accumulator.
- * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.
- * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
- * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
- * The accumulator is then converted to 18.7 format by discarding the low 7 bits.
- * Finally, the result is truncated to 1.7 format.
+ @brief Processing function for the Q7 sparse FIR filter.
+ @param[in] S points to an instance of the Q7 sparse FIR structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] pScratchIn points to a temporary buffer of size blockSize
+ @param[in] pScratchOut points to a temporary buffer of size blockSize
+ @param[in] blockSize number of input samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.
+ The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ The accumulator is then converted to 18.7 format by discarding the low 7 bits.
+ Finally, the result is truncated to 1.7 format.
*/
void arm_fir_sparse_q7(
- arm_fir_sparse_instance_q7 * S,
- q7_t * pSrc,
- q7_t * pDst,
- q7_t * pScratchIn,
- q31_t * pScratchOut,
- uint32_t blockSize)
+ arm_fir_sparse_instance_q7 * S,
+ const q7_t * pSrc,
+ q7_t * pDst,
+ q7_t * pScratchIn,
+ q31_t * pScratchOut,
+ uint32_t blockSize)
{
+ q7_t *pState = S->pState; /* State pointer */
+ const q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q7_t *px; /* Scratch buffer pointer */
+ q7_t *py = pState; /* Temporary pointers for state buffer */
+ q7_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
+ q7_t *pOut = pDst; /* Destination pointer */
+ int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
+ uint32_t delaySize = S->maxDelay + blockSize; /* state length */
+ uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ int32_t readIndex; /* Read index of the state buffer */
+ uint32_t tapCnt, blkCnt; /* loop counters */
+ q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */
+ q31_t in;
+ q7_t coeff = *pCoeffs++; /* Read the coefficient value */
- q7_t *pState = S->pState; /* State pointer */
- q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q7_t *px; /* Scratch buffer pointer */
- q7_t *py = pState; /* Temporary pointers for state buffer */
- q7_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
- q7_t *pOut = pDst; /* Destination pointer */
- int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
- uint32_t delaySize = S->maxDelay + blockSize; /* state length */
- uint16_t numTaps = S->numTaps; /* Filter order */
- int32_t readIndex; /* Read index of the state buffer */
- uint32_t tapCnt, blkCnt; /* loop counters */
- q7_t coeff = *pCoeffs++; /* Read the coefficient value */
- q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */
- q31_t in;
-
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q7_t in1, in2, in3, in4;
+#if defined (ARM_MATH_LOOPUNROLL)
+ q7_t in1, in2, in3, in4;
+#endif
/* BlockSize of Input samples are copied into the state buffer */
/* StateIndex points to the starting position to write in the state buffer */
- arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
- blockSize);
+ arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1, blockSize);
/* Loop over the number of taps. */
tapCnt = numTaps;
/* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
/* Wraparound of readIndex */
if (readIndex < 0)
@@ -111,8 +103,8 @@ void arm_fir_sparse_q7(
py = pState;
/* blockSize samples are read from the state buffer */
- arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
- (int32_t) blockSize, 1, blockSize);
+ arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1,
+ pb, pb, (int32_t) blockSize, 1, blockSize);
/* Working pointer for the scratch buffer of state values */
px = pb;
@@ -120,32 +112,40 @@ void arm_fir_sparse_q7(
/* Working pointer for scratch buffer of output values */
pScratchOut = pScr2;
- /* Loop over the blockSize. Unroll by a factor of 4.
- * Compute 4 multiplications at a time. */
- blkCnt = blockSize >> 2;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* Perform multiplication and store in the scratch buffer */
- *pScratchOut++ = ((q31_t) * px++ * coeff);
- *pScratchOut++ = ((q31_t) * px++ * coeff);
- *pScratchOut++ = ((q31_t) * px++ * coeff);
- *pScratchOut++ = ((q31_t) * px++ * coeff);
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4,
- * compute the remaining samples */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
- /* Perform multiplication and store in the scratch buffer */
- *pScratchOut++ = ((q31_t) * px++ * coeff);
+ /* Perform Multiplication and store in the scratch buffer */
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -154,7 +154,7 @@ void arm_fir_sparse_q7(
coeff = *pCoeffs++;
/* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
/* Wraparound of readIndex */
if (readIndex < 0)
@@ -171,8 +171,8 @@ void arm_fir_sparse_q7(
py = pState;
/* blockSize samples are read from the state buffer */
- arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
- (int32_t) blockSize, 1, blockSize);
+ arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1,
+ pb, pb, (int32_t) blockSize, 1, blockSize);
/* Working pointer for the scratch buffer of state values */
px = pb;
@@ -180,9 +180,11 @@ void arm_fir_sparse_q7(
/* Working pointer for scratch buffer of output values */
pScratchOut = pScr2;
- /* Loop over the blockSize. Unroll by a factor of 4.
- * Compute 4 MACS at a time. */
- blkCnt = blockSize >> 2;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
@@ -196,21 +198,27 @@ void arm_fir_sparse_q7(
in = *pScratchOut + ((q31_t) * px++ * coeff);
*pScratchOut++ = in;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4,
- * compute the remaining samples */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* Perform Multiply-Accumulate */
- in = *pScratchOut + ((q31_t) * px++ * coeff);
+ in = *pScratchOut + ((q31_t) *px++ * coeff);
*pScratchOut++ = in;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -219,8 +227,7 @@ void arm_fir_sparse_q7(
coeff = *pCoeffs++;
/* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = ((int32_t) S->stateIndex -
- (int32_t) blockSize) - *pTapDelay++;
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
/* Wraparound of readIndex */
if (readIndex < 0)
@@ -228,63 +235,73 @@ void arm_fir_sparse_q7(
readIndex += (int32_t) delaySize;
}
- /* Decrement the tap loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* Compute last tap without the final read of pTapDelay */
+ /* Compute last tap without the final read of pTapDelay */
- /* Working pointer for state buffer is updated */
- py = pState;
+ /* Working pointer for state buffer is updated */
+ py = pState;
- /* blockSize samples are read from the state buffer */
- arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
- (int32_t) blockSize, 1, blockSize);
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1,
+ pb, pb, (int32_t) blockSize, 1, blockSize);
- /* Working pointer for the scratch buffer of state values */
- px = pb;
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
- /* Working pointer for scratch buffer of output values */
- pScratchOut = pScr2;
+ /* Working pointer for scratch buffer of output values */
+ pScratchOut = pScr2;
- /* Loop over the blockSize. Unroll by a factor of 4.
- * Compute 4 MACS at a time. */
- blkCnt = blockSize >> 2;
- while (blkCnt > 0U)
- {
- /* Perform Multiply-Accumulate */
- in = *pScratchOut + ((q31_t) * px++ * coeff);
- *pScratchOut++ = in;
- in = *pScratchOut + ((q31_t) * px++ * coeff);
- *pScratchOut++ = in;
- in = *pScratchOut + ((q31_t) * px++ * coeff);
- *pScratchOut++ = in;
- in = *pScratchOut + ((q31_t) * px++ * coeff);
- *pScratchOut++ = in;
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Decrement the loop counter */
- blkCnt--;
- }
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
- /* If the blockSize is not a multiple of 4,
- * compute the remaining samples */
- blkCnt = blockSize % 0x4U;
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ in = *pScratchOut + ((q31_t) *px++ * coeff);
+ *pScratchOut++ = in;
+ in = *pScratchOut + ((q31_t) *px++ * coeff);
+ *pScratchOut++ = in;
+ in = *pScratchOut + ((q31_t) *px++ * coeff);
+ *pScratchOut++ = in;
+ in = *pScratchOut + ((q31_t) *px++ * coeff);
+ *pScratchOut++ = in;
- while (blkCnt > 0U)
- {
- /* Perform Multiply-Accumulate */
- in = *pScratchOut + ((q31_t) * px++ * coeff);
- *pScratchOut++ = in;
+ /* Decrement loop counter */
+ blkCnt--;
+ }
- /* Decrement the loop counter */
- blkCnt--;
- }
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ in = *pScratchOut + ((q31_t) *px++ * coeff);
+ *pScratchOut++ = in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
/* All the output values are in pScratchOut buffer.
Convert them into 1.15 format, saturate and store in the destination buffer. */
- /* Loop over the blockSize. */
- blkCnt = blockSize >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
@@ -293,177 +310,32 @@ void arm_fir_sparse_q7(
in3 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
in4 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
- *__SIMD32(pOut)++ = __PACKq7(in1, in2, in3, in4);
+ write_q7x4_ia (&pOut, __PACKq7(in1, in2, in3, in4));
- /* Decrement the blockSize loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4,
- remaining samples are processed in the below loop */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
- while (blkCnt > 0U)
- {
- *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
-
- /* Decrement the blockSize loop counter */
- blkCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
-
- /* BlockSize of Input samples are copied into the state buffer */
- /* StateIndex points to the starting position to write in the state buffer */
- arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
- blockSize);
-
- /* Loop over the number of taps. */
- tapCnt = numTaps;
-
- /* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
-
- /* Wraparound of readIndex */
- if (readIndex < 0)
- {
- readIndex += (int32_t) delaySize;
- }
-
- /* Working pointer for state buffer is updated */
- py = pState;
-
- /* blockSize samples are read from the state buffer */
- arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
- (int32_t) blockSize, 1, blockSize);
-
- /* Working pointer for the scratch buffer of state values */
- px = pb;
-
- /* Working pointer for scratch buffer of output values */
- pScratchOut = pScr2;
-
- /* Loop over the blockSize */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
- while (blkCnt > 0U)
- {
- /* Perform multiplication and store in the scratch buffer */
- *pScratchOut++ = ((q31_t) * px++ * coeff);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Load the coefficient value and
- * increment the coefficient buffer for the next set of state values */
- coeff = *pCoeffs++;
-
- /* Read Index, from where the state buffer should be read, is calculated. */
- readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
-
- /* Wraparound of readIndex */
- if (readIndex < 0)
- {
- readIndex += (int32_t) delaySize;
- }
-
- /* Loop over the number of taps. */
- tapCnt = (uint32_t) numTaps - 2U;
-
- while (tapCnt > 0U)
- {
- /* Working pointer for state buffer is updated */
- py = pState;
-
- /* blockSize samples are read from the state buffer */
- arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
- (int32_t) blockSize, 1, blockSize);
-
- /* Working pointer for the scratch buffer of state values */
- px = pb;
-
- /* Working pointer for scratch buffer of output values */
- pScratchOut = pScr2;
-
- /* Loop over the blockSize */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* Perform Multiply-Accumulate */
- in = *pScratchOut + ((q31_t) * px++ * coeff);
- *pScratchOut++ = in;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Load the coefficient value and
- * increment the coefficient buffer for the next set of state values */
- coeff = *pCoeffs++;
-
- /* Read Index, from where the state buffer should be read, is calculated. */
- readIndex =
- ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
-
- /* Wraparound of readIndex */
- if (readIndex < 0)
- {
- readIndex += (int32_t) delaySize;
- }
-
- /* Decrement the tap loop counter */
- tapCnt--;
- }
-
- /* Compute last tap without the final read of pTapDelay */
-
- /* Working pointer for state buffer is updated */
- py = pState;
-
- /* blockSize samples are read from the state buffer */
- arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
- (int32_t) blockSize, 1, blockSize);
-
- /* Working pointer for the scratch buffer of state values */
- px = pb;
-
- /* Working pointer for scratch buffer of output values */
- pScratchOut = pScr2;
-
- /* Loop over the blockSize */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* Perform Multiply-Accumulate */
- in = *pScratchOut + ((q31_t) * px++ * coeff);
- *pScratchOut++ = in;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* All the output values are in pScratchOut buffer.
- Convert them into 1.15 format, saturate and store in the destination buffer. */
- /* Loop over the blockSize. */
- blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
*pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
- /* Decrement the blockSize loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of FIR_Sparse group
+ @} end of FIR_Sparse group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_iir_lattice_f32.c b/DSP/Source/FilteringFunctions/arm_iir_lattice_f32.c
index 424be38..c48efe3 100644
--- a/DSP/Source/FilteringFunctions/arm_iir_lattice_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_iir_lattice_f32.c
@@ -3,13 +3,13 @@
* Title: arm_iir_lattice_f32.c
* Description: Floating-point IIR Lattice filter processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,120 +29,118 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @defgroup IIR_Lattice Infinite Impulse Response (IIR) Lattice Filters
- *
- * This set of functions implements lattice filters
- * for Q15, Q31 and floating-point data types. Lattice filters are used in a
- * variety of adaptive filter applications. The filter structure has feedforward and
- * feedback components and the net impulse response is infinite length.
- * The functions operate on blocks
- * of input and output data and each call to the function processes
- * blockSize samples through the filter. pSrc and
- * pDst point to input and output arrays containing blockSize values.
+ @defgroup IIR_Lattice Infinite Impulse Response (IIR) Lattice Filters
- * \par Algorithm:
- * \image html IIRLattice.gif "Infinite Impulse Response Lattice filter"
- *
- * fN(n) = x(n)
- * fm-1(n) = fm(n) - km * gm-1(n-1) for m = N, N-1, ...1
- * gm(n) = km * fm-1(n) + gm-1(n-1) for m = N, N-1, ...1
- * y(n) = vN * gN(n) + vN-1 * gN-1(n) + ...+ v0 * g0(n)
- *
- * \par
- * pkCoeffs points to array of reflection coefficients of size numStages.
- * Reflection coefficients are stored in time-reversed order.
- * \par
- *
- * {kN, kN-1, ....k1}
- *
- * pvCoeffs points to the array of ladder coefficients of size (numStages+1).
- * Ladder coefficients are stored in time-reversed order.
- * \par
- *
- * {vN, vN-1, ...v0}
- *
- * pState points to a state array of size numStages + blockSize.
- * The state variables shown in the figure above (the g values) are stored in the pState array.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
- * \par Instance Structure
- * The coefficients and state variables for a filter are stored together in an instance data structure.
- * A separate instance structure must be defined for each filter.
- * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
- * There are separate instance structure declarations for each of the 3 supported data types.
- *
- * \par Initialization Functions
- * There is also an associated initialization function for each data type.
- * The initialization function performs the following operations:
- * - Sets the values of the internal structure fields.
- * - Zeros out the values in the state buffer.
- * To do this manually without calling the init function, assign the follow subfields of the instance structure:
- * numStages, pkCoeffs, pvCoeffs, pState. Also set all of the values in pState to zero.
- *
- * \par
- * Use of the initialization function is optional.
- * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
- * To place an instance structure into a const data section, the instance structure must be manually initialized.
- * Set the values in the state buffer to zeros and then manually initialize the instance structure as follows:
- *
- *arm_iir_lattice_instance_f32 S = {numStages, pState, pkCoeffs, pvCoeffs};
- *arm_iir_lattice_instance_q31 S = {numStages, pState, pkCoeffs, pvCoeffs};
- *arm_iir_lattice_instance_q15 S = {numStages, pState, pkCoeffs, pvCoeffs};
- *
- * \par
- * where numStages is the number of stages in the filter; pState points to the state buffer array;
- * pkCoeffs points to array of the reflection coefficients; pvCoeffs points to the array of ladder coefficients.
- * \par Fixed-Point Behavior
- * Care must be taken when using the fixed-point versions of the IIR lattice filter functions.
- * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
- * Refer to the function specific documentation below for usage guidelines.
+ This set of functions implements lattice filters
+ for Q15, Q31 and floating-point data types. Lattice filters are used in a
+ variety of adaptive filter applications. The filter structure has feedforward and
+ feedback components and the net impulse response is infinite length.
+ The functions operate on blocks
+ of input and output data and each call to the function processes
+ blockSize samples through the filter. pSrc and
+ pDst point to input and output arrays containing blockSize values.
+
+ @par Algorithm
+ \image html IIRLattice.gif "Infinite Impulse Response Lattice filter"
+ @par
+
+ fN(n) = x(n)
+ fm-1(n) = fm(n) - km * gm-1(n-1) for m = N, N-1, ..., 1
+ gm(n) = km * fm-1(n) + gm-1(n-1) for m = N, N-1, ..., 1
+ y(n) = vN * gN(n) + vN-1 * gN-1(n) + ...+ v0 * g0(n)
+
+ @par
+ pkCoeffs points to array of reflection coefficients of size numStages.
+ Reflection Coefficients are stored in time-reversed order.
+ @par
+
+ {kN, kN-1, ..., k1}
+
+ @par
+ pvCoeffs points to the array of ladder coefficients of size (numStages+1).
+ Ladder coefficients are stored in time-reversed order.
+
+ {vN, vN-1, ..., v0}
+
+ @par
+ pState points to a state array of size numStages + blockSize.
+ The state variables shown in the figure above (the g values) are stored in the pState array.
+ The state variables are updated after each block of data is processed; the coefficients are untouched.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter.
+ Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ There are separate instance structure declarations for each of the 3 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ To do this manually without calling the init function, assign the follow subfields of the instance structure:
+ numStages, pkCoeffs, pvCoeffs, pState. Also set all of the values in pState to zero.
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ Set the values in the state buffer to zeros and then manually initialize the instance structure as follows:
+
+ arm_iir_lattice_instance_f32 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ arm_iir_lattice_instance_q31 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ arm_iir_lattice_instance_q15 S = {numStages, pState, pkCoeffs, pvCoeffs};
+
+ @par
+ where numStages is the number of stages in the filter; pState points to the state buffer array;
+ pkCoeffs points to array of the reflection coefficients; pvCoeffs points to the array of ladder coefficients.
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the fixed-point versions of the IIR lattice filter functions.
+ In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ Refer to the function specific documentation below for usage guidelines.
*/
/**
- * @addtogroup IIR_Lattice
- * @{
+ @addtogroup IIR_Lattice
+ @{
*/
/**
- * @brief Processing function for the floating-point IIR lattice filter.
- * @param[in] *S points to an instance of the floating-point IIR lattice structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in] blockSize number of samples to process.
- * @return none.
+ @brief Processing function for the floating-point IIR lattice filter.
+ @param[in] S points to an instance of the floating-point IIR lattice structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
*/
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
void arm_iir_lattice_f32(
const arm_iir_lattice_instance_f32 * S,
- float32_t * pSrc,
- float32_t * pDst,
- uint32_t blockSize)
-{
- float32_t fnext1, gcurr1, gnext; /* Temporary variables for lattice stages */
- float32_t acc; /* Accumlator */
- uint32_t blkCnt, tapCnt; /* temporary variables for counts */
- float32_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */
- uint32_t numStages = S->numStages; /* number of stages */
- float32_t *pState; /* State pointer */
- float32_t *pStateCurnt; /* State current pointer */
- float32_t k1, k2;
- float32_t v1, v2, v3, v4;
- float32_t gcurr2;
- float32_t fnext2;
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ float32_t *pStateCur; /* State current pointer */
+ float32_t acc; /* Accumlator */
+ float32_t fnext1, fnext2, gcurr1, gnext; /* Temporary variables for lattice stages */
+ float32_t *px1, *px2, *pk, *pv; /* Temporary pointers for state and coef */
+ uint32_t numStages = S->numStages; /* Number of stages */
+ uint32_t blkCnt, tapCnt; /* Temporary variables for counts */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ float32_t gcurr2; /* Temporary variables for lattice stages */
+ float32_t k1, k2;
+ float32_t v1, v2, v3, v4;
+#endif
/* initialise loop count */
blkCnt = blockSize;
- /* initialise state pointer */
- pState = &S->pState[0];
-
/* Sample processing */
while (blkCnt > 0U)
{
@@ -152,19 +150,23 @@ void arm_iir_lattice_f32(
/* Initialize Ladder coeff pointer */
pv = &S->pvCoeffs[0];
+
/* Initialize Reflection coeff pointer */
pk = &S->pkCoeffs[0];
/* Initialize state read pointer */
px1 = pState;
+
/* Initialize state write pointer */
px2 = pState;
/* Set accumulator to zero */
acc = 0.0;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = (numStages) >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numStages) >> 2U;
while (tapCnt > 0U)
{
@@ -259,12 +261,19 @@ void arm_iir_lattice_f32(
px1 += 4U;
pv += 4U;
+ /* Decrement loop counter */
tapCnt--;
-
}
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
- tapCnt = (numStages) % 0x4U;
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numStages % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numStages;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (tapCnt > 0U)
{
@@ -277,8 +286,8 @@ void arm_iir_lattice_f32(
*px2++ = gnext;
fnext2 = fnext1;
+ /* Decrement loop counter */
tapCnt--;
-
}
/* y(n) += g0(n) * v0 */
@@ -292,144 +301,54 @@ void arm_iir_lattice_f32(
/* Advance the state pointer by 4 to process the next group of 4 samples */
pState = pState + 1U;
+ /* Decrement loop counter */
blkCnt--;
-
}
/* Processing is complete. Now copy last S->numStages samples to start of the buffer
for the preperation of next frame process */
/* Points to the start of the state buffer */
- pStateCurnt = &S->pState[0];
+ pStateCur = &S->pState[0];
pState = &S->pState[blockSize];
+ /* Copy data */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
tapCnt = numStages >> 2U;
- /* copy data */
while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
- tapCnt--;
-
- }
-
- /* Calculate remaining number of copies */
- tapCnt = (numStages) % 0x4U;
-
- /* Copy the remaining q31_t data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
-}
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numStages % 0x4U;
#else
-void arm_iir_lattice_f32(
- const arm_iir_lattice_instance_f32 * S,
- float32_t * pSrc,
- float32_t * pDst,
- uint32_t blockSize)
-{
- float32_t fcurr, fnext = 0, gcurr, gnext; /* Temporary variables for lattice stages */
- float32_t acc; /* Accumlator */
- uint32_t blkCnt, tapCnt; /* temporary variables for counts */
- float32_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */
- uint32_t numStages = S->numStages; /* number of stages */
- float32_t *pState; /* State pointer */
- float32_t *pStateCurnt; /* State current pointer */
-
-
- /* Run the below code for Cortex-M0 */
-
- blkCnt = blockSize;
-
- pState = &S->pState[0];
-
- /* Sample processing */
- while (blkCnt > 0U)
- {
- /* Read Sample from input buffer */
- /* fN(n) = x(n) */
- fcurr = *pSrc++;
-
- /* Initialize state read pointer */
- px1 = pState;
- /* Initialize state write pointer */
- px2 = pState;
- /* Set accumulator to zero */
- acc = 0.0f;
- /* Initialize Ladder coeff pointer */
- pv = &S->pvCoeffs[0];
- /* Initialize Reflection coeff pointer */
- pk = &S->pkCoeffs[0];
-
-
- /* Process sample for numStages */
- tapCnt = numStages;
-
- while (tapCnt > 0U)
- {
- gcurr = *px1++;
- /* Process sample for last taps */
- fnext = fcurr - ((*pk) * gcurr);
- gnext = (fnext * (*pk++)) + gcurr;
-
- /* Output samples for last taps */
- acc += (gnext * (*pv++));
- *px2++ = gnext;
- fcurr = fnext;
-
- /* Decrementing loop counter */
- tapCnt--;
-
- }
-
- /* y(n) += g0(n) * v0 */
- acc += (fnext * (*pv));
-
- *px2++ = fnext;
-
- /* write out into pDst */
- *pDst++ = acc;
-
- /* Advance the state pointer by 1 to process the next group of samples */
- pState = pState + 1U;
- blkCnt--;
-
- }
-
- /* Processing is complete. Now copy last S->numStages samples to start of the buffer
- for the preperation of next frame process */
-
- /* Points to the start of the state buffer */
- pStateCurnt = &S->pState[0];
- pState = &S->pState[blockSize];
-
+ /* Initialize blkCnt with number of samples */
tapCnt = numStages;
- /* Copy the data */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
-
/**
- * @} end of IIR_Lattice group
+ @} end of IIR_Lattice group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_iir_lattice_init_f32.c b/DSP/Source/FilteringFunctions/arm_iir_lattice_init_f32.c
index 530c7ff..bd9f933 100644
--- a/DSP/Source/FilteringFunctions/arm_iir_lattice_init_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_iir_lattice_init_f32.c
@@ -3,13 +3,13 @@
* Title: arm_iir_lattice_init_f32.c
* Description: Floating-point IIR lattice filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,23 +29,23 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup IIR_Lattice
- * @{
+ @addtogroup IIR_Lattice
+ @{
*/
/**
- * @brief Initialization function for the floating-point IIR lattice filter.
- * @param[in] *S points to an instance of the floating-point IIR lattice structure.
- * @param[in] numStages number of stages in the filter.
- * @param[in] *pkCoeffs points to the reflection coefficient buffer. The array is of length numStages.
- * @param[in] *pvCoeffs points to the ladder coefficient buffer. The array is of length numStages+1.
- * @param[in] *pState points to the state buffer. The array is of length numStages+blockSize.
- * @param[in] blockSize number of samples to process.
- * @return none.
+ @brief Initialization function for the floating-point IIR lattice filter.
+ @param[in] S points to an instance of the floating-point IIR lattice structure
+ @param[in] numStages number of stages in the filter
+ @param[in] pkCoeffs points to reflection coefficient buffer. The array is of length numStages
+ @param[in] pvCoeffs points to ladder coefficient buffer. The array is of length numStages+1
+ @param[in] pState points to state buffer. The array is of length numStages+blockSize
+ @param[in] blockSize number of samples to process
+ @return none
*/
void arm_iir_lattice_init_f32(
@@ -70,10 +70,8 @@ void arm_iir_lattice_init_f32(
/* Assign state pointer */
S->pState = pState;
-
-
}
- /**
- * @} end of IIR_Lattice group
- */
+/**
+ @} end of IIR_Lattice group
+ */
diff --git a/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q15.c b/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q15.c
index 9b991f8..01abf48 100644
--- a/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q15.c
@@ -3,13 +3,13 @@
* Title: arm_iir_lattice_init_q15.c
* Description: Q15 IIR lattice filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,24 +29,24 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup IIR_Lattice
- * @{
+ @addtogroup IIR_Lattice
+ @{
*/
- /**
- * @brief Initialization function for the Q15 IIR lattice filter.
- * @param[in] *S points to an instance of the Q15 IIR lattice structure.
- * @param[in] numStages number of stages in the filter.
- * @param[in] *pkCoeffs points to reflection coefficient buffer. The array is of length numStages.
- * @param[in] *pvCoeffs points to ladder coefficient buffer. The array is of length numStages+1.
- * @param[in] *pState points to state buffer. The array is of length numStages+blockSize.
- * @param[in] blockSize number of samples to process per call.
- * @return none.
- */
+/**
+ @brief Initialization function for the Q15 IIR lattice filter.
+ @param[in] S points to an instance of the Q15 IIR lattice structure
+ @param[in] numStages number of stages in the filter
+ @param[in] pkCoeffs points to reflection coefficient buffer. The array is of length numStages
+ @param[in] pvCoeffs points to ladder coefficient buffer. The array is of length numStages+1
+ @param[in] pState points to state buffer. The array is of length numStages+blockSize
+ @param[in] blockSize number of samples to process
+ @return none
+ */
void arm_iir_lattice_init_q15(
arm_iir_lattice_instance_q15 * S,
@@ -70,10 +70,8 @@ void arm_iir_lattice_init_q15(
/* Assign state pointer */
S->pState = pState;
-
-
}
/**
- * @} end of IIR_Lattice group
+ @} end of IIR_Lattice group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q31.c b/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q31.c
index 1543206..b472f6c 100644
--- a/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q31.c
@@ -3,13 +3,13 @@
* Title: arm_iir_lattice_init_q31.c
* Description: Initialization function for the Q31 IIR lattice filter
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,24 +29,24 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup IIR_Lattice
- * @{
+ @addtogroup IIR_Lattice
+ @{
*/
- /**
- * @brief Initialization function for the Q31 IIR lattice filter.
- * @param[in] *S points to an instance of the Q31 IIR lattice structure.
- * @param[in] numStages number of stages in the filter.
- * @param[in] *pkCoeffs points to the reflection coefficient buffer. The array is of length numStages.
- * @param[in] *pvCoeffs points to the ladder coefficient buffer. The array is of length numStages+1.
- * @param[in] *pState points to the state buffer. The array is of length numStages+blockSize.
- * @param[in] blockSize number of samples to process.
- * @return none.
- */
+/**
+ @brief Initialization function for the Q31 IIR lattice filter.
+ @param[in] S points to an instance of the Q31 IIR lattice structure
+ @param[in] numStages number of stages in the filter
+ @param[in] pkCoeffs points to reflection coefficient buffer. The array is of length numStages
+ @param[in] pvCoeffs points to ladder coefficient buffer. The array is of length numStages+1
+ @param[in] pState points to state buffer. The array is of length numStages+blockSize
+ @param[in] blockSize number of samples to process
+ @return none
+ */
void arm_iir_lattice_init_q31(
arm_iir_lattice_instance_q31 * S,
@@ -70,10 +70,8 @@ void arm_iir_lattice_init_q31(
/* Assign state pointer */
S->pState = pState;
-
-
}
/**
- * @} end of IIR_Lattice group
+ @} end of IIR_Lattice group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_iir_lattice_q15.c b/DSP/Source/FilteringFunctions/arm_iir_lattice_q15.c
index 8f68068..9dbea81 100644
--- a/DSP/Source/FilteringFunctions/arm_iir_lattice_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_iir_lattice_q15.c
@@ -1,15 +1,15 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_iir_lattice_q15.c
- * Description: Q15 IIR lattice filter processing function
+ * Description: Q15 IIR Lattice filter processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,64 +29,55 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup IIR_Lattice
- * @{
+ @addtogroup IIR_Lattice
+ @{
*/
/**
- * @brief Processing function for the Q15 IIR lattice filter.
- * @param[in] *S points to an instance of the Q15 IIR lattice structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in] blockSize number of samples to process.
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
- * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
- * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
- * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
- * Lastly, the accumulator is saturated to yield a result in 1.15 format.
+ @brief Processing function for the Q15 IIR lattice filter.
+ @param[in] S points to an instance of the Q15 IIR lattice structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+ Lastly, the accumulator is saturated to yield a result in 1.15 format.
*/
void arm_iir_lattice_q15(
const arm_iir_lattice_instance_q15 * S,
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t blockSize)
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
{
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pStateCur; /* State current pointer */
+ q31_t fcurr, fnext = 0, gcurr = 0, gnext; /* Temporary variables for lattice stages */
+ q63_t acc; /* Accumlator */
+ q15_t *px1, *px2, *pk, *pv; /* Temporary pointers for state and coef */
+ uint32_t numStages = S->numStages; /* Number of stages */
+ uint32_t blkCnt, tapCnt; /* Temporary variables for counts */
+ q15_t out; /* Temporary variable for output */
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q31_t fcurr, fnext, gcurr = 0, gnext; /* Temporary variables for lattice stages */
- q15_t gnext1, gnext2; /* Temporary variables for lattice stages */
- uint32_t stgCnt; /* Temporary variables for counts */
- q63_t acc; /* Accumlator */
- uint32_t blkCnt, tapCnt; /* Temporary variables for counts */
- q15_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */
- uint32_t numStages = S->numStages; /* number of stages */
- q15_t *pState; /* State pointer */
- q15_t *pStateCurnt; /* State current pointer */
- q15_t out; /* Temporary variable for output */
- q31_t v; /* Temporary variable for ladder coefficient */
-#ifdef UNALIGNED_SUPPORT_DISABLE
- q15_t v1, v2;
+#if defined (ARM_MATH_DSP) && defined (ARM_MATH_LOOPUNROLL)
+ q15_t gnext1, gnext2; /* Temporary variables for lattice stages */
+ q31_t v; /* Temporary variable for ladder coefficient */
#endif
-
+ /* initialise loop count */
blkCnt = blockSize;
- pState = &S->pState[0];
+#if defined (ARM_MATH_DSP)
/* Sample processing */
while (blkCnt > 0U)
@@ -95,57 +86,62 @@ void arm_iir_lattice_q15(
/* fN(n) = x(n) */
fcurr = *pSrc++;
- /* Initialize state read pointer */
- px1 = pState;
- /* Initialize state write pointer */
- px2 = pState;
- /* Set accumulator to zero */
- acc = 0;
/* Initialize Ladder coeff pointer */
pv = &S->pvCoeffs[0];
+
/* Initialize Reflection coeff pointer */
pk = &S->pkCoeffs[0];
+ /* Initialize state read pointer */
+ px1 = pState;
+
+ /* Initialize state write pointer */
+ px2 = pState;
+
+ /* Set accumulator to zero */
+ acc = 0;
/* Process sample for first tap */
gcurr = *px1++;
/* fN-1(n) = fN(n) - kN * gN-1(n-1) */
fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15);
fnext = __SSAT(fnext, 16);
+
/* gN(n) = kN * fN-1(n) + gN-1(n-1) */
gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr;
gnext = __SSAT(gnext, 16);
+
/* write gN(n) into state for next sample processing */
*px2++ = (q15_t) gnext;
- /* y(n) += gN(n) * vN */
- acc += (q31_t) ((gnext * (*pv++)));
+ /* y(n) += gN(n) * vN */
+ acc += (q31_t) ((gnext * (*pv++)));
/* Update f values for next coefficient processing */
fcurr = fnext;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = (numStages - 1U) >> 2;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numStages - 1U) >> 2U;
while (tapCnt > 0U)
{
-
/* Process sample for 2nd, 6th ...taps */
/* Read gN-2(n-1) from state buffer */
gcurr = *px1++;
- /* Process sample for 2nd, 6th .. taps */
/* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */
fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15);
fnext = __SSAT(fnext, 16);
/* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */
gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr;
gnext1 = (q15_t) __SSAT(gnext, 16);
- /* write gN-1(n) into state */
+ /* write gN-1(n) into state for next sample processing */
*px2++ = (q15_t) gnext1;
-
/* Process sample for 3nd, 7th ...taps */
- /* Read gN-3(n-1) from state */
+ /* Read gN-3(n-1) from state buffer */
gcurr = *px1++;
/* Process sample for 3rd, 7th .. taps */
/* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */
@@ -158,39 +154,15 @@ void arm_iir_lattice_q15(
*px2++ = (q15_t) gnext2;
/* Read vN-1 and vN-2 at a time */
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
- v = *__SIMD32(pv)++;
-
-#else
-
- v1 = *pv++;
- v2 = *pv++;
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- v = __PKHBT(v1, v2, 16);
-
-#else
-
- v = __PKHBT(v2, v1, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
+ v = read_q15x2_ia (&pv);
/* Pack gN-1(n) and gN-2(n) */
#ifndef ARM_MATH_BIG_ENDIAN
-
gnext = __PKHBT(gnext1, gnext2, 16);
-
#else
-
gnext = __PKHBT(gnext2, gnext1, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* y(n) += gN-1(n) * vN-1 */
/* process for gN-5(n) * vN-5, gN-9(n) * vN-9 ... */
@@ -198,9 +170,8 @@ void arm_iir_lattice_q15(
/* process for gN-6(n) * vN-6, gN-10(n) * vN-10 ... */
acc = __SMLALD(gnext, v, acc);
-
/* Process sample for 4th, 8th ...taps */
- /* Read gN-4(n-1) from state */
+ /* Read gN-4(n-1) from state buffer */
gcurr = *px1++;
/* Process sample for 4th, 8th .. taps */
/* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */
@@ -212,9 +183,8 @@ void arm_iir_lattice_q15(
/* write gN-3(n) for the next sample process */
*px2++ = (q15_t) gnext1;
-
/* Process sample for 5th, 9th ...taps */
- /* Read gN-5(n-1) from state */
+ /* Read gN-5(n-1) from state buffer */
gcurr = *px1++;
/* Process sample for 5th, 9th .. taps */
/* fN-5(n) = fN-4(n) - kN-4 * gN-5(n-1) */
@@ -227,38 +197,14 @@ void arm_iir_lattice_q15(
*px2++ = (q15_t) gnext2;
/* Read vN-3 and vN-4 at a time */
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
- v = *__SIMD32(pv)++;
-
-#else
-
- v1 = *pv++;
- v2 = *pv++;
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
- v = __PKHBT(v1, v2, 16);
-
-#else
-
- v = __PKHBT(v2, v1, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
+ v = read_q15x2_ia (&pv);
/* Pack gN-3(n) and gN-4(n) */
-#ifndef ARM_MATH_BIG_ENDIAN
-
+#ifndef ARM_MATH_BIG_ENDIAN
gnext = __PKHBT(gnext1, gnext2, 16);
-
#else
-
gnext = __PKHBT(gnext2, gnext1, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* y(n) += gN-4(n) * vN-4 */
/* process for gN-8(n) * vN-8, gN-12(n) * vN-12 ... */
@@ -266,15 +212,22 @@ void arm_iir_lattice_q15(
/* process for gN-7(n) * vN-7, gN-11(n) * vN-11 ... */
acc = __SMLALD(gnext, v, acc);
+ /* Decrement loop counter */
tapCnt--;
-
}
fnext = fcurr;
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining taps */
tapCnt = (numStages - 1U) % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ tapCnt = (numStages - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
gcurr = *px1++;
@@ -283,11 +236,13 @@ void arm_iir_lattice_q15(
fnext = __SSAT(fnext, 16);
gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr;
gnext = __SSAT(gnext, 16);
+
/* Output samples for last taps */
acc += (q31_t) (((q31_t) gnext * (*pv++)));
*px2++ = (q15_t) gnext;
fcurr = fnext;
+ /* Decrement loop counter */
tapCnt--;
}
@@ -302,70 +257,52 @@ void arm_iir_lattice_q15(
/* Advance the state pointer by 4 to process the next group of 4 samples */
pState = pState + 1U;
- blkCnt--;
+ /* Decrement loop counter */
+ blkCnt--;
}
/* Processing is complete. Now copy last S->numStages samples to start of the buffer
for the preperation of next frame process */
+
/* Points to the start of the state buffer */
- pStateCurnt = &S->pState[0];
+ pStateCur = &S->pState[0];
pState = &S->pState[blockSize];
- stgCnt = (numStages >> 2U);
-
/* copy data */
- while (stgCnt > 0U)
- {
-#ifndef UNALIGNED_SUPPORT_DISABLE
+#if defined (ARM_MATH_LOOPUNROLL)
- *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
- *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numStages >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
+ write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numStages % 0x4U;
#else
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
+ /* Initialize blkCnt with number of samples */
+ tapCnt = (numStages - 1U);
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
- /* Decrement the loop counter */
- stgCnt--;
-
- }
-
- /* Calculation of count for remaining q15_t data */
- stgCnt = (numStages) % 0x4U;
-
- /* copy data */
- while (stgCnt > 0U)
+ while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
- stgCnt--;
+ /* Decrement loop counter */
+ tapCnt--;
}
-#else
-
- /* Run the below code for Cortex-M0 */
-
- q31_t fcurr, fnext = 0, gcurr = 0, gnext; /* Temporary variables for lattice stages */
- uint32_t stgCnt; /* Temporary variables for counts */
- q63_t acc; /* Accumlator */
- uint32_t blkCnt, tapCnt; /* Temporary variables for counts */
- q15_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */
- uint32_t numStages = S->numStages; /* number of stages */
- q15_t *pState; /* State pointer */
- q15_t *pStateCurnt; /* State current pointer */
- q15_t out; /* Temporary variable for output */
-
-
- blkCnt = blockSize;
-
- pState = &S->pState[0];
+#else /* #if defined (ARM_MATH_DSP) */
/* Sample processing */
while (blkCnt > 0U)
@@ -374,17 +311,21 @@ void arm_iir_lattice_q15(
/* fN(n) = x(n) */
fcurr = *pSrc++;
- /* Initialize state read pointer */
- px1 = pState;
- /* Initialize state write pointer */
- px2 = pState;
- /* Set accumulator to zero */
- acc = 0;
/* Initialize Ladder coeff pointer */
pv = &S->pvCoeffs[0];
+
/* Initialize Reflection coeff pointer */
pk = &S->pkCoeffs[0];
+ /* Initialize state read pointer */
+ px1 = pState;
+
+ /* Initialize state write pointer */
+ px2 = pState;
+
+ /* Set accumulator to zero */
+ acc = 0;
+
tapCnt = numStages;
while (tapCnt > 0U)
@@ -394,14 +335,18 @@ void arm_iir_lattice_q15(
/* fN-1(n) = fN(n) - kN * gN-1(n-1) */
fnext = fcurr - ((gcurr * (*pk)) >> 15);
fnext = __SSAT(fnext, 16);
+
/* gN(n) = kN * fN-1(n) + gN-1(n-1) */
gnext = ((fnext * (*pk++)) >> 15) + gcurr;
gnext = __SSAT(gnext, 16);
+
/* Output samples */
/* y(n) += gN(n) * vN */
acc += (q31_t) ((gnext * (*pv++)));
+
/* write gN(n) into state for next sample processing */
*px2++ = (q15_t) gnext;
+
/* Update f values for next coefficient processing */
fcurr = fnext;
@@ -419,34 +364,33 @@ void arm_iir_lattice_q15(
/* Advance the state pointer by 1 to process the next group of samples */
pState = pState + 1U;
- blkCnt--;
+ /* Decrement loop counter */
+ blkCnt--;
}
/* Processing is complete. Now copy last S->numStages samples to start of the buffer
for the preperation of next frame process */
+
/* Points to the start of the state buffer */
- pStateCurnt = &S->pState[0];
+ pStateCur = &S->pState[0];
pState = &S->pState[blockSize];
- stgCnt = numStages;
+ tapCnt = numStages;
- /* copy data */
- while (stgCnt > 0U)
+ /* Copy data */
+ while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
- stgCnt--;
+ /* Decrement loop counter */
+ tapCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_DSP) */
}
-
-
-
/**
- * @} end of IIR_Lattice group
+ @} end of IIR_Lattice group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_iir_lattice_q31.c b/DSP/Source/FilteringFunctions/arm_iir_lattice_q31.c
index a14dd7a..c4b9a76 100644
--- a/DSP/Source/FilteringFunctions/arm_iir_lattice_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_iir_lattice_q31.c
@@ -1,15 +1,15 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_iir_lattice_q31.c
- * Description: Q31 IIR lattice filter processing function
+ * Description: Q31 IIR Lattice filter processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,55 +29,50 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup IIR_Lattice
- * @{
+ @addtogroup IIR_Lattice
+ @{
*/
/**
- * @brief Processing function for the Q31 IIR lattice filter.
- * @param[in] *S points to an instance of the Q31 IIR lattice structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in] blockSize number of samples to process.
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * Thus, if the accumulator result overflows it wraps around rather than clip.
- * In order to avoid overflows completely the input signal must be scaled down by 2*log2(numStages) bits.
- * After all multiply-accumulates are performed, the 2.62 accumulator is saturated to 1.32 format and then truncated to 1.31 format.
+ @brief Processing function for the Q31 IIR lattice filter.
+ @param[in] S points to an instance of the Q31 IIR lattice structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around rather than clip.
+ In order to avoid overflows completely the input signal must be scaled down by 2*log2(numStages) bits.
+ After all multiply-accumulates are performed, the 2.62 accumulator is saturated to 1.32 format and then truncated to 1.31 format.
*/
void arm_iir_lattice_q31(
const arm_iir_lattice_instance_q31 * S,
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
-{
- q31_t fcurr, fnext = 0, gcurr = 0, gnext; /* Temporary variables for lattice stages */
- q63_t acc; /* Accumlator */
- uint32_t blkCnt, tapCnt; /* Temporary variables for counts */
- q31_t *px1, *px2, *pk, *pv; /* Temporary pointers for state and coef */
- uint32_t numStages = S->numStages; /* number of stages */
- q31_t *pState; /* State pointer */
- q31_t *pStateCurnt; /* State current pointer */
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q31_t *pState = S->pState; /* State pointer */
+ q31_t *pStateCur; /* State current pointer */
+ q31_t fcurr, fnext = 0, gcurr = 0, gnext; /* Temporary variables for lattice stages */
+ q63_t acc; /* Accumlator */
+ q31_t *px1, *px2, *pk, *pv; /* Temporary pointers for state and coef */
+ uint32_t numStages = S->numStages; /* Number of stages */
+ uint32_t blkCnt, tapCnt; /* Temporary variables for counts */
+
+ /* initialise loop count */
blkCnt = blockSize;
- pState = &S->pState[0];
-
-
#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
/* Sample processing */
while (blkCnt > 0U)
{
@@ -85,43 +80,51 @@ void arm_iir_lattice_q31(
/* fN(n) = x(n) */
fcurr = *pSrc++;
- /* Initialize state read pointer */
- px1 = pState;
- /* Initialize state write pointer */
- px2 = pState;
- /* Set accumulator to zero */
- acc = 0;
/* Initialize Ladder coeff pointer */
pv = &S->pvCoeffs[0];
+
/* Initialize Reflection coeff pointer */
pk = &S->pkCoeffs[0];
+ /* Initialize state read pointer */
+ px1 = pState;
+
+ /* Initialize state write pointer */
+ px2 = pState;
+
+ /* Set accumulator to zero */
+ acc = 0;
/* Process sample for first tap */
gcurr = *px1++;
/* fN-1(n) = fN(n) - kN * gN-1(n-1) */
- fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk)) >> 31));
+ fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk )) >> 31));
+
/* gN(n) = kN * fN-1(n) + gN-1(n-1) */
gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31));
+
/* write gN-1(n-1) into state for next sample processing */
*px2++ = gnext;
- /* y(n) += gN(n) * vN */
+
+ /* y(n) += gN(n) * vN */
acc += ((q63_t) gnext * *pv++);
/* Update f values for next coefficient processing */
fcurr = fnext;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = (numStages - 1U) >> 2;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numStages - 1U) >> 2U;
while (tapCnt > 0U)
{
-
- /* Process sample for 2nd, 6th .. taps */
+ /* Process sample for 2nd, 6th ...taps */
/* Read gN-2(n-1) from state buffer */
gcurr = *px1++;
/* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */
- fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk)) >> 31));
+ fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk )) >> 31));
/* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */
gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31));
/* y(n) += gN-1(n) * vN-1 */
@@ -135,7 +138,7 @@ void arm_iir_lattice_q31(
gcurr = *px1++;
/* Process sample for 3rd, 7th .. taps */
/* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */
- fcurr = __QSUB(fnext, (q31_t) (((q63_t) gcurr * (*pk)) >> 31));
+ fcurr = __QSUB(fnext, (q31_t) (((q63_t) gcurr * (*pk )) >> 31));
/* gN-2(n) = kN-2 * fN-3(n) + gN-3(n-1) */
gnext = __QADD(gcurr, (q31_t) (((q63_t) fcurr * (*pk++)) >> 31));
/* y(n) += gN-2(n) * vN-2 */
@@ -144,13 +147,12 @@ void arm_iir_lattice_q31(
/* write gN-2(n) into state for next sample processing */
*px2++ = gnext;
-
/* Process sample for 4th, 8th ...taps */
/* Read gN-4(n-1) from state buffer */
gcurr = *px1++;
/* Process sample for 4th, 8th .. taps */
/* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */
- fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk)) >> 31));
+ fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk )) >> 31));
/* gN-3(n) = kN-3 * fN-4(n) + gN-4(n-1) */
gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31));
/* y(n) += gN-3(n) * vN-3 */
@@ -159,48 +161,55 @@ void arm_iir_lattice_q31(
/* write gN-3(n) into state for next sample processing */
*px2++ = gnext;
-
/* Process sample for 5th, 9th ...taps */
/* Read gN-5(n-1) from state buffer */
gcurr = *px1++;
/* Process sample for 5th, 9th .. taps */
/* fN-5(n) = fN-4(n) - kN-4 * gN-1(n-1) */
- fcurr = __QSUB(fnext, (q31_t) (((q63_t) gcurr * (*pk)) >> 31));
+ fcurr = __QSUB(fnext, (q31_t) (((q63_t) gcurr * (*pk )) >> 31));
/* gN-4(n) = kN-4 * fN-5(n) + gN-5(n-1) */
gnext = __QADD(gcurr, (q31_t) (((q63_t) fcurr * (*pk++)) >> 31));
/* y(n) += gN-4(n) * vN-4 */
/* process for gN-8(n) * vN-8, gN-12(n) * vN-12 ... */
acc += ((q63_t) gnext * *pv++);
+
/* write gN-4(n) into state for next sample processing */
*px2++ = gnext;
+ /* Decrement loop counter */
tapCnt--;
-
}
fnext = fcurr;
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining taps */
tapCnt = (numStages - 1U) % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ tapCnt = (numStages - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
gcurr = *px1++;
/* Process sample for last taps */
- fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk)) >> 31));
+ fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk )) >> 31));
gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31));
+
/* Output samples for last taps */
acc += ((q63_t) gnext * *pv++);
*px2++ = gnext;
fcurr = fnext;
+ /* Decrement loop counter */
tapCnt--;
-
}
/* y(n) += g0(n) * v0 */
- acc += (q63_t) fnext *(
- *pv++);
+ acc += ((q63_t) fnext * *pv++);
*px2++ = fnext;
@@ -209,47 +218,55 @@ void arm_iir_lattice_q31(
/* Advance the state pointer by 4 to process the next group of 4 samples */
pState = pState + 1U;
- blkCnt--;
+ /* Decrement loop counter */
+ blkCnt--;
}
/* Processing is complete. Now copy last S->numStages samples to start of the buffer
for the preperation of next frame process */
/* Points to the start of the state buffer */
- pStateCurnt = &S->pState[0];
+ pStateCur = &S->pState[0];
pState = &S->pState[blockSize];
+ /* Copy data */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
tapCnt = numStages >> 2U;
- /* copy data */
while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
-
}
- /* Calculate remaining number of copies */
- tapCnt = (numStages) % 0x4U;
-
- /* Copy the remaining q31_t data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- tapCnt--;
- };
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numStages % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
+ tapCnt = (numStages - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+#else /* #if defined (ARM_MATH_DSP) */
+
/* Sample processing */
while (blkCnt > 0U)
{
@@ -257,17 +274,21 @@ void arm_iir_lattice_q31(
/* fN(n) = x(n) */
fcurr = *pSrc++;
- /* Initialize state read pointer */
- px1 = pState;
- /* Initialize state write pointer */
- px2 = pState;
- /* Set accumulator to zero */
- acc = 0;
/* Initialize Ladder coeff pointer */
pv = &S->pvCoeffs[0];
+
/* Initialize Reflection coeff pointer */
pk = &S->pkCoeffs[0];
+ /* Initialize state read pointer */
+ px1 = pState;
+
+ /* Initialize state write pointer */
+ px2 = pState;
+
+ /* Set accumulator to zero */
+ acc = 0;
+
tapCnt = numStages;
while (tapCnt > 0U)
@@ -275,18 +296,18 @@ void arm_iir_lattice_q31(
gcurr = *px1++;
/* Process sample */
/* fN-1(n) = fN(n) - kN * gN-1(n-1) */
- fnext =
- clip_q63_to_q31(((q63_t) fcurr -
- ((q31_t) (((q63_t) gcurr * (*pk)) >> 31))));
+ fnext = clip_q63_to_q31(((q63_t) fcurr - ((q31_t) (((q63_t) gcurr * (*pk )) >> 31))));
+
/* gN(n) = kN * fN-1(n) + gN-1(n-1) */
- gnext =
- clip_q63_to_q31(((q63_t) gcurr +
- ((q31_t) (((q63_t) fnext * (*pk++)) >> 31))));
+ gnext = clip_q63_to_q31(((q63_t) gcurr + ((q31_t) (((q63_t) fnext * (*pk++)) >> 31))));
+
/* Output samples */
- /* y(n) += gN(n) * vN */
+ /* y(n) += gN(n) * vN */
acc += ((q63_t) gnext * *pv++);
+
/* write gN-1(n-1) into state for next sample processing */
*px2++ = gnext;
+
/* Update f values for next coefficient processing */
fcurr = fnext;
@@ -294,8 +315,7 @@ void arm_iir_lattice_q31(
}
/* y(n) += g0(n) * v0 */
- acc += (q63_t) fnext *(
- *pv++);
+ acc += ((q63_t) fnext * *pv++);
*px2++ = fnext;
@@ -304,35 +324,33 @@ void arm_iir_lattice_q31(
/* Advance the state pointer by 1 to process the next group of samples */
pState = pState + 1U;
- blkCnt--;
+ /* Decrement loop counter */
+ blkCnt--;
}
/* Processing is complete. Now copy last S->numStages samples to start of the buffer
for the preperation of next frame process */
/* Points to the start of the state buffer */
- pStateCurnt = &S->pState[0];
+ pStateCur = &S->pState[0];
pState = &S->pState[blockSize];
tapCnt = numStages;
- /* Copy the remaining q31_t data */
+ /* Copy data */
while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_DSP) */
}
-
-
-
/**
- * @} end of IIR_Lattice group
+ @} end of IIR_Lattice group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_lms_f32.c b/DSP/Source/FilteringFunctions/arm_lms_f32.c
index e5728b4..4fc6e7e 100644
--- a/DSP/Source/FilteringFunctions/arm_lms_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_lms_f32.c
@@ -3,13 +3,13 @@
* Title: arm_lms_f32.c
* Description: Processing function for the floating-point LMS filter
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,146 +29,143 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @defgroup LMS Least Mean Square (LMS) Filters
- *
- * LMS filters are a class of adaptive filters that are able to "learn" an unknown transfer functions.
- * LMS filters use a gradient descent method in which the filter coefficients are updated based on the instantaneous error signal.
- * Adaptive filters are often used in communication systems, equalizers, and noise removal.
- * The CMSIS DSP Library contains LMS filter functions that operate on Q15, Q31, and floating-point data types.
- * The library also contains normalized LMS filters in which the filter coefficient adaptation is indepedent of the level of the input signal.
- *
- * An LMS filter consists of two components as shown below.
- * The first component is a standard transversal or FIR filter.
- * The second component is a coefficient update mechanism.
- * The LMS filter has two input signals.
- * The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter.
- * That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input.
- * The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input.
- * This "error signal" tends towards zero as the filter adapts.
- * The LMS processing functions accept the input and reference input signals and generate the filter output and error signal.
- * \image html LMS.gif "Internal structure of the Least Mean Square filter"
- *
- * The functions operate on blocks of data and each call to the function processes
- * blockSize samples through the filter.
- * pSrc points to input signal, pRef points to reference signal,
- * pOut points to output signal and pErr points to error signal.
- * All arrays contain blockSize values.
- *
- * The functions operate on a block-by-block basis.
- * Internally, the filter coefficients b[n] are updated on a sample-by-sample basis.
- * The convergence of the LMS filter is slower compared to the normalized LMS algorithm.
- *
- * \par Algorithm:
- * The output signal y[n] is computed by a standard FIR filter:
- *
- *
- * \par
- * The error signal equals the difference between the reference signal d[n] and the filter output:
- *
- * e[n] = d[n] - y[n].
- *
- *
- * \par
- * After each sample of the error signal is computed, the filter coefficients b[k] are updated on a sample-by-sample basis:
- *
- * b[k] = b[k] + e[n] * mu * x[n-k], for k=0, 1, ..., numTaps-1
- *
- * where mu is the step size and controls the rate of coefficient convergence.
- *\par
- * In the APIs, pCoeffs points to a coefficient array of size numTaps.
- * Coefficients are stored in time reversed order.
- * \par
- *
- * \par
- * Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1 samples.
- * The increased state buffer length allows circular addressing, which is traditionally used in FIR filters,
- * to be avoided and yields a significant speed improvement.
- * The state variables are updated after each block of data is processed.
- * \par Instance Structure
- * The coefficients and state variables for a filter are stored together in an instance data structure.
- * A separate instance structure must be defined for each filter and
- * coefficient and state arrays cannot be shared among instances.
- * There are separate instance structure declarations for each of the 3 supported data types.
- *
- * \par Initialization Functions
- * There is also an associated initialization function for each data type.
- * The initialization function performs the following operations:
- * - Sets the values of the internal structure fields.
- * - Zeros out the values in the state buffer.
- * To do this manually without calling the init function, assign the follow subfields of the instance structure:
- * numTaps, pCoeffs, mu, postShift (not for f32), pState. Also set all of the values in pState to zero.
- *
- * \par
- * Use of the initialization function is optional.
- * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
- * To place an instance structure into a const data section, the instance structure must be manually initialized.
- * Set the values in the state buffer to zeros before static initialization.
- * The code below statically initializes each of the 3 different data type filter instance structures
- *
- * where numTaps is the number of filter coefficients in the filter; pState is the address of the state buffer;
- * pCoeffs is the address of the coefficient buffer; mu is the step size parameter; and postShift is the shift applied to coefficients.
- *
- * \par Fixed-Point Behavior:
- * Care must be taken when using the Q15 and Q31 versions of the LMS filter.
- * The following issues must be considered:
- * - Scaling of coefficients
- * - Overflow and saturation
- *
- * \par Scaling of Coefficients:
- * Filter coefficients are represented as fractional values and
- * coefficients are restricted to lie in the range [-1 +1).
- * The fixed-point functions have an additional scaling parameter postShift.
- * At the output of the filter's accumulator is a shift register which shifts the result by postShift bits.
- * This essentially scales the filter coefficients by 2^postShift and
- * allows the filter coefficients to exceed the range [+1 -1).
- * The value of postShift is set by the user based on the expected gain through the system being modeled.
- *
- * \par Overflow and Saturation:
- * Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are
- * described separately as part of the function specific documentation below.
+ @defgroup LMS Least Mean Square (LMS) Filters
+
+ LMS filters are a class of adaptive filters that are able to "learn" an unknown transfer functions.
+ LMS filters use a gradient descent method in which the filter coefficients are updated based on the instantaneous error signal.
+ Adaptive filters are often used in communication systems, equalizers, and noise removal.
+ The CMSIS DSP Library contains LMS filter functions that operate on Q15, Q31, and floating-point data types.
+ The library also contains normalized LMS filters in which the filter coefficient adaptation is indepedent of the level of the input signal.
+
+ An LMS filter consists of two components as shown below.
+ The first component is a standard transversal or FIR filter.
+ The second component is a coefficient update mechanism.
+ The LMS filter has two input signals.
+ The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter.
+ That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input.
+ The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input.
+ This "error signal" tends towards zero as the filter adapts.
+ The LMS processing functions accept the input and reference input signals and generate the filter output and error signal.
+ \image html LMS.gif "Internal structure of the Least Mean Square filter"
+
+ The functions operate on blocks of data and each call to the function processes
+ blockSize samples through the filter.
+ pSrc points to input signal, pRef points to reference signal,
+ pOut points to output signal and pErr points to error signal.
+ All arrays contain blockSize values.
+
+ The functions operate on a block-by-block basis.
+ Internally, the filter coefficients b[n] are updated on a sample-by-sample basis.
+ The convergence of the LMS filter is slower compared to the normalized LMS algorithm.
+
+ @par Algorithm
+ The output signal y[n] is computed by a standard FIR filter:
+
+
+ @par
+ The error signal equals the difference between the reference signal d[n] and the filter output:
+
+ e[n] = d[n] - y[n].
+
+
+ @par
+ After each sample of the error signal is computed, the filter coefficients b[k] are updated on a sample-by-sample basis:
+
+ b[k] = b[k] + e[n] * mu * x[n-k], for k=0, 1, ..., numTaps-1
+
+ where mu is the step size and controls the rate of coefficient convergence.
+ @par
+ In the APIs, pCoeffs points to a coefficient array of size numTaps.
+ Coefficients are stored in time reversed order.
+ @par
+
+ @par
+ Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1 samples.
+ The increased state buffer length allows circular addressing, which is traditionally used in FIR filters,
+ to be avoided and yields a significant speed improvement.
+ The state variables are updated after each block of data is processed.
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter and
+ coefficient and state arrays cannot be shared among instances.
+ There are separate instance structure declarations for each of the 3 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ To do this manually without calling the init function, assign the follow subfields of the instance structure:
+ numTaps, pCoeffs, mu, postShift (not for f32), pState. Also set all of the values in pState to zero.
+
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ Set the values in the state buffer to zeros before static initialization.
+ The code below statically initializes each of the 3 different data type filter instance structures
+
+ arm_lms_instance_f32 S = {numTaps, pState, pCoeffs, mu};
+ arm_lms_instance_q31 S = {numTaps, pState, pCoeffs, mu, postShift};
+ arm_lms_instance_q15 S = {numTaps, pState, pCoeffs, mu, postShift};
+
+ where numTaps is the number of filter coefficients in the filter; pState is the address of the state buffer;
+ pCoeffs is the address of the coefficient buffer; mu is the step size parameter; and postShift is the shift applied to coefficients.
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the Q15 and Q31 versions of the LMS filter.
+ The following issues must be considered:
+ - Scaling of coefficients
+ - Overflow and saturation
+
+ @par Scaling of Coefficients
+ Filter coefficients are represented as fractional values and
+ coefficients are restricted to lie in the range [-1 +1).
+ The fixed-point functions have an additional scaling parameter postShift.
+ At the output of the filter's accumulator is a shift register which shifts the result by postShift bits.
+ This essentially scales the filter coefficients by 2^postShift and
+ allows the filter coefficients to exceed the range [+1 -1).
+ The value of postShift is set by the user based on the expected gain through the system being modeled.
+
+ @par Overflow and Saturation
+ Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are
+ described separately as part of the function specific documentation below.
*/
/**
- * @addtogroup LMS
- * @{
+ @addtogroup LMS
+ @{
*/
/**
- * @details
- * This function operates on floating-point data types.
- *
- * @brief Processing function for floating-point LMS filter.
- * @param[in] *S points to an instance of the floating-point LMS filter structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[in] *pRef points to the block of reference data.
- * @param[out] *pOut points to the block of output data.
- * @param[out] *pErr points to the block of error data.
- * @param[in] blockSize number of samples to process.
- * @return none.
+ @brief Processing function for floating-point LMS filter.
+ @param[in] S points to an instance of the floating-point LMS filter structure
+ @param[in] pSrc points to the block of input data
+ @param[in] pRef points to the block of reference data
+ @param[out] pOut points to the block of output data
+ @param[out] pErr points to the block of error data
+ @param[in] blockSize number of samples to process
+ @return none
*/
-
+#if defined(ARM_MATH_NEON)
void arm_lms_f32(
const arm_lms_instance_f32 * S,
- float32_t * pSrc,
+ const float32_t * pSrc,
float32_t * pRef,
float32_t * pOut,
float32_t * pErr,
@@ -184,6 +181,9 @@ void arm_lms_f32(
float32_t sum, e, d; /* accumulator, error, reference data sample */
float32_t w = 0.0f; /* weight factor */
+ float32x4_t tempV, sumV, xV, bV;
+ float32x2_t tempV2;
+
e = 0.0f;
d = 0.0f;
@@ -193,11 +193,6 @@ void arm_lms_f32(
blkCnt = blockSize;
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
while (blkCnt > 0U)
{
/* Copy the new input sample into the state buffer */
@@ -211,21 +206,27 @@ void arm_lms_f32(
/* Set the accumulator to zero */
sum = 0.0f;
+ sumV = vdupq_n_f32(0.0);
- /* Loop unrolling. Process 4 taps at a time. */
+ /* Process 4 taps at a time. */
tapCnt = numTaps >> 2;
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
- sum += (*px++) * (*pb++);
- sum += (*px++) * (*pb++);
- sum += (*px++) * (*pb++);
- sum += (*px++) * (*pb++);
+ xV = vld1q_f32(px);
+ bV = vld1q_f32(pb);
+ sumV = vmlaq_f32(sumV, xV, bV);
+
+ px += 4;
+ pb += 4;
/* Decrement the loop counter */
tapCnt--;
}
+ tempV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
+ sum = tempV2[0] + tempV2[1];
+
/* If the filter length is not a multiple of 4, compute the remaining filter taps */
tapCnt = numTaps % 0x4U;
@@ -256,24 +257,21 @@ void arm_lms_f32(
/* Initialize coeff pointer */
pb = (pCoeffs);
- /* Loop unrolling. Process 4 taps at a time. */
+ /* Process 4 taps at a time. */
tapCnt = numTaps >> 2;
/* Update filter coefficients */
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
- *pb = *pb + (w * (*px++));
- pb++;
+ xV = vld1q_f32(px);
+ bV = vld1q_f32(pb);
+ px += 4;
+ bV = vmlaq_n_f32(bV,xV,w);
- *pb = *pb + (w * (*px++));
- pb++;
+ vst1q_f32(pb,bV);
+ pb += 4;
- *pb = *pb + (w * (*px++));
- pb++;
-
- *pb = *pb + (w * (*px++));
- pb++;
/* Decrement the loop counter */
tapCnt--;
@@ -307,16 +305,16 @@ void arm_lms_f32(
/* Points to the start of the pState buffer */
pStateCurnt = S->pState;
- /* Loop unrolling for (numTaps - 1U) samples copy */
+ /* Process 4 taps at a time for (numTaps - 1U) samples copy */
tapCnt = (numTaps - 1U) >> 2U;
/* copy data */
while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
+ tempV = vld1q_f32(pState);
+ vst1q_f32(pStateCurnt,tempV);
+ pState += 4;
+ pStateCurnt += 4;
/* Decrement the loop counter */
tapCnt--;
@@ -334,9 +332,37 @@ void arm_lms_f32(
tapCnt--;
}
-#else
- /* Run the below code for Cortex-M0 */
+}
+#else
+void arm_lms_f32(
+ const arm_lms_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pRef,
+ float32_t * pOut,
+ float32_t * pErr,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ float32_t mu = S->mu; /* Adaptive factor */
+ float32_t acc, e; /* Accumulator, error */
+ float32_t w; /* Weight factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+
+ /* Initializations of error, difference, Coefficient update */
+ e = 0.0f;
+ w = 0.0f;
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+ /* initialise loop count */
+ blkCnt = blockSize;
while (blkCnt > 0U)
{
@@ -346,85 +372,162 @@ void arm_lms_f32(
/* Initialize pState pointer */
px = pState;
- /* Initialize pCoeffs pointer */
+ /* Initialize coefficient pointer */
pb = pCoeffs;
/* Set the accumulator to zero */
- sum = 0.0f;
+ acc = 0.0f;
- /* Loop over numTaps number of values */
- tapCnt = numTaps;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
- sum += (*px++) * (*pb++);
+ acc += (*px++) * (*pb++);
+
+ acc += (*px++) * (*pb++);
+
+ acc += (*px++) * (*pb++);
+
+ acc += (*px++) * (*pb++);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ acc += (*px++) * (*pb++);
/* Decrement the loop counter */
tapCnt--;
}
- /* The result is stored in the destination buffer. */
- *pOut++ = sum;
+ /* Store the result from accumulator into the destination buffer. */
+ *pOut++ = acc;
/* Compute and store error */
- d = (float32_t) (*pRef++);
- e = d - sum;
+ e = (float32_t) *pRef++ - acc;
*pErr++ = e;
- /* Weighting factor for the LMS version */
+ /* Calculation of Weighting factor for updating filter coefficients */
w = e * mu;
/* Initialize pState pointer */
- px = pState;
+ /* Advance state pointer by 1 for the next sample */
+ px = pState++;
- /* Initialize pCoeffs pointer */
+ /* Initialize coefficient pointer */
pb = pCoeffs;
- /* Loop over numTaps number of values */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
+
+ /* Update filter coefficients */
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ *pb += w * (*px++);
+ pb++;
+
+ *pb += w * (*px++);
+ pb++;
+
+ *pb += w * (*px++);
+ pb++;
+
+ *pb += w * (*px++);
+ pb++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
tapCnt = numTaps;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
- *pb = *pb + (w * (*px++));
+ *pb += w * (*px++);
pb++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* Advance state pointer by 1 for the next sample */
- pState = pState + 1;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
-
- /* Processing is complete. Now copy the last numTaps - 1 samples to the
- * start of the state buffer. This prepares the state buffer for the
- * next function call. */
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the pState buffer */
pStateCurnt = S->pState;
- /* Copy (numTaps - 1U) samples */
+ /* copy data */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
tapCnt = (numTaps - 1U);
- /* Copy the data */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of LMS group
- */
+ @} end of LMS group
+ */
diff --git a/DSP/Source/FilteringFunctions/arm_lms_init_f32.c b/DSP/Source/FilteringFunctions/arm_lms_init_f32.c
index 9fc87f1..f418f46 100644
--- a/DSP/Source/FilteringFunctions/arm_lms_init_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_lms_init_f32.c
@@ -3,13 +3,13 @@
* Title: arm_lms_init_f32.c
* Description: Floating-point LMS filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,29 +29,27 @@
#include "arm_math.h"
/**
- * @addtogroup LMS
- * @{
+ @addtogroup LMS
+ @{
*/
- /**
- * @brief Initialization function for floating-point LMS filter.
- * @param[in] *S points to an instance of the floating-point LMS filter structure.
- * @param[in] numTaps number of filter coefficients.
- * @param[in] *pCoeffs points to the coefficient buffer.
- * @param[in] *pState points to state buffer.
- * @param[in] mu step size that controls filter coefficient updates.
- * @param[in] blockSize number of samples to process.
- * @return none.
- */
-
/**
- * \par Description:
- * pCoeffs points to the array of filter coefficients stored in time reversed order:
- *
- * The initial filter coefficients serve as a starting point for the adaptive filter.
- * pState points to an array of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_lms_f32().
+ @brief Initialization function for floating-point LMS filter.
+ @param[in] S points to an instance of the floating-point LMS filter structure
+ @param[in] numTaps number of filter coefficients
+ @param[in] pCoeffs points to coefficient buffer
+ @param[in] pState points to state buffer
+ @param[in] mu step size that controls filter coefficient updates
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ The initial filter coefficients serve as a starting point for the adaptive filter.
+ pState points to an array of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_lms_f32().
*/
void arm_lms_init_f32(
@@ -79,5 +77,5 @@ void arm_lms_init_f32(
}
/**
- * @} end of LMS group
+ @} end of LMS group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_lms_init_q15.c b/DSP/Source/FilteringFunctions/arm_lms_init_q15.c
index 3a13f26..fe0a5c5 100644
--- a/DSP/Source/FilteringFunctions/arm_lms_init_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_lms_init_q15.c
@@ -3,13 +3,13 @@
* Title: arm_lms_init_q15.c
* Description: Q15 LMS filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,35 +29,35 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup LMS
- * @{
+ @addtogroup LMS
+ @{
*/
/**
-* @brief Initialization function for the Q15 LMS filter.
-* @param[in] *S points to an instance of the Q15 LMS filter structure.
-* @param[in] numTaps number of filter coefficients.
-* @param[in] *pCoeffs points to the coefficient buffer.
-* @param[in] *pState points to the state buffer.
-* @param[in] mu step size that controls filter coefficient updates.
-* @param[in] blockSize number of samples to process.
-* @param[in] postShift bit shift applied to coefficients.
-* @return none.
-*
-* \par Description:
-* pCoeffs points to the array of filter coefficients stored in time reversed order:
-*
-* The initial filter coefficients serve as a starting point for the adaptive filter.
-* pState points to the array of state variables and size of array is
-* numTaps+blockSize-1 samples, where blockSize is the number of
-* input samples processed by each call to arm_lms_q15().
-*/
+ @brief Initialization function for the Q15 LMS filter.
+ @param[in] S points to an instance of the Q15 LMS filter structure.
+ @param[in] numTaps number of filter coefficients.
+ @param[in] pCoeffs points to coefficient buffer.
+ @param[in] pState points to state buffer.
+ @param[in] mu step size that controls filter coefficient updates.
+ @param[in] blockSize number of samples to process.
+ @param[in] postShift bit shift applied to coefficients.
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ The initial filter coefficients serve as a starting point for the adaptive filter.
+ pState points to the array of state variables and size of array is
+ numTaps+blockSize-1 samples, where blockSize is the number of
+ input samples processed by each call to arm_lms_q15().
+ */
void arm_lms_init_q15(
arm_lms_instance_q15 * S,
@@ -85,9 +85,8 @@ void arm_lms_init_q15(
/* Assign postShift value to be applied */
S->postShift = postShift;
-
}
/**
- * @} end of LMS group
+ @} end of LMS group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_lms_init_q31.c b/DSP/Source/FilteringFunctions/arm_lms_init_q31.c
index 5859c84..3410b9f 100644
--- a/DSP/Source/FilteringFunctions/arm_lms_init_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_lms_init_q31.c
@@ -3,13 +3,13 @@
* Title: arm_lms_init_q31.c
* Description: Q31 LMS filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,34 +29,34 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup LMS
- * @{
+ @addtogroup LMS
+ @{
*/
- /**
- * @brief Initialization function for Q31 LMS filter.
- * @param[in] *S points to an instance of the Q31 LMS filter structure.
- * @param[in] numTaps number of filter coefficients.
- * @param[in] *pCoeffs points to coefficient buffer.
- * @param[in] *pState points to state buffer.
- * @param[in] mu step size that controls filter coefficient updates.
- * @param[in] blockSize number of samples to process.
- * @param[in] postShift bit shift applied to coefficients.
- * @return none.
- *
- * \par Description:
- * pCoeffs points to the array of filter coefficients stored in time reversed order:
- *
- * The initial filter coefficients serve as a starting point for the adaptive filter.
- * pState points to an array of length numTaps+blockSize-1 samples,
- * where blockSize is the number of input samples processed by each call to
- * arm_lms_q31().
+/**
+ @brief Initialization function for Q31 LMS filter.
+ @param[in] S points to an instance of the Q31 LMS filter structure
+ @param[in] numTaps number of filter coefficients
+ @param[in] pCoeffs points to coefficient buffer
+ @param[in] pState points to state buffer
+ @param[in] mu step size that controls filter coefficient updates
+ @param[in] blockSize number of samples to process
+ @param[in] postShift bit shift applied to coefficients
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ The initial filter coefficients serve as a starting point for the adaptive filter.
+ pState points to an array of length numTaps+blockSize-1 samples,
+ where blockSize is the number of input samples processed by each call to
+ arm_lms_q31().
*/
void arm_lms_init_q31(
@@ -75,7 +75,7 @@ void arm_lms_init_q31(
S->pCoeffs = pCoeffs;
/* Clear state buffer and size is always blockSize + numTaps - 1 */
- memset(pState, 0, ((uint32_t) numTaps + (blockSize - 1U)) * sizeof(q31_t));
+ memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q31_t));
/* Assign state pointer */
S->pState = pState;
@@ -85,9 +85,8 @@ void arm_lms_init_q31(
/* Assign postShift value to be applied */
S->postShift = postShift;
-
}
/**
- * @} end of LMS group
+ @} end of LMS group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_lms_norm_f32.c b/DSP/Source/FilteringFunctions/arm_lms_norm_f32.c
index 3fdc5a1..28ab04a 100644
--- a/DSP/Source/FilteringFunctions/arm_lms_norm_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_lms_norm_f32.c
@@ -1,15 +1,15 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_lms_norm_f32.c
- * Description: Processing function for the floating-point Normalised LMS
+ * Description: Processing function for the floating-point NLMS filter
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,138 +29,137 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @defgroup LMS_NORM Normalized LMS Filters
- *
- * This set of functions implements a commonly used adaptive filter.
- * It is related to the Least Mean Square (LMS) adaptive filter and includes an additional normalization
- * factor which increases the adaptation rate of the filter.
- * The CMSIS DSP Library contains normalized LMS filter functions that operate on Q15, Q31, and floating-point data types.
- *
- * A normalized least mean square (NLMS) filter consists of two components as shown below.
- * The first component is a standard transversal or FIR filter.
- * The second component is a coefficient update mechanism.
- * The NLMS filter has two input signals.
- * The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter.
- * That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input.
- * The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input.
- * This "error signal" tends towards zero as the filter adapts.
- * The NLMS processing functions accept the input and reference input signals and generate the filter output and error signal.
- * \image html LMS.gif "Internal structure of the NLMS adaptive filter"
- *
- * The functions operate on blocks of data and each call to the function processes
- * blockSize samples through the filter.
- * pSrc points to input signal, pRef points to reference signal,
- * pOut points to output signal and pErr points to error signal.
- * All arrays contain blockSize values.
- *
- * The functions operate on a block-by-block basis.
- * Internally, the filter coefficients b[n] are updated on a sample-by-sample basis.
- * The convergence of the LMS filter is slower compared to the normalized LMS algorithm.
- *
- * \par Algorithm:
- * The output signal y[n] is computed by a standard FIR filter:
- *
- * where mu is the step size and controls the rate of coefficient convergence.
- *\par
- * In the APIs, pCoeffs points to a coefficient array of size numTaps.
- * Coefficients are stored in time reversed order.
- * \par
- *
- * \par
- * Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1 samples.
- * The increased state buffer length allows circular addressing, which is traditionally used in FIR filters,
- * to be avoided and yields a significant speed improvement.
- * The state variables are updated after each block of data is processed.
- * \par Instance Structure
- * The coefficients and state variables for a filter are stored together in an instance data structure.
- * A separate instance structure must be defined for each filter and
- * coefficient and state arrays cannot be shared among instances.
- * There are separate instance structure declarations for each of the 3 supported data types.
- *
- * \par Initialization Functions
- * There is also an associated initialization function for each data type.
- * The initialization function performs the following operations:
- * - Sets the values of the internal structure fields.
- * - Zeros out the values in the state buffer.
- * To do this manually without calling the init function, assign the follow subfields of the instance structure:
- * numTaps, pCoeffs, mu, energy, x0, pState. Also set all of the values in pState to zero.
- * For Q7, Q15, and Q31 the following fields must also be initialized;
- * recipTable, postShift
- *
- * \par
- * Instance structure cannot be placed into a const data section and it is recommended to use the initialization function.
- * \par Fixed-Point Behavior:
- * Care must be taken when using the Q15 and Q31 versions of the normalised LMS filter.
- * The following issues must be considered:
- * - Scaling of coefficients
- * - Overflow and saturation
- *
- * \par Scaling of Coefficients:
- * Filter coefficients are represented as fractional values and
- * coefficients are restricted to lie in the range [-1 +1).
- * The fixed-point functions have an additional scaling parameter postShift.
- * At the output of the filter's accumulator is a shift register which shifts the result by postShift bits.
- * This essentially scales the filter coefficients by 2^postShift and
- * allows the filter coefficients to exceed the range [+1 -1).
- * The value of postShift is set by the user based on the expected gain through the system being modeled.
- *
- * \par Overflow and Saturation:
- * Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are
- * described separately as part of the function specific documentation below.
- */
+ @defgroup LMS_NORM Normalized LMS Filters
+ This set of functions implements a commonly used adaptive filter.
+ It is related to the Least Mean Square (LMS) adaptive filter and includes an additional normalization
+ factor which increases the adaptation rate of the filter.
+ The CMSIS DSP Library contains normalized LMS filter functions that operate on Q15, Q31, and floating-point data types.
+
+ A normalized least mean square (NLMS) filter consists of two components as shown below.
+ The first component is a standard transversal or FIR filter.
+ The second component is a coefficient update mechanism.
+ The NLMS filter has two input signals.
+ The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter.
+ That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input.
+ The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input.
+ This "error signal" tends towards zero as the filter adapts.
+ The NLMS processing functions accept the input and reference input signals and generate the filter output and error signal.
+ \image html LMS.gif "Internal structure of the NLMS adaptive filter"
+
+ The functions operate on blocks of data and each call to the function processes
+ blockSize samples through the filter.
+ pSrc points to input signal, pRef points to reference signal,
+ pOut points to output signal and pErr points to error signal.
+ All arrays contain blockSize values.
+
+ The functions operate on a block-by-block basis.
+ Internally, the filter coefficients b[n] are updated on a sample-by-sample basis.
+ The convergence of the LMS filter is slower compared to the normalized LMS algorithm.
+
+ @par Algorithm
+ The output signal y[n] is computed by a standard FIR filter:
+
+ where mu is the step size and controls the rate of coefficient convergence.
+ @par
+ In the APIs, pCoeffs points to a coefficient array of size numTaps.
+ Coefficients are stored in time reversed order.
+ @par
+
+ @par
+ Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1 samples.
+ The increased state buffer length allows circular addressing, which is traditionally used in FIR filters,
+ to be avoided and yields a significant speed improvement.
+ The state variables are updated after each block of data is processed.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter and
+ coefficient and state arrays cannot be shared among instances.
+ There are separate instance structure declarations for each of the 3 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ To do this manually without calling the init function, assign the follow subfields of the instance structure:
+ numTaps, pCoeffs, mu, energy, x0, pState. Also set all of the values in pState to zero.
+ For Q7, Q15, and Q31 the following fields must also be initialized;
+ recipTable, postShift
+ @par
+ Instance structure cannot be placed into a const data section and it is recommended to use the initialization function.
+ @par Fixed-Point Behavior
+ Care must be taken when using the Q15 and Q31 versions of the normalised LMS filter.
+ The following issues must be considered:
+ - Scaling of coefficients
+ - Overflow and saturation
+
+ @par Scaling of Coefficients
+ Filter coefficients are represented as fractional values and
+ coefficients are restricted to lie in the range [-1 +1).
+ The fixed-point functions have an additional scaling parameter postShift.
+ At the output of the filter's accumulator is a shift register which shifts the result by postShift bits.
+ This essentially scales the filter coefficients by 2^postShift and
+ allows the filter coefficients to exceed the range [+1 -1).
+ The value of postShift is set by the user based on the expected gain through the system being modeled.
+
+ @par Overflow and Saturation
+ Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are
+ described separately as part of the function specific documentation below.
+ */
/**
- * @addtogroup LMS_NORM
- * @{
+ @addtogroup LMS_NORM
+ @{
*/
+/**
+ @brief Processing function for floating-point normalized LMS filter.
+ @param[in] S points to an instance of the floating-point normalized LMS filter structure
+ @param[in] pSrc points to the block of input data
+ @param[in] pRef points to the block of reference data
+ @param[out] pOut points to the block of output data
+ @param[out] pErr points to the block of error data
+ @param[in] blockSize number of samples to process
+ @return none
+ */
- /**
- * @brief Processing function for floating-point normalized LMS filter.
- * @param[in] *S points to an instance of the floating-point normalized LMS filter structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[in] *pRef points to the block of reference data.
- * @param[out] *pOut points to the block of output data.
- * @param[out] *pErr points to the block of error data.
- * @param[in] blockSize number of samples to process.
- * @return none.
- */
-
+#if defined(ARM_MATH_NEON)
void arm_lms_norm_f32(
arm_lms_norm_instance_f32 * S,
- float32_t * pSrc,
+ const float32_t * pSrc,
float32_t * pRef,
float32_t * pOut,
float32_t * pErr,
@@ -177,6 +176,9 @@ void arm_lms_norm_f32(
float32_t sum, e, d; /* accumulator, error, reference data sample */
float32_t w, x0, in; /* weight factor, temporary variable to hold input sample and state */
+ float32x4_t tempV, sumV, xV, bV;
+ float32x2_t tempV2;
+
/* Initializations of error, difference, Coefficient update */
e = 0.0f;
d = 0.0f;
@@ -192,11 +194,6 @@ void arm_lms_norm_f32(
/* Loop over blockSize number of values */
blkCnt = blockSize;
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
while (blkCnt > 0U)
{
/* Copy the new input sample into the state buffer */
@@ -217,21 +214,26 @@ void arm_lms_norm_f32(
/* Set the accumulator to zero */
sum = 0.0f;
+ sumV = vdupq_n_f32(0.0);
- /* Loop unrolling. Process 4 taps at a time. */
+ /* Process 4 taps at a time. */
tapCnt = numTaps >> 2;
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
- sum += (*px++) * (*pb++);
- sum += (*px++) * (*pb++);
- sum += (*px++) * (*pb++);
- sum += (*px++) * (*pb++);
+ xV = vld1q_f32(px);
+ bV = vld1q_f32(pb);
+ sumV = vmlaq_f32(sumV, xV, bV);
+
+ px += 4;
+ pb += 4;
/* Decrement the loop counter */
tapCnt--;
}
+ tempV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
+ sum = tempV2[0] + tempV2[1];
/* If the filter length is not a multiple of 4, compute the remaining filter taps */
tapCnt = numTaps % 0x4U;
@@ -263,24 +265,20 @@ void arm_lms_norm_f32(
/* Initialize coeff pointer */
pb = (pCoeffs);
- /* Loop unrolling. Process 4 taps at a time. */
+ /* Process 4 taps at a time. */
tapCnt = numTaps >> 2;
/* Update filter coefficients */
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
- *pb += w * (*px++);
- pb++;
+ xV = vld1q_f32(px);
+ bV = vld1q_f32(pb);
+ px += 4;
+ bV = vmlaq_n_f32(bV,xV,w);
- *pb += w * (*px++);
- pb++;
-
- *pb += w * (*px++);
- pb++;
-
- *pb += w * (*px++);
- pb++;
+ vst1q_f32(pb,bV);
+ pb += 4;
/* Decrement the loop counter */
@@ -319,16 +317,16 @@ void arm_lms_norm_f32(
/* Points to the start of the pState buffer */
pStateCurnt = S->pState;
- /* Loop unrolling for (numTaps - 1U)/4 samples copy */
+ /* Process 4 taps at a time for (numTaps - 1U)/4 samples copy */
tapCnt = (numTaps - 1U) >> 2U;
/* copy data */
while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
+ tempV = vld1q_f32(pState);
+ vst1q_f32(pStateCurnt,tempV);
+ pState += 4;
+ pStateCurnt += 4;
/* Decrement the loop counter */
tapCnt--;
@@ -346,9 +344,41 @@ void arm_lms_norm_f32(
tapCnt--;
}
+}
#else
+void arm_lms_norm_f32(
+ arm_lms_norm_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pRef,
+ float32_t * pOut,
+ float32_t * pErr,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ float32_t mu = S->mu; /* Adaptive factor */
+ float32_t acc, e; /* Accumulator, error */
+ float32_t w; /* Weight factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ float32_t energy; /* Energy of the input */
+ float32_t x0, in; /* Temporary variable to hold input sample and state */
- /* Run the below code for Cortex-M0 */
+ /* Initializations of error, difference, Coefficient update */
+ e = 0.0f;
+ w = 0.0f;
+
+ energy = S->energy;
+ x0 = S->x0;
+
+ /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+ /* initialise loop count */
+ blkCnt = blockSize;
while (blkCnt > 0U)
{
@@ -358,7 +388,7 @@ void arm_lms_norm_f32(
/* Initialize pState pointer */
px = pState;
- /* Initialize pCoeffs pointer */
+ /* Initialize coefficient pointer */
pb = pCoeffs;
/* Read the sample from input buffer */
@@ -369,26 +399,52 @@ void arm_lms_norm_f32(
energy += in * in;
/* Set the accumulator to zero */
- sum = 0.0f;
+ acc = 0.0f;
- /* Loop over numTaps number of values */
- tapCnt = numTaps;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
- sum += (*px++) * (*pb++);
+ acc += (*px++) * (*pb++);
+
+ acc += (*px++) * (*pb++);
+
+ acc += (*px++) * (*pb++);
+
+ acc += (*px++) * (*pb++);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ acc += (*px++) * (*pb++);
/* Decrement the loop counter */
tapCnt--;
}
- /* The result in the accumulator is stored in the destination buffer. */
- *pOut++ = sum;
+ /* Store the result from accumulator into the destination buffer. */
+ *pOut++ = acc;
/* Compute and store error */
- d = (float32_t) (*pRef++);
- e = d - sum;
+ e = (float32_t) *pRef++ - acc;
*pErr++ = e;
/* Calculation of Weighting factor for updating filter coefficients */
@@ -398,19 +454,51 @@ void arm_lms_norm_f32(
/* Initialize pState pointer */
px = pState;
- /* Initialize pCcoeffs pointer */
+ /* Initialize coefficient pointer */
pb = pCoeffs;
- /* Loop over numTaps number of values */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
+
+ /* Update filter coefficients */
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ *pb += w * (*px++);
+ pb++;
+
+ *pb += w * (*px++);
+ pb++;
+
+ *pb += w * (*px++);
+ pb++;
+
+ *pb += w * (*px++);
+ pb++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
tapCnt = numTaps;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
*pb += w * (*px++);
pb++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -419,36 +507,58 @@ void arm_lms_norm_f32(
/* Advance state pointer by 1 for the next sample */
pState = pState + 1;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
+ /* Save energy and x0 values for the next frame */
S->energy = energy;
S->x0 = x0;
- /* Processing is complete. Now copy the last numTaps - 1 samples to the
- satrt of the state buffer. This prepares the state buffer for the
- next function call. */
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the pState buffer */
pStateCurnt = S->pState;
- /* Copy (numTaps - 1U) samples */
+ /* copy data */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
tapCnt = (numTaps - 1U);
- /* Copy the remaining q31_t data */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
-
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of LMS_NORM group
- */
+ @} end of LMS_NORM group
+ */
diff --git a/DSP/Source/FilteringFunctions/arm_lms_norm_init_f32.c b/DSP/Source/FilteringFunctions/arm_lms_norm_init_f32.c
index 820c5c8..543dc72 100644
--- a/DSP/Source/FilteringFunctions/arm_lms_norm_init_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_lms_norm_init_f32.c
@@ -3,13 +3,13 @@
* Title: arm_lms_norm_init_f32.c
* Description: Floating-point NLMS filter initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,41 +29,41 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup LMS_NORM
- * @{
+ @addtogroup LMS_NORM
+ @{
*/
- /**
- * @brief Initialization function for floating-point normalized LMS filter.
- * @param[in] *S points to an instance of the floating-point LMS filter structure.
- * @param[in] numTaps number of filter coefficients.
- * @param[in] *pCoeffs points to coefficient buffer.
- * @param[in] *pState points to state buffer.
- * @param[in] mu step size that controls filter coefficient updates.
- * @param[in] blockSize number of samples to process.
- * @return none.
- *
- * \par Description:
- * pCoeffs points to the array of filter coefficients stored in time reversed order:
- *
- * The initial filter coefficients serve as a starting point for the adaptive filter.
- * pState points to an array of length numTaps+blockSize-1 samples,
- * where blockSize is the number of input samples processed by each call to arm_lms_norm_f32().
+/**
+ @brief Initialization function for floating-point normalized LMS filter.
+ @param[in] S points to an instance of the floating-point LMS filter structure
+ @param[in] numTaps number of filter coefficients
+ @param[in] pCoeffs points to coefficient buffer
+ @param[in] pState points to state buffer
+ @param[in] mu step size that controls filter coefficient updates
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
- * The initial filter coefficients serve as a starting point for the adaptive filter.
- * pState points to the array of state variables and size of array is
- * numTaps+blockSize-1 samples, where blockSize is the number of input samples processed
- * by each call to arm_lms_norm_q15().
+/**
+ @brief Initialization function for Q15 normalized LMS filter.
+ @param[in] S points to an instance of the Q15 normalized LMS filter structure.
+ @param[in] numTaps number of filter coefficients.
+ @param[in] pCoeffs points to coefficient buffer.
+ @param[in] pState points to state buffer.
+ @param[in] mu step size that controls filter coefficient updates.
+ @param[in] blockSize number of samples to process.
+ @param[in] postShift bit shift applied to coefficients.
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
- * The initial filter coefficients serve as a starting point for the adaptive filter.
- * pState points to an array of length numTaps+blockSize-1 samples,
- * where blockSize is the number of input samples processed by each call to arm_lms_norm_q31().
+/**
+ @brief Initialization function for Q31 normalized LMS filter.
+ @param[in] S points to an instance of the Q31 normalized LMS filter structure.
+ @param[in] numTaps number of filter coefficients.
+ @param[in] pCoeffs points to coefficient buffer.
+ @param[in] pState points to state buffer.
+ @param[in] mu step size that controls filter coefficient updates.
+ @param[in] blockSize number of samples to process.
+ @param[in] postShift bit shift applied to coefficients.
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ The initial filter coefficients serve as a starting point for the adaptive filter.
+ pState points to an array of length numTaps+blockSize-1 samples,
+ where blockSize is the number of input samples processed by each call to arm_lms_norm_q31().
*/
void arm_lms_norm_init_q31(
- arm_lms_norm_instance_q31 * S,
- uint16_t numTaps,
- q31_t * pCoeffs,
- q31_t * pState,
- q31_t mu,
- uint32_t blockSize,
- uint8_t postShift)
+ arm_lms_norm_instance_q31 * S,
+ uint16_t numTaps,
+ q31_t * pCoeffs,
+ q31_t * pState,
+ q31_t mu,
+ uint32_t blockSize,
+ uint8_t postShift)
{
/* Assign filter taps */
S->numTaps = numTaps;
@@ -71,7 +70,7 @@ void arm_lms_norm_init_q31(
/* Assign coefficient pointer */
S->pCoeffs = pCoeffs;
- /* Clear state buffer and size is always blockSize + numTaps - 1 */
+ /* Clear state buffer and size is always blockSize + numTaps - 1 */
memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q31_t));
/* Assign post Shift value applied to coefficients */
@@ -91,9 +90,8 @@ void arm_lms_norm_init_q31(
/* Initialise x0 to zero */
S->x0 = 0;
-
}
/**
- * @} end of LMS_NORM group
+ @} end of LMS_NORM group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_lms_norm_q15.c b/DSP/Source/FilteringFunctions/arm_lms_norm_q15.c
index 70012ea..c15ad5e 100644
--- a/DSP/Source/FilteringFunctions/arm_lms_norm_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_lms_norm_q15.c
@@ -1,15 +1,15 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_lms_norm_q15.c
- * Description: Q15 NLMS filter
+ * Description: Processing function for Q15 normalized LMS filter
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,68 +29,65 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup LMS_NORM
- * @{
+ @addtogroup LMS_NORM
+ @{
*/
/**
-* @brief Processing function for Q15 normalized LMS filter.
-* @param[in] *S points to an instance of the Q15 normalized LMS filter structure.
-* @param[in] *pSrc points to the block of input data.
-* @param[in] *pRef points to the block of reference data.
-* @param[out] *pOut points to the block of output data.
-* @param[out] *pErr points to the block of error data.
-* @param[in] blockSize number of samples to process.
-* @return none.
-*
-* Scaling and Overflow Behavior:
-* \par
-* The function is implemented using a 64-bit internal accumulator.
-* Both coefficients and state variables are represented in 1.15 format and
-* multiplications yield a 2.30 result. The 2.30 intermediate results are
-* accumulated in a 64-bit accumulator in 34.30 format.
-* There is no risk of internal overflow with this approach and the full
-* precision of intermediate multiplications is preserved. After all additions
-* have been performed, the accumulator is truncated to 34.15 format by
-* discarding low 15 bits. Lastly, the accumulator is saturated to yield a
-* result in 1.15 format.
-*
-* \par
-* In this filter, filter coefficients are updated for each sample and the updation of filter cofficients are saturted.
-*
+ @brief Processing function for Q15 normalized LMS filter.
+ @param[in] S points to an instance of the Q15 normalized LMS filter structure
+ @param[in] pSrc points to the block of input data
+ @param[in] pRef points to the block of reference data
+ @param[out] pOut points to the block of output data
+ @param[out] pErr points to the block of error data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ Both coefficients and state variables are represented in 1.15 format and
+ multiplications yield a 2.30 result. The 2.30 intermediate results are
+ accumulated in a 64-bit accumulator in 34.30 format.
+ There is no risk of internal overflow with this approach and the full
+ precision of intermediate multiplications is preserved. After all additions
+ have been performed, the accumulator is truncated to 34.15 format by
+ discarding low 15 bits. Lastly, the accumulator is saturated to yield a
+ result in 1.15 format.
+ @par
+ In this filter, filter coefficients are updated for each sample and the
+ updation of filter cofficients are saturted.
*/
void arm_lms_norm_q15(
- arm_lms_norm_instance_q15 * S,
- q15_t * pSrc,
- q15_t * pRef,
- q15_t * pOut,
- q15_t * pErr,
- uint32_t blockSize)
+ arm_lms_norm_instance_q15 * S,
+ const q15_t * pSrc,
+ q15_t * pRef,
+ q15_t * pOut,
+ q15_t * pErr,
+ uint32_t blockSize)
{
- q15_t *pState = S->pState; /* State pointer */
- q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q15_t *pStateCurnt; /* Points to the current sample of the state */
- q15_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
- q15_t mu = S->mu; /* Adaptive factor */
- uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
- uint32_t tapCnt, blkCnt; /* Loop counters */
- q31_t energy; /* Energy of the input */
- q63_t acc; /* Accumulator */
- q15_t e = 0, d = 0; /* error, reference data sample */
- q15_t w = 0, in; /* weight factor and state */
- q15_t x0; /* temporary variable to hold input sample */
- //uint32_t shift = (uint32_t) S->postShift + 1U; /* Shift to be applied to the output */
- q15_t errorXmu, oneByEnergy; /* Temporary variables to store error and mu product and reciprocal of energy */
- q15_t postShift; /* Post shift to be applied to weight after reciprocal calculation */
- q31_t coef; /* Teporary variable for coefficient */
- q31_t acc_l, acc_h;
- int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */
- int32_t uShift = (32 - lShift);
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q15_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ q15_t mu = S->mu; /* Adaptive factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ q63_t acc; /* Accumulator */
+ q31_t energy; /* Energy of the input */
+ q15_t e = 0, d = 0; /* Error, reference data sample */
+ q15_t w = 0, in; /* Weight factor and state */
+ q15_t x0; /* Temporary variable to hold input sample */
+ q15_t errorXmu, oneByEnergy; /* Temporary variables to store error and mu product and reciprocal of energy */
+ q15_t postShift; /* Post shift to be applied to weight after reciprocal calculation */
+ q31_t coef; /* Temporary variable for coefficient */
+ q31_t acc_l, acc_h; /* Temporary input */
+ int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */
+ int32_t uShift = (32 - lShift);
energy = S->energy;
x0 = S->x0;
@@ -99,14 +96,9 @@ void arm_lms_norm_q15(
/* pStateCurnt points to the location where the new input data should be written */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
- /* Loop over blockSize number of values */
+ /* initialise loop count */
blkCnt = blockSize;
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
while (blkCnt > 0U)
{
/* Copy the new input sample into the state buffer */
@@ -115,197 +107,7 @@ void arm_lms_norm_q15(
/* Initialize pState pointer */
px = pState;
- /* Initialize coeff pointer */
- pb = (pCoeffs);
-
- /* Read the sample from input buffer */
- in = *pSrc++;
-
- /* Update the energy calculation */
- energy -= (((q31_t) x0 * (x0)) >> 15);
- energy += (((q31_t) in * (in)) >> 15);
-
- /* Set the accumulator to zero */
- acc = 0;
-
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
-
- while (tapCnt > 0U)
- {
-
- /* Perform the multiply-accumulate */
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
- acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc);
- acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc);
-
-#else
-
- acc += (((q31_t) * px++ * (*pb++)));
- acc += (((q31_t) * px++ * (*pb++)));
- acc += (((q31_t) * px++ * (*pb++)));
- acc += (((q31_t) * px++ * (*pb++)));
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
- tapCnt = numTaps % 0x4U;
-
- while (tapCnt > 0U)
- {
- /* Perform the multiply-accumulate */
- acc += (((q31_t) * px++ * (*pb++)));
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Calc lower part of acc */
- acc_l = acc & 0xffffffff;
-
- /* Calc upper part of acc */
- acc_h = (acc >> 32) & 0xffffffff;
-
- /* Apply shift for lower part of acc and upper part of acc */
- acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
-
- /* Converting the result to 1.15 format and saturate the output */
- acc = __SSAT(acc, 16U);
-
- /* Store the result from accumulator into the destination buffer. */
- *pOut++ = (q15_t) acc;
-
- /* Compute and store error */
- d = *pRef++;
- e = d - (q15_t) acc;
- *pErr++ = e;
-
- /* Calculation of 1/energy */
- postShift = arm_recip_q15((q15_t) energy + DELTA_Q15,
- &oneByEnergy, S->recipTable);
-
- /* Calculation of e * mu value */
- errorXmu = (q15_t) (((q31_t) e * mu) >> 15);
-
- /* Calculation of (e * mu) * (1/energy) value */
- acc = (((q31_t) errorXmu * oneByEnergy) >> (15 - postShift));
-
- /* Weighting factor for the normalized version */
- w = (q15_t) __SSAT((q31_t) acc, 16);
-
- /* Initialize pState pointer */
- px = pState;
-
- /* Initialize coeff pointer */
- pb = (pCoeffs);
-
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
-
- /* Update filter coefficients */
- while (tapCnt > 0U)
- {
- coef = *pb + (((q31_t) w * (*px++)) >> 15);
- *pb++ = (q15_t) __SSAT((coef), 16);
- coef = *pb + (((q31_t) w * (*px++)) >> 15);
- *pb++ = (q15_t) __SSAT((coef), 16);
- coef = *pb + (((q31_t) w * (*px++)) >> 15);
- *pb++ = (q15_t) __SSAT((coef), 16);
- coef = *pb + (((q31_t) w * (*px++)) >> 15);
- *pb++ = (q15_t) __SSAT((coef), 16);
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
- tapCnt = numTaps % 0x4U;
-
- while (tapCnt > 0U)
- {
- /* Perform the multiply-accumulate */
- coef = *pb + (((q31_t) w * (*px++)) >> 15);
- *pb++ = (q15_t) __SSAT((coef), 16);
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Read the sample from state buffer */
- x0 = *pState;
-
- /* Advance state pointer by 1 for the next sample */
- pState = pState + 1U;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Save energy and x0 values for the next frame */
- S->energy = (q15_t) energy;
- S->x0 = x0;
-
- /* Processing is complete. Now copy the last numTaps - 1 samples to the
- satrt of the state buffer. This prepares the state buffer for the
- next function call. */
-
- /* Points to the start of the pState buffer */
- pStateCurnt = S->pState;
-
- /* Calculation of count for copying integer writes */
- tapCnt = (numTaps - 1U) >> 2;
-
- while (tapCnt > 0U)
- {
-
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
- *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
- *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
-
-#else
-
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
-
-#endif
-
- tapCnt--;
-
- }
-
- /* Calculation of count for remaining q15_t data */
- tapCnt = (numTaps - 1U) % 0x4U;
-
- /* copy data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
-#else
-
- /* Run the below code for Cortex-M0 */
-
- while (blkCnt > 0U)
- {
- /* Copy the new input sample into the state buffer */
- *pStateCurnt++ = *pSrc;
-
- /* Initialize pState pointer */
- px = pState;
-
- /* Initialize pCoeffs pointer */
+ /* Initialize coefficient pointer */
pb = pCoeffs;
/* Read the sample from input buffer */
@@ -318,13 +120,36 @@ void arm_lms_norm_q15(
/* Set the accumulator to zero */
acc = 0;
- /* Loop over numTaps number of values */
- tapCnt = numTaps;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
- acc += (((q31_t) * px++ * (*pb++)));
+ /* acc += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
+ acc = __SMLALD(read_q15x2_ia (&px), read_q15x2_ia (&pb), acc);
+ acc = __SMLALD(read_q15x2_ia (&px), read_q15x2_ia (&pb), acc);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ acc += (q63_t) (((q31_t) (*px++) * (*pb++)));
/* Decrement the loop counter */
tapCnt--;
@@ -342,9 +167,6 @@ void arm_lms_norm_q15(
/* Converting the result to 1.15 format and saturate the output */
acc = __SSAT(acc, 16U);
- /* Converting the result to 1.15 format */
- //acc = __SSAT((acc >> (16U - shift)), 16U);
-
/* Store the result from accumulator into the destination buffer. */
*pOut++ = (q15_t) acc;
@@ -354,8 +176,7 @@ void arm_lms_norm_q15(
*pErr++ = e;
/* Calculation of 1/energy */
- postShift = arm_recip_q15((q15_t) energy + DELTA_Q15,
- &oneByEnergy, S->recipTable);
+ postShift = arm_recip_q15((q15_t) energy + DELTA_Q15, &oneByEnergy, S->recipTable);
/* Calculation of e * mu value */
errorXmu = (q15_t) (((q31_t) e * mu) >> 15);
@@ -369,29 +190,59 @@ void arm_lms_norm_q15(
/* Initialize pState pointer */
px = pState;
- /* Initialize coeff pointer */
- pb = (pCoeffs);
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
- /* Loop over numTaps number of values */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
+
+ /* Update filter coefficients */
+ while (tapCnt > 0U)
+ {
+ coef = (q31_t) *pb + (((q31_t) w * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT(coef, 16);
+
+ coef = (q31_t) *pb + (((q31_t) w * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT(coef, 16);
+
+ coef = (q31_t) *pb + (((q31_t) w * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT(coef, 16);
+
+ coef = (q31_t) *pb + (((q31_t) w * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT(coef, 16);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
tapCnt = numTaps;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
- coef = *pb + (((q31_t) w * (*px++)) >> 15);
- *pb++ = (q15_t) __SSAT((coef), 16);
+ coef = (q31_t) *pb + (((q31_t) w * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT(coef, 16);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* Read the sample from state buffer */
x0 = *pState;
/* Advance state pointer by 1 for the next sample */
- pState = pState + 1U;
+ pState = pState + 1;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -399,30 +250,48 @@ void arm_lms_norm_q15(
S->energy = (q15_t) energy;
S->x0 = x0;
- /* Processing is complete. Now copy the last numTaps - 1 samples to the
- satrt of the state buffer. This prepares the state buffer for the
- next function call. */
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the pState buffer */
pStateCurnt = S->pState;
- /* copy (numTaps - 1U) data */
+ /* copy data */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ write_q15x2_ia (&pStateCurnt, read_q15x2_ia (&pState));
+ write_q15x2_ia (&pStateCurnt, read_q15x2_ia (&pState));
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
tapCnt = (numTaps - 1U);
- /* copy data */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
-
/**
- * @} end of LMS_NORM group
- */
+ @} end of LMS_NORM group
+ */
diff --git a/DSP/Source/FilteringFunctions/arm_lms_norm_q31.c b/DSP/Source/FilteringFunctions/arm_lms_norm_q31.c
index 9711738..e26219e 100644
--- a/DSP/Source/FilteringFunctions/arm_lms_norm_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_lms_norm_q31.c
@@ -3,13 +3,13 @@
* Title: arm_lms_norm_q31.c
* Description: Processing function for the Q31 NLMS filter
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,69 +29,65 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup LMS_NORM
- * @{
+ @addtogroup LMS_NORM
+ @{
*/
/**
-* @brief Processing function for Q31 normalized LMS filter.
-* @param[in] *S points to an instance of the Q31 normalized LMS filter structure.
-* @param[in] *pSrc points to the block of input data.
-* @param[in] *pRef points to the block of reference data.
-* @param[out] *pOut points to the block of output data.
-* @param[out] *pErr points to the block of error data.
-* @param[in] blockSize number of samples to process.
-* @return none.
-*
-* Scaling and Overflow Behavior:
-* \par
-* The function is implemented using an internal 64-bit accumulator.
-* The accumulator has a 2.62 format and maintains full precision of the intermediate
-* multiplication results but provides only a single guard bit.
-* Thus, if the accumulator result overflows it wraps around rather than clip.
-* In order to avoid overflows completely the input signal must be scaled down by
-* log2(numTaps) bits. The reference signal should not be scaled down.
-* After all multiply-accumulates are performed, the 2.62 accumulator is shifted
-* and saturated to 1.31 format to yield the final result.
-* The output signal and error signal are in 1.31 format.
-*
-* \par
-* In this filter, filter coefficients are updated for each sample and the
-* updation of filter cofficients are saturted.
-*
-*/
+ @brief Processing function for Q31 normalized LMS filter.
+ @param[in] S points to an instance of the Q31 normalized LMS filter structure
+ @param[in] pSrc points to the block of input data
+ @param[in] pRef points to the block of reference data
+ @param[out] pOut points to the block of output data
+ @param[out] pErr points to the block of error data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate
+ multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around rather than clip.
+ In order to avoid overflows completely the input signal must be scaled down by
+ log2(numTaps) bits. The reference signal should not be scaled down.
+ After all multiply-accumulates are performed, the 2.62 accumulator is shifted
+ and saturated to 1.31 format to yield the final result.
+ The output signal and error signal are in 1.31 format.
+ @par
+ In this filter, filter coefficients are updated for each sample and the
+ updation of filter cofficients are saturted.
+ */
void arm_lms_norm_q31(
- arm_lms_norm_instance_q31 * S,
- q31_t * pSrc,
- q31_t * pRef,
- q31_t * pOut,
- q31_t * pErr,
- uint32_t blockSize)
+ arm_lms_norm_instance_q31 * S,
+ const q31_t * pSrc,
+ q31_t * pRef,
+ q31_t * pOut,
+ q31_t * pErr,
+ uint32_t blockSize)
{
- q31_t *pState = S->pState; /* State pointer */
- q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q31_t *pStateCurnt; /* Points to the current sample of the state */
- q31_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
- q31_t mu = S->mu; /* Adaptive factor */
- uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
- uint32_t tapCnt, blkCnt; /* Loop counters */
- q63_t energy; /* Energy of the input */
- q63_t acc; /* Accumulator */
- q31_t e = 0, d = 0; /* error, reference data sample */
- q31_t w = 0, in; /* weight factor and state */
- q31_t x0; /* temporary variable to hold input sample */
-// uint32_t shift = 32U - ((uint32_t) S->postShift + 1U); /* Shift to be applied to the output */
- q31_t errorXmu, oneByEnergy; /* Temporary variables to store error and mu product and reciprocal of energy */
- q31_t postShift; /* Post shift to be applied to weight after reciprocal calculation */
- q31_t coef; /* Temporary variable for coef */
- q31_t acc_l, acc_h; /* temporary input */
- uint32_t uShift = ((uint32_t) S->postShift + 1U);
- uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
+ q31_t *pState = S->pState; /* State pointer */
+ q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCurnt; /* Points to the current sample of the state */
+ q31_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ q31_t mu = S->mu; /* Adaptive factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ q63_t acc; /* Accumulator */
+ q63_t energy; /* Energy of the input */
+ q31_t e = 0; /* Error data sample */
+ q31_t w = 0, in; /* Weight factor and state */
+ q31_t x0; /* Temporary variable to hold input sample */
+ q31_t errorXmu, oneByEnergy; /* Temporary variables to store error and mu product and reciprocal of energy */
+ q31_t postShift; /* Post shift to be applied to weight after reciprocal calculation */
+ q31_t coef; /* Temporary variable for coef */
+ q31_t acc_l, acc_h; /* Temporary input */
+ uint32_t uShift = ((uint32_t) S->postShift + 1U);
+ uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
energy = S->energy;
x0 = S->x0;
@@ -100,55 +96,64 @@ void arm_lms_norm_q31(
/* pStateCurnt points to the location where the new input data should be written */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
- /* Loop over blockSize number of values */
+ /* initialise loop count */
blkCnt = blockSize;
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
while (blkCnt > 0U)
{
-
/* Copy the new input sample into the state buffer */
*pStateCurnt++ = *pSrc;
/* Initialize pState pointer */
px = pState;
- /* Initialize coeff pointer */
- pb = (pCoeffs);
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
/* Read the sample from input buffer */
in = *pSrc++;
/* Update the energy calculation */
- energy = (q31_t) ((((q63_t) energy << 32) -
- (((q63_t) x0 * x0) << 1)) >> 32);
+ energy = (q31_t) ((((q63_t) energy << 32) - (((q63_t) x0 * x0) << 1)) >> 32);
energy = (q31_t) (((((q63_t) in * in) << 1) + (energy << 32)) >> 32);
/* Set the accumulator to zero */
acc = 0;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
- acc += ((q63_t) (*px++)) * (*pb++);
- acc += ((q63_t) (*px++)) * (*pb++);
- acc += ((q63_t) (*px++)) * (*pb++);
+ /* acc += b[N] * x[n-N] */
acc += ((q63_t) (*px++)) * (*pb++);
- /* Decrement the loop counter */
+ /* acc += b[N-1] * x[n-N-1] */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* acc += b[N-2] * x[n-N-2] */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* acc += b[N-3] * x[n-N-3] */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* Decrement loop counter */
tapCnt--;
}
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining taps */
tapCnt = numTaps % 0x4U;
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
@@ -171,13 +176,11 @@ void arm_lms_norm_q31(
*pOut++ = (q31_t) acc;
/* Compute and store error */
- d = *pRef++;
- e = d - (q31_t) acc;
+ e = *pRef++ - (q31_t) acc;
*pErr++ = e;
/* Calculates the reciprocal of energy */
- postShift = arm_recip_q31(energy + DELTA_Q31,
- &oneByEnergy, &S->recipTable[0]);
+ postShift = arm_recip_q31(energy + DELTA_Q31, &oneByEnergy, &S->recipTable[0]);
/* Calculation of product of (e * mu) */
errorXmu = (q31_t) (((q63_t) e * mu) >> 31);
@@ -188,11 +191,13 @@ void arm_lms_norm_q31(
/* Initialize pState pointer */
px = pState;
- /* Initialize coeff pointer */
- pb = (pCoeffs);
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
/* Update filter coefficients */
while (tapCnt > 0U)
@@ -218,162 +223,28 @@ void arm_lms_norm_q31(
*pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U));
pb++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining taps */
tapCnt = numTaps % 0x4U;
- while (tapCnt > 0U)
- {
- /* Perform the multiply-accumulate */
- coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
- *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U));
- pb++;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Read the sample from state buffer */
- x0 = *pState;
-
- /* Advance state pointer by 1 for the next sample */
- pState = pState + 1;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Save energy and x0 values for the next frame */
- S->energy = (q31_t) energy;
- S->x0 = x0;
-
- /* Processing is complete. Now copy the last numTaps - 1 samples to the
- satrt of the state buffer. This prepares the state buffer for the
- next function call. */
-
- /* Points to the start of the pState buffer */
- pStateCurnt = S->pState;
-
- /* Loop unrolling for (numTaps - 1U) samples copy */
- tapCnt = (numTaps - 1U) >> 2U;
-
- /* copy data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Calculate remaining number of copies */
- tapCnt = (numTaps - 1U) % 0x4U;
-
- /* Copy the remaining q31_t data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
-
- while (blkCnt > 0U)
- {
-
- /* Copy the new input sample into the state buffer */
- *pStateCurnt++ = *pSrc;
-
- /* Initialize pState pointer */
- px = pState;
-
- /* Initialize pCoeffs pointer */
- pb = pCoeffs;
-
- /* Read the sample from input buffer */
- in = *pSrc++;
-
- /* Update the energy calculation */
- energy =
- (q31_t) ((((q63_t) energy << 32) - (((q63_t) x0 * x0) << 1)) >> 32);
- energy = (q31_t) (((((q63_t) in * in) << 1) + (energy << 32)) >> 32);
-
- /* Set the accumulator to zero */
- acc = 0;
-
- /* Loop over numTaps number of values */
+ /* Initialize tapCnt with number of samples */
tapCnt = numTaps;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
- acc += ((q63_t) (*px++)) * (*pb++);
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Converting the result to 1.31 format */
- /* Converting the result to 1.31 format */
- /* Calc lower part of acc */
- acc_l = acc & 0xffffffff;
-
- /* Calc upper part of acc */
- acc_h = (acc >> 32) & 0xffffffff;
-
- acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
-
-
- //acc = (q31_t) (acc >> shift);
-
- /* Store the result from accumulator into the destination buffer. */
- *pOut++ = (q31_t) acc;
-
- /* Compute and store error */
- d = *pRef++;
- e = d - (q31_t) acc;
- *pErr++ = e;
-
- /* Calculates the reciprocal of energy */
- postShift =
- arm_recip_q31(energy + DELTA_Q31, &oneByEnergy, &S->recipTable[0]);
-
- /* Calculation of product of (e * mu) */
- errorXmu = (q31_t) (((q63_t) e * mu) >> 31);
-
- /* Weighting factor for the normalized version */
- w = clip_q63_to_q31(((q63_t) errorXmu * oneByEnergy) >> (31 - postShift));
-
- /* Initialize pState pointer */
- px = pState;
-
- /* Initialize coeff pointer */
- pb = (pCoeffs);
-
- /* Loop over numTaps number of values */
- tapCnt = numTaps;
-
- while (tapCnt > 0U)
- {
- /* Perform the multiply-accumulate */
- /* coef is in 2.30 format */
coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
- /* get coef in 1.31 format by left shifting */
*pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U));
- /* update coefficient buffer to next coefficient */
pb++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -383,7 +254,7 @@ void arm_lms_norm_q31(
/* Advance state pointer by 1 for the next sample */
pState = pState + 1;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -391,29 +262,50 @@ void arm_lms_norm_q31(
S->energy = (q31_t) energy;
S->x0 = x0;
- /* Processing is complete. Now copy the last numTaps - 1 samples to the
- start of the state buffer. This prepares the state buffer for the
- next function call. */
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the pState buffer */
pStateCurnt = S->pState;
- /* Loop for (numTaps - 1U) samples copy */
+ /* copy data */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
tapCnt = (numTaps - 1U);
- /* Copy the remaining q31_t data */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of LMS_NORM group
+ @} end of LMS_NORM group
*/
diff --git a/DSP/Source/FilteringFunctions/arm_lms_q15.c b/DSP/Source/FilteringFunctions/arm_lms_q15.c
index 6690294..0fc9878 100644
--- a/DSP/Source/FilteringFunctions/arm_lms_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_lms_q15.c
@@ -1,15 +1,15 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_lms_q15.c
- * Description: Processing function for the Q15 LMS filter
+ * Description: Processing function for Q15 LMS filter
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -27,73 +27,66 @@
*/
#include "arm_math.h"
+
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup LMS
- * @{
+ @addtogroup LMS
+ @{
*/
- /**
- * @brief Processing function for Q15 LMS filter.
- * @param[in] *S points to an instance of the Q15 LMS filter structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[in] *pRef points to the block of reference data.
- * @param[out] *pOut points to the block of output data.
- * @param[out] *pErr points to the block of error data.
- * @param[in] blockSize number of samples to process.
- * @return none.
- *
- * \par Scaling and Overflow Behavior:
- * The function is implemented using a 64-bit internal accumulator.
- * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
- * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
- * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
- * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
- * Lastly, the accumulator is saturated to yield a result in 1.15 format.
- *
- * \par
- * In this filter, filter coefficients are updated for each sample and the updation of filter cofficients are saturted.
- *
+/**
+ @brief Processing function for Q15 LMS filter.
+ @param[in] S points to an instance of the Q15 LMS filter structure
+ @param[in] pSrc points to the block of input data
+ @param[in] pRef points to the block of reference data
+ @param[out] pOut points to the block of output data
+ @param[out] pErr points to the block of error data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+ Lastly, the accumulator is saturated to yield a result in 1.15 format.
+ @par
+ In this filter, filter coefficients are updated for each sample and
+ the updation of filter cofficients are saturted.
*/
void arm_lms_q15(
const arm_lms_instance_q15 * S,
- q15_t * pSrc,
- q15_t * pRef,
- q15_t * pOut,
- q15_t * pErr,
- uint32_t blockSize)
+ const q15_t * pSrc,
+ q15_t * pRef,
+ q15_t * pOut,
+ q15_t * pErr,
+ uint32_t blockSize)
{
- q15_t *pState = S->pState; /* State pointer */
- uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
- q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q15_t *pStateCurnt; /* Points to the current sample of the state */
- q15_t mu = S->mu; /* Adaptive factor */
- q15_t *px; /* Temporary pointer for state */
- q15_t *pb; /* Temporary pointer for coefficient buffer */
- uint32_t tapCnt, blkCnt; /* Loop counters */
- q63_t acc; /* Accumulator */
- q15_t e = 0; /* error of data sample */
- q15_t alpha; /* Intermediate constant for taps update */
- q31_t coef; /* Teporary variable for coefficient */
- q31_t acc_l, acc_h;
- int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */
- int32_t uShift = (32 - lShift);
-
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q15_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ q15_t mu = S->mu; /* Adaptive factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ q63_t acc; /* Accumulator */
+ q15_t e = 0; /* Error of data sample */
+ q15_t alpha; /* Intermediate constant for taps update */
+ q31_t coef; /* Temporary variable for coefficient */
+ q31_t acc_l, acc_h; /* Temporary input */
+ int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */
+ int32_t uShift = (32 - lShift);
/* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
/* pStateCurnt points to the location where the new input data should be written */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
- /* Initializing blkCnt with blockSize */
+ /* initialise loop count */
blkCnt = blockSize;
while (blkCnt > 0U)
@@ -101,7 +94,7 @@ void arm_lms_q15(
/* Copy the new input sample into the state buffer */
*pStateCurnt++ = *pSrc++;
- /* Initialize state pointer */
+ /* Initialize pState pointer */
px = pState;
/* Initialize coefficient pointer */
@@ -110,35 +103,32 @@ void arm_lms_q15(
/* Set the accumulator to zero */
acc = 0;
- /* Loop unrolling. Process 4 taps at a time. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
tapCnt = numTaps >> 2U;
while (tapCnt > 0U)
{
- /* acc += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
/* Perform the multiply-accumulate */
-#ifndef UNALIGNED_SUPPORT_DISABLE
+ /* acc += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
+ acc = __SMLALD(read_q15x2_ia (&px), read_q15x2_ia (&pb), acc);
+ acc = __SMLALD(read_q15x2_ia (&px), read_q15x2_ia (&pb), acc);
- acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc);
- acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc);
-
-#else
-
- acc += (q63_t) (((q31_t) (*px++) * (*pb++)));
- acc += (q63_t) (((q31_t) (*px++) * (*pb++)));
- acc += (q63_t) (((q31_t) (*px++) * (*pb++)));
- acc += (q63_t) (((q31_t) (*px++) * (*pb++)));
-
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining taps */
tapCnt = numTaps % 0x4U;
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
@@ -158,211 +148,115 @@ void arm_lms_q15(
acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
/* Converting the result to 1.15 format and saturate the output */
- acc = __SSAT(acc, 16);
+ acc = __SSAT(acc, 16U);
/* Store the result from accumulator into the destination buffer. */
*pOut++ = (q15_t) acc;
/* Compute and store error */
e = *pRef++ - (q15_t) acc;
-
*pErr++ = (q15_t) e;
/* Compute alpha i.e. intermediate constant for taps update */
alpha = (q15_t) (((q31_t) e * (mu)) >> 15);
- /* Initialize state pointer */
+ /* Initialize pState pointer */
/* Advance state pointer by 1 for the next sample */
px = pState++;
/* Initialize coefficient pointer */
pb = pCoeffs;
- /* Loop unrolling. Process 4 taps at a time. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
tapCnt = numTaps >> 2U;
/* Update filter coefficients */
while (tapCnt > 0U)
{
- coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15);
- *pb++ = (q15_t) __SSAT((coef), 16);
- coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15);
- *pb++ = (q15_t) __SSAT((coef), 16);
- coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15);
- *pb++ = (q15_t) __SSAT((coef), 16);
- coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15);
+ coef = (q31_t) *pb + (((q31_t) alpha * (*px++)) >> 15);
*pb++ = (q15_t) __SSAT((coef), 16);
- /* Decrement the loop counter */
+ coef = (q31_t) *pb + (((q31_t) alpha * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+
+ coef = (q31_t) *pb + (((q31_t) alpha * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+
+ coef = (q31_t) *pb + (((q31_t) alpha * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+
+ /* Decrement loop counter */
tapCnt--;
}
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining taps */
tapCnt = numTaps % 0x4U;
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
- coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15);
+ coef = (q31_t) *pb + (((q31_t) alpha * (*px++)) >> 15);
*pb++ = (q15_t) __SSAT((coef), 16);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
-
}
- /* Processing is complete. Now copy the last numTaps - 1 samples to the
- satrt of the state buffer. This prepares the state buffer for the
- next function call. */
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the pState buffer */
pStateCurnt = S->pState;
- /* Calculation of count for copying integer writes */
- tapCnt = (numTaps - 1U) >> 2;
-
- while (tapCnt > 0U)
- {
-
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
- *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
- *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
-#else
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
-#endif
-
- tapCnt--;
-
- }
-
- /* Calculation of count for remaining q15_t data */
- tapCnt = (numTaps - 1U) % 0x4U;
-
/* copy data */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numTaps - 1U) >> 2U;
+
while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
+ write_q15x2_ia (&pStateCurnt, read_q15x2_ia (&pState));
+ write_q15x2_ia (&pStateCurnt, read_q15x2_ia (&pState));
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
#else
- /* Run the below code for Cortex-M0 */
-
- /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = &(S->pState[(numTaps - 1U)]);
-
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* Copy the new input sample into the state buffer */
- *pStateCurnt++ = *pSrc++;
-
- /* Initialize pState pointer */
- px = pState;
-
- /* Initialize pCoeffs pointer */
- pb = pCoeffs;
-
- /* Set the accumulator to zero */
- acc = 0;
-
- /* Loop over numTaps number of values */
- tapCnt = numTaps;
-
- while (tapCnt > 0U)
- {
- /* Perform the multiply-accumulate */
- acc += (q63_t) ((q31_t) (*px++) * (*pb++));
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Calc lower part of acc */
- acc_l = acc & 0xffffffff;
-
- /* Calc upper part of acc */
- acc_h = (acc >> 32) & 0xffffffff;
-
- /* Apply shift for lower part of acc and upper part of acc */
- acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
-
- /* Converting the result to 1.15 format and saturate the output */
- acc = __SSAT(acc, 16);
-
- /* Store the result from accumulator into the destination buffer. */
- *pOut++ = (q15_t) acc;
-
- /* Compute and store error */
- e = *pRef++ - (q15_t) acc;
-
- *pErr++ = (q15_t) e;
-
- /* Compute alpha i.e. intermediate constant for taps update */
- alpha = (q15_t) (((q31_t) e * (mu)) >> 15);
-
- /* Initialize pState pointer */
- /* Advance state pointer by 1 for the next sample */
- px = pState++;
-
- /* Initialize pCoeffs pointer */
- pb = pCoeffs;
-
- /* Loop over numTaps number of values */
- tapCnt = numTaps;
-
- while (tapCnt > 0U)
- {
- /* Perform the multiply-accumulate */
- coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15);
- *pb++ = (q15_t) __SSAT((coef), 16);
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Decrement the loop counter */
- blkCnt--;
-
- }
-
- /* Processing is complete. Now copy the last numTaps - 1 samples to the
- start of the state buffer. This prepares the state buffer for the
- next function call. */
-
- /* Points to the start of the pState buffer */
- pStateCurnt = S->pState;
-
- /* Copy (numTaps - 1U) samples */
+ /* Initialize tapCnt with number of samples */
tapCnt = (numTaps - 1U);
- /* Copy the data */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of LMS group
- */
+ @} end of LMS group
+ */
diff --git a/DSP/Source/FilteringFunctions/arm_lms_q31.c b/DSP/Source/FilteringFunctions/arm_lms_q31.c
index 816e589..b0c0e27 100644
--- a/DSP/Source/FilteringFunctions/arm_lms_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_lms_q31.c
@@ -3,13 +3,13 @@
* Title: arm_lms_q31.c
* Description: Processing function for the Q31 LMS filter
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -27,83 +27,78 @@
*/
#include "arm_math.h"
+
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup LMS
- * @{
+ @addtogroup LMS
+ @{
*/
- /**
- * @brief Processing function for Q31 LMS filter.
- * @param[in] *S points to an instance of the Q15 LMS filter structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[in] *pRef points to the block of reference data.
- * @param[out] *pOut points to the block of output data.
- * @param[out] *pErr points to the block of error data.
- * @param[in] blockSize number of samples to process.
- * @return none.
- *
- * \par Scaling and Overflow Behavior:
- * The function is implemented using an internal 64-bit accumulator.
- * The accumulator has a 2.62 format and maintains full precision of the intermediate
- * multiplication results but provides only a single guard bit.
- * Thus, if the accumulator result overflows it wraps around rather than clips.
- * In order to avoid overflows completely the input signal must be scaled down by
- * log2(numTaps) bits.
- * The reference signal should not be scaled down.
- * After all multiply-accumulates are performed, the 2.62 accumulator is shifted
- * and saturated to 1.31 format to yield the final result.
- * The output signal and error signal are in 1.31 format.
- *
- * \par
- * In this filter, filter coefficients are updated for each sample and the updation of filter cofficients are saturted.
+/**
+ @brief Processing function for Q31 LMS filter.
+ @param[in] S points to an instance of the Q31 LMS filter structure.
+ @param[in] pSrc points to the block of input data.
+ @param[in] pRef points to the block of reference data.
+ @param[out] pOut points to the block of output data.
+ @param[out] pErr points to the block of error data.
+ @param[in] blockSize number of samples to process.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate
+ multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around rather than clips.
+ In order to avoid overflows completely the input signal must be scaled down by
+ log2(numTaps) bits.
+ The reference signal should not be scaled down.
+ After all multiply-accumulates are performed, the 2.62 accumulator is shifted
+ and saturated to 1.31 format to yield the final result.
+ The output signal and error signal are in 1.31 format.
+ @par
+ In this filter, filter coefficients are updated for each sample and
+ the updation of filter cofficients are saturted.
*/
void arm_lms_q31(
const arm_lms_instance_q31 * S,
- q31_t * pSrc,
- q31_t * pRef,
- q31_t * pOut,
- q31_t * pErr,
- uint32_t blockSize)
-{
- q31_t *pState = S->pState; /* State pointer */
- uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
- q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q31_t *pStateCurnt; /* Points to the current sample of the state */
- q31_t mu = S->mu; /* Adaptive factor */
- q31_t *px; /* Temporary pointer for state */
- q31_t *pb; /* Temporary pointer for coefficient buffer */
- uint32_t tapCnt, blkCnt; /* Loop counters */
- q63_t acc; /* Accumulator */
- q31_t e = 0; /* error of data sample */
- q31_t alpha; /* Intermediate constant for taps update */
- q31_t coef; /* Temporary variable for coef */
- q31_t acc_l, acc_h; /* temporary input */
- uint32_t uShift = ((uint32_t) S->postShift + 1U);
- uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
+ const q31_t * pSrc,
+ q31_t * pRef,
+ q31_t * pOut,
+ q31_t * pErr,
+ uint32_t blockSize)
+{
+ q31_t *pState = S->pState; /* State pointer */
+ q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCurnt; /* Points to the current sample of the state */
+ q31_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ q31_t mu = S->mu; /* Adaptive factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ q63_t acc; /* Accumulator */
+ q31_t e = 0; /* Error of data sample */
+ q31_t alpha; /* Intermediate constant for taps update */
+ q31_t coef; /* Temporary variable for coef */
+ q31_t acc_l, acc_h; /* Temporary input */
+ uint32_t uShift = ((uint32_t) S->postShift + 1U);
+ uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
/* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
/* pStateCurnt points to the location where the new input data should be written */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
- /* Initializing blkCnt with blockSize */
+ /* initialise loop count */
blkCnt = blockSize;
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
while (blkCnt > 0U)
{
/* Copy the new input sample into the state buffer */
*pStateCurnt++ = *pSrc++;
- /* Initialize state pointer */
+ /* Initialize pState pointer */
px = pState;
/* Initialize coefficient pointer */
@@ -112,8 +107,10 @@ void arm_lms_q31(
/* Set the accumulator to zero */
acc = 0;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
while (tapCnt > 0U)
{
@@ -130,13 +127,20 @@ void arm_lms_q31(
/* acc += b[N-3] * x[n-N-3] */
acc += ((q63_t) (*px++)) * (*pb++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining taps */
tapCnt = numTaps % 0x4U;
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
/* Perform the multiply-accumulate */
@@ -160,25 +164,28 @@ void arm_lms_q31(
/* Compute and store error */
e = *pRef++ - (q31_t) acc;
-
- *pErr++ = (q31_t) e;
+ *pErr++ = e;
/* Compute alpha i.e. intermediate constant for taps update */
alpha = (q31_t) (((q63_t) e * mu) >> 31);
- /* Initialize state pointer */
+ /* Initialize pState pointer */
/* Advance state pointer by 1 for the next sample */
px = pState++;
/* Initialize coefficient pointer */
pb = pCoeffs;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
/* Update filter coefficients */
while (tapCnt > 0U)
{
+ /* Perform the multiply-accumulate */
+
/* coef is in 2.30 format */
coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32));
/* get coef in 1.31 format by left shifting */
@@ -198,121 +205,19 @@ void arm_lms_q31(
*pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U));
pb++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining taps */
tapCnt = numTaps % 0x4U;
- while (tapCnt > 0U)
- {
- /* Perform the multiply-accumulate */
- coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32));
- *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U));
- pb++;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Processing is complete. Now copy the last numTaps - 1 samples to the
- satrt of the state buffer. This prepares the state buffer for the
- next function call. */
-
- /* Points to the start of the pState buffer */
- pStateCurnt = S->pState;
-
- /* Loop unrolling for (numTaps - 1U) samples copy */
- tapCnt = (numTaps - 1U) >> 2U;
-
- /* copy data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Calculate remaining number of copies */
- tapCnt = (numTaps - 1U) % 0x4U;
-
- /* Copy the remaining q31_t data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
-
- while (blkCnt > 0U)
- {
- /* Copy the new input sample into the state buffer */
- *pStateCurnt++ = *pSrc++;
-
- /* Initialize pState pointer */
- px = pState;
-
- /* Initialize pCoeffs pointer */
- pb = pCoeffs;
-
- /* Set the accumulator to zero */
- acc = 0;
-
- /* Loop over numTaps number of values */
+ /* Initialize tapCnt with number of samples */
tapCnt = numTaps;
- while (tapCnt > 0U)
- {
- /* Perform the multiply-accumulate */
- acc += ((q63_t) (*px++)) * (*pb++);
-
- /* Decrement the loop counter */
- tapCnt--;
- }
-
- /* Converting the result to 1.31 format */
- /* Store the result from accumulator into the destination buffer. */
- /* Calc lower part of acc */
- acc_l = acc & 0xffffffff;
-
- /* Calc upper part of acc */
- acc_h = (acc >> 32) & 0xffffffff;
-
- acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
-
- *pOut++ = (q31_t) acc;
-
- /* Compute and store error */
- e = *pRef++ - (q31_t) acc;
-
- *pErr++ = (q31_t) e;
-
- /* Weighting factor for the LMS version */
- alpha = (q31_t) (((q63_t) e * mu) >> 31);
-
- /* Initialize pState pointer */
- /* Advance state pointer by 1 for the next sample */
- px = pState++;
-
- /* Initialize pCoeffs pointer */
- pb = pCoeffs;
-
- /* Loop over numTaps number of values */
- tapCnt = numTaps;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (tapCnt > 0U)
{
@@ -321,37 +226,58 @@ void arm_lms_q31(
*pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U));
pb++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Processing is complete. Now copy the last numTaps - 1 samples to the
- start of the state buffer. This prepares the state buffer for the
- next function call. */
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the pState buffer */
pStateCurnt = S->pState;
- /* Copy (numTaps - 1U) samples */
+ /* copy data */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
tapCnt = (numTaps - 1U);
- /* Copy the data */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of LMS group
- */
+ @} end of LMS group
+ */
diff --git a/DSP/Source/MatrixFunctions/CMakeLists.txt b/DSP/Source/MatrixFunctions/CMakeLists.txt
new file mode 100644
index 0000000..d48d6b1
--- /dev/null
+++ b/DSP/Source/MatrixFunctions/CMakeLists.txt
@@ -0,0 +1,16 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPMatrix)
+
+
+file(GLOB SRC "./*_*.c")
+
+add_library(CMSISDSPMatrix STATIC ${SRC})
+
+configdsp(CMSISDSPMatrix ..)
+
+### Includes
+target_include_directories(CMSISDSPMatrix PUBLIC "${DSP}/../../Include")
+
+
+
diff --git a/DSP/Source/MatrixFunctions/MatrixFunctions.c b/DSP/Source/MatrixFunctions/MatrixFunctions.c
new file mode 100644
index 0000000..da721fe
--- /dev/null
+++ b/DSP/Source/MatrixFunctions/MatrixFunctions.c
@@ -0,0 +1,53 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: MatrixFunctions.c
+ * Description: Combination of all matrix function source files.
+ *
+ * $Date: 18. March 2019
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_mat_add_f32.c"
+#include "arm_mat_add_q15.c"
+#include "arm_mat_add_q31.c"
+#include "arm_mat_cmplx_mult_f32.c"
+#include "arm_mat_cmplx_mult_q15.c"
+#include "arm_mat_cmplx_mult_q31.c"
+#include "arm_mat_init_f32.c"
+#include "arm_mat_init_q15.c"
+#include "arm_mat_init_q31.c"
+#include "arm_mat_inverse_f32.c"
+#include "arm_mat_inverse_f64.c"
+#include "arm_mat_mult_f32.c"
+#include "arm_mat_mult_fast_q15.c"
+#include "arm_mat_mult_fast_q31.c"
+#include "arm_mat_mult_q15.c"
+#include "arm_mat_mult_q31.c"
+#include "arm_mat_scale_f32.c"
+#include "arm_mat_scale_q15.c"
+#include "arm_mat_scale_q31.c"
+#include "arm_mat_sub_f32.c"
+#include "arm_mat_sub_q15.c"
+#include "arm_mat_sub_q31.c"
+#include "arm_mat_trans_f32.c"
+#include "arm_mat_trans_q15.c"
+#include "arm_mat_trans_q31.c"
diff --git a/DSP/Source/MatrixFunctions/arm_mat_add_f32.c b/DSP/Source/MatrixFunctions/arm_mat_add_f32.c
index 4a54049..8e1246c 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_add_f32.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_add_f32.c
@@ -3,13 +3,13 @@
* Title: arm_mat_add_f32.c
* Description: Floating-point matrix addition
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,35 +29,43 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @defgroup MatrixAdd Matrix Addition
- *
- * Adds two matrices.
- * \image html MatrixAddition.gif "Addition of two 3 x 3 matrices"
- *
- * The functions check to make sure that
- * pSrcA, pSrcB, and pDst have the same
- * number of rows and columns.
+ @defgroup MatrixAdd Matrix Addition
+
+ Adds two matrices.
+ \image html MatrixAddition.gif "Addition of two 3 x 3 matrices"
+
+ The functions check to make sure that
+ pSrcA, pSrcB, and pDst have the same
+ number of rows and columns.
*/
/**
- * @addtogroup MatrixAdd
- * @{
+ @addtogroup MatrixAdd
+ @{
*/
/**
- * @brief Floating-point matrix addition.
- * @param[in] *pSrcA points to the first input matrix structure
- * @param[in] *pSrcB points to the second input matrix structure
- * @param[out] *pDst points to output matrix structure
- * @return The function returns either
- * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ @brief Floating-point matrix addition.
+ @param[in] pSrcA points to first input matrix structure
+ @param[in] pSrcB points to second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
+#if defined(ARM_MATH_NEON)
+/*
+Neon version is assuming the matrix is small enough.
+So no blocking is used for taking into account cache effects.
+For big matrix, there exist better libraries for Neon.
+
+*/
arm_status arm_mat_add_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
@@ -67,12 +75,8 @@ arm_status arm_mat_add_f32(
float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
-#if defined (ARM_MATH_DSP)
-
float32_t inA1, inA2, inB1, inB2, out1, out2; /* temporary variables */
-#endif // #if defined (ARM_MATH_DSP)
-
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
@@ -89,69 +93,27 @@ arm_status arm_mat_add_f32(
else
#endif
{
+ float32x4_t vec1;
+ float32x4_t vec2;
+ float32x4_t res;
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
-#if defined (ARM_MATH_DSP)
-
- /* Loop unrolling */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
- /* Read values from source A */
- inA1 = pIn1[0];
+ vec1 = vld1q_f32(pIn1);
+ vec2 = vld1q_f32(pIn2);
+ res = vaddq_f32(vec1, vec2);
+ vst1q_f32(pOut, res);
- /* Read values from source B */
- inB1 = pIn2[0];
-
- /* Read values from source A */
- inA2 = pIn1[1];
-
- /* out = sourceA + sourceB */
- out1 = inA1 + inB1;
-
- /* Read values from source B */
- inB2 = pIn2[1];
-
- /* Read values from source A */
- inA1 = pIn1[2];
-
- /* out = sourceA + sourceB */
- out2 = inA2 + inB2;
-
- /* Read values from source B */
- inB1 = pIn2[2];
-
- /* Store result in destination */
- pOut[0] = out1;
- pOut[1] = out2;
-
- /* Read values from source A */
- inA2 = pIn1[3];
-
- /* Read values from source B */
- inB2 = pIn2[3];
-
- /* out = sourceA + sourceB */
- out1 = inA1 + inB1;
-
- /* out = sourceA + sourceB */
- out2 = inA2 + inB2;
-
- /* Store result in destination */
- pOut[2] = out1;
-
- /* Store result in destination */
- pOut[3] = out2;
-
-
- /* update pointers to process next sampels */
+ /* update pointers to process next samples */
pIn1 += 4U;
pIn2 += 4U;
pOut += 4U;
@@ -163,15 +125,6 @@ arm_status arm_mat_add_f32(
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
-#else
-
- /* Run the below code for Cortex-M0 */
-
- /* Initialize blkCnt with number of samples */
- blkCnt = numSamples;
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
@@ -184,13 +137,96 @@ arm_status arm_mat_add_f32(
/* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
-
}
/* Return to application */
return (status);
}
+#else
+arm_status arm_mat_add_f32(
+ const arm_matrix_instance_f32 * pSrcA,
+ const arm_matrix_instance_f32 * pSrcB,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix addition */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Total number of samples in input matrix */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+
+ /* Add and store result in destination buffer. */
+ *pOut++ = *pInA++ + *pInB++;
+
+ *pOut++ = *pInA++ + *pInB++;
+
+ *pOut++ = *pInA++ + *pInB++;
+
+ *pOut++ = *pInA++ + *pInB++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+
+ /* Add and store result in destination buffer. */
+ *pOut++ = *pInA++ + *pInB++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of MatrixAdd group
+ @} end of MatrixAdd group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_add_q15.c b/DSP/Source/MatrixFunctions/arm_mat_add_q15.c
index 896e60c..2aaf849 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_add_q15.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_add_q15.c
@@ -3,13 +3,13 @@
* Title: arm_mat_add_q15.c
* Description: Q15 matrix addition
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,116 +29,114 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @addtogroup MatrixAdd
- * @{
+ @addtogroup MatrixAdd
+ @{
*/
/**
- * @brief Q15 matrix addition.
- * @param[in] *pSrcA points to the first input matrix structure
- * @param[in] *pSrcB points to the second input matrix structure
- * @param[out] *pDst points to output matrix structure
- * @return The function returns either
- * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+ @brief Q15 matrix addition.
+ @param[in] pSrcA points to first input matrix structure
+ @param[in] pSrcB points to second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
arm_status arm_mat_add_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
- arm_matrix_instance_q15 * pDst)
+ arm_matrix_instance_q15 * pDst)
{
- q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
- q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
- q15_t *pOut = pDst->pData; /* output data matrix pointer */
- uint16_t numSamples; /* total number of elements in the matrix */
- uint32_t blkCnt; /* loop counters */
- arm_status status; /* status of matrix addition */
+ q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q15_t *pOut = pDst->pData; /* output data matrix pointer */
+
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
-
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
- (pSrcA->numCols != pSrcB->numCols) ||
- (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
- /* Total number of samples in the input matrix */
- numSamples = (uint16_t) (pSrcA->numRows * pSrcA->numCols);
+ /* Total number of samples in input matrix */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+
+ /* Add, saturate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
+ write_q15x2_ia (&pOut, __QADD16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ write_q15x2_ia (&pOut, __QADD16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
+#else
+ *pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
- /* Loop unrolling */
- blkCnt = (uint32_t) numSamples >> 2U;
+ *pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while (blkCnt > 0U)
- {
- /* C(m,n) = A(m,n) + B(m,n) */
- /* Add, Saturate and then store the results in the destination buffer. */
- *__SIMD32(pOut)++ = __QADD16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
- *__SIMD32(pOut)++ = __QADD16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
+ *pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
- /* Decrement the loop counter */
+ *pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = (uint32_t) numSamples % 0x4U;
-
- /* q15 pointers of input and output are initialized */
-
- while (blkCnt > 0U)
- {
- /* C(m,n) = A(m,n) + B(m,n) */
- /* Add, Saturate and then store the results in the destination buffer. */
- *pOut++ = (q15_t) __QADD16(*pInA++, *pInB++);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
- blkCnt = (uint32_t) numSamples;
+ blkCnt = numSamples;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
- /* q15 pointers of input and output are initialized */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
- /* Add, Saturate and then store the results in the destination buffer. */
- *pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ + *pInB++), 16);
- /* Decrement the loop counter */
+ /* Add, saturate and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+ *pOut++ = (q15_t) __QADD16(*pInA++, *pInB++);
+#else
+ *pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
- /* set status as ARM_MATH_SUCCESS */
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -147,5 +145,5 @@ arm_status arm_mat_add_q15(
}
/**
- * @} end of MatrixAdd group
+ @} end of MatrixAdd group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_add_q31.c b/DSP/Source/MatrixFunctions/arm_mat_add_q31.c
index f230ad2..6194809 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_add_q31.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_add_q31.c
@@ -3,13 +3,13 @@
* Title: arm_mat_add_q31.c
* Description: Q31 matrix addition
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,160 +29,104 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @addtogroup MatrixAdd
- * @{
+ @addtogroup MatrixAdd
+ @{
*/
/**
- * @brief Q31 matrix addition.
- * @param[in] *pSrcA points to the first input matrix structure
- * @param[in] *pSrcB points to the second input matrix structure
- * @param[out] *pDst points to output matrix structure
- * @return The function returns either
- * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
+ @brief Q31 matrix addition.
+ @param[in] pSrcA points to first input matrix structure
+ @param[in] pSrcB points to second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
arm_status arm_mat_add_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
- arm_matrix_instance_q31 * pDst)
+ arm_matrix_instance_q31 * pDst)
{
- q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
- q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
+ q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
- q31_t inA1, inB1; /* temporary variables */
-#if defined (ARM_MATH_DSP)
-
- q31_t inA2, inB2; /* temporary variables */
- q31_t out1, out2; /* temporary variables */
-
-#endif // #if defined (ARM_MATH_DSP)
-
- uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
+
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
- (pSrcA->numCols != pSrcB->numCols) ||
- (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
{
- /* Total number of samples in the input matrix */
+ /* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- /* Loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
- /* Add, saturate and then store the results in the destination buffer. */
- /* Read values from source A */
- inA1 = pIn1[0];
- /* Read values from source B */
- inB1 = pIn2[0];
+ /* Add, saturate and store result in destination buffer. */
+ *pOut++ = __QADD(*pInA++, *pInB++);
- /* Read values from source A */
- inA2 = pIn1[1];
+ *pOut++ = __QADD(*pInA++, *pInB++);
- /* Add and saturate */
- out1 = __QADD(inA1, inB1);
+ *pOut++ = __QADD(*pInA++, *pInB++);
- /* Read values from source B */
- inB2 = pIn2[1];
+ *pOut++ = __QADD(*pInA++, *pInB++);
- /* Read values from source A */
- inA1 = pIn1[2];
-
- /* Add and saturate */
- out2 = __QADD(inA2, inB2);
-
- /* Read values from source B */
- inB1 = pIn2[2];
-
- /* Store result in destination */
- pOut[0] = out1;
- pOut[1] = out2;
-
- /* Read values from source A */
- inA2 = pIn1[3];
-
- /* Read values from source B */
- inB2 = pIn2[3];
-
- /* Add and saturate */
- out1 = __QADD(inA1, inB1);
- out2 = __QADD(inA2, inB2);
-
- /* Store result in destination */
- pOut[2] = out1;
- pOut[3] = out2;
-
- /* update pointers to process next sampels */
- pIn1 += 4U;
- pIn2 += 4U;
- pOut += 4U;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
-
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
- /* Add, saturate and then store the results in the destination buffer. */
- inA1 = *pIn1++;
- inB1 = *pIn2++;
- inA1 = __QADD(inA1, inB1);
+ /* Add, saturate and store result in destination buffer. */
+ *pOut++ = __QADD(*pInA++, *pInB++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
-
- *pOut++ = inA1;
-
}
- /* set status as ARM_MATH_SUCCESS */
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -191,5 +135,5 @@ arm_status arm_mat_add_q31(
}
/**
- * @} end of MatrixAdd group
+ @} end of MatrixAdd group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c b/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c
index bb8341e..8e2af31 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c
@@ -3,13 +3,13 @@
* Title: arm_mat_cmplx_mult_f32.c
* Description: Floating-point matrix multiplication
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,36 +29,38 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @defgroup CmplxMatrixMult Complex Matrix Multiplication
- *
- * Complex Matrix multiplication is only defined if the number of columns of the
- * first matrix equals the number of rows of the second matrix.
- * Multiplying an M x N matrix with an N x P matrix results
- * in an M x P matrix.
- * When matrix size checking is enabled, the functions check: (1) that the inner dimensions of
- * pSrcA and pSrcB are equal; and (2) that the size of the output
- * matrix equals the outer dimensions of pSrcA and pSrcB.
+ @defgroup CmplxMatrixMult Complex Matrix Multiplication
+
+ Complex Matrix multiplication is only defined if the number of columns of the
+ first matrix equals the number of rows of the second matrix.
+ Multiplying an M x N matrix with an N x P matrix results
+ in an M x P matrix.
+ @par
+ When matrix size checking is enabled, the functions check:
+ - that the inner dimensions of pSrcA and pSrcB are equal;
+ - that the size of the output matrix equals the outer dimensions of pSrcA and pSrcB.
*/
/**
- * @addtogroup CmplxMatrixMult
- * @{
+ @addtogroup CmplxMatrixMult
+ @{
*/
/**
- * @brief Floating-point Complex matrix multiplication.
- * @param[in] *pSrcA points to the first input complex matrix structure
- * @param[in] *pSrcB points to the second input complex matrix structure
- * @param[out] *pDst points to output complex matrix structure
- * @return The function returns either
- * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ @brief Floating-point Complex matrix multiplication.
+ @param[in] pSrcA points to first input complex matrix structure
+ @param[in] pSrcB points to second input complex matrix structure
+ @param[out] pDst points to output complex matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
-
+#if defined(ARM_MATH_NEON)
arm_status arm_mat_cmplx_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
@@ -74,14 +76,20 @@ arm_status arm_mat_cmplx_mult_f32(
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
float32_t sumReal1, sumImag1; /* accumulator */
float32_t a0, b0, c0, d0;
- float32_t a1, b1, c1, d1;
+ float32_t a1, a1B,b1, b1B, c1, d1;
float32_t sumReal2, sumImag2; /* accumulator */
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ float32x4x2_t a0V, a1V;
+ float32x4_t accR0,accI0, accR1,accI1,tempR, tempI;
+ float32x2_t accum = vdup_n_f32(0);
+ float32_t *pIn1B = pSrcA->pData;
- uint16_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
+ uint16_t col, i = 0U, j, rowCnt, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
+ float32_t sumReal1B, sumImag1B;
+ float32_t sumReal2B, sumImag2B;
+ float32_t *pxB;
#ifdef ARM_MATH_MATRIX_CHECK
@@ -99,8 +107,172 @@ arm_status arm_mat_cmplx_mult_f32(
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
- /* row loop */
- do
+
+ rowCnt = row >> 1;
+
+ /* Row loop */
+ while (rowCnt > 0U)
+ {
+ /* Output pointer is set to starting address of the row being processed */
+ px = pOut + 2 * i;
+ pxB = px + 2 * numColsB;
+
+ /* For every row wise process, the column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, the pIn2 pointer is set
+ ** to the starting address of the pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ j = 0U;
+
+ /* Column loop */
+ while (col > 0U)
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sumReal1 = 0.0f;
+ sumImag1 = 0.0f;
+ sumReal1B = 0.0f;
+ sumImag1B = 0.0f;
+
+ sumReal2 = 0.0f;
+ sumImag2 = 0.0f;
+ sumReal2B = 0.0f;
+ sumImag2B = 0.0f;
+
+ /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+ pIn1 = pInA;
+ pIn1B = pIn1 + 2*numColsA;
+
+ accR0 = vdupq_n_f32(0.0);
+ accI0 = vdupq_n_f32(0.0);
+ accR1 = vdupq_n_f32(0.0);
+ accI1 = vdupq_n_f32(0.0);
+
+ /* Compute 4 MACs simultaneously. */
+ colCnt = numColsA >> 2;
+
+ /* Matrix multiplication */
+ while (colCnt > 0U)
+ {
+ /* Reading real part of complex matrix A */
+ a0V = vld2q_f32(pIn1); // load & separate real/imag pSrcA (de-interleave 2)
+ a1V = vld2q_f32(pIn1B); // load & separate real/imag pSrcA (de-interleave 2)
+
+ pIn1 += 8;
+ pIn1B += 8;
+
+ tempR[0] = *pIn2;
+ tempI[0] = *(pIn2 + 1U);
+ pIn2 += 2 * numColsB;
+
+ tempR[1] = *pIn2;
+ tempI[1] = *(pIn2 + 1U);
+ pIn2 += 2 * numColsB;
+
+ tempR[2] = *pIn2;
+ tempI[2] = *(pIn2 + 1U);
+ pIn2 += 2 * numColsB;
+
+ tempR[3] = *pIn2;
+ tempI[3] = *(pIn2 + 1U);
+ pIn2 += 2 * numColsB;
+
+ accR0 = vmlaq_f32(accR0,a0V.val[0],tempR);
+ accR0 = vmlsq_f32(accR0,a0V.val[1],tempI);
+
+ accI0 = vmlaq_f32(accI0,a0V.val[1],tempR);
+ accI0 = vmlaq_f32(accI0,a0V.val[0],tempI);
+
+ accR1 = vmlaq_f32(accR1,a1V.val[0],tempR);
+ accR1 = vmlsq_f32(accR1,a1V.val[1],tempI);
+
+ accI1 = vmlaq_f32(accI1,a1V.val[1],tempR);
+ accI1 = vmlaq_f32(accI1,a1V.val[0],tempI);
+
+ /* Decrement the loop count */
+ colCnt--;
+ }
+
+ accum = vpadd_f32(vget_low_f32(accR0), vget_high_f32(accR0));
+ sumReal1 += accum[0] + accum[1];
+
+ accum = vpadd_f32(vget_low_f32(accI0), vget_high_f32(accI0));
+ sumImag1 += accum[0] + accum[1];
+
+ accum = vpadd_f32(vget_low_f32(accR1), vget_high_f32(accR1));
+ sumReal1B += accum[0] + accum[1];
+
+ accum = vpadd_f32(vget_low_f32(accI1), vget_high_f32(accI1));
+ sumImag1B += accum[0] + accum[1];
+
+ /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ colCnt = numColsA & 3;
+
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+ a1 = *pIn1;
+ a1B = *pIn1B;
+
+ c1 = *pIn2;
+
+ b1 = *(pIn1 + 1U);
+ b1B = *(pIn1B + 1U);
+
+ d1 = *(pIn2 + 1U);
+
+ sumReal1 += a1 * c1;
+ sumImag1 += b1 * c1;
+
+ sumReal1B += a1B * c1;
+ sumImag1B += b1B * c1;
+
+ pIn1 += 2U;
+ pIn1B += 2U;
+ pIn2 += 2 * numColsB;
+
+ sumReal2 -= b1 * d1;
+ sumImag2 += a1 * d1;
+
+ sumReal2B -= b1B * d1;
+ sumImag2B += a1B * d1;
+
+ /* Decrement the loop counter */
+ colCnt--;
+ }
+
+ sumReal1 += sumReal2;
+ sumImag1 += sumImag2;
+
+ sumReal1B += sumReal2B;
+ sumImag1B += sumImag2B;
+
+ /* Store the result in the destination buffer */
+ *px++ = sumReal1;
+ *px++ = sumImag1;
+ *pxB++ = sumReal1B;
+ *pxB++ = sumImag1B;
+
+ /* Update the pointer pIn2 to point to the starting address of the next column */
+ j++;
+ pIn2 = pSrcB->pData + 2U * j;
+
+ /* Decrement the column loop counter */
+ col--;
+ }
+
+ /* Update the pointer pInA to point to the starting address of the next 2 row */
+ i = i + 2*numColsB;
+ pInA = pInA + 4 * numColsA;
+
+ /* Decrement the row loop counter */
+ rowCnt--;
+ }
+
+ rowCnt = row & 1;
+ while (rowCnt > 0U)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + 2 * i;
@@ -114,8 +286,8 @@ arm_status arm_mat_cmplx_mult_f32(
j = 0U;
- /* column loop */
- do
+ /* Column loop */
+ while (col > 0U)
{
/* Set the variable sum, that acts as accumulator, to zero */
sumReal1 = 0.0f;
@@ -127,94 +299,58 @@ arm_status arm_mat_cmplx_mult_f32(
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pIn1 = pInA;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ accR0 = vdupq_n_f32(0.0);
+ accI0 = vdupq_n_f32(0.0);
+
+ /* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2;
- /* matrix multiplication */
+ /* Matrix multiplication */
while (colCnt > 0U)
{
-
/* Reading real part of complex matrix A */
- a0 = *pIn1;
+ a0V = vld2q_f32(pIn1); // load & separate real/imag pSrcA (de-interleave 2)
+ pIn1 += 8;
- /* Reading real part of complex matrix B */
- c0 = *pIn2;
-
- /* Reading imaginary part of complex matrix A */
- b0 = *(pIn1 + 1U);
-
- /* Reading imaginary part of complex matrix B */
- d0 = *(pIn2 + 1U);
-
- sumReal1 += a0 * c0;
- sumImag1 += b0 * c0;
-
- pIn1 += 2U;
+ tempR[0] = *pIn2;
+ tempI[0] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
- sumReal2 -= b0 * d0;
- sumImag2 += a0 * d0;
-
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
-
- a1 = *pIn1;
- c1 = *pIn2;
-
- b1 = *(pIn1 + 1U);
- d1 = *(pIn2 + 1U);
-
- sumReal1 += a1 * c1;
- sumImag1 += b1 * c1;
-
- pIn1 += 2U;
+ tempR[1] = *pIn2;
+ tempI[1] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
- sumReal2 -= b1 * d1;
- sumImag2 += a1 * d1;
-
- a0 = *pIn1;
- c0 = *pIn2;
-
- b0 = *(pIn1 + 1U);
- d0 = *(pIn2 + 1U);
-
- sumReal1 += a0 * c0;
- sumImag1 += b0 * c0;
-
- pIn1 += 2U;
+ tempR[2] = *pIn2;
+ tempI[2] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
- sumReal2 -= b0 * d0;
- sumImag2 += a0 * d0;
-
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
-
- a1 = *pIn1;
- c1 = *pIn2;
-
- b1 = *(pIn1 + 1U);
- d1 = *(pIn2 + 1U);
-
- sumReal1 += a1 * c1;
- sumImag1 += b1 * c1;
-
- pIn1 += 2U;
+ tempR[3] = *pIn2;
+ tempI[3] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
- sumReal2 -= b1 * d1;
- sumImag2 += a1 * d1;
+ accR0 = vmlaq_f32(accR0,a0V.val[0],tempR);
+ accR0 = vmlsq_f32(accR0,a0V.val[1],tempI);
+
+ accI0 = vmlaq_f32(accI0,a0V.val[1],tempR);
+ accI0 = vmlaq_f32(accI0,a0V.val[0],tempI);
/* Decrement the loop count */
colCnt--;
}
+ accum = vpadd_f32(vget_low_f32(accR0), vget_high_f32(accR0));
+ sumReal1 += accum[0] + accum[1];
+
+ accum = vpadd_f32(vget_low_f32(accI0), vget_high_f32(accI0));
+ sumImag1 += accum[0] + accum[1];
+
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
- colCnt = numColsA % 0x4U;
+ colCnt = numColsA & 3;
while (colCnt > 0U)
{
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
a1 = *pIn1;
c1 = *pIn2;
@@ -248,13 +384,234 @@ arm_status arm_mat_cmplx_mult_f32(
/* Decrement the column loop counter */
col--;
- } while (col > 0U);
+ }
/* Update the pointer pInA to point to the starting address of the next row */
i = i + numColsB;
pInA = pInA + 2 * numColsA;
/* Decrement the row loop counter */
+ rowCnt--;
+
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#else
+arm_status arm_mat_cmplx_mult_f32(
+ const arm_matrix_instance_f32 * pSrcA,
+ const arm_matrix_instance_f32 * pSrcB,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
+ float32_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
+ float32_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
+ float32_t *pOut = pDst->pData; /* Output data matrix pointer */
+ float32_t *px; /* Temporary output data matrix pointer */
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ float32_t sumReal, sumImag; /* Accumulator */
+ float32_t a1, b1, c1, d1;
+ uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
+ arm_status status; /* status of matrix multiplication */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ float32_t a0, b0, c0, d0;
+#endif
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ do
+ {
+ /* Output pointer is set to starting address of the row being processed */
+ px = pOut + 2 * i;
+
+ /* For every row wise process, the column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, the pIn2 pointer is set
+ ** to the starting address of the pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ j = 0U;
+
+ /* column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sumReal = 0.0f;
+ sumImag = 0.0f;
+
+ /* Initiate pointer pIn1 to point to starting address of column being processed */
+ pIn1 = pInA;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ colCnt = numColsA >> 2U;
+
+ /* matrix multiplication */
+ while (colCnt > 0U)
+ {
+
+ /* Reading real part of complex matrix A */
+ a0 = *pIn1;
+
+ /* Reading real part of complex matrix B */
+ c0 = *pIn2;
+
+ /* Reading imaginary part of complex matrix A */
+ b0 = *(pIn1 + 1U);
+
+ /* Reading imaginary part of complex matrix B */
+ d0 = *(pIn2 + 1U);
+
+ /* Multiply and Accumlates */
+ sumReal += a0 * c0;
+ sumImag += b0 * c0;
+
+ /* update pointers */
+ pIn1 += 2U;
+ pIn2 += 2 * numColsB;
+
+ /* Multiply and Accumlates */
+ sumReal -= b0 * d0;
+ sumImag += a0 * d0;
+
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ /* read real and imag values from pSrcA and pSrcB buffer */
+ a1 = *(pIn1 );
+ c1 = *(pIn2 );
+ b1 = *(pIn1 + 1U);
+ d1 = *(pIn2 + 1U);
+
+ /* Multiply and Accumlates */
+ sumReal += a1 * c1;
+ sumImag += b1 * c1;
+
+ /* update pointers */
+ pIn1 += 2U;
+ pIn2 += 2 * numColsB;
+
+ /* Multiply and Accumlates */
+ sumReal -= b1 * d1;
+ sumImag += a1 * d1;
+
+ a0 = *(pIn1 );
+ c0 = *(pIn2 );
+ b0 = *(pIn1 + 1U);
+ d0 = *(pIn2 + 1U);
+
+ /* Multiply and Accumlates */
+ sumReal += a0 * c0;
+ sumImag += b0 * c0;
+
+ /* update pointers */
+ pIn1 += 2U;
+ pIn2 += 2 * numColsB;
+
+ /* Multiply and Accumlates */
+ sumReal -= b0 * d0;
+ sumImag += a0 * d0;
+
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ a1 = *(pIn1 );
+ c1 = *(pIn2 );
+ b1 = *(pIn1 + 1U);
+ d1 = *(pIn2 + 1U);
+
+ /* Multiply and Accumlates */
+ sumReal += a1 * c1;
+ sumImag += b1 * c1;
+
+ /* update pointers */
+ pIn1 += 2U;
+ pIn2 += 2 * numColsB;
+
+ /* Multiply and Accumlates */
+ sumReal -= b1 * d1;
+ sumImag += a1 * d1;
+
+ /* Decrement loop count */
+ colCnt--;
+ }
+
+ /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ colCnt = numColsA % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+ a1 = *(pIn1 );
+ c1 = *(pIn2 );
+ b1 = *(pIn1 + 1U);
+ d1 = *(pIn2 + 1U);
+
+ /* Multiply and Accumlates */
+ sumReal += a1 * c1;
+ sumImag += b1 * c1;
+
+ /* update pointers */
+ pIn1 += 2U;
+ pIn2 += 2 * numColsB;
+
+ /* Multiply and Accumlates */
+ sumReal -= b1 * d1;
+ sumImag += a1 * d1;
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Store result in destination buffer */
+ *px++ = sumReal;
+ *px++ = sumImag;
+
+ /* Update pointer pIn2 to point to starting address of next column */
+ j++;
+ pIn2 = pSrcB->pData + 2U * j;
+
+ /* Decrement column loop counter */
+ col--;
+
+ } while (col > 0U);
+
+ /* Update pointer pInA to point to starting address of next row */
+ i = i + numColsB;
+ pInA = pInA + 2 * numColsA;
+
+ /* Decrement row loop counter */
row--;
} while (row > 0U);
@@ -267,6 +624,8 @@ arm_status arm_mat_cmplx_mult_f32(
return (status);
}
+#endif /* #if defined(ARM_MATH_NEON) */
+
/**
- * @} end of MatrixMult group
+ @} end of MatrixMult group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c b/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c
index 5dee79c..4c5a45b 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c
@@ -3,13 +3,13 @@
* Title: arm_cmplx_mat_mult_q15.c
* Description: Q15 complex matrix multiplication
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,141 +29,115 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @addtogroup CmplxMatrixMult
- * @{
+ @addtogroup CmplxMatrixMult
+ @{
*/
-
/**
- * @brief Q15 Complex matrix multiplication
- * @param[in] *pSrcA points to the first input complex matrix structure
- * @param[in] *pSrcB points to the second input complex matrix structure
- * @param[out] *pDst points to output complex matrix structure
- * @param[in] *pScratch points to the array for storing intermediate results
- * @return The function returns either
- * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
- *
- * \par Conditions for optimum performance
- * Input, output and state buffers should be aligned by 32-bit
- *
- * \par Restrictions
- * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
- * In this case input, output, scratch buffers should be aligned by 32-bit
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using a 64-bit internal accumulator. The inputs to the
- * multiplications are in 1.15 format and multiplications yield a 2.30 result.
- * The 2.30 intermediate
- * results are accumulated in a 64-bit accumulator in 34.30 format. This approach
- * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
- * truncated to 34.15 format by discarding the low 15 bits and then saturated to
- * 1.15 format.
- *
- * \par
- * Refer to arm_mat_mult_fast_q15() for a faster but less precise version of this function.
- *
+ @brief Q15 Complex matrix multiplication.
+ @param[in] pSrcA points to first input complex matrix structure
+ @param[in] pSrcB points to second input complex matrix structure
+ @param[out] pDst points to output complex matrix structure
+ @param[in] pScratch points to an array for storing intermediate results
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Conditions for optimum performance
+ Input, output and state buffers should be aligned by 32-bit
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator. The inputs to the
+ multiplications are in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ This approach provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
+ truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
*/
-
-
-
arm_status arm_mat_cmplx_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
- arm_matrix_instance_q15 * pDst,
- q15_t * pScratch)
+ arm_matrix_instance_q15 * pDst,
+ q15_t * pScratch)
{
- /* accumulator */
- q15_t *pSrcBT = pScratch; /* input data matrix pointer for transpose */
- q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
- q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
- q15_t *px; /* Temporary output data matrix pointer */
- uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
- uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
- uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
- uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
- uint16_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */
- arm_status status; /* status of matrix multiplication */
- q63_t sumReal, sumImag;
+ q15_t *pSrcBT = pScratch; /* input data matrix pointer for transpose */
+ q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
+ q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
+ q15_t *px; /* Temporary output data matrix pointer */
+ uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+ uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
+ q63_t sumReal, sumImag; /* accumulator */
+ uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
+ arm_status status; /* Status of matrix multiplication */
-#ifdef UNALIGNED_SUPPORT_DISABLE
- q15_t in; /* Temporary variable to hold the input value */
- q15_t a, b, c, d;
+#if defined (ARM_MATH_DSP)
+ q31_t prod1, prod2;
+ q31_t pSourceA, pSourceB;
#else
- q31_t in; /* Temporary variable to hold the input value */
- q31_t prod1, prod2;
- q31_t pSourceA, pSourceB;
-#endif
+ q15_t a, b, c, d;
+#endif /* #if defined (ARM_MATH_DSP) */
#ifdef ARM_MATH_MATRIX_CHECK
+
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
- (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
{
/* Matrix transpose */
do
{
+ /* The pointer px is set to starting address of column being processed */
+ px = pSrcBT + i;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
/* Apply loop unrolling and exchange the columns with row elements */
col = numColsB >> 2;
- /* The pointer px is set to starting address of the column being processed */
- px = pSrcBT + i;
-
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
+ a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
-#ifdef UNALIGNED_SUPPORT_DISABLE
- /* Read two elements from the row */
- in = *pInB++;
- *px = in;
- in = *pInB++;
- px[1] = in;
+ /* Read two elements from row */
+ write_q15x2 (px, read_q15x2_ia (&pInB));
- /* Update the pointer px to point to the next row of the transposed matrix */
+ /* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
- /* Read two elements from the row */
- in = *pInB++;
- *px = in;
- in = *pInB++;
- px[1] = in;
+ /* Read two elements from row */
+ write_q15x2 (px, read_q15x2_ia (&pInB));
- /* Update the pointer px to point to the next row of the transposed matrix */
+ /* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
- /* Read two elements from the row */
- in = *pInB++;
- *px = in;
- in = *pInB++;
- px[1] = in;
+ /* Read two elements from row */
+ write_q15x2 (px, read_q15x2_ia (&pInB));
- /* Update the pointer px to point to the next row of the transposed matrix */
+ /* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
- /* Read two elements from the row */
- in = *pInB++;
- *px = in;
- in = *pInB++;
- px[1] = in;
+ /* Read two elements from row */
+ write_q15x2 (px, read_q15x2_ia (&pInB));
- /* Update the pointer px to point to the next row of the transposed matrix */
+ /* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
- /* Decrement the column loop counter */
+ /* Decrement column loop counter */
col--;
}
@@ -171,79 +145,33 @@ arm_status arm_mat_cmplx_mult_q15(
** No loop unrolling is used. */
col = numColsB % 0x4U;
- while (col > 0U)
- {
- /* Read two elements from the row */
- in = *pInB++;
- *px = in;
- in = *pInB++;
- px[1] = in;
#else
- /* Read two elements from the row */
- in = *__SIMD32(pInB)++;
+ /* Initialize blkCnt with number of samples */
+ col = numColsB;
- *__SIMD32(px) = in;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
- px += numRowsB * 2;
-
-
- /* Read two elements from the row */
- in = *__SIMD32(pInB)++;
-
- *__SIMD32(px) = in;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
- px += numRowsB * 2;
-
- /* Read two elements from the row */
- in = *__SIMD32(pInB)++;
-
- *__SIMD32(px) = in;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
- px += numRowsB * 2;
-
- /* Read two elements from the row */
- in = *__SIMD32(pInB)++;
-
- *__SIMD32(px) = in;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
- px += numRowsB * 2;
-
- /* Decrement the column loop counter */
- col--;
- }
-
- /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- col = numColsB % 0x4U;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
- /* Read two elements from the row */
- in = *__SIMD32(pInB)++;
+ /* Read two elements from row */
+ write_q15x2 (px, read_q15x2_ia (&pInB));
- *__SIMD32(px) = in;
-#endif
-
- /* Update the pointer px to point to the next row of the transposed matrix */
+ /* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
- /* Decrement the column loop counter */
+ /* Decrement column loop counter */
col--;
}
i = i + 2U;
- /* Decrement the row loop counter */
+ /* Decrement row loop counter */
row--;
} while (row > 0U);
- /* Reset the variables for the usage in the following multiplication process */
+ /* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
@@ -252,33 +180,61 @@ arm_status arm_mat_cmplx_mult_q15(
/* row loop */
do
{
- /* For every row wise process, the column loop counter is to be initiated */
+ /* For every row wise process, column loop counter is to be initiated */
col = numColsB;
- /* For every row wise process, the pIn2 pointer is set
- ** to the starting address of the transposed pSrcB data */
+ /* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
/* column loop */
do
{
- /* Set the variable sum, that acts as accumulator, to zero */
+ /* Set variable sum, that acts as accumulator, to zero */
sumReal = 0;
sumImag = 0;
- /* Apply loop unrolling and compute 2 MACs simultaneously. */
- colCnt = numColsA >> 1;
-
- /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+ /* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i * 2;
+ /* Apply loop unrolling and compute 2 MACs simultaneously. */
+ colCnt = numColsA >> 1U;
/* matrix multiplication */
while (colCnt > 0U)
{
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
-#ifdef UNALIGNED_SUPPORT_DISABLE
+#if defined (ARM_MATH_DSP)
+
+ /* read real and imag values from pSrcA and pSrcB buffer */
+ pSourceA = read_q15x2_ia ((q15_t **) &pInA);
+ pSourceB = read_q15x2_ia ((q15_t **) &pInB);
+
+ /* Multiply and Accumlates */
+#ifdef ARM_MATH_BIG_ENDIAN
+ prod1 = -__SMUSD(pSourceA, pSourceB);
+#else
+ prod1 = __SMUSD(pSourceA, pSourceB);
+#endif
+ prod2 = __SMUADX(pSourceA, pSourceB);
+ sumReal += (q63_t) prod1;
+ sumImag += (q63_t) prod2;
+
+ /* read real and imag values from pSrcA and pSrcB buffer */
+ pSourceA = read_q15x2_ia ((q15_t **) &pInA);
+ pSourceB = read_q15x2_ia ((q15_t **) &pInB);
+
+ /* Multiply and Accumlates */
+#ifdef ARM_MATH_BIG_ENDIAN
+ prod1 = -__SMUSD(pSourceA, pSourceB);
+#else
+ prod1 = __SMUSD(pSourceA, pSourceB);
+#endif
+ prod2 = __SMUADX(pSourceA, pSourceB);
+ sumReal += (q63_t) prod1;
+ sumImag += (q63_t) prod2;
+
+#else /* #if defined (ARM_MATH_DSP) */
/* read real and imag values from pSrcA buffer */
a = *pInA;
@@ -304,53 +260,40 @@ arm_status arm_mat_cmplx_mult_q15(
pInA += 4U;
/* Multiply and Accumlates */
- sumReal += (q31_t) a *c;
- sumImag += (q31_t) a *d;
- sumReal -= (q31_t) b *d;
- sumImag += (q31_t) b *c;
+ sumReal += (q31_t) a * c;
+ sumImag += (q31_t) a * d;
+ sumReal -= (q31_t) b * d;
+ sumImag += (q31_t) b * c;
/* update pointer */
pInB += 4U;
-#else
- /* read real and imag values from pSrcA and pSrcB buffer */
- pSourceA = *__SIMD32(pInA)++;
- pSourceB = *__SIMD32(pInB)++;
- /* Multiply and Accumlates */
-#ifdef ARM_MATH_BIG_ENDIAN
- prod1 = -__SMUSD(pSourceA, pSourceB);
-#else
- prod1 = __SMUSD(pSourceA, pSourceB);
-#endif
- prod2 = __SMUADX(pSourceA, pSourceB);
- sumReal += (q63_t) prod1;
- sumImag += (q63_t) prod2;
+#endif /* #if defined (ARM_MATH_DSP) */
- /* read real and imag values from pSrcA and pSrcB buffer */
- pSourceA = *__SIMD32(pInA)++;
- pSourceB = *__SIMD32(pInB)++;
-
- /* Multiply and Accumlates */
-#ifdef ARM_MATH_BIG_ENDIAN
- prod1 = -__SMUSD(pSourceA, pSourceB);
-#else
- prod1 = __SMUSD(pSourceA, pSourceB);
-#endif
- prod2 = __SMUADX(pSourceA, pSourceB);
- sumReal += (q63_t) prod1;
- sumImag += (q63_t) prod2;
-
-#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
colCnt--;
}
/* process odd column samples */
if ((numColsA & 0x1U) > 0U)
{
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
-#ifdef UNALIGNED_SUPPORT_DISABLE
+#if defined (ARM_MATH_DSP)
+ /* read real and imag values from pSrcA and pSrcB buffer */
+ pSourceA = read_q15x2_ia ((q15_t **) &pInA);
+ pSourceB = read_q15x2_ia ((q15_t **) &pInB);
+
+ /* Multiply and Accumlates */
+#ifdef ARM_MATH_BIG_ENDIAN
+ prod1 = -__SMUSD(pSourceA, pSourceB);
+#else
+ prod1 = __SMUSD(pSourceA, pSourceB);
+#endif
+ prod2 = __SMUADX(pSourceA, pSourceB);
+ sumReal += (q63_t) prod1;
+ sumImag += (q63_t) prod2;
+
+#else /* #if defined (ARM_MATH_DSP) */
/* read real and imag values from pSrcA and pSrcB buffer */
a = *pInA++;
@@ -359,48 +302,32 @@ arm_status arm_mat_cmplx_mult_q15(
d = *pInB++;
/* Multiply and Accumlates */
- sumReal += (q31_t) a *c;
- sumImag += (q31_t) a *d;
- sumReal -= (q31_t) b *d;
- sumImag += (q31_t) b *c;
+ sumReal += (q31_t) a * c;
+ sumImag += (q31_t) a * d;
+ sumReal -= (q31_t) b * d;
+ sumImag += (q31_t) b * c;
-#else
- /* read real and imag values from pSrcA and pSrcB buffer */
- pSourceA = *__SIMD32(pInA)++;
- pSourceB = *__SIMD32(pInB)++;
-
- /* Multiply and Accumlates */
-#ifdef ARM_MATH_BIG_ENDIAN
- prod1 = -__SMUSD(pSourceA, pSourceB);
-#else
- prod1 = __SMUSD(pSourceA, pSourceB);
-#endif
- prod2 = __SMUADX(pSourceA, pSourceB);
- sumReal += (q63_t) prod1;
- sumImag += (q63_t) prod2;
-
-#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
+#endif /* #if defined (ARM_MATH_DSP) */
}
- /* Saturate and store the result in the destination buffer */
-
+ /* Saturate and store result in destination buffer */
*px++ = (q15_t) (__SSAT(sumReal >> 15, 16));
*px++ = (q15_t) (__SSAT(sumImag >> 15, 16));
- /* Decrement the column loop counter */
+ /* Decrement column loop counter */
col--;
} while (col > 0U);
i = i + numColsA;
- /* Decrement the row loop counter */
+ /* Decrement row loop counter */
row--;
} while (row > 0U);
- /* set status as ARM_MATH_SUCCESS */
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -409,5 +336,5 @@ arm_status arm_mat_cmplx_mult_q15(
}
/**
- * @} end of MatrixMult group
+ @} end of MatrixMult group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q31.c b/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q31.c
index 65cbb66..7b458f9 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q31.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q31.c
@@ -3,13 +3,13 @@
* Title: arm_mat_cmplx_mult_q31.c
* Description: Floating-point matrix multiplication
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,74 +29,69 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @addtogroup CmplxMatrixMult
- * @{
+ @addtogroup CmplxMatrixMult
+ @{
*/
/**
- * @brief Q31 Complex matrix multiplication
- * @param[in] *pSrcA points to the first input complex matrix structure
- * @param[in] *pSrcB points to the second input complex matrix structure
- * @param[out] *pDst points to output complex matrix structure
- * @return The function returns either
- * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The accumulator has a 2.62 format and maintains full precision of the intermediate
- * multiplication results but provides only a single guard bit. There is no saturation
- * on intermediate additions. Thus, if the accumulator overflows it wraps around and
- * distorts the result. The input signals should be scaled down to avoid intermediate
- * overflows. The input is thus scaled down by log2(numColsA) bits
- * to avoid overflows, as a total of numColsA additions are performed internally.
- * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
- *
- *
+ @brief Q31 Complex matrix multiplication.
+ @param[in] pSrcA points to first input complex matrix structure
+ @param[in] pSrcB points to second input complex matrix structure
+ @param[out] pDst points to output complex matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate
+ multiplication results but provides only a single guard bit. There is no saturation
+ on intermediate additions. Thus, if the accumulator overflows it wraps around and
+ distorts the result. The input signals should be scaled down to avoid intermediate
+ overflows. The input is thus scaled down by log2(numColsA) bits
+ to avoid overflows, as a total of numColsA additions are performed internally.
+ The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
*/
arm_status arm_mat_cmplx_mult_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
- arm_matrix_instance_q31 * pDst)
+ arm_matrix_instance_q31 * pDst)
{
- q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
- q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
- q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
- q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ q31_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
+ q31_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
+ q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
+ q31_t *pOut = pDst->pData; /* Output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
- uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
- uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
- uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
- q63_t sumReal1, sumImag1; /* accumulator */
- q31_t a0, b0, c0, d0;
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ q63_t sumReal, sumImag; /* Accumulator */
q31_t a1, b1, c1, d1;
-
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- uint16_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
+ uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t a0, b0, c0, d0;
+#endif
+
#ifdef ARM_MATH_MATRIX_CHECK
-
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
- (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
{
-
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
@@ -119,16 +114,18 @@ arm_status arm_mat_cmplx_mult_q31(
do
{
/* Set the variable sum, that acts as accumulator, to zero */
- sumReal1 = 0.0;
- sumImag1 = 0.0;
+ sumReal = 0.0;
+ sumImag = 0.0;
- /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+ /* Initiate pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- colCnt = numColsA >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
- /* matrix multiplication */
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ colCnt = numColsA >> 2U;
+
+ /* matrix multiplication */
while (colCnt > 0U)
{
@@ -145,76 +142,74 @@ arm_status arm_mat_cmplx_mult_q31(
d0 = *(pIn2 + 1U);
/* Multiply and Accumlates */
- sumReal1 += (q63_t) a0 *c0;
- sumImag1 += (q63_t) b0 *c0;
+ sumReal += (q63_t) a0 * c0;
+ sumImag += (q63_t) b0 * c0;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
- sumReal1 -= (q63_t) b0 *d0;
- sumImag1 += (q63_t) a0 *d0;
+ sumReal -= (q63_t) b0 * d0;
+ sumImag += (q63_t) a0 * d0;
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* read real and imag values from pSrcA and pSrcB buffer */
- a1 = *pIn1;
- c1 = *pIn2;
+ a1 = *(pIn1 );
+ c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
- sumReal1 += (q63_t) a1 *c1;
- sumImag1 += (q63_t) b1 *c1;
+ sumReal += (q63_t) a1 * c1;
+ sumImag += (q63_t) b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
- sumReal1 -= (q63_t) b1 *d1;
- sumImag1 += (q63_t) a1 *d1;
-
- a0 = *pIn1;
- c0 = *pIn2;
+ sumReal -= (q63_t) b1 * d1;
+ sumImag += (q63_t) a1 * d1;
+ a0 = *(pIn1 );
+ c0 = *(pIn2 );
b0 = *(pIn1 + 1U);
d0 = *(pIn2 + 1U);
/* Multiply and Accumlates */
- sumReal1 += (q63_t) a0 *c0;
- sumImag1 += (q63_t) b0 *c0;
+ sumReal += (q63_t) a0 * c0;
+ sumImag += (q63_t) b0 * c0;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
- sumReal1 -= (q63_t) b0 *d0;
- sumImag1 += (q63_t) a0 *d0;
+ sumReal -= (q63_t) b0 * d0;
+ sumImag += (q63_t) a0 * d0;
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
-
- a1 = *pIn1;
- c1 = *pIn2;
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+ a1 = *(pIn1 );
+ c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
- sumReal1 += (q63_t) a1 *c1;
- sumImag1 += (q63_t) b1 *c1;
+ sumReal += (q63_t) a1 * c1;
+ sumImag += (q63_t) b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
- sumReal1 -= (q63_t) b1 *d1;
- sumImag1 += (q63_t) a1 *d1;
+ sumReal -= (q63_t) b1 * d1;
+ sumImag += (q63_t) a1 * d1;
- /* Decrement the loop count */
+ /* Decrement loop count */
colCnt--;
}
@@ -222,49 +217,55 @@ arm_status arm_mat_cmplx_mult_q31(
** No loop unrolling is used. */
colCnt = numColsA % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (colCnt > 0U)
{
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
- a1 = *pIn1;
- c1 = *pIn2;
-
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+ a1 = *(pIn1 );
+ c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
- sumReal1 += (q63_t) a1 *c1;
- sumImag1 += (q63_t) b1 *c1;
+ sumReal += (q63_t) a1 * c1;
+ sumImag += (q63_t) b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
- sumReal1 -= (q63_t) b1 *d1;
- sumImag1 += (q63_t) a1 *d1;
+ sumReal -= (q63_t) b1 * d1;
+ sumImag += (q63_t) a1 * d1;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
colCnt--;
}
- /* Store the result in the destination buffer */
- *px++ = (q31_t) clip_q63_to_q31(sumReal1 >> 31);
- *px++ = (q31_t) clip_q63_to_q31(sumImag1 >> 31);
+ /* Store result in destination buffer */
+ *px++ = (q31_t) clip_q63_to_q31(sumReal >> 31);
+ *px++ = (q31_t) clip_q63_to_q31(sumImag >> 31);
- /* Update the pointer pIn2 to point to the starting address of the next column */
+ /* Update pointer pIn2 to point to starting address of next column */
j++;
pIn2 = pSrcB->pData + 2U * j;
- /* Decrement the column loop counter */
+ /* Decrement column loop counter */
col--;
} while (col > 0U);
- /* Update the pointer pInA to point to the starting address of the next row */
+ /* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + 2 * numColsA;
- /* Decrement the row loop counter */
+ /* Decrement row loop counter */
row--;
} while (row > 0U);
@@ -278,5 +279,5 @@ arm_status arm_mat_cmplx_mult_q31(
}
/**
- * @} end of MatrixMult group
+ @} end of MatrixMult group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_init_f32.c b/DSP/Source/MatrixFunctions/arm_mat_init_f32.c
index 783f7be..ce02a25 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_init_f32.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_init_f32.c
@@ -3,13 +3,13 @@
* Title: arm_mat_init_f32.c
* Description: Floating-point matrix initialization
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,31 +29,31 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @defgroup MatrixInit Matrix Initialization
- *
- * Initializes the underlying matrix data structure.
- * The functions set the numRows,
- * numCols, and pData fields
- * of the matrix data structure.
+ @defgroup MatrixInit Matrix Initialization
+
+ Initializes the underlying matrix data structure.
+ The functions set the numRows,
+ numCols, and pData fields
+ of the matrix data structure.
*/
/**
- * @addtogroup MatrixInit
- * @{
+ @addtogroup MatrixInit
+ @{
*/
/**
- * @brief Floating-point matrix initialization.
- * @param[in,out] *S points to an instance of the floating-point matrix structure.
- * @param[in] nRows number of rows in the matrix.
- * @param[in] nColumns number of columns in the matrix.
- * @param[in] *pData points to the matrix data array.
- * @return none
- */
+ @brief Floating-point matrix initialization.
+ @param[in,out] S points to an instance of the floating-point matrix structure
+ @param[in] nRows number of rows in the matrix
+ @param[in] nColumns number of columns in the matrix
+ @param[in] pData points to the matrix data array
+ @return none
+ */
void arm_mat_init_f32(
arm_matrix_instance_f32 * S,
@@ -72,5 +72,5 @@ void arm_mat_init_f32(
}
/**
- * @} end of MatrixInit group
+ @} end of MatrixInit group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_init_q15.c b/DSP/Source/MatrixFunctions/arm_mat_init_q15.c
index 08da19f..0275503 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_init_q15.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_init_q15.c
@@ -3,13 +3,13 @@
* Title: arm_mat_init_q15.c
* Description: Q15 matrix initialization
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,22 +29,22 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @addtogroup MatrixInit
- * @{
+ @addtogroup MatrixInit
+ @{
*/
- /**
- * @brief Q15 matrix initialization.
- * @param[in,out] *S points to an instance of the floating-point matrix structure.
- * @param[in] nRows number of rows in the matrix.
- * @param[in] nColumns number of columns in the matrix.
- * @param[in] *pData points to the matrix data array.
- * @return none
- */
+/**
+ @brief Q15 matrix initialization.
+ @param[in,out] S points to an instance of the floating-point matrix structure
+ @param[in] nRows number of rows in the matrix
+ @param[in] nColumns number of columns in the matrix
+ @param[in] pData points to the matrix data array
+ @return none
+ */
void arm_mat_init_q15(
arm_matrix_instance_q15 * S,
@@ -63,5 +63,5 @@ void arm_mat_init_q15(
}
/**
- * @} end of MatrixInit group
+ @} end of MatrixInit group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_init_q31.c b/DSP/Source/MatrixFunctions/arm_mat_init_q31.c
index 22e6f6d..d5c5722 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_init_q31.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_init_q31.c
@@ -3,13 +3,13 @@
* Title: arm_mat_init_q31.c
* Description: Q31 matrix initialization
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,27 +29,27 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @defgroup MatrixInit Matrix Initialization
- *
+ @defgroup MatrixInit Matrix Initialization
+
*/
/**
- * @addtogroup MatrixInit
- * @{
+ @addtogroup MatrixInit
+ @{
*/
- /**
- * @brief Q31 matrix initialization.
- * @param[in,out] *S points to an instance of the floating-point matrix structure.
- * @param[in] nRows number of rows in the matrix.
- * @param[in] nColumns number of columns in the matrix.
- * @param[in] *pData points to the matrix data array.
- * @return none
- */
+/**
+ @brief Q31 matrix initialization.
+ @param[in,out] S points to an instance of the Q31 matrix structure
+ @param[in] nRows number of rows in the matrix
+ @param[in] nColumns number of columns in the matrix
+ @param[in] pData points to the matrix data array
+ @return none
+ */
void arm_mat_init_q31(
arm_matrix_instance_q31 * S,
@@ -68,5 +68,5 @@ void arm_mat_init_q31(
}
/**
- * @} end of MatrixInit group
+ @} end of MatrixInit group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_inverse_f32.c b/DSP/Source/MatrixFunctions/arm_mat_inverse_f32.c
index b82373a..d602b98 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_inverse_f32.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_inverse_f32.c
@@ -3,13 +3,13 @@
* Title: arm_mat_inverse_f32.c
* Description: Floating-point matrix inverse
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,47 +29,45 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @defgroup MatrixInv Matrix Inverse
- *
- * Computes the inverse of a matrix.
- *
- * The inverse is defined only if the input matrix is square and non-singular (the determinant
- * is non-zero). The function checks that the input and output matrices are square and of the
- * same size.
- *
- * Matrix inversion is numerically sensitive and the CMSIS DSP library only supports matrix
- * inversion of floating-point matrices.
- *
- * \par Algorithm
- * The Gauss-Jordan method is used to find the inverse.
- * The algorithm performs a sequence of elementary row-operations until it
- * reduces the input matrix to an identity matrix. Applying the same sequence
- * of elementary row-operations to an identity matrix yields the inverse matrix.
- * If the input matrix is singular, then the algorithm terminates and returns error status
- * ARM_MATH_SINGULAR.
- * \image html MatrixInverse.gif "Matrix Inverse of a 3 x 3 matrix using Gauss-Jordan Method"
+ @defgroup MatrixInv Matrix Inverse
+
+ Computes the inverse of a matrix.
+
+ The inverse is defined only if the input matrix is square and non-singular (the determinant is non-zero).
+ The function checks that the input and output matrices are square and of the same size.
+
+ Matrix inversion is numerically sensitive and the CMSIS DSP library only supports matrix
+ inversion of floating-point matrices.
+
+ @par Algorithm
+ The Gauss-Jordan method is used to find the inverse.
+ The algorithm performs a sequence of elementary row-operations until it
+ reduces the input matrix to an identity matrix. Applying the same sequence
+ of elementary row-operations to an identity matrix yields the inverse matrix.
+ If the input matrix is singular, then the algorithm terminates and returns error status
+ ARM_MATH_SINGULAR.
+ \image html MatrixInverse.gif "Matrix Inverse of a 3 x 3 matrix using Gauss-Jordan Method"
*/
/**
- * @addtogroup MatrixInv
- * @{
+ @addtogroup MatrixInv
+ @{
*/
/**
- * @brief Floating-point matrix inverse.
- * @param[in] *pSrc points to input matrix structure
- * @param[out] *pDst points to output matrix structure
- * @return The function returns
- * ARM_MATH_SIZE_MISMATCH if the input matrix is not square or if the size
- * of the output matrix does not match the size of the input matrix.
- * If the input matrix is found to be singular (non-invertible), then the function returns
- * ARM_MATH_SINGULAR. Otherwise, the function returns ARM_MATH_SUCCESS.
+ @brief Floating-point matrix inverse.
+ @param[in] pSrc points to input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ - \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible)
*/
-
+#if defined(ARM_MATH_NEON)
arm_status arm_mat_inverse_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
@@ -82,18 +80,17 @@ arm_status arm_mat_inverse_f32(
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
-#if defined (ARM_MATH_DSP)
float32_t maxC; /* maximum value in the column */
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
float32_t Xchg, in = 0.0f, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
+ float32x4_t vec1;
+ float32x4_t vec2;
+ float32x4_t tmpV;
#ifdef ARM_MATH_MATRIX_CHECK
-
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
|| (pSrc->numRows != pDst->numRows))
@@ -104,43 +101,476 @@ arm_status arm_mat_inverse_f32(
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+ /*--------------------------------------------------------------------------------------------------------------
+ * Matrix Inverse can be solved using elementary row operations.
+ *
+ * Gauss-Jordan Method:
+ *
+ * 1. First combine the identity matrix and the input matrix separated by a bar to form an
+ * augmented matrix as follows:
+ * _ _ _ _
+ * | a11 a12 | 1 0 | | X11 X12 |
+ * | | | = | |
+ * |_ a21 a22 | 0 1 _| |_ X21 X21 _|
+ *
+ * 2. In our implementation, pDst Matrix is used as identity matrix.
+ *
+ * 3. Begin with the first row. Let i = 1.
+ *
+ * 4. Check to see if the pivot for column i is the greatest of the column.
+ * The pivot is the element of the main diagonal that is on the current row.
+ * For instance, if working with row i, then the pivot element is aii.
+ * If the pivot is not the most significant of the columns, exchange that row with a row
+ * below it that does contain the most significant value in column i. If the most
+ * significant value of the column is zero, then an inverse to that matrix does not exist.
+ * The most significant value of the column is the absolute maximum.
+ *
+ * 5. Divide every element of row i by the pivot.
+ *
+ * 6. For every row below and row i, replace that row with the sum of that row and
+ * a multiple of row i so that each new element in column i below row i is zero.
+ *
+ * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+ * for every element below and above the main diagonal.
+ *
+ * 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
+ * Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
+ *----------------------------------------------------------------------------------------------------------------*/
+
+ /* Working pointer for destination matrix */
+ pOutT1 = pOut;
+
+ /* Loop over the number of rows */
+ rowCnt = numRows;
+
+ /* Making the destination matrix as identity matrix */
+ while (rowCnt > 0U)
+ {
+ /* Writing all zeroes in lower triangle of the destination matrix */
+ j = numRows - rowCnt;
+ while (j > 0U)
+ {
+ *pOutT1++ = 0.0f;
+ j--;
+ }
+
+ /* Writing all ones in the diagonal of the destination matrix */
+ *pOutT1++ = 1.0f;
+
+ /* Writing all zeroes in upper triangle of the destination matrix */
+ j = rowCnt - 1U;
+
+ while (j > 0U)
+ {
+ *pOutT1++ = 0.0f;
+ j--;
+ }
+
+ /* Decrement the loop counter */
+ rowCnt--;
+ }
+
+ /* Loop over the number of columns of the input matrix.
+ All the elements in each column are processed by the row operations */
+ loopCnt = numCols;
+
+ /* Index modifier to navigate through the columns */
+ l = 0U;
+
+ while (loopCnt > 0U)
+ {
+ /* Check if the pivot element is zero..
+ * If it is zero then interchange the row with non zero row below.
+ * If there is no non zero element to replace in the rows below,
+ * then the matrix is Singular. */
+
+ /* Working pointer for the input matrix that points
+ * to the pivot element of the particular row */
+ pInT1 = pIn + (l * numCols);
+
+ /* Working pointer for the destination matrix that points
+ * to the pivot element of the particular row */
+ pOutT1 = pOut + (l * numCols);
+
+ /* Temporary variable to hold the pivot value */
+ in = *pInT1;
+
+ /* Grab the most significant value from column l */
+ maxC = 0;
+
+ for (i = l; i < numRows; i++)
+ {
+ maxC = *pInT1 > 0 ? (*pInT1 > maxC ? *pInT1 : maxC) : (-*pInT1 > maxC ? -*pInT1 : maxC);
+ pInT1 += numCols;
+ }
+
+ /* Update the status if the matrix is singular */
+ if (maxC == 0.0f)
+ {
+ return ARM_MATH_SINGULAR;
+ }
+
+ /* Restore pInT1 */
+ pInT1 = pIn;
+
+ /* Destination pointer modifier */
+ k = 1U;
+
+ /* Check if the pivot element is the most significant of the column */
+ if ( (in > 0.0f ? in : -in) != maxC)
+ {
+ /* Loop over the number rows present below */
+ i = numRows - (l + 1U);
+
+ while (i > 0U)
+ {
+ /* Update the input and destination pointers */
+ pInT2 = pInT1 + (numCols * l);
+ pOutT2 = pOutT1 + (numCols * k);
+
+ /* Look for the most significant element to
+ * replace in the rows below */
+ if ((*pInT2 > 0.0f ? *pInT2: -*pInT2) == maxC)
+ {
+ /* Loop over number of columns
+ * to the right of the pilot element */
+ j = numCols - l;
+
+ while (j > 0U)
+ {
+ /* Exchange the row elements of the input matrix */
+ Xchg = *pInT2;
+ *pInT2++ = *pInT1;
+ *pInT1++ = Xchg;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Loop over number of columns of the destination matrix */
+ j = numCols;
+
+ while (j > 0U)
+ {
+ /* Exchange the row elements of the destination matrix */
+ Xchg = *pOutT2;
+ *pOutT2++ = *pOutT1;
+ *pOutT1++ = Xchg;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Flag to indicate whether exchange is done or not */
+ flag = 1U;
+
+ /* Break after exchange is done */
+ break;
+ }
+
+ /* Update the destination pointer modifier */
+ k++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+ }
+
+ /* Update the status if the matrix is singular */
+ if ((flag != 1U) && (in == 0.0f))
+ {
+ return ARM_MATH_SINGULAR;
+ }
+
+ /* Points to the pivot row of input and destination matrices */
+ pPivotRowIn = pIn + (l * numCols);
+ pPivotRowDst = pOut + (l * numCols);
+
+ /* Temporary pointers to the pivot row pointers */
+ pInT1 = pPivotRowIn;
+ pInT2 = pPivotRowDst;
+
+ /* Pivot element of the row */
+ in = *pPivotRowIn;
+ tmpV = vdupq_n_f32(1.0/in);
+
+ /* Loop over number of columns
+ * to the right of the pilot element */
+ j = (numCols - l) >> 2;
+
+ while (j > 0U)
+ {
+ /* Divide each element of the row of the input matrix
+ * by the pivot element */
+ vec1 = vld1q_f32(pInT1);
+
+ vec1 = vmulq_f32(vec1, tmpV);
+ vst1q_f32(pInT1, vec1);
+ pInT1 += 4;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Tail */
+ j = (numCols - l) & 3;
+
+ while (j > 0U)
+ {
+ /* Divide each element of the row of the input matrix
+ * by the pivot element */
+ in1 = *pInT1;
+ *pInT1++ = in1 / in;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Loop over number of columns of the destination matrix */
+ j = numCols >> 2;
+
+ while (j > 0U)
+ {
+ /* Divide each element of the row of the destination matrix
+ * by the pivot element */
+ vec1 = vld1q_f32(pInT2);
+
+ vec1 = vmulq_f32(vec1, tmpV);
+ vst1q_f32(pInT2, vec1);
+ pInT2 += 4;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Tail */
+ j = numCols & 3;
+
+ while (j > 0U)
+ {
+ /* Divide each element of the row of the destination matrix
+ * by the pivot element */
+ in1 = *pInT2;
+ *pInT2++ = in1 / in;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Replace the rows with the sum of that row and a multiple of row i
+ * so that each new element in column i above row i is zero.*/
+
+ /* Temporary pointers for input and destination matrices */
+ pInT1 = pIn;
+ pInT2 = pOut;
+
+ /* index used to check for pivot element */
+ i = 0U;
+
+ /* Loop over number of rows */
+ /* to be replaced by the sum of that row and a multiple of row i */
+ k = numRows;
+
+ while (k > 0U)
+ {
+ /* Check for the pivot element */
+ if (i == l)
+ {
+ /* If the processing element is the pivot element,
+ only the columns to the right are to be processed */
+ pInT1 += numCols - l;
+
+ pInT2 += numCols;
+ }
+ else
+ {
+ /* Element of the reference row */
+ in = *pInT1;
+ tmpV = vdupq_n_f32(in);
+
+ /* Working pointers for input and destination pivot rows */
+ pPRT_in = pPivotRowIn;
+ pPRT_pDst = pPivotRowDst;
+
+ /* Loop over the number of columns to the right of the pivot element,
+ to replace the elements in the input matrix */
+ j = (numCols - l) >> 2;
+
+ while (j > 0U)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ vec1 = vld1q_f32(pInT1);
+ vec2 = vld1q_f32(pPRT_in);
+ vec1 = vmlsq_f32(vec1, tmpV, vec2);
+ vst1q_f32(pInT1, vec1);
+ pPRT_in += 4;
+ pInT1 += 4;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Tail */
+ j = (numCols - l) & 3;
+
+ while (j > 0U)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ in1 = *pInT1;
+ *pInT1++ = in1 - (in * *pPRT_in++);
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Loop over the number of columns to
+ replace the elements in the destination matrix */
+ j = numCols >> 2;
+
+ while (j > 0U)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ vec1 = vld1q_f32(pInT2);
+ vec2 = vld1q_f32(pPRT_pDst);
+ vec1 = vmlsq_f32(vec1, tmpV, vec2);
+ vst1q_f32(pInT2, vec1);
+ pPRT_pDst += 4;
+ pInT2 += 4;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Tail */
+ j = numCols & 3;
+
+ while (j > 0U)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ in1 = *pInT2;
+ *pInT2++ = in1 - (in * *pPRT_pDst++);
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ }
+
+ /* Increment the temporary input pointer */
+ pInT1 = pInT1 + l;
+
+ /* Decrement the loop counter */
+ k--;
+
+ /* Increment the pivot index */
+ i++;
+ }
+
+ /* Increment the input pointer */
+ pIn++;
+
+ /* Decrement the loop counter */
+ loopCnt--;
+
+ /* Increment the index modifier */
+ l++;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+
+ if ((flag != 1U) && (in == 0.0f))
+ {
+ pIn = pSrc->pData;
+ for (i = 0; i < numRows * numCols; i++)
+ {
+ if (pIn[i] != 0.0f)
+ break;
+ }
+
+ if (i == numRows * numCols)
+ status = ARM_MATH_SINGULAR;
+ }
+ }
+ /* Return to application */
+ return (status);
+}
+#else
+arm_status arm_mat_inverse_f32(
+ const arm_matrix_instance_f32 * pSrc,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+ float32_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
+ float32_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
+ float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
+ uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
+ uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
+
+#if defined (ARM_MATH_DSP)
+ float32_t maxC; /* maximum value in the column */
+
+ float32_t Xchg, in = 0.0f, in1; /* Temporary input values */
+ uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
+ arm_status status; /* status of matrix inverse */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pSrc->numCols) ||
+ (pDst->numRows != pDst->numCols) ||
+ (pSrc->numRows != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
{
/*--------------------------------------------------------------------------------------------------------------
- * Matrix Inverse can be solved using elementary row operations.
- *
- * Gauss-Jordan Method:
- *
- * 1. First combine the identity matrix and the input matrix separated by a bar to form an
- * augmented matrix as follows:
- * _ _ _ _
- * | a11 a12 | 1 0 | | X11 X12 |
- * | | | = | |
- * |_ a21 a22 | 0 1 _| |_ X21 X21 _|
- *
- * 2. In our implementation, pDst Matrix is used as identity matrix.
- *
- * 3. Begin with the first row. Let i = 1.
- *
- * 4. Check to see if the pivot for column i is the greatest of the column.
- * The pivot is the element of the main diagonal that is on the current row.
- * For instance, if working with row i, then the pivot element is aii.
- * If the pivot is not the most significant of the columns, exchange that row with a row
- * below it that does contain the most significant value in column i. If the most
- * significant value of the column is zero, then an inverse to that matrix does not exist.
- * The most significant value of the column is the absolute maximum.
- *
- * 5. Divide every element of row i by the pivot.
- *
- * 6. For every row below and row i, replace that row with the sum of that row and
- * a multiple of row i so that each new element in column i below row i is zero.
- *
- * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
- * for every element below and above the main diagonal.
- *
- * 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
- * Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
- *----------------------------------------------------------------------------------------------------------------*/
+ * Matrix Inverse can be solved using elementary row operations.
+ *
+ * Gauss-Jordan Method:
+ *
+ * 1. First combine the identity matrix and the input matrix separated by a bar to form an
+ * augmented matrix as follows:
+ * _ _ _ _
+ * | a11 a12 | 1 0 | | X11 X12 |
+ * | | | = | |
+ * |_ a21 a22 | 0 1 _| |_ X21 X21 _|
+ *
+ * 2. In our implementation, pDst Matrix is used as identity matrix.
+ *
+ * 3. Begin with the first row. Let i = 1.
+ *
+ * 4. Check to see if the pivot for column i is the greatest of the column.
+ * The pivot is the element of the main diagonal that is on the current row.
+ * For instance, if working with row i, then the pivot element is aii.
+ * If the pivot is not the most significant of the columns, exchange that row with a row
+ * below it that does contain the most significant value in column i. If the most
+ * significant value of the column is zero, then an inverse to that matrix does not exist.
+ * The most significant value of the column is the absolute maximum.
+ *
+ * 5. Divide every element of row i by the pivot.
+ *
+ * 6. For every row below and row i, replace that row with the sum of that row and
+ * a multiple of row i so that each new element in column i below row i is zero.
+ *
+ * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+ * for every element below and above the main diagonal.
+ *
+ * 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
+ * Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
+ *----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
@@ -170,7 +600,7 @@ arm_status arm_mat_inverse_f32(
j--;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
rowCnt--;
}
@@ -260,7 +690,7 @@ arm_status arm_mat_inverse_f32(
*pOutT2++ = *pOutT1;
*pOutT1++ = Xchg;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
j--;
}
@@ -274,7 +704,7 @@ arm_status arm_mat_inverse_f32(
/* Update the destination pointer modifier */
k++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
i--;
}
}
@@ -385,19 +815,19 @@ arm_status arm_mat_inverse_f32(
in1 = *pInT2;
*pInT2++ = in1 - (in * *pPRT_pDst++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
j--;
}
}
- /* Increment the temporary input pointer */
+ /* Increment temporary input pointer */
pInT1 = pInT1 + l;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
- /* Increment the pivot index */
+ /* Increment pivot index */
i++;
}
@@ -414,8 +844,6 @@ arm_status arm_mat_inverse_f32(
#else
- /* Run the below code for Cortex-M0 */
-
float32_t Xchg, in = 0.0f; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
@@ -423,50 +851,53 @@ arm_status arm_mat_inverse_f32(
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
- if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
- || (pSrc->numRows != pDst->numRows))
+ if ((pSrc->numRows != pSrc->numCols) ||
+ (pDst->numRows != pDst->numCols) ||
+ (pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
{
/*--------------------------------------------------------------------------------------------------------------
- * Matrix Inverse can be solved using elementary row operations.
- *
- * Gauss-Jordan Method:
- *
- * 1. First combine the identity matrix and the input matrix separated by a bar to form an
- * augmented matrix as follows:
- * _ _ _ _ _ _ _ _
- * | | a11 a12 | | | 1 0 | | | X11 X12 |
- * | | | | | | | = | |
- * |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
- *
- * 2. In our implementation, pDst Matrix is used as identity matrix.
- *
- * 3. Begin with the first row. Let i = 1.
- *
- * 4. Check to see if the pivot for row i is zero.
- * The pivot is the element of the main diagonal that is on the current row.
- * For instance, if working with row i, then the pivot element is aii.
- * If the pivot is zero, exchange that row with a row below it that does not
- * contain a zero in column i. If this is not possible, then an inverse
- * to that matrix does not exist.
- *
- * 5. Divide every element of row i by the pivot.
- *
- * 6. For every row below and row i, replace that row with the sum of that row and
- * a multiple of row i so that each new element in column i below row i is zero.
- *
- * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
- * for every element below and above the main diagonal.
- *
- * 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
- * Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
- *----------------------------------------------------------------------------------------------------------------*/
+ * Matrix Inverse can be solved using elementary row operations.
+ *
+ * Gauss-Jordan Method:
+ *
+ * 1. First combine the identity matrix and the input matrix separated by a bar to form an
+ * augmented matrix as follows:
+ * _ _ _ _ _ _ _ _
+ * | | a11 a12 | | | 1 0 | | | X11 X12 |
+ * | | | | | | | = | |
+ * |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
+ *
+ * 2. In our implementation, pDst Matrix is used as identity matrix.
+ *
+ * 3. Begin with the first row. Let i = 1.
+ *
+ * 4. Check to see if the pivot for row i is zero.
+ * The pivot is the element of the main diagonal that is on the current row.
+ * For instance, if working with row i, then the pivot element is aii.
+ * If the pivot is zero, exchange that row with a row below it that does not
+ * contain a zero in column i. If this is not possible, then an inverse
+ * to that matrix does not exist.
+ *
+ * 5. Divide every element of row i by the pivot.
+ *
+ * 6. For every row below and row i, replace that row with the sum of that row and
+ * a multiple of row i so that each new element in column i below row i is zero.
+ *
+ * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+ * for every element below and above the main diagonal.
+ *
+ * 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
+ * Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
+ *----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
@@ -496,7 +927,7 @@ arm_status arm_mat_inverse_f32(
j--;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
rowCnt--;
}
@@ -506,7 +937,7 @@ arm_status arm_mat_inverse_f32(
/* Index modifier to navigate through the columns */
l = 0U;
- //for(loopCnt = 0U; loopCnt < numCols; loopCnt++)
+
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
@@ -640,6 +1071,7 @@ arm_status arm_mat_inverse_f32(
*pInT1 = *pInT1 - (in * *pPRT_in++);
pInT1++;
}
+
/* Loop over the number of columns to
replace the elements in the destination matrix */
for (j = 0U; j < numCols; j++)
@@ -651,19 +1083,21 @@ arm_status arm_mat_inverse_f32(
}
}
- /* Increment the temporary input pointer */
+
+ /* Increment temporary input pointer */
pInT1 = pInT1 + l;
}
+
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
+
/* Increment the index modifier */
l++;
}
-
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
@@ -682,10 +1116,12 @@ arm_status arm_mat_inverse_f32(
status = ARM_MATH_SINGULAR;
}
}
+
/* Return to application */
return (status);
}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of MatrixInv group
+ @} end of MatrixInv group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_inverse_f64.c b/DSP/Source/MatrixFunctions/arm_mat_inverse_f64.c
index 54e5982..4607e07 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_inverse_f64.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_inverse_f64.c
@@ -3,13 +3,13 @@
* Title: arm_mat_inverse_f64.c
* Description: Floating-point matrix inverse
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,50 +29,28 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
+ */
+
+
+/**
+ @addtogroup MatrixInv
+ @{
*/
/**
- * @defgroup MatrixInv Matrix Inverse
- *
- * Computes the inverse of a matrix.
- *
- * The inverse is defined only if the input matrix is square and non-singular (the determinant
- * is non-zero). The function checks that the input and output matrices are square and of the
- * same size.
- *
- * Matrix inversion is numerically sensitive and the CMSIS DSP library only supports matrix
- * inversion of floating-point matrices.
- *
- * \par Algorithm
- * The Gauss-Jordan method is used to find the inverse.
- * The algorithm performs a sequence of elementary row-operations until it
- * reduces the input matrix to an identity matrix. Applying the same sequence
- * of elementary row-operations to an identity matrix yields the inverse matrix.
- * If the input matrix is singular, then the algorithm terminates and returns error status
- * ARM_MATH_SINGULAR.
- * \image html MatrixInverse.gif "Matrix Inverse of a 3 x 3 matrix using Gauss-Jordan Method"
- */
-
-/**
- * @addtogroup MatrixInv
- * @{
- */
-
-/**
- * @brief Floating-point matrix inverse.
- * @param[in] *pSrc points to input matrix structure
- * @param[out] *pDst points to output matrix structure
- * @return The function returns
- * ARM_MATH_SIZE_MISMATCH if the input matrix is not square or if the size
- * of the output matrix does not match the size of the input matrix.
- * If the input matrix is found to be singular (non-invertible), then the function returns
- * ARM_MATH_SINGULAR. Otherwise, the function returns ARM_MATH_SUCCESS.
+ @brief Floating-point (64 bit) matrix inverse.
+ @param[in] pSrc points to input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ - \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible)
*/
arm_status arm_mat_inverse_f64(
const arm_matrix_instance_f64 * pSrc,
- arm_matrix_instance_f64 * pDst)
+ arm_matrix_instance_f64 * pDst)
{
float64_t *pIn = pSrc->pData; /* input data matrix pointer */
float64_t *pOut = pDst->pData; /* output data matrix pointer */
@@ -85,62 +63,61 @@ arm_status arm_mat_inverse_f64(
#if defined (ARM_MATH_DSP)
float64_t maxC; /* maximum value in the column */
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- float64_t Xchg, in = 0.0f, in1; /* Temporary input values */
+ float64_t Xchg, in = 0.0, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
-
/* Check for matrix mismatch condition */
- if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
- || (pSrc->numRows != pDst->numRows))
+ if ((pSrc->numRows != pSrc->numCols) ||
+ (pDst->numRows != pDst->numCols) ||
+ (pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
- * Matrix Inverse can be solved using elementary row operations.
- *
- * Gauss-Jordan Method:
- *
- * 1. First combine the identity matrix and the input matrix separated by a bar to form an
- * augmented matrix as follows:
- * _ _ _ _
- * | a11 a12 | 1 0 | | X11 X12 |
- * | | | = | |
- * |_ a21 a22 | 0 1 _| |_ X21 X21 _|
- *
- * 2. In our implementation, pDst Matrix is used as identity matrix.
- *
- * 3. Begin with the first row. Let i = 1.
- *
- * 4. Check to see if the pivot for column i is the greatest of the column.
- * The pivot is the element of the main diagonal that is on the current row.
- * For instance, if working with row i, then the pivot element is aii.
- * If the pivot is not the most significant of the columns, exchange that row with a row
- * below it that does contain the most significant value in column i. If the most
- * significant value of the column is zero, then an inverse to that matrix does not exist.
- * The most significant value of the column is the absolute maximum.
- *
- * 5. Divide every element of row i by the pivot.
- *
- * 6. For every row below and row i, replace that row with the sum of that row and
- * a multiple of row i so that each new element in column i below row i is zero.
- *
- * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
- * for every element below and above the main diagonal.
- *
- * 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
- * Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
- *----------------------------------------------------------------------------------------------------------------*/
+ * Matrix Inverse can be solved using elementary row operations.
+ *
+ * Gauss-Jordan Method:
+ *
+ * 1. First combine the identity matrix and the input matrix separated by a bar to form an
+ * augmented matrix as follows:
+ * _ _ _ _
+ * | a11 a12 | 1 0 | | X11 X12 |
+ * | | | = | |
+ * |_ a21 a22 | 0 1 _| |_ X21 X21 _|
+ *
+ * 2. In our implementation, pDst Matrix is used as identity matrix.
+ *
+ * 3. Begin with the first row. Let i = 1.
+ *
+ * 4. Check to see if the pivot for column i is the greatest of the column.
+ * The pivot is the element of the main diagonal that is on the current row.
+ * For instance, if working with row i, then the pivot element is aii.
+ * If the pivot is not the most significant of the columns, exchange that row with a row
+ * below it that does contain the most significant value in column i. If the most
+ * significant value of the column is zero, then an inverse to that matrix does not exist.
+ * The most significant value of the column is the absolute maximum.
+ *
+ * 5. Divide every element of row i by the pivot.
+ *
+ * 6. For every row below and row i, replace that row with the sum of that row and
+ * a multiple of row i so that each new element in column i below row i is zero.
+ *
+ * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+ * for every element below and above the main diagonal.
+ *
+ * 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
+ * Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
+ *----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
@@ -155,22 +132,22 @@ arm_status arm_mat_inverse_f64(
j = numRows - rowCnt;
while (j > 0U)
{
- *pOutT1++ = 0.0f;
+ *pOutT1++ = 0.0;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
- *pOutT1++ = 1.0f;
+ *pOutT1++ = 1.0;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
- *pOutT1++ = 0.0f;
+ *pOutT1++ = 0.0;
j--;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
rowCnt--;
}
@@ -208,7 +185,7 @@ arm_status arm_mat_inverse_f64(
}
/* Update the status if the matrix is singular */
- if (maxC == 0.0f)
+ if (maxC == 0.0)
{
return ARM_MATH_SINGULAR;
}
@@ -220,7 +197,7 @@ arm_status arm_mat_inverse_f64(
k = 1U;
/* Check if the pivot element is the most significant of the column */
- if ( (in > 0.0f ? in : -in) != maxC)
+ if ( (in > 0.0 ? in : -in) != maxC)
{
/* Loop over the number rows present below */
i = numRows - (l + 1U);
@@ -233,7 +210,7 @@ arm_status arm_mat_inverse_f64(
/* Look for the most significant element to
* replace in the rows below */
- if ((*pInT2 > 0.0f ? *pInT2: -*pInT2) == maxC)
+ if ((*pInT2 > 0.0 ? *pInT2: -*pInT2) == maxC)
{
/* Loop over number of columns
* to the right of the pilot element */
@@ -260,7 +237,7 @@ arm_status arm_mat_inverse_f64(
*pOutT2++ = *pOutT1;
*pOutT1++ = Xchg;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
j--;
}
@@ -274,13 +251,13 @@ arm_status arm_mat_inverse_f64(
/* Update the destination pointer modifier */
k++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
i--;
}
}
/* Update the status if the matrix is singular */
- if ((flag != 1U) && (in == 0.0f))
+ if ((flag != 1U) && (in == 0.0))
{
return ARM_MATH_SINGULAR;
}
@@ -385,19 +362,19 @@ arm_status arm_mat_inverse_f64(
in1 = *pInT2;
*pInT2++ = in1 - (in * *pPRT_pDst++);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
j--;
}
}
- /* Increment the temporary input pointer */
+ /* Increment temporary input pointer */
pInT1 = pInT1 + l;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
k--;
- /* Increment the pivot index */
+ /* Increment pivot index */
i++;
}
@@ -414,59 +391,60 @@ arm_status arm_mat_inverse_f64(
#else
- /* Run the below code for Cortex-M0 */
-
- float64_t Xchg, in = 0.0f; /* Temporary input values */
+ float64_t Xchg, in = 0.0; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
- if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
- || (pSrc->numRows != pDst->numRows))
+ if ((pSrc->numRows != pSrc->numCols) ||
+ (pDst->numRows != pDst->numCols) ||
+ (pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
{
/*--------------------------------------------------------------------------------------------------------------
- * Matrix Inverse can be solved using elementary row operations.
- *
- * Gauss-Jordan Method:
- *
- * 1. First combine the identity matrix and the input matrix separated by a bar to form an
- * augmented matrix as follows:
- * _ _ _ _ _ _ _ _
- * | | a11 a12 | | | 1 0 | | | X11 X12 |
- * | | | | | | | = | |
- * |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
- *
- * 2. In our implementation, pDst Matrix is used as identity matrix.
- *
- * 3. Begin with the first row. Let i = 1.
- *
- * 4. Check to see if the pivot for row i is zero.
- * The pivot is the element of the main diagonal that is on the current row.
- * For instance, if working with row i, then the pivot element is aii.
- * If the pivot is zero, exchange that row with a row below it that does not
- * contain a zero in column i. If this is not possible, then an inverse
- * to that matrix does not exist.
- *
- * 5. Divide every element of row i by the pivot.
- *
- * 6. For every row below and row i, replace that row with the sum of that row and
- * a multiple of row i so that each new element in column i below row i is zero.
- *
- * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
- * for every element below and above the main diagonal.
- *
- * 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
- * Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
- *----------------------------------------------------------------------------------------------------------------*/
+ * Matrix Inverse can be solved using elementary row operations.
+ *
+ * Gauss-Jordan Method:
+ *
+ * 1. First combine the identity matrix and the input matrix separated by a bar to form an
+ * augmented matrix as follows:
+ * _ _ _ _ _ _ _ _
+ * | | a11 a12 | | | 1 0 | | | X11 X12 |
+ * | | | | | | | = | |
+ * |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
+ *
+ * 2. In our implementation, pDst Matrix is used as identity matrix.
+ *
+ * 3. Begin with the first row. Let i = 1.
+ *
+ * 4. Check to see if the pivot for row i is zero.
+ * The pivot is the element of the main diagonal that is on the current row.
+ * For instance, if working with row i, then the pivot element is aii.
+ * If the pivot is zero, exchange that row with a row below it that does not
+ * contain a zero in column i. If this is not possible, then an inverse
+ * to that matrix does not exist.
+ *
+ * 5. Divide every element of row i by the pivot.
+ *
+ * 6. For every row below and row i, replace that row with the sum of that row and
+ * a multiple of row i so that each new element in column i below row i is zero.
+ *
+ * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+ * for every element below and above the main diagonal.
+ *
+ * 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
+ * Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
+ *----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
@@ -481,22 +459,22 @@ arm_status arm_mat_inverse_f64(
j = numRows - rowCnt;
while (j > 0U)
{
- *pOutT1++ = 0.0f;
+ *pOutT1++ = 0.0;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
- *pOutT1++ = 1.0f;
+ *pOutT1++ = 1.0;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
- *pOutT1++ = 0.0f;
+ *pOutT1++ = 0.0;
j--;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
rowCnt--;
}
@@ -506,7 +484,7 @@ arm_status arm_mat_inverse_f64(
/* Index modifier to navigate through the columns */
l = 0U;
- //for(loopCnt = 0U; loopCnt < numCols; loopCnt++)
+
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
@@ -529,7 +507,7 @@ arm_status arm_mat_inverse_f64(
k = 1U;
/* Check if the pivot element is zero */
- if (*pInT1 == 0.0f)
+ if (*pInT1 == 0.0)
{
/* Loop over the number rows present below */
for (i = (l + 1U); i < numRows; i++)
@@ -540,7 +518,7 @@ arm_status arm_mat_inverse_f64(
/* Check if there is a non zero pivot element to
* replace in the rows below */
- if (*pInT2 != 0.0f)
+ if (*pInT2 != 0.0)
{
/* Loop over number of columns
* to the right of the pilot element */
@@ -572,7 +550,7 @@ arm_status arm_mat_inverse_f64(
}
/* Update the status if the matrix is singular */
- if ((flag != 1U) && (in == 0.0f))
+ if ((flag != 1U) && (in == 0.0))
{
return ARM_MATH_SINGULAR;
}
@@ -640,6 +618,7 @@ arm_status arm_mat_inverse_f64(
*pInT1 = *pInT1 - (in * *pPRT_in++);
pInT1++;
}
+
/* Loop over the number of columns to
replace the elements in the destination matrix */
for (j = 0U; j < numCols; j++)
@@ -651,30 +630,32 @@ arm_status arm_mat_inverse_f64(
}
}
- /* Increment the temporary input pointer */
+
+ /* Increment temporary input pointer */
pInT1 = pInT1 + l;
}
+
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
+
/* Increment the index modifier */
l++;
}
-
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
- if ((flag != 1U) && (in == 0.0f))
+ if ((flag != 1U) && (in == 0.0))
{
pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++)
{
- if (pIn[i] != 0.0f)
+ if (pIn[i] != 0.0)
break;
}
@@ -682,10 +663,11 @@ arm_status arm_mat_inverse_f64(
status = ARM_MATH_SINGULAR;
}
}
+
/* Return to application */
return (status);
}
/**
- * @} end of MatrixInv group
+ @} end of MatrixInv group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_mult_f32.c b/DSP/Source/MatrixFunctions/arm_mat_mult_f32.c
index a038f2f..ffddf99 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_mult_f32.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_mult_f32.c
@@ -3,13 +3,13 @@
* Title: arm_mat_mult_f32.c
* Description: Floating-point matrix multiplication
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -62,6 +62,9 @@
* @return The function returns either
* ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
*/
+#if defined(ARM_MATH_NEON)
+
+#define GROUPOFROWS 8
arm_status arm_mat_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
@@ -78,32 +81,225 @@ arm_status arm_mat_mult_f32(
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t in1, in2, in3, in4;
- uint16_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
+ uint16_t col, i = 0U, j, row = numRowsA, rowCnt, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
-#ifdef ARM_MATH_MATRIX_CHECK
+ float32x4_t a0V, a1V, a2V, a3V, a4V, a5V, a6V, a7V;
+ float32x4_t acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,temp;
+ float32x2_t accum = vdup_n_f32(0);
+ float32_t *pIn1B = pSrcA->pData;
+ float32_t *pIn1C = pSrcA->pData;
+ float32_t *pIn1D = pSrcA->pData;
+ float32_t *pIn1E = pSrcA->pData;
+ float32_t *pIn1F = pSrcA->pData;
+ float32_t *pIn1G = pSrcA->pData;
+ float32_t *pIn1H = pSrcA->pData;
+ float32_t *pxB,*pxC, *pxD, *pxE, *pxF, *pxG, *pxH; /* Temporary output data matrix pointer */
+ float32_t sum0,sum1, sum2,sum3, sum4, sum5 , sum6, sum7;
+
+#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
{
-
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
- /* row loop */
- do
+ /* Row loop */
+ rowCnt = row >> 3;
+
+ while(rowCnt > 0)
+ {
+ /* Output pointer is set to starting address of the row being processed */
+ px = pOut + GROUPOFROWS*i;
+ pxB = px + numColsB;
+ pxC = px + 2*numColsB;
+ pxD = px + 3*numColsB;
+ pxE = px + 4*numColsB;
+ pxF = px + 5*numColsB;
+ pxG = px + 6*numColsB;
+ pxH = px + 7*numColsB;
+
+ /* For every row wise process, the column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, the pIn2 pointer is set
+ ** to the starting address of the pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ j = 0U;
+
+ /* Column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum0 = 0.0f;
+ sum1 = 0.0f;
+ sum2 = 0.0f;
+ sum3 = 0.0f;
+ sum4 = 0.0f;
+ sum5 = 0.0f;
+ sum6 = 0.0f;
+ sum7 = 0.0f;
+
+ /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+ pIn1 = pInA;
+ pIn1B = pIn1 + numColsA;
+ pIn1C = pIn1 + 2*numColsA;
+ pIn1D = pIn1 + 3*numColsA;
+ pIn1E = pIn1 + 4*numColsA;
+ pIn1F = pIn1 + 5*numColsA;
+ pIn1G = pIn1 + 6*numColsA;
+ pIn1H = pIn1 + 7*numColsA;
+
+ acc0 = vdupq_n_f32(0.0);
+ acc1 = vdupq_n_f32(0.0);
+ acc2 = vdupq_n_f32(0.0);
+ acc3 = vdupq_n_f32(0.0);
+ acc4 = vdupq_n_f32(0.0);
+ acc5 = vdupq_n_f32(0.0);
+ acc6 = vdupq_n_f32(0.0);
+ acc7 = vdupq_n_f32(0.0);
+
+ /* Compute 4 MACs simultaneously. */
+ colCnt = numColsA >> 2U;
+
+ /* Matrix multiplication */
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+ a0V = vld1q_f32(pIn1);
+ a1V = vld1q_f32(pIn1B);
+ a2V = vld1q_f32(pIn1C);
+ a3V = vld1q_f32(pIn1D);
+ a4V = vld1q_f32(pIn1E);
+ a5V = vld1q_f32(pIn1F);
+ a6V = vld1q_f32(pIn1G);
+ a7V = vld1q_f32(pIn1H);
+
+ pIn1 += 4;
+ pIn1B += 4;
+ pIn1C += 4;
+ pIn1D += 4;
+ pIn1E += 4;
+ pIn1F += 4;
+ pIn1G += 4;
+ pIn1H += 4;
+
+ temp[0] = *pIn2;
+ pIn2 += numColsB;
+ temp[1] = *pIn2;
+ pIn2 += numColsB;
+ temp[2] = *pIn2;
+ pIn2 += numColsB;
+ temp[3] = *pIn2;
+ pIn2 += numColsB;
+
+ acc0 = vmlaq_f32(acc0,a0V,temp);
+ acc1 = vmlaq_f32(acc1,a1V,temp);
+ acc2 = vmlaq_f32(acc2,a2V,temp);
+ acc3 = vmlaq_f32(acc3,a3V,temp);
+ acc4 = vmlaq_f32(acc4,a4V,temp);
+ acc5 = vmlaq_f32(acc5,a5V,temp);
+ acc6 = vmlaq_f32(acc6,a6V,temp);
+ acc7 = vmlaq_f32(acc7,a7V,temp);
+
+ /* Decrement the loop count */
+ colCnt--;
+ }
+
+ accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
+ sum0 += accum[0] + accum[1];
+
+ accum = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
+ sum1 += accum[0] + accum[1];
+
+ accum = vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2));
+ sum2 += accum[0] + accum[1];
+
+ accum = vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3));
+ sum3 += accum[0] + accum[1];
+
+ accum = vpadd_f32(vget_low_f32(acc4), vget_high_f32(acc4));
+ sum4 += accum[0] + accum[1];
+
+ accum = vpadd_f32(vget_low_f32(acc5), vget_high_f32(acc5));
+ sum5 += accum[0] + accum[1];
+
+ accum = vpadd_f32(vget_low_f32(acc6), vget_high_f32(acc6));
+ sum6 += accum[0] + accum[1];
+
+ accum = vpadd_f32(vget_low_f32(acc7), vget_high_f32(acc7));
+ sum7 += accum[0] + accum[1];
+
+ /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ colCnt = numColsA & 3;
+
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+ sum0 += *pIn1++ * (*pIn2);
+ sum1 += *pIn1B++ * (*pIn2);
+ sum2 += *pIn1C++ * (*pIn2);
+ sum3 += *pIn1D++ * (*pIn2);
+ sum4 += *pIn1E++ * (*pIn2);
+ sum5 += *pIn1F++ * (*pIn2);
+ sum6 += *pIn1G++ * (*pIn2);
+ sum7 += *pIn1H++ * (*pIn2);
+ pIn2 += numColsB;
+
+ /* Decrement the loop counter */
+ colCnt--;
+ }
+
+ /* Store the result in the destination buffer */
+ *px++ = sum0;
+ *pxB++ = sum1;
+ *pxC++ = sum2;
+ *pxD++ = sum3;
+ *pxE++ = sum4;
+ *pxF++ = sum5;
+ *pxG++ = sum6;
+ *pxH++ = sum7;
+
+ /* Update the pointer pIn2 to point to the starting address of the next column */
+ j++;
+ pIn2 = pSrcB->pData + j;
+
+ /* Decrement the column loop counter */
+ col--;
+
+ } while (col > 0U);
+
+ /* Update the pointer pInA to point to the starting address of the next row */
+ i = i + numColsB;
+ pInA = pInA + GROUPOFROWS*numColsA;
+
+ /* Decrement the row loop counter */
+ rowCnt--;
+ }
+
+ /*
+
+ i was the index of a group of rows computed by previous loop.
+ Now i is the index of a row since below code is computing row per row
+ and no more group of row per group of rows.
+
+ */
+
+ i = GROUPOFROWS*i;
+ rowCnt = row & 7;
+
+ while(rowCnt > 0)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + i;
@@ -117,7 +313,7 @@ arm_status arm_mat_mult_f32(
j = 0U;
- /* column loop */
+ /* Column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
@@ -126,43 +322,43 @@ arm_status arm_mat_mult_f32(
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pIn1 = pInA;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ acc0 = vdupq_n_f32(0.0);
+
+ /* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
- /* matrix multiplication */
+ /* Matrix multiplication */
while (colCnt > 0U)
{
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
- in3 = *pIn2;
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+ a0V = vld1q_f32(pIn1); // load & separate real/imag pSrcA (de-interleave 2)
+ pIn1 += 4;
+
+ temp[0] = *pIn2;
pIn2 += numColsB;
- in1 = pIn1[0];
- in2 = pIn1[1];
- sum += in1 * in3;
- in4 = *pIn2;
+ temp[1] = *pIn2;
+ pIn2 += numColsB;
+ temp[2] = *pIn2;
+ pIn2 += numColsB;
+ temp[3] = *pIn2;
pIn2 += numColsB;
- sum += in2 * in4;
- in3 = *pIn2;
- pIn2 += numColsB;
- in1 = pIn1[2];
- in2 = pIn1[3];
- sum += in1 * in3;
- in4 = *pIn2;
- pIn2 += numColsB;
- sum += in2 * in4;
- pIn1 += 4U;
+ acc0 = vmlaq_f32(acc0,a0V,temp);
/* Decrement the loop count */
colCnt--;
}
+ accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
+ sum += accum[0] + accum[1];
+
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
sum += *pIn1++ * (*pIn2);
pIn2 += numColsB;
@@ -182,40 +378,67 @@ arm_status arm_mat_mult_f32(
} while (col > 0U);
+
+ /* Update the pointer pInA to point to the starting address of the next row */
+ i = i + numColsB;
+ pInA = pInA + numColsA;
+
+ /* Decrement the row loop counter */
+ rowCnt--;
+
+ }
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
#else
-
- /* Run the below code for Cortex-M0 */
-
- float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
- uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
- arm_status status; /* status of matrix multiplication */
+arm_status arm_mat_mult_f32(
+ const arm_matrix_instance_f32 * pSrcA,
+ const arm_matrix_instance_f32 * pSrcB,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
+ float32_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
+ float32_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
+ float32_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
+ float32_t *pOut = pDst->pData; /* Output data matrix pointer */
+ float32_t *px; /* Temporary output data matrix pointer */
+ float32_t sum; /* Accumulator */
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
+ arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
- (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
{
-
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
- /* The following loop performs the dot-product of each row in pInA with each column in pInB */
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
- /* Output pointer is set to starting address of the row being processed */
+ /* Output pointer is set to starting address of row being processed */
px = pOut + i;
- /* For every row wise process, the column loop counter is to be initiated */
+ /* For every row wise process, column loop counter is to be initiated */
col = numColsB;
- /* For every row wise process, the pIn2 pointer is set
- ** to the starting address of the pSrcB data */
+ /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
@@ -224,43 +447,78 @@ arm_status arm_mat_mult_f32(
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0.0f;
- /* Initialize the pointer pIn1 to point to the starting address of the row being processed */
+ /* Initialize pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
- /* Matrix A columns number of MAC operations are to be performed */
- colCnt = numColsA;
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Loop unrolling: Compute 4 MACs at a time. */
+ colCnt = numColsA >> 2U;
+
+ /* matrix multiplication */
while (colCnt > 0U)
{
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
- sum += *pIn1++ * (*pIn2);
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ /* Perform the multiply-accumulates */
+ sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
- /* Decrement the loop counter */
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ /* Decrement loop counter */
colCnt--;
}
- /* Store the result in the destination buffer */
+ /* Loop unrolling: Compute remaining MACs */
+ colCnt = numColsA % 0x4U;
+
+#else
+
+ /* Initialize cntCnt with number of columns */
+ colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ /* Perform the multiply-accumulates */
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Store result in destination buffer */
*px++ = sum;
- /* Decrement the column loop counter */
+ /* Decrement column loop counter */
col--;
- /* Update the pointer pIn2 to point to the starting address of the next column */
+ /* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
-#endif /* #if defined (ARM_MATH_DSP) */
-
- /* Update the pointer pInA to point to the starting address of the next row */
+ /* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
- /* Decrement the row loop counter */
+ /* Decrement row loop counter */
row--;
} while (row > 0U);
+
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -269,6 +527,8 @@ arm_status arm_mat_mult_f32(
return (status);
}
+#endif /* #if defined(ARM_MATH_NEON) */
+
/**
* @} end of MatrixMult group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q15.c b/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q15.c
index 8d720c7..670ace1 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q15.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q15.c
@@ -3,13 +3,13 @@
* Title: arm_mat_mult_fast_q15.c
* Description: Q15 matrix multiplication (fast variant)
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,206 +29,165 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @addtogroup MatrixMult
- * @{
+ @addtogroup MatrixMult
+ @{
*/
-
/**
- * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
- * @param[in] *pSrcA points to the first input matrix structure
- * @param[in] *pSrcB points to the second input matrix structure
- * @param[out] *pDst points to output matrix structure
- * @param[in] *pState points to the array for storing intermediate results
- * @return The function returns either
- * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The difference between the function arm_mat_mult_q15() and this fast variant is that
- * the fast variant use a 32-bit rather than a 64-bit accumulator.
- * The result of each 1.15 x 1.15 multiplication is truncated to
- * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
- * format. Finally, the accumulator is saturated and converted to a 1.15 result.
- *
- * \par
- * The fast version has the same overflow behavior as the standard version but provides
- * less precision since it discards the low 16 bits of each multiplication result.
- * In order to avoid overflows completely the input signals must be scaled down.
- * Scale down one of the input matrices by log2(numColsA) bits to
- * avoid overflows, as a total of numColsA additions are computed internally for each
- * output element.
- *
- * \par
- * See arm_mat_mult_q15() for a slower implementation of this function
- * which uses 64-bit accumulation to provide higher precision.
+ @brief Q15 matrix multiplication (fast variant).
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @param[in] pState points to the array for storing intermediate results
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The difference between the function \ref arm_mat_mult_q15() and this fast variant is that
+ the fast variant use a 32-bit rather than a 64-bit accumulator.
+ The result of each 1.15 x 1.15 multiplication is truncated to
+ 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
+ format. Finally, the accumulator is saturated and converted to a 1.15 result.
+ @par
+ The fast version has the same overflow behavior as the standard version but provides
+ less precision since it discards the low 16 bits of each multiplication result.
+ In order to avoid overflows completely the input signals must be scaled down.
+ Scale down one of the input matrices by log2(numColsA) bits to avoid overflows,
+ as a total of numColsA additions are computed internally for each output element.
+ @remark
+ Refer to \ref arm_mat_mult_q15() for a slower implementation of this function
+ which uses 64-bit accumulation to provide higher precision.
*/
arm_status arm_mat_mult_fast_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
- arm_matrix_instance_q15 * pDst,
- q15_t * pState)
+ arm_matrix_instance_q15 * pDst,
+ q15_t * pState)
{
- q31_t sum; /* accumulator */
- q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */
- q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
- q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
- q15_t *px; /* Temporary output data matrix pointer */
- uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
- uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
- uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
- uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
- uint32_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */
- arm_status status; /* status of matrix multiplication */
-
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
- q31_t in; /* Temporary variable to hold the input value */
- q31_t inA1, inA2, inB1, inB2;
- q31_t sum2, sum3, sum4;
- q15_t *pInA2, *pInB2, *px2;
- uint32_t j = 0;
+ q31_t sum; /* Accumulator */
+ q15_t *pSrcBT = pState; /* Input data matrix pointer for transpose */
+ q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
+ q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
+ q15_t *px; /* Temporary output data matrix pointer */
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ uint16_t numRowsB = pSrcB->numRows; /* Number of rows of input matrix A */
+ uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
+ arm_status status; /* Status of matrix multiplication */
+#if defined (ARM_MATH_DSP)
+ q31_t in; /* Temporary variable to hold the input value */
+ q31_t inA1, inB1, inA2, inB2;
+ q31_t sum2, sum3, sum4;
+ q15_t *pInA2, *pInB2, *px2;
+ uint32_t j = 0;
#else
-
- q15_t in; /* Temporary variable to hold the input value */
- q15_t inA1, inA2, inB1, inB2;
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+ q15_t in; /* Temporary variable to hold the input value */
+ q15_t inA1, inB1, inA2, inB2;
+#endif /* #if defined (ARM_MATH_DSP) */
#ifdef ARM_MATH_MATRIX_CHECK
+
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
- (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
{
/* Matrix transpose */
do
{
- /* Apply loop unrolling and exchange the columns with row elements */
- col = numColsB >> 2;
-
- /* The pointer px is set to starting address of the column being processed */
+ /* The pointer px is set to starting address of column being processed */
px = pSrcBT + i;
+ /* Apply loop unrolling and exchange columns with row elements */
+ col = numColsB >> 2U;
+
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
-#ifndef UNALIGNED_SUPPORT_DISABLE
- /* Read two elements from the row */
- in = *__SIMD32(pInB)++;
- /* Unpack and store one element in the destination */
+#if defined (ARM_MATH_DSP)
+
+ /* Read two elements from row */
+ in = read_q15x2_ia ((q15_t **) &pInB);
+
+ /* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
-
*px = (q15_t) in;
-
#else
-
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Update the pointer px to point to the next row of the transposed matrix */
+ /* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
- /* Unpack and store the second element in the destination */
+ /* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
-
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-
#else
-
*px = (q15_t) in;
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Update the pointer px to point to the next row of the transposed matrix */
+ /* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
- /* Read two elements from the row */
- in = *__SIMD32(pInB)++;
-
- /* Unpack and store one element in the destination */
+ in = read_q15x2_ia ((q15_t **) &pInB);
#ifndef ARM_MATH_BIG_ENDIAN
-
*px = (q15_t) in;
-
#else
-
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Update the pointer px to point to the next row of the transposed matrix */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
- /* Unpack and store the second element in the destination */
-
#ifndef ARM_MATH_BIG_ENDIAN
-
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-
#else
-
*px = (q15_t) in;
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-#else
-
- /* Read one element from the row */
- in = *pInB++;
-
- /* Store one element in the destination */
- *px = in;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
- /* Read one element from the row */
+#else /* #if defined (ARM_MATH_DSP) */
+
+ /* Read one element from row */
in = *pInB++;
- /* Store one element in the destination */
+ /* Store one element in destination */
*px = in;
- /* Update the pointer px to point to the next row of the transposed matrix */
+ /* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
- /* Read one element from the row */
in = *pInB++;
-
- /* Store one element in the destination */
*px = in;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
- /* Read one element from the row */
in = *pInB++;
-
- /* Store one element in the destination */
*px = in;
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
- /* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
- /* Decrement the column loop counter */
+ in = *pInB++;
+ *px = in;
+ px += numRowsB;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement column loop counter */
col--;
}
@@ -238,31 +197,31 @@ arm_status arm_mat_mult_fast_q15(
while (col > 0U)
{
- /* Read and store the input element in the destination */
+ /* Read and store input element in destination */
*px = *pInB++;
- /* Update the pointer px to point to the next row of the transposed matrix */
+ /* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
- /* Decrement the column loop counter */
+ /* Decrement column loop counter */
col--;
}
i++;
- /* Decrement the row loop counter */
+ /* Decrement row loop counter */
row--;
} while (row > 0U);
- /* Reset the variables for the usage in the following multiplication process */
+ /* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
-#ifndef UNALIGNED_SUPPORT_DISABLE
+#if defined (ARM_MATH_DSP)
/* Process two rows from matrix A at a time and output two rows at a time */
- row = row >> 1;
+ row = row >> 1U;
px2 = px + numColsB;
#endif
@@ -270,29 +229,28 @@ arm_status arm_mat_mult_fast_q15(
/* row loop */
while (row > 0U)
{
- /* For every row wise process, the column loop counter is to be initiated */
+ /* For every row wise process, column loop counter is to be initiated */
col = numColsB;
- /* For every row wise process, the pIn2 pointer is set
- ** to the starting address of the transposed pSrcB data */
+ /* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
-#ifndef UNALIGNED_SUPPORT_DISABLE
+#if defined (ARM_MATH_DSP)
/* Process two (transposed) columns from matrix B at a time */
- col = col >> 1;
+ col = col >> 1U;
j = 0;
#endif
/* column loop */
while (col > 0U)
{
- /* Set the variable sum, that acts as accumulator, to zero */
+ /* Set variable sum, that acts as accumulator, to zero */
sum = 0;
- /* Initiate the pointer pInA to point to the starting address of the column being processed */
+ /* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i;
-#ifndef UNALIGNED_SUPPORT_DISABLE
+#if defined (ARM_MATH_DSP)
sum2 = 0;
sum3 = 0;
sum4 = 0;
@@ -301,56 +259,55 @@ arm_status arm_mat_mult_fast_q15(
pInB2 = pInB + numRowsB;
/* Read in two elements at once - alows dual MAC instruction */
- colCnt = numColsA >> 1;
+ colCnt = numColsA >> 1U;
#else
- colCnt = numColsA >> 2;
+ colCnt = numColsA >> 2U;
#endif
/* matrix multiplication */
while (colCnt > 0U)
{
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
-#ifndef UNALIGNED_SUPPORT_DISABLE
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
- inA1 = *__SIMD32(pInA)++;
- inB1 = *__SIMD32(pInB)++;
- inA2 = *__SIMD32(pInA2)++;
- inB2 = *__SIMD32(pInB2)++;
+#if defined (ARM_MATH_DSP)
+ /* read real and imag values from pSrcA and pSrcB buffer */
+ inA1 = read_q15x2_ia ((q15_t **) &pInA);
+ inB1 = read_q15x2_ia ((q15_t **) &pInB);
+ inA2 = read_q15x2_ia ((q15_t **) &pInA2);
+ inB2 = read_q15x2_ia ((q15_t **) &pInB2);
+
+ /* Multiply and Accumlates */
sum = __SMLAD(inA1, inB1, sum);
sum2 = __SMLAD(inA1, inB2, sum2);
sum3 = __SMLAD(inA2, inB1, sum3);
sum4 = __SMLAD(inA2, inB2, sum4);
-
#else
-
- inA1 = *pInA;
- inB1 = *pInB;
+ /* read real and imag values from pSrcA and pSrcB buffer */
+ inA1 = *pInA++;
+ inB1 = *pInB++;
+ /* Multiply and Accumlates */
sum += inA1 * inB1;
- inA2 = pInA[1];
- inB2 = pInB[1];
+ inA2 = *pInA++;
+ inB2 = *pInB++;
sum += inA2 * inB2;
- inA1 = pInA[2];
- inB1 = pInB[2];
+ inA1 = *pInA++;
+ inB1 = *pInB++;
sum += inA1 * inB1;
- inA2 = pInA[3];
- inB2 = pInB[3];
+ inA2 = *pInA++;
+ inB2 = *pInB++;
sum += inA2 * inB2;
+#endif /* #if defined (ARM_MATH_DSP) */
- pInA += 4;
- pInB += 4;
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
colCnt--;
}
/* process odd column samples */
-#ifndef UNALIGNED_SUPPORT_DISABLE
+#if defined (ARM_MATH_DSP)
if (numColsA & 1U) {
inA1 = *pInA++;
inB1 = *pInB++;
@@ -366,44 +323,45 @@ arm_status arm_mat_mult_fast_q15(
while (colCnt > 0U)
{
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
- sum += (q31_t) (*pInA++) * (*pInB++);
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+ sum += (q31_t) *pInA++ * *pInB++;
+ /* Decrement loop counter */
colCnt--;
}
-#endif
+#endif /* #if defined (ARM_MATH_DSP) */
- /* Saturate and store the result in the destination buffer */
+ /* Saturate and store result in destination buffer */
*px++ = (q15_t) (sum >> 15);
-#ifndef UNALIGNED_SUPPORT_DISABLE
+#if defined (ARM_MATH_DSP)
*px++ = (q15_t) (sum2 >> 15);
*px2++ = (q15_t) (sum3 >> 15);
*px2++ = (q15_t) (sum4 >> 15);
j += numRowsB * 2;
#endif
- /* Decrement the column loop counter */
+ /* Decrement column loop counter */
col--;
}
i = i + numColsA;
-#ifndef UNALIGNED_SUPPORT_DISABLE
+#if defined (ARM_MATH_DSP)
i = i + numColsA;
px = px2 + (numColsB & 1U);
px2 = px + numColsB;
#endif
- /* Decrement the row loop counter */
+ /* Decrement row loop counter */
row--;
}
/* Compute any remaining odd row/column below */
-#ifndef UNALIGNED_SUPPORT_DISABLE
+#if defined (ARM_MATH_DSP)
/* Compute remaining output column */
if (numColsB & 1U) {
@@ -412,7 +370,7 @@ arm_status arm_mat_mult_fast_q15(
row = numRowsA & (~0x1);
/* Point to remaining unfilled column in output matrix */
- px = pDst->pData+numColsB-1;
+ px = pDst->pData + numColsB-1;
pInA = pSrcA->pData;
/* row loop */
@@ -420,26 +378,26 @@ arm_status arm_mat_mult_fast_q15(
{
/* point to last column in matrix B */
- pInB = pSrcBT + numRowsB*(numColsB-1);
+ pInB = pSrcBT + numRowsB * (numColsB-1);
- /* Set the variable sum, that acts as accumulator, to zero */
+ /* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Compute 4 columns at once */
- colCnt = numColsA >> 2;
+ colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
- inA1 = *__SIMD32(pInA)++;
- inA2 = *__SIMD32(pInA)++;
- inB1 = *__SIMD32(pInB)++;
- inB2 = *__SIMD32(pInB)++;
+ inA1 = read_q15x2_ia ((q15_t **) &pInA);
+ inA2 = read_q15x2_ia ((q15_t **) &pInA);
+ inB1 = read_q15x2_ia ((q15_t **) &pInB);
+ inB2 = read_q15x2_ia ((q15_t **) &pInB);
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
colCnt--;
}
@@ -449,11 +407,11 @@ arm_status arm_mat_mult_fast_q15(
colCnt--;
}
- /* Store the result in the destination buffer */
- *px = (q15_t) (sum >> 15);
+ /* Store result in destination buffer */
+ *px = (q15_t) (sum >> 15);
px += numColsB;
- /* Decrement the row loop counter */
+ /* Decrement row loop counter */
row--;
}
}
@@ -462,7 +420,7 @@ arm_status arm_mat_mult_fast_q15(
if (numRowsA & 1U) {
/* point to last row in output matrix */
- px = pDst->pData+(numColsB)*(numRowsA-1);
+ px = pDst->pData + (numColsB) * (numRowsA-1);
pInB = pSrcBT;
col = numColsB;
@@ -471,48 +429,48 @@ arm_status arm_mat_mult_fast_q15(
/* col loop */
while (col > 0)
{
-
/* point to last row in matrix A */
- pInA = pSrcA->pData + (numRowsA-1)*numColsA;
+ pInA = pSrcA->pData + (numRowsA-1) * numColsA;
- /* Set the variable sum, that acts as accumulator, to zero */
+ /* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Compute 4 columns at once */
- colCnt = numColsA >> 2;
+ colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
- inA1 = *__SIMD32(pInA)++;
- inA2 = *__SIMD32(pInA)++;
- inB1 = *__SIMD32(pInB)++;
- inB2 = *__SIMD32(pInB)++;
+ inA1 = read_q15x2_ia ((q15_t **) &pInA);
+ inA2 = read_q15x2_ia ((q15_t **) &pInA);
+ inB1 = read_q15x2_ia ((q15_t **) &pInB);
+ inB2 = read_q15x2_ia ((q15_t **) &pInB);
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
colCnt--;
}
- colCnt = numColsA & 3U;
+ colCnt = numColsA % 4U;
while (colCnt > 0U) {
sum += (q31_t) (*pInA++) * (*pInB++);
+
colCnt--;
}
- /* Store the result in the destination buffer */
- *px++ = (q15_t) (sum >> 15);
+ /* Store result in destination buffer */
+ *px++ = (q15_t) (sum >> 15);
- /* Decrement the col loop counter */
+ /* Decrement column loop counter */
col--;
}
}
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+#endif /* #if defined (ARM_MATH_DSP) */
- /* set status as ARM_MATH_SUCCESS */
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -521,5 +479,5 @@ arm_status arm_mat_mult_fast_q15(
}
/**
- * @} end of MatrixMult group
+ @} end of MatrixMult group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q31.c b/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q31.c
index 78b33ef..011959a 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q31.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q31.c
@@ -3,13 +3,13 @@
* Title: arm_mat_mult_fast_q31.c
* Description: Q31 matrix multiplication (fast variant)
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,226 +29,166 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @addtogroup MatrixMult
- * @{
+ @addtogroup MatrixMult
+ @{
*/
/**
- * @brief Q31 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
- * @param[in] *pSrcA points to the first input matrix structure
- * @param[in] *pSrcB points to the second input matrix structure
- * @param[out] *pDst points to output matrix structure
- * @return The function returns either
- * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The difference between the function arm_mat_mult_q31() and this fast variant is that
- * the fast variant use a 32-bit rather than a 64-bit accumulator.
- * The result of each 1.31 x 1.31 multiplication is truncated to
- * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
- * format. Finally, the accumulator is saturated and converted to a 1.31 result.
- *
- * \par
- * The fast version has the same overflow behavior as the standard version but provides
- * less precision since it discards the low 32 bits of each multiplication result.
- * In order to avoid overflows completely the input signals must be scaled down.
- * Scale down one of the input matrices by log2(numColsA) bits to
- * avoid overflows, as a total of numColsA additions are computed internally for each
- * output element.
- *
- * \par
- * See arm_mat_mult_q31() for a slower implementation of this function
- * which uses 64-bit accumulation to provide higher precision.
+ @brief Q31 matrix multiplication (fast variant).
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The difference between the function \ref arm_mat_mult_q31() and this fast variant is that
+ the fast variant use a 32-bit rather than a 64-bit accumulator.
+ The result of each 1.31 x 1.31 multiplication is truncated to
+ 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
+ format. Finally, the accumulator is saturated and converted to a 1.31 result.
+ @par
+ The fast version has the same overflow behavior as the standard version but provides
+ less precision since it discards the low 32 bits of each multiplication result.
+ In order to avoid overflows completely the input signals must be scaled down.
+ Scale down one of the input matrices by log2(numColsA) bits to avoid overflows,
+ as a total of numColsA additions are computed internally for each output element.
+ @remark
+ Refer to \ref arm_mat_mult_q31() for a slower implementation of this function
+ which uses 64-bit accumulation to provide higher precision.
*/
arm_status arm_mat_mult_fast_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
- arm_matrix_instance_q31 * pDst)
+ arm_matrix_instance_q31 * pDst)
{
- q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
- q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
- q31_t *px; /* Temporary output data matrix pointer */
- q31_t sum; /* Accumulator */
- uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
- uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
- uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
- uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
- arm_status status; /* status of matrix multiplication */
- q31_t inA1, inB1;
-
-#if defined (ARM_MATH_DSP)
-
- q31_t sum2, sum3, sum4;
- q31_t inA2, inB2;
+ q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
+ q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pInA2;
+ q31_t *px; /* Temporary output data matrix pointer */
q31_t *px2;
+ q31_t sum1, sum2, sum3, sum4; /* Accumulator */
+ q31_t inA1, inA2, inB1, inB2;
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* Loop counters */
+ arm_status status; /* Status of matrix multiplication */
-#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
- (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
-
px = pDst->pData;
-#if defined (ARM_MATH_DSP)
- row = row >> 1;
+ row = row >> 1U;
px2 = px + numColsB;
-#endif
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
while (row > 0U)
{
-
- /* For every row wise process, the column loop counter is to be initiated */
+ /* For every row wise process, column loop counter is to be initiated */
col = numColsB;
- /* For every row wise process, the pIn2 pointer is set
- ** to the starting address of the pSrcB data */
+ /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pInB = pSrcB->pData;
j = 0U;
-#if defined (ARM_MATH_DSP)
- col = col >> 1;
-#endif
+ col = col >> 1U;
/* column loop */
while (col > 0U)
{
/* Set the variable sum, that acts as accumulator, to zero */
- sum = 0;
-
- /* Initiate data pointers */
- pInA = pSrcA->pData + i;
- pInB = pSrcB->pData + j;
-
-#if defined (ARM_MATH_DSP)
+ sum1 = 0;
sum2 = 0;
sum3 = 0;
sum4 = 0;
+
+ /* Initiate data pointers */
+ pInA = pSrcA->pData + i;
+ pInB = pSrcB->pData + j;
pInA2 = pInA + numColsA;
+
colCnt = numColsA;
-#else
- colCnt = numColsA >> 2;
-#endif
/* matrix multiplication */
while (colCnt > 0U)
{
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
-#if defined (ARM_MATH_DSP)
inA1 = *pInA++;
inB1 = pInB[0];
inA2 = *pInA2++;
inB2 = pInB[1];
pInB += numColsB;
- sum = __SMMLA(inA1, inB1, sum);
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(inA1, inB1, sum1);
sum2 = __SMMLA(inA1, inB2, sum2);
sum3 = __SMMLA(inA2, inB1, sum3);
sum4 = __SMMLA(inA2, inB2, sum4);
#else
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
- /* Perform the multiply-accumulates */
- inB1 = *pInB;
- pInB += numColsB;
- inA1 = pInA[0];
- sum = __SMMLA(inA1, inB1, sum);
-
- inB1 = *pInB;
- pInB += numColsB;
- inA1 = pInA[1];
- sum = __SMMLA(inA1, inB1, sum);
-
- inB1 = *pInB;
- pInB += numColsB;
- inA1 = pInA[2];
- sum = __SMMLA(inA1, inB1, sum);
-
- inB1 = *pInB;
- pInB += numColsB;
- inA1 = pInA[3];
- sum = __SMMLA(inA1, inB1, sum);
-
- pInA += 4U;
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
+ sum2 = (q31_t) ((((q63_t) sum2 << 32) + ((q63_t) inA1 * inB2)) >> 32);
+ sum3 = (q31_t) ((((q63_t) sum3 << 32) + ((q63_t) inA2 * inB1)) >> 32);
+ sum4 = (q31_t) ((((q63_t) sum4 << 32) + ((q63_t) inA2 * inB2)) >> 32);
#endif
- /* Decrement the loop counter */
+ /* Decrement loop counter */
colCnt--;
}
-#ifdef ARM_MATH_CM0_FAMILY
- /* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here. */
- colCnt = numColsA % 0x4U;
- while (colCnt > 0U)
- {
- sum = __SMMLA(*pInA++, *pInB, sum);
- pInB += numColsB;
- colCnt--;
- }
- j++;
-#endif
-
/* Convert the result from 2.30 to 1.31 format and store in destination buffer */
- *px++ = sum << 1;
-
-#if defined (ARM_MATH_DSP)
+ *px++ = sum1 << 1;
*px++ = sum2 << 1;
*px2++ = sum3 << 1;
*px2++ = sum4 << 1;
+
j += 2;
-#endif
- /* Decrement the column loop counter */
+ /* Decrement column loop counter */
col--;
-
}
- i = i + numColsA;
+ i = i + (numColsA << 1U);
+ px = px2 + (numColsB & 1U);
+ px2 = px + numColsB;
-#if defined (ARM_MATH_DSP)
- i = i + numColsA;
- px = px2 + (numColsB & 1U);
- px2 = px + numColsB;
-#endif
-
- /* Decrement the row loop counter */
+ /* Decrement row loop counter */
row--;
-
}
/* Compute any remaining odd row/column below */
-#if defined (ARM_MATH_DSP)
-
/* Compute remaining output column */
if (numColsB & 1U) {
/* Avoid redundant computation of last element */
- row = numRowsA & (~0x1);
+ row = numRowsA & (~1U);
/* Point to remaining unfilled column in output matrix */
- px = pDst->pData+numColsB-1;
+ px = pDst->pData + numColsB-1;
pInA = pSrcA->pData;
/* row loop */
@@ -258,49 +198,75 @@ arm_status arm_mat_mult_fast_q31(
/* point to last column in matrix B */
pInB = pSrcB->pData + numColsB-1;
- /* Set the variable sum, that acts as accumulator, to zero */
- sum = 0;
+ /* Set variable sum1, that acts as accumulator, to zero */
+ sum1 = 0;
- /* Compute 4 columns at once */
- colCnt = numColsA >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 columns at a time. */
+ colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
- inA1 = *pInA++;
- inA2 = *pInA++;
- inB1 = *pInB;
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(*pInA++, *pInB, sum1);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
+#endif
pInB += numColsB;
- inB2 = *pInB;
- pInB += numColsB;
- sum = __SMMLA(inA1, inB1, sum);
- sum = __SMMLA(inA2, inB2, sum);
- inA1 = *pInA++;
- inA2 = *pInA++;
- inB1 = *pInB;
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(*pInA++, *pInB, sum1);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
+#endif
pInB += numColsB;
- inB2 = *pInB;
- pInB += numColsB;
- sum = __SMMLA(inA1, inB1, sum);
- sum = __SMMLA(inA2, inB2, sum);
- /* Decrement the loop counter */
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(*pInA++, *pInB, sum1);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
+#endif
+ pInB += numColsB;
+
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(*pInA++, *pInB, sum1);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
+#endif
+ pInB += numColsB;
+
+ /* Decrement loop counter */
colCnt--;
}
- colCnt = numColsA & 3U;
+ /* Loop unrolling: Compute remaining column */
+ colCnt = numColsA % 4U;
+
+#else
+
+ /* Initialize colCnt with number of columns */
+ colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (colCnt > 0U) {
- sum = __SMMLA(*pInA++, *pInB, sum);
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(*pInA++, *pInB, sum1);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
+#endif
pInB += numColsB;
+
colCnt--;
}
/* Convert the result from 2.30 to 1.31 format and store in destination buffer */
- *px = sum << 1;
+ *px = sum1 << 1;
px += numColsB;
- /* Decrement the row loop counter */
+ /* Decrement row loop counter */
row--;
}
}
@@ -309,7 +275,7 @@ arm_status arm_mat_mult_fast_q31(
if (numRowsA & 1U) {
/* point to last row in output matrix */
- px = pDst->pData+(numColsB)*(numRowsA-1);
+ px = pDst->pData + (numColsB) * (numRowsA-1);
col = numColsB;
i = 0U;
@@ -319,14 +285,16 @@ arm_status arm_mat_mult_fast_q31(
{
/* point to last row in matrix A */
- pInA = pSrcA->pData + (numRowsA-1)*numColsA;
+ pInA = pSrcA->pData + (numRowsA-1) * numColsA;
pInB = pSrcB->pData + i;
- /* Set the variable sum, that acts as accumulator, to zero */
- sum = 0;
+ /* Set variable sum1, that acts as accumulator, to zero */
+ sum1 = 0;
- /* Compute 4 columns at once */
- colCnt = numColsA >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 columns at a time. */
+ colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
@@ -337,8 +305,13 @@ arm_status arm_mat_mult_fast_q31(
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
- sum = __SMMLA(inA1, inB1, sum);
- sum = __SMMLA(inA2, inB2, sum);
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(inA1, inB1, sum1);
+ sum1 = __SMMLA(inA2, inB2, sum1);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA2 * inB2)) >> 32);
+#endif
inA1 = *pInA++;
inA2 = *pInA++;
@@ -346,32 +319,49 @@ arm_status arm_mat_mult_fast_q31(
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
- sum = __SMMLA(inA1, inB1, sum);
- sum = __SMMLA(inA2, inB2, sum);
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(inA1, inB1, sum1);
+ sum1 = __SMMLA(inA2, inB2, sum1);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA2 * inB2)) >> 32);
+#endif
- /* Decrement the loop counter */
+ /* Decrement loop counter */
colCnt--;
}
- colCnt = numColsA & 3U;
+ /* Loop unrolling: Compute remaining column */
+ colCnt = numColsA % 4U;
+
+#else
+
+ /* Initialize colCnt with number of columns */
+ colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (colCnt > 0U) {
- sum = __SMMLA(*pInA++, *pInB, sum);
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(*pInA++, *pInB, sum1);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
+#endif
pInB += numColsB;
+
colCnt--;
}
/* Saturate and store the result in the destination buffer */
- *px++ = sum << 1;
+ *px++ = sum1 << 1;
i++;
- /* Decrement the col loop counter */
+ /* Decrement col loop counter */
col--;
}
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
- /* set status as ARM_MATH_SUCCESS */
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -380,5 +370,5 @@ arm_status arm_mat_mult_fast_q31(
}
/**
- * @} end of MatrixMult group
+ @} end of MatrixMult group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_mult_q15.c b/DSP/Source/MatrixFunctions/arm_mat_mult_q15.c
index 3244f47..1d2b69c 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_mult_q15.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_mult_q15.c
@@ -3,13 +3,13 @@
* Title: arm_mat_mult_q15.c
* Description: Q15 matrix multiplication
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,206 +29,129 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @addtogroup MatrixMult
- * @{
+ @addtogroup MatrixMult
+ @{
*/
-
/**
- * @brief Q15 matrix multiplication
- * @param[in] *pSrcA points to the first input matrix structure
- * @param[in] *pSrcB points to the second input matrix structure
- * @param[out] *pDst points to output matrix structure
- * @param[in] *pState points to the array for storing intermediate results (Unused)
- * @return The function returns either
- * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using a 64-bit internal accumulator. The inputs to the
- * multiplications are in 1.15 format and multiplications yield a 2.30 result.
- * The 2.30 intermediate
- * results are accumulated in a 64-bit accumulator in 34.30 format. This approach
- * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
- * truncated to 34.15 format by discarding the low 15 bits and then saturated to
- * 1.15 format.
- *
- * \par
- * Refer to arm_mat_mult_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
- *
+ @brief Q15 matrix multiplication.
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @param[in] pState points to the array for storing intermediate results (Unused)
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator. The inputs to the
+ multiplications are in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ This approach provides 33 guard bits and there is no risk of overflow.
+ The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits
+ and then saturated to 1.15 format.
+ @par
+ Refer to \ref arm_mat_mult_fast_q15() for a faster but less precise version of this function.
*/
arm_status arm_mat_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
- arm_matrix_instance_q15 * pDst,
- q15_t * pState)
+ arm_matrix_instance_q15 * pDst,
+ q15_t * pState)
{
- q63_t sum; /* accumulator */
+ q63_t sum; /* Accumulator */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_DSP) /* != CM0 */
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */
- q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
- q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
- q15_t *px; /* Temporary output data matrix pointer */
- uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
- uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
- uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
- uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
- uint16_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */
- arm_status status; /* status of matrix multiplication */
-
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
- q31_t in; /* Temporary variable to hold the input value */
- q31_t pSourceA1, pSourceB1, pSourceA2, pSourceB2;
-
-#else
-
- q15_t in; /* Temporary variable to hold the input value */
- q15_t inA1, inB1, inA2, inB2;
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+ q15_t *pSrcBT = pState; /* Input data matrix pointer for transpose */
+ q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
+ q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
+ q15_t *px; /* Temporary output data matrix pointer */
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ uint16_t numRowsB = pSrcB->numRows; /* Number of rows of input matrix A */
+ uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
+ arm_status status; /* Status of matrix multiplication */
+
+ q31_t in; /* Temporary variable to hold the input value */
+ q31_t inA1, inB1, inA2, inB2;
#ifdef ARM_MATH_MATRIX_CHECK
+
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
- (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
{
/* Matrix transpose */
do
{
- /* Apply loop unrolling and exchange the columns with row elements */
- col = numColsB >> 2;
-
- /* The pointer px is set to starting address of the column being processed */
+ /* The pointer px is set to starting address of column being processed */
px = pSrcBT + i;
+ /* Apply loop unrolling and exchange columns with row elements */
+ col = numColsB >> 2U;
+
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
-#ifndef UNALIGNED_SUPPORT_DISABLE
+ /* Read two elements from row */
+ in = read_q15x2_ia ((q15_t **) &pInB);
- /* Read two elements from the row */
- in = *__SIMD32(pInB)++;
-
- /* Unpack and store one element in the destination */
+ /* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
-
*px = (q15_t) in;
-
#else
-
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
- /* Update the pointer px to point to the next row of the transposed matrix */
+ /* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
- /* Unpack and store the second element in the destination */
+ /* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
-
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-
#else
-
*px = (q15_t) in;
-
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
- /* Update the pointer px to point to the next row of the transposed matrix */
+ /* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
- /* Read two elements from the row */
- in = *__SIMD32(pInB)++;
+ /* Read two elements from row */
+ in = read_q15x2_ia ((q15_t **) &pInB);
- /* Unpack and store one element in the destination */
+ /* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
-
*px = (q15_t) in;
-
#else
-
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
- /* Unpack and store the second element in the destination */
-
#ifndef ARM_MATH_BIG_ENDIAN
-
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-
#else
-
*px = (q15_t) in;
-
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
-#else
-
- /* Read one element from the row */
- in = *pInB++;
-
- /* Store one element in the destination */
- *px = in;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
- px += numRowsB;
-
- /* Read one element from the row */
- in = *pInB++;
-
- /* Store one element in the destination */
- *px = in;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
- px += numRowsB;
-
- /* Read one element from the row */
- in = *pInB++;
-
- /* Store one element in the destination */
- *px = in;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
- px += numRowsB;
-
- /* Read one element from the row */
- in = *pInB++;
-
- /* Store one element in the destination */
- *px = in;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
- px += numRowsB;
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
- /* Decrement the column loop counter */
+ /* Decrement column loop counter */
col--;
}
@@ -238,24 +161,24 @@ arm_status arm_mat_mult_q15(
while (col > 0U)
{
- /* Read and store the input element in the destination */
+ /* Read and store input element in destination */
*px = *pInB++;
- /* Update the pointer px to point to the next row of the transposed matrix */
+ /* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
- /* Decrement the column loop counter */
+ /* Decrement column loop counter */
col--;
}
i++;
- /* Decrement the row loop counter */
+ /* Decrement row loop counter */
row--;
} while (row > 0U);
- /* Reset the variables for the usage in the following multiplication process */
+ /* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
@@ -264,123 +187,98 @@ arm_status arm_mat_mult_q15(
/* row loop */
do
{
- /* For every row wise process, the column loop counter is to be initiated */
+ /* For every row wise process, column loop counter is to be initiated */
col = numColsB;
- /* For every row wise process, the pIn2 pointer is set
- ** to the starting address of the transposed pSrcB data */
+ /* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
/* column loop */
do
{
- /* Set the variable sum, that acts as accumulator, to zero */
+ /* Set variable sum, that acts as accumulator, to zero */
sum = 0;
- /* Apply loop unrolling and compute 2 MACs simultaneously. */
- colCnt = numColsA >> 2;
-
- /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+ /* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i;
+ /* Apply loop unrolling and compute 2 MACs simultaneously. */
+ colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
-#ifndef UNALIGNED_SUPPORT_DISABLE
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* read real and imag values from pSrcA and pSrcB buffer */
- pSourceA1 = *__SIMD32(pInA)++;
- pSourceB1 = *__SIMD32(pInB)++;
+ inA1 = read_q15x2_ia ((q15_t **) &pInA);
+ inB1 = read_q15x2_ia ((q15_t **) &pInB);
- pSourceA2 = *__SIMD32(pInA)++;
- pSourceB2 = *__SIMD32(pInB)++;
+ inA2 = read_q15x2_ia ((q15_t **) &pInA);
+ inB2 = read_q15x2_ia ((q15_t **) &pInB);
/* Multiply and Accumlates */
- sum = __SMLALD(pSourceA1, pSourceB1, sum);
- sum = __SMLALD(pSourceA2, pSourceB2, sum);
+ sum = __SMLALD(inA1, inB1, sum);
+ sum = __SMLALD(inA2, inB2, sum);
-#else
- /* read real and imag values from pSrcA and pSrcB buffer */
- inA1 = *pInA++;
- inB1 = *pInB++;
- inA2 = *pInA++;
- /* Multiply and Accumlates */
- sum += inA1 * inB1;
- inB2 = *pInB++;
-
- inA1 = *pInA++;
- inB1 = *pInB++;
- /* Multiply and Accumlates */
- sum += inA2 * inB2;
- inA2 = *pInA++;
- inB2 = *pInB++;
-
- /* Multiply and Accumlates */
- sum += inA1 * inB1;
- sum += inA2 * inB2;
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
colCnt--;
}
/* process remaining column samples */
- colCnt = numColsA & 3U;
+ colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
sum += *pInA++ * *pInB++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
colCnt--;
}
- /* Saturate and store the result in the destination buffer */
+ /* Saturate and store result in destination buffer */
*px = (q15_t) (__SSAT((sum >> 15), 16));
px++;
- /* Decrement the column loop counter */
+ /* Decrement column loop counter */
col--;
} while (col > 0U);
i = i + numColsA;
- /* Decrement the row loop counter */
+ /* Decrement row loop counter */
row--;
} while (row > 0U);
-#else
+#else /* #if defined (ARM_MATH_DSP) */
- /* Run the below code for Cortex-M0 */
-
- q15_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
- q15_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
- q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
- q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
- q15_t *pOut = pDst->pData; /* output data matrix pointer */
- q15_t *px; /* Temporary output data matrix pointer */
- uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
- uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
- uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
- uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
- arm_status status; /* status of matrix multiplication */
+ q15_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
+ q15_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
+ q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
+ q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
+ q15_t *pOut = pDst->pData; /* Output data matrix pointer */
+ q15_t *px; /* Temporary output data matrix pointer */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
+ arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
- (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
+
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
@@ -391,11 +289,10 @@ arm_status arm_mat_mult_q15(
/* Output pointer is set to starting address of the row being processed */
px = pOut + i;
- /* For every row wise process, the column loop counter is to be initiated */
+ /* For every row wise process, column loop counter is to be initiated */
col = numColsB;
- /* For every row wise process, the pIn2 pointer is set
- ** to the starting address of the pSrcB data */
+ /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
@@ -404,7 +301,7 @@ arm_status arm_mat_mult_q15(
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
- /* Initiate the pointer pIn1 to point to the starting address of pSrcA */
+ /* Initiate pointer pIn1 to point to starting address of pSrcA */
pIn1 = pInA;
/* Matrix A columns number of MAC operations are to be performed */
@@ -413,38 +310,41 @@ arm_status arm_mat_mult_q15(
/* matrix multiplication */
while (colCnt > 0U)
{
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
- /* Perform the multiply-accumulates */
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ /* Perform multiply-accumulates */
sum += (q31_t) * pIn1++ * *pIn2;
pIn2 += numColsB;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
colCnt--;
}
- /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
- /* Saturate and store the result in the destination buffer */
+ /* Convert result from 34.30 to 1.15 format and store saturated value in destination buffer */
+
+ /* Saturate and store result in destination buffer */
*px++ = (q15_t) __SSAT((sum >> 15), 16);
- /* Decrement the column loop counter */
+ /* Decrement column loop counter */
col--;
- /* Update the pointer pIn2 to point to the starting address of the next column */
+ /* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
- /* Update the pointer pSrcA to point to the starting address of the next row */
+ /* Update pointer pSrcA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
- /* Decrement the row loop counter */
+ /* Decrement row loop counter */
row--;
} while (row > 0U);
#endif /* #if defined (ARM_MATH_DSP) */
- /* set status as ARM_MATH_SUCCESS */
+
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -453,5 +353,5 @@ arm_status arm_mat_mult_q15(
}
/**
- * @} end of MatrixMult group
+ @} end of MatrixMult group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_mult_q31.c b/DSP/Source/MatrixFunctions/arm_mat_mult_q31.c
index 9bd2b97..161e723 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_mult_q31.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_mult_q31.c
@@ -3,13 +3,13 @@
* Title: arm_mat_mult_q31.c
* Description: Q31 matrix multiplication
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,254 +29,168 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @addtogroup MatrixMult
- * @{
+ @addtogroup MatrixMult
+ @{
*/
/**
- * @brief Q31 matrix multiplication
- * @param[in] *pSrcA points to the first input matrix structure
- * @param[in] *pSrcB points to the second input matrix structure
- * @param[out] *pDst points to output matrix structure
- * @return The function returns either
- * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The accumulator has a 2.62 format and maintains full precision of the intermediate
- * multiplication results but provides only a single guard bit. There is no saturation
- * on intermediate additions. Thus, if the accumulator overflows it wraps around and
- * distorts the result. The input signals should be scaled down to avoid intermediate
- * overflows. The input is thus scaled down by log2(numColsA) bits
- * to avoid overflows, as a total of numColsA additions are performed internally.
- * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
- *
- * \par
- * See arm_mat_mult_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
- *
+ @brief Q31 matrix multiplication.
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate
+ multiplication results but provides only a single guard bit. There is no saturation
+ on intermediate additions. Thus, if the accumulator overflows it wraps around and
+ distorts the result. The input signals should be scaled down to avoid intermediate
+ overflows. The input is thus scaled down by log2(numColsA) bits
+ to avoid overflows, as a total of numColsA additions are performed internally.
+ The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+ @remark
+ Refer to \ref arm_mat_mult_fast_q31() for a faster but less precise implementation of this function.
*/
arm_status arm_mat_mult_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
- arm_matrix_instance_q31 * pDst)
+ arm_matrix_instance_q31 * pDst)
{
- q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
- q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
- q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
- q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ q31_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
+ q31_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
+ q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
+ q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
+ q31_t *pOut = pDst->pData; /* Output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
q63_t sum; /* Accumulator */
- uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
- uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
- uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- uint16_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
- arm_status status; /* status of matrix multiplication */
- q31_t a0, a1, a2, a3, b0, b1, b2, b3;
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
+ arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
-
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
- (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
- /* Output pointer is set to starting address of the row being processed */
+ /* Output pointer is set to starting address of row being processed */
px = pOut + i;
- /* For every row wise process, the column loop counter is to be initiated */
+ /* For every row wise process, column loop counter is to be initiated */
col = numColsB;
- /* For every row wise process, the pIn2 pointer is set
- ** to the starting address of the pSrcB data */
+ /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
- j = 0U;
-
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
- /* Initiate the pointer pIn1 to point to the starting address of pInA */
+ /* Initialize pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
- /* Apply loop unrolling and compute 4 MACs simultaneously. */
- colCnt = numColsA >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Loop unrolling: Compute 4 MACs at a time. */
+ colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
/* Perform the multiply-accumulates */
- b0 = *pIn2;
+ sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
- a0 = *pIn1++;
- a1 = *pIn1++;
-
- b1 = *pIn2;
- pIn2 += numColsB;
- b2 = *pIn2;
+ sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
- sum += (q63_t) a0 *b0;
- sum += (q63_t) a1 *b1;
-
- a2 = *pIn1++;
- a3 = *pIn1++;
-
- b3 = *pIn2;
+ sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
- sum += (q63_t) a2 *b2;
- sum += (q63_t) a3 *b3;
+ sum += (q63_t) *pIn1++ * *pIn2;
+ pIn2 += numColsB;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
colCnt--;
}
- /* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining MACs */
colCnt = numColsA % 0x4U;
- while (colCnt > 0U)
- {
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
- /* Perform the multiply-accumulates */
- sum += (q63_t) * pIn1++ * *pIn2;
- pIn2 += numColsB;
-
- /* Decrement the loop counter */
- colCnt--;
- }
-
- /* Convert the result from 2.62 to 1.31 format and store in destination buffer */
- *px++ = (q31_t) (sum >> 31);
-
- /* Update the pointer pIn2 to point to the starting address of the next column */
- j++;
- pIn2 = (pSrcB->pData) + j;
-
- /* Decrement the column loop counter */
- col--;
-
- } while (col > 0U);
-
#else
- /* Run the below code for Cortex-M0 */
-
- q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
- uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
- arm_status status; /* status of matrix multiplication */
-
-
-#ifdef ARM_MATH_MATRIX_CHECK
-
- /* Check for matrix mismatch condition */
- if ((pSrcA->numCols != pSrcB->numRows) ||
- (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
- {
- /* Set status as ARM_MATH_SIZE_MISMATCH */
- status = ARM_MATH_SIZE_MISMATCH;
- }
- else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
- {
- /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
- /* row loop */
- do
- {
- /* Output pointer is set to starting address of the row being processed */
- px = pOut + i;
-
- /* For every row wise process, the column loop counter is to be initiated */
- col = numColsB;
-
- /* For every row wise process, the pIn2 pointer is set
- ** to the starting address of the pSrcB data */
- pIn2 = pSrcB->pData;
-
- /* column loop */
- do
- {
- /* Set the variable sum, that acts as accumulator, to zero */
- sum = 0;
-
- /* Initiate the pointer pIn1 to point to the starting address of pInA */
- pIn1 = pInA;
-
- /* Matrix A columns number of MAC operations are to be performed */
+ /* Initialize cntCnt with number of columns */
colCnt = numColsA;
- /* matrix multiplication */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (colCnt > 0U)
{
- /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
/* Perform the multiply-accumulates */
- sum += (q63_t) * pIn1++ * *pIn2;
+ sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
colCnt--;
}
- /* Convert the result from 2.62 to 1.31 format and store in destination buffer */
- *px++ = (q31_t) clip_q63_to_q31(sum >> 31);
+ /* Convert result from 2.62 to 1.31 format and store in destination buffer */
+ *px++ = (q31_t) (sum >> 31);
- /* Decrement the column loop counter */
+ /* Decrement column loop counter */
col--;
- /* Update the pointer pIn2 to point to the starting address of the next column */
+ /* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
-#endif
-
- /* Update the pointer pInA to point to the starting address of the next row */
+ /* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
- /* Decrement the row loop counter */
+ /* Decrement row loop counter */
row--;
} while (row > 0U);
- /* set status as ARM_MATH_SUCCESS */
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
+
/* Return to application */
return (status);
}
/**
- * @} end of MatrixMult group
+ @} end of MatrixMult group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_scale_f32.c b/DSP/Source/MatrixFunctions/arm_mat_scale_f32.c
index dbc385a..a0097b1 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_scale_f32.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_scale_f32.c
@@ -3,13 +3,13 @@
* Title: arm_mat_scale_f32.c
* Description: Multiplies a floating-point matrix by a scalar
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,42 +29,42 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @defgroup MatrixScale Matrix Scale
- *
- * Multiplies a matrix by a scalar. This is accomplished by multiplying each element in the
- * matrix by the scalar. For example:
- * \image html MatrixScale.gif "Matrix Scaling of a 3 x 3 matrix"
- *
- * The function checks to make sure that the input and output matrices are of the same size.
- *
- * In the fixed-point Q15 and Q31 functions, scale is represented by
- * a fractional multiplication scaleFract and an arithmetic shift shift.
- * The shift allows the gain of the scaling operation to exceed 1.0.
- * The overall scale factor applied to the fixed-point data is
- *
- * scale = scaleFract * 2^shift.
- *
+ @defgroup MatrixScale Matrix Scale
+
+ Multiplies a matrix by a scalar. This is accomplished by multiplying each element in the
+ matrix by the scalar. For example:
+ \image html MatrixScale.gif "Matrix Scaling of a 3 x 3 matrix"
+
+ The function checks to make sure that the input and output matrices are of the same size.
+
+ In the fixed-point Q15 and Q31 functions, scale is represented by
+ a fractional multiplication scaleFract and an arithmetic shift shift.
+ The shift allows the gain of the scaling operation to exceed 1.0.
+ The overall scale factor applied to the fixed-point data is
+
+ scale = scaleFract * 2^shift.
+
*/
/**
- * @addtogroup MatrixScale
- * @{
+ @addtogroup MatrixScale
+ @{
*/
/**
- * @brief Floating-point matrix scaling.
- * @param[in] *pSrc points to input matrix structure
- * @param[in] scale scale factor to be applied
- * @param[out] *pDst points to output matrix structure
- * @return The function returns either ARM_MATH_SIZE_MISMATCH
- * or ARM_MATH_SUCCESS based on the outcome of size checking.
- *
+ @brief Floating-point matrix scaling.
+ @param[in] pSrc points to input matrix
+ @param[in] scale scale factor to be applied
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
-
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
arm_status arm_mat_scale_f32(
const arm_matrix_instance_f32 * pSrc,
float32_t scale,
@@ -76,12 +76,10 @@ arm_status arm_mat_scale_f32(
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix scaling */
-#if defined (ARM_MATH_DSP)
float32_t in1, in2, in3, in4; /* temporary variables */
float32_t out1, out2, out3, out4; /* temporary variables */
-#endif // #if defined (ARM_MATH_DSP)
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
@@ -93,37 +91,23 @@ arm_status arm_mat_scale_f32(
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
+ float32x4_t vec1;
+ float32x4_t res;
+
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- /* Loop Unrolling */
blkCnt = numSamples >> 2;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* Scaling and results are stored in the destination buffer. */
- in1 = pIn[0];
- in2 = pIn[1];
- in3 = pIn[2];
- in4 = pIn[3];
-
- out1 = in1 * scale;
- out2 = in2 * scale;
- out3 = in3 * scale;
- out4 = in4 * scale;
-
-
- pOut[0] = out1;
- pOut[1] = out2;
- pOut[2] = out3;
- pOut[3] = out4;
+ vec1 = vld1q_f32(pIn);
+ res = vmulq_f32(vec1, vdupq_n_f32(scale));
+ vst1q_f32(pOut, res);
/* update pointers to process next sampels */
pIn += 4U;
@@ -137,15 +121,6 @@ arm_status arm_mat_scale_f32(
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
-#else
-
- /* Run the below code for Cortex-M0 */
-
- /* Initialize blkCnt with number of samples */
- blkCnt = numSamples;
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
@@ -163,7 +138,84 @@ arm_status arm_mat_scale_f32(
/* Return to application */
return (status);
}
+#else
+arm_status arm_mat_scale_f32(
+ const arm_matrix_instance_f32 * pSrc,
+ float32_t scale,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn = pSrc->pData; /* Input data matrix pointer */
+ float32_t *pOut = pDst->pData; /* Output data matrix pointer */
+ uint32_t numSamples; /* Total number of elements in the matrix */
+ uint32_t blkCnt; /* Loop counters */
+ arm_status status; /* Status of matrix scaling */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numRows) ||
+ (pSrc->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Total number of samples in input matrix */
+ numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) * scale */
+
+ /* Scale and store result in destination buffer. */
+ *pOut++ = (*pIn++) * scale;
+ *pOut++ = (*pIn++) * scale;
+ *pOut++ = (*pIn++) * scale;
+ *pOut++ = (*pIn++) * scale;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) * scale */
+
+ /* Scale and store result in destination buffer. */
+ *pOut++ = (*pIn++) * scale;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of MatrixScale group
+ @} end of MatrixScale group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_scale_q15.c b/DSP/Source/MatrixFunctions/arm_mat_scale_q15.c
index af664ca..9b75d4e 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_scale_q15.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_scale_q15.c
@@ -3,13 +3,13 @@
* Title: arm_mat_scale_q15.c
* Description: Multiplies a Q15 matrix by a scalar
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,135 +29,134 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @addtogroup MatrixScale
- * @{
+ @addtogroup MatrixScale
+ @{
*/
/**
- * @brief Q15 matrix scaling.
- * @param[in] *pSrc points to input matrix
- * @param[in] scaleFract fractional portion of the scale factor
- * @param[in] shift number of bits to shift the result by
- * @param[out] *pDst points to output matrix structure
- * @return The function returns either
- * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
- *
- * @details
- * Scaling and Overflow Behavior:
- * \par
- * The input data *pSrc and scaleFract are in 1.15 format.
- * These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
+ @brief Q15 matrix scaling.
+ @param[in] pSrc points to input matrix
+ @param[in] scaleFract fractional portion of the scale factor
+ @param[in] shift number of bits to shift the result by
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The input data *pSrc and scaleFract are in 1.15 format.
+ These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
*/
arm_status arm_mat_scale_q15(
const arm_matrix_instance_q15 * pSrc,
- q15_t scaleFract,
- int32_t shift,
- arm_matrix_instance_q15 * pDst)
+ q15_t scaleFract,
+ int32_t shift,
+ arm_matrix_instance_q15 * pDst)
{
- q15_t *pIn = pSrc->pData; /* input data matrix pointer */
- q15_t *pOut = pDst->pData; /* output data matrix pointer */
- uint32_t numSamples; /* total number of elements in the matrix */
- int32_t totShift = 15 - shift; /* total shift to apply after scaling */
- uint32_t blkCnt; /* loop counters */
- arm_status status; /* status of matrix scaling */
+ q15_t *pIn = pSrc->pData; /* Input data matrix pointer */
+ q15_t *pOut = pDst->pData; /* Output data matrix pointer */
+ uint32_t numSamples; /* Total number of elements in the matrix */
+ uint32_t blkCnt; /* Loop counter */
+ arm_status status; /* Status of matrix scaling */
+ int32_t kShift = 15 - shift; /* Total shift to apply after scaling */
-#if defined (ARM_MATH_DSP)
-
- q15_t in1, in2, in3, in4;
- q31_t out1, out2, out3, out4;
- q31_t inA1, inA2;
-
-#endif // #if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t inA1, inA2;
+ q31_t out1, out2, out3, out4; /* Temporary output variables */
+ q15_t in1, in2, in3, in4; /* Temporary input variables */
+#endif
#ifdef ARM_MATH_MATRIX_CHECK
- /* Check for matrix mismatch */
- if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numRows) ||
+ (pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif // #ifdef ARM_MATH_MATRIX_CHECK
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
{
- /* Total number of samples in the input matrix */
+ /* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- /* Loop Unrolling */
- blkCnt = numSamples >> 2;
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
- /* Scale, saturate and then store the results in the destination buffer. */
- /* Reading 2 inputs from memory */
- inA1 = _SIMD32_OFFSET(pIn);
- inA2 = _SIMD32_OFFSET(pIn + 2);
- /* C = A * scale */
- /* Scale the inputs and then store the 2 results in the destination buffer
+#if defined (ARM_MATH_DSP)
+ /* read 2 times 2 samples at a time from source */
+ inA1 = read_q15x2_ia ((q15_t **) &pIn);
+ inA2 = read_q15x2_ia ((q15_t **) &pIn);
+
+ /* Scale inputs and store result in temporary variables
* in single cycle by packing the outputs */
out1 = (q31_t) ((q15_t) (inA1 >> 16) * scaleFract);
- out2 = (q31_t) ((q15_t) inA1 * scaleFract);
+ out2 = (q31_t) ((q15_t) (inA1 ) * scaleFract);
out3 = (q31_t) ((q15_t) (inA2 >> 16) * scaleFract);
- out4 = (q31_t) ((q15_t) inA2 * scaleFract);
+ out4 = (q31_t) ((q15_t) (inA2 ) * scaleFract);
- out1 = out1 >> totShift;
- inA1 = _SIMD32_OFFSET(pIn + 4);
- out2 = out2 >> totShift;
- inA2 = _SIMD32_OFFSET(pIn + 6);
- out3 = out3 >> totShift;
- out4 = out4 >> totShift;
+ /* apply shifting */
+ out1 = out1 >> kShift;
+ out2 = out2 >> kShift;
+ out3 = out3 >> kShift;
+ out4 = out4 >> kShift;
+ /* saturate the output */
in1 = (q15_t) (__SSAT(out1, 16));
in2 = (q15_t) (__SSAT(out2, 16));
in3 = (q15_t) (__SSAT(out3, 16));
in4 = (q15_t) (__SSAT(out4, 16));
- _SIMD32_OFFSET(pOut) = __PKHBT(in2, in1, 16);
- _SIMD32_OFFSET(pOut + 2) = __PKHBT(in4, in3, 16);
+ /* store result to destination */
+ write_q15x2_ia (&pOut, __PKHBT(in2, in1, 16));
+ write_q15x2_ia (&pOut, __PKHBT(in4, in3, 16));
- /* update pointers to process next sampels */
- pIn += 4U;
- pOut += 4U;
+#else
+ *pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
+ *pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
+ *pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
+ *pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
+#endif
-
- /* Decrement the numSamples loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
- /* Scale, saturate and then store the results in the destination buffer. */
- *pOut++ =
- (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> totShift, 16));
- /* Decrement the numSamples loop counter */
+ /* Scale, saturate and store result in destination buffer. */
+ *pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
+
+ /* Decrement loop counter */
blkCnt--;
}
+
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -167,5 +166,5 @@ arm_status arm_mat_scale_q15(
}
/**
- * @} end of MatrixScale group
+ @} end of MatrixScale group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_scale_q31.c b/DSP/Source/MatrixFunctions/arm_mat_scale_q31.c
index d190cf1..929b17f 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_scale_q31.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_scale_q31.c
@@ -3,13 +3,13 @@
* Title: arm_mat_scale_q31.c
* Description: Multiplies a Q31 matrix by a scalar
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,152 +29,125 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @addtogroup MatrixScale
- * @{
+ @addtogroup MatrixScale
+ @{
*/
/**
- * @brief Q31 matrix scaling.
- * @param[in] *pSrc points to input matrix
- * @param[in] scaleFract fractional portion of the scale factor
- * @param[in] shift number of bits to shift the result by
- * @param[out] *pDst points to output matrix structure
- * @return The function returns either
- * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
- *
- * @details
- * Scaling and Overflow Behavior:
- * \par
- * The input data *pSrc and scaleFract are in 1.31 format.
- * These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
+ @brief Q31 matrix scaling.
+ @param[in] pSrc points to input matrix
+ @param[in] scaleFract fractional portion of the scale factor
+ @param[in] shift number of bits to shift the result by
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The input data *pSrc and scaleFract are in 1.31 format.
+ These are multiplied to yield a 2.62 intermediate result which is shifted with saturation to 1.31 format.
*/
arm_status arm_mat_scale_q31(
const arm_matrix_instance_q31 * pSrc,
- q31_t scaleFract,
- int32_t shift,
- arm_matrix_instance_q31 * pDst)
+ q31_t scaleFract,
+ int32_t shift,
+ arm_matrix_instance_q31 * pDst)
{
- q31_t *pIn = pSrc->pData; /* input data matrix pointer */
- q31_t *pOut = pDst->pData; /* output data matrix pointer */
- uint32_t numSamples; /* total number of elements in the matrix */
- int32_t totShift = shift + 1; /* shift to apply after scaling */
- uint32_t blkCnt; /* loop counters */
- arm_status status; /* status of matrix scaling */
- q31_t in1, in2, out1; /* temporary variabels */
-
-#if defined (ARM_MATH_DSP)
-
- q31_t in3, in4, out2, out3, out4; /* temporary variables */
-
-#endif // #ifndef ARM_MAT_CM0
+ q31_t *pIn = pSrc->pData; /* Input data matrix pointer */
+ q31_t *pOut = pDst->pData; /* Output data matrix pointer */
+ uint32_t numSamples; /* Total number of elements in the matrix */
+ uint32_t blkCnt; /* Loop counter */
+ arm_status status; /* Status of matrix scaling */
+ int32_t kShift = shift + 1; /* Shift to apply after scaling */
+ q31_t in, out; /* Temporary variabels */
#ifdef ARM_MATH_MATRIX_CHECK
- /* Check for matrix mismatch */
- if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numRows) ||
+ (pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif // #ifdef ARM_MATH_MATRIX_CHECK
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
{
- /* Total number of samples in the input matrix */
+ /* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- /* Loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
- /* Read values from input */
- in1 = *pIn;
- in2 = *(pIn + 1);
- in3 = *(pIn + 2);
- in4 = *(pIn + 3);
- /* multiply input with scaler value */
- in1 = ((q63_t) in1 * scaleFract) >> 32;
- in2 = ((q63_t) in2 * scaleFract) >> 32;
- in3 = ((q63_t) in3 * scaleFract) >> 32;
- in4 = ((q63_t) in4 * scaleFract) >> 32;
+ /* Scale, saturate and store result in destination buffer. */
+ in = *pIn++; /* read four inputs from source */
+ in = ((q63_t) in * scaleFract) >> 32; /* multiply input with scaler value */
+ out = in << kShift; /* apply shifting */
+ if (in != (out >> kShift)) /* saturate the results. */
+ out = 0x7FFFFFFF ^ (in >> 31);
+ *pOut++ = out; /* Store result destination */
- /* apply shifting */
- out1 = in1 << totShift;
- out2 = in2 << totShift;
+ in = *pIn++;
+ in = ((q63_t) in * scaleFract) >> 32;
+ out = in << kShift;
+ if (in != (out >> kShift))
+ out = 0x7FFFFFFF ^ (in >> 31);
+ *pOut++ = out;
- /* saturate the results. */
- if (in1 != (out1 >> totShift))
- out1 = 0x7FFFFFFF ^ (in1 >> 31);
+ in = *pIn++;
+ in = ((q63_t) in * scaleFract) >> 32;
+ out = in << kShift;
+ if (in != (out >> kShift))
+ out = 0x7FFFFFFF ^ (in >> 31);
+ *pOut++ = out;
- if (in2 != (out2 >> totShift))
- out2 = 0x7FFFFFFF ^ (in2 >> 31);
+ in = *pIn++;
+ in = ((q63_t) in * scaleFract) >> 32;
+ out = in << kShift;
+ if (in != (out >> kShift))
+ out = 0x7FFFFFFF ^ (in >> 31);
+ *pOut++ = out;
- out3 = in3 << totShift;
- out4 = in4 << totShift;
-
- *pOut = out1;
- *(pOut + 1) = out2;
-
- if (in3 != (out3 >> totShift))
- out3 = 0x7FFFFFFF ^ (in3 >> 31);
-
- if (in4 != (out4 >> totShift))
- out4 = 0x7FFFFFFF ^ (in4 >> 31);
-
-
- *(pOut + 2) = out3;
- *(pOut + 3) = out4;
-
- /* update pointers to process next sampels */
- pIn += 4U;
- pOut += 4U;
-
-
- /* Decrement the numSamples loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
- /* Scale, saturate and then store the results in the destination buffer. */
- in1 = *pIn++;
- in2 = ((q63_t) in1 * scaleFract) >> 32;
+ /* Scale, saturate and store result in destination buffer. */
+ in = *pIn++;
+ in = ((q63_t) in * scaleFract) >> 32;
+ out = in << kShift;
+ if (in != (out >> kShift))
+ out = 0x7FFFFFFF ^ (in >> 31);
+ *pOut++ = out;
- out1 = in2 << totShift;
-
- if (in2 != (out1 >> totShift))
- out1 = 0x7FFFFFFF ^ (in2 >> 31);
-
- *pOut++ = out1;
-
- /* Decrement the numSamples loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -187,5 +160,5 @@ arm_status arm_mat_scale_q31(
}
/**
- * @} end of MatrixScale group
+ @} end of MatrixScale group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_sub_f32.c b/DSP/Source/MatrixFunctions/arm_mat_sub_f32.c
index 7c0b54e..cb57647 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_sub_f32.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_sub_f32.c
@@ -3,13 +3,13 @@
* Title: arm_mat_sub_f32.c
* Description: Floating-point matrix subtraction
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,34 +29,36 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @defgroup MatrixSub Matrix Subtraction
- *
- * Subtract two matrices.
- * \image html MatrixSubtraction.gif "Subraction of two 3 x 3 matrices"
- *
- * The functions check to make sure that
- * pSrcA, pSrcB, and pDst have the same
- * number of rows and columns.
+ @defgroup MatrixSub Matrix Subtraction
+
+ Subtract two matrices.
+ \image html MatrixSubtraction.gif "Subraction of two 3 x 3 matrices"
+
+ The functions check to make sure that
+ pSrcA, pSrcB, and pDst have the same
+ number of rows and columns.
*/
/**
- * @addtogroup MatrixSub
- * @{
+ @addtogroup MatrixSub
+ @{
*/
/**
- * @brief Floating-point matrix subtraction
- * @param[in] *pSrcA points to the first input matrix structure
- * @param[in] *pSrcB points to the second input matrix structure
- * @param[out] *pDst points to output matrix structure
- * @return The function returns either
- * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ @brief Floating-point matrix subtraction.
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
+#if defined(ARM_MATH_NEON)
arm_status arm_mat_sub_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
@@ -66,11 +68,9 @@ arm_status arm_mat_sub_f32(
float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
-#if defined (ARM_MATH_DSP)
float32_t inA1, inA2, inB1, inB2, out1, out2; /* temporary variables */
-#endif // #if defined (ARM_MATH_DSP)
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
@@ -88,71 +88,28 @@ arm_status arm_mat_sub_f32(
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
+ float32x4_t vec1;
+ float32x4_t vec2;
+ float32x4_t res;
+
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- /* Loop Unrolling */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and then store the results in the destination buffer. */
/* Read values from source A */
- inA1 = pIn1[0];
+ vec1 = vld1q_f32(pIn1);
+ vec2 = vld1q_f32(pIn2);
+ res = vsubq_f32(vec1, vec2);
+ vst1q_f32(pOut, res);
- /* Read values from source B */
- inB1 = pIn2[0];
-
- /* Read values from source A */
- inA2 = pIn1[1];
-
- /* out = sourceA - sourceB */
- out1 = inA1 - inB1;
-
- /* Read values from source B */
- inB2 = pIn2[1];
-
- /* Read values from source A */
- inA1 = pIn1[2];
-
- /* out = sourceA - sourceB */
- out2 = inA2 - inB2;
-
- /* Read values from source B */
- inB1 = pIn2[2];
-
- /* Store result in destination */
- pOut[0] = out1;
- pOut[1] = out2;
-
- /* Read values from source A */
- inA2 = pIn1[3];
-
- /* Read values from source B */
- inB2 = pIn2[3];
-
- /* out = sourceA - sourceB */
- out1 = inA1 - inB1;
-
-
- /* out = sourceA - sourceB */
- out2 = inA2 - inB2;
-
- /* Store result in destination */
- pOut[2] = out1;
-
- /* Store result in destination */
- pOut[3] = out2;
-
-
- /* update pointers to process next sampels */
+ /* Update pointers to process next samples */
pIn1 += 4U;
pIn2 += 4U;
pOut += 4U;
@@ -165,14 +122,6 @@ arm_status arm_mat_sub_f32(
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
-#else
-
- /* Run the below code for Cortex-M0 */
-
- /* Initialize blkCnt with number of samples */
- blkCnt = numSamples;
-
-#endif /* #if defined (ARM_MATH_DSP) */
while (blkCnt > 0U)
{
@@ -191,7 +140,87 @@ arm_status arm_mat_sub_f32(
/* Return to application */
return (status);
}
+#else
+arm_status arm_mat_sub_f32(
+ const arm_matrix_instance_f32 * pSrcA,
+ const arm_matrix_instance_f32 * pSrcB,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix subtraction */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Total number of samples in input matrix */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+
+ /* Subtract and store result in destination buffer. */
+ *pOut++ = (*pInA++) - (*pInB++);
+ *pOut++ = (*pInA++) - (*pInB++);
+ *pOut++ = (*pInA++) - (*pInB++);
+ *pOut++ = (*pInA++) - (*pInB++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+
+ /* Subtract and store result in destination buffer. */
+ *pOut++ = (*pInA++) - (*pInB++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of MatrixSub group
+ @} end of MatrixSub group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_sub_q15.c b/DSP/Source/MatrixFunctions/arm_mat_sub_q15.c
index 28e659f..5d5e5d0 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_sub_q15.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_sub_q15.c
@@ -3,13 +3,13 @@
* Title: arm_mat_sub_q15.c
* Description: Q15 Matrix subtraction
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,112 +29,108 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @addtogroup MatrixSub
- * @{
+ @addtogroup MatrixSub
+ @{
*/
/**
- * @brief Q15 matrix subtraction.
- * @param[in] *pSrcA points to the first input matrix structure
- * @param[in] *pSrcB points to the second input matrix structure
- * @param[out] *pDst points to output matrix structure
- * @return The function returns either
- * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+ @brief Q15 matrix subtraction.
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
arm_status arm_mat_sub_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
- arm_matrix_instance_q15 * pDst)
+ arm_matrix_instance_q15 * pDst)
{
- q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
- q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
- q15_t *pOut = pDst->pData; /* output data matrix pointer */
- uint32_t numSamples; /* total number of elements in the matrix */
- uint32_t blkCnt; /* loop counters */
- arm_status status; /* status of matrix subtraction */
+ q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q15_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
-
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
- (pSrcA->numCols != pSrcB->numCols) ||
- (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
- /* Total number of samples in the input matrix */
+ /* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- /* Apply loop unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
- /* Subtract, Saturate and then store the results in the destination buffer. */
- *__SIMD32(pOut)++ = __QSUB16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
- *__SIMD32(pOut)++ = __QSUB16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
- /* Decrement the loop counter */
+ /* Subtract, Saturate and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+ write_q15x2_ia (&pOut, __QSUB16(read_q15x2_ia ((q15_t **) &pInA), read_q15x2_ia ((q15_t **) &pInB)));
+ write_q15x2_ia (&pOut, __QSUB16(read_q15x2_ia ((q15_t **) &pInA), read_q15x2_ia ((q15_t **) &pInB)));
+#else
+ *pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
+ *pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
+ *pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
+ *pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
- while (blkCnt > 0U)
- {
- /* C(m,n) = A(m,n) - B(m,n) */
- /* Subtract and then store the results in the destination buffer. */
- *pOut++ = (q15_t) __QSUB16(*pInA++, *pInB++);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
- /* Subtract and then store the results in the destination buffer. */
- *pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
- /* Decrement the loop counter */
+ /* Subtract and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+ *pOut++ = (q15_t) __QSUB16(*pInA++, *pInB++);
+#else
+ *pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
+#endif
+
+ /* Decrement loop counter */
blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -144,5 +140,5 @@ arm_status arm_mat_sub_q15(
}
/**
- * @} end of MatrixSub group
+ @} end of MatrixSub group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_sub_q31.c b/DSP/Source/MatrixFunctions/arm_mat_sub_q31.c
index 3bf5508..40d1bef 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_sub_q31.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_sub_q31.c
@@ -3,13 +3,13 @@
* Title: arm_mat_sub_q31.c
* Description: Q31 matrix subtraction
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,157 +29,100 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @addtogroup MatrixSub
- * @{
+ @addtogroup MatrixSub
+ @{
*/
/**
- * @brief Q31 matrix subtraction.
- * @param[in] *pSrcA points to the first input matrix structure
- * @param[in] *pSrcB points to the second input matrix structure
- * @param[out] *pDst points to output matrix structure
- * @return The function returns either
- * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
- *
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
- */
+ @brief Q31 matrix subtraction.
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
+ */
arm_status arm_mat_sub_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
- arm_matrix_instance_q31 * pDst)
+ arm_matrix_instance_q31 * pDst)
{
- q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
- q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
+ q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
- q31_t inA1, inB1; /* temporary variables */
-#if defined (ARM_MATH_DSP)
-
- q31_t inA2, inB2; /* temporary variables */
- q31_t out1, out2; /* temporary variables */
-
-#endif // #if defined (ARM_MATH_DSP)
-
- uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
-
#ifdef ARM_MATH_MATRIX_CHECK
- /* Check for matrix mismatch condition */
+
+ /* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
- (pSrcA->numCols != pSrcB->numCols) ||
- (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
{
- /* Total number of samples in the input matrix */
+ /* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- /* Loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
+
/* Subtract, saturate and then store the results in the destination buffer. */
- /* Read values from source A */
- inA1 = pIn1[0];
+ *pOut++ = __QSUB(*pInA++, *pInB++);
- /* Read values from source B */
- inB1 = pIn2[0];
+ *pOut++ = __QSUB(*pInA++, *pInB++);
- /* Read values from source A */
- inA2 = pIn1[1];
+ *pOut++ = __QSUB(*pInA++, *pInB++);
- /* Subtract and saturate */
- out1 = __QSUB(inA1, inB1);
+ *pOut++ = __QSUB(*pInA++, *pInB++);
- /* Read values from source B */
- inB2 = pIn2[1];
-
- /* Read values from source A */
- inA1 = pIn1[2];
-
- /* Subtract and saturate */
- out2 = __QSUB(inA2, inB2);
-
- /* Read values from source B */
- inB1 = pIn2[2];
-
- /* Store result in destination */
- pOut[0] = out1;
- pOut[1] = out2;
-
- /* Read values from source A */
- inA2 = pIn1[3];
-
- /* Read values from source B */
- inB2 = pIn2[3];
-
- /* Subtract and saturate */
- out1 = __QSUB(inA1, inB1);
-
- /* Subtract and saturate */
- out2 = __QSUB(inA2, inB2);
-
- /* Store result in destination */
- pOut[2] = out1;
- pOut[3] = out2;
-
- /* update pointers to process next samples */
- pIn1 += 4U;
- pIn2 += 4U;
- pOut += 4U;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
- /* Subtract, saturate and then store the results in the destination buffer. */
- inA1 = *pIn1++;
- inB1 = *pIn2++;
- inA1 = __QSUB(inA1, inB1);
+ /* Subtract, saturate and store result in destination buffer. */
+ *pOut++ = __QSUB(*pInA++, *pInB++);
- *pOut++ = inA1;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -192,5 +135,5 @@ arm_status arm_mat_sub_q31(
}
/**
- * @} end of MatrixSub group
+ @} end of MatrixSub group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_trans_f32.c b/DSP/Source/MatrixFunctions/arm_mat_trans_f32.c
index 84165ce..71748bf 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_trans_f32.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_trans_f32.c
@@ -3,8 +3,8 @@
* Title: arm_mat_trans_f32.c
* Description: Floating-point matrix transpose
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
@@ -26,33 +26,36 @@
* limitations under the License.
*/
-/**
- * @defgroup MatrixTrans Matrix Transpose
- *
- * Tranposes a matrix.
- * Transposing an M x N matrix flips it around the center diagonal and results in an N x M matrix.
- * \image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
- */
-
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @addtogroup MatrixTrans
- * @{
+ @defgroup MatrixTrans Matrix Transpose
+
+ Tranposes a matrix.
+
+ Transposing an M x N matrix flips it around the center diagonal and results in an N x M matrix.
+ \image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
*/
/**
- * @brief Floating-point matrix transpose.
- * @param[in] *pSrc points to the input matrix
- * @param[out] *pDst points to the output matrix
- * @return The function returns either ARM_MATH_SIZE_MISMATCH
- * or ARM_MATH_SUCCESS based on the outcome of size checking.
- */
+ @addtogroup MatrixTrans
+ @{
+ */
+/**
+ @brief Floating-point matrix transpose.
+ @param[in] pSrc points to input matrix
+ @param[out] pDst points to output matrix
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+
+#if defined(ARM_MATH_NEON)
arm_status arm_mat_trans_f32(
const arm_matrix_instance_f32 * pSrc,
@@ -64,17 +67,11 @@ arm_status arm_mat_trans_f32(
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nColumns = pSrc->numCols; /* number of columns */
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- uint16_t blkCnt, i = 0U, row = nRows; /* loop counters */
+ uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */
arm_status status; /* status of matrix transpose */
-
#ifdef ARM_MATH_MATRIX_CHECK
-
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
@@ -86,41 +83,44 @@ arm_status arm_mat_trans_f32(
{
/* Matrix transpose by exchanging the rows with columns */
- /* row loop */
- do
+ /* Row loop */
+ rowCnt = row >> 2;
+ while (rowCnt > 0U)
{
- /* Loop Unrolling */
+ float32x4_t row0V,row1V,row2V,row3V;
+ float32x4x2_t ra0,ra1,rb0,rb1;
+
blkCnt = nColumns >> 2;
/* The pointer px is set to starting address of the column being processed */
px = pOut + i;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
- while (blkCnt > 0U) /* column loop */
+ while (blkCnt > 0U) /* Column loop */
{
- /* Read and store the input element in the destination */
- *px = *pIn++;
+ row0V = vld1q_f32(pIn);
+ row1V = vld1q_f32(pIn + 1 * nColumns);
+ row2V = vld1q_f32(pIn + 2 * nColumns);
+ row3V = vld1q_f32(pIn + 3 * nColumns);
+ pIn += 4;
- /* Update the pointer px to point to the next row of the transposed matrix */
+ ra0 = vzipq_f32(row0V,row2V);
+ ra1 = vzipq_f32(row1V,row3V);
+
+ rb0 = vzipq_f32(ra0.val[0],ra1.val[0]);
+ rb1 = vzipq_f32(ra0.val[1],ra1.val[1]);
+
+ vst1q_f32(px,rb0.val[0]);
px += nRows;
- /* Read and store the input element in the destination */
- *px = *pIn++;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
+ vst1q_f32(px,rb0.val[1]);
px += nRows;
- /* Read and store the input element in the destination */
- *px = *pIn++;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
+ vst1q_f32(px,rb1.val[0]);
px += nRows;
- /* Read and store the input element in the destination */
- *px = *pIn++;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
+ vst1q_f32(px,rb1.val[1]);
px += nRows;
/* Decrement the column loop counter */
@@ -133,46 +133,34 @@ arm_status arm_mat_trans_f32(
while (blkCnt > 0U)
{
/* Read and store the input element in the destination */
- *px = *pIn++;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
- px += nRows;
+ *px++ = *pIn;
+ *px++ = *(pIn + 1 * nColumns);
+ *px++ = *(pIn + 2 * nColumns);
+ *px++ = *(pIn + 3 * nColumns);
+
+ px += (nRows - 4);
+ pIn++;
/* Decrement the column loop counter */
blkCnt--;
}
-#else
+ i += 4;
+ pIn += 3 * nColumns;
- /* Run the below code for Cortex-M0 */
+ /* Decrement the row loop counter */
+ rowCnt--;
- uint16_t col, i = 0U, row = nRows; /* loop counters */
- arm_status status; /* status of matrix transpose */
+ } /* Row loop end */
-
-#ifdef ARM_MATH_MATRIX_CHECK
-
- /* Check for matrix mismatch condition */
- if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
- {
- /* Set status as ARM_MATH_SIZE_MISMATCH */
- status = ARM_MATH_SIZE_MISMATCH;
- }
- else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
- {
- /* Matrix transpose by exchanging the rows with columns */
- /* row loop */
- do
+ rowCnt = row & 3;
+ while (rowCnt > 0U)
{
+ blkCnt = nColumns ;
/* The pointer px is set to starting address of the column being processed */
px = pOut + i;
- /* Initialize column loop counter */
- col = nColumns;
-
- while (col > 0U)
+ while (blkCnt > 0U)
{
/* Read and store the input element in the destination */
*px = *pIn++;
@@ -181,17 +169,11 @@ arm_status arm_mat_trans_f32(
px += nRows;
/* Decrement the column loop counter */
- col--;
+ blkCnt--;
}
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
i++;
-
- /* Decrement the row loop counter */
- row--;
-
- } while (row > 0U); /* row loop end */
+ rowCnt -- ;
+ }
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
@@ -200,6 +182,102 @@ arm_status arm_mat_trans_f32(
/* Return to application */
return (status);
}
+#else
+arm_status arm_mat_trans_f32(
+ const arm_matrix_instance_f32 * pSrc,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+ float32_t *px; /* Temporary output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of rows */
+ uint16_t nCols = pSrc->numCols; /* number of columns */
+ uint32_t col, row = nRows, i = 0U; /* Loop counters */
+ arm_status status; /* status of matrix transpose */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) ||
+ (pSrc->numCols != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+ /* Pointer px is set to starting address of column being processed */
+ px = pOut + i;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ col = nCols >> 2U;
+
+ while (col > 0U) /* column loop */
+ {
+ /* Read and store input element in destination */
+ *px = *pIn++;
+ /* Update pointer px to point to next row of transposed matrix */
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ col = nCols % 0x4U;
+
+#else
+
+ /* Initialize col with number of samples */
+ col = nCols;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (col > 0U)
+ {
+ /* Read and store input element in destination */
+ *px = *pIn++;
+
+ /* Update pointer px to point to next row of transposed matrix */
+ px += nRows;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ i++;
+
+ /* Decrement row loop counter */
+ row--;
+
+ } while (row > 0U); /* row loop end */
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
* @} end of MatrixTrans group
diff --git a/DSP/Source/MatrixFunctions/arm_mat_trans_q15.c b/DSP/Source/MatrixFunctions/arm_mat_trans_q15.c
index 6ba0904..707e0d6 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_trans_q15.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_trans_q15.c
@@ -3,13 +3,13 @@
* Title: arm_mat_trans_q15.c
* Description: Q15 matrix transpose
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,244 +29,154 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @addtogroup MatrixTrans
- * @{
+ @addtogroup MatrixTrans
+ @{
*/
-/*
- * @brief Q15 matrix transpose.
- * @param[in] *pSrc points to the input matrix
- * @param[out] *pDst points to the output matrix
- * @return The function returns either ARM_MATH_SIZE_MISMATCH
- * or ARM_MATH_SUCCESS based on the outcome of size checking.
+/**
+ @brief Q15 matrix transpose.
+ @param[in] pSrc points to input matrix
+ @param[out] pDst points to output matrix
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
arm_status arm_mat_trans_q15(
const arm_matrix_instance_q15 * pSrc,
- arm_matrix_instance_q15 * pDst)
+ arm_matrix_instance_q15 * pDst)
{
- q15_t *pSrcA = pSrc->pData; /* input data matrix pointer */
- q15_t *pOut = pDst->pData; /* output data matrix pointer */
- uint16_t nRows = pSrc->numRows; /* number of nRows */
- uint16_t nColumns = pSrc->numCols; /* number of nColumns */
- uint16_t col, row = nRows, i = 0U; /* row and column loop counters */
- arm_status status; /* status of matrix transpose */
+ q15_t *pIn = pSrc->pData; /* input data matrix pointer */
+ q15_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of rows */
+ uint16_t nCols = pSrc->numCols; /* number of columns */
+ uint32_t col, row = nRows, i = 0U; /* Loop counters */
+ arm_status status; /* status of matrix transpose */
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-#ifndef UNALIGNED_SUPPORT_DISABLE
-
- q31_t in; /* variable to hold temporary output */
-
-#else
-
- q15_t in;
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t in; /* variable to hold temporary output */
+#endif
#ifdef ARM_MATH_MATRIX_CHECK
-
/* Check for matrix mismatch condition */
- if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ if ((pSrc->numRows != pDst->numCols) ||
+ (pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
- /* row loop */
+ /* row loop */
do
{
-
- /* Apply loop unrolling and exchange the columns with row elements */
- col = nColumns >> 2U;
-
- /* The pointer pOut is set to starting address of the column being processed */
+ /* Pointer pOut is set to starting address of column being processed */
pOut = pDst->pData + i;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while (col > 0U)
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ col = nCols >> 2U;
+
+ while (col > 0U) /* column loop */
{
-#ifndef UNALIGNED_SUPPORT_DISABLE
+ /* Read two elements from row */
+ in = read_q15x2_ia ((q15_t **) &pIn);
- /* Read two elements from the row */
- in = *__SIMD32(pSrcA)++;
-
- /* Unpack and store one element in the destination */
+ /* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
-
*pOut = (q15_t) in;
-
#else
+ *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ /* Update pointer pOut to point to next row of transposed matrix */
+ pOut += nRows;
+
+ /* Unpack and store second element in destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+ *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+#else
+ *pOut = (q15_t) in;
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update pointer pOut to point to next row of transposed matrix */
+ pOut += nRows;
+
+ /* Read two elements from row */
+ in = read_q15x2_ia ((q15_t **) &pIn);
+
+ /* Unpack and store one element in destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+ *pOut = (q15_t) in;
+#else
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
- /* Update the pointer pOut to point to the next row of the transposed matrix */
+ /* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
- /* Unpack and store the second element in the destination */
-
+ /* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
-
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-
#else
-
*pOut = (q15_t) in;
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Update the pointer pOut to point to the next row of the transposed matrix */
+ /* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
- /* Read two elements from the row */
-#ifndef ARM_MATH_BIG_ENDIAN
-
- in = *__SIMD32(pSrcA)++;
-
-#else
-
- in = *__SIMD32(pSrcA)++;
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Unpack and store one element in the destination */
-#ifndef ARM_MATH_BIG_ENDIAN
-
- *pOut = (q15_t) in;
-
-#else
-
- *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Update the pointer pOut to point to the next row of the transposed matrix */
- pOut += nRows;
-
- /* Unpack and store the second element in the destination */
-#ifndef ARM_MATH_BIG_ENDIAN
-
- *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-
-#else
-
- *pOut = (q15_t) in;
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-#else
- /* Read one element from the row */
- in = *pSrcA++;
-
- /* Store one element in the destination */
- *pOut = in;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
- pOut += nRows;
-
- /* Read one element from the row */
- in = *pSrcA++;
-
- /* Store one element in the destination */
- *pOut = in;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
- pOut += nRows;
-
- /* Read one element from the row */
- in = *pSrcA++;
-
- /* Store one element in the destination */
- *pOut = in;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
- pOut += nRows;
-
- /* Read one element from the row */
- in = *pSrcA++;
-
- /* Store one element in the destination */
- *pOut = in;
-
-#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
-
- /* Update the pointer pOut to point to the next row of the transposed matrix */
- pOut += nRows;
-
- /* Decrement the column loop counter */
+ /* Decrement column loop counter */
col--;
}
- /* Perform matrix transpose for last 3 samples here. */
- col = nColumns % 0x4U;
+ /* Loop unrolling: Compute remaining outputs */
+ col = nCols % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize col with number of samples */
+ col = nCols;
-#ifdef ARM_MATH_MATRIX_CHECK
-
- /* Check for matrix mismatch condition */
- if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
- {
- /* Set status as ARM_MATH_SIZE_MISMATCH */
- status = ARM_MATH_SIZE_MISMATCH;
- }
- else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
- {
- /* Matrix transpose by exchanging the rows with columns */
- /* row loop */
- do
- {
- /* The pointer pOut is set to starting address of the column being processed */
- pOut = pDst->pData + i;
-
- /* Initialize column loop counter */
- col = nColumns;
-
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
- /* Read and store the input element in the destination */
- *pOut = *pSrcA++;
+ /* Read and store input element in destination */
+ *pOut = *pIn++;
- /* Update the pointer pOut to point to the next row of the transposed matrix */
+ /* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
- /* Decrement the column loop counter */
+ /* Decrement column loop counter */
col--;
}
i++;
- /* Decrement the row loop counter */
+ /* Decrement row loop counter */
row--;
- } while (row > 0U);
+ } while (row > 0U); /* row loop end */
- /* set status as ARM_MATH_SUCCESS */
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
+
/* Return to application */
return (status);
}
/**
- * @} end of MatrixTrans group
+ @} end of MatrixTrans group
*/
diff --git a/DSP/Source/MatrixFunctions/arm_mat_trans_q31.c b/DSP/Source/MatrixFunctions/arm_mat_trans_q31.c
index 6f698e0..5d0b5e2 100644
--- a/DSP/Source/MatrixFunctions/arm_mat_trans_q31.c
+++ b/DSP/Source/MatrixFunctions/arm_mat_trans_q31.c
@@ -3,13 +3,13 @@
* Title: arm_mat_trans_q31.c
* Description: Q31 matrix transpose
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,163 +29,111 @@
#include "arm_math.h"
/**
- * @ingroup groupMatrix
+ @ingroup groupMatrix
*/
/**
- * @addtogroup MatrixTrans
- * @{
+ @addtogroup MatrixTrans
+ @{
*/
-/*
- * @brief Q31 matrix transpose.
- * @param[in] *pSrc points to the input matrix
- * @param[out] *pDst points to the output matrix
- * @return The function returns either ARM_MATH_SIZE_MISMATCH
- * or ARM_MATH_SUCCESS based on the outcome of size checking.
+/**
+ @brief Q31 matrix transpose.
+ @param[in] pSrc points to input matrix
+ @param[out] pDst points to output matrix
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
arm_status arm_mat_trans_q31(
const arm_matrix_instance_q31 * pSrc,
- arm_matrix_instance_q31 * pDst)
+ arm_matrix_instance_q31 * pDst)
{
- q31_t *pIn = pSrc->pData; /* input data matrix pointer */
- q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ q31_t *pIn = pSrc->pData; /* input data matrix pointer */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
- uint16_t nRows = pSrc->numRows; /* number of nRows */
- uint16_t nColumns = pSrc->numCols; /* number of nColumns */
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- uint16_t blkCnt, i = 0U, row = nRows; /* loop counters */
+ uint16_t nRows = pSrc->numRows; /* number of rows */
+ uint16_t nCols = pSrc->numCols; /* number of columns */
+ uint32_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
-
#ifdef ARM_MATH_MATRIX_CHECK
-
/* Check for matrix mismatch condition */
- if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ if ((pSrc->numRows != pDst->numCols) ||
+ (pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
- /* row loop */
+ /* row loop */
do
{
- /* Apply loop unrolling and exchange the columns with row elements */
- blkCnt = nColumns >> 2U;
-
- /* The pointer px is set to starting address of the column being processed */
+ /* Pointer px is set to starting address of column being processed */
px = pOut + i;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while (blkCnt > 0U)
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ col = nCols >> 2U;
+
+ while (col > 0U) /* column loop */
{
- /* Read and store the input element in the destination */
+ /* Read and store input element in destination */
*px = *pIn++;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
+ /* Update pointer px to point to next row of transposed matrix */
px += nRows;
- /* Read and store the input element in the destination */
*px = *pIn++;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
px += nRows;
- /* Read and store the input element in the destination */
*px = *pIn++;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
px += nRows;
- /* Read and store the input element in the destination */
*px = *pIn++;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
px += nRows;
- /* Decrement the column loop counter */
- blkCnt--;
- }
-
- /* Perform matrix transpose for last 3 samples here. */
- blkCnt = nColumns % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* Read and store the input element in the destination */
- *px = *pIn++;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
- px += nRows;
-
- /* Decrement the column loop counter */
- blkCnt--;
- }
-
-#else
-
- /* Run the below code for Cortex-M0 */
-
- uint16_t col, i = 0U, row = nRows; /* loop counters */
- arm_status status; /* status of matrix transpose */
-
-
-#ifdef ARM_MATH_MATRIX_CHECK
-
- /* Check for matrix mismatch condition */
- if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
- {
- /* Set status as ARM_MATH_SIZE_MISMATCH */
- status = ARM_MATH_SIZE_MISMATCH;
- }
- else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
- {
- /* Matrix transpose by exchanging the rows with columns */
- /* row loop */
- do
- {
- /* The pointer px is set to starting address of the column being processed */
- px = pOut + i;
-
- /* Initialize column loop counter */
- col = nColumns;
-
- while (col > 0U)
- {
- /* Read and store the input element in the destination */
- *px = *pIn++;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
- px += nRows;
-
- /* Decrement the column loop counter */
+ /* Decrement column loop counter */
col--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
+ /* Loop unrolling: Compute remaining outputs */
+ col = nCols % 0x4U;
+
+#else
+
+ /* Initialize col with number of samples */
+ col = nCols;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (col > 0U)
+ {
+ /* Read and store input element in destination */
+ *px = *pIn++;
+
+ /* Update pointer px to point to next row of transposed matrix */
+ px += nRows;
+
+ /* Decrement column loop counter */
+ col--;
+ }
i++;
- /* Decrement the row loop counter */
+ /* Decrement row loop counter */
row--;
- }
- while (row > 0U); /* row loop end */
+ } while (row > 0U); /* row loop end */
- /* set status as ARM_MATH_SUCCESS */
+ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -194,5 +142,5 @@ arm_status arm_mat_trans_q31(
}
/**
- * @} end of MatrixTrans group
+ @} end of MatrixTrans group
*/
diff --git a/DSP/Source/StatisticsFunctions/CMakeLists.txt b/DSP/Source/StatisticsFunctions/CMakeLists.txt
new file mode 100644
index 0000000..3f23355
--- /dev/null
+++ b/DSP/Source/StatisticsFunctions/CMakeLists.txt
@@ -0,0 +1,16 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPStatistics)
+
+
+file(GLOB SRC "./*_*.c")
+
+add_library(CMSISDSPStatistics STATIC ${SRC})
+
+configdsp(CMSISDSPStatistics ..)
+
+### Includes
+target_include_directories(CMSISDSPStatistics PUBLIC "${DSP}/../../Include")
+
+
+
diff --git a/DSP/Source/StatisticsFunctions/StatisticsFunctions.c b/DSP/Source/StatisticsFunctions/StatisticsFunctions.c
new file mode 100644
index 0000000..4f86aa4
--- /dev/null
+++ b/DSP/Source/StatisticsFunctions/StatisticsFunctions.c
@@ -0,0 +1,53 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: StatisticsFunctions.c
+ * Description: Combination of all statistics function source files.
+ *
+ * $Date: 18. March 2019
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_max_f32.c"
+#include "arm_max_q15.c"
+#include "arm_max_q31.c"
+#include "arm_max_q7.c"
+#include "arm_mean_f32.c"
+#include "arm_mean_q15.c"
+#include "arm_mean_q31.c"
+#include "arm_mean_q7.c"
+#include "arm_min_f32.c"
+#include "arm_min_q15.c"
+#include "arm_min_q31.c"
+#include "arm_min_q7.c"
+#include "arm_power_f32.c"
+#include "arm_power_q15.c"
+#include "arm_power_q31.c"
+#include "arm_power_q7.c"
+#include "arm_rms_f32.c"
+#include "arm_rms_q15.c"
+#include "arm_rms_q31.c"
+#include "arm_std_f32.c"
+#include "arm_std_q15.c"
+#include "arm_std_q31.c"
+#include "arm_var_f32.c"
+#include "arm_var_q15.c"
+#include "arm_var_q31.c"
diff --git a/DSP/Source/StatisticsFunctions/arm_max_f32.c b/DSP/Source/StatisticsFunctions/arm_max_f32.c
index a0a68ac..cd54e2a 100644
--- a/DSP/Source/StatisticsFunctions/arm_max_f32.c
+++ b/DSP/Source/StatisticsFunctions/arm_max_f32.c
@@ -3,13 +3,13 @@
* Title: arm_max_f32.c
* Description: Maximum value of a floating-point vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -27,136 +27,237 @@
*/
#include "arm_math.h"
+#if defined(ARM_MATH_NEON)
+#include
+#endif
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @defgroup Max Maximum
- *
- * Computes the maximum value of an array of data.
- * The function returns both the maximum value and its position within the array.
- * There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ @defgroup Max Maximum
+
+ Computes the maximum value of an array of data.
+ The function returns both the maximum value and its position within the array.
+ There are separate functions for floating-point, Q31, Q15, and Q7 data types.
*/
/**
- * @addtogroup Max
- * @{
+ @addtogroup Max
+ @{
*/
-
/**
- * @brief Maximum value of a floating-point vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult maximum value returned here
- * @param[out] *pIndex index of maximum value returned here
- * @return none.
+ @brief Maximum value of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
*/
-
+#if defined(ARM_MATH_NEON)
void arm_max_f32(
- float32_t * pSrc,
+ const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex)
{
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
float32_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex, count; /* loop counter */
+ float32x4_t outV, srcV;
+ float32x2_t outV2;
+
+ uint32x4_t idxV;
+ uint32x4_t maxIdx={ULONG_MAX,ULONG_MAX,ULONG_MAX,ULONG_MAX};
+ uint32x4_t index={4,5,6,7};
+ uint32x4_t delta={4,4,4,4};
+ uint32x4_t countV={0,1,2,3};
+ uint32x2_t countV2;
+
/* Initialise the count value. */
count = 0U;
+
/* Initialise the index value to zero. */
outIndex = 0U;
+
+ /* Load first input value that act as reference value for comparison */
+ if (blockSize <= 3)
+ {
+ out = *pSrc++;
+
+ blkCnt = blockSize - 1;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ /* compare for the maximum value */
+ if (out < maxVal1)
+ {
+ /* Update the maximum value and it's index */
+ out = maxVal1;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ outV = vld1q_f32(pSrc);
+ pSrc += 4;
+
+ /* Compute 4 outputs at a time */
+ blkCnt = (blockSize - 4 ) >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ srcV = vld1q_f32(pSrc);
+ pSrc += 4;
+
+ idxV = vcgtq_f32(srcV, outV);
+ outV = vbslq_f32(idxV, srcV, outV );
+ countV = vbslq_u32(idxV, index,countV );
+
+ index = vaddq_u32(index,delta);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ outV2 = vpmax_f32(vget_low_f32(outV),vget_high_f32(outV));
+ outV2 = vpmax_f32(outV2,outV2);
+ out = outV2[0];
+
+ idxV = vceqq_f32(outV, vdupq_n_f32(out));
+ countV = vbslq_u32(idxV, countV,maxIdx);
+
+ countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV));
+ countV2 = vpmin_u32(countV2,countV2);
+ outIndex = countV2[0];
+
+ /* if (blockSize - 1U) is not multiple of 4 */
+ blkCnt = (blockSize - 4 ) % 4U;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ /* compare for the maximum value */
+ if (out < maxVal1)
+ {
+ /* Update the maximum value and it's index */
+ out = maxVal1;
+ outIndex = blockSize - blkCnt ;
+ }
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+
+ }
+
+ /* Store the maximum value and it's index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#else
+void arm_max_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult,
+ uint32_t * pIndex)
+{
+ float32_t maxVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* index of maximum value */
+#endif
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
- /* Loop unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise index of maximum value. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
- maxVal2 = *pSrc++;
+ /* Initialize maxVal to next consecutive values one by one */
+ maxVal = *pSrc++;
/* compare for the maximum value */
- if (out < maxVal1)
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal1;
- outIndex = count + 1U;
+ /* Update the maximum value and it's index */
+ out = maxVal;
+ outIndex = index + 1U;
}
- /* compare for the maximum value */
- if (out < maxVal2)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal2;
- outIndex = count + 2U;
+ out = maxVal;
+ outIndex = index + 2U;
}
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
- maxVal2 = *pSrc++;
-
- /* compare for the maximum value */
- if (out < maxVal1)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal1;
- outIndex = count + 3U;
+ out = maxVal;
+ outIndex = index + 3U;
}
- /* compare for the maximum value */
- if (out < maxVal2)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal2;
- outIndex = count + 4U;
+ out = maxVal;
+ outIndex = index + 4U;
}
- count += 4U;
+ index += 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* if (blockSize - 1U) is not multiple of 4 */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
- /* Run the below code for Cortex-M0 */
-
- float32_t maxVal1, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex; /* loop counter */
-
- /* Initialise the index value to zero. */
- outIndex = 0U;
- /* Load first input value that act as reference value for comparision */
- out = *pSrc++;
+ /* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
+ maxVal = *pSrc++;
/* compare for the maximum value */
- if (out < maxVal1)
+ if (out < maxVal)
{
/* Update the maximum value and it's index */
- out = maxVal1;
+ out = maxVal;
outIndex = blockSize - blkCnt;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -164,7 +265,7 @@ void arm_max_f32(
*pResult = out;
*pIndex = outIndex;
}
-
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of Max group
+ @} end of Max group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_max_q15.c b/DSP/Source/StatisticsFunctions/arm_max_q15.c
index 67d5e34..329b0c8 100644
--- a/DSP/Source/StatisticsFunctions/arm_max_q15.c
+++ b/DSP/Source/StatisticsFunctions/arm_max_q15.c
@@ -3,13 +3,13 @@
* Title: arm_max_q15.c
* Description: Maximum value of a Q15 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,126 +29,112 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup Max
- * @{
+ @addtogroup Max
+ @{
*/
-
/**
- * @brief Maximum value of a Q15 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult maximum value returned here
- * @param[out] *pIndex index of maximum value returned here
- * @return none.
+ @brief Maximum value of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
*/
void arm_max_q15(
- q15_t * pSrc,
- uint32_t blockSize,
- q15_t * pResult,
- uint32_t * pIndex)
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult,
+ uint32_t * pIndex)
{
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q15_t maxVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
- q15_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex, count; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* index of maximum value */
+#endif
- /* Initialise the count value. */
- count = 0U;
- /* Initialise the index value to zero. */
+ /* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
- /* Loop unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise index of maximum value. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
- maxVal2 = *pSrc++;
+ /* Initialize maxVal to next consecutive values one by one */
+ maxVal = *pSrc++;
/* compare for the maximum value */
- if (out < maxVal1)
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal1;
- outIndex = count + 1U;
+ /* Update the maximum value and it's index */
+ out = maxVal;
+ outIndex = index + 1U;
}
- /* compare for the maximum value */
- if (out < maxVal2)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal2;
- outIndex = count + 2U;
+ out = maxVal;
+ outIndex = index + 2U;
}
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
- maxVal2 = *pSrc++;
-
- /* compare for the maximum value */
- if (out < maxVal1)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal1;
- outIndex = count + 3U;
+ out = maxVal;
+ outIndex = index + 3U;
}
- /* compare for the maximum value */
- if (out < maxVal2)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal2;
- outIndex = count + 4U;
+ out = maxVal;
+ outIndex = index + 4U;
}
- count += 4U;
+ index += 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* if (blockSize - 1U) is not multiple of 4 */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
- /* Run the below code for Cortex-M0 */
-
- q15_t maxVal1, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex; /* loop counter */
-
- /* Initialise the index value to zero. */
- outIndex = 0U;
- /* Load first input value that act as reference value for comparision */
- out = *pSrc++;
+ /* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
+ maxVal = *pSrc++;
/* compare for the maximum value */
- if (out < maxVal1)
+ if (out < maxVal)
{
/* Update the maximum value and it's index */
- out = maxVal1;
+ out = maxVal;
outIndex = blockSize - blkCnt;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -158,5 +144,5 @@ void arm_max_q15(
}
/**
- * @} end of Max group
+ @} end of Max group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_max_q31.c b/DSP/Source/StatisticsFunctions/arm_max_q31.c
index 5d34bbd..99de13e 100644
--- a/DSP/Source/StatisticsFunctions/arm_max_q31.c
+++ b/DSP/Source/StatisticsFunctions/arm_max_q31.c
@@ -3,13 +3,13 @@
* Title: arm_max_q31.c
* Description: Maximum value of a Q31 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,126 +29,112 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup Max
- * @{
+ @addtogroup Max
+ @{
*/
-
/**
- * @brief Maximum value of a Q31 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult maximum value returned here
- * @param[out] *pIndex index of maximum value returned here
- * @return none.
+ @brief Maximum value of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
*/
void arm_max_q31(
- q31_t * pSrc,
- uint32_t blockSize,
- q31_t * pResult,
- uint32_t * pIndex)
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult,
+ uint32_t * pIndex)
{
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q31_t maxVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
- q31_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex, count; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* index of maximum value */
+#endif
- /* Initialise the count value. */
- count = 0U;
- /* Initialise the index value to zero. */
+ /* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
- /* Loop unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise index of maximum value. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
- maxVal2 = *pSrc++;
+ /* Initialize maxVal to next consecutive values one by one */
+ maxVal = *pSrc++;
/* compare for the maximum value */
- if (out < maxVal1)
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal1;
- outIndex = count + 1U;
+ /* Update the maximum value and it's index */
+ out = maxVal;
+ outIndex = index + 1U;
}
- /* compare for the maximum value */
- if (out < maxVal2)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal2;
- outIndex = count + 2U;
+ out = maxVal;
+ outIndex = index + 2U;
}
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
- maxVal2 = *pSrc++;
-
- /* compare for the maximum value */
- if (out < maxVal1)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal1;
- outIndex = count + 3U;
+ out = maxVal;
+ outIndex = index + 3U;
}
- /* compare for the maximum value */
- if (out < maxVal2)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal2;
- outIndex = count + 4U;
+ out = maxVal;
+ outIndex = index + 4U;
}
- count += 4U;
+ index += 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* if (blockSize - 1U) is not multiple of 4 */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
- /* Run the below code for Cortex-M0 */
-
- q31_t maxVal1, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex; /* loop counter */
-
- /* Initialise the index value to zero. */
- outIndex = 0U;
- /* Load first input value that act as reference value for comparision */
- out = *pSrc++;
+ /* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
+ maxVal = *pSrc++;
/* compare for the maximum value */
- if (out < maxVal1)
+ if (out < maxVal)
{
/* Update the maximum value and it's index */
- out = maxVal1;
+ out = maxVal;
outIndex = blockSize - blkCnt;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -158,5 +144,5 @@ void arm_max_q31(
}
/**
- * @} end of Max group
+ @} end of Max group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_max_q7.c b/DSP/Source/StatisticsFunctions/arm_max_q7.c
index 72f6e5e..9c8b6d3 100644
--- a/DSP/Source/StatisticsFunctions/arm_max_q7.c
+++ b/DSP/Source/StatisticsFunctions/arm_max_q7.c
@@ -3,13 +3,13 @@
* Title: arm_max_q7.c
* Description: Maximum value of a Q7 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,126 +29,112 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup Max
- * @{
+ @addtogroup Max
+ @{
*/
-
/**
- * @brief Maximum value of a Q7 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult maximum value returned here
- * @param[out] *pIndex index of maximum value returned here
- * @return none.
+ @brief Maximum value of a Q7 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
*/
void arm_max_q7(
- q7_t * pSrc,
- uint32_t blockSize,
- q7_t * pResult,
- uint32_t * pIndex)
+ const q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult,
+ uint32_t * pIndex)
{
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q7_t maxVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
- q7_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex, count; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* index of maximum value */
+#endif
- /* Initialise the count value. */
- count = 0U;
- /* Initialise the index value to zero. */
+ /* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
- /* Loop unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise index of maximum value. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
- maxVal2 = *pSrc++;
+ /* Initialize maxVal to next consecutive values one by one */
+ maxVal = *pSrc++;
/* compare for the maximum value */
- if (out < maxVal1)
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal1;
- outIndex = count + 1U;
+ /* Update the maximum value and it's index */
+ out = maxVal;
+ outIndex = index + 1U;
}
- /* compare for the maximum value */
- if (out < maxVal2)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal2;
- outIndex = count + 2U;
+ out = maxVal;
+ outIndex = index + 2U;
}
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
- maxVal2 = *pSrc++;
-
- /* compare for the maximum value */
- if (out < maxVal1)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal1;
- outIndex = count + 3U;
+ out = maxVal;
+ outIndex = index + 3U;
}
- /* compare for the maximum value */
- if (out < maxVal2)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal2;
- outIndex = count + 4U;
+ out = maxVal;
+ outIndex = index + 4U;
}
- count += 4U;
+ index += 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* if (blockSize - 1U) is not multiple of 4 */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
- /* Run the below code for Cortex-M0 */
-
- q7_t maxVal1, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex; /* loop counter */
-
- /* Initialise the index value to zero. */
- outIndex = 0U;
- /* Load first input value that act as reference value for comparision */
- out = *pSrc++;
+ /* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
+ maxVal = *pSrc++;
/* compare for the maximum value */
- if (out < maxVal1)
+ if (out < maxVal)
{
/* Update the maximum value and it's index */
- out = maxVal1;
+ out = maxVal;
outIndex = blockSize - blkCnt;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -158,5 +144,5 @@ void arm_max_q7(
}
/**
- * @} end of Max group
+ @} end of Max group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_mean_f32.c b/DSP/Source/StatisticsFunctions/arm_mean_f32.c
index 85a3b16..63d9652 100644
--- a/DSP/Source/StatisticsFunctions/arm_mean_f32.c
+++ b/DSP/Source/StatisticsFunctions/arm_mean_f32.c
@@ -3,13 +3,13 @@
* Title: arm_mean_f32.c
* Description: Mean value of a floating-point vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,82 +29,70 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @defgroup mean Mean
- *
- * Calculates the mean of the input vector. Mean is defined as the average of the elements in the vector.
- * The underlying algorithm is used:
- *
- *
- *
- * There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ @defgroup mean Mean
+
+ Calculates the mean of the input vector. Mean is defined as the average of the elements in the vector.
+ The underlying algorithm is used:
+
+
+
+ There are separate functions for floating-point, Q31, Q15, and Q7 data types.
*/
/**
- * @addtogroup mean
- * @{
+ @addtogroup mean
+ @{
*/
-
/**
- * @brief Mean value of a floating-point vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult mean value returned here
- * @return none.
+ @brief Mean value of a floating-point vector.
+ @param[in] pSrc points to the input vector.
+ @param[in] blockSize number of samples in input vector.
+ @param[out] pResult mean value returned here.
+ @return none
*/
-
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
void arm_mean_f32(
- float32_t * pSrc,
+ const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
float32_t sum = 0.0f; /* Temporary result storage */
- uint32_t blkCnt; /* loop counter */
+ float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
+ float32x2_t sumV2;
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ uint32_t blkCnt; /* Loop counter */
float32_t in1, in2, in3, in4;
+ float32x4_t inV;
- /*loop Unrolling */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- in1 = *pSrc++;
- in2 = *pSrc++;
- in3 = *pSrc++;
- in4 = *pSrc++;
-
- sum += in1;
- sum += in2;
- sum += in3;
- sum += in4;
-
+ inV = vld1q_f32(pSrc);
+ sumV = vaddq_f32(sumV, inV);
+
+ pSrc += 4;
/* Decrement the loop counter */
blkCnt--;
}
+ sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
+ sum = sumV2[0] + sumV2[1];
+
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
-
-#else
- /* Run the below code for Cortex-M0 */
-
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
-
-#endif /* #if defined (ARM_MATH_DSP) */
+ blkCnt = blockSize & 3;
while (blkCnt > 0U)
{
@@ -119,7 +107,60 @@ void arm_mean_f32(
/* Store the result to the destination */
*pResult = sum / (float32_t) blockSize;
}
+#else
+void arm_mean_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ float32_t sum = 0.0f; /* Temporary result storage */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ sum += *pSrc++;
+
+ sum += *pSrc++;
+
+ sum += *pSrc++;
+
+ sum += *pSrc++;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ sum += *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+ /* Store result to destination */
+ *pResult = (sum / blockSize);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of mean group
+ @} end of mean group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_mean_q15.c b/DSP/Source/StatisticsFunctions/arm_mean_q15.c
index 7bf55c2..463aa84 100644
--- a/DSP/Source/StatisticsFunctions/arm_mean_q15.c
+++ b/DSP/Source/StatisticsFunctions/arm_mean_q15.c
@@ -3,13 +3,13 @@
* Title: arm_mean_q15.c
* Description: Mean value of a Q15 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,59 +29,55 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup mean
- * @{
+ @addtogroup mean
+ @{
*/
-
/**
- * @brief Mean value of a Q15 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult mean value returned here
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- * \par
- * The function is implemented using a 32-bit internal accumulator.
- * The input is represented in 1.15 format and is accumulated in a 32-bit
- * accumulator in 17.15 format.
- * There is no risk of internal overflow with this approach, and the
- * full precision of intermediate result is preserved.
- * Finally, the accumulator is saturated and truncated to yield a result of 1.15 format.
- *
+ @brief Mean value of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult mean value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ The input is represented in 1.15 format and is accumulated in a 32-bit
+ accumulator in 17.15 format.
+ There is no risk of internal overflow with this approach, and the
+ full precision of intermediate result is preserved.
+ Finally, the accumulator is truncated to yield a result of 1.15 format.
*/
void arm_mean_q15(
- q15_t * pSrc,
- uint32_t blockSize,
- q15_t * pResult)
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
{
- q31_t sum = 0; /* Temporary result storage */
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t sum = 0; /* Temporary result storage */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t in;
+#endif
- q31_t in;
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- in = *__SIMD32(pSrc)++;
+ in = read_q15x2_ia ((q15_t **) &pSrc);
sum += ((in << 16U) >> 16U);
sum += (in >> 16U);
- in = *__SIMD32(pSrc)++;
+
+ in = read_q15x2_ia ((q15_t **) &pSrc);
sum += ((in << 16U) >> 16U);
sum += (in >> 16U);
@@ -89,32 +85,30 @@ void arm_mean_q15(
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
- /* Store the result to the destination */
- *pResult = (q15_t) (sum / (q31_t)blockSize);
+ /* Store result to destination */
+ *pResult = (q15_t) (sum / (int32_t) blockSize);
}
/**
- * @} end of mean group
+ @} end of mean group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_mean_q31.c b/DSP/Source/StatisticsFunctions/arm_mean_q31.c
index ea83ced..4b0ed6e 100644
--- a/DSP/Source/StatisticsFunctions/arm_mean_q31.c
+++ b/DSP/Source/StatisticsFunctions/arm_mean_q31.c
@@ -3,13 +3,13 @@
* Title: arm_mean_q31.c
* Description: Mean value of a Q31 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,95 +29,82 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup mean
- * @{
+ @addtogroup mean
+ @{
*/
-
/**
- * @brief Mean value of a Q31 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult mean value returned here
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- *\par
- * The function is implemented using a 64-bit internal accumulator.
- * The input is represented in 1.31 format and is accumulated in a 64-bit
- * accumulator in 33.31 format.
- * There is no risk of internal overflow with this approach, and the
- * full precision of intermediate result is preserved.
- * Finally, the accumulator is truncated to yield a result of 1.31 format.
- *
+ @brief Mean value of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult mean value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.31 format and is accumulated in a 64-bit
+ accumulator in 33.31 format.
+ There is no risk of internal overflow with this approach, and the
+ full precision of intermediate result is preserved.
+ Finally, the accumulator is truncated to yield a result of 1.31 format.
*/
void arm_mean_q31(
- q31_t * pSrc,
- uint32_t blockSize,
- q31_t * pResult)
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
{
- q63_t sum = 0; /* Temporary result storage */
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Temporary result storage */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
- q31_t in1, in2, in3, in4;
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while (blkCnt > 0U)
- {
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- in1 = *pSrc++;
- in2 = *pSrc++;
- in3 = *pSrc++;
- in4 = *pSrc++;
-
- sum += in1;
- sum += in2;
- sum += in3;
- sum += in4;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
-
-#else
- /* Run the below code for Cortex-M0 */
-
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
+ sum += *pSrc++;
+
+ sum += *pSrc++;
+
+ sum += *pSrc++;
+
/* Decrement the loop counter */
blkCnt--;
}
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ sum += *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
- /* Store the result to the destination */
- *pResult = (q31_t) (sum / (int32_t) blockSize);
+ /* Store result to destination */
+ *pResult = (q31_t) (sum / blockSize);
}
/**
- * @} end of mean group
+ @} end of mean group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_mean_q7.c b/DSP/Source/StatisticsFunctions/arm_mean_q7.c
index a7bdfb8..8f52211 100644
--- a/DSP/Source/StatisticsFunctions/arm_mean_q7.c
+++ b/DSP/Source/StatisticsFunctions/arm_mean_q7.c
@@ -3,13 +3,13 @@
* Title: arm_mean_q7.c
* Description: Mean value of a Q7 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,57 +29,51 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup mean
- * @{
+ @addtogroup mean
+ @{
*/
-
/**
- * @brief Mean value of a Q7 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult mean value returned here
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- * \par
- * The function is implemented using a 32-bit internal accumulator.
- * The input is represented in 1.7 format and is accumulated in a 32-bit
- * accumulator in 25.7 format.
- * There is no risk of internal overflow with this approach, and the
- * full precision of intermediate result is preserved.
- * Finally, the accumulator is truncated to yield a result of 1.7 format.
- *
+ @brief Mean value of a Q7 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult mean value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ The input is represented in 1.7 format and is accumulated in a 32-bit
+ accumulator in 25.7 format.
+ There is no risk of internal overflow with this approach, and the
+ full precision of intermediate result is preserved.
+ Finally, the accumulator is truncated to yield a result of 1.7 format.
*/
void arm_mean_q7(
- q7_t * pSrc,
- uint32_t blockSize,
- q7_t * pResult)
+ const q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult)
{
- q31_t sum = 0; /* Temporary result storage */
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t sum = 0; /* Temporary result storage */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t in;
+#endif
- q31_t in;
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- in = *__SIMD32(pSrc)++;
-
+ in = read_q7x4_ia ((q7_t **) &pSrc);
sum += ((in << 24U) >> 24U);
sum += ((in << 16U) >> 24U);
sum += ((in << 8U) >> 24U);
@@ -89,32 +83,30 @@ void arm_mean_q7(
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
- /* Store the result to the destination */
+ /* Store result to destination */
*pResult = (q7_t) (sum / (int32_t) blockSize);
}
/**
- * @} end of mean group
+ @} end of mean group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_min_f32.c b/DSP/Source/StatisticsFunctions/arm_min_f32.c
index 858b0a2..6e9ff4b 100644
--- a/DSP/Source/StatisticsFunctions/arm_min_f32.c
+++ b/DSP/Source/StatisticsFunctions/arm_min_f32.c
@@ -3,13 +3,13 @@
* Title: arm_min_f32.c
* Description: Minimum value of a floating-point vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -27,136 +27,233 @@
*/
#include "arm_math.h"
+#include
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @defgroup Min Minimum
- *
- * Computes the minimum value of an array of data.
- * The function returns both the minimum value and its position within the array.
- * There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ @defgroup Min Minimum
+
+ Computes the minimum value of an array of data.
+ The function returns both the minimum value and its position within the array.
+ There are separate functions for floating-point, Q31, Q15, and Q7 data types.
*/
/**
- * @addtogroup Min
- * @{
+ @addtogroup Min
+ @{
*/
-
/**
- * @brief Minimum value of a floating-point vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult minimum value returned here
- * @param[out] *pIndex index of minimum value returned here
- * @return none.
+ @brief Minimum value of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
*/
-
+#if defined(ARM_MATH_NEON)
void arm_min_f32(
- float32_t * pSrc,
+ const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex)
{
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- float32_t minVal1, minVal2, out; /* Temporary variables to store the output value. */
+ float32_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex, count; /* loop counter */
+ float32x4_t outV, srcV;
+ float32x2_t outV2;
+
+ uint32x4_t idxV;
+ uint32x4_t maxIdx={ULONG_MAX,ULONG_MAX,ULONG_MAX,ULONG_MAX};
+ uint32x4_t index={4,5,6,7};
+ uint32x4_t delta={4,4,4,4};
+ uint32x4_t countV={0,1,2,3};
+ uint32x2_t countV2;
+
/* Initialise the count value. */
count = 0U;
+
/* Initialise the index value to zero. */
outIndex = 0U;
+
+ /* Load first input value that act as reference value for comparison */
+ if (blockSize <= 3)
+ {
+ out = *pSrc++;
+
+ blkCnt = blockSize - 1;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ /* compare for the maximum value */
+ if (out > maxVal1)
+ {
+ /* Update the maximum value and it's index */
+ out = maxVal1;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ outV = vld1q_f32(pSrc);
+ pSrc += 4;
+
+ /* Compute 4 outputs at a time */
+ blkCnt = (blockSize - 4 ) >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ srcV = vld1q_f32(pSrc);
+ pSrc += 4;
+
+ idxV = vcltq_f32(srcV, outV);
+ outV = vbslq_f32(idxV, srcV, outV );
+ countV = vbslq_u32(idxV, index,countV );
+
+ index = vaddq_u32(index,delta);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ outV2 = vpmin_f32(vget_low_f32(outV),vget_high_f32(outV));
+ outV2 = vpmin_f32(outV2,outV2);
+ out = outV2[0];
+
+ idxV = vceqq_f32(outV, vdupq_n_f32(out));
+ countV = vbslq_u32(idxV, countV,maxIdx);
+
+ countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV));
+ countV2 = vpmin_u32(countV2,countV2);
+ outIndex = countV2[0];
+
+ /* if (blockSize - 1U) is not multiple of 4 */
+ blkCnt = (blockSize - 4 ) % 4U;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ /* compare for the maximum value */
+ if (out > maxVal1)
+ {
+ /* Update the maximum value and it's index */
+ out = maxVal1;
+ outIndex = blockSize - blkCnt ;
+ }
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+ /* Store the maximum value and it's index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#else
+void arm_min_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult,
+ uint32_t * pIndex)
+{
+ float32_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* index of maximum value */
+#endif
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
- /* Loop unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise index of maximum value. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
- /* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
- minVal2 = *pSrc++;
+ /* Initialize minVal to next consecutive values one by one */
+ minVal = *pSrc++;
/* compare for the minimum value */
- if (out > minVal1)
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal1;
- outIndex = count + 1U;
+ /* Update the minimum value and it's index */
+ out = minVal;
+ outIndex = index + 1U;
}
- /* compare for the minimum value */
- if (out > minVal2)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal2;
- outIndex = count + 2U;
+ out = minVal;
+ outIndex = index + 2U;
}
- /* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
- minVal2 = *pSrc++;
-
- /* compare for the minimum value */
- if (out > minVal1)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal1;
- outIndex = count + 3U;
+ out = minVal;
+ outIndex = index + 3U;
}
- /* compare for the minimum value */
- if (out > minVal2)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal2;
- outIndex = count + 4U;
+ out = minVal;
+ outIndex = index + 4U;
}
- count += 4U;
+ index += 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* if (blockSize - 1U) is not multiple of 4 */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
- /* Run the below code for Cortex-M0 */
-
- float32_t minVal1, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex; /* loop counter */
-
- /* Initialise the index value to zero. */
- outIndex = 0U;
- /* Load first input value that act as reference value for comparision */
- out = *pSrc++;
+ /* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
+ minVal = *pSrc++;
/* compare for the minimum value */
- if (out > minVal1)
+ if (out > minVal)
{
/* Update the minimum value and it's index */
- out = minVal1;
+ out = minVal;
outIndex = blockSize - blkCnt;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -164,7 +261,8 @@ void arm_min_f32(
*pResult = out;
*pIndex = outIndex;
}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of Min group
+ @} end of Min group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_min_q15.c b/DSP/Source/StatisticsFunctions/arm_min_q15.c
index fdc32b7..9450383 100644
--- a/DSP/Source/StatisticsFunctions/arm_min_q15.c
+++ b/DSP/Source/StatisticsFunctions/arm_min_q15.c
@@ -3,13 +3,13 @@
* Title: arm_min_q15.c
* Description: Minimum value of a Q15 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,127 +29,113 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup Min
- * @{
+ @addtogroup Min
+ @{
*/
-
/**
- * @brief Minimum value of a Q15 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult minimum value returned here
- * @param[out] *pIndex index of minimum value returned here
- * @return none.
+ @brief Minimum value of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
*/
void arm_min_q15(
- q15_t * pSrc,
- uint32_t blockSize,
- q15_t * pResult,
- uint32_t * pIndex)
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult,
+ uint32_t * pIndex)
{
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q15_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
- q15_t minVal1, minVal2, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex, count; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* index of maximum value */
+#endif
- /* Initialise the count value. */
- count = 0U;
- /* Initialise the index value to zero. */
+ /* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
- /* Loop unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise index of maximum value. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
- /* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
- minVal2 = *pSrc++;
+ /* Initialize minVal to next consecutive values one by one */
+ minVal = *pSrc++;
/* compare for the minimum value */
- if (out > minVal1)
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal1;
- outIndex = count + 1U;
+ /* Update the minimum value and it's index */
+ out = minVal;
+ outIndex = index + 1U;
}
- /* compare for the minimum value */
- if (out > minVal2)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal2;
- outIndex = count + 2U;
+ out = minVal;
+ outIndex = index + 2U;
}
- /* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
- minVal2 = *pSrc++;
-
- /* compare for the minimum value */
- if (out > minVal1)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal1;
- outIndex = count + 3U;
+ out = minVal;
+ outIndex = index + 3U;
}
- /* compare for the minimum value */
- if (out > minVal2)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal2;
- outIndex = count + 4U;
+ out = minVal;
+ outIndex = index + 4U;
}
- count += 4U;
+ index += 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* if (blockSize - 1U) is not multiple of 4 */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
- /* Run the below code for Cortex-M0 */
-
- q15_t minVal1, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex; /* loop counter */
-
- /* Initialise the index value to zero. */
- outIndex = 0U;
- /* Load first input value that act as reference value for comparision */
- out = *pSrc++;
+ /* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
+ minVal = *pSrc++;
/* compare for the minimum value */
- if (out > minVal1)
+ if (out > minVal)
{
/* Update the minimum value and it's index */
- out = minVal1;
+ out = minVal;
outIndex = blockSize - blkCnt;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -159,5 +145,5 @@ void arm_min_q15(
}
/**
- * @} end of Min group
+ @} end of Min group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_min_q31.c b/DSP/Source/StatisticsFunctions/arm_min_q31.c
index fc4c155..e25eb47 100644
--- a/DSP/Source/StatisticsFunctions/arm_min_q31.c
+++ b/DSP/Source/StatisticsFunctions/arm_min_q31.c
@@ -3,13 +3,13 @@
* Title: arm_min_q31.c
* Description: Minimum value of a Q31 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,127 +29,113 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup Min
- * @{
+ @addtogroup Min
+ @{
*/
-
/**
- * @brief Minimum value of a Q31 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult minimum value returned here
- * @param[out] *pIndex index of minimum value returned here
- * @return none.
+ @brief Minimum value of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
*/
void arm_min_q31(
- q31_t * pSrc,
- uint32_t blockSize,
- q31_t * pResult,
- uint32_t * pIndex)
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult,
+ uint32_t * pIndex)
{
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q31_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
- q31_t minVal1, minVal2, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex, count; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* index of maximum value */
+#endif
- /* Initialise the count value. */
- count = 0U;
- /* Initialise the index value to zero. */
+ /* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
- /* Loop unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise index of maximum value. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
- /* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
- minVal2 = *pSrc++;
+ /* Initialize minVal to next consecutive values one by one */
+ minVal = *pSrc++;
/* compare for the minimum value */
- if (out > minVal1)
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal1;
- outIndex = count + 1U;
+ /* Update the minimum value and it's index */
+ out = minVal;
+ outIndex = index + 1U;
}
- /* compare for the minimum value */
- if (out > minVal2)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal2;
- outIndex = count + 2U;
+ out = minVal;
+ outIndex = index + 2U;
}
- /* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
- minVal2 = *pSrc++;
-
- /* compare for the minimum value */
- if (out > minVal1)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal1;
- outIndex = count + 3U;
+ out = minVal;
+ outIndex = index + 3U;
}
- /* compare for the minimum value */
- if (out > minVal2)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal2;
- outIndex = count + 4U;
+ out = minVal;
+ outIndex = index + 4U;
}
- count += 4U;
+ index += 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* if (blockSize - 1U) is not multiple of 4 */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
- /* Run the below code for Cortex-M0 */
-
- q31_t minVal1, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex; /* loop counter */
-
- /* Initialise the index value to zero. */
- outIndex = 0U;
- /* Load first input value that act as reference value for comparision */
- out = *pSrc++;
+ /* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
+ minVal = *pSrc++;
/* compare for the minimum value */
- if (out > minVal1)
+ if (out > minVal)
{
/* Update the minimum value and it's index */
- out = minVal1;
+ out = minVal;
outIndex = blockSize - blkCnt;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -159,5 +145,5 @@ void arm_min_q31(
}
/**
- * @} end of Min group
+ @} end of Min group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_min_q7.c b/DSP/Source/StatisticsFunctions/arm_min_q7.c
index 50362e6..2b171f0 100644
--- a/DSP/Source/StatisticsFunctions/arm_min_q7.c
+++ b/DSP/Source/StatisticsFunctions/arm_min_q7.c
@@ -3,13 +3,13 @@
* Title: arm_min_q7.c
* Description: Minimum value of a Q7 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,127 +29,113 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup Min
- * @{
+ @addtogroup Min
+ @{
*/
-
/**
- * @brief Minimum value of a Q7 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult minimum value returned here
- * @param[out] *pIndex index of minimum value returned here
- * @return none.
+ @brief Minimum value of a Q7 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
*/
void arm_min_q7(
- q7_t * pSrc,
- uint32_t blockSize,
- q7_t * pResult,
- uint32_t * pIndex)
+ const q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult,
+ uint32_t * pIndex)
{
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q7_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
- q7_t minVal1, minVal2, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex, count; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* index of maximum value */
+#endif
- /* Initialise the count value. */
- count = 0U;
- /* Initialise the index value to zero. */
+ /* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
- /* Loop unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise index of maximum value. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
- /* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
- minVal2 = *pSrc++;
+ /* Initialize minVal to next consecutive values one by one */
+ minVal = *pSrc++;
/* compare for the minimum value */
- if (out > minVal1)
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal1;
- outIndex = count + 1U;
+ /* Update the minimum value and it's index */
+ out = minVal;
+ outIndex = index + 1U;
}
- /* compare for the minimum value */
- if (out > minVal2)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal2;
- outIndex = count + 2U;
+ out = minVal;
+ outIndex = index + 2U;
}
- /* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
- minVal2 = *pSrc++;
-
- /* compare for the minimum value */
- if (out > minVal1)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal1;
- outIndex = count + 3U;
+ out = minVal;
+ outIndex = index + 3U;
}
- /* compare for the minimum value */
- if (out > minVal2)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal2;
- outIndex = count + 4U;
+ out = minVal;
+ outIndex = index + 4U;
}
- count += 4U;
+ index += 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* if (blockSize - 1U) is not multiple of 4 */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
- /* Run the below code for Cortex-M0 */
-
- q7_t minVal1, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex; /* loop counter */
-
- /* Initialise the index value to zero. */
- outIndex = 0U;
- /* Load first input value that act as reference value for comparision */
- out = *pSrc++;
+ /* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
+ minVal = *pSrc++;
/* compare for the minimum value */
- if (out > minVal1)
+ if (out > minVal)
{
/* Update the minimum value and it's index */
- out = minVal1;
+ out = minVal;
outIndex = blockSize - blkCnt;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -159,5 +145,5 @@ void arm_min_q7(
}
/**
- * @} end of Min group
+ @} end of Min group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_power_f32.c b/DSP/Source/StatisticsFunctions/arm_power_f32.c
index 1426735..a4825a5 100644
--- a/DSP/Source/StatisticsFunctions/arm_power_f32.c
+++ b/DSP/Source/StatisticsFunctions/arm_power_f32.c
@@ -3,13 +3,13 @@
* Title: arm_power_f32.c
* Description: Sum of the squares of the elements of a floating-point vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,40 +29,37 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @defgroup power Power
- *
- * Calculates the sum of the squares of the elements in the input vector.
- * The underlying algorithm is used:
- *
- *
- *
- * There are separate functions for floating point, Q31, Q15, and Q7 data types.
+ @defgroup power Power
+
+ Calculates the sum of the squares of the elements in the input vector.
+ The underlying algorithm is used:
+
+
+
+ There are separate functions for floating point, Q31, Q15, and Q7 data types.
*/
/**
- * @addtogroup power
- * @{
+ @addtogroup power
+ @{
*/
-
/**
- * @brief Sum of the squares of the elements of a floating-point vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult sum of the squares value returned here
- * @return none.
- *
+ @brief Sum of the squares of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult sum of the squares value returned here
+ @return none
*/
-
-
+#if defined(ARM_MATH_NEON)
void arm_power_f32(
- float32_t * pSrc,
+ const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
@@ -70,45 +67,32 @@ void arm_power_f32(
float32_t in; /* Temporary variable to store input value */
uint32_t blkCnt; /* loop counter */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
+ float32x2_t sumV2;
+ float32x4_t inV;
- /*loop Unrolling */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and then store the result in a temporary variable, sum. */
- in = *pSrc++;
- sum += in * in;
- in = *pSrc++;
- sum += in * in;
- in = *pSrc++;
- sum += in * in;
- in = *pSrc++;
- sum += in * in;
+ inV = vld1q_f32(pSrc);
+ sumV = vmlaq_f32(sumV, inV, inV);
+ pSrc += 4;
/* Decrement the loop counter */
blkCnt--;
}
+ sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
+ sum = sumV2[0] + sumV2[1];
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
-
-#else
- /* Run the below code for Cortex-M0 */
-
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
-
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
@@ -123,7 +107,69 @@ void arm_power_f32(
/* Store the result to the destination */
*pResult = sum;
}
+#else
+void arm_power_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ float32_t sum = 0.0f; /* Temporary result storage */
+ float32_t in; /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += in * in;
+
+ in = *pSrc++;
+ sum += in * in;
+
+ in = *pSrc++;
+ sum += in * in;
+
+ in = *pSrc++;
+ sum += in * in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += in * in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store result to destination */
+ *pResult = sum;
+}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of power group
+ @} end of power group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_power_q15.c b/DSP/Source/StatisticsFunctions/arm_power_q15.c
index 6d95f4d..12f524d 100644
--- a/DSP/Source/StatisticsFunctions/arm_power_q15.c
+++ b/DSP/Source/StatisticsFunctions/arm_power_q15.c
@@ -3,13 +3,13 @@
* Title: arm_power_q15.c
* Description: Sum of the squares of the elements of a Q15 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,110 +29,104 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup power
- * @{
+ @addtogroup power
+ @{
*/
/**
- * @brief Sum of the squares of the elements of a Q15 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult sum of the squares value returned here
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * The input is represented in 1.15 format.
- * Intermediate multiplication yields a 2.30 format, and this
- * result is added without saturation to a 64-bit accumulator in 34.30 format.
- * With 33 guard bits in the accumulator, there is no risk of overflow, and the
- * full precision of the intermediate multiplication is preserved.
- * Finally, the return result is in 34.30 format.
- *
+ @brief Sum of the squares of the elements of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult sum of the squares value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.15 format.
+ Intermediate multiplication yields a 2.30 format, and this
+ result is added without saturation to a 64-bit accumulator in 34.30 format.
+ With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the return result is in 34.30 format.
*/
void arm_power_q15(
- q15_t * pSrc,
- uint32_t blockSize,
- q63_t * pResult)
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q63_t * pResult)
{
- q63_t sum = 0; /* Temporary result storage */
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Temporary result storage */
+ q15_t in; /* Temporary variable to store input value */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in32; /* Temporary variable to store packed input value */
+#endif
- q31_t in32; /* Temporary variable to store input value */
- q15_t in16; /* Temporary variable to store input value */
- uint32_t blkCnt; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
-
- /* loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute Power and then store the result in a temporary variable, sum. */
- in32 = *__SIMD32(pSrc)++;
- sum = __SMLALD(in32, in32, sum);
- in32 = *__SIMD32(pSrc)++;
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
+#if defined (ARM_MATH_DSP)
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
sum = __SMLALD(in32, in32, sum);
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute Power and then store the result in a temporary variable, sum. */
- in16 = *pSrc++;
- sum = __SMLALD(in16, in16, sum);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sum = __SMLALD(in32, in32, sum);
#else
- /* Run the below code for Cortex-M0 */
-
- q15_t in; /* Temporary variable to store input value */
- uint32_t blkCnt; /* loop counter */
-
-
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute Power and then store the result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q31_t) in * in);
- /* Decrement the loop counter */
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
- /* Store the results in 34.30 format */
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store result in 34.30 format */
*pResult = sum;
}
/**
- * @} end of power group
+ @} end of power group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_power_q31.c b/DSP/Source/StatisticsFunctions/arm_power_q31.c
index 16be249..1e193b3 100644
--- a/DSP/Source/StatisticsFunctions/arm_power_q31.c
+++ b/DSP/Source/StatisticsFunctions/arm_power_q31.c
@@ -3,13 +3,13 @@
* Title: arm_power_q31.c
* Description: Sum of the squares of the elements of a Q31 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,58 +29,51 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup power
- * @{
+ @addtogroup power
+ @{
*/
/**
- * @brief Sum of the squares of the elements of a Q31 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult sum of the squares value returned here
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * The input is represented in 1.31 format.
- * Intermediate multiplication yields a 2.62 format, and this
- * result is truncated to 2.48 format by discarding the lower 14 bits.
- * The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
- * With 15 guard bits in the accumulator, there is no risk of overflow, and the
- * full precision of the intermediate multiplication is preserved.
- * Finally, the return result is in 16.48 format.
- *
+ @brief Sum of the squares of the elements of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult sum of the squares value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.31 format.
+ Intermediate multiplication yields a 2.62 format, and this
+ result is truncated to 2.48 format by discarding the lower 14 bits.
+ The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
+ With 15 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the return result is in 16.48 format.
*/
void arm_power_q31(
- q31_t * pSrc,
- uint32_t blockSize,
- q63_t * pResult)
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q63_t * pResult)
{
- q63_t sum = 0; /* Temporary result storage */
- q31_t in;
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Temporary result storage */
+ q31_t in; /* Temporary variable to store input value */
+#if defined (ARM_MATH_LOOPUNROLL)
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute Power then shift intermediate results by 14 bits to maintain 16.48 format and then store the result in a temporary variable sum, providing 15 guard bits. */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power then shift intermediate results by 14 bits to maintain 16.48 format and store result in a temporary variable sum, providing 15 guard bits. */
in = *pSrc++;
sum += ((q63_t) in * in) >> 14U;
@@ -93,37 +86,36 @@ void arm_power_q31(
in = *pSrc++;
sum += ((q63_t) in * in) >> 14U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute Power and then store the result in a temporary variable, sum. */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q63_t) in * in) >> 14U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Store the results in 16.48 format */
+ /* Store results in 16.48 format */
*pResult = sum;
}
/**
- * @} end of power group
+ @} end of power group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_power_q7.c b/DSP/Source/StatisticsFunctions/arm_power_q7.c
index 24306cd..47405cd 100644
--- a/DSP/Source/StatisticsFunctions/arm_power_q7.c
+++ b/DSP/Source/StatisticsFunctions/arm_power_q7.c
@@ -3,13 +3,13 @@
* Title: arm_power_q7.c
* Description: Sum of the squares of the elements of a Q7 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,99 +29,108 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup power
- * @{
+ @addtogroup power
+ @{
*/
/**
- * @brief Sum of the squares of the elements of a Q7 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult sum of the squares value returned here
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using a 32-bit internal accumulator.
- * The input is represented in 1.7 format.
- * Intermediate multiplication yields a 2.14 format, and this
- * result is added without saturation to an accumulator in 18.14 format.
- * With 17 guard bits in the accumulator, there is no risk of overflow, and the
- * full precision of the intermediate multiplication is preserved.
- * Finally, the return result is in 18.14 format.
- *
+ @brief Sum of the squares of the elements of a Q7 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult sum of the squares value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ The input is represented in 1.7 format.
+ Intermediate multiplication yields a 2.14 format, and this
+ result is added without saturation to an accumulator in 18.14 format.
+ With 17 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the return result is in 18.14 format.
*/
void arm_power_q7(
- q7_t * pSrc,
- uint32_t blockSize,
- q31_t * pResult)
+ const q7_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
{
- q31_t sum = 0; /* Temporary result storage */
- q7_t in; /* Temporary variable to store input */
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t sum = 0; /* Temporary result storage */
+ q7_t in; /* Temporary variable to store input value */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in32; /* Temporary variable to store packed input value */
+ q31_t in1, in2; /* Temporary variables to store input value */
+#endif
- q31_t input1; /* Temporary variable to store packed input */
- q31_t in1, in2; /* Temporary variables to store input */
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* Reading two inputs of pSrc vector and packing */
- input1 = *__SIMD32(pSrc)++;
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
- in1 = __SXTB16(__ROR(input1, 8));
- in2 = __SXTB16(input1);
+ /* Compute Power and store result in a temporary variable, sum. */
+#if defined (ARM_MATH_DSP)
+ in32 = read_q7x4_ia ((q7_t **) &pSrc);
+
+ in1 = __SXTB16(__ROR(in32, 8));
+ in2 = __SXTB16(in32);
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* calculate power and accumulate to accumulator */
sum = __SMLAD(in1, in1, sum);
sum = __SMLAD(in2, in2, sum);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
-
#else
- /* Run the below code for Cortex-M0 */
-
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
- while (blkCnt > 0U)
- {
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute Power and then store the result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q15_t) in * in);
- /* Decrement the loop counter */
+ in = *pSrc++;
+ sum += ((q15_t) in * in);
+
+ in = *pSrc++;
+ sum += ((q15_t) in * in);
+
+ in = *pSrc++;
+ sum += ((q15_t) in * in);
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* Store the result in 18.14 format */
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += ((q15_t) in * in);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store result in 18.14 format */
*pResult = sum;
}
/**
- * @} end of power group
+ @} end of power group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_rms_f32.c b/DSP/Source/StatisticsFunctions/arm_rms_f32.c
index 8d1b708..4546510 100644
--- a/DSP/Source/StatisticsFunctions/arm_rms_f32.c
+++ b/DSP/Source/StatisticsFunctions/arm_rms_f32.c
@@ -1,15 +1,15 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_rms_f32.c
- * Description: Root mean square value of an array of F32 type
+ * Description: Root mean square value of the elements of a floating-point vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,88 +29,75 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @defgroup RMS Root mean square (RMS)
- *
- *
- * Calculates the Root Mean Sqaure of the elements in the input vector.
- * The underlying algorithm is used:
- *
- *
- *
- * There are separate functions for floating point, Q31, and Q15 data types.
+ @defgroup RMS Root mean square (RMS)
+
+ Calculates the Root Mean Square of the elements in the input vector.
+ The underlying algorithm is used:
+
+
+
+ There are separate functions for floating point, Q31, and Q15 data types.
*/
/**
- * @addtogroup RMS
- * @{
+ @addtogroup RMS
+ @{
*/
-
/**
- * @brief Root Mean Square of the elements of a floating-point vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult rms value returned here
- * @return none.
- *
+ @brief Root Mean Square of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult root mean square value returned here
+ @return none
*/
-
+#if defined(ARM_MATH_NEON)
void arm_rms_f32(
- float32_t * pSrc,
+ const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
- float32_t sum = 0.0f; /* Accumulator */
- float32_t in; /* Tempoprary variable to store input value */
+ float32_t sum = 0.0f; /* accumulator */
+ float32_t in; /* Temporary variable to store input value */
uint32_t blkCnt; /* loop counter */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
+ float32x2_t sumV2;
+ float32x4_t inV;
- /* loop Unrolling */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute sum of the squares and then store the result in a temporary variable, sum */
- in = *pSrc++;
- sum += in * in;
- in = *pSrc++;
- sum += in * in;
- in = *pSrc++;
- sum += in * in;
- in = *pSrc++;
- sum += in * in;
+ /* Compute Power and then store the result in a temporary variable, sum. */
+ inV = vld1q_f32(pSrc);
+ sumV = vmlaq_f32(sumV, inV, inV);
+ pSrc += 4;
/* Decrement the loop counter */
blkCnt--;
}
+ sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
+ sum = sumV2[0] + sumV2[1];
+
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
-#else
- /* Run the below code for Cortex-M0 */
-
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute sum of the squares and then store the results in a temporary variable, sum */
+ /* compute power and then store the result in a temporary variable, sum. */
in = *pSrc++;
sum += in * in;
@@ -121,7 +108,69 @@ void arm_rms_f32(
/* Compute Rms and store the result in the destination */
arm_sqrt_f32(sum / (float32_t) blockSize, pResult);
}
+#else
+void arm_rms_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ float32_t sum = 0.0f; /* Temporary result storage */
+ float32_t in; /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable, sum. */
+ sum += in * in;
+
+ in = *pSrc++;
+ sum += in * in;
+
+ in = *pSrc++;
+ sum += in * in;
+
+ in = *pSrc++;
+ sum += in * in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable. */
+ sum += ( in * in);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Rms and store result in destination */
+ arm_sqrt_f32(sum / (float32_t) blockSize, pResult);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of RMS group
+ @} end of RMS group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_rms_q15.c b/DSP/Source/StatisticsFunctions/arm_rms_q15.c
index d0e61ca..9fcd964 100644
--- a/DSP/Source/StatisticsFunctions/arm_rms_q15.c
+++ b/DSP/Source/StatisticsFunctions/arm_rms_q15.c
@@ -3,13 +3,13 @@
* Title: arm_rms_q15.c
* Description: Root Mean Square of the elements of a Q15 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,111 +29,106 @@
#include "arm_math.h"
/**
- * @addtogroup RMS
- * @{
+ @ingroup groupStats
*/
/**
- * @brief Root Mean Square of the elements of a Q15 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult rms value returned here
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * The input is represented in 1.15 format.
- * Intermediate multiplication yields a 2.30 format, and this
- * result is added without saturation to a 64-bit accumulator in 34.30 format.
- * With 33 guard bits in the accumulator, there is no risk of overflow, and the
- * full precision of the intermediate multiplication is preserved.
- * Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
- * 15 bits, and then saturated to yield a result in 1.15 format.
- *
+ @addtogroup RMS
+ @{
+ */
+
+/**
+ @brief Root Mean Square of the elements of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult root mean square value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.15 format.
+ Intermediate multiplication yields a 2.30 format, and this
+ result is added without saturation to a 64-bit accumulator in 34.30 format.
+ With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
+ 15 bits, and then saturated to yield a result in 1.15 format.
*/
void arm_rms_q15(
- q15_t * pSrc,
- uint32_t blockSize,
- q15_t * pResult)
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
{
- q63_t sum = 0; /* accumulator */
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Temporary result storage */
+ q15_t in; /* Temporary variable to store input value */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in32; /* Temporary variable to store input value */
+#endif
- q31_t in; /* temporary variable to store the input value */
- q15_t in1; /* temporary variable to store the input value */
- uint32_t blkCnt; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
- /* loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute sum of the squares and then store the results in a temporary variable, sum */
- in = *__SIMD32(pSrc)++;
- sum = __SMLALD(in, in, sum);
- in = *__SIMD32(pSrc)++;
- sum = __SMLALD(in, in, sum);
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute sum of the squares and then store the results in a temporary variable, sum */
- in1 = *pSrc++;
- sum = __SMLALD(in1, in1, sum);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Truncating and saturating the accumulator to 1.15 format */
- /* Store the result in the destination */
- arm_sqrt_q15(__SSAT((sum / (q63_t)blockSize) >> 15, 16), pResult);
+ /* Compute sum of squares and store result in a temporary variable. */
+#if defined (ARM_MATH_DSP)
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sum = __SMLALD(in32, in32, sum);
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sum = __SMLALD(in32, in32, sum);
#else
- /* Run the below code for Cortex-M0 */
-
- q15_t in; /* temporary variable to store the input value */
- uint32_t blkCnt; /* loop counter */
-
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute sum of the squares and then store the results in a temporary variable, sum */
in = *pSrc++;
sum += ((q31_t) in * in);
- /* Decrement the loop counter */
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable. */
+ sum += ((q31_t) in * in);
+
+ /* Decrement loop counter */
blkCnt--;
}
/* Truncating and saturating the accumulator to 1.15 format */
- /* Store the result in the destination */
+ /* Store result in destination */
arm_sqrt_q15(__SSAT((sum / (q63_t)blockSize) >> 15, 16), pResult);
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of RMS group
+ @} end of RMS group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_rms_q31.c b/DSP/Source/StatisticsFunctions/arm_rms_q31.c
index cb3c58e..5a3e8f3 100644
--- a/DSP/Source/StatisticsFunctions/arm_rms_q31.c
+++ b/DSP/Source/StatisticsFunctions/arm_rms_q31.c
@@ -3,13 +3,13 @@
* Title: arm_rms_q31.c
* Description: Root Mean Square of the elements of a Q31 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,109 +29,96 @@
#include "arm_math.h"
/**
- * @addtogroup RMS
- * @{
+ @ingroup groupStats
*/
+/**
+ @addtogroup RMS
+ @{
+ */
/**
- * @brief Root Mean Square of the elements of a Q31 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult rms value returned here
- * @return none.
- *
- * @details
- * Scaling and Overflow Behavior:
- *
- *\par
- * The function is implemented using an internal 64-bit accumulator.
- * The input is represented in 1.31 format, and intermediate multiplication
- * yields a 2.62 format.
- * The accumulator maintains full precision of the intermediate multiplication results,
- * but provides only a single guard bit.
- * There is no saturation on intermediate additions.
- * If the accumulator overflows, it wraps around and distorts the result.
- * In order to avoid overflows completely, the input signal must be scaled down by
- * log2(blockSize) bits, as a total of blockSize additions are performed internally.
- * Finally, the 2.62 accumulator is right shifted by 31 bits to yield a 1.31 format value.
- *
+ @brief Root Mean Square of the elements of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult root mean square value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The input is represented in 1.31 format, and intermediate multiplication
+ yields a 2.62 format.
+ The accumulator maintains full precision of the intermediate multiplication results,
+ but provides only a single guard bit.
+ There is no saturation on intermediate additions.
+ If the accumulator overflows, it wraps around and distorts the result.
+ In order to avoid overflows completely, the input signal must be scaled down by
+ log2(blockSize) bits, as a total of blockSize additions are performed internally.
+ Finally, the 2.62 accumulator is right shifted by 31 bits to yield a 1.31 format value.
*/
void arm_rms_q31(
- q31_t * pSrc,
- uint32_t blockSize,
- q31_t * pResult)
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
{
- q63_t sum = 0; /* accumulator */
- q31_t in; /* Temporary variable to store the input */
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ uint64_t sum = 0; /* Temporary result storage (can get never negative. changed type from q63 to uint64 */
+ q31_t in; /* Temporary variable to store input value */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
- q31_t in1, in2, in3, in4; /* Temporary input variables */
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 8 outputs at a time.
- ** a second loop below computes the remaining 1 to 7 samples. */
while (blkCnt > 0U)
{
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute sum of the squares and then store the result in a temporary variable, sum */
- /* read two samples from source buffer */
- in1 = pSrc[0];
- in2 = pSrc[1];
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
- /* calculate power and accumulate to accumulator */
- sum += (q63_t) in1 *in1;
- sum += (q63_t) in2 *in2;
+ in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable, sum. */
+ sum += ((q63_t) in * in);
- /* read two samples from source buffer */
- in3 = pSrc[2];
- in4 = pSrc[3];
+ in = *pSrc++;
+ sum += ((q63_t) in * in);
- /* calculate power and accumulate to accumulator */
- sum += (q63_t) in3 *in3;
- sum += (q63_t) in4 *in4;
+ in = *pSrc++;
+ sum += ((q63_t) in * in);
+ in = *pSrc++;
+ sum += ((q63_t) in * in);
- /* update source buffer to process next samples */
- pSrc += 4U;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute sum of the squares and then store the results in a temporary variable, sum */
- in = *pSrc++;
- sum += (q63_t) in *in;
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Decrement the loop counter */
+ in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable. */
+ sum += ((q63_t) in * in);
+
+ /* Decrement loop counter */
blkCnt--;
}
/* Convert data in 2.62 to 1.31 by 31 right shifts and saturate */
- /* Compute Rms and store the result in the destination vector */
+ /* Compute Rms and store result in destination vector */
arm_sqrt_q31(clip_q63_to_q31((sum / (q63_t) blockSize) >> 31), pResult);
}
/**
- * @} end of RMS group
+ @} end of RMS group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_std_f32.c b/DSP/Source/StatisticsFunctions/arm_std_f32.c
index 9750b88..e1e6577 100644
--- a/DSP/Source/StatisticsFunctions/arm_std_f32.c
+++ b/DSP/Source/StatisticsFunctions/arm_std_f32.c
@@ -3,13 +3,13 @@
* Title: arm_std_f32.c
* Description: Standard deviation of the elements of a floating-point vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,111 +29,131 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @defgroup STD Standard deviation
- *
- * Calculates the standard deviation of the elements in the input vector.
- * The underlying algorithm is used:
- *
- *
- *
- * There are separate functions for floating point, Q31, and Q15 data types.
+ @defgroup STD Standard deviation
+
+ Calculates the standard deviation of the elements in the input vector.
+ The underlying algorithm is used:
+
+
+
+ There are separate functions for floating point, Q31, and Q15 data types.
*/
/**
- * @addtogroup STD
- * @{
+ @addtogroup STD
+ @{
*/
-
/**
- * @brief Standard deviation of the elements of a floating-point vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult standard deviation value returned here
- * @return none.
+ @brief Standard deviation of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult standard deviation value returned here
+ @return none
*/
-
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
void arm_std_f32(
- float32_t * pSrc,
- uint32_t blockSize,
- float32_t * pResult)
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
{
- float32_t sum = 0.0f; /* Temporary result storage */
- float32_t sumOfSquares = 0.0f; /* Sum of squares */
- float32_t in; /* input value */
- uint32_t blkCnt; /* loop counter */
-#if defined (ARM_MATH_DSP)
- float32_t meanOfSquares, mean, squareOfMean; /* Temporary variables */
+ float32_t var;
+ arm_var_f32(pSrc,blockSize,&var);
+ arm_sqrt_f32(var, pResult);
+}
#else
- float32_t squareOfSum; /* Square of Sum */
- float32_t var; /* Temporary varaince storage */
+void arm_std_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ float32_t sum = 0.0f; /* Temporary result storage */
+ float32_t sumOfSquares = 0.0f; /* Sum of squares */
+ float32_t in; /* Temporary variable to store input value */
+
+#ifndef ARM_MATH_CM0_FAMILY
+ float32_t meanOfSquares, mean, squareOfMean; /* Temporary variables */
+#else
+ float32_t squareOfSum; /* Square of Sum */
+ float32_t var; /* Temporary varaince storage */
#endif
- if (blockSize == 1U)
+ if (blockSize <= 1U)
{
*pResult = 0;
return;
}
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
- in = *pSrc++;
- sum += in;
- sumOfSquares += in * in;
- in = *pSrc++;
- sum += in;
- sumOfSquares += in * in;
- in = *pSrc++;
- sum += in;
- sumOfSquares += in * in;
- in = *pSrc++;
- sum += in;
- sumOfSquares += in * in;
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
- /* Decrement the loop counter */
+ in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ sumOfSquares += in * in;
+ /* Compute sum and store result in a temporary variable, sum. */
+ sum += in;
+
+ in = *pSrc++;
+ sumOfSquares += in * in;
+ sum += in;
+
+ in = *pSrc++;
+ sumOfSquares += in * in;
+ sum += in;
+
+ in = *pSrc++;
+ sumOfSquares += in * in;
+ sum += in;
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
- in = *pSrc++;
- sum += in;
- sumOfSquares += in * in;
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
- /* Decrement the loop counter */
+ in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ sumOfSquares += ( in * in);
+ /* Compute sum and store result in a temporary variable, sum. */
+ sum += in;
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
+#ifndef ARM_MATH_CM0_FAMILY
+
+ /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
meanOfSquares = sumOfSquares / ((float32_t) blockSize - 1.0f);
/* Compute mean of all input values */
@@ -143,44 +163,26 @@ void arm_std_f32(
squareOfMean = (mean * mean) * (((float32_t) blockSize) /
((float32_t) blockSize - 1.0f));
- /* Compute standard deviation and then store the result to the destination */
+ /* Compute standard deviation and store result to destination */
arm_sqrt_f32((meanOfSquares - squareOfMean), pResult);
#else
/* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sumOfSquares. */
- in = *pSrc++;
- sumOfSquares += in * in;
-
- /* C = (A[0] + A[1] + ... + A[blockSize-1]) */
- /* Compute Sum of the input samples
- * and then store the result in a temporary variable, sum. */
- sum += in;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Compute the square of sum */
+ /* Compute square of sum */
squareOfSum = ((sum * sum) / (float32_t) blockSize);
- /* Compute the variance */
+ /* Compute variance */
var = ((sumOfSquares - squareOfSum) / (float32_t) (blockSize - 1.0f));
- /* Compute standard deviation and then store the result to the destination */
+ /* Compute standard deviation and store result in destination */
arm_sqrt_f32(var, pResult);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #ifndef ARM_MATH_CM0_FAMILY */
+
}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of STD group
+ @} end of STD group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_std_q15.c b/DSP/Source/StatisticsFunctions/arm_std_q15.c
index 2f2f52e..8e5c042 100644
--- a/DSP/Source/StatisticsFunctions/arm_std_q15.c
+++ b/DSP/Source/StatisticsFunctions/arm_std_q15.c
@@ -3,13 +3,13 @@
* Title: arm_std_q15.c
* Description: Standard deviation of an array of Q15 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,146 +29,133 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup STD
- * @{
+ @addtogroup STD
+ @{
*/
/**
- * @brief Standard deviation of the elements of a Q15 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult standard deviation value returned here
- * @return none.
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * The input is represented in 1.15 format.
- * Intermediate multiplication yields a 2.30 format, and this
- * result is added without saturation to a 64-bit accumulator in 34.30 format.
- * With 33 guard bits in the accumulator, there is no risk of overflow, and the
- * full precision of the intermediate multiplication is preserved.
- * Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
- * 15 bits, and then saturated to yield a result in 1.15 format.
+ @brief Standard deviation of the elements of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult standard deviation value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.15 format.
+ Intermediate multiplication yields a 2.30 format, and this
+ result is added without saturation to a 64-bit accumulator in 34.30 format.
+ With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
+ 15 bits, and then saturated to yield a result in 1.15 format.
*/
void arm_std_q15(
- q15_t * pSrc,
- uint32_t blockSize,
- q15_t * pResult)
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
{
- q31_t sum = 0; /* Accumulator */
- q31_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
- uint32_t blkCnt; /* loop counter */
- q63_t sumOfSquares = 0; /* Accumulator */
-#if defined (ARM_MATH_DSP)
- q31_t in; /* input value */
- q15_t in1; /* input value */
-#else
- q15_t in; /* input value */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t sum = 0; /* Accumulator */
+ q31_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
+ q63_t sumOfSquares = 0; /* Sum of squares */
+ q15_t in; /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in32; /* Temporary variable to store input value */
#endif
- if (blockSize == 1U)
+ if (blockSize <= 1U)
{
*pResult = 0;
return;
}
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
- in = *__SIMD32(pSrc)++;
- sum += ((in << 16U) >> 16U);
- sum += (in >> 16U);
- sumOfSquares = __SMLALD(in, in, sumOfSquares);
- in = *__SIMD32(pSrc)++;
- sum += ((in << 16U) >> 16U);
- sum += (in >> 16U);
- sumOfSquares = __SMLALD(in, in, sumOfSquares);
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
- in1 = *pSrc++;
- sumOfSquares = __SMLALD(in1, in1, sumOfSquares);
- sum += in1;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
- meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U));
-
- /* Compute square of mean */
- squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
-
- /* mean of the squares minus the square of the mean. */
- /* Compute standard deviation and store the result to the destination */
- arm_sqrt_q15(__SSAT((meanOfSquares - squareOfMean) >> 15U, 16U), pResult);
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ /* Compute sum and store result in a temporary variable, sum. */
+#if defined (ARM_MATH_DSP)
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
+ sum += ((in32 << 16U) >> 16U);
+ sum += (in32 >> 16U);
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
+ sum += ((in32 << 16U) >> 16U);
+ sum += (in32 >> 16U);
#else
- /* Run the below code for Cortex-M0 */
-
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sumOfSquares. */
in = *pSrc++;
sumOfSquares += (in * in);
-
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- /* Compute sum of all input values and then store the result in a temporary variable, sum. */
sum += in;
- /* Decrement the loop counter */
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
+
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
+
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
- meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U));
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
+ in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ sumOfSquares += (in * in);
+ /* Compute sum and store result in a temporary variable, sum. */
+ sum += in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
+ meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1U));
/* Compute square of mean */
- squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
+ squareOfMean = (q31_t) ((q63_t) sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
- /* mean of the squares minus the square of the mean. */
- /* Compute standard deviation and store the result to the destination */
+ /* mean of squares minus the square of mean. */
+ /* Compute standard deviation and store result in destination */
arm_sqrt_q15(__SSAT((meanOfSquares - squareOfMean) >> 15U, 16U), pResult);
-
-#endif /* #if defined (ARM_MATH_DSP) */
}
/**
- * @} end of STD group
+ @} end of STD group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_std_q31.c b/DSP/Source/StatisticsFunctions/arm_std_q31.c
index f02cbdd..cfb6cb8 100644
--- a/DSP/Source/StatisticsFunctions/arm_std_q31.c
+++ b/DSP/Source/StatisticsFunctions/arm_std_q31.c
@@ -1,15 +1,15 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_std_q31.c
- * Description: Standard deviation of an array of Q31 type.
+ * Description: Standard deviation of the elements of a Q31 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,141 +29,119 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup STD
- * @{
+ @addtogroup STD
+ @{
*/
/**
- * @brief Standard deviation of the elements of a Q31 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult standard deviation value returned here
- * @return none.
- * @details
- * Scaling and Overflow Behavior:
- *
- *\par
- * The function is implemented using an internal 64-bit accumulator.
- * The input is represented in 1.31 format, which is then downshifted by 8 bits
- * which yields 1.23, and intermediate multiplication yields a 2.46 format.
- * The accumulator maintains full precision of the intermediate multiplication results,
- * but provides only a 16 guard bits.
- * There is no saturation on intermediate additions.
- * If the accumulator overflows it wraps around and distorts the result.
- * In order to avoid overflows completely the input signal must be scaled down by
- * log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
- * After division, internal variables should be Q18.46
- * Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
- *
+ @brief Standard deviation of the elements of a Q31 vector.
+ @param[in] pSrc points to the input vector.
+ @param[in] blockSize number of samples in input vector.
+ @param[out] pResult standard deviation value returned here.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The input is represented in 1.31 format, which is then downshifted by 8 bits
+ which yields 1.23, and intermediate multiplication yields a 2.46 format.
+ The accumulator maintains full precision of the intermediate multiplication results,
+ but provides only a 16 guard bits.
+ There is no saturation on intermediate additions.
+ If the accumulator overflows it wraps around and distorts the result.
+ In order to avoid overflows completely the input signal must be scaled down by
+ log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
+ After division, internal variables should be Q18.46
+ Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
*/
void arm_std_q31(
- q31_t * pSrc,
- uint32_t blockSize,
- q31_t * pResult)
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
{
- q63_t sum = 0; /* Accumulator */
- q63_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
- q31_t in; /* input value */
- uint32_t blkCnt; /* loop counter */
- q63_t sumOfSquares = 0; /* Accumulator */
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Accumulator */
+ q63_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
+ q63_t sumOfSquares = 0; /* Sum of squares */
+ q31_t in; /* Temporary variable to store input value */
- if (blockSize == 1U)
+ if (blockSize <= 1U)
{
*pResult = 0;
return;
}
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
- in = *pSrc++ >> 8U;
- sum += in;
- sumOfSquares += ((q63_t) (in) * (in));
- in = *pSrc++ >> 8U;
- sum += in;
- sumOfSquares += ((q63_t) (in) * (in));
- in = *pSrc++ >> 8U;
- sum += in;
- sumOfSquares += ((q63_t) (in) * (in));
- in = *pSrc++ >> 8U;
- sum += in;
- sumOfSquares += ((q63_t) (in) * (in));
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
- /* Decrement the loop counter */
+ in = *pSrc++ >> 8U;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ sumOfSquares += ((q63_t) (in) * (in));
+ /* Compute sum and store result in a temporary variable, sum. */
+ sum += in;
+
+ in = *pSrc++ >> 8U;
+ sumOfSquares += ((q63_t) (in) * (in));
+ sum += in;
+
+ in = *pSrc++ >> 8U;
+ sumOfSquares += ((q63_t) (in) * (in));
+ sum += in;
+
+ in = *pSrc++ >> 8U;
+ sumOfSquares += ((q63_t) (in) * (in));
+ sum += in;
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
- while (blkCnt > 0U)
- {
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
- in = *pSrc++ >> 8U;
- sum += in;
- sumOfSquares += ((q63_t) (in) * (in));
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
- meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U);
-
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sumOfSquares. */
- in = *pSrc++ >> 8U;
- sumOfSquares += ((q63_t) (in) * (in));
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- /* Compute sum of all input values and then store the result in a temporary variable, sum. */
+ in = *pSrc++ >> 8U;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ sumOfSquares += ((q63_t) (in) * (in));
+ /* Compute sum and store result in a temporary variable, sum. */
sum += in;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
- meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U);
-
-#endif /* #if defined (ARM_MATH_DSP) */
+ /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
+ meanOfSquares = (sumOfSquares / (q63_t)(blockSize - 1U));
/* Compute square of mean */
- squareOfMean = sum * sum / (q63_t)(blockSize * (blockSize - 1U));
+ squareOfMean = ( sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
- /* Compute standard deviation and then store the result to the destination */
+ /* Compute standard deviation and store result in destination */
arm_sqrt_q31((meanOfSquares - squareOfMean) >> 15U, pResult);
}
/**
- * @} end of STD group
+ @} end of STD group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_var_f32.c b/DSP/Source/StatisticsFunctions/arm_var_f32.c
index c0f731d..3c325b1 100644
--- a/DSP/Source/StatisticsFunctions/arm_var_f32.c
+++ b/DSP/Source/StatisticsFunctions/arm_var_f32.c
@@ -3,13 +3,13 @@
* Title: arm_var_f32.c
* Description: Variance of the elements of a floating-point vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,153 +29,206 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @defgroup variance Variance
- *
- * Calculates the variance of the elements in the input vector.
- * The underlying algorithm used is the direct method sometimes referred to as the two-pass method:
- *
- *
- *
- * There are separate functions for floating point, Q31, and Q15 data types.
+ @defgroup variance Variance
+
+ Calculates the variance of the elements in the input vector.
+ The underlying algorithm used is the direct method sometimes referred to as the two-pass method:
+
+
+
+ There are separate functions for floating point, Q31, and Q15 data types.
*/
/**
- * @addtogroup variance
- * @{
+ @addtogroup variance
+ @{
*/
-
/**
- * @brief Variance of the elements of a floating-point vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult variance value returned here
- * @return none.
+ @brief Variance of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult variance value returned here
+ @return none
*/
-
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
void arm_var_f32(
- float32_t * pSrc,
+ const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
- float32_t fMean, fValue;
- uint32_t blkCnt; /* loop counter */
- float32_t * pInput = pSrc;
- float32_t sum = 0.0f;
- float32_t fSum = 0.0f;
- #if defined(ARM_MATH_DSP)
- float32_t in1, in2, in3, in4;
- #endif
+ float32_t mean;
- if (blockSize <= 1U)
- {
- *pResult = 0;
- return;
- }
+ float32_t sum = 0.0f; /* accumulator */
+ float32_t in; /* Temporary variable to store input value */
+ uint32_t blkCnt; /* loop counter */
- #if defined(ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M7 */
+ float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
+ float32x2_t sumV2;
+ float32x4_t inV;
+ float32x4_t avg;
- /*loop Unrolling */
- blkCnt = blockSize >> 2U;
+ arm_mean_f32(pSrc,blockSize,&mean);
+ avg = vdupq_n_f32(mean);
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while (blkCnt > 0U)
- {
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- in1 = *pInput++;
- in2 = *pInput++;
- in3 = *pInput++;
- in4 = *pInput++;
+ blkCnt = blockSize >> 2U;
- sum += in1;
- sum += in2;
- sum += in3;
- sum += in4;
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* Compute Power and then store the result in a temporary variable, sum. */
+ inV = vld1q_f32(pSrc);
+ inV = vsubq_f32(inV, avg);
+ sumV = vmlaq_f32(sumV, inV, inV);
+ pSrc += 4;
- /* Decrement the loop counter */
- blkCnt--;
- }
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
+ sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
+ sum = sumV2[0] + sumV2[1];
- #else
- /* Run the below code for Cortex-M0 or Cortex-M3 */
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4U;
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* compute power and then store the result in a temporary variable, sum. */
+ in = *pSrc++;
+ in = in - mean;
+ sum += in * in;
- #endif
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
- while (blkCnt > 0U)
- {
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- sum += *pInput++;
+ /* Variance */
+ *pResult = sum / (float32_t)(blockSize - 1.0f);
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
- fMean = sum / (float32_t) blockSize;
-
- pInput = pSrc;
-
- #if defined(ARM_MATH_DSP)
-
- /*loop Unrolling */
- blkCnt = blockSize >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while (blkCnt > 0U)
- {
- fValue = *pInput++ - fMean;
- fSum += fValue * fValue;
- fValue = *pInput++ - fMean;
- fSum += fValue * fValue;
- fValue = *pInput++ - fMean;
- fSum += fValue * fValue;
- fValue = *pInput++ - fMean;
- fSum += fValue * fValue;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- blkCnt = blockSize % 0x4U;
- #else
- /* Run the below code for Cortex-M0 or Cortex-M3 */
-
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
- #endif
-
- while (blkCnt > 0U)
- {
- fValue = *pInput++ - fMean;
- fSum += fValue * fValue;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Variance */
- *pResult = fSum / (float32_t)(blockSize - 1.0f);
}
+#else
+void arm_var_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ float32_t sum = 0.0f; /* Temporary result storage */
+ float32_t fSum = 0.0f;
+ float32_t fMean, fValue;
+ const float32_t * pInput = pSrc;
+
+ if (blockSize <= 1U)
+ {
+ *pResult = 0;
+ return;
+ }
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+
+ sum += *pInput++;
+ sum += *pInput++;
+ sum += *pInput++;
+ sum += *pInput++;
+
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+
+ sum += *pInput++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+ fMean = sum / (float32_t) blockSize;
+
+ pInput = pSrc;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Variance */
+ *pResult = fSum / (float32_t)(blockSize - 1.0f);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+
/**
- * @} end of variance group
+ @} end of variance group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_var_q15.c b/DSP/Source/StatisticsFunctions/arm_var_q15.c
index 5ba61f7..259e76b 100644
--- a/DSP/Source/StatisticsFunctions/arm_var_q15.c
+++ b/DSP/Source/StatisticsFunctions/arm_var_q15.c
@@ -3,13 +3,13 @@
* Title: arm_var_q15.c
* Description: Variance of an array of Q15 type
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,144 +29,136 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup variance
- * @{
+ @addtogroup variance
+ @{
*/
/**
- * @brief Variance of the elements of a Q15 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult variance value returned here
- * @return none.
- * @details
- * Scaling and Overflow Behavior:
- *
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * The input is represented in 1.15 format.
- * Intermediate multiplication yields a 2.30 format, and this
- * result is added without saturation to a 64-bit accumulator in 34.30 format.
- * With 33 guard bits in the accumulator, there is no risk of overflow, and the
- * full precision of the intermediate multiplication is preserved.
- * Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
- * 15 bits, and then saturated to yield a result in 1.15 format.
+ @brief Variance of the elements of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult variance value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.15 format.
+ Intermediate multiplication yields a 2.30 format, and this
+ result is added without saturation to a 64-bit accumulator in 34.30 format.
+ With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
+ 15 bits, and then saturated to yield a result in 1.15 format.
*/
void arm_var_q15(
- q15_t * pSrc,
- uint32_t blockSize,
- q15_t * pResult)
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
{
- q31_t sum = 0; /* Accumulator */
- q31_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
- uint32_t blkCnt; /* loop counter */
- q63_t sumOfSquares = 0; /* Accumulator */
-#if defined (ARM_MATH_DSP)
- q31_t in; /* input value */
- q15_t in1; /* input value */
-#else
- q15_t in; /* input value */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t sum = 0; /* Accumulator */
+ q31_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
+ q63_t sumOfSquares = 0; /* Sum of squares */
+ q15_t in; /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in32; /* Temporary variable to store input value */
#endif
- if (blockSize == 1U)
+ if (blockSize <= 1U)
{
*pResult = 0;
return;
}
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
- in = *__SIMD32(pSrc)++;
- sum += ((in << 16U) >> 16U);
- sum += (in >> 16U);
- sumOfSquares = __SMLALD(in, in, sumOfSquares);
- in = *__SIMD32(pSrc)++;
- sum += ((in << 16U) >> 16U);
- sum += (in >> 16U);
- sumOfSquares = __SMLALD(in, in, sumOfSquares);
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
- in1 = *pSrc++;
- sumOfSquares = __SMLALD(in1, in1, sumOfSquares);
- sum += in1;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
- meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U));
-
- /* Compute square of mean */
- squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
-
- /* mean of the squares minus the square of the mean. */
- *pResult = (meanOfSquares - squareOfMean) >> 15U;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ /* Compute sum and store result in a temporary variable, sum. */
+#if defined (ARM_MATH_DSP)
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
+ sum += ((in32 << 16U) >> 16U);
+ sum += (in32 >> 16U);
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
+ sum += ((in32 << 16U) >> 16U);
+ sum += (in32 >> 16U);
#else
- /* Run the below code for Cortex-M0 */
-
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sumOfSquares. */
in = *pSrc++;
sumOfSquares += (in * in);
-
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- /* Compute sum of all input values and then store the result in a temporary variable, sum. */
sum += in;
- /* Decrement the loop counter */
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
+
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
+
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
- meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U));
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
+ in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+#if defined (ARM_MATH_DSP)
+ sumOfSquares = __SMLALD(in, in, sumOfSquares);
+#else
+ sumOfSquares += (in * in);
+#endif /* #if defined (ARM_MATH_DSP) */
+ /* Compute sum and store result in a temporary variable, sum. */
+ sum += in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
+ meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1U));
/* Compute square of mean */
- squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
+ squareOfMean = (q31_t) ((q63_t) sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
- /* mean of the squares minus the square of the mean. */
- *pResult = (meanOfSquares - squareOfMean) >> 15;
-
-#endif /* #if defined (ARM_MATH_DSP) */
+ /* mean of squares minus the square of mean. */
+ *pResult = (meanOfSquares - squareOfMean) >> 15U;
}
/**
- * @} end of variance group
+ @} end of variance group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_var_q31.c b/DSP/Source/StatisticsFunctions/arm_var_q31.c
index 526c6cd..558332f 100644
--- a/DSP/Source/StatisticsFunctions/arm_var_q31.c
+++ b/DSP/Source/StatisticsFunctions/arm_var_q31.c
@@ -3,13 +3,13 @@
* Title: arm_var_q31.c
* Description: Variance of an array of Q31 type
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,141 +29,119 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup variance
- * @{
+ @addtogroup variance
+ @{
*/
/**
- * @brief Variance of the elements of a Q31 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult variance value returned here
- * @return none.
- * @details
- * Scaling and Overflow Behavior:
- *
- *\par
- * The function is implemented using an internal 64-bit accumulator.
- * The input is represented in 1.31 format, which is then downshifted by 8 bits
- * which yields 1.23, and intermediate multiplication yields a 2.46 format.
- * The accumulator maintains full precision of the intermediate multiplication results,
- * but provides only a 16 guard bits.
- * There is no saturation on intermediate additions.
- * If the accumulator overflows it wraps around and distorts the result.
- * In order to avoid overflows completely the input signal must be scaled down by
- * log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
- * After division, internal variables should be Q18.46
- * Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
- *
+ @brief Variance of the elements of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult variance value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The input is represented in 1.31 format, which is then downshifted by 8 bits
+ which yields 1.23, and intermediate multiplication yields a 2.46 format.
+ The accumulator maintains full precision of the intermediate multiplication results,
+ but provides only a 16 guard bits.
+ There is no saturation on intermediate additions.
+ If the accumulator overflows it wraps around and distorts the result.
+ In order to avoid overflows completely the input signal must be scaled down by
+ log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
+ After division, internal variables should be Q18.46
+ Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
*/
void arm_var_q31(
- q31_t * pSrc,
- uint32_t blockSize,
- q31_t * pResult)
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
{
- q63_t sum = 0; /* Accumulator */
- q63_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
- q31_t in; /* input value */
- uint32_t blkCnt; /* loop counter */
- q63_t sumOfSquares = 0; /* Accumulator */
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Temporary result storage */
+ q63_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
+ q63_t sumOfSquares = 0; /* Sum of squares */
+ q31_t in; /* Temporary variable to store input value */
- if (blockSize == 1U)
+ if (blockSize <= 1U)
{
*pResult = 0;
return;
}
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
- in = *pSrc++ >> 8U;
- sum += in;
- sumOfSquares += ((q63_t) (in) * (in));
- in = *pSrc++ >> 8U;
- sum += in;
- sumOfSquares += ((q63_t) (in) * (in));
- in = *pSrc++ >> 8U;
- sum += in;
- sumOfSquares += ((q63_t) (in) * (in));
- in = *pSrc++ >> 8U;
- sum += in;
- sumOfSquares += ((q63_t) (in) * (in));
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
- /* Decrement the loop counter */
+ in = *pSrc++ >> 8U;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ sumOfSquares += ((q63_t) (in) * (in));
+ /* Compute sum and store result in a temporary variable, sum. */
+ sum += in;
+
+ in = *pSrc++ >> 8U;
+ sumOfSquares += ((q63_t) (in) * (in));
+ sum += in;
+
+ in = *pSrc++ >> 8U;
+ sumOfSquares += ((q63_t) (in) * (in));
+ sum += in;
+
+ in = *pSrc++ >> 8U;
+ sumOfSquares += ((q63_t) (in) * (in));
+ sum += in;
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
- while (blkCnt > 0U)
- {
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
- in = *pSrc++ >> 8U;
- sum += in;
- sumOfSquares += ((q63_t) (in) * (in));
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
- meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U);
-
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sumOfSquares. */
- in = *pSrc++ >> 8U;
- sumOfSquares += ((q63_t) (in) * (in));
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- /* Compute sum of all input values and then store the result in a temporary variable, sum. */
+ in = *pSrc++ >> 8U;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ sumOfSquares += ((q63_t) (in) * (in));
+ /* Compute sum and store result in a temporary variable, sum. */
sum += in;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
- meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U);
-
-#endif /* #if defined (ARM_MATH_DSP) */
+ /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
+ meanOfSquares = (sumOfSquares / (q63_t)(blockSize - 1U));
/* Compute square of mean */
- squareOfMean = sum * sum / (q63_t)(blockSize * (blockSize - 1U));
+ squareOfMean = ( sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
- /* Compute standard deviation and then store the result to the destination */
+ /* Compute variance and store result in destination */
*pResult = (meanOfSquares - squareOfMean) >> 15U;
}
/**
- * @} end of variance group
+ @} end of variance group
*/
diff --git a/DSP/Source/SupportFunctions/CMakeLists.txt b/DSP/Source/SupportFunctions/CMakeLists.txt
new file mode 100644
index 0000000..33c4f87
--- /dev/null
+++ b/DSP/Source/SupportFunctions/CMakeLists.txt
@@ -0,0 +1,16 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPSupport)
+
+
+file(GLOB SRC "./*_*.c")
+
+add_library(CMSISDSPSupport STATIC ${SRC})
+
+configdsp(CMSISDSPSupport ..)
+
+### Includes
+target_include_directories(CMSISDSPSupport PUBLIC "${DSP}/../../Include")
+
+
+
diff --git a/DSP/Source/SupportFunctions/SupportFunctions.c b/DSP/Source/SupportFunctions/SupportFunctions.c
new file mode 100644
index 0000000..4deb19b
--- /dev/null
+++ b/DSP/Source/SupportFunctions/SupportFunctions.c
@@ -0,0 +1,48 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: SupportFunctions.c
+ * Description: Combination of all support function source files.
+ *
+ * $Date: 18. March 2019
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_copy_f32.c"
+#include "arm_copy_q15.c"
+#include "arm_copy_q31.c"
+#include "arm_copy_q7.c"
+#include "arm_fill_f32.c"
+#include "arm_fill_q15.c"
+#include "arm_fill_q31.c"
+#include "arm_fill_q7.c"
+#include "arm_float_to_q15.c"
+#include "arm_float_to_q31.c"
+#include "arm_float_to_q7.c"
+#include "arm_q15_to_float.c"
+#include "arm_q15_to_q31.c"
+#include "arm_q15_to_q7.c"
+#include "arm_q31_to_float.c"
+#include "arm_q31_to_q15.c"
+#include "arm_q31_to_q7.c"
+#include "arm_q7_to_float.c"
+#include "arm_q7_to_q15.c"
+#include "arm_q7_to_q31.c"
diff --git a/DSP/Source/SupportFunctions/arm_copy_f32.c b/DSP/Source/SupportFunctions/arm_copy_f32.c
index 1e2b5cf..707adc4 100644
--- a/DSP/Source/SupportFunctions/arm_copy_f32.c
+++ b/DSP/Source/SupportFunctions/arm_copy_f32.c
@@ -3,13 +3,13 @@
* Title: arm_copy_f32.c
* Description: Copies the elements of a floating-point vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,66 +29,56 @@
#include "arm_math.h"
/**
- * @ingroup groupSupport
+ @ingroup groupSupport
*/
/**
- * @defgroup copy Vector Copy
- *
- * Copies sample by sample from source vector to destination vector.
- *
- *
- * pDst[n] = pSrc[n]; 0 <= n < blockSize.
- *
- *
- * There are separate functions for floating point, Q31, Q15, and Q7 data types.
+ @defgroup copy Vector Copy
+
+ Copies sample by sample from source vector to destination vector.
+
+
+ pDst[n] = pSrc[n]; 0 <= n < blockSize.
+
+
+ There are separate functions for floating point, Q31, Q15, and Q7 data types.
*/
/**
- * @addtogroup copy
- * @{
+ @addtogroup copy
+ @{
*/
/**
- * @brief Copies the elements of a floating-point vector.
- * @param[in] *pSrc points to input vector
- * @param[out] *pDst points to output vector
- * @param[in] blockSize length of the input vector
- * @return none.
- *
+ @brief Copies the elements of a floating-point vector.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
*/
-
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
void arm_copy_f32(
- float32_t * pSrc,
+ const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
-#if defined (ARM_MATH_DSP)
+ float32x4_t inV;
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- float32_t in1, in2, in3, in4;
-
- /*loop Unrolling */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A */
/* Copy and then store the results in the destination buffer */
- in1 = *pSrc++;
- in2 = *pSrc++;
- in3 = *pSrc++;
- in4 = *pSrc++;
-
- *pDst++ = in1;
- *pDst++ = in2;
- *pDst++ = in3;
- *pDst++ = in4;
+ inV = vld1q_f32(pSrc);
+ vst1q_f32(pDst, inV);
+ pSrc += 4;
+ pDst += 4;
/* Decrement the loop counter */
blkCnt--;
@@ -96,16 +86,7 @@ void arm_copy_f32(
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
-
-#else
-
- /* Run the below code for Cortex-M0 */
-
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
-
-#endif /* #if defined (ARM_MATH_DSP) */
+ blkCnt = blockSize & 3;
while (blkCnt > 0U)
{
@@ -117,7 +98,55 @@ void arm_copy_f32(
blkCnt--;
}
}
+#else
+void arm_copy_f32(
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of BasicCopy group
+ @} end of BasicCopy group
*/
diff --git a/DSP/Source/SupportFunctions/arm_copy_q15.c b/DSP/Source/SupportFunctions/arm_copy_q15.c
index 0d2fffb..d8da113 100644
--- a/DSP/Source/SupportFunctions/arm_copy_q15.c
+++ b/DSP/Source/SupportFunctions/arm_copy_q15.c
@@ -3,13 +3,13 @@
* Title: arm_copy_q15.c
* Description: Copies the elements of a Q15 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,74 +29,68 @@
#include "arm_math.h"
/**
- * @ingroup groupSupport
+ @ingroup groupSupport
*/
/**
- * @addtogroup copy
- * @{
+ @addtogroup copy
+ @{
*/
+
/**
- * @brief Copies the elements of a Q15 vector.
- * @param[in] *pSrc points to input vector
- * @param[out] *pDst points to output vector
- * @param[in] blockSize length of the input vector
- * @return none.
- *
+ @brief Copies the elements of a Q15 vector.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
*/
void arm_copy_q15(
- q15_t * pSrc,
- q15_t * pDst,
- uint32_t blockSize)
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A */
- /* Read two inputs */
- *__SIMD32(pDst)++ = *__SIMD32(pSrc)++;
- *__SIMD32(pDst)++ = *__SIMD32(pSrc)++;
- /* Decrement the loop counter */
+ /* read 2 times 2 samples at a time */
+ write_q15x2_ia (&pDst, read_q15x2_ia ((q15_t **) &pSrc));
+ write_q15x2_ia (&pDst, read_q15x2_ia ((q15_t **) &pSrc));
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
-
#else
- /* Run the below code for Cortex-M0 */
-
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A */
- /* Copy and then store the value in the destination buffer */
+
+ /* Copy and store result in destination buffer */
*pDst++ = *pSrc++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
/**
- * @} end of BasicCopy group
+ @} end of BasicCopy group
*/
diff --git a/DSP/Source/SupportFunctions/arm_copy_q31.c b/DSP/Source/SupportFunctions/arm_copy_q31.c
index 5bf8934..e342a32 100644
--- a/DSP/Source/SupportFunctions/arm_copy_q31.c
+++ b/DSP/Source/SupportFunctions/arm_copy_q31.c
@@ -3,13 +3,13 @@
* Title: arm_copy_q31.c
* Description: Copies the elements of a Q31 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,83 +29,70 @@
#include "arm_math.h"
/**
- * @ingroup groupSupport
+ @ingroup groupSupport
*/
/**
- * @addtogroup copy
- * @{
+ @addtogroup copy
+ @{
*/
/**
- * @brief Copies the elements of a Q31 vector.
- * @param[in] *pSrc points to input vector
- * @param[out] *pDst points to output vector
- * @param[in] blockSize length of the input vector
- * @return none.
- *
+ @brief Copies the elements of a Q31 vector.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
*/
void arm_copy_q31(
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- q31_t in1, in2, in3, in4;
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A */
- /* Copy and then store the values in the destination buffer */
- in1 = *pSrc++;
- in2 = *pSrc++;
- in3 = *pSrc++;
- in4 = *pSrc++;
- *pDst++ = in1;
- *pDst++ = in2;
- *pDst++ = in3;
- *pDst++ = in4;
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A */
- /* Copy and then store the value in the destination buffer */
+
+ /* Copy and store result in destination buffer */
*pDst++ = *pSrc++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
/**
- * @} end of BasicCopy group
+ @} end of BasicCopy group
*/
diff --git a/DSP/Source/SupportFunctions/arm_copy_q7.c b/DSP/Source/SupportFunctions/arm_copy_q7.c
index 5c737cd..77da8ca 100644
--- a/DSP/Source/SupportFunctions/arm_copy_q7.c
+++ b/DSP/Source/SupportFunctions/arm_copy_q7.c
@@ -3,13 +3,13 @@
* Title: arm_copy_q7.c
* Description: Copies the elements of a Q7 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,75 +29,67 @@
#include "arm_math.h"
/**
- * @ingroup groupSupport
+ @ingroup groupSupport
*/
/**
- * @addtogroup copy
- * @{
+ @addtogroup copy
+ @{
*/
/**
- * @brief Copies the elements of a Q7 vector.
- * @param[in] *pSrc points to input vector
- * @param[out] *pDst points to output vector
- * @param[in] blockSize length of the input vector
- * @return none.
- *
+ @brief Copies the elements of a Q7 vector.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
*/
void arm_copy_q7(
- q7_t * pSrc,
- q7_t * pDst,
- uint32_t blockSize)
+ const q7_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A */
- /* Copy and then store the results in the destination buffer */
- /* 4 samples are copied and stored at a time using SIMD */
- *__SIMD32(pDst)++ = *__SIMD32(pSrc)++;
- /* Decrement the loop counter */
+ /* read 4 samples at a time */
+ write_q7x4_ia (&pDst, read_q7x4_ia ((q7_t **) &pSrc));
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
-
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A */
- /* Copy and then store the results in the destination buffer */
+
+ /* Copy and store result in destination buffer */
*pDst++ = *pSrc++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
/**
- * @} end of BasicCopy group
+ @} end of BasicCopy group
*/
diff --git a/DSP/Source/SupportFunctions/arm_fill_f32.c b/DSP/Source/SupportFunctions/arm_fill_f32.c
index be749c8..29f6286 100644
--- a/DSP/Source/SupportFunctions/arm_fill_f32.c
+++ b/DSP/Source/SupportFunctions/arm_fill_f32.c
@@ -3,13 +3,13 @@
* Title: arm_fill_f32.c
* Description: Fills a constant value into a floating-point vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,36 +29,35 @@
#include "arm_math.h"
/**
- * @ingroup groupSupport
+ @ingroup groupSupport
*/
/**
- * @defgroup Fill Vector Fill
- *
- * Fills the destination vector with a constant value.
- *
- *
- * pDst[n] = value; 0 <= n < blockSize.
- *
- *
- * There are separate functions for floating point, Q31, Q15, and Q7 data types.
+ @defgroup Fill Vector Fill
+
+ Fills the destination vector with a constant value.
+
+
+ pDst[n] = value; 0 <= n < blockSize.
+
+
+ There are separate functions for floating point, Q31, Q15, and Q7 data types.
*/
/**
- * @addtogroup Fill
- * @{
+ @addtogroup Fill
+ @{
*/
/**
- * @brief Fills a constant value into a floating-point vector.
- * @param[in] value input value to be filled
- * @param[out] *pDst points to output vector
- * @param[in] blockSize length of the output vector
- * @return none.
- *
+ @brief Fills a constant value into a floating-point vector.
+ @param[in] value input value to be filled
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
*/
-
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
void arm_fill_f32(
float32_t value,
float32_t * pDst,
@@ -66,27 +65,19 @@ void arm_fill_f32(
{
uint32_t blkCnt; /* loop counter */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- float32_t in1 = value;
- float32_t in2 = value;
- float32_t in3 = value;
- float32_t in4 = value;
+ float32x4_t inV = vdupq_n_f32(value);
- /*loop Unrolling */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = value */
/* Fill the value in the destination buffer */
- *pDst++ = in1;
- *pDst++ = in2;
- *pDst++ = in3;
- *pDst++ = in4;
+ vst1q_f32(pDst, inV);
+ pDst += 4;
/* Decrement the loop counter */
blkCnt--;
@@ -94,17 +85,7 @@ void arm_fill_f32(
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
-
-#else
-
- /* Run the below code for Cortex-M0 */
-
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
+ blkCnt = blockSize & 3;
while (blkCnt > 0U)
{
@@ -116,7 +97,55 @@ void arm_fill_f32(
blkCnt--;
}
}
+#else
+void arm_fill_f32(
+ float32_t value,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+ *pDst++ = value;
+ *pDst++ = value;
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of Fill group
+ @} end of Fill group
*/
diff --git a/DSP/Source/SupportFunctions/arm_fill_q15.c b/DSP/Source/SupportFunctions/arm_fill_q15.c
index 27eb42c..d8c0f8d 100644
--- a/DSP/Source/SupportFunctions/arm_fill_q15.c
+++ b/DSP/Source/SupportFunctions/arm_fill_q15.c
@@ -3,13 +3,13 @@
* Title: arm_fill_q15.c
* Description: Fills a constant value into a Q15 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,21 +29,20 @@
#include "arm_math.h"
/**
- * @ingroup groupSupport
+ @ingroup groupSupport
*/
/**
- * @addtogroup Fill
- * @{
+ @addtogroup Fill
+ @{
*/
/**
- * @brief Fills a constant value into a Q15 vector.
- * @param[in] value input value to be filled
- * @param[out] *pDst points to output vector
- * @param[in] blockSize length of the output vector
- * @return none.
- *
+ @brief Fills a constant value into a Q15 vector.
+ @param[in] value input value to be filled
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
*/
void arm_fill_q15(
@@ -51,58 +50,51 @@ void arm_fill_q15(
q15_t * pDst,
uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ uint32_t blkCnt; /* Loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
q31_t packedValue; /* value packed to 32 bits */
-
- /*loop Unrolling */
- blkCnt = blockSize >> 2U;
-
/* Packing two 16 bit values to 32 bit value in order to use SIMD */
packedValue = __PKHBT(value, value, 16U);
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
while (blkCnt > 0U)
{
/* C = value */
- /* Fill the value in the destination buffer */
- *__SIMD32(pDst)++ = packedValue;
- *__SIMD32(pDst)++ = packedValue;
- /* Decrement the loop counter */
+ /* fill 2 times 2 samples at a time */
+ write_q15x2_ia (&pDst, packedValue);
+ write_q15x2_ia (&pDst, packedValue);
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = value */
- /* Fill the value in the destination buffer */
+
+ /* Fill value in destination buffer */
*pDst++ = value;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
/**
- * @} end of Fill group
+ @} end of Fill group
*/
diff --git a/DSP/Source/SupportFunctions/arm_fill_q31.c b/DSP/Source/SupportFunctions/arm_fill_q31.c
index 397a7b5..e174889 100644
--- a/DSP/Source/SupportFunctions/arm_fill_q31.c
+++ b/DSP/Source/SupportFunctions/arm_fill_q31.c
@@ -3,13 +3,13 @@
* Title: arm_fill_q31.c
* Description: Fills a constant value into a Q31 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,21 +29,20 @@
#include "arm_math.h"
/**
- * @ingroup groupSupport
+ @ingroup groupSupport
*/
/**
- * @addtogroup Fill
- * @{
+ @addtogroup Fill
+ @{
*/
/**
- * @brief Fills a constant value into a Q31 vector.
- * @param[in] value input value to be filled
- * @param[out] *pDst points to output vector
- * @param[in] blockSize length of the output vector
- * @return none.
- *
+ @brief Fills a constant value into a Q31 vector.
+ @param[in] value input value to be filled
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
*/
void arm_fill_q31(
@@ -51,59 +50,49 @@ void arm_fill_q31(
q31_t * pDst,
uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- q31_t in1 = value;
- q31_t in2 = value;
- q31_t in3 = value;
- q31_t in4 = value;
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = value */
- /* Fill the value in the destination buffer */
- *pDst++ = in1;
- *pDst++ = in2;
- *pDst++ = in3;
- *pDst++ = in4;
- /* Decrement the loop counter */
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+ *pDst++ = value;
+ *pDst++ = value;
+ *pDst++ = value;
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = value */
- /* Fill the value in the destination buffer */
+
+ /* Fill value in destination buffer */
*pDst++ = value;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
/**
- * @} end of Fill group
+ @} end of Fill group
*/
diff --git a/DSP/Source/SupportFunctions/arm_fill_q7.c b/DSP/Source/SupportFunctions/arm_fill_q7.c
index dffdf97..bca3267 100644
--- a/DSP/Source/SupportFunctions/arm_fill_q7.c
+++ b/DSP/Source/SupportFunctions/arm_fill_q7.c
@@ -3,13 +3,13 @@
* Title: arm_fill_q7.c
* Description: Fills a constant value into a Q7 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,21 +29,20 @@
#include "arm_math.h"
/**
- * @ingroup groupSupport
+ @ingroup groupSupport
*/
/**
- * @addtogroup Fill
- * @{
+ @addtogroup Fill
+ @{
*/
/**
- * @brief Fills a constant value into a Q7 vector.
- * @param[in] value input value to be filled
- * @param[out] *pDst points to output vector
- * @param[in] blockSize length of the output vector
- * @return none.
- *
+ @brief Fills a constant value into a Q7 vector.
+ @param[in] value input value to be filled
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
*/
void arm_fill_q7(
@@ -51,56 +50,50 @@ void arm_fill_q7(
q7_t * pDst,
uint32_t blockSize)
{
- uint32_t blkCnt; /* loop counter */
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ uint32_t blkCnt; /* Loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
q31_t packedValue; /* value packed to 32 bits */
- /*loop Unrolling */
- blkCnt = blockSize >> 2U;
-
/* Packing four 8 bit values to 32 bit value in order to use SIMD */
packedValue = __PACKq7(value, value, value, value);
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
while (blkCnt > 0U)
{
/* C = value */
- /* Fill the value in the destination buffer */
- *__SIMD32(pDst)++ = packedValue;
- /* Decrement the loop counter */
+ /* fill 4 samples at a time */
+ write_q7x4_ia (&pDst, packedValue);
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = value */
- /* Fill the value in the destination buffer */
+
+ /* Fill value in destination buffer */
*pDst++ = value;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
/**
- * @} end of Fill group
+ @} end of Fill group
*/
diff --git a/DSP/Source/SupportFunctions/arm_float_to_q15.c b/DSP/Source/SupportFunctions/arm_float_to_q15.c
index 0aa20f1..68c1ad0 100644
--- a/DSP/Source/SupportFunctions/arm_float_to_q15.c
+++ b/DSP/Source/SupportFunctions/arm_float_to_q15.c
@@ -3,13 +3,13 @@
* Title: arm_float_to_q15.c
* Description: Converts the elements of the floating-point vector to Q15 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,95 +29,92 @@
#include "arm_math.h"
/**
- * @ingroup groupSupport
+ @ingroup groupSupport
*/
/**
- * @addtogroup float_to_x
- * @{
+ @addtogroup float_to_x
+ @{
*/
/**
- * @brief Converts the elements of the floating-point vector to Q15 vector.
- * @param[in] *pSrc points to the floating-point input vector
- * @param[out] *pDst points to the Q15 output vector
- * @param[in] blockSize length of the input vector
- * @return none.
- *
- * \par Description:
- * \par
- * The equation used for the conversion process is:
- *
- * \par Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
- * \note
- * In order to apply rounding, the library should be rebuilt with the ROUNDING macro
- * defined in the preprocessor section of project options.
- *
+ @brief Converts the elements of the floating-point vector to Q15 vector.
+ @param[in] pSrc points to the floating-point input vector
+ @param[out] pDst points to the Q15 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
- * Scaling and Overflow Behavior:
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
- *
- * \note In order to apply rounding, the library should be rebuilt with the ROUNDING macro
- * defined in the preprocessor section of project options.
+ @brief Converts the elements of the floating-point vector to Q31 vector.
+ @param[in] pSrc points to the floating-point input vector
+ @param[out] pDst points to the Q31 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
- *
+ @brief Converts the elements of the Q15 vector to floating-point vector.
+ @param[in] pSrc points to the Q15 input vector
+ @param[out] pDst points to the floating-point output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
- *
- */
+ @brief Converts the elements of the Q15 vector to Q31 vector.
+ @param[in] pSrc points to the Q15 input vector
+ @param[out] pDst points to the Q31 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ @par Details
+ The equation used for the conversion process is:
+
- *
- */
+ @brief Converts the elements of the Q15 vector to Q7 vector.
+ @param[in] pSrc points to the Q15 input vector
+ @param[out] pDst points to the Q7 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ @par Details
+ The equation used for the conversion process is:
+
- *
+ @brief Converts the elements of the Q31 vector to floating-point vector.
+ @param[in] pSrc points to the Q31 input vector
+ @param[out] pDst points to the floating-point output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
- *
- */
+ @brief Converts the elements of the Q31 vector to Q15 vector.
+ @param[in] pSrc points to the Q31 input vector
+ @param[out] pDst points to the Q15 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ @par Details
+ The equation used for the conversion process is:
+
- *
- */
+ @brief Converts the elements of the Q31 vector to Q7 vector.
+ @param[in] pSrc points to the Q31 input vector
+ @param[out] pDst points to the Q7 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ @par Details
+ The equation used for the conversion process is:
+
- *
+ @brief Converts the elements of the Q7 vector to floating-point vector.
+ @param[in] pSrc points to the Q7 input vector
+ @param[out] pDst points to the floating-point output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
- *
- */
+ @brief Converts the elements of the Q7 vector to Q15 vector.
+ @param[in] pSrc points to the Q7 input vector
+ @param[out] pDst points to the Q15 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ @par Details
+ The equation used for the conversion process is:
+
- *
- */
+ @brief Converts the elements of the Q7 vector to Q31 vector.
+ @param[in] pSrc points to the Q7 input vector
+ @param[out] pDst points to the Q31 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ @par Details
+ The equation used for the conversion process is:
+
+ */
void arm_q7_to_q31(
- q7_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
+ const q7_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
{
- q7_t *pIn = pSrc; /* Src pointer */
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ const q7_t *pIn = pSrc; /* Source pointer */
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- q31_t in;
+ q31_t in;
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (q31_t) A << 24 */
- /* convert from q7 to q31 and then store the results in the destination buffer */
- in = *__SIMD32(pIn)++;
+
+ /* Convert from q7 to q31 and store result in destination buffer */
+ in = read_q7x4_ia ((q7_t **) &pIn);
#ifndef ARM_MATH_BIG_ENDIAN
@@ -94,37 +87,35 @@ void arm_q7_to_q31(
*pDst++ = (__ROR(in, 16)) & 0xFF000000;
*pDst++ = (__ROR(in, 8)) & 0xFF000000;
-#endif // #ifndef ARM_MATH_BIG_ENDIAN
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
-
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (q31_t) A << 24 */
- /* convert from q7 to q31 and then store the results in the destination buffer */
+
+ /* Convert from q7 to q31 and store result in destination buffer */
*pDst++ = (q31_t) * pIn++ << 24;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
}
/**
- * @} end of q7_to_x group
+ @} end of q7_to_x group
*/
diff --git a/DSP/Source/TransformFunctions/CMakeLists.txt b/DSP/Source/TransformFunctions/CMakeLists.txt
new file mode 100644
index 0000000..4e5b4f2
--- /dev/null
+++ b/DSP/Source/TransformFunctions/CMakeLists.txt
@@ -0,0 +1,116 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPTransform)
+
+
+
+add_library(CMSISDSPTransform STATIC)
+
+include(fft)
+fft(CMSISDSPTransform)
+
+if (CONFIGTABLE AND ALLFFT)
+target_compile_definitions(CMSISDSPTransform PUBLIC ARM_ALL_FFT_TABLES)
+endif()
+
+target_sources(CMSISDSPTransform PRIVATE arm_bitreversal.c)
+target_sources(CMSISDSPTransform PRIVATE arm_bitreversal2.c)
+
+if (NOT CONFIGTABLE OR ALLFFT OR CFFT_F32_16 OR CFFT_F32_32 OR CFFT_F32_64 OR CFFT_F32_128 OR CFFT_F32_256 OR CFFT_F32_512
+ OR CFFT_F32_1024 OR CFFT_F32_2048 OR CFFT_F32_4096)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix8_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_f32.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR CFFT_Q15_16 OR CFFT_Q15_32 OR CFFT_Q15_64 OR CFFT_Q15_128 OR CFFT_Q15_256 OR CFFT_Q15_512
+ OR CFFT_Q15_1024 OR CFFT_Q15_2048 OR CFFT_Q15_4096)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_q15.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR CFFT_Q31_16 OR CFFT_Q31_32 OR CFFT_Q31_64 OR CFFT_Q31_128 OR CFFT_Q31_256 OR CFFT_Q31_512
+ OR CFFT_Q31_1024 OR CFFT_Q31_2048 OR CFFT_Q31_4096)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_q31.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_init_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_init_q31.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR DCT4_F32_128 OR DCT4_F32_512 OR DCT4_F32_2048 OR DCT4_F32_8192)
+target_sources(CMSISDSPTransform PRIVATE arm_dct4_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_dct4_init_f32.c)
+
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_f32.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR DCT4_Q31_128 OR DCT4_Q31_512 OR DCT4_Q31_2048 OR DCT4_Q31_8192)
+target_sources(CMSISDSPTransform PRIVATE arm_dct4_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_dct4_init_q31.c)
+
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q31.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR ALLFFT OR DCT4_Q15_128 OR DCT4_Q15_512 OR DCT4_Q15_2048 OR DCT4_Q15_8192)
+target_sources(CMSISDSPTransform PRIVATE arm_dct4_init_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_dct4_q15.c)
+
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q15.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR RFFT_FAST_F32_32 OR RFFT_FAST_F32_64 OR RFFT_FAST_F32_128
+ OR RFFT_FAST_F32_256 OR RFFT_FAST_F32_512 OR RFFT_FAST_F32_1024 OR RFFT_FAST_F32_2048
+ OR RFFT_FAST_F32_4096 )
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_fast_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_fast_init_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix8_f32.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR RFFT_F32_128 OR RFFT_F32_512 OR RFFT_F32_2048 OR RFFT_F32_8192)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_f32.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR RFFT_Q15_32 OR RFFT_Q15_64 OR RFFT_Q15_128 OR RFFT_Q15_256
+ OR RFFT_Q15_512 OR RFFT_Q15_1024 OR RFFT_Q15_2048 OR RFFT_Q15_4096 OR RFFT_Q15_8192)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q15.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR RFFT_Q31_32 OR RFFT_Q31_64 OR RFFT_Q31_128 OR RFFT_Q31_256
+ OR RFFT_Q31_512 OR RFFT_Q31_1024 OR RFFT_Q31_2048 OR RFFT_Q31_4096 OR RFFT_Q31_8192)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q31.c)
+endif()
+
+configdsp(CMSISDSPTransform ..)
+
+### Includes
+target_include_directories(CMSISDSPTransform PUBLIC "${DSP}/../../Include")
+
+
+
diff --git a/DSP/Source/TransformFunctions/TransformFunctions.c b/DSP/Source/TransformFunctions/TransformFunctions.c
new file mode 100644
index 0000000..d0f7ce4
--- /dev/null
+++ b/DSP/Source/TransformFunctions/TransformFunctions.c
@@ -0,0 +1,60 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: TransformFunctions.c
+ * Description: Combination of all transform function source files.
+ *
+ * $Date: 18. March 2019
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_bitreversal.c"
+#include "arm_bitreversal2.c"
+#include "arm_cfft_f32.c"
+#include "arm_cfft_q15.c"
+#include "arm_cfft_q31.c"
+#include "arm_cfft_radix2_f32.c"
+#include "arm_cfft_radix2_init_f32.c"
+#include "arm_cfft_radix2_init_q15.c"
+#include "arm_cfft_radix2_init_q31.c"
+#include "arm_cfft_radix2_q15.c"
+#include "arm_cfft_radix2_q31.c"
+#include "arm_cfft_radix4_f32.c"
+#include "arm_cfft_radix4_init_f32.c"
+#include "arm_cfft_radix4_init_q15.c"
+#include "arm_cfft_radix4_init_q31.c"
+#include "arm_cfft_radix4_q15.c"
+#include "arm_cfft_radix4_q31.c"
+#include "arm_cfft_radix8_f32.c"
+#include "arm_dct4_f32.c"
+#include "arm_dct4_init_f32.c"
+#include "arm_dct4_init_q15.c"
+#include "arm_dct4_init_q31.c"
+#include "arm_dct4_q15.c"
+#include "arm_dct4_q31.c"
+#include "arm_rfft_f32.c"
+#include "arm_rfft_fast_f32.c"
+#include "arm_rfft_fast_init_f32.c"
+#include "arm_rfft_init_f32.c"
+#include "arm_rfft_init_q15.c"
+#include "arm_rfft_init_q31.c"
+#include "arm_rfft_q15.c"
+#include "arm_rfft_q31.c"
diff --git a/DSP/Source/TransformFunctions/arm_bitreversal.c b/DSP/Source/TransformFunctions/arm_bitreversal.c
index cea4821..c608129 100644
--- a/DSP/Source/TransformFunctions/arm_bitreversal.c
+++ b/DSP/Source/TransformFunctions/arm_bitreversal.c
@@ -3,13 +3,13 @@
* Title: arm_bitreversal.c
* Description: Bitreversal functions
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,20 +29,20 @@
#include "arm_math.h"
#include "arm_common_tables.h"
-/*
-* @brief In-place bit reversal function.
-* @param[in, out] *pSrc points to the in-place buffer of floating-point data type.
-* @param[in] fftSize length of the FFT.
-* @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table.
-* @param[in] *pBitRevTab points to the bit reversal table.
-* @return none.
-*/
+/**
+ @brief In-place floating-point bit reversal function.
+ @param[in,out] pSrc points to in-place floating-point data buffer
+ @param[in] fftSize length of FFT
+ @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table
+ @param[in] pBitRevTab points to bit reversal table
+ @return none
+ */
void arm_bitreversal_f32(
-float32_t * pSrc,
-uint16_t fftSize,
-uint16_t bitRevFactor,
-uint16_t * pBitRevTab)
+ float32_t * pSrc,
+ uint16_t fftSize,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab)
{
uint16_t fftLenBy2, fftLenBy2p1;
uint16_t i, j;
@@ -100,21 +100,20 @@ uint16_t * pBitRevTab)
}
-
-/*
-* @brief In-place bit reversal function.
-* @param[in, out] *pSrc points to the in-place buffer of Q31 data type.
-* @param[in] fftLen length of the FFT.
-* @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table
-* @param[in] *pBitRevTab points to bit reversal table.
-* @return none.
+/**
+ @brief In-place Q31 bit reversal function.
+ @param[in,out] pSrc points to in-place Q31 data buffer.
+ @param[in] fftLen length of FFT.
+ @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table
+ @param[in] pBitRevTab points to bit reversal table
+ @return none
*/
void arm_bitreversal_q31(
-q31_t * pSrc,
-uint32_t fftLen,
-uint16_t bitRevFactor,
-uint16_t * pBitRevTable)
+ q31_t * pSrc,
+ uint32_t fftLen,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab)
{
uint32_t fftLenBy2, fftLenBy2p1, i, j;
q31_t in;
@@ -163,29 +162,29 @@ uint16_t * pBitRevTable)
pSrc[(2U * (j + fftLenBy2)) + 1U] = in;
/* Reading the index for the bit reversal */
- j = *pBitRevTable;
+ j = *pBitRevTab;
/* Updating the bit reversal index depending on the fft length */
- pBitRevTable += bitRevFactor;
+ pBitRevTab += bitRevFactor;
}
}
-/*
- * @brief In-place bit reversal function.
- * @param[in, out] *pSrc points to the in-place buffer of Q15 data type.
- * @param[in] fftLen length of the FFT.
- * @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table
- * @param[in] *pBitRevTab points to bit reversal table.
- * @return none.
+/**
+ @brief In-place Q15 bit reversal function.
+ @param[in,out] pSrc16 points to in-place Q15 data buffer
+ @param[in] fftLen length of FFT
+ @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table
+ @param[in] pBitRevTab points to bit reversal table
+ @return none
*/
void arm_bitreversal_q15(
-q15_t * pSrc16,
-uint32_t fftLen,
-uint16_t bitRevFactor,
-uint16_t * pBitRevTab)
+ q15_t * pSrc16,
+ uint32_t fftLen,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab)
{
q31_t *pSrc = (q31_t *) pSrc16;
q31_t in;
diff --git a/DSP/Source/TransformFunctions/arm_bitreversal2.S b/DSP/Source/TransformFunctions/arm_bitreversal2.S
index e0a82db..c16091b 100644
--- a/DSP/Source/TransformFunctions/arm_bitreversal2.S
+++ b/DSP/Source/TransformFunctions/arm_bitreversal2.S
@@ -5,13 +5,13 @@
; * Called after doing an fft to reorder the output.
; * The function is loop unrolled by 2. arm_bitreversal_16 as well.
; *
-; * $Date: 27. January 2017
-; * $Revision: V.1.5.1
+; * $Date: 18. March 2019
+; * $Revision: V1.5.2
; *
; * Target Processor: Cortex-M cores
; * -------------------------------------------------------------------- */
;/*
-; * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+; * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
; *
; * SPDX-License-Identifier: Apache-2.0
; *
@@ -68,13 +68,13 @@
CODESECT
THUMB
-;/*
-;* @brief In-place bit reversal function.
-;* @param[in, out] *pSrc points to the in-place buffer of unknown 32-bit data type.
-;* @param[in] bitRevLen bit reversal table length
-;* @param[in] *pBitRevTab points to bit reversal table.
-;* @return none.
-;*/
+;/**
+; @brief In-place bit reversal function.
+; @param[in,out] pSrc points to the in-place buffer of unknown 32-bit data type
+; @param[in] bitRevLen bit reversal table length
+; @param[in] pBitRevTab points to bit reversal table
+; @return none
+; */
EXPORT arm_bitreversal_32
EXPORT arm_bitreversal_16
@@ -87,7 +87,7 @@
.type arm_bitreversal_32, %function
#endif
-#if defined(ARM_MATH_CM0) || defined(ARM_MATH_CM0PLUS) || defined(ARM_MATH_ARMV8MBL)
+#if defined (ARM_MATH_CM0_FAMILY)
arm_bitreversal_32 PROC
ADDS r3,r1,#1
diff --git a/DSP/Source/TransformFunctions/arm_bitreversal2.c b/DSP/Source/TransformFunctions/arm_bitreversal2.c
new file mode 100644
index 0000000..c5fe60f
--- /dev/null
+++ b/DSP/Source/TransformFunctions/arm_bitreversal2.c
@@ -0,0 +1,99 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_bitreversal2.c
+ * Description: Bitreversal functions
+ *
+ * $Date: 18. March 2019
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/**
+ @brief In-place 32 bit reversal function.
+ @param[in,out] pSrc points to in-place buffer of unknown 32-bit data type
+ @param[in] bitRevLen bit reversal table length
+ @param[in] pBitRevTab points to bit reversal table
+ @return none
+*/
+
+void arm_bitreversal_32(
+ uint32_t *pSrc,
+ const uint16_t bitRevLen,
+ const uint16_t *pBitRevTab)
+{
+ uint32_t a, b, i, tmp;
+
+ for (i = 0; i < bitRevLen; )
+ {
+ a = pBitRevTab[i ] >> 2;
+ b = pBitRevTab[i + 1] >> 2;
+
+ //real
+ tmp = pSrc[a];
+ pSrc[a] = pSrc[b];
+ pSrc[b] = tmp;
+
+ //complex
+ tmp = pSrc[a+1];
+ pSrc[a+1] = pSrc[b+1];
+ pSrc[b+1] = tmp;
+
+ i += 2;
+ }
+}
+
+
+/**
+ @brief In-place 16 bit reversal function.
+ @param[in,out] pSrc points to in-place buffer of unknown 16-bit data type
+ @param[in] bitRevLen bit reversal table length
+ @param[in] pBitRevTab points to bit reversal table
+ @return none
+*/
+
+void arm_bitreversal_16(
+ uint16_t *pSrc,
+ const uint16_t bitRevLen,
+ const uint16_t *pBitRevTab)
+{
+ uint16_t a, b, i, tmp;
+
+ for (i = 0; i < bitRevLen; )
+ {
+ a = pBitRevTab[i ] >> 2;
+ b = pBitRevTab[i + 1] >> 2;
+
+ //real
+ tmp = pSrc[a];
+ pSrc[a] = pSrc[b];
+ pSrc[b] = tmp;
+
+ //complex
+ tmp = pSrc[a+1];
+ pSrc[a+1] = pSrc[b+1];
+ pSrc[b+1] = tmp;
+
+ i += 2;
+ }
+}
diff --git a/DSP/Source/TransformFunctions/arm_cfft_f32.c b/DSP/Source/TransformFunctions/arm_cfft_f32.c
index 4abb6f5..2fff61c 100644
--- a/DSP/Source/TransformFunctions/arm_cfft_f32.c
+++ b/DSP/Source/TransformFunctions/arm_cfft_f32.c
@@ -3,13 +3,13 @@
* Title: arm_cfft_f32.c
* Description: Combined Radix Decimation in Frequency CFFT Floating point processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -30,281 +30,283 @@
#include "arm_common_tables.h"
extern void arm_radix8_butterfly_f32(
- float32_t * pSrc,
- uint16_t fftLen,
- const float32_t * pCoef,
- uint16_t twidCoefModifier);
+ float32_t * pSrc,
+ uint16_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier);
extern void arm_bitreversal_32(
- uint32_t * pSrc,
- const uint16_t bitRevLen,
- const uint16_t * pBitRevTable);
+ uint32_t * pSrc,
+ const uint16_t bitRevLen,
+ const uint16_t * pBitRevTable);
/**
-* @ingroup groupTransforms
-*/
+ @ingroup groupTransforms
+ */
/**
-* @defgroup ComplexFFT Complex FFT Functions
-*
-* \par
-* The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
-* Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
-* than the DFT, especially for long lengths.
-* The algorithms described in this section
-* operate on complex data. A separate set of functions is devoted to handling
-* of real sequences.
-* \par
-* There are separate algorithms for handling floating-point, Q15, and Q31 data
-* types. The algorithms available for each data type are described next.
-* \par
-* The FFT functions operate in-place. That is, the array holding the input data
-* will also be used to hold the corresponding result. The input data is complex
-* and contains 2*fftLen interleaved values as shown below.
-*
{real[0], imag[0], real[1], imag[1],..}
-* The FFT result will be contained in the same array and the frequency domain
-* values will have the same interleaving.
-*
-* \par Floating-point
-* The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
-* stages are performed along with a single radix-2 or radix-4 stage, as needed.
-* The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
-* a different twiddle factor table.
-* \par
-* The function uses the standard FFT definition and output values may grow by a
-* factor of fftLen when computing the forward transform. The
-* inverse transform includes a scale of 1/fftLen as part of the
-* calculation and this matches the textbook definition of the inverse FFT.
-* \par
-* Pre-initialized data structures containing twiddle factors and bit reversal
-* tables are provided and defined in arm_const_structs.h. Include
-* this header in your function and then pass one of the constant structures as
-* an argument to arm_cfft_f32. For example:
-* \par
-* arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)
-* \par
-* computes a 64-point inverse complex FFT including bit reversal.
-* The data structures are treated as constant data and not modified during the
-* calculation. The same data structure can be reused for multiple transforms
-* including mixing forward and inverse transforms.
-* \par
-* Earlier releases of the library provided separate radix-2 and radix-4
-* algorithms that operated on floating-point data. These functions are still
-* provided but are deprecated. The older functions are slower and less general
-* than the new functions.
-* \par
-* An example of initialization of the constants for the arm_cfft_f32 function follows:
-* \code
-* const static arm_cfft_instance_f32 *S;
-* ...
-* switch (length) {
-* case 16:
-* S = &arm_cfft_sR_f32_len16;
-* break;
-* case 32:
-* S = &arm_cfft_sR_f32_len32;
-* break;
-* case 64:
-* S = &arm_cfft_sR_f32_len64;
-* break;
-* case 128:
-* S = &arm_cfft_sR_f32_len128;
-* break;
-* case 256:
-* S = &arm_cfft_sR_f32_len256;
-* break;
-* case 512:
-* S = &arm_cfft_sR_f32_len512;
-* break;
-* case 1024:
-* S = &arm_cfft_sR_f32_len1024;
-* break;
-* case 2048:
-* S = &arm_cfft_sR_f32_len2048;
-* break;
-* case 4096:
-* S = &arm_cfft_sR_f32_len4096;
-* break;
-* }
-* \endcode
-* \par Q15 and Q31
-* The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-4
-* stages are performed along with a single radix-2 stage, as needed.
-* The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
-* a different twiddle factor table.
-* \par
-* The function uses the standard FFT definition and output values may grow by a
-* factor of fftLen when computing the forward transform. The
-* inverse transform includes a scale of 1/fftLen as part of the
-* calculation and this matches the textbook definition of the inverse FFT.
-* \par
-* Pre-initialized data structures containing twiddle factors and bit reversal
-* tables are provided and defined in arm_const_structs.h. Include
-* this header in your function and then pass one of the constant structures as
-* an argument to arm_cfft_q31. For example:
-* \par
-* arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)
-* \par
-* computes a 64-point inverse complex FFT including bit reversal.
-* The data structures are treated as constant data and not modified during the
-* calculation. The same data structure can be reused for multiple transforms
-* including mixing forward and inverse transforms.
-* \par
-* Earlier releases of the library provided separate radix-2 and radix-4
-* algorithms that operated on floating-point data. These functions are still
-* provided but are deprecated. The older functions are slower and less general
-* than the new functions.
-* \par
-* An example of initialization of the constants for the arm_cfft_q31 function follows:
-* \code
-* const static arm_cfft_instance_q31 *S;
-* ...
-* switch (length) {
-* case 16:
-* S = &arm_cfft_sR_q31_len16;
-* break;
-* case 32:
-* S = &arm_cfft_sR_q31_len32;
-* break;
-* case 64:
-* S = &arm_cfft_sR_q31_len64;
-* break;
-* case 128:
-* S = &arm_cfft_sR_q31_len128;
-* break;
-* case 256:
-* S = &arm_cfft_sR_q31_len256;
-* break;
-* case 512:
-* S = &arm_cfft_sR_q31_len512;
-* break;
-* case 1024:
-* S = &arm_cfft_sR_q31_len1024;
-* break;
-* case 2048:
-* S = &arm_cfft_sR_q31_len2048;
-* break;
-* case 4096:
-* S = &arm_cfft_sR_q31_len4096;
-* break;
-* }
-* \endcode
-*
-*/
+ @defgroup ComplexFFT Complex FFT Functions
+
+ @par
+ The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
+ Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
+ than the DFT, especially for long lengths.
+ The algorithms described in this section
+ operate on complex data. A separate set of functions is devoted to handling
+ of real sequences.
+ @par
+ There are separate algorithms for handling floating-point, Q15, and Q31 data
+ types. The algorithms available for each data type are described next.
+ @par
+ The FFT functions operate in-place. That is, the array holding the input data
+ will also be used to hold the corresponding result. The input data is complex
+ and contains 2*fftLen interleaved values as shown below.
+
{real[0], imag[0], real[1], imag[1], ...}
+ The FFT result will be contained in the same array and the frequency domain
+ values will have the same interleaving.
+
+ @par Floating-point
+ The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
+ stages are performed along with a single radix-2 or radix-4 stage, as needed.
+ The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
+ a different twiddle factor table.
+ @par
+ The function uses the standard FFT definition and output values may grow by a
+ factor of fftLen when computing the forward transform. The
+ inverse transform includes a scale of 1/fftLen as part of the
+ calculation and this matches the textbook definition of the inverse FFT.
+ @par
+ Pre-initialized data structures containing twiddle factors and bit reversal
+ tables are provided and defined in arm_const_structs.h. Include
+ this header in your function and then pass one of the constant structures as
+ an argument to arm_cfft_f32. For example:
+ @par
+ arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)
+ @par
+ computes a 64-point inverse complex FFT including bit reversal.
+ The data structures are treated as constant data and not modified during the
+ calculation. The same data structure can be reused for multiple transforms
+ including mixing forward and inverse transforms.
+ @par
+ Earlier releases of the library provided separate radix-2 and radix-4
+ algorithms that operated on floating-point data. These functions are still
+ provided but are deprecated. The older functions are slower and less general
+ than the new functions.
+ @par
+ An example of initialization of the constants for the arm_cfft_f32 function follows:
+ @code
+ const static arm_cfft_instance_f32 *S;
+ ...
+ switch (length) {
+ case 16:
+ S = &arm_cfft_sR_f32_len16;
+ break;
+ case 32:
+ S = &arm_cfft_sR_f32_len32;
+ break;
+ case 64:
+ S = &arm_cfft_sR_f32_len64;
+ break;
+ case 128:
+ S = &arm_cfft_sR_f32_len128;
+ break;
+ case 256:
+ S = &arm_cfft_sR_f32_len256;
+ break;
+ case 512:
+ S = &arm_cfft_sR_f32_len512;
+ break;
+ case 1024:
+ S = &arm_cfft_sR_f32_len1024;
+ break;
+ case 2048:
+ S = &arm_cfft_sR_f32_len2048;
+ break;
+ case 4096:
+ S = &arm_cfft_sR_f32_len4096;
+ break;
+ }
+ @endcode
+ @par Q15 and Q31
+ The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-4
+ stages are performed along with a single radix-2 stage, as needed.
+ The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
+ a different twiddle factor table.
+ @par
+ The function uses the standard FFT definition and output values may grow by a
+ factor of fftLen when computing the forward transform. The
+ inverse transform includes a scale of 1/fftLen as part of the
+ calculation and this matches the textbook definition of the inverse FFT.
+ @par
+ Pre-initialized data structures containing twiddle factors and bit reversal
+ tables are provided and defined in arm_const_structs.h. Include
+ this header in your function and then pass one of the constant structures as
+ an argument to arm_cfft_q31. For example:
+ @par
+ arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)
+ @par
+ computes a 64-point inverse complex FFT including bit reversal.
+ The data structures are treated as constant data and not modified during the
+ calculation. The same data structure can be reused for multiple transforms
+ including mixing forward and inverse transforms.
+ @par
+ Earlier releases of the library provided separate radix-2 and radix-4
+ algorithms that operated on floating-point data. These functions are still
+ provided but are deprecated. The older functions are slower and less general
+ than the new functions.
+ @par
+ An example of initialization of the constants for the arm_cfft_q31 function follows:
+ @code
+ const static arm_cfft_instance_q31 *S;
+ ...
+ switch (length) {
+ case 16:
+ S = &arm_cfft_sR_q31_len16;
+ break;
+ case 32:
+ S = &arm_cfft_sR_q31_len32;
+ break;
+ case 64:
+ S = &arm_cfft_sR_q31_len64;
+ break;
+ case 128:
+ S = &arm_cfft_sR_q31_len128;
+ break;
+ case 256:
+ S = &arm_cfft_sR_q31_len256;
+ break;
+ case 512:
+ S = &arm_cfft_sR_q31_len512;
+ break;
+ case 1024:
+ S = &arm_cfft_sR_q31_len1024;
+ break;
+ case 2048:
+ S = &arm_cfft_sR_q31_len2048;
+ break;
+ case 4096:
+ S = &arm_cfft_sR_q31_len4096;
+ break;
+ }
+ @endcode
+
+ */
-void arm_cfft_radix8by2_f32( arm_cfft_instance_f32 * S, float32_t * p1)
+void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
{
- uint32_t L = S->fftLen;
- float32_t * pCol1, * pCol2, * pMid1, * pMid2;
- float32_t * p2 = p1 + L;
- const float32_t * tw = (float32_t *) S->pTwiddle;
- float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
- float32_t m0, m1, m2, m3;
- uint32_t l;
+ uint32_t L = S->fftLen;
+ float32_t * pCol1, * pCol2, * pMid1, * pMid2;
+ float32_t * p2 = p1 + L;
+ const float32_t * tw = (float32_t *) S->pTwiddle;
+ float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
+ float32_t m0, m1, m2, m3;
+ uint32_t l;
- pCol1 = p1;
- pCol2 = p2;
+ pCol1 = p1;
+ pCol2 = p2;
- // Define new length
- L >>= 1;
- // Initialize mid pointers
- pMid1 = p1 + L;
- pMid2 = p2 + L;
+ /* Define new length */
+ L >>= 1;
- // do two dot Fourier transform
- for ( l = L >> 2; l > 0; l-- )
- {
- t1[0] = p1[0];
- t1[1] = p1[1];
- t1[2] = p1[2];
- t1[3] = p1[3];
+ /* Initialize mid pointers */
+ pMid1 = p1 + L;
+ pMid2 = p2 + L;
- t2[0] = p2[0];
- t2[1] = p2[1];
- t2[2] = p2[2];
- t2[3] = p2[3];
+ /* do two dot Fourier transform */
+ for (l = L >> 2; l > 0; l-- )
+ {
+ t1[0] = p1[0];
+ t1[1] = p1[1];
+ t1[2] = p1[2];
+ t1[3] = p1[3];
- t3[0] = pMid1[0];
- t3[1] = pMid1[1];
- t3[2] = pMid1[2];
- t3[3] = pMid1[3];
+ t2[0] = p2[0];
+ t2[1] = p2[1];
+ t2[2] = p2[2];
+ t2[3] = p2[3];
- t4[0] = pMid2[0];
- t4[1] = pMid2[1];
- t4[2] = pMid2[2];
- t4[3] = pMid2[3];
+ t3[0] = pMid1[0];
+ t3[1] = pMid1[1];
+ t3[2] = pMid1[2];
+ t3[3] = pMid1[3];
- *p1++ = t1[0] + t2[0];
- *p1++ = t1[1] + t2[1];
- *p1++ = t1[2] + t2[2];
- *p1++ = t1[3] + t2[3]; // col 1
+ t4[0] = pMid2[0];
+ t4[1] = pMid2[1];
+ t4[2] = pMid2[2];
+ t4[3] = pMid2[3];
- t2[0] = t1[0] - t2[0];
- t2[1] = t1[1] - t2[1];
- t2[2] = t1[2] - t2[2];
- t2[3] = t1[3] - t2[3]; // for col 2
+ *p1++ = t1[0] + t2[0];
+ *p1++ = t1[1] + t2[1];
+ *p1++ = t1[2] + t2[2];
+ *p1++ = t1[3] + t2[3]; /* col 1 */
- *pMid1++ = t3[0] + t4[0];
- *pMid1++ = t3[1] + t4[1];
- *pMid1++ = t3[2] + t4[2];
- *pMid1++ = t3[3] + t4[3]; // col 1
+ t2[0] = t1[0] - t2[0];
+ t2[1] = t1[1] - t2[1];
+ t2[2] = t1[2] - t2[2];
+ t2[3] = t1[3] - t2[3]; /* for col 2 */
- t4[0] = t4[0] - t3[0];
- t4[1] = t4[1] - t3[1];
- t4[2] = t4[2] - t3[2];
- t4[3] = t4[3] - t3[3]; // for col 2
+ *pMid1++ = t3[0] + t4[0];
+ *pMid1++ = t3[1] + t4[1];
+ *pMid1++ = t3[2] + t4[2];
+ *pMid1++ = t3[3] + t4[3]; /* col 1 */
- twR = *tw++;
- twI = *tw++;
+ t4[0] = t4[0] - t3[0];
+ t4[1] = t4[1] - t3[1];
+ t4[2] = t4[2] - t3[2];
+ t4[3] = t4[3] - t3[3]; /* for col 2 */
- // multiply by twiddle factors
- m0 = t2[0] * twR;
- m1 = t2[1] * twI;
- m2 = t2[1] * twR;
- m3 = t2[0] * twI;
+ twR = *tw++;
+ twI = *tw++;
- // R = R * Tr - I * Ti
- *p2++ = m0 + m1;
- // I = I * Tr + R * Ti
- *p2++ = m2 - m3;
+ /* multiply by twiddle factors */
+ m0 = t2[0] * twR;
+ m1 = t2[1] * twI;
+ m2 = t2[1] * twR;
+ m3 = t2[0] * twI;
- // use vertical symmetry
- // 0.9988 - 0.0491i <==> -0.0491 - 0.9988i
- m0 = t4[0] * twI;
- m1 = t4[1] * twR;
- m2 = t4[1] * twI;
- m3 = t4[0] * twR;
+ /* R = R * Tr - I * Ti */
+ *p2++ = m0 + m1;
+ /* I = I * Tr + R * Ti */
+ *p2++ = m2 - m3;
- *pMid2++ = m0 - m1;
- *pMid2++ = m2 + m3;
+ /* use vertical symmetry */
+ /* 0.9988 - 0.0491i <==> -0.0491 - 0.9988i */
+ m0 = t4[0] * twI;
+ m1 = t4[1] * twR;
+ m2 = t4[1] * twI;
+ m3 = t4[0] * twR;
- twR = *tw++;
- twI = *tw++;
+ *pMid2++ = m0 - m1;
+ *pMid2++ = m2 + m3;
- m0 = t2[2] * twR;
- m1 = t2[3] * twI;
- m2 = t2[3] * twR;
- m3 = t2[2] * twI;
+ twR = *tw++;
+ twI = *tw++;
- *p2++ = m0 + m1;
- *p2++ = m2 - m3;
+ m0 = t2[2] * twR;
+ m1 = t2[3] * twI;
+ m2 = t2[3] * twR;
+ m3 = t2[2] * twI;
- m0 = t4[2] * twI;
- m1 = t4[3] * twR;
- m2 = t4[3] * twI;
- m3 = t4[2] * twR;
+ *p2++ = m0 + m1;
+ *p2++ = m2 - m3;
- *pMid2++ = m0 - m1;
- *pMid2++ = m2 + m3;
- }
+ m0 = t4[2] * twI;
+ m1 = t4[3] * twR;
+ m2 = t4[3] * twI;
+ m3 = t4[2] * twR;
- // first col
- arm_radix8_butterfly_f32( pCol1, L, (float32_t *) S->pTwiddle, 2U);
- // second col
- arm_radix8_butterfly_f32( pCol2, L, (float32_t *) S->pTwiddle, 2U);
+ *pMid2++ = m0 - m1;
+ *pMid2++ = m2 + m3;
+ }
+
+ /* first col */
+ arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 2U);
+
+ /* second col */
+ arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 2U);
}
-void arm_cfft_radix8by4_f32( arm_cfft_instance_f32 * S, float32_t * p1)
+void arm_cfft_radix8by4_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
{
uint32_t L = S->fftLen >> 1;
float32_t * pCol1, *pCol2, *pCol3, *pCol4, *pEnd1, *pEnd2, *pEnd3, *pEnd4;
@@ -317,11 +319,11 @@ void arm_cfft_radix8by4_f32( arm_cfft_instance_f32 * S, float32_t * p1)
float32_t m0, m1, m2, m3;
uint32_t l, twMod2, twMod3, twMod4;
- pCol1 = p1; // points to real values by default
+ pCol1 = p1; /* points to real values by default */
pCol2 = p2;
pCol3 = p3;
pCol4 = p4;
- pEnd1 = p2 - 1; // points to imaginary values by default
+ pEnd1 = p2 - 1; /* points to imaginary values by default */
pEnd2 = p3 - 1;
pEnd3 = p4 - 1;
pEnd4 = pEnd3 + L;
@@ -330,32 +332,32 @@ void arm_cfft_radix8by4_f32( arm_cfft_instance_f32 * S, float32_t * p1)
L >>= 1;
- // do four dot Fourier transform
+ /* do four dot Fourier transform */
twMod2 = 2;
twMod3 = 4;
twMod4 = 6;
- // TOP
+ /* TOP */
p1ap3_0 = p1[0] + p3[0];
p1sp3_0 = p1[0] - p3[0];
p1ap3_1 = p1[1] + p3[1];
p1sp3_1 = p1[1] - p3[1];
- // col 2
+ /* col 2 */
t2[0] = p1sp3_0 + p2[1] - p4[1];
t2[1] = p1sp3_1 - p2[0] + p4[0];
- // col 3
+ /* col 3 */
t3[0] = p1ap3_0 - p2[0] - p4[0];
t3[1] = p1ap3_1 - p2[1] - p4[1];
- // col 4
+ /* col 4 */
t4[0] = p1sp3_0 - p2[1] + p4[1];
t4[1] = p1sp3_1 + p2[0] - p4[0];
- // col 1
+ /* col 1 */
*p1++ = p1ap3_0 + p2[0] + p4[0];
*p1++ = p1ap3_1 + p2[1] + p4[1];
- // Twiddle factors are ones
+ /* Twiddle factors are ones */
*p2++ = t2[0];
*p2++ = t2[1];
*p3++ = t3[0];
@@ -369,138 +371,138 @@ void arm_cfft_radix8by4_f32( arm_cfft_instance_f32 * S, float32_t * p1)
for (l = (L - 2) >> 1; l > 0; l-- )
{
- // TOP
- p1ap3_0 = p1[0] + p3[0];
- p1sp3_0 = p1[0] - p3[0];
- p1ap3_1 = p1[1] + p3[1];
- p1sp3_1 = p1[1] - p3[1];
- // col 2
- t2[0] = p1sp3_0 + p2[1] - p4[1];
- t2[1] = p1sp3_1 - p2[0] + p4[0];
- // col 3
- t3[0] = p1ap3_0 - p2[0] - p4[0];
- t3[1] = p1ap3_1 - p2[1] - p4[1];
- // col 4
- t4[0] = p1sp3_0 - p2[1] + p4[1];
- t4[1] = p1sp3_1 + p2[0] - p4[0];
- // col 1 - top
- *p1++ = p1ap3_0 + p2[0] + p4[0];
- *p1++ = p1ap3_1 + p2[1] + p4[1];
+ /* TOP */
+ p1ap3_0 = p1[0] + p3[0];
+ p1sp3_0 = p1[0] - p3[0];
+ p1ap3_1 = p1[1] + p3[1];
+ p1sp3_1 = p1[1] - p3[1];
+ /* col 2 */
+ t2[0] = p1sp3_0 + p2[1] - p4[1];
+ t2[1] = p1sp3_1 - p2[0] + p4[0];
+ /* col 3 */
+ t3[0] = p1ap3_0 - p2[0] - p4[0];
+ t3[1] = p1ap3_1 - p2[1] - p4[1];
+ /* col 4 */
+ t4[0] = p1sp3_0 - p2[1] + p4[1];
+ t4[1] = p1sp3_1 + p2[0] - p4[0];
+ /* col 1 - top */
+ *p1++ = p1ap3_0 + p2[0] + p4[0];
+ *p1++ = p1ap3_1 + p2[1] + p4[1];
- // BOTTOM
- p1ap3_1 = pEnd1[-1] + pEnd3[-1];
- p1sp3_1 = pEnd1[-1] - pEnd3[-1];
- p1ap3_0 = pEnd1[0] + pEnd3[0];
- p1sp3_0 = pEnd1[0] - pEnd3[0];
- // col 2
- t2[2] = pEnd2[0] - pEnd4[0] + p1sp3_1;
- t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];
- // col 3
- t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
- t3[3] = p1ap3_0 - pEnd2[0] - pEnd4[0];
- // col 4
- t4[2] = pEnd2[0] - pEnd4[0] - p1sp3_1;
- t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
- // col 1 - Bottom
- *pEnd1-- = p1ap3_0 + pEnd2[0] + pEnd4[0];
- *pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];
+ /* BOTTOM */
+ p1ap3_1 = pEnd1[-1] + pEnd3[-1];
+ p1sp3_1 = pEnd1[-1] - pEnd3[-1];
+ p1ap3_0 = pEnd1[ 0] + pEnd3[0];
+ p1sp3_0 = pEnd1[ 0] - pEnd3[0];
+ /* col 2 */
+ t2[2] = pEnd2[0] - pEnd4[0] + p1sp3_1;
+ t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];
+ /* col 3 */
+ t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
+ t3[3] = p1ap3_0 - pEnd2[ 0] - pEnd4[ 0];
+ /* col 4 */
+ t4[2] = pEnd2[ 0] - pEnd4[ 0] - p1sp3_1;
+ t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
+ /* col 1 - Bottom */
+ *pEnd1-- = p1ap3_0 + pEnd2[ 0] + pEnd4[ 0];
+ *pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];
- // COL 2
- // read twiddle factors
- twR = *tw2++;
- twI = *tw2++;
- // multiply by twiddle factors
- // let Z1 = a + i(b), Z2 = c + i(d)
- // => Z1 * Z2 = (a*c - b*d) + i(b*c + a*d)
+ /* COL 2 */
+ /* read twiddle factors */
+ twR = *tw2++;
+ twI = *tw2++;
+ /* multiply by twiddle factors */
+ /* let Z1 = a + i(b), Z2 = c + i(d) */
+ /* => Z1 * Z2 = (a*c - b*d) + i(b*c + a*d) */
- // Top
- m0 = t2[0] * twR;
- m1 = t2[1] * twI;
- m2 = t2[1] * twR;
- m3 = t2[0] * twI;
+ /* Top */
+ m0 = t2[0] * twR;
+ m1 = t2[1] * twI;
+ m2 = t2[1] * twR;
+ m3 = t2[0] * twI;
- *p2++ = m0 + m1;
- *p2++ = m2 - m3;
- // use vertical symmetry col 2
- // 0.9997 - 0.0245i <==> 0.0245 - 0.9997i
- // Bottom
- m0 = t2[3] * twI;
- m1 = t2[2] * twR;
- m2 = t2[2] * twI;
- m3 = t2[3] * twR;
+ *p2++ = m0 + m1;
+ *p2++ = m2 - m3;
+ /* use vertical symmetry col 2 */
+ /* 0.9997 - 0.0245i <==> 0.0245 - 0.9997i */
+ /* Bottom */
+ m0 = t2[3] * twI;
+ m1 = t2[2] * twR;
+ m2 = t2[2] * twI;
+ m3 = t2[3] * twR;
- *pEnd2-- = m0 - m1;
- *pEnd2-- = m2 + m3;
+ *pEnd2-- = m0 - m1;
+ *pEnd2-- = m2 + m3;
- // COL 3
- twR = tw3[0];
- twI = tw3[1];
- tw3 += twMod3;
- // Top
- m0 = t3[0] * twR;
- m1 = t3[1] * twI;
- m2 = t3[1] * twR;
- m3 = t3[0] * twI;
+ /* COL 3 */
+ twR = tw3[0];
+ twI = tw3[1];
+ tw3 += twMod3;
+ /* Top */
+ m0 = t3[0] * twR;
+ m1 = t3[1] * twI;
+ m2 = t3[1] * twR;
+ m3 = t3[0] * twI;
- *p3++ = m0 + m1;
- *p3++ = m2 - m3;
- // use vertical symmetry col 3
- // 0.9988 - 0.0491i <==> -0.9988 - 0.0491i
- // Bottom
- m0 = -t3[3] * twR;
- m1 = t3[2] * twI;
- m2 = t3[2] * twR;
- m3 = t3[3] * twI;
+ *p3++ = m0 + m1;
+ *p3++ = m2 - m3;
+ /* use vertical symmetry col 3 */
+ /* 0.9988 - 0.0491i <==> -0.9988 - 0.0491i */
+ /* Bottom */
+ m0 = -t3[3] * twR;
+ m1 = t3[2] * twI;
+ m2 = t3[2] * twR;
+ m3 = t3[3] * twI;
- *pEnd3-- = m0 - m1;
- *pEnd3-- = m3 - m2;
+ *pEnd3-- = m0 - m1;
+ *pEnd3-- = m3 - m2;
- // COL 4
- twR = tw4[0];
- twI = tw4[1];
- tw4 += twMod4;
- // Top
- m0 = t4[0] * twR;
- m1 = t4[1] * twI;
- m2 = t4[1] * twR;
- m3 = t4[0] * twI;
+ /* COL 4 */
+ twR = tw4[0];
+ twI = tw4[1];
+ tw4 += twMod4;
+ /* Top */
+ m0 = t4[0] * twR;
+ m1 = t4[1] * twI;
+ m2 = t4[1] * twR;
+ m3 = t4[0] * twI;
- *p4++ = m0 + m1;
- *p4++ = m2 - m3;
- // use vertical symmetry col 4
- // 0.9973 - 0.0736i <==> -0.0736 + 0.9973i
- // Bottom
- m0 = t4[3] * twI;
- m1 = t4[2] * twR;
- m2 = t4[2] * twI;
- m3 = t4[3] * twR;
+ *p4++ = m0 + m1;
+ *p4++ = m2 - m3;
+ /* use vertical symmetry col 4 */
+ /* 0.9973 - 0.0736i <==> -0.0736 + 0.9973i */
+ /* Bottom */
+ m0 = t4[3] * twI;
+ m1 = t4[2] * twR;
+ m2 = t4[2] * twI;
+ m3 = t4[3] * twR;
- *pEnd4-- = m0 - m1;
- *pEnd4-- = m2 + m3;
+ *pEnd4-- = m0 - m1;
+ *pEnd4-- = m2 + m3;
}
- //MIDDLE
- // Twiddle factors are
- // 1.0000 0.7071-0.7071i -1.0000i -0.7071-0.7071i
+ /* MIDDLE */
+ /* Twiddle factors are */
+ /* 1.0000 0.7071-0.7071i -1.0000i -0.7071-0.7071i */
p1ap3_0 = p1[0] + p3[0];
p1sp3_0 = p1[0] - p3[0];
p1ap3_1 = p1[1] + p3[1];
p1sp3_1 = p1[1] - p3[1];
- // col 2
+ /* col 2 */
t2[0] = p1sp3_0 + p2[1] - p4[1];
t2[1] = p1sp3_1 - p2[0] + p4[0];
- // col 3
+ /* col 3 */
t3[0] = p1ap3_0 - p2[0] - p4[0];
t3[1] = p1ap3_1 - p2[1] - p4[1];
- // col 4
+ /* col 4 */
t4[0] = p1sp3_0 - p2[1] + p4[1];
t4[1] = p1sp3_1 + p2[0] - p4[0];
- // col 1 - Top
+ /* col 1 - Top */
*p1++ = p1ap3_0 + p2[0] + p4[0];
*p1++ = p1ap3_1 + p2[1] + p4[1];
- // COL 2
+ /* COL 2 */
twR = tw2[0];
twI = tw2[1];
@@ -511,7 +513,7 @@ void arm_cfft_radix8by4_f32( arm_cfft_instance_f32 * S, float32_t * p1)
*p2++ = m0 + m1;
*p2++ = m2 - m3;
- // COL 3
+ /* COL 3 */
twR = tw3[0];
twI = tw3[1];
@@ -522,7 +524,7 @@ void arm_cfft_radix8by4_f32( arm_cfft_instance_f32 * S, float32_t * p1)
*p3++ = m0 + m1;
*p3++ = m2 - m3;
- // COL 4
+ /* COL 4 */
twR = tw4[0];
twI = tw4[1];
@@ -534,87 +536,94 @@ void arm_cfft_radix8by4_f32( arm_cfft_instance_f32 * S, float32_t * p1)
*p4++ = m0 + m1;
*p4++ = m2 - m3;
- // first col
- arm_radix8_butterfly_f32( pCol1, L, (float32_t *) S->pTwiddle, 4U);
- // second col
- arm_radix8_butterfly_f32( pCol2, L, (float32_t *) S->pTwiddle, 4U);
- // third col
- arm_radix8_butterfly_f32( pCol3, L, (float32_t *) S->pTwiddle, 4U);
- // fourth col
- arm_radix8_butterfly_f32( pCol4, L, (float32_t *) S->pTwiddle, 4U);
+ /* first col */
+ arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 4U);
+
+ /* second col */
+ arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 4U);
+
+ /* third col */
+ arm_radix8_butterfly_f32 (pCol3, L, (float32_t *) S->pTwiddle, 4U);
+
+ /* fourth col */
+ arm_radix8_butterfly_f32 (pCol4, L, (float32_t *) S->pTwiddle, 4U);
}
/**
-* @addtogroup ComplexFFT
-* @{
-*/
+ @addtogroup ComplexFFT
+ @{
+ */
/**
-* @details
-* @brief Processing function for the floating-point complex FFT.
-* @param[in] *S points to an instance of the floating-point CFFT structure.
-* @param[in, out] *p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
-* @param[in] ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
-* @param[in] bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
-* @return none.
-*/
+ @brief Processing function for the floating-point complex FFT.
+ @param[in] S points to an instance of the floating-point CFFT structure
+ @param[in,out] p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return none
+ */
void arm_cfft_f32(
- const arm_cfft_instance_f32 * S,
- float32_t * p1,
- uint8_t ifftFlag,
- uint8_t bitReverseFlag)
+ const arm_cfft_instance_f32 * S,
+ float32_t * p1,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
{
- uint32_t L = S->fftLen, l;
- float32_t invL, * pSrc;
+ uint32_t L = S->fftLen, l;
+ float32_t invL, * pSrc;
- if (ifftFlag == 1U)
+ if (ifftFlag == 1U)
+ {
+ /* Conjugate input data */
+ pSrc = p1 + 1;
+ for (l = 0; l < L; l++)
{
- /* Conjugate input data */
- pSrc = p1 + 1;
- for(l=0; lpTwiddle, 1);
+ break;
+ }
+
+ if ( bitReverseFlag )
+ arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);
+
+ if (ifftFlag == 1U)
+ {
+ invL = 1.0f / (float32_t)L;
+
+ /* Conjugate and scale output data */
+ pSrc = p1;
+ for (l= 0; l < L; l++)
{
- case 16:
- case 128:
- case 1024:
- arm_cfft_radix8by2_f32 ( (arm_cfft_instance_f32 *) S, p1);
- break;
- case 32:
- case 256:
- case 2048:
- arm_cfft_radix8by4_f32 ( (arm_cfft_instance_f32 *) S, p1);
- break;
- case 64:
- case 512:
- case 4096:
- arm_radix8_butterfly_f32( p1, L, (float32_t *) S->pTwiddle, 1);
- break;
- }
-
- if ( bitReverseFlag )
- arm_bitreversal_32((uint32_t*)p1,S->bitRevLength,S->pBitRevTable);
-
- if (ifftFlag == 1U)
- {
- invL = 1.0f/(float32_t)L;
- /* Conjugate and scale output data */
- pSrc = p1;
- for(l=0; l2*fftLen. Processing occurs in-place.
-* @param[in] ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
-* @param[in] bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
-* @return none.
-*/
+ @brief Processing function for Q15 complex FFT.
+ @param[in] S points to an instance of Q15 CFFT structure
+ @param[in,out] p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return none
+ */
void arm_cfft_q15(
- const arm_cfft_instance_q15 * S,
- q15_t * p1,
- uint8_t ifftFlag,
- uint8_t bitReverseFlag)
+ const arm_cfft_instance_q15 * S,
+ q15_t * p1,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
{
- uint32_t L = S->fftLen;
+ uint32_t L = S->fftLen;
- if (ifftFlag == 1U)
- {
- switch (L)
- {
- case 16:
- case 64:
- case 256:
- case 1024:
- case 4096:
- arm_radix4_butterfly_inverse_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 );
- break;
+ if (ifftFlag == 1U)
+ {
+ switch (L)
+ {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ arm_radix4_butterfly_inverse_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 );
+ break;
- case 32:
- case 128:
- case 512:
- case 2048:
- arm_cfft_radix4by2_inverse_q15 ( p1, L, S->pTwiddle );
- break;
- }
- }
- else
- {
- switch (L)
- {
- case 16:
- case 64:
- case 256:
- case 1024:
- case 4096:
- arm_radix4_butterfly_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 );
- break;
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_inverse_q15 ( p1, L, S->pTwiddle );
+ break;
+ }
+ }
+ else
+ {
+ switch (L)
+ {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ arm_radix4_butterfly_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 );
+ break;
- case 32:
- case 128:
- case 512:
- case 2048:
- arm_cfft_radix4by2_q15 ( p1, L, S->pTwiddle );
- break;
- }
- }
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_q15 ( p1, L, S->pTwiddle );
+ break;
+ }
+ }
- if ( bitReverseFlag )
- arm_bitreversal_16((uint16_t*)p1,S->bitRevLength,S->pBitRevTable);
+ if ( bitReverseFlag )
+ arm_bitreversal_16 ((uint16_t*) p1, S->bitRevLength, S->pBitRevTable);
}
/**
-* @} end of ComplexFFT group
-*/
+ @} end of ComplexFFT group
+ */
void arm_cfft_radix4by2_q15(
- q15_t * pSrc,
- uint32_t fftLen,
- const q15_t * pCoef)
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pCoef)
{
- uint32_t i;
- uint32_t n2;
- q15_t p0, p1, p2, p3;
+ uint32_t i;
+ uint32_t n2;
+ q15_t p0, p1, p2, p3;
#if defined (ARM_MATH_DSP)
- q31_t T, S, R;
- q31_t coeff, out1, out2;
- const q15_t *pC = pCoef;
- q15_t *pSi = pSrc;
- q15_t *pSl = pSrc + fftLen;
+ q31_t T, S, R;
+ q31_t coeff, out1, out2;
+ const q15_t *pC = pCoef;
+ q15_t *pSi = pSrc;
+ q15_t *pSl = pSrc + fftLen;
#else
- uint32_t ia, l;
- q15_t xt, yt, cosVal, sinVal;
+ uint32_t l;
+ q15_t xt, yt, cosVal, sinVal;
#endif
- n2 = fftLen >> 1;
+ n2 = fftLen >> 1U;
#if defined (ARM_MATH_DSP)
- for (i = n2; i > 0; i--)
- {
- coeff = _SIMD32_OFFSET(pC);
- pC += 2;
+ for (i = n2; i > 0; i--)
+ {
+ coeff = read_q15x2_ia ((q15_t **) &pC);
- T = _SIMD32_OFFSET(pSi);
- T = __SHADD16(T, 0); // this is just a SIMD arithmetic shift right by 1
+ T = read_q15x2 (pSi);
+ T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
- S = _SIMD32_OFFSET(pSl);
- S = __SHADD16(S, 0); // this is just a SIMD arithmetic shift right by 1
+ S = read_q15x2 (pSl);
+ S = __SHADD16(S, 0); /* this is just a SIMD arithmetic shift right by 1 */
- R = __QSUB16(T, S);
+ R = __QSUB16(T, S);
- _SIMD32_OFFSET(pSi) = __SHADD16(T, S);
- pSi += 2;
+ write_q15x2_ia (&pSi, __SHADD16(T, S));
- #ifndef ARM_MATH_BIG_ENDIAN
+#ifndef ARM_MATH_BIG_ENDIAN
+ out1 = __SMUAD(coeff, R) >> 16U;
+ out2 = __SMUSDX(coeff, R);
+#else
+ out1 = __SMUSDX(R, coeff) >> 16U;
+ out2 = __SMUAD(coeff, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
- out1 = __SMUAD(coeff, R) >> 16;
- out2 = __SMUSDX(coeff, R);
+ write_q15x2_ia (&pSl, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+ }
- #else
+#else /* #if defined (ARM_MATH_DSP) */
- out1 = __SMUSDX(R, coeff) >> 16U;
- out2 = __SMUAD(coeff, R);
+ for (i = 0; i < n2; i++)
+ {
+ cosVal = pCoef[2 * i];
+ sinVal = pCoef[2 * i + 1];
- #endif // #ifndef ARM_MATH_BIG_ENDIAN
+ l = i + n2;
- _SIMD32_OFFSET(pSl) =
- (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
- pSl += 2;
- }
+ xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
+ pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
-#else // #if defined (ARM_MATH_DSP)
+ yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
+ pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
- ia = 0;
- for (i = 0; i < n2; i++)
- {
- cosVal = pCoef[ia * 2];
- sinVal = pCoef[(ia * 2) + 1];
- ia++;
+ pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) +
+ ((int16_t) (((q31_t) yt * sinVal) >> 16U)) );
- l = i + n2;
+ pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) -
+ ((int16_t) (((q31_t) xt * sinVal) >> 16U)) );
+ }
- xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
- pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
+#endif /* #if defined (ARM_MATH_DSP) */
- yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
- pSrc[2 * i + 1] =
- ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
+ /* first col */
+ arm_radix4_butterfly_q15( pSrc, n2, (q15_t*)pCoef, 2U);
- pSrc[2U * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) +
- ((int16_t) (((q31_t) yt * sinVal) >> 16)));
+ /* second col */
+ arm_radix4_butterfly_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U);
- pSrc[2U * l + 1U] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) -
- ((int16_t) (((q31_t) xt * sinVal) >> 16)));
- }
+ n2 = fftLen >> 1U;
+ for (i = 0; i < n2; i++)
+ {
+ p0 = pSrc[4 * i + 0];
+ p1 = pSrc[4 * i + 1];
+ p2 = pSrc[4 * i + 2];
+ p3 = pSrc[4 * i + 3];
-#endif // #if defined (ARM_MATH_DSP)
+ p0 <<= 1U;
+ p1 <<= 1U;
+ p2 <<= 1U;
+ p3 <<= 1U;
- // first col
- arm_radix4_butterfly_q15( pSrc, n2, (q15_t*)pCoef, 2U);
- // second col
- arm_radix4_butterfly_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U);
+ pSrc[4 * i + 0] = p0;
+ pSrc[4 * i + 1] = p1;
+ pSrc[4 * i + 2] = p2;
+ pSrc[4 * i + 3] = p3;
+ }
- for (i = 0; i < fftLen >> 1; i++)
- {
- p0 = pSrc[4*i+0];
- p1 = pSrc[4*i+1];
- p2 = pSrc[4*i+2];
- p3 = pSrc[4*i+3];
-
- p0 <<= 1;
- p1 <<= 1;
- p2 <<= 1;
- p3 <<= 1;
-
- pSrc[4*i+0] = p0;
- pSrc[4*i+1] = p1;
- pSrc[4*i+2] = p2;
- pSrc[4*i+3] = p3;
- }
}
void arm_cfft_radix4by2_inverse_q15(
- q15_t * pSrc,
- uint32_t fftLen,
- const q15_t * pCoef)
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pCoef)
{
- uint32_t i;
- uint32_t n2;
- q15_t p0, p1, p2, p3;
+ uint32_t i;
+ uint32_t n2;
+ q15_t p0, p1, p2, p3;
#if defined (ARM_MATH_DSP)
- q31_t T, S, R;
- q31_t coeff, out1, out2;
- const q15_t *pC = pCoef;
- q15_t *pSi = pSrc;
- q15_t *pSl = pSrc + fftLen;
+ q31_t T, S, R;
+ q31_t coeff, out1, out2;
+ const q15_t *pC = pCoef;
+ q15_t *pSi = pSrc;
+ q15_t *pSl = pSrc + fftLen;
#else
- uint32_t ia, l;
- q15_t xt, yt, cosVal, sinVal;
+ uint32_t l;
+ q15_t xt, yt, cosVal, sinVal;
#endif
- n2 = fftLen >> 1;
+ n2 = fftLen >> 1U;
#if defined (ARM_MATH_DSP)
- for (i = n2; i > 0; i--)
- {
- coeff = _SIMD32_OFFSET(pC);
- pC += 2;
+ for (i = n2; i > 0; i--)
+ {
+ coeff = read_q15x2_ia ((q15_t **) &pC);
- T = _SIMD32_OFFSET(pSi);
- T = __SHADD16(T, 0); // this is just a SIMD arithmetic shift right by 1
+ T = read_q15x2 (pSi);
+ T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
- S = _SIMD32_OFFSET(pSl);
- S = __SHADD16(S, 0); // this is just a SIMD arithmetic shift right by 1
+ S = read_q15x2 (pSl);
+ S = __SHADD16(S, 0); /* this is just a SIMD arithmetic shift right by 1 */
- R = __QSUB16(T, S);
+ R = __QSUB16(T, S);
- _SIMD32_OFFSET(pSi) = __SHADD16(T, S);
- pSi += 2;
+ write_q15x2_ia (&pSi, __SHADD16(T, S));
- #ifndef ARM_MATH_BIG_ENDIAN
+#ifndef ARM_MATH_BIG_ENDIAN
+ out1 = __SMUSD(coeff, R) >> 16U;
+ out2 = __SMUADX(coeff, R);
+#else
+ out1 = __SMUADX(R, coeff) >> 16U;
+ out2 = __SMUSD(__QSUB(0, coeff), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
- out1 = __SMUSD(coeff, R) >> 16;
- out2 = __SMUADX(coeff, R);
- #else
+ write_q15x2_ia (&pSl, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+ }
- out1 = __SMUADX(R, coeff) >> 16U;
- out2 = __SMUSD(__QSUB(0, coeff), R);
+#else /* #if defined (ARM_MATH_DSP) */
- #endif // #ifndef ARM_MATH_BIG_ENDIAN
+ for (i = 0; i < n2; i++)
+ {
+ cosVal = pCoef[2 * i];
+ sinVal = pCoef[2 * i + 1];
- _SIMD32_OFFSET(pSl) =
- (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
- pSl += 2;
- }
+ l = i + n2;
-#else // #if defined (ARM_MATH_DSP)
+ xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
+ pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
- ia = 0;
- for (i = 0; i < n2; i++)
- {
- cosVal = pCoef[ia * 2];
- sinVal = pCoef[(ia * 2) + 1];
- ia++;
+ yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
+ pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
- l = i + n2;
- xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
- pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
+ pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) -
+ ((int16_t) (((q31_t) yt * sinVal) >> 16U)) );
- yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
- pSrc[2 * i + 1] =
- ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
+ pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) +
+ ((int16_t) (((q31_t) xt * sinVal) >> 16U)) );
+ }
- pSrc[2U * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) -
- ((int16_t) (((q31_t) yt * sinVal) >> 16)));
+#endif /* #if defined (ARM_MATH_DSP) */
- pSrc[2U * l + 1U] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) +
- ((int16_t) (((q31_t) xt * sinVal) >> 16)));
- }
+ /* first col */
+ arm_radix4_butterfly_inverse_q15( pSrc, n2, (q15_t*)pCoef, 2U);
-#endif // #if defined (ARM_MATH_DSP)
+ /* second col */
+ arm_radix4_butterfly_inverse_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U);
- // first col
- arm_radix4_butterfly_inverse_q15( pSrc, n2, (q15_t*)pCoef, 2U);
- // second col
- arm_radix4_butterfly_inverse_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U);
+ n2 = fftLen >> 1U;
+ for (i = 0; i < n2; i++)
+ {
+ p0 = pSrc[4 * i + 0];
+ p1 = pSrc[4 * i + 1];
+ p2 = pSrc[4 * i + 2];
+ p3 = pSrc[4 * i + 3];
- for (i = 0; i < fftLen >> 1; i++)
- {
- p0 = pSrc[4*i+0];
- p1 = pSrc[4*i+1];
- p2 = pSrc[4*i+2];
- p3 = pSrc[4*i+3];
+ p0 <<= 1U;
+ p1 <<= 1U;
+ p2 <<= 1U;
+ p3 <<= 1U;
- p0 <<= 1;
- p1 <<= 1;
- p2 <<= 1;
- p3 <<= 1;
-
- pSrc[4*i+0] = p0;
- pSrc[4*i+1] = p1;
- pSrc[4*i+2] = p2;
- pSrc[4*i+3] = p3;
- }
+ pSrc[4 * i + 0] = p0;
+ pSrc[4 * i + 1] = p1;
+ pSrc[4 * i + 2] = p2;
+ pSrc[4 * i + 3] = p3;
+ }
}
-
diff --git a/DSP/Source/TransformFunctions/arm_cfft_q31.c b/DSP/Source/TransformFunctions/arm_cfft_q31.c
index ff4ff94..701ac95 100644
--- a/DSP/Source/TransformFunctions/arm_cfft_q31.c
+++ b/DSP/Source/TransformFunctions/arm_cfft_q31.c
@@ -3,13 +3,13 @@
* Title: arm_cfft_q31.c
* Description: Combined Radix Decimation in Frequency CFFT fixed point processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,224 +29,226 @@
#include "arm_math.h"
extern void arm_radix4_butterfly_q31(
- q31_t * pSrc,
- uint32_t fftLen,
- q31_t * pCoef,
- uint32_t twidCoefModifier);
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint32_t twidCoefModifier);
extern void arm_radix4_butterfly_inverse_q31(
- q31_t * pSrc,
- uint32_t fftLen,
- q31_t * pCoef,
- uint32_t twidCoefModifier);
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint32_t twidCoefModifier);
extern void arm_bitreversal_32(
- uint32_t * pSrc,
- const uint16_t bitRevLen,
- const uint16_t * pBitRevTable);
+ uint32_t * pSrc,
+ const uint16_t bitRevLen,
+ const uint16_t * pBitRevTable);
void arm_cfft_radix4by2_q31(
- q31_t * pSrc,
- uint32_t fftLen,
- const q31_t * pCoef);
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef);
void arm_cfft_radix4by2_inverse_q31(
- q31_t * pSrc,
- uint32_t fftLen,
- const q31_t * pCoef);
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef);
/**
-* @ingroup groupTransforms
-*/
+ @ingroup groupTransforms
+ */
/**
-* @addtogroup ComplexFFT
-* @{
-*/
+ @addtogroup ComplexFFT
+ @{
+ */
/**
-* @details
-* @brief Processing function for the fixed-point complex FFT in Q31 format.
-* @param[in] *S points to an instance of the fixed-point CFFT structure.
-* @param[in, out] *p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
-* @param[in] ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
-* @param[in] bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
-* @return none.
-*/
+ @brief Processing function for the Q31 complex FFT.
+ @param[in] S points to an instance of the fixed-point CFFT structure
+ @param[in,out] p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return none
+ */
void arm_cfft_q31(
- const arm_cfft_instance_q31 * S,
- q31_t * p1,
- uint8_t ifftFlag,
- uint8_t bitReverseFlag)
+ const arm_cfft_instance_q31 * S,
+ q31_t * p1,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
{
- uint32_t L = S->fftLen;
+ uint32_t L = S->fftLen;
- if (ifftFlag == 1U)
- {
- switch (L)
- {
- case 16:
- case 64:
- case 256:
- case 1024:
- case 4096:
- arm_radix4_butterfly_inverse_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );
- break;
+ if (ifftFlag == 1U)
+ {
+ switch (L)
+ {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ arm_radix4_butterfly_inverse_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );
+ break;
- case 32:
- case 128:
- case 512:
- case 2048:
- arm_cfft_radix4by2_inverse_q31 ( p1, L, S->pTwiddle );
- break;
- }
- }
- else
- {
- switch (L)
- {
- case 16:
- case 64:
- case 256:
- case 1024:
- case 4096:
- arm_radix4_butterfly_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );
- break;
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_inverse_q31 ( p1, L, S->pTwiddle );
+ break;
+ }
+ }
+ else
+ {
+ switch (L)
+ {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ arm_radix4_butterfly_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );
+ break;
- case 32:
- case 128:
- case 512:
- case 2048:
- arm_cfft_radix4by2_q31 ( p1, L, S->pTwiddle );
- break;
- }
- }
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_q31 ( p1, L, S->pTwiddle );
+ break;
+ }
+ }
- if ( bitReverseFlag )
- arm_bitreversal_32((uint32_t*)p1,S->bitRevLength,S->pBitRevTable);
+ if ( bitReverseFlag )
+ arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);
}
/**
-* @} end of ComplexFFT group
-*/
+ @} end of ComplexFFT group
+ */
void arm_cfft_radix4by2_q31(
- q31_t * pSrc,
- uint32_t fftLen,
- const q31_t * pCoef)
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef)
{
- uint32_t i, l;
- uint32_t n2, ia;
- q31_t xt, yt, cosVal, sinVal;
- q31_t p0, p1;
+ uint32_t i, l;
+ uint32_t n2;
+ q31_t xt, yt, cosVal, sinVal;
+ q31_t p0, p1;
- n2 = fftLen >> 1;
- ia = 0;
- for (i = 0; i < n2; i++)
- {
- cosVal = pCoef[2*ia];
- sinVal = pCoef[2*ia + 1];
- ia++;
+ n2 = fftLen >> 1U;
+ for (i = 0; i < n2; i++)
+ {
+ cosVal = pCoef[2 * i];
+ sinVal = pCoef[2 * i + 1];
- l = i + n2;
- xt = (pSrc[2 * i] >> 2) - (pSrc[2 * l] >> 2);
- pSrc[2 * i] = (pSrc[2 * i] >> 2) + (pSrc[2 * l] >> 2);
+ l = i + n2;
- yt = (pSrc[2 * i + 1] >> 2) - (pSrc[2 * l + 1] >> 2);
- pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2) + (pSrc[2 * i + 1] >> 2);
+ xt = (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U);
+ pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U);
- mult_32x32_keep32_R(p0, xt, cosVal);
- mult_32x32_keep32_R(p1, yt, cosVal);
- multAcc_32x32_keep32_R(p0, yt, sinVal);
- multSub_32x32_keep32_R(p1, xt, sinVal);
+ yt = (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U);
+ pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U);
- pSrc[2U * l] = p0 << 1;
- pSrc[2U * l + 1U] = p1 << 1;
+ mult_32x32_keep32_R(p0, xt, cosVal);
+ mult_32x32_keep32_R(p1, yt, cosVal);
+ multAcc_32x32_keep32_R(p0, yt, sinVal);
+ multSub_32x32_keep32_R(p1, xt, sinVal);
- }
+ pSrc[2 * l] = p0 << 1;
+ pSrc[2 * l + 1] = p1 << 1;
+ }
- // first col
- arm_radix4_butterfly_q31( pSrc, n2, (q31_t*)pCoef, 2U);
- // second col
- arm_radix4_butterfly_q31( pSrc + fftLen, n2, (q31_t*)pCoef, 2U);
+ /* first col */
+ arm_radix4_butterfly_q31 (pSrc, n2, (q31_t*)pCoef, 2U);
- for (i = 0; i < fftLen >> 1; i++)
- {
- p0 = pSrc[4*i+0];
- p1 = pSrc[4*i+1];
- xt = pSrc[4*i+2];
- yt = pSrc[4*i+3];
+ /* second col */
+ arm_radix4_butterfly_q31 (pSrc + fftLen, n2, (q31_t*)pCoef, 2U);
- p0 <<= 1;
- p1 <<= 1;
- xt <<= 1;
- yt <<= 1;
+ n2 = fftLen >> 1U;
+ for (i = 0; i < n2; i++)
+ {
+ p0 = pSrc[4 * i + 0];
+ p1 = pSrc[4 * i + 1];
+ xt = pSrc[4 * i + 2];
+ yt = pSrc[4 * i + 3];
- pSrc[4*i+0] = p0;
- pSrc[4*i+1] = p1;
- pSrc[4*i+2] = xt;
- pSrc[4*i+3] = yt;
- }
+ p0 <<= 1U;
+ p1 <<= 1U;
+ xt <<= 1U;
+ yt <<= 1U;
+
+ pSrc[4 * i + 0] = p0;
+ pSrc[4 * i + 1] = p1;
+ pSrc[4 * i + 2] = xt;
+ pSrc[4 * i + 3] = yt;
+ }
}
void arm_cfft_radix4by2_inverse_q31(
- q31_t * pSrc,
- uint32_t fftLen,
- const q31_t * pCoef)
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef)
{
- uint32_t i, l;
- uint32_t n2, ia;
- q31_t xt, yt, cosVal, sinVal;
- q31_t p0, p1;
+ uint32_t i, l;
+ uint32_t n2;
+ q31_t xt, yt, cosVal, sinVal;
+ q31_t p0, p1;
- n2 = fftLen >> 1;
- ia = 0;
- for (i = 0; i < n2; i++)
- {
- cosVal = pCoef[2*ia];
- sinVal = pCoef[2*ia + 1];
- ia++;
+ n2 = fftLen >> 1U;
+ for (i = 0; i < n2; i++)
+ {
+ cosVal = pCoef[2 * i];
+ sinVal = pCoef[2 * i + 1];
- l = i + n2;
- xt = (pSrc[2 * i] >> 2) - (pSrc[2 * l] >> 2);
- pSrc[2 * i] = (pSrc[2 * i] >> 2) + (pSrc[2 * l] >> 2);
+ l = i + n2;
- yt = (pSrc[2 * i + 1] >> 2) - (pSrc[2 * l + 1] >> 2);
- pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2) + (pSrc[2 * i + 1] >> 2);
+ xt = (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U);
+ pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U);
- mult_32x32_keep32_R(p0, xt, cosVal);
- mult_32x32_keep32_R(p1, yt, cosVal);
- multSub_32x32_keep32_R(p0, yt, sinVal);
- multAcc_32x32_keep32_R(p1, xt, sinVal);
+ yt = (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U);
+ pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U);
- pSrc[2U * l] = p0 << 1;
- pSrc[2U * l + 1U] = p1 << 1;
+ mult_32x32_keep32_R(p0, xt, cosVal);
+ mult_32x32_keep32_R(p1, yt, cosVal);
+ multSub_32x32_keep32_R(p0, yt, sinVal);
+ multAcc_32x32_keep32_R(p1, xt, sinVal);
- }
+ pSrc[2 * l] = p0 << 1U;
+ pSrc[2 * l + 1] = p1 << 1U;
+ }
- // first col
- arm_radix4_butterfly_inverse_q31( pSrc, n2, (q31_t*)pCoef, 2U);
- // second col
- arm_radix4_butterfly_inverse_q31( pSrc + fftLen, n2, (q31_t*)pCoef, 2U);
+ /* first col */
+ arm_radix4_butterfly_inverse_q31( pSrc, n2, (q31_t*)pCoef, 2U);
- for (i = 0; i < fftLen >> 1; i++)
- {
- p0 = pSrc[4*i+0];
- p1 = pSrc[4*i+1];
- xt = pSrc[4*i+2];
- yt = pSrc[4*i+3];
+ /* second col */
+ arm_radix4_butterfly_inverse_q31( pSrc + fftLen, n2, (q31_t*)pCoef, 2U);
- p0 <<= 1;
- p1 <<= 1;
- xt <<= 1;
- yt <<= 1;
+ n2 = fftLen >> 1U;
+ for (i = 0; i < n2; i++)
+ {
+ p0 = pSrc[4 * i + 0];
+ p1 = pSrc[4 * i + 1];
+ xt = pSrc[4 * i + 2];
+ yt = pSrc[4 * i + 3];
- pSrc[4*i+0] = p0;
- pSrc[4*i+1] = p1;
- pSrc[4*i+2] = xt;
- pSrc[4*i+3] = yt;
- }
+ p0 <<= 1U;
+ p1 <<= 1U;
+ xt <<= 1U;
+ yt <<= 1U;
+
+ pSrc[4 * i + 0] = p0;
+ pSrc[4 * i + 1] = p1;
+ pSrc[4 * i + 2] = xt;
+ pSrc[4 * i + 3] = yt;
+ }
}
-
diff --git a/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c b/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c
index d35988d..f75e329 100644
--- a/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c
+++ b/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c
@@ -3,13 +3,13 @@
* Title: arm_cfft_radix2_f32.c
* Description: Radix-2 Decimation in Frequency CFFT & CIFFT Floating point processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,64 +29,62 @@
#include "arm_math.h"
void arm_radix2_butterfly_f32(
- float32_t * pSrc,
- uint32_t fftLen,
- float32_t * pCoef,
- uint16_t twidCoefModifier);
+ float32_t * pSrc,
+ uint32_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier);
void arm_radix2_butterfly_inverse_f32(
- float32_t * pSrc,
- uint32_t fftLen,
- float32_t * pCoef,
- uint16_t twidCoefModifier,
- float32_t onebyfftLen);
+ float32_t * pSrc,
+ uint32_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier,
+ float32_t onebyfftLen);
extern void arm_bitreversal_f32(
- float32_t * pSrc,
- uint16_t fftSize,
- uint16_t bitRevFactor,
- uint16_t * pBitRevTab);
+ float32_t * pSrc,
+ uint16_t fftSize,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab);
/**
-* @ingroup groupTransforms
-*/
+ @ingroup groupTransforms
+ */
/**
-* @addtogroup ComplexFFT
-* @{
-*/
+ @addtogroup ComplexFFT
+ @{
+ */
/**
-* @details
-* @brief Radix-2 CFFT/CIFFT.
-* @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed
-* in the future.
-* @param[in] *S points to an instance of the floating-point Radix-2 CFFT/CIFFT structure.
-* @param[in, out] *pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
-* @return none.
-*/
+ @brief Radix-2 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future
+ @param[in] S points to an instance of the floating-point Radix-2 CFFT/CIFFT structure
+ @param[in,out] pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @return none
+ */
void arm_cfft_radix2_f32(
const arm_cfft_radix2_instance_f32 * S,
-float32_t * pSrc)
+ float32_t * pSrc)
{
if (S->ifftFlag == 1U)
{
- /* Complex IFFT radix-2 */
+ /* Complex IFFT radix-2 */
arm_radix2_butterfly_inverse_f32(pSrc, S->fftLen, S->pTwiddle,
S->twidCoefModifier, S->onebyfftLen);
}
else
{
- /* Complex FFT radix-2 */
+ /* Complex FFT radix-2 */
arm_radix2_butterfly_f32(pSrc, S->fftLen, S->pTwiddle,
S->twidCoefModifier);
}
if (S->bitReverseFlag == 1U)
{
- /* Bit Reversal */
+ /* Bit Reversal */
arm_bitreversal_f32(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
}
@@ -94,36 +92,36 @@ float32_t * pSrc)
/**
-* @} end of ComplexFFT group
-*/
+ @} end of ComplexFFT group
+ */
/* ----------------------------------------------------------------------
-** Internal helper function used by the FFTs
-** ------------------------------------------------------------------- */
+ ** Internal helper function used by the FFTs
+ ** ------------------------------------------------------------------- */
-/*
-* @brief Core function for the floating-point CFFT butterfly process.
-* @param[in, out] *pSrc points to the in-place buffer of floating-point data type.
-* @param[in] fftLen length of the FFT.
-* @param[in] *pCoef points to the twiddle coefficient buffer.
-* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
-* @return none.
-*/
+/**
+ brief Core function for the floating-point CFFT butterfly process.
+ param[in,out] pSrc points to in-place buffer of floating-point data type
+ param[in] fftLen length of the FFT
+ param[in] pCoef points to twiddle coefficient buffer
+ param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+ return none
+ */
void arm_radix2_butterfly_f32(
-float32_t * pSrc,
-uint32_t fftLen,
-float32_t * pCoef,
-uint16_t twidCoefModifier)
+ float32_t * pSrc,
+ uint32_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier)
{
- uint32_t i, j, k, l;
- uint32_t n1, n2, ia;
- float32_t xt, yt, cosVal, sinVal;
- float32_t p0, p1, p2, p3;
- float32_t a0, a1;
+ uint32_t i, j, k, l;
+ uint32_t n1, n2, ia;
+ float32_t xt, yt, cosVal, sinVal;
+ float32_t p0, p1, p2, p3;
+ float32_t a0, a1;
#if defined (ARM_MATH_DSP)
@@ -227,7 +225,7 @@ uint16_t twidCoefModifier)
pSrc[2 * i + 3] = yt;
} // groups loop end
-#else
+#else /* #if defined (ARM_MATH_DSP) */
n2 = fftLen;
@@ -275,24 +273,24 @@ uint16_t twidCoefModifier)
twidCoefModifier <<= 1U;
}
-#endif // #if defined (ARM_MATH_DSP)
+#endif /* #if defined (ARM_MATH_DSP) */
}
void arm_radix2_butterfly_inverse_f32(
-float32_t * pSrc,
-uint32_t fftLen,
-float32_t * pCoef,
-uint16_t twidCoefModifier,
-float32_t onebyfftLen)
+ float32_t * pSrc,
+ uint32_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier,
+ float32_t onebyfftLen)
{
- uint32_t i, j, k, l;
- uint32_t n1, n2, ia;
- float32_t xt, yt, cosVal, sinVal;
- float32_t p0, p1, p2, p3;
- float32_t a0, a1;
+ uint32_t i, j, k, l;
+ uint32_t n1, n2, ia;
+ float32_t xt, yt, cosVal, sinVal;
+ float32_t p0, p1, p2, p3;
+ float32_t a0, a1;
#if defined (ARM_MATH_DSP)
@@ -392,7 +390,7 @@ float32_t onebyfftLen)
pSrc[2 * i + 3] = p3;
} // butterfly loop end
-#else
+#else /* #if defined (ARM_MATH_DSP) */
n2 = fftLen;
@@ -461,12 +459,12 @@ float32_t onebyfftLen)
p3 = yt * onebyfftLen;
pSrc[2 * i] = p0;
- pSrc[2U * l] = p2;
+ pSrc[2 * l] = p2;
pSrc[2 * i + 1] = p1;
- pSrc[2U * l + 1U] = p3;
+ pSrc[2 * l + 1] = p3;
} // butterfly loop end
-#endif // #if defined (ARM_MATH_DSP)
+#endif /* #if defined (ARM_MATH_DSP) */
}
diff --git a/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c b/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c
index fdfa63e..417ad91 100644
--- a/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c
+++ b/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c
@@ -3,13 +3,13 @@
* Title: arm_cfft_radix2_init_f32.c
* Description: Radix-2 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -30,36 +30,41 @@
#include "arm_common_tables.h"
/**
- * @ingroup groupTransforms
+ @ingroup groupTransforms
*/
/**
- * @addtogroup ComplexFFT
- * @{
+ @addtogroup ComplexFFT
+ @{
*/
/**
-* @brief Initialization function for the floating-point CFFT/CIFFT.
-* @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed
-* in the future.
-* @param[in,out] *S points to an instance of the floating-point CFFT/CIFFT structure.
-* @param[in] fftLen length of the FFT.
-* @param[in] ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
-* @param[in] bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
-* @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if fftLen is not a supported value.
-*
-* \par Description:
-* \par
-* The parameter ifftFlag controls whether a forward or inverse transform is computed.
-* Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
-* \par
-* The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
-* Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
-* \par
-* The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
-* \par
-* This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+ @brief Initialization function for the floating-point CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future.
+ @param[in,out] S points to an instance of the floating-point CFFT/CIFFT structure
+ @param[in] fftLen length of the FFT
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Details
+ The parameter ifftFlag controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+ @par
+ The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
*/
+
arm_status arm_cfft_radix2_init_f32(
arm_cfft_radix2_instance_f32 * S,
uint16_t fftLen,
@@ -188,5 +193,5 @@ arm_status arm_cfft_radix2_init_f32(
}
/**
- * @} end of ComplexFFT group
+ @} end of ComplexFFT group
*/
diff --git a/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c b/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c
index 2646374..3d865d0 100644
--- a/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c
+++ b/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c
@@ -3,13 +3,13 @@
* Title: arm_cfft_radix2_init_q15.c
* Description: Radix-2 Decimation in Frequency Q15 FFT & IFFT initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -30,35 +30,40 @@
#include "arm_common_tables.h"
/**
- * @ingroup groupTransforms
+ @ingroup groupTransforms
*/
/**
- * @addtogroup ComplexFFT
- * @{
+ @addtogroup ComplexFFT
+ @{
*/
/**
-* @brief Initialization function for the Q15 CFFT/CIFFT.
-* @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed
-* @param[in,out] *S points to an instance of the Q15 CFFT/CIFFT structure.
-* @param[in] fftLen length of the FFT.
-* @param[in] ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
-* @param[in] bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
-* @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if fftLen is not a supported value.
-*
-* \par Description:
-* \par
-* The parameter ifftFlag controls whether a forward or inverse transform is computed.
-* Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
-* \par
-* The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
-* Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
-* \par
-* The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
-* \par
-* This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+ @brief Initialization function for the Q15 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed
+ @param[in,out] S points to an instance of the Q15 CFFT/CIFFT structure.
+ @param[in] fftLen length of the FFT.
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Details
+ The parameter ifftFlag controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+ @par
+ The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
*/
arm_status arm_cfft_radix2_init_q15(
@@ -173,5 +178,5 @@ arm_status arm_cfft_radix2_init_q15(
}
/**
- * @} end of ComplexFFT group
+ @} end of ComplexFFT group
*/
diff --git a/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c b/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c
index 56fff4b..f4a20d6 100644
--- a/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c
+++ b/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c
@@ -3,13 +3,13 @@
* Title: arm_cfft_radix2_init_q31.c
* Description: Radix-2 Decimation in Frequency Fixed-point CFFT & CIFFT Initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -30,36 +30,39 @@
#include "arm_common_tables.h"
/**
- * @ingroup groupTransforms
+ @ingroup groupTransforms
*/
/**
- * @addtogroup ComplexFFT
- * @{
+ @addtogroup ComplexFFT
+ @{
*/
-
/**
-*
-* @brief Initialization function for the Q31 CFFT/CIFFT.
-* @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed
-* @param[in,out] *S points to an instance of the Q31 CFFT/CIFFT structure.
-* @param[in] fftLen length of the FFT.
-* @param[in] ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
-* @param[in] bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
-* @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if fftLen is not a supported value.
-*
-* \par Description:
-* \par
-* The parameter ifftFlag controls whether a forward or inverse transform is computed.
-* Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
-* \par
-* The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
-* Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
-* \par
-* The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
-* \par
-* This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+ @brief Initialization function for the Q31 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
+ @param[in,out] S points to an instance of the Q31 CFFT/CIFFT structure
+ @param[in] fftLen length of the FFT
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Details
+ The parameter ifftFlag controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+ @par
+ The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
*/
arm_status arm_cfft_radix2_init_q31(
@@ -76,8 +79,10 @@ arm_status arm_cfft_radix2_init_q31(
/* Initialise the Twiddle coefficient pointer */
S->pTwiddle = (q31_t *) twiddleCoef_4096_q31;
+
/* Initialise the Flag for selection of CFFT or CIFFT */
S->ifftFlag = ifftFlag;
+
/* Initialise the Flag for calculation Bit reversal or not */
S->bitReverseFlag = bitReverseFlag;
@@ -170,5 +175,5 @@ arm_status arm_cfft_radix2_init_q31(
}
/**
- * @} end of ComplexFFT group
+ @} end of ComplexFFT group
*/
diff --git a/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c b/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c
index 8880ab9..2a03b57 100644
--- a/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c
+++ b/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c
@@ -3,13 +3,13 @@
* Title: arm_cfft_radix2_q15.c
* Description: Radix-2 Decimation in Frequency CFFT & CIFFT Fixed point processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,74 +29,71 @@
#include "arm_math.h"
void arm_radix2_butterfly_q15(
- q15_t * pSrc,
- uint32_t fftLen,
- q15_t * pCoef,
- uint16_t twidCoefModifier);
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pCoef,
+ uint16_t twidCoefModifier);
void arm_radix2_butterfly_inverse_q15(
- q15_t * pSrc,
- uint32_t fftLen,
- q15_t * pCoef,
- uint16_t twidCoefModifier);
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pCoef,
+ uint16_t twidCoefModifier);
void arm_bitreversal_q15(
- q15_t * pSrc,
- uint32_t fftLen,
- uint16_t bitRevFactor,
- uint16_t * pBitRevTab);
+ q15_t * pSrc,
+ uint32_t fftLen,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab);
/**
- * @ingroup groupTransforms
+ @ingroup groupTransforms
*/
/**
- * @addtogroup ComplexFFT
- * @{
+ @addtogroup ComplexFFT
+ @{
*/
/**
- * @details
- * @brief Processing function for the fixed-point CFFT/CIFFT.
- * @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed
- * @param[in] *S points to an instance of the fixed-point CFFT/CIFFT structure.
- * @param[in, out] *pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- * @return none.
+ @brief Processing function for the fixed-point CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
+ @param[in] S points to an instance of the fixed-point CFFT/CIFFT structure
+ @param[in,out] pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @return none
*/
void arm_cfft_radix2_q15(
const arm_cfft_radix2_instance_q15 * S,
- q15_t * pSrc)
+ q15_t * pSrc)
{
if (S->ifftFlag == 1U)
{
- arm_radix2_butterfly_inverse_q15(pSrc, S->fftLen,
- S->pTwiddle, S->twidCoefModifier);
+ arm_radix2_butterfly_inverse_q15 (pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
}
else
{
- arm_radix2_butterfly_q15(pSrc, S->fftLen,
- S->pTwiddle, S->twidCoefModifier);
+ arm_radix2_butterfly_q15 (pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
}
arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
}
/**
- * @} end of ComplexFFT group
+ @} end of ComplexFFT group
*/
void arm_radix2_butterfly_q15(
- q15_t * pSrc,
- uint32_t fftLen,
- q15_t * pCoef,
- uint16_t twidCoefModifier)
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pCoef,
+ uint16_t twidCoefModifier)
{
#if defined (ARM_MATH_DSP)
- unsigned i, j, k, l;
- unsigned n1, n2, ia;
+ uint32_t i, j, k, l;
+ uint32_t n1, n2, ia;
q15_t in;
q31_t T, S, R;
q31_t coeff, out1, out2;
@@ -111,215 +108,196 @@ void arm_radix2_butterfly_q15(
// loop for groups
for (i = 0; i < n2; i++)
{
- coeff = _SIMD32_OFFSET(pCoef + (ia * 2U));
+ coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
ia = ia + twidCoefModifier;
l = i + n2;
- T = _SIMD32_OFFSET(pSrc + (2 * i));
+ T = read_q15x2 (pSrc + (2 * i));
in = ((int16_t) (T & 0xFFFF)) >> 1;
T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF);
- S = _SIMD32_OFFSET(pSrc + (2 * l));
+ S = read_q15x2 (pSrc + (2 * l));
in = ((int16_t) (S & 0xFFFF)) >> 1;
S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF);
R = __QSUB16(T, S);
- _SIMD32_OFFSET(pSrc + (2 * i)) = __SHADD16(T, S);
+ write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
#ifndef ARM_MATH_BIG_ENDIAN
-
out1 = __SMUAD(coeff, R) >> 16;
out2 = __SMUSDX(coeff, R);
-
#else
-
out1 = __SMUSDX(R, coeff) >> 16U;
out2 = __SMUAD(coeff, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-#endif // #ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
- _SIMD32_OFFSET(pSrc + (2U * l)) =
- (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
-
- coeff = _SIMD32_OFFSET(pCoef + (ia * 2U));
+ coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
ia = ia + twidCoefModifier;
- // loop for butterfly
+ /* loop for butterfly */
i++;
l++;
- T = _SIMD32_OFFSET(pSrc + (2 * i));
+ T = read_q15x2 (pSrc + (2 * i));
in = ((int16_t) (T & 0xFFFF)) >> 1;
T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF);
- S = _SIMD32_OFFSET(pSrc + (2 * l));
+ S = read_q15x2 (pSrc + (2 * l));
in = ((int16_t) (S & 0xFFFF)) >> 1;
S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF);
R = __QSUB16(T, S);
- _SIMD32_OFFSET(pSrc + (2 * i)) = __SHADD16(T, S);
+ write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
#ifndef ARM_MATH_BIG_ENDIAN
-
out1 = __SMUAD(coeff, R) >> 16;
out2 = __SMUSDX(coeff, R);
-
#else
out1 = __SMUSDX(R, coeff) >> 16U;
out2 = __SMUAD(coeff, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-#endif // #ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
- _SIMD32_OFFSET(pSrc + (2U * l)) =
- (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
-
- } // groups loop end
+ } /* groups loop end */
twidCoefModifier = twidCoefModifier << 1U;
- // loop for stage
+ /* loop for stage */
for (k = fftLen / 2; k > 2; k = k >> 1)
{
n1 = n2;
n2 = n2 >> 1;
ia = 0;
- // loop for groups
+ /* loop for groups */
for (j = 0; j < n2; j++)
{
- coeff = _SIMD32_OFFSET(pCoef + (ia * 2U));
+ coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
ia = ia + twidCoefModifier;
- // loop for butterfly
+ /* loop for butterfly */
for (i = j; i < fftLen; i += n1)
{
l = i + n2;
- T = _SIMD32_OFFSET(pSrc + (2 * i));
+ T = read_q15x2 (pSrc + (2 * i));
- S = _SIMD32_OFFSET(pSrc + (2 * l));
+ S = read_q15x2 (pSrc + (2 * l));
R = __QSUB16(T, S);
- _SIMD32_OFFSET(pSrc + (2 * i)) = __SHADD16(T, S);
+ write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
#ifndef ARM_MATH_BIG_ENDIAN
-
out1 = __SMUAD(coeff, R) >> 16;
out2 = __SMUSDX(coeff, R);
-
#else
-
out1 = __SMUSDX(R, coeff) >> 16U;
out2 = __SMUAD(coeff, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-#endif // #ifndef ARM_MATH_BIG_ENDIAN
-
- _SIMD32_OFFSET(pSrc + (2U * l)) =
- (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+ write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
i += n1;
l = i + n2;
- T = _SIMD32_OFFSET(pSrc + (2 * i));
+ T = read_q15x2 (pSrc + (2 * i));
- S = _SIMD32_OFFSET(pSrc + (2 * l));
+ S = read_q15x2 (pSrc + (2 * l));
R = __QSUB16(T, S);
- _SIMD32_OFFSET(pSrc + (2 * i)) = __SHADD16(T, S);
+ write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
#ifndef ARM_MATH_BIG_ENDIAN
-
out1 = __SMUAD(coeff, R) >> 16;
out2 = __SMUSDX(coeff, R);
-
#else
-
out1 = __SMUSDX(R, coeff) >> 16U;
out2 = __SMUAD(coeff, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-#endif // #ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
- _SIMD32_OFFSET(pSrc + (2U * l)) =
- (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+ } /* butterfly loop end */
- } // butterfly loop end
-
- } // groups loop end
+ } /* groups loop end */
twidCoefModifier = twidCoefModifier << 1U;
- } // stages loop end
+ } /* stages loop end */
n1 = n2;
n2 = n2 >> 1;
ia = 0;
- coeff = _SIMD32_OFFSET(pCoef + (ia * 2U));
+ coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
ia = ia + twidCoefModifier;
- // loop for butterfly
+ /* loop for butterfly */
for (i = 0; i < fftLen; i += n1)
{
l = i + n2;
- T = _SIMD32_OFFSET(pSrc + (2 * i));
+ T = read_q15x2 (pSrc + (2 * i));
- S = _SIMD32_OFFSET(pSrc + (2 * l));
+ S = read_q15x2 (pSrc + (2 * l));
R = __QSUB16(T, S);
- _SIMD32_OFFSET(pSrc + (2 * i)) = __QADD16(T, S);
+ write_q15x2 (pSrc + (2 * i), __QADD16(T, S));
- _SIMD32_OFFSET(pSrc + (2U * l)) = R;
+ write_q15x2 (pSrc + (2 * l), R);
i += n1;
l = i + n2;
- T = _SIMD32_OFFSET(pSrc + (2 * i));
+ T = read_q15x2 (pSrc + (2 * i));
- S = _SIMD32_OFFSET(pSrc + (2 * l));
+ S = read_q15x2 (pSrc + (2 * l));
R = __QSUB16(T, S);
- _SIMD32_OFFSET(pSrc + (2 * i)) = __QADD16(T, S);
+ write_q15x2 (pSrc + (2 * i), __QADD16(T, S));
- _SIMD32_OFFSET(pSrc + (2U * l)) = R;
+ write_q15x2 (pSrc + (2 * l), R);
- } // groups loop end
+ } /* groups loop end */
-#else
+#else /* #if defined (ARM_MATH_DSP) */
- unsigned i, j, k, l;
- unsigned n1, n2, ia;
+ uint32_t i, j, k, l;
+ uint32_t n1, n2, ia;
q15_t xt, yt, cosVal, sinVal;
- //N = fftLen;
+ // N = fftLen;
n2 = fftLen;
n1 = n2;
n2 = n2 >> 1;
ia = 0;
- // loop for groups
+ /* loop for groups */
for (j = 0; j < n2; j++)
{
- cosVal = pCoef[ia * 2];
+ cosVal = pCoef[(ia * 2)];
sinVal = pCoef[(ia * 2) + 1];
ia = ia + twidCoefModifier;
- // loop for butterfly
+ /* loop for butterfly */
for (i = j; i < fftLen; i += n1)
{
l = i + n2;
@@ -327,36 +305,36 @@ void arm_radix2_butterfly_q15(
pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
- pSrc[2 * i + 1] =
- ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
+ pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) +
+ (pSrc[2 * i + 1] >> 1U) ) >> 1U;
- pSrc[2U * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) +
- ((int16_t) (((q31_t) yt * sinVal) >> 16)));
+ pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) +
+ ((int16_t) (((q31_t) yt * sinVal) >> 16)));
- pSrc[2U * l + 1U] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) -
- ((int16_t) (((q31_t) xt * sinVal) >> 16)));
+ pSrc[2U * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) -
+ ((int16_t) (((q31_t) xt * sinVal) >> 16)));
- } // butterfly loop end
+ } /* butterfly loop end */
- } // groups loop end
+ } /* groups loop end */
twidCoefModifier = twidCoefModifier << 1U;
- // loop for stage
+ /* loop for stage */
for (k = fftLen / 2; k > 2; k = k >> 1)
{
n1 = n2;
n2 = n2 >> 1;
ia = 0;
- // loop for groups
+ /* loop for groups */
for (j = 0; j < n2; j++)
{
cosVal = pCoef[ia * 2];
sinVal = pCoef[(ia * 2) + 1];
ia = ia + twidCoefModifier;
- // loop for butterfly
+ /* loop for butterfly */
for (i = j; i < fftLen; i += n1)
{
l = i + n2;
@@ -366,24 +344,24 @@ void arm_radix2_butterfly_q15(
yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U;
- pSrc[2U * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) +
- ((int16_t) (((q31_t) yt * sinVal) >> 16)));
+ pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) +
+ ((int16_t) (((q31_t) yt * sinVal) >> 16)));
- pSrc[2U * l + 1U] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) -
- ((int16_t) (((q31_t) xt * sinVal) >> 16)));
+ pSrc[2U * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) -
+ ((int16_t) (((q31_t) xt * sinVal) >> 16)));
- } // butterfly loop end
+ } /* butterfly loop end */
- } // groups loop end
+ } /* groups loop end */
twidCoefModifier = twidCoefModifier << 1U;
- } // stages loop end
+ } /* stages loop end */
n1 = n2;
n2 = n2 >> 1;
ia = 0;
- // loop for groups
+ /* loop for groups */
for (j = 0; j < n2; j++)
{
cosVal = pCoef[ia * 2];
@@ -391,7 +369,7 @@ void arm_radix2_butterfly_q15(
ia = ia + twidCoefModifier;
- // loop for butterfly
+ /* loop for butterfly */
for (i = j; i < fftLen; i += n1)
{
l = i + n2;
@@ -401,244 +379,226 @@ void arm_radix2_butterfly_q15(
yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
- pSrc[2U * l] = xt;
+ pSrc[2 * l] = xt;
- pSrc[2U * l + 1U] = yt;
+ pSrc[2 * l + 1] = yt;
- } // butterfly loop end
+ } /* butterfly loop end */
- } // groups loop end
+ } /* groups loop end */
twidCoefModifier = twidCoefModifier << 1U;
-#endif // #if defined (ARM_MATH_DSP)
+#endif /* #if defined (ARM_MATH_DSP) */
}
void arm_radix2_butterfly_inverse_q15(
- q15_t * pSrc,
- uint32_t fftLen,
- q15_t * pCoef,
- uint16_t twidCoefModifier)
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pCoef,
+ uint16_t twidCoefModifier)
{
#if defined (ARM_MATH_DSP)
- unsigned i, j, k, l;
- unsigned n1, n2, ia;
- q15_t in;
- q31_t T, S, R;
- q31_t coeff, out1, out2;
+ uint32_t i, j, k, l;
+ uint32_t n1, n2, ia;
+ q15_t in;
+ q31_t T, S, R;
+ q31_t coeff, out1, out2;
- //N = fftLen;
+ // N = fftLen;
n2 = fftLen;
n1 = n2;
n2 = n2 >> 1;
ia = 0;
- // loop for groups
+ /* loop for groups */
for (i = 0; i < n2; i++)
{
- coeff = _SIMD32_OFFSET(pCoef + (ia * 2U));
+ coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
ia = ia + twidCoefModifier;
l = i + n2;
- T = _SIMD32_OFFSET(pSrc + (2 * i));
+ T = read_q15x2 (pSrc + (2 * i));
in = ((int16_t) (T & 0xFFFF)) >> 1;
T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF);
- S = _SIMD32_OFFSET(pSrc + (2 * l));
+ S = read_q15x2 (pSrc + (2 * l));
in = ((int16_t) (S & 0xFFFF)) >> 1;
S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF);
R = __QSUB16(T, S);
- _SIMD32_OFFSET(pSrc + (2 * i)) = __SHADD16(T, S);
+ write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
#ifndef ARM_MATH_BIG_ENDIAN
-
out1 = __SMUSD(coeff, R) >> 16;
out2 = __SMUADX(coeff, R);
#else
-
out1 = __SMUADX(R, coeff) >> 16U;
out2 = __SMUSD(__QSUB(0, coeff), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-#endif // #ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
- _SIMD32_OFFSET(pSrc + (2U * l)) =
- (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
-
- coeff = _SIMD32_OFFSET(pCoef + (ia * 2U));
+ coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
ia = ia + twidCoefModifier;
- // loop for butterfly
+ /* loop for butterfly */
i++;
l++;
- T = _SIMD32_OFFSET(pSrc + (2 * i));
+ T = read_q15x2 (pSrc + (2 * i));
in = ((int16_t) (T & 0xFFFF)) >> 1;
T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF);
- S = _SIMD32_OFFSET(pSrc + (2 * l));
+ S = read_q15x2 (pSrc + (2 * l));
in = ((int16_t) (S & 0xFFFF)) >> 1;
S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF);
R = __QSUB16(T, S);
- _SIMD32_OFFSET(pSrc + (2 * i)) = __SHADD16(T, S);
+ write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
#ifndef ARM_MATH_BIG_ENDIAN
-
out1 = __SMUSD(coeff, R) >> 16;
out2 = __SMUADX(coeff, R);
#else
-
out1 = __SMUADX(R, coeff) >> 16U;
out2 = __SMUSD(__QSUB(0, coeff), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-#endif // #ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
- _SIMD32_OFFSET(pSrc + (2U * l)) =
- (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
-
- } // groups loop end
+ } /* groups loop end */
twidCoefModifier = twidCoefModifier << 1U;
- // loop for stage
+ /* loop for stage */
for (k = fftLen / 2; k > 2; k = k >> 1)
{
n1 = n2;
n2 = n2 >> 1;
ia = 0;
- // loop for groups
+ /* loop for groups */
for (j = 0; j < n2; j++)
{
- coeff = _SIMD32_OFFSET(pCoef + (ia * 2U));
+ coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
ia = ia + twidCoefModifier;
- // loop for butterfly
+ /* loop for butterfly */
for (i = j; i < fftLen; i += n1)
{
l = i + n2;
- T = _SIMD32_OFFSET(pSrc + (2 * i));
+ T = read_q15x2 (pSrc + (2 * i));
- S = _SIMD32_OFFSET(pSrc + (2 * l));
+ S = read_q15x2 (pSrc + (2 * l));
R = __QSUB16(T, S);
- _SIMD32_OFFSET(pSrc + (2 * i)) = __SHADD16(T, S);
+ write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
#ifndef ARM_MATH_BIG_ENDIAN
-
out1 = __SMUSD(coeff, R) >> 16;
out2 = __SMUADX(coeff, R);
-
#else
-
out1 = __SMUADX(R, coeff) >> 16U;
out2 = __SMUSD(__QSUB(0, coeff), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-#endif // #ifndef ARM_MATH_BIG_ENDIAN
-
- _SIMD32_OFFSET(pSrc + (2U * l)) =
- (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+ write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
i += n1;
l = i + n2;
- T = _SIMD32_OFFSET(pSrc + (2 * i));
+ T = read_q15x2 (pSrc + (2 * i));
- S = _SIMD32_OFFSET(pSrc + (2 * l));
+ S = read_q15x2 (pSrc + (2 * l));
R = __QSUB16(T, S);
- _SIMD32_OFFSET(pSrc + (2 * i)) = __SHADD16(T, S);
+ write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
#ifndef ARM_MATH_BIG_ENDIAN
-
out1 = __SMUSD(coeff, R) >> 16;
out2 = __SMUADX(coeff, R);
#else
-
out1 = __SMUADX(R, coeff) >> 16U;
out2 = __SMUSD(__QSUB(0, coeff), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-#endif // #ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
- _SIMD32_OFFSET(pSrc + (2U * l)) =
- (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+ } /* butterfly loop end */
- } // butterfly loop end
-
- } // groups loop end
+ } /* groups loop end */
twidCoefModifier = twidCoefModifier << 1U;
- } // stages loop end
+ } /* stages loop end */
n1 = n2;
n2 = n2 >> 1;
ia = 0;
- // loop for groups
+ /* loop for groups */
for (j = 0; j < n2; j++)
{
- coeff = _SIMD32_OFFSET(pCoef + (ia * 2U));
+ coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
ia = ia + twidCoefModifier;
- // loop for butterfly
+ /* loop for butterfly */
for (i = j; i < fftLen; i += n1)
{
l = i + n2;
- T = _SIMD32_OFFSET(pSrc + (2 * i));
+ T = read_q15x2 (pSrc + (2 * i));
- S = _SIMD32_OFFSET(pSrc + (2 * l));
+ S = read_q15x2 (pSrc + (2 * l));
R = __QSUB16(T, S);
- _SIMD32_OFFSET(pSrc + (2 * i)) = __QADD16(T, S);
+ write_q15x2 (pSrc + (2 * i), __QADD16(T, S));
- _SIMD32_OFFSET(pSrc + (2U * l)) = R;
+ write_q15x2 (pSrc + (2 * l), R);
- } // butterfly loop end
+ } /* butterfly loop end */
- } // groups loop end
+ } /* groups loop end */
twidCoefModifier = twidCoefModifier << 1U;
-#else
+#else /* #if defined (ARM_MATH_DSP) */
+ uint32_t i, j, k, l;
+ uint32_t n1, n2, ia;
+ q15_t xt, yt, cosVal, sinVal;
- unsigned i, j, k, l;
- unsigned n1, n2, ia;
- q15_t xt, yt, cosVal, sinVal;
-
- //N = fftLen;
+ // N = fftLen;
n2 = fftLen;
n1 = n2;
n2 = n2 >> 1;
ia = 0;
- // loop for groups
+ /* loop for groups */
for (j = 0; j < n2; j++)
{
- cosVal = pCoef[ia * 2];
+ cosVal = pCoef[(ia * 2)];
sinVal = pCoef[(ia * 2) + 1];
ia = ia + twidCoefModifier;
- // loop for butterfly
+ /* loop for butterfly */
for (i = j; i < fftLen; i += n1)
{
l = i + n2;
@@ -646,36 +606,36 @@ void arm_radix2_butterfly_inverse_q15(
pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
- pSrc[2 * i + 1] =
- ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
+ pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) +
+ (pSrc[2 * i + 1] >> 1U) ) >> 1U;
- pSrc[2U * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) -
- ((int16_t) (((q31_t) yt * sinVal) >> 16)));
+ pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) -
+ ((int16_t) (((q31_t) yt * sinVal) >> 16)));
- pSrc[2U * l + 1U] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) +
- ((int16_t) (((q31_t) xt * sinVal) >> 16)));
+ pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) +
+ ((int16_t) (((q31_t) xt * sinVal) >> 16)));
- } // butterfly loop end
+ } /* butterfly loop end */
- } // groups loop end
+ } /* groups loop end */
twidCoefModifier = twidCoefModifier << 1U;
- // loop for stage
+ /* loop for stage */
for (k = fftLen / 2; k > 2; k = k >> 1)
{
n1 = n2;
n2 = n2 >> 1;
ia = 0;
- // loop for groups
+ /* loop for groups */
for (j = 0; j < n2; j++)
{
- cosVal = pCoef[ia * 2];
+ cosVal = pCoef[(ia * 2)];
sinVal = pCoef[(ia * 2) + 1];
ia = ia + twidCoefModifier;
- // loop for butterfly
+ /* loop for butterfly */
for (i = j; i < fftLen; i += n1)
{
l = i + n2;
@@ -685,29 +645,29 @@ void arm_radix2_butterfly_inverse_q15(
yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U;
- pSrc[2U * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) -
- ((int16_t) (((q31_t) yt * sinVal) >> 16)));
+ pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) -
+ ((int16_t) (((q31_t) yt * sinVal) >> 16)) );
- pSrc[2U * l + 1U] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) +
- ((int16_t) (((q31_t) xt * sinVal) >> 16)));
+ pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) +
+ ((int16_t) (((q31_t) xt * sinVal) >> 16)) );
- } // butterfly loop end
+ } /* butterfly loop end */
- } // groups loop end
+ } /* groups loop end */
twidCoefModifier = twidCoefModifier << 1U;
- } // stages loop end
+ } /* stages loop end */
n1 = n2;
n2 = n2 >> 1;
ia = 0;
- cosVal = pCoef[ia * 2];
+ cosVal = pCoef[(ia * 2)];
sinVal = pCoef[(ia * 2) + 1];
ia = ia + twidCoefModifier;
- // loop for butterfly
+ /* loop for butterfly */
for (i = 0; i < fftLen; i += n1)
{
l = i + n2;
@@ -717,13 +677,13 @@ void arm_radix2_butterfly_inverse_q15(
yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
- pSrc[2U * l] = xt;
+ pSrc[2 * l] = xt;
- pSrc[2U * l + 1U] = yt;
+ pSrc[2 * l + 1] = yt;
- } // groups loop end
+ } /* groups loop end */
-#endif // #if defined (ARM_MATH_DSP)
+#endif /* #if defined (ARM_MATH_DSP) */
}
diff --git a/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c b/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c
index c9b1537..6c79a65 100644
--- a/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c
+++ b/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c
@@ -3,13 +3,13 @@
* Title: arm_cfft_radix2_q31.c
* Description: Radix-2 Decimation in Frequency CFFT & CIFFT Fixed point processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,44 +29,43 @@
#include "arm_math.h"
void arm_radix2_butterfly_q31(
- q31_t * pSrc,
- uint32_t fftLen,
- q31_t * pCoef,
- uint16_t twidCoefModifier);
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint16_t twidCoefModifier);
void arm_radix2_butterfly_inverse_q31(
- q31_t * pSrc,
- uint32_t fftLen,
- q31_t * pCoef,
- uint16_t twidCoefModifier);
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint16_t twidCoefModifier);
void arm_bitreversal_q31(
- q31_t * pSrc,
- uint32_t fftLen,
- uint16_t bitRevFactor,
- uint16_t * pBitRevTab);
+ q31_t * pSrc,
+ uint32_t fftLen,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab);
/**
-* @ingroup groupTransforms
-*/
+ @ingroup groupTransforms
+ */
/**
-* @addtogroup ComplexFFT
-* @{
-*/
+ @addtogroup ComplexFFT
+ @{
+ */
/**
-* @details
-* @brief Processing function for the fixed-point CFFT/CIFFT.
-* @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed
-* @param[in] *S points to an instance of the fixed-point CFFT/CIFFT structure.
-* @param[in, out] *pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
-* @return none.
-*/
+ @brief Processing function for the fixed-point CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
+ @param[in] S points to an instance of the fixed-point CFFT/CIFFT structure
+ @param[in,out] pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @return none
+ */
void arm_cfft_radix2_q31(
-const arm_cfft_radix2_instance_q31 * S,
-q31_t * pSrc)
+ const arm_cfft_radix2_instance_q31 * S,
+ q31_t * pSrc)
{
if (S->ifftFlag == 1U)
@@ -84,14 +83,14 @@ q31_t * pSrc)
}
/**
-* @} end of ComplexFFT group
-*/
+ @} end of ComplexFFT group
+ */
void arm_radix2_butterfly_q31(
-q31_t * pSrc,
-uint32_t fftLen,
-q31_t * pCoef,
-uint16_t twidCoefModifier)
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint16_t twidCoefModifier)
{
unsigned i, j, k, l, m;
@@ -216,10 +215,10 @@ uint16_t twidCoefModifier)
void arm_radix2_butterfly_inverse_q31(
-q31_t * pSrc,
-uint32_t fftLen,
-q31_t * pCoef,
-uint16_t twidCoefModifier)
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint16_t twidCoefModifier)
{
unsigned i, j, k, l;
diff --git a/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c b/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c
index d6f66ae..9629145 100644
--- a/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c
+++ b/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c
@@ -3,13 +3,13 @@
* Title: arm_cfft_radix4_f32.c
* Description: Radix-4 Decimation in Frequency CFFT & CIFFT Floating point processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,47 +29,45 @@
#include "arm_math.h"
extern void arm_bitreversal_f32(
-float32_t * pSrc,
-uint16_t fftSize,
-uint16_t bitRevFactor,
-uint16_t * pBitRevTab);
+ float32_t * pSrc,
+ uint16_t fftSize,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab);
void arm_radix4_butterfly_f32(
-float32_t * pSrc,
-uint16_t fftLen,
-float32_t * pCoef,
-uint16_t twidCoefModifier);
+ float32_t * pSrc,
+ uint16_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier);
void arm_radix4_butterfly_inverse_f32(
-float32_t * pSrc,
-uint16_t fftLen,
-float32_t * pCoef,
-uint16_t twidCoefModifier,
-float32_t onebyfftLen);
+ float32_t * pSrc,
+ uint16_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier,
+ float32_t onebyfftLen);
/**
-* @ingroup groupTransforms
-*/
+ @ingroup groupTransforms
+ */
/**
-* @addtogroup ComplexFFT
-* @{
-*/
+ @addtogroup ComplexFFT
+ @{
+ */
/**
-* @details
-* @brief Processing function for the floating-point Radix-4 CFFT/CIFFT.
-* @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed
-* in the future.
-* @param[in] *S points to an instance of the floating-point Radix-4 CFFT/CIFFT structure.
-* @param[in, out] *pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
-* @return none.
-*/
+ @brief Processing function for the floating-point Radix-4 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future.
+ @param[in] S points to an instance of the floating-point Radix-4 CFFT/CIFFT structure
+ @param[in,out] pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @return none
+ */
void arm_cfft_radix4_f32(
const arm_cfft_radix4_instance_f32 * S,
- float32_t * pSrc)
+ float32_t * pSrc)
{
if (S->ifftFlag == 1U)
{
@@ -91,46 +89,43 @@ void arm_cfft_radix4_f32(
}
/**
-* @} end of ComplexFFT group
-*/
+ @} end of ComplexFFT group
+ */
/* ----------------------------------------------------------------------
* Internal helper function used by the FFTs
* ---------------------------------------------------------------------- */
-/*
-* @brief Core function for the floating-point CFFT butterfly process.
-* @param[in, out] *pSrc points to the in-place buffer of floating-point data type.
-* @param[in] fftLen length of the FFT.
-* @param[in] *pCoef points to the twiddle coefficient buffer.
-* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
-* @return none.
-*/
+/**
+ brief Core function for the floating-point CFFT butterfly process.
+ param[in,out] pSrc points to the in-place buffer of floating-point data type
+ param[in] fftLen length of the FFT
+ param[in] pCoef points to the twiddle coefficient buffer
+ param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+ return none
+ */
void arm_radix4_butterfly_f32(
-float32_t * pSrc,
-uint16_t fftLen,
-float32_t * pCoef,
-uint16_t twidCoefModifier)
+ float32_t * pSrc,
+ uint16_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier)
{
+ float32_t co1, co2, co3, si1, si2, si3;
+ uint32_t ia1, ia2, ia3;
+ uint32_t i0, i1, i2, i3;
+ uint32_t n1, n2, j, k;
- float32_t co1, co2, co3, si1, si2, si3;
- uint32_t ia1, ia2, ia3;
- uint32_t i0, i1, i2, i3;
- uint32_t n1, n2, j, k;
+#if defined (ARM_MATH_LOOPUNROLL)
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- float32_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn;
- float32_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc,
- Ybminusd;
- float32_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out;
- float32_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out;
- float32_t *ptr1;
- float32_t p0,p1,p2,p3,p4,p5;
- float32_t a0,a1,a2,a3,a4,a5,a6,a7;
+ float32_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn;
+ float32_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc,
+ Ybminusd;
+ float32_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out;
+ float32_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out;
+ float32_t *ptr1;
+ float32_t p0,p1,p2,p3,p4,p5;
+ float32_t a0,a1,a2,a3,a4,a5,a6,a7;
/* Initializations for the first stage */
n2 = fftLen;
@@ -290,11 +285,11 @@ uint16_t twidCoefModifier)
/* index calculation for the coefficients */
ia2 = ia1 + ia1;
ia3 = ia2 + ia1;
- co1 = pCoef[ia1 * 2U];
+ co1 = pCoef[(ia1 * 2U)];
si1 = pCoef[(ia1 * 2U) + 1U];
- co2 = pCoef[ia2 * 2U];
+ co2 = pCoef[(ia2 * 2U)];
si2 = pCoef[(ia2 * 2U) + 1U];
- co3 = pCoef[ia3 * 2U];
+ co3 = pCoef[(ia3 * 2U)];
si3 = pCoef[(ia3 * 2U) + 1U];
/* Twiddle coefficients index modifier */
@@ -484,11 +479,9 @@ uint16_t twidCoefModifier)
#else
- float32_t t1, t2, r1, r2, s1, s2;
+ float32_t t1, t2, r1, r2, s1, s2;
- /* Run the below code for Cortex-M0 */
-
- /* Initializations for the fft calculation */
+ /* Initializations for the fft calculation */
n2 = fftLen;
n1 = n2;
for (k = fftLen; k > 1U; k >>= 2U)
@@ -597,42 +590,42 @@ uint16_t twidCoefModifier)
twidCoefModifier <<= 2U;
}
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
}
-/*
-* @brief Core function for the floating-point CIFFT butterfly process.
-* @param[in, out] *pSrc points to the in-place buffer of floating-point data type.
-* @param[in] fftLen length of the FFT.
-* @param[in] *pCoef points to twiddle coefficient buffer.
-* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
-* @param[in] onebyfftLen value of 1/fftLen.
-* @return none.
-*/
+/**
+ brief Core function for the floating-point CIFFT butterfly process.
+ param[in,out] pSrc points to the in-place buffer of floating-point data type
+ param[in] fftLen length of the FFT
+ param[in] pCoef points to twiddle coefficient buffer
+ param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+ param[in] onebyfftLen value of 1/fftLen
+ return none
+ */
void arm_radix4_butterfly_inverse_f32(
-float32_t * pSrc,
-uint16_t fftLen,
-float32_t * pCoef,
-uint16_t twidCoefModifier,
-float32_t onebyfftLen)
+ float32_t * pSrc,
+ uint16_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier,
+ float32_t onebyfftLen)
{
- float32_t co1, co2, co3, si1, si2, si3;
- uint32_t ia1, ia2, ia3;
- uint32_t i0, i1, i2, i3;
- uint32_t n1, n2, j, k;
+ float32_t co1, co2, co3, si1, si2, si3;
+ uint32_t ia1, ia2, ia3;
+ uint32_t i0, i1, i2, i3;
+ uint32_t n1, n2, j, k;
-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)
- float32_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn;
- float32_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc,
- Ybminusd;
- float32_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out;
- float32_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out;
- float32_t *ptr1;
- float32_t p0,p1,p2,p3,p4,p5,p6,p7;
- float32_t a0,a1,a2,a3,a4,a5,a6,a7;
+ float32_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn;
+ float32_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc,
+ Ybminusd;
+ float32_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out;
+ float32_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out;
+ float32_t *ptr1;
+ float32_t p0,p1,p2,p3,p4,p5,p6,p7;
+ float32_t a0,a1,a2,a3,a4,a5,a6,a7;
/* Initializations for the first stage */
@@ -1008,9 +1001,7 @@ float32_t onebyfftLen)
#else
- float32_t t1, t2, r1, r2, s1, s2;
-
- /* Run the below code for Cortex-M0 */
+ float32_t t1, t2, r1, r2, s1, s2;
/* Initializations for the first stage */
n2 = fftLen;
@@ -1203,7 +1194,7 @@ float32_t onebyfftLen)
pSrc[(2U * i3) + 1U] = s2 * onebyfftLen;
}
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
}
diff --git a/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c b/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c
index 14ea487..930c2c1 100644
--- a/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c
+++ b/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c
@@ -3,13 +3,13 @@
* Title: arm_cfft_radix4_init_f32.c
* Description: Radix-4 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -30,36 +30,40 @@
#include "arm_common_tables.h"
/**
- * @ingroup groupTransforms
+ @ingroup groupTransforms
*/
/**
- * @addtogroup ComplexFFT
- * @{
+ @addtogroup ComplexFFT
+ @{
*/
/**
-* @brief Initialization function for the floating-point CFFT/CIFFT.
-* @deprecated Do not use this function. It has been superceded by \ref arm_cfft_f32 and will be removed
-* in the future.
-* @param[in,out] *S points to an instance of the floating-point CFFT/CIFFT structure.
-* @param[in] fftLen length of the FFT.
-* @param[in] ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
-* @param[in] bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
-* @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if fftLen is not a supported value.
-*
-* \par Description:
-* \par
-* The parameter ifftFlag controls whether a forward or inverse transform is computed.
-* Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
-* \par
-* The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
-* Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
-* \par
-* The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
-* \par
-* This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
-*/
+ @brief Initialization function for the floating-point CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superceded by \ref arm_cfft_f32 and will be removed in the future.
+ @param[in,out] S points to an instance of the floating-point CFFT/CIFFT structure
+ @param[in] fftLen length of the FFT
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Details
+ The parameter ifftFlag controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+ @par
+ The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+ */
arm_status arm_cfft_radix4_init_f32(
arm_cfft_radix4_instance_f32 * S,
@@ -148,5 +152,5 @@ arm_status arm_cfft_radix4_init_f32(
}
/**
- * @} end of ComplexFFT group
+ @} end of ComplexFFT group
*/
diff --git a/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c b/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c
index ed78236..0090688 100644
--- a/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c
+++ b/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c
@@ -3,13 +3,13 @@
* Title: arm_cfft_radix4_init_q15.c
* Description: Radix-4 Decimation in Frequency Q15 FFT & IFFT initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -30,37 +30,42 @@
#include "arm_common_tables.h"
/**
- * @ingroup groupTransforms
+ @ingroup groupTransforms
*/
/**
- * @addtogroup ComplexFFT
- * @{
+ @addtogroup ComplexFFT
+ @{
*/
/**
-* @brief Initialization function for the Q15 CFFT/CIFFT.
-* @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed
-* @param[in,out] *S points to an instance of the Q15 CFFT/CIFFT structure.
-* @param[in] fftLen length of the FFT.
-* @param[in] ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
-* @param[in] bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
-* @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if fftLen is not a supported value.
-*
-* \par Description:
-* \par
-* The parameter ifftFlag controls whether a forward or inverse transform is computed.
-* Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
-* \par
-* The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
-* Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
-* \par
-* The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
-* \par
-* This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
-*/
+ @brief Initialization function for the Q15 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
+ @param[in,out] S points to an instance of the Q15 CFFT/CIFFT structure
+ @param[in] fftLen length of the FFT
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Details
+ The parameter ifftFlag controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+ @par
+ The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+ */
arm_status arm_cfft_radix4_init_q15(
arm_cfft_radix4_instance_q15 * S,
@@ -136,5 +141,5 @@ arm_status arm_cfft_radix4_init_q15(
}
/**
- * @} end of ComplexFFT group
+ @} end of ComplexFFT group
*/
diff --git a/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c b/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c
index 6f11763..17d16b7 100644
--- a/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c
+++ b/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c
@@ -3,13 +3,13 @@
* Title: arm_cfft_radix4_init_q31.c
* Description: Radix-4 Decimation in Frequency Q31 FFT & IFFT initialization function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -30,35 +30,40 @@
#include "arm_common_tables.h"
/**
- * @ingroup groupTransforms
+ @ingroup groupTransforms
*/
/**
- * @addtogroup ComplexFFT
- * @{
+ @addtogroup ComplexFFT
+ @{
*/
/**
-*
-* @brief Initialization function for the Q31 CFFT/CIFFT.
-* @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed
-* @param[in,out] *S points to an instance of the Q31 CFFT/CIFFT structure.
-* @param[in] fftLen length of the FFT.
-* @param[in] ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
-* @param[in] bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
-* @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if fftLen is not a supported value.
-*
-* \par Description:
-* \par
-* The parameter ifftFlag controls whether a forward or inverse transform is computed.
-* Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
-* \par
-* The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
-* Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
-* \par
-* The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
-* \par
-* This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+
+ @brief Initialization function for the Q31 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
+ @param[in,out] S points to an instance of the Q31 CFFT/CIFFT structure.
+ @param[in] fftLen length of the FFT.
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Details
+ The parameter ifftFlag controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+ @par
+ The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
*/
arm_status arm_cfft_radix4_init_q31(
@@ -132,5 +137,5 @@ arm_status arm_cfft_radix4_init_q31(
}
/**
- * @} end of ComplexFFT group
+ @} end of ComplexFFT group
*/
diff --git a/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c b/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c
index f3451f7..b4cabb1 100644
--- a/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c
+++ b/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c
@@ -4,13 +4,13 @@
* Description: This file has function definition of Radix-4 FFT & IFFT function and
* In-place bit reversal using bit reversal table
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -31,54 +31,52 @@
void arm_radix4_butterfly_q15(
- q15_t * pSrc16,
- uint32_t fftLen,
- q15_t * pCoef16,
- uint32_t twidCoefModifier);
+ q15_t * pSrc16,
+ uint32_t fftLen,
+ const q15_t * pCoef16,
+ uint32_t twidCoefModifier);
void arm_radix4_butterfly_inverse_q15(
- q15_t * pSrc16,
- uint32_t fftLen,
- q15_t * pCoef16,
- uint32_t twidCoefModifier);
+ q15_t * pSrc16,
+ uint32_t fftLen,
+ const q15_t * pCoef16,
+ uint32_t twidCoefModifier);
void arm_bitreversal_q15(
- q15_t * pSrc,
- uint32_t fftLen,
- uint16_t bitRevFactor,
- uint16_t * pBitRevTab);
+ q15_t * pSrc,
+ uint32_t fftLen,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab);
/**
- * @ingroup groupTransforms
+ @ingroup groupTransforms
*/
/**
- * @addtogroup ComplexFFT
- * @{
+ @addtogroup ComplexFFT
+ @{
*/
/**
- * @details
- * @brief Processing function for the Q15 CFFT/CIFFT.
- * @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed
- * @param[in] *S points to an instance of the Q15 CFFT/CIFFT structure.
- * @param[in, out] *pSrc points to the complex data buffer. Processing occurs in-place.
- * @return none.
- *
- * \par Input and output formats:
- * \par
- * Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
- * Hence the output format is different for different FFT sizes.
- * The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
- * \par
- * \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"
- * \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"
+ @brief Processing function for the Q15 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
+ @param[in] S points to an instance of the Q15 CFFT/CIFFT structure.
+ @param[in,out] pSrc points to the complex data buffer. Processing occurs in-place.
+ @return none
+
+ @par Input and output formats:
+ Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
+ Hence the output format is different for different FFT sizes.
+ The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
+ @par
+ \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"
+ \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"
*/
void arm_cfft_radix4_q15(
const arm_cfft_radix4_instance_q15 * S,
- q15_t * pSrc)
+ q15_t * pSrc)
{
if (S->ifftFlag == 1U)
{
@@ -100,74 +98,72 @@ void arm_cfft_radix4_q15(
}
/**
- * @} end of ComplexFFT group
+ @} end of ComplexFFT group
*/
/*
-* Radix-4 FFT algorithm used is :
-*
-* Input real and imaginary data:
-* x(n) = xa + j * ya
-* x(n+N/4 ) = xb + j * yb
-* x(n+N/2 ) = xc + j * yc
-* x(n+3N 4) = xd + j * yd
-*
-*
-* Output real and imaginary data:
-* x(4r) = xa'+ j * ya'
-* x(4r+1) = xb'+ j * yb'
-* x(4r+2) = xc'+ j * yc'
-* x(4r+3) = xd'+ j * yd'
-*
-*
-* Twiddle factors for radix-4 FFT:
-* Wn = co1 + j * (- si1)
-* W2n = co2 + j * (- si2)
-* W3n = co3 + j * (- si3)
-
-* The real and imaginary output values for the radix-4 butterfly are
-* xa' = xa + xb + xc + xd
-* ya' = ya + yb + yc + yd
-* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
-* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
-* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
-* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
-* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
-* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
-*
-*/
+ * Radix-4 FFT algorithm used is :
+ *
+ * Input real and imaginary data:
+ * x(n) = xa + j * ya
+ * x(n+N/4 ) = xb + j * yb
+ * x(n+N/2 ) = xc + j * yc
+ * x(n+3N 4) = xd + j * yd
+ *
+ *
+ * Output real and imaginary data:
+ * x(4r) = xa'+ j * ya'
+ * x(4r+1) = xb'+ j * yb'
+ * x(4r+2) = xc'+ j * yc'
+ * x(4r+3) = xd'+ j * yd'
+ *
+ *
+ * Twiddle factors for radix-4 FFT:
+ * Wn = co1 + j * (- si1)
+ * W2n = co2 + j * (- si2)
+ * W3n = co3 + j * (- si3)
+
+ * The real and imaginary output values for the radix-4 butterfly are
+ * xa' = xa + xb + xc + xd
+ * ya' = ya + yb + yc + yd
+ * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
+ * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
+ * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
+ * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
+ * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
+ * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
+ *
+ */
/**
- * @brief Core function for the Q15 CFFT butterfly process.
- * @param[in, out] *pSrc16 points to the in-place buffer of Q15 data type.
- * @param[in] fftLen length of the FFT.
- * @param[in] *pCoef16 points to twiddle coefficient buffer.
- * @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
- * @return none.
+ @brief Core function for the Q15 CFFT butterfly process.
+ @param[in,out] pSrc16 points to the in-place buffer of Q15 data type
+ @param[in] fftLen length of the FFT
+ @param[in] pCoef16 points to twiddle coefficient buffer
+ @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+ @return none
*/
void arm_radix4_butterfly_q15(
- q15_t * pSrc16,
- uint32_t fftLen,
- q15_t * pCoef16,
- uint32_t twidCoefModifier)
+ q15_t * pSrc16,
+ uint32_t fftLen,
+ const q15_t * pCoef16,
+ uint32_t twidCoefModifier)
{
#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q31_t R, S, T, U;
+ q31_t C1, C2, C3, out1, out2;
+ uint32_t n1, n2, ic, i0, j, k;
- q31_t R, S, T, U;
- q31_t C1, C2, C3, out1, out2;
- uint32_t n1, n2, ic, i0, j, k;
+ q15_t *ptr1;
+ q15_t *pSi0;
+ q15_t *pSi1;
+ q15_t *pSi2;
+ q15_t *pSi3;
- q15_t *ptr1;
- q15_t *pSi0;
- q15_t *pSi1;
- q15_t *pSi2;
- q15_t *pSi3;
-
- q31_t xaya, xbyb, xcyc, xdyd;
+ q31_t xaya, xbyb, xcyc, xdyd;
/* Total process is divided into three stages */
@@ -198,16 +194,18 @@ void arm_radix4_butterfly_q15(
{
/* Butterfly implementation */
- /* Reading i0, i0+fftLen/2 inputs */
+ /* Reading i0, i0+fftLen/2 inputs */
/* Read ya (real), xa(imag) input */
- T = _SIMD32_OFFSET(pSi0);
- T = __SHADD16(T, 0); // this is just a SIMD arithmetic shift right by 1
- T = __SHADD16(T, 0); // it turns out doing this twice is 2 cycles, the alternative takes 3 cycles
- //in = ((int16_t) (T & 0xFFFF)) >> 2; // alternative code that takes 3 cycles
- //T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+ T = read_q15x2 (pSi0);
+ T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
+ T = __SHADD16(T, 0); /* it turns out doing this twice is 2 cycles, the alternative takes 3 cycles */
+/*
+ in = ((int16_t) (T & 0xFFFF)) >> 2; // alternative code that takes 3 cycles
+ T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+*/
/* Read yc (real), xc(imag) input */
- S = _SIMD32_OFFSET(pSi2);
+ S = read_q15x2 (pSi2);
S = __SHADD16(S, 0);
S = __SHADD16(S, 0);
@@ -219,12 +217,12 @@ void arm_radix4_butterfly_q15(
/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
/* Read yb (real), xb(imag) input */
- T = _SIMD32_OFFSET(pSi1);
+ T = read_q15x2 (pSi1);
T = __SHADD16(T, 0);
T = __SHADD16(T, 0);
/* Read yd (real), xd(imag) input */
- U = _SIMD32_OFFSET(pSi3);
+ U = read_q15x2 (pSi3);
U = __SHADD16(U, 0);
U = __SHADD16(U, 0);
@@ -234,46 +232,39 @@ void arm_radix4_butterfly_q15(
/* writing the butterfly processed i0 sample */
/* xa' = xa + xb + xc + xd */
/* ya' = ya + yb + yc + yd */
- _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
- pSi0 += 2;
+ write_q15x2_ia (&pSi0, __SHADD16(R, T));
/* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
R = __QSUB16(R, T);
/* co2 & si2 are read from SIMD Coefficient pointer */
- C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
+ C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
#ifndef ARM_MATH_BIG_ENDIAN
-
/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
out1 = __SMUAD(C2, R) >> 16U;
/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out2 = __SMUSDX(C2, R);
-
#else
-
/* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out1 = __SMUSDX(R, C2) >> 16U;
/* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
out2 = __SMUAD(C2, R);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Reading i0+fftLen/4 */
/* T = packed(yb, xb) */
- T = _SIMD32_OFFSET(pSi1);
+ T = read_q15x2 (pSi1);
T = __SHADD16(T, 0);
T = __SHADD16(T, 0);
/* writing the butterfly processed i0 + fftLen/4 sample */
/* writing output(xc', yc') in little endian format */
- _SIMD32_OFFSET(pSi1) =
- (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
- pSi1 += 2;
+ write_q15x2_ia (&pSi1, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
/* Butterfly calculations */
/* U = packed(yd, xd) */
- U = _SIMD32_OFFSET(pSi3);
+ U = read_q15x2 (pSi3);
U = __SHADD16(U, 0);
U = __SHADD16(U, 0);
@@ -281,71 +272,54 @@ void arm_radix4_butterfly_q15(
T = __QSUB16(T, U);
#ifndef ARM_MATH_BIG_ENDIAN
-
/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
R = __QASX(S, T);
/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
S = __QSAX(S, T);
-
#else
-
/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
R = __QSAX(S, T);
/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
S = __QASX(S, T);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* co1 & si1 are read from SIMD Coefficient pointer */
- C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
+ C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
/* Butterfly process for the i0+fftLen/2 sample */
#ifndef ARM_MATH_BIG_ENDIAN
-
/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
out1 = __SMUAD(C1, S) >> 16U;
/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
out2 = __SMUSDX(C1, S);
-
#else
-
/* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
out1 = __SMUSDX(S, C1) >> 16U;
/* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
out2 = __SMUAD(C1, S);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* writing output(xb', yb') in little endian format */
- _SIMD32_OFFSET(pSi2) =
- ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
- pSi2 += 2;
-
+ write_q15x2_ia (&pSi2, ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF));
/* co3 & si3 are read from SIMD Coefficient pointer */
- C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
+ C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
/* Butterfly process for the i0+3fftLen/4 sample */
#ifndef ARM_MATH_BIG_ENDIAN
-
/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
out1 = __SMUAD(C3, R) >> 16U;
/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
out2 = __SMUSDX(C3, R);
-
#else
-
/* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
out1 = __SMUSDX(R, C3) >> 16U;
/* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
out2 = __SMUAD(C3, R);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* writing output(xd', yd') in little endian format */
- _SIMD32_OFFSET(pSi3) =
- ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
- pSi3 += 2;
+ write_q15x2_ia (&pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
/* Twiddle coefficients index modifier */
ic = ic + twidCoefModifier;
@@ -372,9 +346,9 @@ void arm_radix4_butterfly_q15(
for (j = 0U; j <= (n2 - 1U); j++)
{
/* index calculation for the coefficients */
- C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
- C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
- C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
+ C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
+ C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
+ C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
/* Twiddle coefficients index modifier */
ic = ic + twidCoefModifier;
@@ -389,10 +363,10 @@ void arm_radix4_butterfly_q15(
{
/* Reading i0, i0+fftLen/2 inputs */
/* Read ya (real), xa(imag) input */
- T = _SIMD32_OFFSET(pSi0);
+ T = read_q15x2 (pSi0);
/* Read yc (real), xc(imag) input */
- S = _SIMD32_OFFSET(pSi2);
+ S = read_q15x2 (pSi2);
/* R = packed( (ya + yc), (xa + xc)) */
R = __QADD16(T, S);
@@ -402,10 +376,10 @@ void arm_radix4_butterfly_q15(
/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
/* Read yb (real), xb(imag) input */
- T = _SIMD32_OFFSET(pSi1);
+ T = read_q15x2 (pSi1);
/* Read yd (real), xd(imag) input */
- U = _SIMD32_OFFSET(pSi3);
+ U = read_q15x2 (pSi3);
/* T = packed( (yb + yd), (xb + xd)) */
T = __QADD16(T, U);
@@ -416,51 +390,45 @@ void arm_radix4_butterfly_q15(
/* ya' = ya + yb + yc + yd */
out1 = __SHADD16(R, T);
out1 = __SHADD16(out1, 0);
- _SIMD32_OFFSET(pSi0) = out1;
+ write_q15x2 (pSi0, out1);
pSi0 += 2 * n1;
/* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
R = __SHSUB16(R, T);
#ifndef ARM_MATH_BIG_ENDIAN
-
/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
out1 = __SMUAD(C2, R) >> 16U;
/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out2 = __SMUSDX(C2, R);
-
#else
-
/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out1 = __SMUSDX(R, C2) >> 16U;
/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
out2 = __SMUAD(C2, R);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Reading i0+3fftLen/4 */
/* Read yb (real), xb(imag) input */
- T = _SIMD32_OFFSET(pSi1);
+ T = read_q15x2 (pSi1);
/* writing the butterfly processed i0 + fftLen/4 sample */
/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
- _SIMD32_OFFSET(pSi1) =
- ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+ write_q15x2 (pSi1, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
pSi1 += 2 * n1;
/* Butterfly calculations */
/* Read yd (real), xd(imag) input */
- U = _SIMD32_OFFSET(pSi3);
+ U = read_q15x2 (pSi3);
/* T = packed(yb-yd, xb-xd) */
T = __QSUB16(T, U);
#ifndef ARM_MATH_BIG_ENDIAN
-
/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
R = __SHASX(S, T);
@@ -471,9 +439,7 @@ void arm_radix4_butterfly_q15(
/* Butterfly process for the i0+fftLen/2 sample */
out1 = __SMUAD(C1, S) >> 16U;
out2 = __SMUSDX(C1, S);
-
#else
-
/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
R = __SHSAX(S, T);
@@ -484,33 +450,26 @@ void arm_radix4_butterfly_q15(
/* Butterfly process for the i0+fftLen/2 sample */
out1 = __SMUSDX(S, C1) >> 16U;
out2 = __SMUAD(C1, S);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
- _SIMD32_OFFSET(pSi2) =
- ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+ write_q15x2 (pSi2, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
pSi2 += 2 * n1;
/* Butterfly process for the i0+3fftLen/4 sample */
#ifndef ARM_MATH_BIG_ENDIAN
-
out1 = __SMUAD(C3, R) >> 16U;
out2 = __SMUSDX(C3, R);
-
#else
-
out1 = __SMUSDX(R, C3) >> 16U;
out2 = __SMUAD(C3, R);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
- _SIMD32_OFFSET(pSi3) =
- ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+ write_q15x2 (pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
pSi3 += 2 * n1;
}
}
@@ -536,16 +495,16 @@ void arm_radix4_butterfly_q15(
do
{
/* Read xa (real), ya(imag) input */
- xaya = *__SIMD32(ptr1)++;
+ xaya = read_q15x2_ia ((q15_t **) &ptr1);
/* Read xb (real), yb(imag) input */
- xbyb = *__SIMD32(ptr1)++;
+ xbyb = read_q15x2_ia ((q15_t **) &ptr1);
/* Read xc (real), yc(imag) input */
- xcyc = *__SIMD32(ptr1)++;
+ xcyc = read_q15x2_ia ((q15_t **) &ptr1);
/* Read xd (real), yd(imag) input */
- xdyd = *__SIMD32(ptr1)++;
+ xdyd = read_q15x2_ia ((q15_t **) &ptr1);
/* R = packed((ya + yc), (xa + xc)) */
R = __QADD16(xaya, xcyc);
@@ -559,14 +518,14 @@ void arm_radix4_butterfly_q15(
/* xa' = xa + xb + xc + xd */
/* ya' = ya + yb + yc + yd */
- *__SIMD32(ptr1)++ = __SHADD16(R, T);
+ write_q15x2_ia (&ptr1, __SHADD16(R, T));
/* T = packed((yb + yd), (xb + xd)) */
T = __QADD16(xbyb, xdyd);
/* xc' = (xa-xb+xc-xd) */
/* yc' = (ya-yb+yc-yd) */
- *__SIMD32(ptr1)++ = __SHSUB16(R, T);
+ write_q15x2_ia (&ptr1, __SHSUB16(R, T));
/* S = packed((ya - yc), (xa - xc)) */
S = __QSUB16(xaya, xcyc);
@@ -576,28 +535,22 @@ void arm_radix4_butterfly_q15(
U = __QSUB16(xbyb, xdyd);
#ifndef ARM_MATH_BIG_ENDIAN
-
/* xb' = (xa+yb-xc-yd) */
/* yb' = (ya-xb-yc+xd) */
- *__SIMD32(ptr1)++ = __SHSAX(S, U);
-
+ write_q15x2_ia (&ptr1, __SHSAX(S, U));
/* xd' = (xa-yb-xc+yd) */
/* yd' = (ya+xb-yc-xd) */
- *__SIMD32(ptr1)++ = __SHASX(S, U);
-
+ write_q15x2_ia (&ptr1, __SHASX(S, U));
#else
-
/* xb' = (xa+yb-xc-yd) */
/* yb' = (ya-xb-yc+xd) */
- *__SIMD32(ptr1)++ = __SHASX(S, U);
-
+ write_q15x2_ia (&ptr1, __SHASX(S, U));
/* xd' = (xa-yb-xc+yd) */
/* yd' = (ya+xb-yc-xd) */
- *__SIMD32(ptr1)++ = __SHSAX(S, U);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ write_q15x2_ia (&ptr1, __SHSAX(S, U));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
} while (--j);
@@ -609,13 +562,11 @@ void arm_radix4_butterfly_q15(
/* output is in 5.11(q11) format for the 16 point */
-#else
+#else /* #if defined (ARM_MATH_DSP) */
- /* Run the below code for Cortex-M0 */
-
- q15_t R0, R1, S0, S1, T0, T1, U0, U1;
- q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
- uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
+ q15_t R0, R1, S0, S1, T0, T1, U0, U1;
+ q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
+ uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
/* Total process is divided into three stages */
@@ -1015,76 +966,74 @@ void arm_radix4_butterfly_q15(
/**
- * @brief Core function for the Q15 CIFFT butterfly process.
- * @param[in, out] *pSrc16 points to the in-place buffer of Q15 data type.
- * @param[in] fftLen length of the FFT.
- * @param[in] *pCoef16 points to twiddle coefficient buffer.
- * @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
- * @return none.
+ @brief Core function for the Q15 CIFFT butterfly process.
+ @param[in,out] pSrc16 points to the in-place buffer of Q15 data type
+ @param[in] fftLen length of the FFT
+ @param[in] pCoef16 points to twiddle coefficient buffer
+ @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+ @return none
*/
/*
-* Radix-4 IFFT algorithm used is :
-*
-* CIFFT uses same twiddle coefficients as CFFT function
-* x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
-*
-*
-* IFFT is implemented with following changes in equations from FFT
-*
-* Input real and imaginary data:
-* x(n) = xa + j * ya
-* x(n+N/4 ) = xb + j * yb
-* x(n+N/2 ) = xc + j * yc
-* x(n+3N 4) = xd + j * yd
-*
-*
-* Output real and imaginary data:
-* x(4r) = xa'+ j * ya'
-* x(4r+1) = xb'+ j * yb'
-* x(4r+2) = xc'+ j * yc'
-* x(4r+3) = xd'+ j * yd'
-*
-*
-* Twiddle factors for radix-4 IFFT:
-* Wn = co1 + j * (si1)
-* W2n = co2 + j * (si2)
-* W3n = co3 + j * (si3)
-
-* The real and imaginary output values for the radix-4 butterfly are
-* xa' = xa + xb + xc + xd
-* ya' = ya + yb + yc + yd
-* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
-* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
-* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
-* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
-* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
-* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
-*
-*/
+ * Radix-4 IFFT algorithm used is :
+ *
+ * CIFFT uses same twiddle coefficients as CFFT function
+ * x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
+ *
+ *
+ * IFFT is implemented with following changes in equations from FFT
+ *
+ * Input real and imaginary data:
+ * x(n) = xa + j * ya
+ * x(n+N/4 ) = xb + j * yb
+ * x(n+N/2 ) = xc + j * yc
+ * x(n+3N 4) = xd + j * yd
+ *
+ *
+ * Output real and imaginary data:
+ * x(4r) = xa'+ j * ya'
+ * x(4r+1) = xb'+ j * yb'
+ * x(4r+2) = xc'+ j * yc'
+ * x(4r+3) = xd'+ j * yd'
+ *
+ *
+ * Twiddle factors for radix-4 IFFT:
+ * Wn = co1 + j * (si1)
+ * W2n = co2 + j * (si2)
+ * W3n = co3 + j * (si3)
+
+ * The real and imaginary output values for the radix-4 butterfly are
+ * xa' = xa + xb + xc + xd
+ * ya' = ya + yb + yc + yd
+ * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
+ * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
+ * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
+ * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
+ * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
+ * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
+ *
+ */
void arm_radix4_butterfly_inverse_q15(
- q15_t * pSrc16,
- uint32_t fftLen,
- q15_t * pCoef16,
- uint32_t twidCoefModifier)
+ q15_t * pSrc16,
+ uint32_t fftLen,
+ const q15_t * pCoef16,
+ uint32_t twidCoefModifier)
{
#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- q31_t R, S, T, U;
- q31_t C1, C2, C3, out1, out2;
- uint32_t n1, n2, ic, i0, j, k;
-
- q15_t *ptr1;
- q15_t *pSi0;
- q15_t *pSi1;
- q15_t *pSi2;
- q15_t *pSi3;
-
- q31_t xaya, xbyb, xcyc, xdyd;
+ q31_t R, S, T, U;
+ q31_t C1, C2, C3, out1, out2;
+ uint32_t n1, n2, ic, i0, j, k;
+
+ q15_t *ptr1;
+ q15_t *pSi0;
+ q15_t *pSi1;
+ q15_t *pSi2;
+ q15_t *pSi3;
+
+ q31_t xaya, xbyb, xcyc, xdyd;
/* Total process is divided into three stages */
@@ -1117,12 +1066,12 @@ void arm_radix4_butterfly_inverse_q15(
/* Reading i0, i0+fftLen/2 inputs */
/* Read ya (real), xa(imag) input */
- T = _SIMD32_OFFSET(pSi0);
+ T = read_q15x2 (pSi0);
T = __SHADD16(T, 0);
T = __SHADD16(T, 0);
/* Read yc (real), xc(imag) input */
- S = _SIMD32_OFFSET(pSi2);
+ S = read_q15x2 (pSi2);
S = __SHADD16(S, 0);
S = __SHADD16(S, 0);
@@ -1134,12 +1083,12 @@ void arm_radix4_butterfly_inverse_q15(
/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
/* Read yb (real), xb(imag) input */
- T = _SIMD32_OFFSET(pSi1);
+ T = read_q15x2 (pSi1);
T = __SHADD16(T, 0);
T = __SHADD16(T, 0);
/* Read yd (real), xd(imag) input */
- U = _SIMD32_OFFSET(pSi3);
+ U = read_q15x2 (pSi3);
U = __SHADD16(U, 0);
U = __SHADD16(U, 0);
@@ -1149,46 +1098,39 @@ void arm_radix4_butterfly_inverse_q15(
/* writing the butterfly processed i0 sample */
/* xa' = xa + xb + xc + xd */
/* ya' = ya + yb + yc + yd */
- _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
- pSi0 += 2;
+ write_q15x2_ia (&pSi0, __SHADD16(R, T));
/* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
R = __QSUB16(R, T);
/* co2 & si2 are read from SIMD Coefficient pointer */
- C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
+ C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
#ifndef ARM_MATH_BIG_ENDIAN
-
/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
out1 = __SMUSD(C2, R) >> 16U;
/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out2 = __SMUADX(C2, R);
-
#else
-
/* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out1 = __SMUADX(C2, R) >> 16U;
/* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
out2 = __SMUSD(__QSUB16(0, C2), R);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Reading i0+fftLen/4 */
/* T = packed(yb, xb) */
- T = _SIMD32_OFFSET(pSi1);
+ T = read_q15x2 (pSi1);
T = __SHADD16(T, 0);
T = __SHADD16(T, 0);
/* writing the butterfly processed i0 + fftLen/4 sample */
/* writing output(xc', yc') in little endian format */
- _SIMD32_OFFSET(pSi1) =
- (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
- pSi1 += 2;
+ write_q15x2_ia (&pSi1, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
/* Butterfly calculations */
/* U = packed(yd, xd) */
- U = _SIMD32_OFFSET(pSi3);
+ U = read_q15x2 (pSi3);
U = __SHADD16(U, 0);
U = __SHADD16(U, 0);
@@ -1196,71 +1138,54 @@ void arm_radix4_butterfly_inverse_q15(
T = __QSUB16(T, U);
#ifndef ARM_MATH_BIG_ENDIAN
-
/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
R = __QSAX(S, T);
/* S = packed((ya-yc) + (xb- xd), (xa-xc) - (yb-yd)) */
S = __QASX(S, T);
-
#else
-
/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
R = __QASX(S, T);
/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
S = __QSAX(S, T);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* co1 & si1 are read from SIMD Coefficient pointer */
- C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
+ C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
/* Butterfly process for the i0+fftLen/2 sample */
#ifndef ARM_MATH_BIG_ENDIAN
-
/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
out1 = __SMUSD(C1, S) >> 16U;
/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
out2 = __SMUADX(C1, S);
-
#else
-
/* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
out1 = __SMUADX(C1, S) >> 16U;
/* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
out2 = __SMUSD(__QSUB16(0, C1), S);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* writing output(xb', yb') in little endian format */
- _SIMD32_OFFSET(pSi2) =
- ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
- pSi2 += 2;
-
+ write_q15x2_ia (&pSi2, ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF));
/* co3 & si3 are read from SIMD Coefficient pointer */
- C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
+ C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
/* Butterfly process for the i0+3fftLen/4 sample */
#ifndef ARM_MATH_BIG_ENDIAN
-
/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
out1 = __SMUSD(C3, R) >> 16U;
/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
out2 = __SMUADX(C3, R);
-
#else
-
/* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
out1 = __SMUADX(C3, R) >> 16U;
/* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
out2 = __SMUSD(__QSUB16(0, C3), R);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* writing output(xd', yd') in little endian format */
- _SIMD32_OFFSET(pSi3) =
- ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
- pSi3 += 2;
+ write_q15x2_ia (&pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
/* Twiddle coefficients index modifier */
ic = ic + twidCoefModifier;
@@ -1287,9 +1212,9 @@ void arm_radix4_butterfly_inverse_q15(
for (j = 0U; j <= (n2 - 1U); j++)
{
/* index calculation for the coefficients */
- C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
- C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
- C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
+ C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
+ C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
+ C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
/* Twiddle coefficients index modifier */
ic = ic + twidCoefModifier;
@@ -1304,10 +1229,10 @@ void arm_radix4_butterfly_inverse_q15(
{
/* Reading i0, i0+fftLen/2 inputs */
/* Read ya (real), xa(imag) input */
- T = _SIMD32_OFFSET(pSi0);
+ T = read_q15x2 (pSi0);
/* Read yc (real), xc(imag) input */
- S = _SIMD32_OFFSET(pSi2);
+ S = read_q15x2 (pSi2);
/* R = packed( (ya + yc), (xa + xc)) */
R = __QADD16(T, S);
@@ -1317,10 +1242,10 @@ void arm_radix4_butterfly_inverse_q15(
/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
/* Read yb (real), xb(imag) input */
- T = _SIMD32_OFFSET(pSi1);
+ T = read_q15x2 (pSi1);
/* Read yd (real), xd(imag) input */
- U = _SIMD32_OFFSET(pSi3);
+ U = read_q15x2 (pSi3);
/* T = packed( (yb + yd), (xb + xd)) */
T = __QADD16(T, U);
@@ -1331,101 +1256,84 @@ void arm_radix4_butterfly_inverse_q15(
/* ya' = ya + yb + yc + yd */
out1 = __SHADD16(R, T);
out1 = __SHADD16(out1, 0);
- _SIMD32_OFFSET(pSi0) = out1;
+ write_q15x2 (pSi0, out1);
pSi0 += 2 * n1;
/* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
R = __SHSUB16(R, T);
#ifndef ARM_MATH_BIG_ENDIAN
-
/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
out1 = __SMUSD(C2, R) >> 16U;
/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out2 = __SMUADX(C2, R);
-
#else
-
/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out1 = __SMUADX(R, C2) >> 16U;
/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
out2 = __SMUSD(__QSUB16(0, C2), R);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Reading i0+3fftLen/4 */
/* Read yb (real), xb(imag) input */
- T = _SIMD32_OFFSET(pSi1);
+ T = read_q15x2 (pSi1);
/* writing the butterfly processed i0 + fftLen/4 sample */
/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
- _SIMD32_OFFSET(pSi1) =
- ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+ write_q15x2 (pSi1, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
pSi1 += 2 * n1;
/* Butterfly calculations */
/* Read yd (real), xd(imag) input */
- U = _SIMD32_OFFSET(pSi3);
+ U = read_q15x2 (pSi3);
/* T = packed(yb-yd, xb-xd) */
T = __QSUB16(T, U);
#ifndef ARM_MATH_BIG_ENDIAN
-
/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
R = __SHSAX(S, T);
/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
S = __SHASX(S, T);
-
/* Butterfly process for the i0+fftLen/2 sample */
out1 = __SMUSD(C1, S) >> 16U;
out2 = __SMUADX(C1, S);
-
#else
-
/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
R = __SHASX(S, T);
/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
S = __SHSAX(S, T);
-
/* Butterfly process for the i0+fftLen/2 sample */
out1 = __SMUADX(S, C1) >> 16U;
out2 = __SMUSD(__QSUB16(0, C1), S);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
- _SIMD32_OFFSET(pSi2) =
- ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+ write_q15x2 (pSi2, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
pSi2 += 2 * n1;
/* Butterfly process for the i0+3fftLen/4 sample */
#ifndef ARM_MATH_BIG_ENDIAN
-
out1 = __SMUSD(C3, R) >> 16U;
out2 = __SMUADX(C3, R);
-
#else
-
out1 = __SMUADX(C3, R) >> 16U;
out2 = __SMUSD(__QSUB16(0, C3), R);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
- _SIMD32_OFFSET(pSi3) =
- ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+ write_q15x2 (pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
pSi3 += 2 * n1;
}
}
@@ -1450,16 +1358,16 @@ void arm_radix4_butterfly_inverse_q15(
do
{
/* Read xa (real), ya(imag) input */
- xaya = *__SIMD32(ptr1)++;
+ xaya = read_q15x2_ia ((q15_t **) &ptr1);
/* Read xb (real), yb(imag) input */
- xbyb = *__SIMD32(ptr1)++;
+ xbyb = read_q15x2_ia ((q15_t **) &ptr1);
/* Read xc (real), yc(imag) input */
- xcyc = *__SIMD32(ptr1)++;
+ xcyc = read_q15x2_ia ((q15_t **) &ptr1);
/* Read xd (real), yd(imag) input */
- xdyd = *__SIMD32(ptr1)++;
+ xdyd = read_q15x2_ia ((q15_t **) &ptr1);
/* R = packed((ya + yc), (xa + xc)) */
R = __QADD16(xaya, xcyc);
@@ -1473,14 +1381,14 @@ void arm_radix4_butterfly_inverse_q15(
/* xa' = xa + xb + xc + xd */
/* ya' = ya + yb + yc + yd */
- *__SIMD32(ptr1)++ = __SHADD16(R, T);
+ write_q15x2_ia (&ptr1, __SHADD16(R, T));
/* T = packed((yb + yd), (xb + xd)) */
T = __QADD16(xbyb, xdyd);
/* xc' = (xa-xb+xc-xd) */
/* yc' = (ya-yb+yc-yd) */
- *__SIMD32(ptr1)++ = __SHSUB16(R, T);
+ write_q15x2_ia (&ptr1, __SHSUB16(R, T));
/* S = packed((ya - yc), (xa - xc)) */
S = __QSUB16(xaya, xcyc);
@@ -1490,29 +1398,22 @@ void arm_radix4_butterfly_inverse_q15(
U = __QSUB16(xbyb, xdyd);
#ifndef ARM_MATH_BIG_ENDIAN
-
/* xb' = (xa+yb-xc-yd) */
/* yb' = (ya-xb-yc+xd) */
- *__SIMD32(ptr1)++ = __SHASX(S, U);
-
+ write_q15x2_ia (&ptr1, __SHASX(S, U));
/* xd' = (xa-yb-xc+yd) */
/* yd' = (ya+xb-yc-xd) */
- *__SIMD32(ptr1)++ = __SHSAX(S, U);
-
+ write_q15x2_ia (&ptr1, __SHSAX(S, U));
#else
-
/* xb' = (xa+yb-xc-yd) */
/* yb' = (ya-xb-yc+xd) */
- *__SIMD32(ptr1)++ = __SHSAX(S, U);
-
+ write_q15x2_ia (&ptr1, __SHSAX(S, U));
/* xd' = (xa-yb-xc+yd) */
/* yd' = (ya+xb-yc-xd) */
- *__SIMD32(ptr1)++ = __SHASX(S, U);
-
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ write_q15x2_ia (&ptr1, __SHASX(S, U));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
} while (--j);
@@ -1524,13 +1425,11 @@ void arm_radix4_butterfly_inverse_q15(
/* output is in 5.11(q11) format for the 16 point */
-#else
+#else /* arm_radix4_butterfly_inverse_q15 */
- /* Run the below code for Cortex-M0 */
-
- q15_t R0, R1, S0, S1, T0, T1, U0, U1;
- q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
- uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
+ q15_t R0, R1, S0, S1, T0, T1, U0, U1;
+ q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
+ uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
/* Total process is divided into three stages */
diff --git a/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c b/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c
index 95292e4..a9a59dd 100644
--- a/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c
+++ b/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c
@@ -4,13 +4,13 @@
* Description: This file has function definition of Radix-4 FFT & IFFT function and
* In-place bit reversal using bit reversal table
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -30,54 +30,51 @@
#include "arm_math.h"
void arm_radix4_butterfly_inverse_q31(
-q31_t * pSrc,
-uint32_t fftLen,
-q31_t * pCoef,
-uint32_t twidCoefModifier);
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint32_t twidCoefModifier);
void arm_radix4_butterfly_q31(
-q31_t * pSrc,
-uint32_t fftLen,
-q31_t * pCoef,
-uint32_t twidCoefModifier);
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint32_t twidCoefModifier);
void arm_bitreversal_q31(
-q31_t * pSrc,
-uint32_t fftLen,
-uint16_t bitRevFactor,
-uint16_t * pBitRevTab);
+ q31_t * pSrc,
+ uint32_t fftLen,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab);
/**
- * @ingroup groupTransforms
+ @ingroup groupTransforms
*/
/**
- * @addtogroup ComplexFFT
- * @{
+ @addtogroup ComplexFFT
+ @{
*/
/**
- * @details
- * @brief Processing function for the Q31 CFFT/CIFFT.
- * @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed
- * @param[in] *S points to an instance of the Q31 CFFT/CIFFT structure.
- * @param[in, out] *pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
- * @return none.
- *
- * \par Input and output formats:
- * \par
- * Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
- * Hence the output format is different for different FFT sizes.
- * The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
- * \par
- * \image html CFFTQ31.gif "Input and Output Formats for Q31 CFFT"
- * \image html CIFFTQ31.gif "Input and Output Formats for Q31 CIFFT"
- *
+ @brief Processing function for the Q31 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
+ @param[in] S points to an instance of the Q31 CFFT/CIFFT structure
+ @param[in,out] pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @return none
+
+ @par Input and output formats:
+ Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
+ Hence the output format is different for different FFT sizes.
+ The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
+ @par
+ \image html CFFTQ31.gif "Input and Output Formats for Q31 CFFT"
+ \image html CIFFTQ31.gif "Input and Output Formats for Q31 CIFFT"
*/
void arm_cfft_radix4_q31(
const arm_cfft_radix4_instance_q31 * S,
- q31_t * pSrc)
+ q31_t * pSrc)
{
if (S->ifftFlag == 1U)
{
@@ -99,69 +96,68 @@ void arm_cfft_radix4_q31(
}
/**
- * @} end of ComplexFFT group
+ @} end of ComplexFFT group
*/
/*
-* Radix-4 FFT algorithm used is :
-*
-* Input real and imaginary data:
-* x(n) = xa + j * ya
-* x(n+N/4 ) = xb + j * yb
-* x(n+N/2 ) = xc + j * yc
-* x(n+3N 4) = xd + j * yd
-*
-*
-* Output real and imaginary data:
-* x(4r) = xa'+ j * ya'
-* x(4r+1) = xb'+ j * yb'
-* x(4r+2) = xc'+ j * yc'
-* x(4r+3) = xd'+ j * yd'
-*
-*
-* Twiddle factors for radix-4 FFT:
-* Wn = co1 + j * (- si1)
-* W2n = co2 + j * (- si2)
-* W3n = co3 + j * (- si3)
-*
-* Butterfly implementation:
-* xa' = xa + xb + xc + xd
-* ya' = ya + yb + yc + yd
-* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
-* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
-* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
-* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
-* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
-* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
-*
-*/
+ * Radix-4 FFT algorithm used is :
+ *
+ * Input real and imaginary data:
+ * x(n) = xa + j * ya
+ * x(n+N/4 ) = xb + j * yb
+ * x(n+N/2 ) = xc + j * yc
+ * x(n+3N 4) = xd + j * yd
+ *
+ *
+ * Output real and imaginary data:
+ * x(4r) = xa'+ j * ya'
+ * x(4r+1) = xb'+ j * yb'
+ * x(4r+2) = xc'+ j * yc'
+ * x(4r+3) = xd'+ j * yd'
+ *
+ *
+ * Twiddle factors for radix-4 FFT:
+ * Wn = co1 + j * (- si1)
+ * W2n = co2 + j * (- si2)
+ * W3n = co3 + j * (- si3)
+ *
+ * Butterfly implementation:
+ * xa' = xa + xb + xc + xd
+ * ya' = ya + yb + yc + yd
+ * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
+ * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
+ * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
+ * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
+ * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
+ * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
+ *
+ */
/**
- * @brief Core function for the Q31 CFFT butterfly process.
- * @param[in, out] *pSrc points to the in-place buffer of Q31 data type.
- * @param[in] fftLen length of the FFT.
- * @param[in] *pCoef points to twiddle coefficient buffer.
- * @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
- * @return none.
+ @brief Core function for the Q31 CFFT butterfly process.
+ @param[in,out] pSrc points to the in-place buffer of Q31 data type.
+ @param[in] fftLen length of the FFT.
+ @param[in] pCoef points to twiddle coefficient buffer.
+ @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+ @return none
*/
void arm_radix4_butterfly_q31(
- q31_t * pSrc,
- uint32_t fftLen,
- q31_t * pCoef,
- uint32_t twidCoefModifier)
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint32_t twidCoefModifier)
{
-#if defined(ARM_MATH_CM7)
- uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
- q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
+ uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
+ q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
+
+ q31_t xa, xb, xc, xd;
+ q31_t ya, yb, yc, yd;
+ q31_t xa_out, xb_out, xc_out, xd_out;
+ q31_t ya_out, yb_out, yc_out, yd_out;
+
+ q31_t *ptr1;
- q31_t xa, xb, xc, xd;
- q31_t ya, yb, yc, yd;
- q31_t xa_out, xb_out, xc_out, xd_out;
- q31_t ya_out, yb_out, yc_out, yd_out;
-
- q31_t *ptr1;
- q63_t xaya, xbyb, xcyc, xdyd;
/* Total process is divided into three stages */
/* process first stage, middle stages, & last stage */
@@ -194,10 +190,10 @@ void arm_radix4_butterfly_q31(
/* xa + xc */
r1 = (pSrc[(2U * i0)] >> 4U) + (pSrc[(2U * i2)] >> 4U);
/* xa - xc */
- r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
+ r2 = (pSrc[(2U * i0)] >> 4U) - (pSrc[(2U * i2)] >> 4U);
/* xb + xd */
- t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
+ t1 = (pSrc[(2U * i1)] >> 4U) + (pSrc[(2U * i3)] >> 4U);
/* ya + yc */
s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
@@ -220,11 +216,11 @@ void arm_radix4_butterfly_q31(
/* yb - yd */
t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
/* xb - xd */
- t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
+ t2 = (pSrc[(2U * i1)] >> 4U) - (pSrc[(2U * i3)] >> 4U);
/* index calculation for the coefficients */
ia2 = 2U * ia1;
- co2 = pCoef[ia2 * 2U];
+ co2 = pCoef[(ia2 * 2U)];
si2 = pCoef[(ia2 * 2U) + 1U];
/* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
@@ -245,7 +241,7 @@ void arm_radix4_butterfly_q31(
/* (ya - yc) + (xb - xd) */
s2 = s2 + t2;
- co1 = pCoef[ia1 * 2U];
+ co1 = pCoef[(ia1 * 2U)];
si1 = pCoef[(ia1 * 2U) + 1U];
/* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
@@ -258,7 +254,7 @@ void arm_radix4_butterfly_q31(
/* index calculation for the coefficients */
ia3 = 3U * ia1;
- co3 = pCoef[ia3 * 2U];
+ co3 = pCoef[(ia3 * 2U)];
si3 = pCoef[(ia3 * 2U) + 1U];
/* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
@@ -303,11 +299,11 @@ void arm_radix4_butterfly_q31(
/* index calculation for the coefficients */
ia2 = ia1 + ia1;
ia3 = ia2 + ia1;
- co1 = pCoef[ia1 * 2U];
+ co1 = pCoef[(ia1 * 2U)];
si1 = pCoef[(ia1 * 2U) + 1U];
- co2 = pCoef[ia2 * 2U];
+ co2 = pCoef[(ia2 * 2U)];
si2 = pCoef[(ia2 * 2U) + 1U];
- co3 = pCoef[ia3 * 2U];
+ co3 = pCoef[(ia3 * 2U)];
si3 = pCoef[(ia3 * 2U) + 1U];
/* Twiddle coefficients index modifier */
ia1 = ia1 + twidCoefModifier;
@@ -389,256 +385,6 @@ void arm_radix4_butterfly_q31(
}
twidCoefModifier <<= 2U;
}
-#else
- uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
- q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
-
- q31_t xa, xb, xc, xd;
- q31_t ya, yb, yc, yd;
- q31_t xa_out, xb_out, xc_out, xd_out;
- q31_t ya_out, yb_out, yc_out, yd_out;
-
- q31_t *ptr1;
- q31_t *pSi0;
- q31_t *pSi1;
- q31_t *pSi2;
- q31_t *pSi3;
- q63_t xaya, xbyb, xcyc, xdyd;
- /* Total process is divided into three stages */
-
- /* process first stage, middle stages, & last stage */
-
-
- /* start of first stage process */
-
- /* Initializations for the first stage */
- n2 = fftLen;
- n1 = n2;
- /* n2 = fftLen/4 */
- n2 >>= 2U;
-
- ia1 = 0U;
-
- j = n2;
-
- pSi0 = pSrc;
- pSi1 = pSi0 + 2 * n2;
- pSi2 = pSi1 + 2 * n2;
- pSi3 = pSi2 + 2 * n2;
-
- /* Calculation of first stage */
- do
- {
- /* input is in 1.31(q31) format and provide 4 guard bits for the input */
-
- /* Butterfly implementation */
- /* xa + xc */
- r1 = (pSi0[0] >> 4U) + (pSi2[0] >> 4U);
- /* xa - xc */
- r2 = (pSi0[0] >> 4U) - (pSi2[0] >> 4U);
-
- /* xb + xd */
- t1 = (pSi1[0] >> 4U) + (pSi3[0] >> 4U);
-
- /* ya + yc */
- s1 = (pSi0[1] >> 4U) + (pSi2[1] >> 4U);
- /* ya - yc */
- s2 = (pSi0[1] >> 4U) - (pSi2[1] >> 4U);
-
- /* xa' = xa + xb + xc + xd */
- *pSi0++ = (r1 + t1);
- /* (xa + xc) - (xb + xd) */
- r1 = r1 - t1;
- /* yb + yd */
- t2 = (pSi1[1] >> 4U) + (pSi3[1] >> 4U);
-
- /* ya' = ya + yb + yc + yd */
- *pSi0++ = (s1 + t2);
-
- /* (ya + yc) - (yb + yd) */
- s1 = s1 - t2;
-
- /* yb - yd */
- t1 = (pSi1[1] >> 4U) - (pSi3[1] >> 4U);
- /* xb - xd */
- t2 = (pSi1[0] >> 4U) - (pSi3[0] >> 4U);
-
- /* index calculation for the coefficients */
- ia2 = 2U * ia1;
- co2 = pCoef[ia2 * 2U];
- si2 = pCoef[(ia2 * 2U) + 1U];
-
- /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
- *pSi1++ = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
- ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
-
- /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
- *pSi1++ = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
- ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
-
- /* (xa - xc) + (yb - yd) */
- r1 = r2 + t1;
- /* (xa - xc) - (yb - yd) */
- r2 = r2 - t1;
-
- /* (ya - yc) - (xb - xd) */
- s1 = s2 - t2;
- /* (ya - yc) + (xb - xd) */
- s2 = s2 + t2;
-
- co1 = pCoef[ia1 * 2U];
- si1 = pCoef[(ia1 * 2U) + 1U];
-
- /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
- *pSi2++ = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
- ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
-
- /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
- *pSi2++ = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
- ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
-
- /* index calculation for the coefficients */
- ia3 = 3U * ia1;
- co3 = pCoef[ia3 * 2U];
- si3 = pCoef[(ia3 * 2U) + 1U];
-
- /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
- *pSi3++ = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
- ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
-
- /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
- *pSi3++ = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
- ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
-
- /* Twiddle coefficients index modifier */
- ia1 = ia1 + twidCoefModifier;
-
- } while (--j);
-
- /* end of first stage process */
-
- /* data is in 5.27(q27) format */
-
-
- /* start of Middle stages process */
-
-
- /* each stage in middle stages provides two down scaling of the input */
-
- twidCoefModifier <<= 2U;
-
-
- for (k = fftLen / 4U; k > 4U; k >>= 2U)
- {
- /* Initializations for the first stage */
- n1 = n2;
- n2 >>= 2U;
- ia1 = 0U;
-
- /* Calculation of first stage */
- for (j = 0U; j <= (n2 - 1U); j++)
- {
- /* index calculation for the coefficients */
- ia2 = ia1 + ia1;
- ia3 = ia2 + ia1;
- co1 = pCoef[ia1 * 2U];
- si1 = pCoef[(ia1 * 2U) + 1U];
- co2 = pCoef[ia2 * 2U];
- si2 = pCoef[(ia2 * 2U) + 1U];
- co3 = pCoef[ia3 * 2U];
- si3 = pCoef[(ia3 * 2U) + 1U];
- /* Twiddle coefficients index modifier */
- ia1 = ia1 + twidCoefModifier;
-
- pSi0 = pSrc + 2 * j;
- pSi1 = pSi0 + 2 * n2;
- pSi2 = pSi1 + 2 * n2;
- pSi3 = pSi2 + 2 * n2;
-
- for (i0 = j; i0 < fftLen; i0 += n1)
- {
- /* Butterfly implementation */
- /* xa + xc */
- r1 = pSi0[0] + pSi2[0];
-
- /* xa - xc */
- r2 = pSi0[0] - pSi2[0];
-
-
- /* ya + yc */
- s1 = pSi0[1] + pSi2[1];
-
- /* ya - yc */
- s2 = pSi0[1] - pSi2[1];
-
-
- /* xb + xd */
- t1 = pSi1[0] + pSi3[0];
-
-
- /* xa' = xa + xb + xc + xd */
- pSi0[0] = (r1 + t1) >> 2U;
- /* xa + xc -(xb + xd) */
- r1 = r1 - t1;
-
- /* yb + yd */
- t2 = pSi1[1] + pSi3[1];
-
- /* ya' = ya + yb + yc + yd */
- pSi0[1] = (s1 + t2) >> 2U;
- pSi0 += 2 * n1;
-
- /* (ya + yc) - (yb + yd) */
- s1 = s1 - t2;
-
- /* (yb - yd) */
- t1 = pSi1[1] - pSi3[1];
-
- /* (xb - xd) */
- t2 = pSi1[0] - pSi3[0];
-
-
- /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
- pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
- ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
-
- /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
- pSi1[1] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
- ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
- pSi1 += 2 * n1;
-
- /* (xa - xc) + (yb - yd) */
- r1 = r2 + t1;
- /* (xa - xc) - (yb - yd) */
- r2 = r2 - t1;
-
- /* (ya - yc) - (xb - xd) */
- s1 = s2 - t2;
- /* (ya - yc) + (xb - xd) */
- s2 = s2 + t2;
-
- /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
- pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
- ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
-
- /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
- pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
- ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
- pSi2 += 2 * n1;
-
- /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
- pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
- ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
-
- /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
- pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
- ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
- pSi3 += 2 * n1;
- }
- }
- twidCoefModifier <<= 2U;
- }
-#endif
/* End of Middle stages process */
@@ -656,53 +402,21 @@ void arm_radix4_butterfly_q31(
/* Calculations of last stage */
do
{
-
-#ifndef ARM_MATH_BIG_ENDIAN
-
/* Read xa (real), ya(imag) input */
- xaya = *__SIMD64(ptr1)++;
- xa = (q31_t) xaya;
- ya = (q31_t) (xaya >> 32);
+ xa = *ptr1++;
+ ya = *ptr1++;
/* Read xb (real), yb(imag) input */
- xbyb = *__SIMD64(ptr1)++;
- xb = (q31_t) xbyb;
- yb = (q31_t) (xbyb >> 32);
+ xb = *ptr1++;
+ yb = *ptr1++;
/* Read xc (real), yc(imag) input */
- xcyc = *__SIMD64(ptr1)++;
- xc = (q31_t) xcyc;
- yc = (q31_t) (xcyc >> 32);
+ xc = *ptr1++;
+ yc = *ptr1++;
/* Read xc (real), yc(imag) input */
- xdyd = *__SIMD64(ptr1)++;
- xd = (q31_t) xdyd;
- yd = (q31_t) (xdyd >> 32);
-
-#else
-
- /* Read xa (real), ya(imag) input */
- xaya = *__SIMD64(ptr1)++;
- ya = (q31_t) xaya;
- xa = (q31_t) (xaya >> 32);
-
- /* Read xb (real), yb(imag) input */
- xbyb = *__SIMD64(ptr1)++;
- yb = (q31_t) xbyb;
- xb = (q31_t) (xbyb >> 32);
-
- /* Read xc (real), yc(imag) input */
- xcyc = *__SIMD64(ptr1)++;
- yc = (q31_t) xcyc;
- xc = (q31_t) (xcyc >> 32);
-
- /* Read xc (real), yc(imag) input */
- xdyd = *__SIMD64(ptr1)++;
- yd = (q31_t) xdyd;
- xd = (q31_t) (xdyd >> 32);
-
-
-#endif
+ xd = *ptr1++;
+ yd = *ptr1++;
/* xa' = xa + xb + xc + xd */
xa_out = xa + xb + xc + xd;
@@ -752,71 +466,68 @@ void arm_radix4_butterfly_q31(
/**
- * @brief Core function for the Q31 CIFFT butterfly process.
- * @param[in, out] *pSrc points to the in-place buffer of Q31 data type.
- * @param[in] fftLen length of the FFT.
- * @param[in] *pCoef points to twiddle coefficient buffer.
- * @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
- * @return none.
+ @brief Core function for the Q31 CIFFT butterfly process.
+ @param[in,out] pSrc points to the in-place buffer of Q31 data type.
+ @param[in] fftLen length of the FFT.
+ @param[in] pCoef points to twiddle coefficient buffer.
+ @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+ @return none
*/
-
/*
-* Radix-4 IFFT algorithm used is :
-*
-* CIFFT uses same twiddle coefficients as CFFT Function
-* x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
-*
-*
-* IFFT is implemented with following changes in equations from FFT
-*
-* Input real and imaginary data:
-* x(n) = xa + j * ya
-* x(n+N/4 ) = xb + j * yb
-* x(n+N/2 ) = xc + j * yc
-* x(n+3N 4) = xd + j * yd
-*
-*
-* Output real and imaginary data:
-* x(4r) = xa'+ j * ya'
-* x(4r+1) = xb'+ j * yb'
-* x(4r+2) = xc'+ j * yc'
-* x(4r+3) = xd'+ j * yd'
-*
-*
-* Twiddle factors for radix-4 IFFT:
-* Wn = co1 + j * (si1)
-* W2n = co2 + j * (si2)
-* W3n = co3 + j * (si3)
-
-* The real and imaginary output values for the radix-4 butterfly are
-* xa' = xa + xb + xc + xd
-* ya' = ya + yb + yc + yd
-* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
-* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
-* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
-* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
-* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
-* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
-*
-*/
+ * Radix-4 IFFT algorithm used is :
+ *
+ * CIFFT uses same twiddle coefficients as CFFT Function
+ * x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
+ *
+ *
+ * IFFT is implemented with following changes in equations from FFT
+ *
+ * Input real and imaginary data:
+ * x(n) = xa + j * ya
+ * x(n+N/4 ) = xb + j * yb
+ * x(n+N/2 ) = xc + j * yc
+ * x(n+3N 4) = xd + j * yd
+ *
+ *
+ * Output real and imaginary data:
+ * x(4r) = xa'+ j * ya'
+ * x(4r+1) = xb'+ j * yb'
+ * x(4r+2) = xc'+ j * yc'
+ * x(4r+3) = xd'+ j * yd'
+ *
+ *
+ * Twiddle factors for radix-4 IFFT:
+ * Wn = co1 + j * (si1)
+ * W2n = co2 + j * (si2)
+ * W3n = co3 + j * (si3)
+
+ * The real and imaginary output values for the radix-4 butterfly are
+ * xa' = xa + xb + xc + xd
+ * ya' = ya + yb + yc + yd
+ * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
+ * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
+ * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
+ * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
+ * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
+ * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
+ *
+ */
void arm_radix4_butterfly_inverse_q31(
- q31_t * pSrc,
- uint32_t fftLen,
- q31_t * pCoef,
- uint32_t twidCoefModifier)
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint32_t twidCoefModifier)
{
-#if defined(ARM_MATH_CM7)
- uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
- q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
- q31_t xa, xb, xc, xd;
- q31_t ya, yb, yc, yd;
- q31_t xa_out, xb_out, xc_out, xd_out;
- q31_t ya_out, yb_out, yc_out, yd_out;
-
- q31_t *ptr1;
- q63_t xaya, xbyb, xcyc, xdyd;
+ uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
+ q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
+ q31_t xa, xb, xc, xd;
+ q31_t ya, yb, yc, yd;
+ q31_t xa_out, xb_out, xc_out, xd_out;
+ q31_t ya_out, yb_out, yc_out, yd_out;
+
+ q31_t *ptr1;
/* input is be 1.31(q31) format for all FFT sizes */
/* Total process is divided into three stages */
@@ -836,7 +547,6 @@ void arm_radix4_butterfly_inverse_q31(
do
{
-
/* input is in 1.31(q31) format and provide 4 guard bits for the input */
/* index calculation for the input as, */
@@ -952,11 +662,11 @@ void arm_radix4_butterfly_inverse_q31(
/* index calculation for the coefficients */
ia2 = ia1 + ia1;
ia3 = ia2 + ia1;
- co1 = pCoef[ia1 * 2U];
+ co1 = pCoef[(ia1 * 2U)];
si1 = pCoef[(ia1 * 2U) + 1U];
- co2 = pCoef[ia2 * 2U];
+ co2 = pCoef[(ia2 * 2U)];
si2 = pCoef[(ia2 * 2U) + 1U];
- co3 = pCoef[ia3 * 2U];
+ co3 = pCoef[(ia3 * 2U)];
si3 = pCoef[(ia3 * 2U) + 1U];
/* Twiddle coefficients index modifier */
ia1 = ia1 + twidCoefModifier;
@@ -1005,9 +715,8 @@ void arm_radix4_butterfly_inverse_q31(
((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
/* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
- pSrc[(2U * i1) + 1U] =
- (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
- ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
+ pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
+ ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
/* (xa - xc) - (yb - yd) */
r1 = r2 - t1;
@@ -1038,247 +747,6 @@ void arm_radix4_butterfly_inverse_q31(
}
twidCoefModifier <<= 2U;
}
-#else
- uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
- q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
- q31_t xa, xb, xc, xd;
- q31_t ya, yb, yc, yd;
- q31_t xa_out, xb_out, xc_out, xd_out;
- q31_t ya_out, yb_out, yc_out, yd_out;
-
- q31_t *ptr1;
- q31_t *pSi0;
- q31_t *pSi1;
- q31_t *pSi2;
- q31_t *pSi3;
- q63_t xaya, xbyb, xcyc, xdyd;
-
- /* input is be 1.31(q31) format for all FFT sizes */
- /* Total process is divided into three stages */
- /* process first stage, middle stages, & last stage */
-
- /* Start of first stage process */
-
- /* Initializations for the first stage */
- n2 = fftLen;
- n1 = n2;
- /* n2 = fftLen/4 */
- n2 >>= 2U;
-
- ia1 = 0U;
-
- j = n2;
-
- pSi0 = pSrc;
- pSi1 = pSi0 + 2 * n2;
- pSi2 = pSi1 + 2 * n2;
- pSi3 = pSi2 + 2 * n2;
-
- do
- {
- /* Butterfly implementation */
- /* xa + xc */
- r1 = (pSi0[0] >> 4U) + (pSi2[0] >> 4U);
- /* xa - xc */
- r2 = (pSi0[0] >> 4U) - (pSi2[0] >> 4U);
-
- /* xb + xd */
- t1 = (pSi1[0] >> 4U) + (pSi3[0] >> 4U);
-
- /* ya + yc */
- s1 = (pSi0[1] >> 4U) + (pSi2[1] >> 4U);
- /* ya - yc */
- s2 = (pSi0[1] >> 4U) - (pSi2[1] >> 4U);
-
- /* xa' = xa + xb + xc + xd */
- *pSi0++ = (r1 + t1);
- /* (xa + xc) - (xb + xd) */
- r1 = r1 - t1;
- /* yb + yd */
- t2 = (pSi1[1] >> 4U) + (pSi3[1] >> 4U);
- /* ya' = ya + yb + yc + yd */
- *pSi0++ = (s1 + t2);
-
- /* (ya + yc) - (yb + yd) */
- s1 = s1 - t2;
-
- /* yb - yd */
- t1 = (pSi1[1] >> 4U) - (pSi3[1] >> 4U);
- /* xb - xd */
- t2 = (pSi1[0] >> 4U) - (pSi3[0] >> 4U);
-
- /* index calculation for the coefficients */
- ia2 = 2U * ia1;
- co2 = pCoef[ia2 * 2U];
- si2 = pCoef[(ia2 * 2U) + 1U];
-
- /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
- *pSi1++ = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
- ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
-
- /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
- *pSi1++ = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
- ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
-
- /* (xa - xc) - (yb - yd) */
- r1 = r2 - t1;
- /* (xa - xc) + (yb - yd) */
- r2 = r2 + t1;
-
- /* (ya - yc) + (xb - xd) */
- s1 = s2 + t2;
- /* (ya - yc) - (xb - xd) */
- s2 = s2 - t2;
-
- co1 = pCoef[ia1 * 2U];
- si1 = pCoef[(ia1 * 2U) + 1U];
-
- /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
- *pSi2++ = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
- ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
-
- /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
- *pSi2++ = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
- ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
-
- /* index calculation for the coefficients */
- ia3 = 3U * ia1;
- co3 = pCoef[ia3 * 2U];
- si3 = pCoef[(ia3 * 2U) + 1U];
-
- /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
- *pSi3++ = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
- ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
-
- /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
- *pSi3++ = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
- ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
-
- /* Twiddle coefficients index modifier */
- ia1 = ia1 + twidCoefModifier;
-
- } while (--j);
-
- /* data is in 5.27(q27) format */
- /* each stage provides two down scaling of the input */
-
-
- /* Start of Middle stages process */
-
- twidCoefModifier <<= 2U;
-
- /* Calculation of second stage to excluding last stage */
- for (k = fftLen / 4U; k > 4U; k >>= 2U)
- {
- /* Initializations for the first stage */
- n1 = n2;
- n2 >>= 2U;
- ia1 = 0U;
-
- for (j = 0; j <= (n2 - 1U); j++)
- {
- /* index calculation for the coefficients */
- ia2 = ia1 + ia1;
- ia3 = ia2 + ia1;
- co1 = pCoef[ia1 * 2U];
- si1 = pCoef[(ia1 * 2U) + 1U];
- co2 = pCoef[ia2 * 2U];
- si2 = pCoef[(ia2 * 2U) + 1U];
- co3 = pCoef[ia3 * 2U];
- si3 = pCoef[(ia3 * 2U) + 1U];
- /* Twiddle coefficients index modifier */
- ia1 = ia1 + twidCoefModifier;
-
- pSi0 = pSrc + 2 * j;
- pSi1 = pSi0 + 2 * n2;
- pSi2 = pSi1 + 2 * n2;
- pSi3 = pSi2 + 2 * n2;
-
- for (i0 = j; i0 < fftLen; i0 += n1)
- {
- /* Butterfly implementation */
- /* xa + xc */
- r1 = pSi0[0] + pSi2[0];
-
- /* xa - xc */
- r2 = pSi0[0] - pSi2[0];
-
-
- /* ya + yc */
- s1 = pSi0[1] + pSi2[1];
-
- /* ya - yc */
- s2 = pSi0[1] - pSi2[1];
-
-
- /* xb + xd */
- t1 = pSi1[0] + pSi3[0];
-
-
- /* xa' = xa + xb + xc + xd */
- pSi0[0] = (r1 + t1) >> 2U;
- /* xa + xc -(xb + xd) */
- r1 = r1 - t1;
- /* yb + yd */
- t2 = pSi1[1] + pSi3[1];
-
- /* ya' = ya + yb + yc + yd */
- pSi0[1] = (s1 + t2) >> 2U;
- pSi0 += 2 * n1;
-
- /* (ya + yc) - (yb + yd) */
- s1 = s1 - t2;
-
- /* (yb - yd) */
- t1 = pSi1[1] - pSi3[1];
-
- /* (xb - xd) */
- t2 = pSi1[0] - pSi3[0];
-
-
- /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
- pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
- ((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
-
- /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
- pSi1[1] =
-
- (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
- ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
- pSi1 += 2 * n1;
-
- /* (xa - xc) - (yb - yd) */
- r1 = r2 - t1;
- /* (xa - xc) + (yb - yd) */
- r2 = r2 + t1;
-
- /* (ya - yc) + (xb - xd) */
- s1 = s2 + t2;
- /* (ya - yc) - (xb - xd) */
- s2 = s2 - t2;
-
- /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
- pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
- ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
-
- /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
- pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
- ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
- pSi2 += 2 * n1;
-
- /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
- pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
- ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
-
- /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
- pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
- ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
- pSi3 += 2 * n1;
- }
- }
- twidCoefModifier <<= 2U;
- }
-#endif
/* End of Middle stages process */
@@ -1298,51 +766,21 @@ void arm_radix4_butterfly_inverse_q31(
/* Calculations of last stage */
do
{
-#ifndef ARM_MATH_BIG_ENDIAN
/* Read xa (real), ya(imag) input */
- xaya = *__SIMD64(ptr1)++;
- xa = (q31_t) xaya;
- ya = (q31_t) (xaya >> 32);
+ xa = *ptr1++;
+ ya = *ptr1++;
/* Read xb (real), yb(imag) input */
- xbyb = *__SIMD64(ptr1)++;
- xb = (q31_t) xbyb;
- yb = (q31_t) (xbyb >> 32);
+ xb = *ptr1++;
+ yb = *ptr1++;
/* Read xc (real), yc(imag) input */
- xcyc = *__SIMD64(ptr1)++;
- xc = (q31_t) xcyc;
- yc = (q31_t) (xcyc >> 32);
+ xc = *ptr1++;
+ yc = *ptr1++;
/* Read xc (real), yc(imag) input */
- xdyd = *__SIMD64(ptr1)++;
- xd = (q31_t) xdyd;
- yd = (q31_t) (xdyd >> 32);
-
-#else
-
- /* Read xa (real), ya(imag) input */
- xaya = *__SIMD64(ptr1)++;
- ya = (q31_t) xaya;
- xa = (q31_t) (xaya >> 32);
-
- /* Read xb (real), yb(imag) input */
- xbyb = *__SIMD64(ptr1)++;
- yb = (q31_t) xbyb;
- xb = (q31_t) (xbyb >> 32);
-
- /* Read xc (real), yc(imag) input */
- xcyc = *__SIMD64(ptr1)++;
- yc = (q31_t) xcyc;
- xc = (q31_t) (xcyc >> 32);
-
- /* Read xc (real), yc(imag) input */
- xdyd = *__SIMD64(ptr1)++;
- yd = (q31_t) xdyd;
- xd = (q31_t) (xdyd >> 32);
-
-
-#endif
+ xd = *ptr1++;
+ yd = *ptr1++;
/* xa' = xa + xb + xc + xd */
xa_out = xa + xb + xc + xd;
diff --git a/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c b/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c
index b70ab38..50048f6 100644
--- a/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c
+++ b/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c
@@ -3,13 +3,13 @@
* Title: arm_cfft_radix8_f32.c
* Description: Radix-8 Decimation in Frequency CFFT & CIFFT Floating point processing function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -33,20 +33,20 @@
* Internal helper function used by the FFTs
* -------------------------------------------------------------------- */
-/*
-* @brief Core function for the floating-point CFFT butterfly process.
-* @param[in, out] *pSrc points to the in-place buffer of floating-point data type.
-* @param[in] fftLen length of the FFT.
-* @param[in] *pCoef points to the twiddle coefficient buffer.
-* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
-* @return none.
+/**
+ brief Core function for the floating-point CFFT butterfly process.
+ param[in,out] pSrc points to the in-place buffer of floating-point data type.
+ param[in] fftLen length of the FFT.
+ param[in] pCoef points to the twiddle coefficient buffer.
+ param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+ return none
*/
void arm_radix8_butterfly_f32(
-float32_t * pSrc,
-uint16_t fftLen,
-const float32_t * pCoef,
-uint16_t twidCoefModifier)
+ float32_t * pSrc,
+ uint16_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier)
{
uint32_t ia1, ia2, ia3, ia4, ia5, ia6, ia7;
uint32_t i1, i2, i3, i4, i5, i6, i7, i8;
diff --git a/DSP/Source/TransformFunctions/arm_dct4_f32.c b/DSP/Source/TransformFunctions/arm_dct4_f32.c
index 231c79a..87455dc 100644
--- a/DSP/Source/TransformFunctions/arm_dct4_f32.c
+++ b/DSP/Source/TransformFunctions/arm_dct4_f32.c
@@ -3,13 +3,13 @@
* Title: arm_dct4_f32.c
* Description: Processing function of DCT4 & IDCT4 F32
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,109 +29,111 @@
#include "arm_math.h"
/**
- * @ingroup groupTransforms
+ @ingroup groupTransforms
*/
/**
- * @defgroup DCT4_IDCT4 DCT Type IV Functions
- * Representation of signals by minimum number of values is important for storage and transmission.
- * The possibility of large discontinuity between the beginning and end of a period of a signal
- * in DFT can be avoided by extending the signal so that it is even-symmetric.
- * Discrete Cosine Transform (DCT) is constructed such that its energy is heavily concentrated in the lower part of the
- * spectrum and is very widely used in signal and image coding applications.
- * The family of DCTs (DCT type- 1,2,3,4) is the outcome of different combinations of homogeneous boundary conditions.
- * DCT has an excellent energy-packing capability, hence has many applications and in data compression in particular.
- *
- * DCT is essentially the Discrete Fourier Transform(DFT) of an even-extended real signal.
- * Reordering of the input data makes the computation of DCT just a problem of
- * computing the DFT of a real signal with a few additional operations.
- * This approach provides regular, simple, and very efficient DCT algorithms for practical hardware and software implementations.
- *
- * DCT type-II can be implemented using Fast fourier transform (FFT) internally, as the transform is applied on real values, Real FFT can be used.
- * DCT4 is implemented using DCT2 as their implementations are similar except with some added pre-processing and post-processing.
- * DCT2 implementation can be described in the following steps:
- * - Re-ordering input
- * - Calculating Real FFT
- * - Multiplication of weights and Real FFT output and getting real part from the product.
- *
- * This process is explained by the block diagram below:
- * \image html DCT4.gif "Discrete Cosine Transform - type-IV"
- *
- * \par Algorithm:
- * The N-point type-IV DCT is defined as a real, linear transformation by the formula:
- * \image html DCT4Equation.gif
- * where k = 0,1,2,.....N-1
- *\par
- * Its inverse is defined as follows:
- * \image html IDCT4Equation.gif
- * where n = 0,1,2,.....N-1
- *\par
- * The DCT4 matrices become involutory (i.e. they are self-inverse) by multiplying with an overall scale factor of sqrt(2/N).
- * The symmetry of the transform matrix indicates that the fast algorithms for the forward
- * and inverse transform computation are identical.
- * Note that the implementation of Inverse DCT4 and DCT4 is same, hence same process function can be used for both.
- *
- * \par Lengths supported by the transform:
- * As DCT4 internally uses Real FFT, it supports all the lengths 128, 512, 2048 and 8192.
- * The library provides separate functions for Q15, Q31, and floating-point data types.
- * \par Instance Structure
- * The instances for Real FFT and FFT, cosine values table and twiddle factor table are stored in an instance data structure.
- * A separate instance structure must be defined for each transform.
- * There are separate instance structure declarations for each of the 3 supported data types.
- *
- * \par Initialization Functions
- * There is also an associated initialization function for each data type.
- * The initialization function performs the following operations:
- * - Sets the values of the internal structure fields.
- * - Initializes Real FFT as its process function is used internally in DCT4, by calling arm_rfft_init_f32().
- * \par
- * Use of the initialization function is optional.
- * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
- * To place an instance structure into a const data section, the instance structure must be manually initialized.
- * Manually initialize the instance structure as follows:
- *
- * where \c N is the length of the DCT4; \c Nby2 is half of the length of the DCT4;
- * \c normalize is normalizing factor used and is equal to sqrt(2/N);
- * \c pTwiddle points to the twiddle factor table;
- * \c pCosFactor points to the cosFactor table;
- * \c pRfft points to the real FFT instance;
- * \c pCfft points to the complex FFT instance;
- * The CFFT and RFFT structures also needs to be initialized, refer to arm_cfft_radix4_f32()
- * and arm_rfft_f32() respectively for details regarding static initialization.
- *
- * \par Fixed-Point Behavior
- * Care must be taken when using the fixed-point versions of the DCT4 transform functions.
- * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
- * Refer to the function specific documentation below for usage guidelines.
+ @defgroup DCT4_IDCT4 DCT Type IV Functions
+
+ Representation of signals by minimum number of values is important for storage and transmission.
+ The possibility of large discontinuity between the beginning and end of a period of a signal
+ in DFT can be avoided by extending the signal so that it is even-symmetric.
+ Discrete Cosine Transform (DCT) is constructed such that its energy is heavily concentrated in the lower part of the
+ spectrum and is very widely used in signal and image coding applications.
+ The family of DCTs (DCT type- 1,2,3,4) is the outcome of different combinations of homogeneous boundary conditions.
+ DCT has an excellent energy-packing capability, hence has many applications and in data compression in particular.
+
+ DCT is essentially the Discrete Fourier Transform(DFT) of an even-extended real signal.
+ Reordering of the input data makes the computation of DCT just a problem of
+ computing the DFT of a real signal with a few additional operations.
+ This approach provides regular, simple, and very efficient DCT algorithms for practical hardware and software implementations.
+
+ DCT type-II can be implemented using Fast fourier transform (FFT) internally, as the transform is applied on real values, Real FFT can be used.
+ DCT4 is implemented using DCT2 as their implementations are similar except with some added pre-processing and post-processing.
+ DCT2 implementation can be described in the following steps:
+ - Re-ordering input
+ - Calculating Real FFT
+ - Multiplication of weights and Real FFT output and getting real part from the product.
+
+ This process is explained by the block diagram below:
+ \image html DCT4.gif "Discrete Cosine Transform - type-IV"
+
+ @par Algorithm
+ The N-point type-IV DCT is defined as a real, linear transformation by the formula:
+ \image html DCT4Equation.gif
+ where k = 0, 1, 2, ..., N-1
+ @par
+ Its inverse is defined as follows:
+ \image html IDCT4Equation.gif
+ where n = 0, 1, 2, ..., N-1
+ @par
+ The DCT4 matrices become involutory (i.e. they are self-inverse) by multiplying with an overall scale factor of sqrt(2/N).
+ The symmetry of the transform matrix indicates that the fast algorithms for the forward
+ and inverse transform computation are identical.
+ Note that the implementation of Inverse DCT4 and DCT4 is same, hence same process function can be used for both.
+
+ @par Lengths supported by the transform:
+ As DCT4 internally uses Real FFT, it supports all the lengths 128, 512, 2048 and 8192.
+ The library provides separate functions for Q15, Q31, and floating-point data types.
+
+ @par Instance Structure
+ The instances for Real FFT and FFT, cosine values table and twiddle factor table are stored in an instance data structure.
+ A separate instance structure must be defined for each transform.
+ There are separate instance structure declarations for each of the 3 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Initializes Real FFT as its process function is used internally in DCT4, by calling \ref arm_rfft_init_f32().
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ Manually initialize the instance structure as follows:
+
+ where \c N is the length of the DCT4; \c Nby2 is half of the length of the DCT4;
+ \c normalize is normalizing factor used and is equal to sqrt(2/N);
+ \c pTwiddle points to the twiddle factor table;
+ \c pCosFactor points to the cosFactor table;
+ \c pRfft points to the real FFT instance;
+ \c pCfft points to the complex FFT instance;
+ The CFFT and RFFT structures also needs to be initialized, refer to arm_cfft_radix4_f32()
+ and arm_rfft_f32() respectively for details regarding static initialization.
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the fixed-point versions of the DCT4 transform functions.
+ In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ Refer to the function specific documentation below for usage guidelines.
*/
/**
- * @addtogroup DCT4_IDCT4
- * @{
+ @addtogroup DCT4_IDCT4
+ @{
*/
/**
- * @brief Processing function for the floating-point DCT4/IDCT4.
- * @param[in] *S points to an instance of the floating-point DCT4/IDCT4 structure.
- * @param[in] *pState points to state buffer.
- * @param[in,out] *pInlineBuffer points to the in-place input and output buffer.
- * @return none.
+ @brief Processing function for the floating-point DCT4/IDCT4.
+ @param[in] S points to an instance of the floating-point DCT4/IDCT4 structure
+ @param[in] pState points to state buffer
+ @param[in,out] pInlineBuffer points to the in-place input and output buffer
+ @return none
*/
void arm_dct4_f32(
const arm_dct4_instance_f32 * S,
- float32_t * pState,
- float32_t * pInlineBuffer)
+ float32_t * pState,
+ float32_t * pInlineBuffer)
{
- uint32_t i; /* Loop counter */
- float32_t *weights = S->pTwiddle; /* Pointer to the Weights table */
- float32_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */
- float32_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */
- float32_t in; /* Temporary variable */
+ const float32_t *weights = S->pTwiddle; /* Pointer to the Weights table */
+ const float32_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */
+ float32_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */
+ float32_t in; /* Temporary variable */
+ uint32_t i; /* Loop counter */
/* DCT4 computation involves DCT2 (which is calculated using RFFT)
@@ -153,13 +155,13 @@ void arm_dct4_f32(
* (d) Multiplying the output with the normalizing factor sqrt(2/N).
*/
- /*-------- Pre-processing ------------*/
+ /*-------- Pre-processing ------------*/
/* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
arm_scale_f32(pInlineBuffer, 2.0f, pInlineBuffer, S->N);
arm_mult_f32(pInlineBuffer, cosFact, pInlineBuffer, S->N);
/* ----------------------------------------------------------------
- * Step1: Re-ordering of even and odd elements as,
+ * Step1: Re-ordering of even and odd elements as
* pState[i] = pInlineBuffer[2*i] and
* pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
---------------------------------------------------------------------*/
@@ -173,12 +175,11 @@ void arm_dct4_f32(
/* pbuff initialized to input buffer */
pbuff = pInlineBuffer;
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
/* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
- i = (uint32_t) S->Nby2 >> 2U;
+ i = S->Nby2 >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
@@ -199,7 +200,7 @@ void arm_dct4_f32(
*pS1++ = *pbuff++;
*pS2-- = *pbuff++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
i--;
} while (i > 0U);
@@ -210,7 +211,7 @@ void arm_dct4_f32(
pS1 = pState;
/* Initializing the loop counter to N/4 instead of N for loop unrolling */
- i = (uint32_t) S->N >> 2U;
+ i = S->N >> 2U;
/* Processing with loop unrolling 4 times as N is always multiple of 4.
* Compute 4 outputs at a time */
@@ -231,12 +232,12 @@ void arm_dct4_f32(
* Step2: Calculate RFFT for N-point input
* ---------------------------------------------------------- */
/* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
- arm_rfft_f32(S->pRfft, pInlineBuffer, pState);
+ arm_rfft_f32 (S->pRfft, pInlineBuffer, pState);
- /*----------------------------------------------------------------------
- * Step3: Multiply the FFT output with the weights.
- *----------------------------------------------------------------------*/
- arm_cmplx_mult_cmplx_f32(pState, weights, pState, S->N);
+ /*----------------------------------------------------------------------
+ * Step3: Multiply the FFT output with the weights.
+ *----------------------------------------------------------------------*/
+ arm_cmplx_mult_cmplx_f32 (pState, weights, pState, S->N);
/* ----------- Post-processing ---------- */
/* DCT-IV can be obtained from DCT-II by the equation,
@@ -245,7 +246,7 @@ void arm_dct4_f32(
/* Getting only real part from the output and Converting to DCT-IV */
/* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
- i = ((uint32_t) S->N - 1U) >> 2U;
+ i = (S->N - 1U) >> 2U;
/* pbuff initialized to input buffer. */
pbuff = pInlineBuffer;
@@ -290,7 +291,7 @@ void arm_dct4_f32(
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
- i = ((uint32_t) S->N - 1U) % 0x4U;
+ i = (S->N - 1U) % 0x4U;
while (i > 0U)
{
@@ -298,6 +299,7 @@ void arm_dct4_f32(
/* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
in = *pS1++ - in;
*pbuff++ = in;
+
/* points to the next real value */
pS1++;
@@ -306,10 +308,10 @@ void arm_dct4_f32(
}
- /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
+ /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
/* Initializing the loop counter to N/4 instead of N for loop unrolling */
- i = (uint32_t) S->N >> 2U;
+ i = S->N >> 2U;
/* pbuff initialized to the pInlineBuffer(now contains the output values) */
pbuff = pInlineBuffer;
@@ -337,10 +339,8 @@ void arm_dct4_f32(
#else
- /* Run the below code for Cortex-M0 */
-
/* Initializing the loop counter to N/2 */
- i = (uint32_t) S->Nby2;
+ i = S->Nby2;
do
{
@@ -361,7 +361,7 @@ void arm_dct4_f32(
pS1 = pState;
/* Initializing the loop counter */
- i = (uint32_t) S->N;
+ i = S->N;
do
{
@@ -377,12 +377,12 @@ void arm_dct4_f32(
* Step2: Calculate RFFT for N-point input
* ---------------------------------------------------------- */
/* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
- arm_rfft_f32(S->pRfft, pInlineBuffer, pState);
+ arm_rfft_f32 (S->pRfft, pInlineBuffer, pState);
- /*----------------------------------------------------------------------
- * Step3: Multiply the FFT output with the weights.
- *----------------------------------------------------------------------*/
- arm_cmplx_mult_cmplx_f32(pState, weights, pState, S->N);
+ /*----------------------------------------------------------------------
+ * Step3: Multiply the FFT output with the weights.
+ *----------------------------------------------------------------------*/
+ arm_cmplx_mult_cmplx_f32 (pState, weights, pState, S->N);
/* ----------- Post-processing ---------- */
/* DCT-IV can be obtained from DCT-II by the equation,
@@ -405,7 +405,7 @@ void arm_dct4_f32(
pS1++;
/* Initializing the loop counter */
- i = ((uint32_t) S->N - 1U);
+ i = (S->N - 1U);
do
{
@@ -413,21 +413,20 @@ void arm_dct4_f32(
/* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
in = *pS1++ - in;
*pbuff++ = in;
+
/* points to the next real value */
pS1++;
-
- /* Decrement the loop counter */
+ /* Decrement loop counter */
i--;
} while (i > 0U);
+ /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
- /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
+ /* Initializing loop counter */
+ i = S->N;
- /* Initializing the loop counter */
- i = (uint32_t) S->N;
-
- /* pbuff initialized to the pInlineBuffer(now contains the output values) */
+ /* pbuff initialized to the pInlineBuffer (now contains the output values) */
pbuff = pInlineBuffer;
do
@@ -436,14 +435,14 @@ void arm_dct4_f32(
in = *pbuff;
*pbuff++ = in * S->normalize;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
i--;
} while (i > 0U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
}
/**
- * @} end of DCT4_IDCT4 group
- */
+ @} end of DCT4_IDCT4 group
+ */
diff --git a/DSP/Source/TransformFunctions/arm_dct4_init_f32.c b/DSP/Source/TransformFunctions/arm_dct4_init_f32.c
index 9b39cd4..3fd70e9 100644
--- a/DSP/Source/TransformFunctions/arm_dct4_init_f32.c
+++ b/DSP/Source/TransformFunctions/arm_dct4_init_f32.c
@@ -3,13 +3,13 @@
* Title: arm_dct4_init_f32.c
* Description: Initialization function of DCT-4 & IDCT4 F32
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -27,16417 +27,35 @@
*/
#include "arm_math.h"
+#include "arm_common_tables.h"
/**
- * @ingroup DCT4_IDCT4
+ @ingroup DCT4_IDCT4
+ */
+
+
+/**
+ @addtogroup DCT4_IDCT4
+ @{
*/
/**
- * @addtogroup DCT4_IDCT4_Table DCT Type IV Tables
- * @{
- */
+ @brief Initialization function for the floating-point DCT4/IDCT4.
+ @param[in,out] S points to an instance of floating-point DCT4/IDCT4 structure
+ @param[in] S_RFFT points to an instance of floating-point RFFT/RIFFT structure
+ @param[in] S_CFFT points to an instance of floating-point CFFT/CIFFT structure
+ @param[in] N length of the DCT4
+ @param[in] Nby2 half of the length of the DCT4
+ @param[in] normalize normalizing factor.
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : N is not a supported transform length
-/*
-* @brief Weights Table
-*/
+ @par Normalizing factor
+ The normalizing factor is sqrt(2/N), which depends on the size of transform N.
+ Floating-point normalizing factors are mentioned in the table below for different DCT sizes:
-/**
- * \par
- * Weights tables are generated using the formula :
- * where fftLenReal is the length of the real transform;
- * fftLenBy2 length of the internal complex transform.
- * ifftFlagR Selects forward (=0) or inverse (=1) transform.
- * bitReverseFlagR Selects bit reversed output (=0) or normal order
- * output (=1).
- * twidCoefRModifier stride modifier for the twiddle factor table.
- * The value is based on the FFT length;
- * pTwiddleARealpoints to the A array of twiddle coefficients;
- * pTwiddleBRealpoints to the B array of twiddle coefficients;
- * pCfft points to the CFFT Instance structure. The CFFT structure
- * must also be initialized. Refer to arm_cfft_radix4_f32() for details regarding
- * static initialization of the complex FFT instance structure.
+ @defgroup RealFFT Real FFT Functions
+
+ @par
+ The CMSIS DSP library includes specialized algorithms for computing the
+ FFT of real data sequences. The FFT is defined over complex data but
+ in many applications the input is real. Real FFT algorithms take advantage
+ of the symmetry properties of the FFT and have a speed advantage over complex
+ algorithms of the same length.
+ @par
+ The Fast RFFT algorith relays on the mixed radix CFFT that save processor usage.
+ @par
+ The real length N forward FFT of a sequence is computed using the steps shown below.
+ @par
+ \image html RFFT.gif "Real Fast Fourier Transform"
+ @par
+ The real sequence is initially treated as if it were complex to perform a CFFT.
+ Later, a processing stage reshapes the data to obtain half of the frequency spectrum
+ in complex format. Except the first complex number that contains the two real numbers
+ X[0] and X[N/2] all the data is complex. In other words, the first complex sample
+ contains two real values packed.
+ @par
+ The input for the inverse RFFT should keep the same format as the output of the
+ forward RFFT. A first processing stage pre-process the data to later perform an
+ inverse CFFT.
+ @par
+ \image html RIFFT.gif "Real Inverse Fast Fourier Transform"
+ @par
+ The algorithms for floating-point, Q15, and Q31 data are slightly different
+ and we describe each algorithm in turn.
+ @par Floating-point
+ The main functions are \ref arm_rfft_fast_f32() and \ref arm_rfft_fast_init_f32().
+ The older functions \ref arm_rfft_f32() and \ref arm_rfft_init_f32() have been deprecated
+ but are still documented.
+ @par
+ The FFT of a real N-point sequence has even symmetry in the frequency domain.
+ The second half of the data equals the conjugate of the first half flipped in frequency.
+ Looking at the data, we see that we can uniquely represent the FFT using only N/2 complex numbers.
+ These are packed into the output array in alternating real and imaginary components:
+ @par
+ X = { real[0], imag[0], real[1], imag[1], real[2], imag[2] ...
+ real[(N/2)-1], imag[(N/2)-1 }
+ @par
+ It happens that the first complex number (real[0], imag[0]) is actually
+ all real. real[0] represents the DC offset, and imag[0] should be 0.
+ (real[1], imag[1]) is the fundamental frequency, (real[2], imag[2]) is
+ the first harmonic and so on.
+ @par
+ The real FFT functions pack the frequency domain data in this fashion.
+ The forward transform outputs the data in this form and the inverse
+ transform expects input data in this form. The function always performs
+ the needed bitreversal so that the input and output data is always in
+ normal order. The functions support lengths of [32, 64, 128, ..., 4096]
+ samples.
+ @par Q15 and Q31
+ The real algorithms are defined in a similar manner and utilize N/2 complex
+ transforms behind the scenes.
+ @par
+ The complex transforms used internally include scaling to prevent fixed-point
+ overflows. The overall scaling equals 1/(fftLen/2).
+ @par
+ A separate instance structure must be defined for each transform used but
+ twiddle factor and bit reversal tables can be reused.
+ @par
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Initializes twiddle factor table and bit reversal table pointers.
+ - Initializes the internal complex FFT data structure.
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure
+ cannot be placed into a const data section. To place an instance structure
+ into a const data section, the instance structure should be manually
+ initialized as follows:
+
for (i = 0; i < n; i++)
-* {
-* pBTable[2 * i] = 0.5 * (1.0 + sin (2 * PI / (double) (2 * n) * (double) i));
-* pBTable[2 * i + 1] = 0.5 * (1.0 * cos (2 * PI / (double) (2 * n) * (double) i));
-* }
-* \par
-* Convert to fixed point Q31 format
-* round(pBTable[i] * pow(2, 31))
-*
-*/
-
-const q31_t realCoefBQ31[8192] = {
- (q31_t)0x40000000, (q31_t)0x40000000, (q31_t)0x400c90fe, (q31_t)0x3ffffec4,
- (q31_t)0x401921fb, (q31_t)0x3ffffb11, (q31_t)0x4025b2f7, (q31_t)0x3ffff4e6,
- (q31_t)0x403243f1, (q31_t)0x3fffec43, (q31_t)0x403ed4ea, (q31_t)0x3fffe128,
- (q31_t)0x404b65e1, (q31_t)0x3fffd396, (q31_t)0x4057f6d4, (q31_t)0x3fffc38c,
- (q31_t)0x406487c4, (q31_t)0x3fffb10b, (q31_t)0x407118b0, (q31_t)0x3fff9c12,
- (q31_t)0x407da998, (q31_t)0x3fff84a1, (q31_t)0x408a3a7b, (q31_t)0x3fff6ab9,
- (q31_t)0x4096cb58, (q31_t)0x3fff4e59, (q31_t)0x40a35c30, (q31_t)0x3fff2f82,
- (q31_t)0x40afed02, (q31_t)0x3fff0e32, (q31_t)0x40bc7dcc, (q31_t)0x3ffeea6c,
- (q31_t)0x40c90e90, (q31_t)0x3ffec42d, (q31_t)0x40d59f4c, (q31_t)0x3ffe9b77,
- (q31_t)0x40e22fff, (q31_t)0x3ffe704a, (q31_t)0x40eec0aa, (q31_t)0x3ffe42a4,
- (q31_t)0x40fb514b, (q31_t)0x3ffe1288, (q31_t)0x4107e1e3, (q31_t)0x3ffddff3,
- (q31_t)0x41147271, (q31_t)0x3ffdaae7, (q31_t)0x412102f4, (q31_t)0x3ffd7364,
- (q31_t)0x412d936c, (q31_t)0x3ffd3969, (q31_t)0x413a23d8, (q31_t)0x3ffcfcf6,
- (q31_t)0x4146b438, (q31_t)0x3ffcbe0c, (q31_t)0x4153448c, (q31_t)0x3ffc7caa,
- (q31_t)0x415fd4d2, (q31_t)0x3ffc38d1, (q31_t)0x416c650b, (q31_t)0x3ffbf280,
- (q31_t)0x4178f536, (q31_t)0x3ffba9b8, (q31_t)0x41858552, (q31_t)0x3ffb5e78,
- (q31_t)0x4192155f, (q31_t)0x3ffb10c1, (q31_t)0x419ea55d, (q31_t)0x3ffac092,
- (q31_t)0x41ab354b, (q31_t)0x3ffa6dec, (q31_t)0x41b7c528, (q31_t)0x3ffa18cf,
- (q31_t)0x41c454f5, (q31_t)0x3ff9c13a, (q31_t)0x41d0e4b0, (q31_t)0x3ff9672d,
- (q31_t)0x41dd7459, (q31_t)0x3ff90aaa, (q31_t)0x41ea03ef, (q31_t)0x3ff8abae,
- (q31_t)0x41f69373, (q31_t)0x3ff84a3c, (q31_t)0x420322e3, (q31_t)0x3ff7e652,
- (q31_t)0x420fb240, (q31_t)0x3ff77ff1, (q31_t)0x421c4188, (q31_t)0x3ff71718,
- (q31_t)0x4228d0bb, (q31_t)0x3ff6abc8, (q31_t)0x42355fd9, (q31_t)0x3ff63e01,
- (q31_t)0x4241eee2, (q31_t)0x3ff5cdc3, (q31_t)0x424e7dd4, (q31_t)0x3ff55b0d,
- (q31_t)0x425b0caf, (q31_t)0x3ff4e5e0, (q31_t)0x42679b73, (q31_t)0x3ff46e3c,
- (q31_t)0x42742a1f, (q31_t)0x3ff3f420, (q31_t)0x4280b8b3, (q31_t)0x3ff3778e,
- (q31_t)0x428d472e, (q31_t)0x3ff2f884, (q31_t)0x4299d590, (q31_t)0x3ff27703,
- (q31_t)0x42a663d8, (q31_t)0x3ff1f30b, (q31_t)0x42b2f207, (q31_t)0x3ff16c9c,
- (q31_t)0x42bf801a, (q31_t)0x3ff0e3b6, (q31_t)0x42cc0e13, (q31_t)0x3ff05858,
- (q31_t)0x42d89bf0, (q31_t)0x3fefca84, (q31_t)0x42e529b0, (q31_t)0x3fef3a39,
- (q31_t)0x42f1b755, (q31_t)0x3feea776, (q31_t)0x42fe44dc, (q31_t)0x3fee123d,
- (q31_t)0x430ad245, (q31_t)0x3fed7a8c, (q31_t)0x43175f91, (q31_t)0x3fece065,
- (q31_t)0x4323ecbe, (q31_t)0x3fec43c7, (q31_t)0x433079cc, (q31_t)0x3feba4b2,
- (q31_t)0x433d06bb, (q31_t)0x3feb0326, (q31_t)0x43499389, (q31_t)0x3fea5f23,
- (q31_t)0x43562038, (q31_t)0x3fe9b8a9, (q31_t)0x4362acc5, (q31_t)0x3fe90fb9,
- (q31_t)0x436f3931, (q31_t)0x3fe86452, (q31_t)0x437bc57b, (q31_t)0x3fe7b674,
- (q31_t)0x438851a2, (q31_t)0x3fe7061f, (q31_t)0x4394dda7, (q31_t)0x3fe65354,
- (q31_t)0x43a16988, (q31_t)0x3fe59e12, (q31_t)0x43adf546, (q31_t)0x3fe4e659,
- (q31_t)0x43ba80df, (q31_t)0x3fe42c2a, (q31_t)0x43c70c54, (q31_t)0x3fe36f84,
- (q31_t)0x43d397a3, (q31_t)0x3fe2b067, (q31_t)0x43e022cc, (q31_t)0x3fe1eed5,
- (q31_t)0x43ecadcf, (q31_t)0x3fe12acb, (q31_t)0x43f938ac, (q31_t)0x3fe0644b,
- (q31_t)0x4405c361, (q31_t)0x3fdf9b55, (q31_t)0x44124dee, (q31_t)0x3fdecfe8,
- (q31_t)0x441ed854, (q31_t)0x3fde0205, (q31_t)0x442b6290, (q31_t)0x3fdd31ac,
- (q31_t)0x4437eca4, (q31_t)0x3fdc5edc, (q31_t)0x4444768d, (q31_t)0x3fdb8996,
- (q31_t)0x4451004d, (q31_t)0x3fdab1d9, (q31_t)0x445d89e2, (q31_t)0x3fd9d7a7,
- (q31_t)0x446a134c, (q31_t)0x3fd8fafe, (q31_t)0x44769c8b, (q31_t)0x3fd81bdf,
- (q31_t)0x4483259d, (q31_t)0x3fd73a4a, (q31_t)0x448fae83, (q31_t)0x3fd6563f,
- (q31_t)0x449c373c, (q31_t)0x3fd56fbe, (q31_t)0x44a8bfc7, (q31_t)0x3fd486c7,
- (q31_t)0x44b54825, (q31_t)0x3fd39b5a, (q31_t)0x44c1d054, (q31_t)0x3fd2ad77,
- (q31_t)0x44ce5854, (q31_t)0x3fd1bd1e, (q31_t)0x44dae024, (q31_t)0x3fd0ca4f,
- (q31_t)0x44e767c5, (q31_t)0x3fcfd50b, (q31_t)0x44f3ef35, (q31_t)0x3fcedd50,
- (q31_t)0x45007674, (q31_t)0x3fcde320, (q31_t)0x450cfd82, (q31_t)0x3fcce67a,
- (q31_t)0x4519845e, (q31_t)0x3fcbe75e, (q31_t)0x45260b08, (q31_t)0x3fcae5cd,
- (q31_t)0x4532917f, (q31_t)0x3fc9e1c6, (q31_t)0x453f17c3, (q31_t)0x3fc8db4a,
- (q31_t)0x454b9dd3, (q31_t)0x3fc7d258, (q31_t)0x455823ae, (q31_t)0x3fc6c6f0,
- (q31_t)0x4564a955, (q31_t)0x3fc5b913, (q31_t)0x45712ec7, (q31_t)0x3fc4a8c1,
- (q31_t)0x457db403, (q31_t)0x3fc395f9, (q31_t)0x458a3908, (q31_t)0x3fc280bc,
- (q31_t)0x4596bdd7, (q31_t)0x3fc1690a, (q31_t)0x45a3426f, (q31_t)0x3fc04ee3,
- (q31_t)0x45afc6d0, (q31_t)0x3fbf3246, (q31_t)0x45bc4af8, (q31_t)0x3fbe1334,
- (q31_t)0x45c8cee7, (q31_t)0x3fbcf1ad, (q31_t)0x45d5529e, (q31_t)0x3fbbcdb1,
- (q31_t)0x45e1d61b, (q31_t)0x3fbaa740, (q31_t)0x45ee595d, (q31_t)0x3fb97e5a,
- (q31_t)0x45fadc66, (q31_t)0x3fb852ff, (q31_t)0x46075f33, (q31_t)0x3fb7252f,
- (q31_t)0x4613e1c5, (q31_t)0x3fb5f4ea, (q31_t)0x4620641a, (q31_t)0x3fb4c231,
- (q31_t)0x462ce634, (q31_t)0x3fb38d02, (q31_t)0x46396810, (q31_t)0x3fb2555f,
- (q31_t)0x4645e9af, (q31_t)0x3fb11b48, (q31_t)0x46526b10, (q31_t)0x3fafdebb,
- (q31_t)0x465eec33, (q31_t)0x3fae9fbb, (q31_t)0x466b6d16, (q31_t)0x3fad5e45,
- (q31_t)0x4677edbb, (q31_t)0x3fac1a5b, (q31_t)0x46846e1f, (q31_t)0x3faad3fd,
- (q31_t)0x4690ee44, (q31_t)0x3fa98b2a, (q31_t)0x469d6e27, (q31_t)0x3fa83fe3,
- (q31_t)0x46a9edc9, (q31_t)0x3fa6f228, (q31_t)0x46b66d29, (q31_t)0x3fa5a1f9,
- (q31_t)0x46c2ec48, (q31_t)0x3fa44f55, (q31_t)0x46cf6b23, (q31_t)0x3fa2fa3d,
- (q31_t)0x46dbe9bb, (q31_t)0x3fa1a2b2, (q31_t)0x46e86810, (q31_t)0x3fa048b2,
- (q31_t)0x46f4e620, (q31_t)0x3f9eec3e, (q31_t)0x470163eb, (q31_t)0x3f9d8d56,
- (q31_t)0x470de172, (q31_t)0x3f9c2bfb, (q31_t)0x471a5eb3, (q31_t)0x3f9ac82c,
- (q31_t)0x4726dbae, (q31_t)0x3f9961e8, (q31_t)0x47335862, (q31_t)0x3f97f932,
- (q31_t)0x473fd4cf, (q31_t)0x3f968e07, (q31_t)0x474c50f4, (q31_t)0x3f952069,
- (q31_t)0x4758ccd2, (q31_t)0x3f93b058, (q31_t)0x47654867, (q31_t)0x3f923dd2,
- (q31_t)0x4771c3b3, (q31_t)0x3f90c8da, (q31_t)0x477e3eb5, (q31_t)0x3f8f516e,
- (q31_t)0x478ab96e, (q31_t)0x3f8dd78f, (q31_t)0x479733dc, (q31_t)0x3f8c5b3d,
- (q31_t)0x47a3adff, (q31_t)0x3f8adc77, (q31_t)0x47b027d7, (q31_t)0x3f895b3e,
- (q31_t)0x47bca163, (q31_t)0x3f87d792, (q31_t)0x47c91aa3, (q31_t)0x3f865174,
- (q31_t)0x47d59396, (q31_t)0x3f84c8e2, (q31_t)0x47e20c3b, (q31_t)0x3f833ddd,
- (q31_t)0x47ee8493, (q31_t)0x3f81b065, (q31_t)0x47fafc9c, (q31_t)0x3f80207b,
- (q31_t)0x48077457, (q31_t)0x3f7e8e1e, (q31_t)0x4813ebc2, (q31_t)0x3f7cf94e,
- (q31_t)0x482062de, (q31_t)0x3f7b620c, (q31_t)0x482cd9a9, (q31_t)0x3f79c857,
- (q31_t)0x48395024, (q31_t)0x3f782c30, (q31_t)0x4845c64d, (q31_t)0x3f768d96,
- (q31_t)0x48523c25, (q31_t)0x3f74ec8a, (q31_t)0x485eb1ab, (q31_t)0x3f73490b,
- (q31_t)0x486b26de, (q31_t)0x3f71a31b, (q31_t)0x48779bbe, (q31_t)0x3f6ffab8,
- (q31_t)0x4884104b, (q31_t)0x3f6e4fe3, (q31_t)0x48908483, (q31_t)0x3f6ca29c,
- (q31_t)0x489cf867, (q31_t)0x3f6af2e3, (q31_t)0x48a96bf6, (q31_t)0x3f6940b8,
- (q31_t)0x48b5df30, (q31_t)0x3f678c1c, (q31_t)0x48c25213, (q31_t)0x3f65d50d,
- (q31_t)0x48cec4a0, (q31_t)0x3f641b8d, (q31_t)0x48db36d6, (q31_t)0x3f625f9b,
- (q31_t)0x48e7a8b5, (q31_t)0x3f60a138, (q31_t)0x48f41a3c, (q31_t)0x3f5ee063,
- (q31_t)0x49008b6a, (q31_t)0x3f5d1d1d, (q31_t)0x490cfc40, (q31_t)0x3f5b5765,
- (q31_t)0x49196cbc, (q31_t)0x3f598f3c, (q31_t)0x4925dcdf, (q31_t)0x3f57c4a2,
- (q31_t)0x49324ca7, (q31_t)0x3f55f796, (q31_t)0x493ebc14, (q31_t)0x3f54281a,
- (q31_t)0x494b2b27, (q31_t)0x3f52562c, (q31_t)0x495799dd, (q31_t)0x3f5081cd,
- (q31_t)0x49640837, (q31_t)0x3f4eaafe, (q31_t)0x49707635, (q31_t)0x3f4cd1be,
- (q31_t)0x497ce3d5, (q31_t)0x3f4af60d, (q31_t)0x49895118, (q31_t)0x3f4917eb,
- (q31_t)0x4995bdfd, (q31_t)0x3f473759, (q31_t)0x49a22a83, (q31_t)0x3f455456,
- (q31_t)0x49ae96aa, (q31_t)0x3f436ee3, (q31_t)0x49bb0271, (q31_t)0x3f4186ff,
- (q31_t)0x49c76dd8, (q31_t)0x3f3f9cab, (q31_t)0x49d3d8df, (q31_t)0x3f3dafe7,
- (q31_t)0x49e04385, (q31_t)0x3f3bc0b3, (q31_t)0x49ecadc9, (q31_t)0x3f39cf0e,
- (q31_t)0x49f917ac, (q31_t)0x3f37dafa, (q31_t)0x4a05812c, (q31_t)0x3f35e476,
- (q31_t)0x4a11ea49, (q31_t)0x3f33eb81, (q31_t)0x4a1e5303, (q31_t)0x3f31f01d,
- (q31_t)0x4a2abb59, (q31_t)0x3f2ff24a, (q31_t)0x4a37234a, (q31_t)0x3f2df206,
- (q31_t)0x4a438ad7, (q31_t)0x3f2bef53, (q31_t)0x4a4ff1fe, (q31_t)0x3f29ea31,
- (q31_t)0x4a5c58c0, (q31_t)0x3f27e29f, (q31_t)0x4a68bf1b, (q31_t)0x3f25d89e,
- (q31_t)0x4a752510, (q31_t)0x3f23cc2e, (q31_t)0x4a818a9d, (q31_t)0x3f21bd4e,
- (q31_t)0x4a8defc3, (q31_t)0x3f1fabff, (q31_t)0x4a9a5480, (q31_t)0x3f1d9842,
- (q31_t)0x4aa6b8d5, (q31_t)0x3f1b8215, (q31_t)0x4ab31cc1, (q31_t)0x3f19697a,
- (q31_t)0x4abf8043, (q31_t)0x3f174e70, (q31_t)0x4acbe35b, (q31_t)0x3f1530f7,
- (q31_t)0x4ad84609, (q31_t)0x3f13110f, (q31_t)0x4ae4a84b, (q31_t)0x3f10eeb9,
- (q31_t)0x4af10a22, (q31_t)0x3f0ec9f5, (q31_t)0x4afd6b8d, (q31_t)0x3f0ca2c2,
- (q31_t)0x4b09cc8c, (q31_t)0x3f0a7921, (q31_t)0x4b162d1d, (q31_t)0x3f084d12,
- (q31_t)0x4b228d42, (q31_t)0x3f061e95, (q31_t)0x4b2eecf8, (q31_t)0x3f03eda9,
- (q31_t)0x4b3b4c40, (q31_t)0x3f01ba50, (q31_t)0x4b47ab19, (q31_t)0x3eff8489,
- (q31_t)0x4b540982, (q31_t)0x3efd4c54, (q31_t)0x4b60677c, (q31_t)0x3efb11b1,
- (q31_t)0x4b6cc506, (q31_t)0x3ef8d4a1, (q31_t)0x4b79221f, (q31_t)0x3ef69523,
- (q31_t)0x4b857ec7, (q31_t)0x3ef45338, (q31_t)0x4b91dafc, (q31_t)0x3ef20ee0,
- (q31_t)0x4b9e36c0, (q31_t)0x3eefc81a, (q31_t)0x4baa9211, (q31_t)0x3eed7ee7,
- (q31_t)0x4bb6ecef, (q31_t)0x3eeb3347, (q31_t)0x4bc34759, (q31_t)0x3ee8e53a,
- (q31_t)0x4bcfa150, (q31_t)0x3ee694c1, (q31_t)0x4bdbfad1, (q31_t)0x3ee441da,
- (q31_t)0x4be853de, (q31_t)0x3ee1ec87, (q31_t)0x4bf4ac75, (q31_t)0x3edf94c7,
- (q31_t)0x4c010496, (q31_t)0x3edd3a9a, (q31_t)0x4c0d5c41, (q31_t)0x3edade01,
- (q31_t)0x4c19b374, (q31_t)0x3ed87efc, (q31_t)0x4c260a31, (q31_t)0x3ed61d8a,
- (q31_t)0x4c326075, (q31_t)0x3ed3b9ad, (q31_t)0x4c3eb641, (q31_t)0x3ed15363,
- (q31_t)0x4c4b0b94, (q31_t)0x3eceeaad, (q31_t)0x4c57606e, (q31_t)0x3ecc7f8b,
- (q31_t)0x4c63b4ce, (q31_t)0x3eca11fe, (q31_t)0x4c7008b3, (q31_t)0x3ec7a205,
- (q31_t)0x4c7c5c1e, (q31_t)0x3ec52fa0, (q31_t)0x4c88af0e, (q31_t)0x3ec2bad0,
- (q31_t)0x4c950182, (q31_t)0x3ec04394, (q31_t)0x4ca1537a, (q31_t)0x3ebdc9ed,
- (q31_t)0x4cada4f5, (q31_t)0x3ebb4ddb, (q31_t)0x4cb9f5f3, (q31_t)0x3eb8cf5d,
- (q31_t)0x4cc64673, (q31_t)0x3eb64e75, (q31_t)0x4cd29676, (q31_t)0x3eb3cb21,
- (q31_t)0x4cdee5f9, (q31_t)0x3eb14563, (q31_t)0x4ceb34fe, (q31_t)0x3eaebd3a,
- (q31_t)0x4cf78383, (q31_t)0x3eac32a6, (q31_t)0x4d03d189, (q31_t)0x3ea9a5a8,
- (q31_t)0x4d101f0e, (q31_t)0x3ea7163f, (q31_t)0x4d1c6c11, (q31_t)0x3ea4846c,
- (q31_t)0x4d28b894, (q31_t)0x3ea1f02f, (q31_t)0x4d350495, (q31_t)0x3e9f5988,
- (q31_t)0x4d415013, (q31_t)0x3e9cc076, (q31_t)0x4d4d9b0e, (q31_t)0x3e9a24fb,
- (q31_t)0x4d59e586, (q31_t)0x3e978715, (q31_t)0x4d662f7b, (q31_t)0x3e94e6c6,
- (q31_t)0x4d7278eb, (q31_t)0x3e92440d, (q31_t)0x4d7ec1d6, (q31_t)0x3e8f9eeb,
- (q31_t)0x4d8b0a3d, (q31_t)0x3e8cf75f, (q31_t)0x4d97521d, (q31_t)0x3e8a4d6a,
- (q31_t)0x4da39978, (q31_t)0x3e87a10c, (q31_t)0x4dafe04b, (q31_t)0x3e84f245,
- (q31_t)0x4dbc2698, (q31_t)0x3e824114, (q31_t)0x4dc86c5d, (q31_t)0x3e7f8d7b,
- (q31_t)0x4dd4b19a, (q31_t)0x3e7cd778, (q31_t)0x4de0f64f, (q31_t)0x3e7a1f0d,
- (q31_t)0x4ded3a7b, (q31_t)0x3e77643a, (q31_t)0x4df97e1d, (q31_t)0x3e74a6fd,
- (q31_t)0x4e05c135, (q31_t)0x3e71e759, (q31_t)0x4e1203c3, (q31_t)0x3e6f254c,
- (q31_t)0x4e1e45c6, (q31_t)0x3e6c60d7, (q31_t)0x4e2a873e, (q31_t)0x3e6999fa,
- (q31_t)0x4e36c82a, (q31_t)0x3e66d0b4, (q31_t)0x4e430889, (q31_t)0x3e640507,
- (q31_t)0x4e4f485c, (q31_t)0x3e6136f3, (q31_t)0x4e5b87a2, (q31_t)0x3e5e6676,
- (q31_t)0x4e67c65a, (q31_t)0x3e5b9392, (q31_t)0x4e740483, (q31_t)0x3e58be47,
- (q31_t)0x4e80421e, (q31_t)0x3e55e694, (q31_t)0x4e8c7f2a, (q31_t)0x3e530c7a,
- (q31_t)0x4e98bba7, (q31_t)0x3e502ff9, (q31_t)0x4ea4f793, (q31_t)0x3e4d5110,
- (q31_t)0x4eb132ef, (q31_t)0x3e4a6fc1, (q31_t)0x4ebd6db9, (q31_t)0x3e478c0b,
- (q31_t)0x4ec9a7f3, (q31_t)0x3e44a5ef, (q31_t)0x4ed5e19a, (q31_t)0x3e41bd6c,
- (q31_t)0x4ee21aaf, (q31_t)0x3e3ed282, (q31_t)0x4eee5331, (q31_t)0x3e3be532,
- (q31_t)0x4efa8b20, (q31_t)0x3e38f57c, (q31_t)0x4f06c27a, (q31_t)0x3e360360,
- (q31_t)0x4f12f941, (q31_t)0x3e330ede, (q31_t)0x4f1f2f73, (q31_t)0x3e3017f6,
- (q31_t)0x4f2b650f, (q31_t)0x3e2d1ea8, (q31_t)0x4f379a16, (q31_t)0x3e2a22f4,
- (q31_t)0x4f43ce86, (q31_t)0x3e2724db, (q31_t)0x4f500260, (q31_t)0x3e24245d,
- (q31_t)0x4f5c35a3, (q31_t)0x3e212179, (q31_t)0x4f68684e, (q31_t)0x3e1e1c30,
- (q31_t)0x4f749a61, (q31_t)0x3e1b1482, (q31_t)0x4f80cbdc, (q31_t)0x3e180a6f,
- (q31_t)0x4f8cfcbe, (q31_t)0x3e14fdf7, (q31_t)0x4f992d06, (q31_t)0x3e11ef1b,
- (q31_t)0x4fa55cb4, (q31_t)0x3e0eddd9, (q31_t)0x4fb18bc8, (q31_t)0x3e0bca34,
- (q31_t)0x4fbdba40, (q31_t)0x3e08b42a, (q31_t)0x4fc9e81e, (q31_t)0x3e059bbb,
- (q31_t)0x4fd6155f, (q31_t)0x3e0280e9, (q31_t)0x4fe24205, (q31_t)0x3dff63b2,
- (q31_t)0x4fee6e0d, (q31_t)0x3dfc4418, (q31_t)0x4ffa9979, (q31_t)0x3df9221a,
- (q31_t)0x5006c446, (q31_t)0x3df5fdb8, (q31_t)0x5012ee76, (q31_t)0x3df2d6f3,
- (q31_t)0x501f1807, (q31_t)0x3defadca, (q31_t)0x502b40f8, (q31_t)0x3dec823e,
- (q31_t)0x5037694b, (q31_t)0x3de9544f, (q31_t)0x504390fd, (q31_t)0x3de623fd,
- (q31_t)0x504fb80e, (q31_t)0x3de2f148, (q31_t)0x505bde7f, (q31_t)0x3ddfbc30,
- (q31_t)0x5068044e, (q31_t)0x3ddc84b5, (q31_t)0x5074297b, (q31_t)0x3dd94ad8,
- (q31_t)0x50804e06, (q31_t)0x3dd60e99, (q31_t)0x508c71ee, (q31_t)0x3dd2cff7,
- (q31_t)0x50989532, (q31_t)0x3dcf8ef3, (q31_t)0x50a4b7d3, (q31_t)0x3dcc4b8d,
- (q31_t)0x50b0d9d0, (q31_t)0x3dc905c5, (q31_t)0x50bcfb28, (q31_t)0x3dc5bd9b,
- (q31_t)0x50c91bda, (q31_t)0x3dc2730f, (q31_t)0x50d53be7, (q31_t)0x3dbf2622,
- (q31_t)0x50e15b4e, (q31_t)0x3dbbd6d4, (q31_t)0x50ed7a0e, (q31_t)0x3db88524,
- (q31_t)0x50f99827, (q31_t)0x3db53113, (q31_t)0x5105b599, (q31_t)0x3db1daa2,
- (q31_t)0x5111d263, (q31_t)0x3dae81cf, (q31_t)0x511dee84, (q31_t)0x3dab269b,
- (q31_t)0x512a09fc, (q31_t)0x3da7c907, (q31_t)0x513624cb, (q31_t)0x3da46912,
- (q31_t)0x51423ef0, (q31_t)0x3da106bd, (q31_t)0x514e586a, (q31_t)0x3d9da208,
- (q31_t)0x515a713a, (q31_t)0x3d9a3af2, (q31_t)0x5166895f, (q31_t)0x3d96d17d,
- (q31_t)0x5172a0d7, (q31_t)0x3d9365a8, (q31_t)0x517eb7a4, (q31_t)0x3d8ff772,
- (q31_t)0x518acdc4, (q31_t)0x3d8c86de, (q31_t)0x5196e337, (q31_t)0x3d8913ea,
- (q31_t)0x51a2f7fc, (q31_t)0x3d859e96, (q31_t)0x51af0c13, (q31_t)0x3d8226e4,
- (q31_t)0x51bb1f7c, (q31_t)0x3d7eacd2, (q31_t)0x51c73235, (q31_t)0x3d7b3061,
- (q31_t)0x51d3443f, (q31_t)0x3d77b192, (q31_t)0x51df5599, (q31_t)0x3d743064,
- (q31_t)0x51eb6643, (q31_t)0x3d70acd7, (q31_t)0x51f7763c, (q31_t)0x3d6d26ec,
- (q31_t)0x52038584, (q31_t)0x3d699ea3, (q31_t)0x520f941a, (q31_t)0x3d6613fb,
- (q31_t)0x521ba1fd, (q31_t)0x3d6286f6, (q31_t)0x5227af2e, (q31_t)0x3d5ef793,
- (q31_t)0x5233bbac, (q31_t)0x3d5b65d2, (q31_t)0x523fc776, (q31_t)0x3d57d1b3,
- (q31_t)0x524bd28c, (q31_t)0x3d543b37, (q31_t)0x5257dced, (q31_t)0x3d50a25e,
- (q31_t)0x5263e699, (q31_t)0x3d4d0728, (q31_t)0x526fef90, (q31_t)0x3d496994,
- (q31_t)0x527bf7d1, (q31_t)0x3d45c9a4, (q31_t)0x5287ff5b, (q31_t)0x3d422757,
- (q31_t)0x5294062f, (q31_t)0x3d3e82ae, (q31_t)0x52a00c4b, (q31_t)0x3d3adba7,
- (q31_t)0x52ac11af, (q31_t)0x3d373245, (q31_t)0x52b8165b, (q31_t)0x3d338687,
- (q31_t)0x52c41a4f, (q31_t)0x3d2fd86c, (q31_t)0x52d01d89, (q31_t)0x3d2c27f6,
- (q31_t)0x52dc2009, (q31_t)0x3d287523, (q31_t)0x52e821cf, (q31_t)0x3d24bff6,
- (q31_t)0x52f422db, (q31_t)0x3d21086c, (q31_t)0x5300232c, (q31_t)0x3d1d4e88,
- (q31_t)0x530c22c1, (q31_t)0x3d199248, (q31_t)0x5318219a, (q31_t)0x3d15d3ad,
- (q31_t)0x53241fb6, (q31_t)0x3d1212b7, (q31_t)0x53301d16, (q31_t)0x3d0e4f67,
- (q31_t)0x533c19b8, (q31_t)0x3d0a89bc, (q31_t)0x5348159d, (q31_t)0x3d06c1b6,
- (q31_t)0x535410c3, (q31_t)0x3d02f757, (q31_t)0x53600b2a, (q31_t)0x3cff2a9d,
- (q31_t)0x536c04d2, (q31_t)0x3cfb5b89, (q31_t)0x5377fdbb, (q31_t)0x3cf78a1b,
- (q31_t)0x5383f5e3, (q31_t)0x3cf3b653, (q31_t)0x538fed4b, (q31_t)0x3cefe032,
- (q31_t)0x539be3f2, (q31_t)0x3cec07b8, (q31_t)0x53a7d9d7, (q31_t)0x3ce82ce4,
- (q31_t)0x53b3cefa, (q31_t)0x3ce44fb7, (q31_t)0x53bfc35b, (q31_t)0x3ce07031,
- (q31_t)0x53cbb6f8, (q31_t)0x3cdc8e52, (q31_t)0x53d7a9d3, (q31_t)0x3cd8aa1b,
- (q31_t)0x53e39be9, (q31_t)0x3cd4c38b, (q31_t)0x53ef8d3c, (q31_t)0x3cd0daa2,
- (q31_t)0x53fb7dc9, (q31_t)0x3cccef62, (q31_t)0x54076d91, (q31_t)0x3cc901c9,
- (q31_t)0x54135c94, (q31_t)0x3cc511d9, (q31_t)0x541f4ad1, (q31_t)0x3cc11f90,
- (q31_t)0x542b3846, (q31_t)0x3cbd2af0, (q31_t)0x543724f5, (q31_t)0x3cb933f9,
- (q31_t)0x544310dd, (q31_t)0x3cb53aaa, (q31_t)0x544efbfc, (q31_t)0x3cb13f04,
- (q31_t)0x545ae653, (q31_t)0x3cad4107, (q31_t)0x5466cfe1, (q31_t)0x3ca940b3,
- (q31_t)0x5472b8a5, (q31_t)0x3ca53e09, (q31_t)0x547ea0a0, (q31_t)0x3ca13908,
- (q31_t)0x548a87d1, (q31_t)0x3c9d31b0, (q31_t)0x54966e36, (q31_t)0x3c992803,
- (q31_t)0x54a253d1, (q31_t)0x3c951bff, (q31_t)0x54ae38a0, (q31_t)0x3c910da5,
- (q31_t)0x54ba1ca3, (q31_t)0x3c8cfcf6, (q31_t)0x54c5ffd9, (q31_t)0x3c88e9f1,
- (q31_t)0x54d1e242, (q31_t)0x3c84d496, (q31_t)0x54ddc3de, (q31_t)0x3c80bce7,
- (q31_t)0x54e9a4ac, (q31_t)0x3c7ca2e2, (q31_t)0x54f584ac, (q31_t)0x3c788688,
- (q31_t)0x550163dc, (q31_t)0x3c7467d9, (q31_t)0x550d423d, (q31_t)0x3c7046d6,
- (q31_t)0x55191fcf, (q31_t)0x3c6c237e, (q31_t)0x5524fc90, (q31_t)0x3c67fdd1,
- (q31_t)0x5530d881, (q31_t)0x3c63d5d1, (q31_t)0x553cb3a0, (q31_t)0x3c5fab7c,
- (q31_t)0x55488dee, (q31_t)0x3c5b7ed4, (q31_t)0x5554676a, (q31_t)0x3c574fd8,
- (q31_t)0x55604013, (q31_t)0x3c531e88, (q31_t)0x556c17e9, (q31_t)0x3c4eeae5,
- (q31_t)0x5577eeec, (q31_t)0x3c4ab4ef, (q31_t)0x5583c51b, (q31_t)0x3c467ca6,
- (q31_t)0x558f9a76, (q31_t)0x3c42420a, (q31_t)0x559b6efb, (q31_t)0x3c3e051b,
- (q31_t)0x55a742ac, (q31_t)0x3c39c5da, (q31_t)0x55b31587, (q31_t)0x3c358446,
- (q31_t)0x55bee78c, (q31_t)0x3c314060, (q31_t)0x55cab8ba, (q31_t)0x3c2cfa28,
- (q31_t)0x55d68911, (q31_t)0x3c28b19e, (q31_t)0x55e25890, (q31_t)0x3c2466c2,
- (q31_t)0x55ee2738, (q31_t)0x3c201994, (q31_t)0x55f9f507, (q31_t)0x3c1bca16,
- (q31_t)0x5605c1fd, (q31_t)0x3c177845, (q31_t)0x56118e1a, (q31_t)0x3c132424,
- (q31_t)0x561d595d, (q31_t)0x3c0ecdb2, (q31_t)0x562923c5, (q31_t)0x3c0a74f0,
- (q31_t)0x5634ed53, (q31_t)0x3c0619dc, (q31_t)0x5640b606, (q31_t)0x3c01bc78,
- (q31_t)0x564c7ddd, (q31_t)0x3bfd5cc4, (q31_t)0x565844d8, (q31_t)0x3bf8fac0,
- (q31_t)0x56640af7, (q31_t)0x3bf4966c, (q31_t)0x566fd039, (q31_t)0x3bf02fc9,
- (q31_t)0x567b949d, (q31_t)0x3bebc6d5, (q31_t)0x56875823, (q31_t)0x3be75b93,
- (q31_t)0x56931acb, (q31_t)0x3be2ee01, (q31_t)0x569edc94, (q31_t)0x3bde7e20,
- (q31_t)0x56aa9d7e, (q31_t)0x3bda0bf0, (q31_t)0x56b65d88, (q31_t)0x3bd59771,
- (q31_t)0x56c21cb2, (q31_t)0x3bd120a4, (q31_t)0x56cddafb, (q31_t)0x3bcca789,
- (q31_t)0x56d99864, (q31_t)0x3bc82c1f, (q31_t)0x56e554ea, (q31_t)0x3bc3ae67,
- (q31_t)0x56f1108f, (q31_t)0x3bbf2e62, (q31_t)0x56fccb51, (q31_t)0x3bbaac0e,
- (q31_t)0x57088531, (q31_t)0x3bb6276e, (q31_t)0x57143e2d, (q31_t)0x3bb1a080,
- (q31_t)0x571ff646, (q31_t)0x3bad1744, (q31_t)0x572bad7a, (q31_t)0x3ba88bbc,
- (q31_t)0x573763c9, (q31_t)0x3ba3fde7, (q31_t)0x57431933, (q31_t)0x3b9f6dc5,
- (q31_t)0x574ecdb8, (q31_t)0x3b9adb57, (q31_t)0x575a8157, (q31_t)0x3b96469d,
- (q31_t)0x5766340f, (q31_t)0x3b91af97, (q31_t)0x5771e5e0, (q31_t)0x3b8d1644,
- (q31_t)0x577d96ca, (q31_t)0x3b887aa6, (q31_t)0x578946cc, (q31_t)0x3b83dcbc,
- (q31_t)0x5794f5e6, (q31_t)0x3b7f3c87, (q31_t)0x57a0a417, (q31_t)0x3b7a9a07,
- (q31_t)0x57ac515f, (q31_t)0x3b75f53c, (q31_t)0x57b7fdbd, (q31_t)0x3b714e25,
- (q31_t)0x57c3a931, (q31_t)0x3b6ca4c4, (q31_t)0x57cf53bb, (q31_t)0x3b67f919,
- (q31_t)0x57dafd59, (q31_t)0x3b634b23, (q31_t)0x57e6a60c, (q31_t)0x3b5e9ae4,
- (q31_t)0x57f24dd3, (q31_t)0x3b59e85a, (q31_t)0x57fdf4ae, (q31_t)0x3b553386,
- (q31_t)0x58099a9c, (q31_t)0x3b507c69, (q31_t)0x58153f9d, (q31_t)0x3b4bc303,
- (q31_t)0x5820e3b0, (q31_t)0x3b470753, (q31_t)0x582c86d5, (q31_t)0x3b42495a,
- (q31_t)0x5838290c, (q31_t)0x3b3d8918, (q31_t)0x5843ca53, (q31_t)0x3b38c68e,
- (q31_t)0x584f6aab, (q31_t)0x3b3401bb, (q31_t)0x585b0a13, (q31_t)0x3b2f3aa0,
- (q31_t)0x5866a88a, (q31_t)0x3b2a713d, (q31_t)0x58724611, (q31_t)0x3b25a591,
- (q31_t)0x587de2a7, (q31_t)0x3b20d79e, (q31_t)0x58897e4a, (q31_t)0x3b1c0764,
- (q31_t)0x589518fc, (q31_t)0x3b1734e2, (q31_t)0x58a0b2bb, (q31_t)0x3b126019,
- (q31_t)0x58ac4b87, (q31_t)0x3b0d8909, (q31_t)0x58b7e35f, (q31_t)0x3b08afb2,
- (q31_t)0x58c37a44, (q31_t)0x3b03d414, (q31_t)0x58cf1034, (q31_t)0x3afef630,
- (q31_t)0x58daa52f, (q31_t)0x3afa1605, (q31_t)0x58e63935, (q31_t)0x3af53395,
- (q31_t)0x58f1cc45, (q31_t)0x3af04edf, (q31_t)0x58fd5e5f, (q31_t)0x3aeb67e3,
- (q31_t)0x5908ef82, (q31_t)0x3ae67ea1, (q31_t)0x59147fae, (q31_t)0x3ae1931a,
- (q31_t)0x59200ee3, (q31_t)0x3adca54e, (q31_t)0x592b9d1f, (q31_t)0x3ad7b53d,
- (q31_t)0x59372a64, (q31_t)0x3ad2c2e8, (q31_t)0x5942b6af, (q31_t)0x3acdce4d,
- (q31_t)0x594e4201, (q31_t)0x3ac8d76f, (q31_t)0x5959cc5a, (q31_t)0x3ac3de4c,
- (q31_t)0x596555b8, (q31_t)0x3abee2e5, (q31_t)0x5970de1b, (q31_t)0x3ab9e53a,
- (q31_t)0x597c6584, (q31_t)0x3ab4e54c, (q31_t)0x5987ebf0, (q31_t)0x3aafe31b,
- (q31_t)0x59937161, (q31_t)0x3aaadea6, (q31_t)0x599ef5d6, (q31_t)0x3aa5d7ee,
- (q31_t)0x59aa794d, (q31_t)0x3aa0cef3, (q31_t)0x59b5fbc8, (q31_t)0x3a9bc3b6,
- (q31_t)0x59c17d44, (q31_t)0x3a96b636, (q31_t)0x59ccfdc2, (q31_t)0x3a91a674,
- (q31_t)0x59d87d42, (q31_t)0x3a8c9470, (q31_t)0x59e3fbc3, (q31_t)0x3a87802a,
- (q31_t)0x59ef7944, (q31_t)0x3a8269a3, (q31_t)0x59faf5c5, (q31_t)0x3a7d50da,
- (q31_t)0x5a067145, (q31_t)0x3a7835cf, (q31_t)0x5a11ebc5, (q31_t)0x3a731884,
- (q31_t)0x5a1d6544, (q31_t)0x3a6df8f8, (q31_t)0x5a28ddc0, (q31_t)0x3a68d72b,
- (q31_t)0x5a34553b, (q31_t)0x3a63b31d, (q31_t)0x5a3fcbb3, (q31_t)0x3a5e8cd0,
- (q31_t)0x5a4b4128, (q31_t)0x3a596442, (q31_t)0x5a56b599, (q31_t)0x3a543974,
- (q31_t)0x5a622907, (q31_t)0x3a4f0c67, (q31_t)0x5a6d9b70, (q31_t)0x3a49dd1a,
- (q31_t)0x5a790cd4, (q31_t)0x3a44ab8e, (q31_t)0x5a847d33, (q31_t)0x3a3f77c3,
- (q31_t)0x5a8fec8c, (q31_t)0x3a3a41b9, (q31_t)0x5a9b5adf, (q31_t)0x3a350970,
- (q31_t)0x5aa6c82b, (q31_t)0x3a2fcee8, (q31_t)0x5ab23471, (q31_t)0x3a2a9223,
- (q31_t)0x5abd9faf, (q31_t)0x3a25531f, (q31_t)0x5ac909e5, (q31_t)0x3a2011de,
- (q31_t)0x5ad47312, (q31_t)0x3a1ace5f, (q31_t)0x5adfdb37, (q31_t)0x3a1588a2,
- (q31_t)0x5aeb4253, (q31_t)0x3a1040a8, (q31_t)0x5af6a865, (q31_t)0x3a0af671,
- (q31_t)0x5b020d6c, (q31_t)0x3a05a9fd, (q31_t)0x5b0d716a, (q31_t)0x3a005b4d,
- (q31_t)0x5b18d45c, (q31_t)0x39fb0a60, (q31_t)0x5b243643, (q31_t)0x39f5b737,
- (q31_t)0x5b2f971e, (q31_t)0x39f061d2, (q31_t)0x5b3af6ec, (q31_t)0x39eb0a31,
- (q31_t)0x5b4655ae, (q31_t)0x39e5b054, (q31_t)0x5b51b363, (q31_t)0x39e0543c,
- (q31_t)0x5b5d100a, (q31_t)0x39daf5e8, (q31_t)0x5b686ba3, (q31_t)0x39d5955a,
- (q31_t)0x5b73c62d, (q31_t)0x39d03291, (q31_t)0x5b7f1fa9, (q31_t)0x39cacd8d,
- (q31_t)0x5b8a7815, (q31_t)0x39c5664f, (q31_t)0x5b95cf71, (q31_t)0x39bffcd7,
- (q31_t)0x5ba125bd, (q31_t)0x39ba9125, (q31_t)0x5bac7af9, (q31_t)0x39b52339,
- (q31_t)0x5bb7cf23, (q31_t)0x39afb313, (q31_t)0x5bc3223c, (q31_t)0x39aa40b4,
- (q31_t)0x5bce7442, (q31_t)0x39a4cc1c, (q31_t)0x5bd9c537, (q31_t)0x399f554b,
- (q31_t)0x5be51518, (q31_t)0x3999dc42, (q31_t)0x5bf063e6, (q31_t)0x399460ff,
- (q31_t)0x5bfbb1a0, (q31_t)0x398ee385, (q31_t)0x5c06fe46, (q31_t)0x398963d2,
- (q31_t)0x5c1249d8, (q31_t)0x3983e1e8, (q31_t)0x5c1d9454, (q31_t)0x397e5dc6,
- (q31_t)0x5c28ddbb, (q31_t)0x3978d76c, (q31_t)0x5c34260c, (q31_t)0x39734edc,
- (q31_t)0x5c3f6d47, (q31_t)0x396dc414, (q31_t)0x5c4ab36b, (q31_t)0x39683715,
- (q31_t)0x5c55f878, (q31_t)0x3962a7e0, (q31_t)0x5c613c6d, (q31_t)0x395d1675,
- (q31_t)0x5c6c7f4a, (q31_t)0x395782d3, (q31_t)0x5c77c10e, (q31_t)0x3951ecfc,
- (q31_t)0x5c8301b9, (q31_t)0x394c54ee, (q31_t)0x5c8e414b, (q31_t)0x3946baac,
- (q31_t)0x5c997fc4, (q31_t)0x39411e33, (q31_t)0x5ca4bd21, (q31_t)0x393b7f86,
- (q31_t)0x5caff965, (q31_t)0x3935dea4, (q31_t)0x5cbb348d, (q31_t)0x39303b8e,
- (q31_t)0x5cc66e99, (q31_t)0x392a9642, (q31_t)0x5cd1a78a, (q31_t)0x3924eec3,
- (q31_t)0x5cdcdf5e, (q31_t)0x391f4510, (q31_t)0x5ce81615, (q31_t)0x39199929,
- (q31_t)0x5cf34baf, (q31_t)0x3913eb0e, (q31_t)0x5cfe802b, (q31_t)0x390e3ac0,
- (q31_t)0x5d09b389, (q31_t)0x3908883f, (q31_t)0x5d14e5c9, (q31_t)0x3902d38b,
- (q31_t)0x5d2016e9, (q31_t)0x38fd1ca4, (q31_t)0x5d2b46ea, (q31_t)0x38f7638b,
- (q31_t)0x5d3675cb, (q31_t)0x38f1a840, (q31_t)0x5d41a38c, (q31_t)0x38ebeac2,
- (q31_t)0x5d4cd02c, (q31_t)0x38e62b13, (q31_t)0x5d57fbaa, (q31_t)0x38e06932,
- (q31_t)0x5d632608, (q31_t)0x38daa520, (q31_t)0x5d6e4f43, (q31_t)0x38d4dedd,
- (q31_t)0x5d79775c, (q31_t)0x38cf1669, (q31_t)0x5d849e51, (q31_t)0x38c94bc4,
- (q31_t)0x5d8fc424, (q31_t)0x38c37eef, (q31_t)0x5d9ae8d2, (q31_t)0x38bdafea,
- (q31_t)0x5da60c5d, (q31_t)0x38b7deb4, (q31_t)0x5db12ec3, (q31_t)0x38b20b4f,
- (q31_t)0x5dbc5004, (q31_t)0x38ac35ba, (q31_t)0x5dc7701f, (q31_t)0x38a65df6,
- (q31_t)0x5dd28f15, (q31_t)0x38a08402, (q31_t)0x5dddace4, (q31_t)0x389aa7e0,
- (q31_t)0x5de8c98c, (q31_t)0x3894c98f, (q31_t)0x5df3e50d, (q31_t)0x388ee910,
- (q31_t)0x5dfeff67, (q31_t)0x38890663, (q31_t)0x5e0a1898, (q31_t)0x38832187,
- (q31_t)0x5e1530a1, (q31_t)0x387d3a7e, (q31_t)0x5e204781, (q31_t)0x38775147,
- (q31_t)0x5e2b5d38, (q31_t)0x387165e3, (q31_t)0x5e3671c5, (q31_t)0x386b7852,
- (q31_t)0x5e418528, (q31_t)0x38658894, (q31_t)0x5e4c9760, (q31_t)0x385f96a9,
- (q31_t)0x5e57a86d, (q31_t)0x3859a292, (q31_t)0x5e62b84f, (q31_t)0x3853ac4f,
- (q31_t)0x5e6dc705, (q31_t)0x384db3e0, (q31_t)0x5e78d48e, (q31_t)0x3847b946,
- (q31_t)0x5e83e0eb, (q31_t)0x3841bc7f, (q31_t)0x5e8eec1b, (q31_t)0x383bbd8e,
- (q31_t)0x5e99f61d, (q31_t)0x3835bc71, (q31_t)0x5ea4fef0, (q31_t)0x382fb92a,
- (q31_t)0x5eb00696, (q31_t)0x3829b3b9, (q31_t)0x5ebb0d0d, (q31_t)0x3823ac1d,
- (q31_t)0x5ec61254, (q31_t)0x381da256, (q31_t)0x5ed1166b, (q31_t)0x38179666,
- (q31_t)0x5edc1953, (q31_t)0x3811884d, (q31_t)0x5ee71b0a, (q31_t)0x380b780a,
- (q31_t)0x5ef21b90, (q31_t)0x3805659e, (q31_t)0x5efd1ae4, (q31_t)0x37ff5109,
- (q31_t)0x5f081907, (q31_t)0x37f93a4b, (q31_t)0x5f1315f7, (q31_t)0x37f32165,
- (q31_t)0x5f1e11b5, (q31_t)0x37ed0657, (q31_t)0x5f290c3f, (q31_t)0x37e6e921,
- (q31_t)0x5f340596, (q31_t)0x37e0c9c3, (q31_t)0x5f3efdb9, (q31_t)0x37daa83d,
- (q31_t)0x5f49f4a8, (q31_t)0x37d48490, (q31_t)0x5f54ea62, (q31_t)0x37ce5ebd,
- (q31_t)0x5f5fdee6, (q31_t)0x37c836c2, (q31_t)0x5f6ad235, (q31_t)0x37c20ca1,
- (q31_t)0x5f75c44e, (q31_t)0x37bbe05a, (q31_t)0x5f80b531, (q31_t)0x37b5b1ec,
- (q31_t)0x5f8ba4dc, (q31_t)0x37af8159, (q31_t)0x5f969350, (q31_t)0x37a94ea0,
- (q31_t)0x5fa1808c, (q31_t)0x37a319c2, (q31_t)0x5fac6c91, (q31_t)0x379ce2be,
- (q31_t)0x5fb7575c, (q31_t)0x3796a996, (q31_t)0x5fc240ef, (q31_t)0x37906e49,
- (q31_t)0x5fcd2948, (q31_t)0x378a30d8, (q31_t)0x5fd81067, (q31_t)0x3783f143,
- (q31_t)0x5fe2f64c, (q31_t)0x377daf89, (q31_t)0x5feddaf6, (q31_t)0x37776bac,
- (q31_t)0x5ff8be65, (q31_t)0x377125ac, (q31_t)0x6003a099, (q31_t)0x376add88,
- (q31_t)0x600e8190, (q31_t)0x37649341, (q31_t)0x6019614c, (q31_t)0x375e46d8,
- (q31_t)0x60243fca, (q31_t)0x3757f84c, (q31_t)0x602f1d0b, (q31_t)0x3751a79e,
- (q31_t)0x6039f90f, (q31_t)0x374b54ce, (q31_t)0x6044d3d4, (q31_t)0x3744ffdd,
- (q31_t)0x604fad5b, (q31_t)0x373ea8ca, (q31_t)0x605a85a3, (q31_t)0x37384f95,
- (q31_t)0x60655cac, (q31_t)0x3731f440, (q31_t)0x60703275, (q31_t)0x372b96ca,
- (q31_t)0x607b06fe, (q31_t)0x37253733, (q31_t)0x6085da46, (q31_t)0x371ed57c,
- (q31_t)0x6090ac4d, (q31_t)0x371871a5, (q31_t)0x609b7d13, (q31_t)0x37120bae,
- (q31_t)0x60a64c97, (q31_t)0x370ba398, (q31_t)0x60b11ad9, (q31_t)0x37053962,
- (q31_t)0x60bbe7d8, (q31_t)0x36fecd0e, (q31_t)0x60c6b395, (q31_t)0x36f85e9a,
- (q31_t)0x60d17e0d, (q31_t)0x36f1ee09, (q31_t)0x60dc4742, (q31_t)0x36eb7b58,
- (q31_t)0x60e70f32, (q31_t)0x36e5068a, (q31_t)0x60f1d5de, (q31_t)0x36de8f9e,
- (q31_t)0x60fc9b44, (q31_t)0x36d81695, (q31_t)0x61075f65, (q31_t)0x36d19b6e,
- (q31_t)0x61122240, (q31_t)0x36cb1e2a, (q31_t)0x611ce3d5, (q31_t)0x36c49ec9,
- (q31_t)0x6127a423, (q31_t)0x36be1d4c, (q31_t)0x61326329, (q31_t)0x36b799b3,
- (q31_t)0x613d20e8, (q31_t)0x36b113fd, (q31_t)0x6147dd5f, (q31_t)0x36aa8c2c,
- (q31_t)0x6152988d, (q31_t)0x36a4023f, (q31_t)0x615d5273, (q31_t)0x369d7637,
- (q31_t)0x61680b0f, (q31_t)0x3696e814, (q31_t)0x6172c262, (q31_t)0x369057d6,
- (q31_t)0x617d786a, (q31_t)0x3689c57d, (q31_t)0x61882d28, (q31_t)0x3683310b,
- (q31_t)0x6192e09b, (q31_t)0x367c9a7e, (q31_t)0x619d92c2, (q31_t)0x367601d7,
- (q31_t)0x61a8439e, (q31_t)0x366f6717, (q31_t)0x61b2f32e, (q31_t)0x3668ca3e,
- (q31_t)0x61bda171, (q31_t)0x36622b4c, (q31_t)0x61c84e67, (q31_t)0x365b8a41,
- (q31_t)0x61d2fa0f, (q31_t)0x3654e71d, (q31_t)0x61dda46a, (q31_t)0x364e41e2,
- (q31_t)0x61e84d76, (q31_t)0x36479a8e, (q31_t)0x61f2f534, (q31_t)0x3640f123,
- (q31_t)0x61fd9ba3, (q31_t)0x363a45a0, (q31_t)0x620840c2, (q31_t)0x36339806,
- (q31_t)0x6212e492, (q31_t)0x362ce855, (q31_t)0x621d8711, (q31_t)0x3626368d,
- (q31_t)0x6228283f, (q31_t)0x361f82af, (q31_t)0x6232c81c, (q31_t)0x3618ccba,
- (q31_t)0x623d66a8, (q31_t)0x361214b0, (q31_t)0x624803e2, (q31_t)0x360b5a90,
- (q31_t)0x62529fca, (q31_t)0x36049e5b, (q31_t)0x625d3a5e, (q31_t)0x35fde011,
- (q31_t)0x6267d3a0, (q31_t)0x35f71fb1, (q31_t)0x62726b8e, (q31_t)0x35f05d3d,
- (q31_t)0x627d0228, (q31_t)0x35e998b5, (q31_t)0x6287976e, (q31_t)0x35e2d219,
- (q31_t)0x62922b5e, (q31_t)0x35dc0968, (q31_t)0x629cbdfa, (q31_t)0x35d53ea5,
- (q31_t)0x62a74f40, (q31_t)0x35ce71ce, (q31_t)0x62b1df30, (q31_t)0x35c7a2e3,
- (q31_t)0x62bc6dca, (q31_t)0x35c0d1e7, (q31_t)0x62c6fb0c, (q31_t)0x35b9fed7,
- (q31_t)0x62d186f8, (q31_t)0x35b329b5, (q31_t)0x62dc118c, (q31_t)0x35ac5282,
- (q31_t)0x62e69ac8, (q31_t)0x35a5793c, (q31_t)0x62f122ab, (q31_t)0x359e9de5,
- (q31_t)0x62fba936, (q31_t)0x3597c07d, (q31_t)0x63062e67, (q31_t)0x3590e104,
- (q31_t)0x6310b23e, (q31_t)0x3589ff7a, (q31_t)0x631b34bc, (q31_t)0x35831be0,
- (q31_t)0x6325b5df, (q31_t)0x357c3636, (q31_t)0x633035a7, (q31_t)0x35754e7c,
- (q31_t)0x633ab414, (q31_t)0x356e64b2, (q31_t)0x63453125, (q31_t)0x356778d9,
- (q31_t)0x634facda, (q31_t)0x35608af1, (q31_t)0x635a2733, (q31_t)0x35599afa,
- (q31_t)0x6364a02e, (q31_t)0x3552a8f4, (q31_t)0x636f17cc, (q31_t)0x354bb4e1,
- (q31_t)0x63798e0d, (q31_t)0x3544bebf, (q31_t)0x638402ef, (q31_t)0x353dc68f,
- (q31_t)0x638e7673, (q31_t)0x3536cc52, (q31_t)0x6398e898, (q31_t)0x352fd008,
- (q31_t)0x63a3595e, (q31_t)0x3528d1b1, (q31_t)0x63adc8c4, (q31_t)0x3521d14d,
- (q31_t)0x63b836ca, (q31_t)0x351acedd, (q31_t)0x63c2a36f, (q31_t)0x3513ca60,
- (q31_t)0x63cd0eb3, (q31_t)0x350cc3d8, (q31_t)0x63d77896, (q31_t)0x3505bb44,
- (q31_t)0x63e1e117, (q31_t)0x34feb0a5, (q31_t)0x63ec4837, (q31_t)0x34f7a3fb,
- (q31_t)0x63f6adf3, (q31_t)0x34f09546, (q31_t)0x6401124d, (q31_t)0x34e98487,
- (q31_t)0x640b7543, (q31_t)0x34e271bd, (q31_t)0x6415d6d5, (q31_t)0x34db5cea,
- (q31_t)0x64203704, (q31_t)0x34d4460c, (q31_t)0x642a95ce, (q31_t)0x34cd2d26,
- (q31_t)0x6434f332, (q31_t)0x34c61236, (q31_t)0x643f4f32, (q31_t)0x34bef53d,
- (q31_t)0x6449a9cc, (q31_t)0x34b7d63c, (q31_t)0x645402ff, (q31_t)0x34b0b533,
- (q31_t)0x645e5acc, (q31_t)0x34a99221, (q31_t)0x6468b132, (q31_t)0x34a26d08,
- (q31_t)0x64730631, (q31_t)0x349b45e7, (q31_t)0x647d59c8, (q31_t)0x34941cbf,
- (q31_t)0x6487abf7, (q31_t)0x348cf190, (q31_t)0x6491fcbe, (q31_t)0x3485c45b,
- (q31_t)0x649c4c1b, (q31_t)0x347e951f, (q31_t)0x64a69a0f, (q31_t)0x347763dd,
- (q31_t)0x64b0e699, (q31_t)0x34703095, (q31_t)0x64bb31ba, (q31_t)0x3468fb47,
- (q31_t)0x64c57b6f, (q31_t)0x3461c3f5, (q31_t)0x64cfc3ba, (q31_t)0x345a8a9d,
- (q31_t)0x64da0a9a, (q31_t)0x34534f41, (q31_t)0x64e4500e, (q31_t)0x344c11e0,
- (q31_t)0x64ee9415, (q31_t)0x3444d27b, (q31_t)0x64f8d6b0, (q31_t)0x343d9112,
- (q31_t)0x650317df, (q31_t)0x34364da6, (q31_t)0x650d57a0, (q31_t)0x342f0836,
- (q31_t)0x651795f3, (q31_t)0x3427c0c3, (q31_t)0x6521d2d8, (q31_t)0x3420774d,
- (q31_t)0x652c0e4f, (q31_t)0x34192bd5, (q31_t)0x65364857, (q31_t)0x3411de5b,
- (q31_t)0x654080ef, (q31_t)0x340a8edf, (q31_t)0x654ab818, (q31_t)0x34033d61,
- (q31_t)0x6554edd1, (q31_t)0x33fbe9e2, (q31_t)0x655f2219, (q31_t)0x33f49462,
- (q31_t)0x656954f1, (q31_t)0x33ed3ce1, (q31_t)0x65738657, (q31_t)0x33e5e360,
- (q31_t)0x657db64c, (q31_t)0x33de87de, (q31_t)0x6587e4cf, (q31_t)0x33d72a5d,
- (q31_t)0x659211df, (q31_t)0x33cfcadc, (q31_t)0x659c3d7c, (q31_t)0x33c8695b,
- (q31_t)0x65a667a7, (q31_t)0x33c105db, (q31_t)0x65b0905d, (q31_t)0x33b9a05d,
- (q31_t)0x65bab7a0, (q31_t)0x33b238e0, (q31_t)0x65c4dd6e, (q31_t)0x33aacf65,
- (q31_t)0x65cf01c8, (q31_t)0x33a363ec, (q31_t)0x65d924ac, (q31_t)0x339bf675,
- (q31_t)0x65e3461b, (q31_t)0x33948701, (q31_t)0x65ed6614, (q31_t)0x338d1590,
- (q31_t)0x65f78497, (q31_t)0x3385a222, (q31_t)0x6601a1a2, (q31_t)0x337e2cb7,
- (q31_t)0x660bbd37, (q31_t)0x3376b551, (q31_t)0x6615d754, (q31_t)0x336f3bee,
- (q31_t)0x661feffa, (q31_t)0x3367c090, (q31_t)0x662a0727, (q31_t)0x33604336,
- (q31_t)0x66341cdb, (q31_t)0x3358c3e2, (q31_t)0x663e3117, (q31_t)0x33514292,
- (q31_t)0x664843d9, (q31_t)0x3349bf48, (q31_t)0x66525521, (q31_t)0x33423a04,
- (q31_t)0x665c64ef, (q31_t)0x333ab2c6, (q31_t)0x66667342, (q31_t)0x3333298f,
- (q31_t)0x6670801a, (q31_t)0x332b9e5e, (q31_t)0x667a8b77, (q31_t)0x33241134,
- (q31_t)0x66849558, (q31_t)0x331c8211, (q31_t)0x668e9dbd, (q31_t)0x3314f0f6,
- (q31_t)0x6698a4a6, (q31_t)0x330d5de3, (q31_t)0x66a2aa11, (q31_t)0x3305c8d7,
- (q31_t)0x66acadff, (q31_t)0x32fe31d5, (q31_t)0x66b6b070, (q31_t)0x32f698db,
- (q31_t)0x66c0b162, (q31_t)0x32eefdea, (q31_t)0x66cab0d6, (q31_t)0x32e76102,
- (q31_t)0x66d4aecb, (q31_t)0x32dfc224, (q31_t)0x66deab41, (q31_t)0x32d82150,
- (q31_t)0x66e8a637, (q31_t)0x32d07e85, (q31_t)0x66f29fad, (q31_t)0x32c8d9c6,
- (q31_t)0x66fc97a3, (q31_t)0x32c13311, (q31_t)0x67068e18, (q31_t)0x32b98a67,
- (q31_t)0x6710830c, (q31_t)0x32b1dfc9, (q31_t)0x671a767e, (q31_t)0x32aa3336,
- (q31_t)0x6724686e, (q31_t)0x32a284b0, (q31_t)0x672e58dc, (q31_t)0x329ad435,
- (q31_t)0x673847c8, (q31_t)0x329321c7, (q31_t)0x67423530, (q31_t)0x328b6d66,
- (q31_t)0x674c2115, (q31_t)0x3283b712, (q31_t)0x67560b76, (q31_t)0x327bfecc,
- (q31_t)0x675ff452, (q31_t)0x32744493, (q31_t)0x6769dbaa, (q31_t)0x326c8868,
- (q31_t)0x6773c17d, (q31_t)0x3264ca4c, (q31_t)0x677da5cb, (q31_t)0x325d0a3e,
- (q31_t)0x67878893, (q31_t)0x32554840, (q31_t)0x679169d5, (q31_t)0x324d8450,
- (q31_t)0x679b4990, (q31_t)0x3245be70, (q31_t)0x67a527c4, (q31_t)0x323df6a0,
- (q31_t)0x67af0472, (q31_t)0x32362ce0, (q31_t)0x67b8df97, (q31_t)0x322e6130,
- (q31_t)0x67c2b934, (q31_t)0x32269391, (q31_t)0x67cc9149, (q31_t)0x321ec403,
- (q31_t)0x67d667d5, (q31_t)0x3216f287, (q31_t)0x67e03cd8, (q31_t)0x320f1f1c,
- (q31_t)0x67ea1052, (q31_t)0x320749c3, (q31_t)0x67f3e241, (q31_t)0x31ff727c,
- (q31_t)0x67fdb2a7, (q31_t)0x31f79948, (q31_t)0x68078181, (q31_t)0x31efbe27,
- (q31_t)0x68114ed0, (q31_t)0x31e7e118, (q31_t)0x681b1a94, (q31_t)0x31e0021e,
- (q31_t)0x6824e4cc, (q31_t)0x31d82137, (q31_t)0x682ead78, (q31_t)0x31d03e64,
- (q31_t)0x68387498, (q31_t)0x31c859a5, (q31_t)0x68423a2a, (q31_t)0x31c072fb,
- (q31_t)0x684bfe2f, (q31_t)0x31b88a66, (q31_t)0x6855c0a6, (q31_t)0x31b09fe7,
- (q31_t)0x685f8190, (q31_t)0x31a8b37c, (q31_t)0x686940ea, (q31_t)0x31a0c528,
- (q31_t)0x6872feb6, (q31_t)0x3198d4ea, (q31_t)0x687cbaf3, (q31_t)0x3190e2c3,
- (q31_t)0x688675a0, (q31_t)0x3188eeb2, (q31_t)0x68902ebd, (q31_t)0x3180f8b8,
- (q31_t)0x6899e64a, (q31_t)0x317900d6, (q31_t)0x68a39c46, (q31_t)0x3171070c,
- (q31_t)0x68ad50b1, (q31_t)0x31690b59, (q31_t)0x68b7038b, (q31_t)0x31610dbf,
- (q31_t)0x68c0b4d2, (q31_t)0x31590e3e, (q31_t)0x68ca6488, (q31_t)0x31510cd5,
- (q31_t)0x68d412ab, (q31_t)0x31490986, (q31_t)0x68ddbf3b, (q31_t)0x31410450,
- (q31_t)0x68e76a37, (q31_t)0x3138fd35, (q31_t)0x68f113a0, (q31_t)0x3130f433,
- (q31_t)0x68fabb75, (q31_t)0x3128e94c, (q31_t)0x690461b5, (q31_t)0x3120dc80,
- (q31_t)0x690e0661, (q31_t)0x3118cdcf, (q31_t)0x6917a977, (q31_t)0x3110bd39,
- (q31_t)0x69214af8, (q31_t)0x3108aabf, (q31_t)0x692aeae3, (q31_t)0x31009661,
- (q31_t)0x69348937, (q31_t)0x30f8801f, (q31_t)0x693e25f5, (q31_t)0x30f067fb,
- (q31_t)0x6947c11c, (q31_t)0x30e84df3, (q31_t)0x69515aab, (q31_t)0x30e03208,
- (q31_t)0x695af2a3, (q31_t)0x30d8143b, (q31_t)0x69648902, (q31_t)0x30cff48c,
- (q31_t)0x696e1dc9, (q31_t)0x30c7d2fb, (q31_t)0x6977b0f7, (q31_t)0x30bfaf89,
- (q31_t)0x6981428c, (q31_t)0x30b78a36, (q31_t)0x698ad287, (q31_t)0x30af6302,
- (q31_t)0x699460e8, (q31_t)0x30a739ed, (q31_t)0x699dedaf, (q31_t)0x309f0ef8,
- (q31_t)0x69a778db, (q31_t)0x3096e223, (q31_t)0x69b1026c, (q31_t)0x308eb36f,
- (q31_t)0x69ba8a61, (q31_t)0x308682dc, (q31_t)0x69c410ba, (q31_t)0x307e5069,
- (q31_t)0x69cd9578, (q31_t)0x30761c18, (q31_t)0x69d71899, (q31_t)0x306de5e9,
- (q31_t)0x69e09a1c, (q31_t)0x3065addb, (q31_t)0x69ea1a03, (q31_t)0x305d73f0,
- (q31_t)0x69f3984c, (q31_t)0x30553828, (q31_t)0x69fd14f6, (q31_t)0x304cfa83,
- (q31_t)0x6a069003, (q31_t)0x3044bb00, (q31_t)0x6a100970, (q31_t)0x303c79a2,
- (q31_t)0x6a19813f, (q31_t)0x30343667, (q31_t)0x6a22f76e, (q31_t)0x302bf151,
- (q31_t)0x6a2c6bfd, (q31_t)0x3023aa5f, (q31_t)0x6a35deeb, (q31_t)0x301b6193,
- (q31_t)0x6a3f503a, (q31_t)0x301316eb, (q31_t)0x6a48bfe7, (q31_t)0x300aca69,
- (q31_t)0x6a522df3, (q31_t)0x30027c0c, (q31_t)0x6a5b9a5d, (q31_t)0x2ffa2bd6,
- (q31_t)0x6a650525, (q31_t)0x2ff1d9c7, (q31_t)0x6a6e6e4b, (q31_t)0x2fe985de,
- (q31_t)0x6a77d5ce, (q31_t)0x2fe1301c, (q31_t)0x6a813bae, (q31_t)0x2fd8d882,
- (q31_t)0x6a8a9fea, (q31_t)0x2fd07f0f, (q31_t)0x6a940283, (q31_t)0x2fc823c5,
- (q31_t)0x6a9d6377, (q31_t)0x2fbfc6a3, (q31_t)0x6aa6c2c6, (q31_t)0x2fb767aa,
- (q31_t)0x6ab02071, (q31_t)0x2faf06da, (q31_t)0x6ab97c77, (q31_t)0x2fa6a433,
- (q31_t)0x6ac2d6d6, (q31_t)0x2f9e3fb6, (q31_t)0x6acc2f90, (q31_t)0x2f95d963,
- (q31_t)0x6ad586a3, (q31_t)0x2f8d713a, (q31_t)0x6adedc10, (q31_t)0x2f85073c,
- (q31_t)0x6ae82fd5, (q31_t)0x2f7c9b69, (q31_t)0x6af181f3, (q31_t)0x2f742dc1,
- (q31_t)0x6afad269, (q31_t)0x2f6bbe45, (q31_t)0x6b042137, (q31_t)0x2f634cf5,
- (q31_t)0x6b0d6e5c, (q31_t)0x2f5ad9d1, (q31_t)0x6b16b9d9, (q31_t)0x2f5264da,
- (q31_t)0x6b2003ac, (q31_t)0x2f49ee0f, (q31_t)0x6b294bd5, (q31_t)0x2f417573,
- (q31_t)0x6b329255, (q31_t)0x2f38fb03, (q31_t)0x6b3bd72a, (q31_t)0x2f307ec2,
- (q31_t)0x6b451a55, (q31_t)0x2f2800af, (q31_t)0x6b4e5bd4, (q31_t)0x2f1f80ca,
- (q31_t)0x6b579ba8, (q31_t)0x2f16ff14, (q31_t)0x6b60d9d0, (q31_t)0x2f0e7b8e,
- (q31_t)0x6b6a164d, (q31_t)0x2f05f637, (q31_t)0x6b73511c, (q31_t)0x2efd6f10,
- (q31_t)0x6b7c8a3f, (q31_t)0x2ef4e619, (q31_t)0x6b85c1b5, (q31_t)0x2eec5b53,
- (q31_t)0x6b8ef77d, (q31_t)0x2ee3cebe, (q31_t)0x6b982b97, (q31_t)0x2edb405a,
- (q31_t)0x6ba15e03, (q31_t)0x2ed2b027, (q31_t)0x6baa8ec0, (q31_t)0x2eca1e27,
- (q31_t)0x6bb3bdce, (q31_t)0x2ec18a58, (q31_t)0x6bbceb2d, (q31_t)0x2eb8f4bc,
- (q31_t)0x6bc616dd, (q31_t)0x2eb05d53, (q31_t)0x6bcf40dc, (q31_t)0x2ea7c41e,
- (q31_t)0x6bd8692b, (q31_t)0x2e9f291b, (q31_t)0x6be18fc9, (q31_t)0x2e968c4d,
- (q31_t)0x6beab4b6, (q31_t)0x2e8dedb3, (q31_t)0x6bf3d7f2, (q31_t)0x2e854d4d,
- (q31_t)0x6bfcf97c, (q31_t)0x2e7cab1c, (q31_t)0x6c061953, (q31_t)0x2e740720,
- (q31_t)0x6c0f3779, (q31_t)0x2e6b615a, (q31_t)0x6c1853eb, (q31_t)0x2e62b9ca,
- (q31_t)0x6c216eaa, (q31_t)0x2e5a1070, (q31_t)0x6c2a87b6, (q31_t)0x2e51654c,
- (q31_t)0x6c339f0e, (q31_t)0x2e48b860, (q31_t)0x6c3cb4b1, (q31_t)0x2e4009aa,
- (q31_t)0x6c45c8a0, (q31_t)0x2e37592c, (q31_t)0x6c4edada, (q31_t)0x2e2ea6e6,
- (q31_t)0x6c57eb5e, (q31_t)0x2e25f2d8, (q31_t)0x6c60fa2d, (q31_t)0x2e1d3d03,
- (q31_t)0x6c6a0746, (q31_t)0x2e148566, (q31_t)0x6c7312a9, (q31_t)0x2e0bcc03,
- (q31_t)0x6c7c1c55, (q31_t)0x2e0310d9, (q31_t)0x6c85244a, (q31_t)0x2dfa53e9,
- (q31_t)0x6c8e2a87, (q31_t)0x2df19534, (q31_t)0x6c972f0d, (q31_t)0x2de8d4b8,
- (q31_t)0x6ca031da, (q31_t)0x2de01278, (q31_t)0x6ca932ef, (q31_t)0x2dd74e73,
- (q31_t)0x6cb2324c, (q31_t)0x2dce88aa, (q31_t)0x6cbb2fef, (q31_t)0x2dc5c11c,
- (q31_t)0x6cc42bd9, (q31_t)0x2dbcf7cb, (q31_t)0x6ccd2609, (q31_t)0x2db42cb6,
- (q31_t)0x6cd61e7f, (q31_t)0x2dab5fdf, (q31_t)0x6cdf153a, (q31_t)0x2da29144,
- (q31_t)0x6ce80a3a, (q31_t)0x2d99c0e7, (q31_t)0x6cf0fd80, (q31_t)0x2d90eec8,
- (q31_t)0x6cf9ef09, (q31_t)0x2d881ae8, (q31_t)0x6d02ded7, (q31_t)0x2d7f4545,
- (q31_t)0x6d0bcce8, (q31_t)0x2d766de2, (q31_t)0x6d14b93d, (q31_t)0x2d6d94bf,
- (q31_t)0x6d1da3d5, (q31_t)0x2d64b9da, (q31_t)0x6d268cb0, (q31_t)0x2d5bdd36,
- (q31_t)0x6d2f73cd, (q31_t)0x2d52fed2, (q31_t)0x6d38592c, (q31_t)0x2d4a1eaf,
- (q31_t)0x6d413ccd, (q31_t)0x2d413ccd, (q31_t)0x6d4a1eaf, (q31_t)0x2d38592c,
- (q31_t)0x6d52fed2, (q31_t)0x2d2f73cd, (q31_t)0x6d5bdd36, (q31_t)0x2d268cb0,
- (q31_t)0x6d64b9da, (q31_t)0x2d1da3d5, (q31_t)0x6d6d94bf, (q31_t)0x2d14b93d,
- (q31_t)0x6d766de2, (q31_t)0x2d0bcce8, (q31_t)0x6d7f4545, (q31_t)0x2d02ded7,
- (q31_t)0x6d881ae8, (q31_t)0x2cf9ef09, (q31_t)0x6d90eec8, (q31_t)0x2cf0fd80,
- (q31_t)0x6d99c0e7, (q31_t)0x2ce80a3a, (q31_t)0x6da29144, (q31_t)0x2cdf153a,
- (q31_t)0x6dab5fdf, (q31_t)0x2cd61e7f, (q31_t)0x6db42cb6, (q31_t)0x2ccd2609,
- (q31_t)0x6dbcf7cb, (q31_t)0x2cc42bd9, (q31_t)0x6dc5c11c, (q31_t)0x2cbb2fef,
- (q31_t)0x6dce88aa, (q31_t)0x2cb2324c, (q31_t)0x6dd74e73, (q31_t)0x2ca932ef,
- (q31_t)0x6de01278, (q31_t)0x2ca031da, (q31_t)0x6de8d4b8, (q31_t)0x2c972f0d,
- (q31_t)0x6df19534, (q31_t)0x2c8e2a87, (q31_t)0x6dfa53e9, (q31_t)0x2c85244a,
- (q31_t)0x6e0310d9, (q31_t)0x2c7c1c55, (q31_t)0x6e0bcc03, (q31_t)0x2c7312a9,
- (q31_t)0x6e148566, (q31_t)0x2c6a0746, (q31_t)0x6e1d3d03, (q31_t)0x2c60fa2d,
- (q31_t)0x6e25f2d8, (q31_t)0x2c57eb5e, (q31_t)0x6e2ea6e6, (q31_t)0x2c4edada,
- (q31_t)0x6e37592c, (q31_t)0x2c45c8a0, (q31_t)0x6e4009aa, (q31_t)0x2c3cb4b1,
- (q31_t)0x6e48b860, (q31_t)0x2c339f0e, (q31_t)0x6e51654c, (q31_t)0x2c2a87b6,
- (q31_t)0x6e5a1070, (q31_t)0x2c216eaa, (q31_t)0x6e62b9ca, (q31_t)0x2c1853eb,
- (q31_t)0x6e6b615a, (q31_t)0x2c0f3779, (q31_t)0x6e740720, (q31_t)0x2c061953,
- (q31_t)0x6e7cab1c, (q31_t)0x2bfcf97c, (q31_t)0x6e854d4d, (q31_t)0x2bf3d7f2,
- (q31_t)0x6e8dedb3, (q31_t)0x2beab4b6, (q31_t)0x6e968c4d, (q31_t)0x2be18fc9,
- (q31_t)0x6e9f291b, (q31_t)0x2bd8692b, (q31_t)0x6ea7c41e, (q31_t)0x2bcf40dc,
- (q31_t)0x6eb05d53, (q31_t)0x2bc616dd, (q31_t)0x6eb8f4bc, (q31_t)0x2bbceb2d,
- (q31_t)0x6ec18a58, (q31_t)0x2bb3bdce, (q31_t)0x6eca1e27, (q31_t)0x2baa8ec0,
- (q31_t)0x6ed2b027, (q31_t)0x2ba15e03, (q31_t)0x6edb405a, (q31_t)0x2b982b97,
- (q31_t)0x6ee3cebe, (q31_t)0x2b8ef77d, (q31_t)0x6eec5b53, (q31_t)0x2b85c1b5,
- (q31_t)0x6ef4e619, (q31_t)0x2b7c8a3f, (q31_t)0x6efd6f10, (q31_t)0x2b73511c,
- (q31_t)0x6f05f637, (q31_t)0x2b6a164d, (q31_t)0x6f0e7b8e, (q31_t)0x2b60d9d0,
- (q31_t)0x6f16ff14, (q31_t)0x2b579ba8, (q31_t)0x6f1f80ca, (q31_t)0x2b4e5bd4,
- (q31_t)0x6f2800af, (q31_t)0x2b451a55, (q31_t)0x6f307ec2, (q31_t)0x2b3bd72a,
- (q31_t)0x6f38fb03, (q31_t)0x2b329255, (q31_t)0x6f417573, (q31_t)0x2b294bd5,
- (q31_t)0x6f49ee0f, (q31_t)0x2b2003ac, (q31_t)0x6f5264da, (q31_t)0x2b16b9d9,
- (q31_t)0x6f5ad9d1, (q31_t)0x2b0d6e5c, (q31_t)0x6f634cf5, (q31_t)0x2b042137,
- (q31_t)0x6f6bbe45, (q31_t)0x2afad269, (q31_t)0x6f742dc1, (q31_t)0x2af181f3,
- (q31_t)0x6f7c9b69, (q31_t)0x2ae82fd5, (q31_t)0x6f85073c, (q31_t)0x2adedc10,
- (q31_t)0x6f8d713a, (q31_t)0x2ad586a3, (q31_t)0x6f95d963, (q31_t)0x2acc2f90,
- (q31_t)0x6f9e3fb6, (q31_t)0x2ac2d6d6, (q31_t)0x6fa6a433, (q31_t)0x2ab97c77,
- (q31_t)0x6faf06da, (q31_t)0x2ab02071, (q31_t)0x6fb767aa, (q31_t)0x2aa6c2c6,
- (q31_t)0x6fbfc6a3, (q31_t)0x2a9d6377, (q31_t)0x6fc823c5, (q31_t)0x2a940283,
- (q31_t)0x6fd07f0f, (q31_t)0x2a8a9fea, (q31_t)0x6fd8d882, (q31_t)0x2a813bae,
- (q31_t)0x6fe1301c, (q31_t)0x2a77d5ce, (q31_t)0x6fe985de, (q31_t)0x2a6e6e4b,
- (q31_t)0x6ff1d9c7, (q31_t)0x2a650525, (q31_t)0x6ffa2bd6, (q31_t)0x2a5b9a5d,
- (q31_t)0x70027c0c, (q31_t)0x2a522df3, (q31_t)0x700aca69, (q31_t)0x2a48bfe7,
- (q31_t)0x701316eb, (q31_t)0x2a3f503a, (q31_t)0x701b6193, (q31_t)0x2a35deeb,
- (q31_t)0x7023aa5f, (q31_t)0x2a2c6bfd, (q31_t)0x702bf151, (q31_t)0x2a22f76e,
- (q31_t)0x70343667, (q31_t)0x2a19813f, (q31_t)0x703c79a2, (q31_t)0x2a100970,
- (q31_t)0x7044bb00, (q31_t)0x2a069003, (q31_t)0x704cfa83, (q31_t)0x29fd14f6,
- (q31_t)0x70553828, (q31_t)0x29f3984c, (q31_t)0x705d73f0, (q31_t)0x29ea1a03,
- (q31_t)0x7065addb, (q31_t)0x29e09a1c, (q31_t)0x706de5e9, (q31_t)0x29d71899,
- (q31_t)0x70761c18, (q31_t)0x29cd9578, (q31_t)0x707e5069, (q31_t)0x29c410ba,
- (q31_t)0x708682dc, (q31_t)0x29ba8a61, (q31_t)0x708eb36f, (q31_t)0x29b1026c,
- (q31_t)0x7096e223, (q31_t)0x29a778db, (q31_t)0x709f0ef8, (q31_t)0x299dedaf,
- (q31_t)0x70a739ed, (q31_t)0x299460e8, (q31_t)0x70af6302, (q31_t)0x298ad287,
- (q31_t)0x70b78a36, (q31_t)0x2981428c, (q31_t)0x70bfaf89, (q31_t)0x2977b0f7,
- (q31_t)0x70c7d2fb, (q31_t)0x296e1dc9, (q31_t)0x70cff48c, (q31_t)0x29648902,
- (q31_t)0x70d8143b, (q31_t)0x295af2a3, (q31_t)0x70e03208, (q31_t)0x29515aab,
- (q31_t)0x70e84df3, (q31_t)0x2947c11c, (q31_t)0x70f067fb, (q31_t)0x293e25f5,
- (q31_t)0x70f8801f, (q31_t)0x29348937, (q31_t)0x71009661, (q31_t)0x292aeae3,
- (q31_t)0x7108aabf, (q31_t)0x29214af8, (q31_t)0x7110bd39, (q31_t)0x2917a977,
- (q31_t)0x7118cdcf, (q31_t)0x290e0661, (q31_t)0x7120dc80, (q31_t)0x290461b5,
- (q31_t)0x7128e94c, (q31_t)0x28fabb75, (q31_t)0x7130f433, (q31_t)0x28f113a0,
- (q31_t)0x7138fd35, (q31_t)0x28e76a37, (q31_t)0x71410450, (q31_t)0x28ddbf3b,
- (q31_t)0x71490986, (q31_t)0x28d412ab, (q31_t)0x71510cd5, (q31_t)0x28ca6488,
- (q31_t)0x71590e3e, (q31_t)0x28c0b4d2, (q31_t)0x71610dbf, (q31_t)0x28b7038b,
- (q31_t)0x71690b59, (q31_t)0x28ad50b1, (q31_t)0x7171070c, (q31_t)0x28a39c46,
- (q31_t)0x717900d6, (q31_t)0x2899e64a, (q31_t)0x7180f8b8, (q31_t)0x28902ebd,
- (q31_t)0x7188eeb2, (q31_t)0x288675a0, (q31_t)0x7190e2c3, (q31_t)0x287cbaf3,
- (q31_t)0x7198d4ea, (q31_t)0x2872feb6, (q31_t)0x71a0c528, (q31_t)0x286940ea,
- (q31_t)0x71a8b37c, (q31_t)0x285f8190, (q31_t)0x71b09fe7, (q31_t)0x2855c0a6,
- (q31_t)0x71b88a66, (q31_t)0x284bfe2f, (q31_t)0x71c072fb, (q31_t)0x28423a2a,
- (q31_t)0x71c859a5, (q31_t)0x28387498, (q31_t)0x71d03e64, (q31_t)0x282ead78,
- (q31_t)0x71d82137, (q31_t)0x2824e4cc, (q31_t)0x71e0021e, (q31_t)0x281b1a94,
- (q31_t)0x71e7e118, (q31_t)0x28114ed0, (q31_t)0x71efbe27, (q31_t)0x28078181,
- (q31_t)0x71f79948, (q31_t)0x27fdb2a7, (q31_t)0x71ff727c, (q31_t)0x27f3e241,
- (q31_t)0x720749c3, (q31_t)0x27ea1052, (q31_t)0x720f1f1c, (q31_t)0x27e03cd8,
- (q31_t)0x7216f287, (q31_t)0x27d667d5, (q31_t)0x721ec403, (q31_t)0x27cc9149,
- (q31_t)0x72269391, (q31_t)0x27c2b934, (q31_t)0x722e6130, (q31_t)0x27b8df97,
- (q31_t)0x72362ce0, (q31_t)0x27af0472, (q31_t)0x723df6a0, (q31_t)0x27a527c4,
- (q31_t)0x7245be70, (q31_t)0x279b4990, (q31_t)0x724d8450, (q31_t)0x279169d5,
- (q31_t)0x72554840, (q31_t)0x27878893, (q31_t)0x725d0a3e, (q31_t)0x277da5cb,
- (q31_t)0x7264ca4c, (q31_t)0x2773c17d, (q31_t)0x726c8868, (q31_t)0x2769dbaa,
- (q31_t)0x72744493, (q31_t)0x275ff452, (q31_t)0x727bfecc, (q31_t)0x27560b76,
- (q31_t)0x7283b712, (q31_t)0x274c2115, (q31_t)0x728b6d66, (q31_t)0x27423530,
- (q31_t)0x729321c7, (q31_t)0x273847c8, (q31_t)0x729ad435, (q31_t)0x272e58dc,
- (q31_t)0x72a284b0, (q31_t)0x2724686e, (q31_t)0x72aa3336, (q31_t)0x271a767e,
- (q31_t)0x72b1dfc9, (q31_t)0x2710830c, (q31_t)0x72b98a67, (q31_t)0x27068e18,
- (q31_t)0x72c13311, (q31_t)0x26fc97a3, (q31_t)0x72c8d9c6, (q31_t)0x26f29fad,
- (q31_t)0x72d07e85, (q31_t)0x26e8a637, (q31_t)0x72d82150, (q31_t)0x26deab41,
- (q31_t)0x72dfc224, (q31_t)0x26d4aecb, (q31_t)0x72e76102, (q31_t)0x26cab0d6,
- (q31_t)0x72eefdea, (q31_t)0x26c0b162, (q31_t)0x72f698db, (q31_t)0x26b6b070,
- (q31_t)0x72fe31d5, (q31_t)0x26acadff, (q31_t)0x7305c8d7, (q31_t)0x26a2aa11,
- (q31_t)0x730d5de3, (q31_t)0x2698a4a6, (q31_t)0x7314f0f6, (q31_t)0x268e9dbd,
- (q31_t)0x731c8211, (q31_t)0x26849558, (q31_t)0x73241134, (q31_t)0x267a8b77,
- (q31_t)0x732b9e5e, (q31_t)0x2670801a, (q31_t)0x7333298f, (q31_t)0x26667342,
- (q31_t)0x733ab2c6, (q31_t)0x265c64ef, (q31_t)0x73423a04, (q31_t)0x26525521,
- (q31_t)0x7349bf48, (q31_t)0x264843d9, (q31_t)0x73514292, (q31_t)0x263e3117,
- (q31_t)0x7358c3e2, (q31_t)0x26341cdb, (q31_t)0x73604336, (q31_t)0x262a0727,
- (q31_t)0x7367c090, (q31_t)0x261feffa, (q31_t)0x736f3bee, (q31_t)0x2615d754,
- (q31_t)0x7376b551, (q31_t)0x260bbd37, (q31_t)0x737e2cb7, (q31_t)0x2601a1a2,
- (q31_t)0x7385a222, (q31_t)0x25f78497, (q31_t)0x738d1590, (q31_t)0x25ed6614,
- (q31_t)0x73948701, (q31_t)0x25e3461b, (q31_t)0x739bf675, (q31_t)0x25d924ac,
- (q31_t)0x73a363ec, (q31_t)0x25cf01c8, (q31_t)0x73aacf65, (q31_t)0x25c4dd6e,
- (q31_t)0x73b238e0, (q31_t)0x25bab7a0, (q31_t)0x73b9a05d, (q31_t)0x25b0905d,
- (q31_t)0x73c105db, (q31_t)0x25a667a7, (q31_t)0x73c8695b, (q31_t)0x259c3d7c,
- (q31_t)0x73cfcadc, (q31_t)0x259211df, (q31_t)0x73d72a5d, (q31_t)0x2587e4cf,
- (q31_t)0x73de87de, (q31_t)0x257db64c, (q31_t)0x73e5e360, (q31_t)0x25738657,
- (q31_t)0x73ed3ce1, (q31_t)0x256954f1, (q31_t)0x73f49462, (q31_t)0x255f2219,
- (q31_t)0x73fbe9e2, (q31_t)0x2554edd1, (q31_t)0x74033d61, (q31_t)0x254ab818,
- (q31_t)0x740a8edf, (q31_t)0x254080ef, (q31_t)0x7411de5b, (q31_t)0x25364857,
- (q31_t)0x74192bd5, (q31_t)0x252c0e4f, (q31_t)0x7420774d, (q31_t)0x2521d2d8,
- (q31_t)0x7427c0c3, (q31_t)0x251795f3, (q31_t)0x742f0836, (q31_t)0x250d57a0,
- (q31_t)0x74364da6, (q31_t)0x250317df, (q31_t)0x743d9112, (q31_t)0x24f8d6b0,
- (q31_t)0x7444d27b, (q31_t)0x24ee9415, (q31_t)0x744c11e0, (q31_t)0x24e4500e,
- (q31_t)0x74534f41, (q31_t)0x24da0a9a, (q31_t)0x745a8a9d, (q31_t)0x24cfc3ba,
- (q31_t)0x7461c3f5, (q31_t)0x24c57b6f, (q31_t)0x7468fb47, (q31_t)0x24bb31ba,
- (q31_t)0x74703095, (q31_t)0x24b0e699, (q31_t)0x747763dd, (q31_t)0x24a69a0f,
- (q31_t)0x747e951f, (q31_t)0x249c4c1b, (q31_t)0x7485c45b, (q31_t)0x2491fcbe,
- (q31_t)0x748cf190, (q31_t)0x2487abf7, (q31_t)0x74941cbf, (q31_t)0x247d59c8,
- (q31_t)0x749b45e7, (q31_t)0x24730631, (q31_t)0x74a26d08, (q31_t)0x2468b132,
- (q31_t)0x74a99221, (q31_t)0x245e5acc, (q31_t)0x74b0b533, (q31_t)0x245402ff,
- (q31_t)0x74b7d63c, (q31_t)0x2449a9cc, (q31_t)0x74bef53d, (q31_t)0x243f4f32,
- (q31_t)0x74c61236, (q31_t)0x2434f332, (q31_t)0x74cd2d26, (q31_t)0x242a95ce,
- (q31_t)0x74d4460c, (q31_t)0x24203704, (q31_t)0x74db5cea, (q31_t)0x2415d6d5,
- (q31_t)0x74e271bd, (q31_t)0x240b7543, (q31_t)0x74e98487, (q31_t)0x2401124d,
- (q31_t)0x74f09546, (q31_t)0x23f6adf3, (q31_t)0x74f7a3fb, (q31_t)0x23ec4837,
- (q31_t)0x74feb0a5, (q31_t)0x23e1e117, (q31_t)0x7505bb44, (q31_t)0x23d77896,
- (q31_t)0x750cc3d8, (q31_t)0x23cd0eb3, (q31_t)0x7513ca60, (q31_t)0x23c2a36f,
- (q31_t)0x751acedd, (q31_t)0x23b836ca, (q31_t)0x7521d14d, (q31_t)0x23adc8c4,
- (q31_t)0x7528d1b1, (q31_t)0x23a3595e, (q31_t)0x752fd008, (q31_t)0x2398e898,
- (q31_t)0x7536cc52, (q31_t)0x238e7673, (q31_t)0x753dc68f, (q31_t)0x238402ef,
- (q31_t)0x7544bebf, (q31_t)0x23798e0d, (q31_t)0x754bb4e1, (q31_t)0x236f17cc,
- (q31_t)0x7552a8f4, (q31_t)0x2364a02e, (q31_t)0x75599afa, (q31_t)0x235a2733,
- (q31_t)0x75608af1, (q31_t)0x234facda, (q31_t)0x756778d9, (q31_t)0x23453125,
- (q31_t)0x756e64b2, (q31_t)0x233ab414, (q31_t)0x75754e7c, (q31_t)0x233035a7,
- (q31_t)0x757c3636, (q31_t)0x2325b5df, (q31_t)0x75831be0, (q31_t)0x231b34bc,
- (q31_t)0x7589ff7a, (q31_t)0x2310b23e, (q31_t)0x7590e104, (q31_t)0x23062e67,
- (q31_t)0x7597c07d, (q31_t)0x22fba936, (q31_t)0x759e9de5, (q31_t)0x22f122ab,
- (q31_t)0x75a5793c, (q31_t)0x22e69ac8, (q31_t)0x75ac5282, (q31_t)0x22dc118c,
- (q31_t)0x75b329b5, (q31_t)0x22d186f8, (q31_t)0x75b9fed7, (q31_t)0x22c6fb0c,
- (q31_t)0x75c0d1e7, (q31_t)0x22bc6dca, (q31_t)0x75c7a2e3, (q31_t)0x22b1df30,
- (q31_t)0x75ce71ce, (q31_t)0x22a74f40, (q31_t)0x75d53ea5, (q31_t)0x229cbdfa,
- (q31_t)0x75dc0968, (q31_t)0x22922b5e, (q31_t)0x75e2d219, (q31_t)0x2287976e,
- (q31_t)0x75e998b5, (q31_t)0x227d0228, (q31_t)0x75f05d3d, (q31_t)0x22726b8e,
- (q31_t)0x75f71fb1, (q31_t)0x2267d3a0, (q31_t)0x75fde011, (q31_t)0x225d3a5e,
- (q31_t)0x76049e5b, (q31_t)0x22529fca, (q31_t)0x760b5a90, (q31_t)0x224803e2,
- (q31_t)0x761214b0, (q31_t)0x223d66a8, (q31_t)0x7618ccba, (q31_t)0x2232c81c,
- (q31_t)0x761f82af, (q31_t)0x2228283f, (q31_t)0x7626368d, (q31_t)0x221d8711,
- (q31_t)0x762ce855, (q31_t)0x2212e492, (q31_t)0x76339806, (q31_t)0x220840c2,
- (q31_t)0x763a45a0, (q31_t)0x21fd9ba3, (q31_t)0x7640f123, (q31_t)0x21f2f534,
- (q31_t)0x76479a8e, (q31_t)0x21e84d76, (q31_t)0x764e41e2, (q31_t)0x21dda46a,
- (q31_t)0x7654e71d, (q31_t)0x21d2fa0f, (q31_t)0x765b8a41, (q31_t)0x21c84e67,
- (q31_t)0x76622b4c, (q31_t)0x21bda171, (q31_t)0x7668ca3e, (q31_t)0x21b2f32e,
- (q31_t)0x766f6717, (q31_t)0x21a8439e, (q31_t)0x767601d7, (q31_t)0x219d92c2,
- (q31_t)0x767c9a7e, (q31_t)0x2192e09b, (q31_t)0x7683310b, (q31_t)0x21882d28,
- (q31_t)0x7689c57d, (q31_t)0x217d786a, (q31_t)0x769057d6, (q31_t)0x2172c262,
- (q31_t)0x7696e814, (q31_t)0x21680b0f, (q31_t)0x769d7637, (q31_t)0x215d5273,
- (q31_t)0x76a4023f, (q31_t)0x2152988d, (q31_t)0x76aa8c2c, (q31_t)0x2147dd5f,
- (q31_t)0x76b113fd, (q31_t)0x213d20e8, (q31_t)0x76b799b3, (q31_t)0x21326329,
- (q31_t)0x76be1d4c, (q31_t)0x2127a423, (q31_t)0x76c49ec9, (q31_t)0x211ce3d5,
- (q31_t)0x76cb1e2a, (q31_t)0x21122240, (q31_t)0x76d19b6e, (q31_t)0x21075f65,
- (q31_t)0x76d81695, (q31_t)0x20fc9b44, (q31_t)0x76de8f9e, (q31_t)0x20f1d5de,
- (q31_t)0x76e5068a, (q31_t)0x20e70f32, (q31_t)0x76eb7b58, (q31_t)0x20dc4742,
- (q31_t)0x76f1ee09, (q31_t)0x20d17e0d, (q31_t)0x76f85e9a, (q31_t)0x20c6b395,
- (q31_t)0x76fecd0e, (q31_t)0x20bbe7d8, (q31_t)0x77053962, (q31_t)0x20b11ad9,
- (q31_t)0x770ba398, (q31_t)0x20a64c97, (q31_t)0x77120bae, (q31_t)0x209b7d13,
- (q31_t)0x771871a5, (q31_t)0x2090ac4d, (q31_t)0x771ed57c, (q31_t)0x2085da46,
- (q31_t)0x77253733, (q31_t)0x207b06fe, (q31_t)0x772b96ca, (q31_t)0x20703275,
- (q31_t)0x7731f440, (q31_t)0x20655cac, (q31_t)0x77384f95, (q31_t)0x205a85a3,
- (q31_t)0x773ea8ca, (q31_t)0x204fad5b, (q31_t)0x7744ffdd, (q31_t)0x2044d3d4,
- (q31_t)0x774b54ce, (q31_t)0x2039f90f, (q31_t)0x7751a79e, (q31_t)0x202f1d0b,
- (q31_t)0x7757f84c, (q31_t)0x20243fca, (q31_t)0x775e46d8, (q31_t)0x2019614c,
- (q31_t)0x77649341, (q31_t)0x200e8190, (q31_t)0x776add88, (q31_t)0x2003a099,
- (q31_t)0x777125ac, (q31_t)0x1ff8be65, (q31_t)0x77776bac, (q31_t)0x1feddaf6,
- (q31_t)0x777daf89, (q31_t)0x1fe2f64c, (q31_t)0x7783f143, (q31_t)0x1fd81067,
- (q31_t)0x778a30d8, (q31_t)0x1fcd2948, (q31_t)0x77906e49, (q31_t)0x1fc240ef,
- (q31_t)0x7796a996, (q31_t)0x1fb7575c, (q31_t)0x779ce2be, (q31_t)0x1fac6c91,
- (q31_t)0x77a319c2, (q31_t)0x1fa1808c, (q31_t)0x77a94ea0, (q31_t)0x1f969350,
- (q31_t)0x77af8159, (q31_t)0x1f8ba4dc, (q31_t)0x77b5b1ec, (q31_t)0x1f80b531,
- (q31_t)0x77bbe05a, (q31_t)0x1f75c44e, (q31_t)0x77c20ca1, (q31_t)0x1f6ad235,
- (q31_t)0x77c836c2, (q31_t)0x1f5fdee6, (q31_t)0x77ce5ebd, (q31_t)0x1f54ea62,
- (q31_t)0x77d48490, (q31_t)0x1f49f4a8, (q31_t)0x77daa83d, (q31_t)0x1f3efdb9,
- (q31_t)0x77e0c9c3, (q31_t)0x1f340596, (q31_t)0x77e6e921, (q31_t)0x1f290c3f,
- (q31_t)0x77ed0657, (q31_t)0x1f1e11b5, (q31_t)0x77f32165, (q31_t)0x1f1315f7,
- (q31_t)0x77f93a4b, (q31_t)0x1f081907, (q31_t)0x77ff5109, (q31_t)0x1efd1ae4,
- (q31_t)0x7805659e, (q31_t)0x1ef21b90, (q31_t)0x780b780a, (q31_t)0x1ee71b0a,
- (q31_t)0x7811884d, (q31_t)0x1edc1953, (q31_t)0x78179666, (q31_t)0x1ed1166b,
- (q31_t)0x781da256, (q31_t)0x1ec61254, (q31_t)0x7823ac1d, (q31_t)0x1ebb0d0d,
- (q31_t)0x7829b3b9, (q31_t)0x1eb00696, (q31_t)0x782fb92a, (q31_t)0x1ea4fef0,
- (q31_t)0x7835bc71, (q31_t)0x1e99f61d, (q31_t)0x783bbd8e, (q31_t)0x1e8eec1b,
- (q31_t)0x7841bc7f, (q31_t)0x1e83e0eb, (q31_t)0x7847b946, (q31_t)0x1e78d48e,
- (q31_t)0x784db3e0, (q31_t)0x1e6dc705, (q31_t)0x7853ac4f, (q31_t)0x1e62b84f,
- (q31_t)0x7859a292, (q31_t)0x1e57a86d, (q31_t)0x785f96a9, (q31_t)0x1e4c9760,
- (q31_t)0x78658894, (q31_t)0x1e418528, (q31_t)0x786b7852, (q31_t)0x1e3671c5,
- (q31_t)0x787165e3, (q31_t)0x1e2b5d38, (q31_t)0x78775147, (q31_t)0x1e204781,
- (q31_t)0x787d3a7e, (q31_t)0x1e1530a1, (q31_t)0x78832187, (q31_t)0x1e0a1898,
- (q31_t)0x78890663, (q31_t)0x1dfeff67, (q31_t)0x788ee910, (q31_t)0x1df3e50d,
- (q31_t)0x7894c98f, (q31_t)0x1de8c98c, (q31_t)0x789aa7e0, (q31_t)0x1dddace4,
- (q31_t)0x78a08402, (q31_t)0x1dd28f15, (q31_t)0x78a65df6, (q31_t)0x1dc7701f,
- (q31_t)0x78ac35ba, (q31_t)0x1dbc5004, (q31_t)0x78b20b4f, (q31_t)0x1db12ec3,
- (q31_t)0x78b7deb4, (q31_t)0x1da60c5d, (q31_t)0x78bdafea, (q31_t)0x1d9ae8d2,
- (q31_t)0x78c37eef, (q31_t)0x1d8fc424, (q31_t)0x78c94bc4, (q31_t)0x1d849e51,
- (q31_t)0x78cf1669, (q31_t)0x1d79775c, (q31_t)0x78d4dedd, (q31_t)0x1d6e4f43,
- (q31_t)0x78daa520, (q31_t)0x1d632608, (q31_t)0x78e06932, (q31_t)0x1d57fbaa,
- (q31_t)0x78e62b13, (q31_t)0x1d4cd02c, (q31_t)0x78ebeac2, (q31_t)0x1d41a38c,
- (q31_t)0x78f1a840, (q31_t)0x1d3675cb, (q31_t)0x78f7638b, (q31_t)0x1d2b46ea,
- (q31_t)0x78fd1ca4, (q31_t)0x1d2016e9, (q31_t)0x7902d38b, (q31_t)0x1d14e5c9,
- (q31_t)0x7908883f, (q31_t)0x1d09b389, (q31_t)0x790e3ac0, (q31_t)0x1cfe802b,
- (q31_t)0x7913eb0e, (q31_t)0x1cf34baf, (q31_t)0x79199929, (q31_t)0x1ce81615,
- (q31_t)0x791f4510, (q31_t)0x1cdcdf5e, (q31_t)0x7924eec3, (q31_t)0x1cd1a78a,
- (q31_t)0x792a9642, (q31_t)0x1cc66e99, (q31_t)0x79303b8e, (q31_t)0x1cbb348d,
- (q31_t)0x7935dea4, (q31_t)0x1caff965, (q31_t)0x793b7f86, (q31_t)0x1ca4bd21,
- (q31_t)0x79411e33, (q31_t)0x1c997fc4, (q31_t)0x7946baac, (q31_t)0x1c8e414b,
- (q31_t)0x794c54ee, (q31_t)0x1c8301b9, (q31_t)0x7951ecfc, (q31_t)0x1c77c10e,
- (q31_t)0x795782d3, (q31_t)0x1c6c7f4a, (q31_t)0x795d1675, (q31_t)0x1c613c6d,
- (q31_t)0x7962a7e0, (q31_t)0x1c55f878, (q31_t)0x79683715, (q31_t)0x1c4ab36b,
- (q31_t)0x796dc414, (q31_t)0x1c3f6d47, (q31_t)0x79734edc, (q31_t)0x1c34260c,
- (q31_t)0x7978d76c, (q31_t)0x1c28ddbb, (q31_t)0x797e5dc6, (q31_t)0x1c1d9454,
- (q31_t)0x7983e1e8, (q31_t)0x1c1249d8, (q31_t)0x798963d2, (q31_t)0x1c06fe46,
- (q31_t)0x798ee385, (q31_t)0x1bfbb1a0, (q31_t)0x799460ff, (q31_t)0x1bf063e6,
- (q31_t)0x7999dc42, (q31_t)0x1be51518, (q31_t)0x799f554b, (q31_t)0x1bd9c537,
- (q31_t)0x79a4cc1c, (q31_t)0x1bce7442, (q31_t)0x79aa40b4, (q31_t)0x1bc3223c,
- (q31_t)0x79afb313, (q31_t)0x1bb7cf23, (q31_t)0x79b52339, (q31_t)0x1bac7af9,
- (q31_t)0x79ba9125, (q31_t)0x1ba125bd, (q31_t)0x79bffcd7, (q31_t)0x1b95cf71,
- (q31_t)0x79c5664f, (q31_t)0x1b8a7815, (q31_t)0x79cacd8d, (q31_t)0x1b7f1fa9,
- (q31_t)0x79d03291, (q31_t)0x1b73c62d, (q31_t)0x79d5955a, (q31_t)0x1b686ba3,
- (q31_t)0x79daf5e8, (q31_t)0x1b5d100a, (q31_t)0x79e0543c, (q31_t)0x1b51b363,
- (q31_t)0x79e5b054, (q31_t)0x1b4655ae, (q31_t)0x79eb0a31, (q31_t)0x1b3af6ec,
- (q31_t)0x79f061d2, (q31_t)0x1b2f971e, (q31_t)0x79f5b737, (q31_t)0x1b243643,
- (q31_t)0x79fb0a60, (q31_t)0x1b18d45c, (q31_t)0x7a005b4d, (q31_t)0x1b0d716a,
- (q31_t)0x7a05a9fd, (q31_t)0x1b020d6c, (q31_t)0x7a0af671, (q31_t)0x1af6a865,
- (q31_t)0x7a1040a8, (q31_t)0x1aeb4253, (q31_t)0x7a1588a2, (q31_t)0x1adfdb37,
- (q31_t)0x7a1ace5f, (q31_t)0x1ad47312, (q31_t)0x7a2011de, (q31_t)0x1ac909e5,
- (q31_t)0x7a25531f, (q31_t)0x1abd9faf, (q31_t)0x7a2a9223, (q31_t)0x1ab23471,
- (q31_t)0x7a2fcee8, (q31_t)0x1aa6c82b, (q31_t)0x7a350970, (q31_t)0x1a9b5adf,
- (q31_t)0x7a3a41b9, (q31_t)0x1a8fec8c, (q31_t)0x7a3f77c3, (q31_t)0x1a847d33,
- (q31_t)0x7a44ab8e, (q31_t)0x1a790cd4, (q31_t)0x7a49dd1a, (q31_t)0x1a6d9b70,
- (q31_t)0x7a4f0c67, (q31_t)0x1a622907, (q31_t)0x7a543974, (q31_t)0x1a56b599,
- (q31_t)0x7a596442, (q31_t)0x1a4b4128, (q31_t)0x7a5e8cd0, (q31_t)0x1a3fcbb3,
- (q31_t)0x7a63b31d, (q31_t)0x1a34553b, (q31_t)0x7a68d72b, (q31_t)0x1a28ddc0,
- (q31_t)0x7a6df8f8, (q31_t)0x1a1d6544, (q31_t)0x7a731884, (q31_t)0x1a11ebc5,
- (q31_t)0x7a7835cf, (q31_t)0x1a067145, (q31_t)0x7a7d50da, (q31_t)0x19faf5c5,
- (q31_t)0x7a8269a3, (q31_t)0x19ef7944, (q31_t)0x7a87802a, (q31_t)0x19e3fbc3,
- (q31_t)0x7a8c9470, (q31_t)0x19d87d42, (q31_t)0x7a91a674, (q31_t)0x19ccfdc2,
- (q31_t)0x7a96b636, (q31_t)0x19c17d44, (q31_t)0x7a9bc3b6, (q31_t)0x19b5fbc8,
- (q31_t)0x7aa0cef3, (q31_t)0x19aa794d, (q31_t)0x7aa5d7ee, (q31_t)0x199ef5d6,
- (q31_t)0x7aaadea6, (q31_t)0x19937161, (q31_t)0x7aafe31b, (q31_t)0x1987ebf0,
- (q31_t)0x7ab4e54c, (q31_t)0x197c6584, (q31_t)0x7ab9e53a, (q31_t)0x1970de1b,
- (q31_t)0x7abee2e5, (q31_t)0x196555b8, (q31_t)0x7ac3de4c, (q31_t)0x1959cc5a,
- (q31_t)0x7ac8d76f, (q31_t)0x194e4201, (q31_t)0x7acdce4d, (q31_t)0x1942b6af,
- (q31_t)0x7ad2c2e8, (q31_t)0x19372a64, (q31_t)0x7ad7b53d, (q31_t)0x192b9d1f,
- (q31_t)0x7adca54e, (q31_t)0x19200ee3, (q31_t)0x7ae1931a, (q31_t)0x19147fae,
- (q31_t)0x7ae67ea1, (q31_t)0x1908ef82, (q31_t)0x7aeb67e3, (q31_t)0x18fd5e5f,
- (q31_t)0x7af04edf, (q31_t)0x18f1cc45, (q31_t)0x7af53395, (q31_t)0x18e63935,
- (q31_t)0x7afa1605, (q31_t)0x18daa52f, (q31_t)0x7afef630, (q31_t)0x18cf1034,
- (q31_t)0x7b03d414, (q31_t)0x18c37a44, (q31_t)0x7b08afb2, (q31_t)0x18b7e35f,
- (q31_t)0x7b0d8909, (q31_t)0x18ac4b87, (q31_t)0x7b126019, (q31_t)0x18a0b2bb,
- (q31_t)0x7b1734e2, (q31_t)0x189518fc, (q31_t)0x7b1c0764, (q31_t)0x18897e4a,
- (q31_t)0x7b20d79e, (q31_t)0x187de2a7, (q31_t)0x7b25a591, (q31_t)0x18724611,
- (q31_t)0x7b2a713d, (q31_t)0x1866a88a, (q31_t)0x7b2f3aa0, (q31_t)0x185b0a13,
- (q31_t)0x7b3401bb, (q31_t)0x184f6aab, (q31_t)0x7b38c68e, (q31_t)0x1843ca53,
- (q31_t)0x7b3d8918, (q31_t)0x1838290c, (q31_t)0x7b42495a, (q31_t)0x182c86d5,
- (q31_t)0x7b470753, (q31_t)0x1820e3b0, (q31_t)0x7b4bc303, (q31_t)0x18153f9d,
- (q31_t)0x7b507c69, (q31_t)0x18099a9c, (q31_t)0x7b553386, (q31_t)0x17fdf4ae,
- (q31_t)0x7b59e85a, (q31_t)0x17f24dd3, (q31_t)0x7b5e9ae4, (q31_t)0x17e6a60c,
- (q31_t)0x7b634b23, (q31_t)0x17dafd59, (q31_t)0x7b67f919, (q31_t)0x17cf53bb,
- (q31_t)0x7b6ca4c4, (q31_t)0x17c3a931, (q31_t)0x7b714e25, (q31_t)0x17b7fdbd,
- (q31_t)0x7b75f53c, (q31_t)0x17ac515f, (q31_t)0x7b7a9a07, (q31_t)0x17a0a417,
- (q31_t)0x7b7f3c87, (q31_t)0x1794f5e6, (q31_t)0x7b83dcbc, (q31_t)0x178946cc,
- (q31_t)0x7b887aa6, (q31_t)0x177d96ca, (q31_t)0x7b8d1644, (q31_t)0x1771e5e0,
- (q31_t)0x7b91af97, (q31_t)0x1766340f, (q31_t)0x7b96469d, (q31_t)0x175a8157,
- (q31_t)0x7b9adb57, (q31_t)0x174ecdb8, (q31_t)0x7b9f6dc5, (q31_t)0x17431933,
- (q31_t)0x7ba3fde7, (q31_t)0x173763c9, (q31_t)0x7ba88bbc, (q31_t)0x172bad7a,
- (q31_t)0x7bad1744, (q31_t)0x171ff646, (q31_t)0x7bb1a080, (q31_t)0x17143e2d,
- (q31_t)0x7bb6276e, (q31_t)0x17088531, (q31_t)0x7bbaac0e, (q31_t)0x16fccb51,
- (q31_t)0x7bbf2e62, (q31_t)0x16f1108f, (q31_t)0x7bc3ae67, (q31_t)0x16e554ea,
- (q31_t)0x7bc82c1f, (q31_t)0x16d99864, (q31_t)0x7bcca789, (q31_t)0x16cddafb,
- (q31_t)0x7bd120a4, (q31_t)0x16c21cb2, (q31_t)0x7bd59771, (q31_t)0x16b65d88,
- (q31_t)0x7bda0bf0, (q31_t)0x16aa9d7e, (q31_t)0x7bde7e20, (q31_t)0x169edc94,
- (q31_t)0x7be2ee01, (q31_t)0x16931acb, (q31_t)0x7be75b93, (q31_t)0x16875823,
- (q31_t)0x7bebc6d5, (q31_t)0x167b949d, (q31_t)0x7bf02fc9, (q31_t)0x166fd039,
- (q31_t)0x7bf4966c, (q31_t)0x16640af7, (q31_t)0x7bf8fac0, (q31_t)0x165844d8,
- (q31_t)0x7bfd5cc4, (q31_t)0x164c7ddd, (q31_t)0x7c01bc78, (q31_t)0x1640b606,
- (q31_t)0x7c0619dc, (q31_t)0x1634ed53, (q31_t)0x7c0a74f0, (q31_t)0x162923c5,
- (q31_t)0x7c0ecdb2, (q31_t)0x161d595d, (q31_t)0x7c132424, (q31_t)0x16118e1a,
- (q31_t)0x7c177845, (q31_t)0x1605c1fd, (q31_t)0x7c1bca16, (q31_t)0x15f9f507,
- (q31_t)0x7c201994, (q31_t)0x15ee2738, (q31_t)0x7c2466c2, (q31_t)0x15e25890,
- (q31_t)0x7c28b19e, (q31_t)0x15d68911, (q31_t)0x7c2cfa28, (q31_t)0x15cab8ba,
- (q31_t)0x7c314060, (q31_t)0x15bee78c, (q31_t)0x7c358446, (q31_t)0x15b31587,
- (q31_t)0x7c39c5da, (q31_t)0x15a742ac, (q31_t)0x7c3e051b, (q31_t)0x159b6efb,
- (q31_t)0x7c42420a, (q31_t)0x158f9a76, (q31_t)0x7c467ca6, (q31_t)0x1583c51b,
- (q31_t)0x7c4ab4ef, (q31_t)0x1577eeec, (q31_t)0x7c4eeae5, (q31_t)0x156c17e9,
- (q31_t)0x7c531e88, (q31_t)0x15604013, (q31_t)0x7c574fd8, (q31_t)0x1554676a,
- (q31_t)0x7c5b7ed4, (q31_t)0x15488dee, (q31_t)0x7c5fab7c, (q31_t)0x153cb3a0,
- (q31_t)0x7c63d5d1, (q31_t)0x1530d881, (q31_t)0x7c67fdd1, (q31_t)0x1524fc90,
- (q31_t)0x7c6c237e, (q31_t)0x15191fcf, (q31_t)0x7c7046d6, (q31_t)0x150d423d,
- (q31_t)0x7c7467d9, (q31_t)0x150163dc, (q31_t)0x7c788688, (q31_t)0x14f584ac,
- (q31_t)0x7c7ca2e2, (q31_t)0x14e9a4ac, (q31_t)0x7c80bce7, (q31_t)0x14ddc3de,
- (q31_t)0x7c84d496, (q31_t)0x14d1e242, (q31_t)0x7c88e9f1, (q31_t)0x14c5ffd9,
- (q31_t)0x7c8cfcf6, (q31_t)0x14ba1ca3, (q31_t)0x7c910da5, (q31_t)0x14ae38a0,
- (q31_t)0x7c951bff, (q31_t)0x14a253d1, (q31_t)0x7c992803, (q31_t)0x14966e36,
- (q31_t)0x7c9d31b0, (q31_t)0x148a87d1, (q31_t)0x7ca13908, (q31_t)0x147ea0a0,
- (q31_t)0x7ca53e09, (q31_t)0x1472b8a5, (q31_t)0x7ca940b3, (q31_t)0x1466cfe1,
- (q31_t)0x7cad4107, (q31_t)0x145ae653, (q31_t)0x7cb13f04, (q31_t)0x144efbfc,
- (q31_t)0x7cb53aaa, (q31_t)0x144310dd, (q31_t)0x7cb933f9, (q31_t)0x143724f5,
- (q31_t)0x7cbd2af0, (q31_t)0x142b3846, (q31_t)0x7cc11f90, (q31_t)0x141f4ad1,
- (q31_t)0x7cc511d9, (q31_t)0x14135c94, (q31_t)0x7cc901c9, (q31_t)0x14076d91,
- (q31_t)0x7cccef62, (q31_t)0x13fb7dc9, (q31_t)0x7cd0daa2, (q31_t)0x13ef8d3c,
- (q31_t)0x7cd4c38b, (q31_t)0x13e39be9, (q31_t)0x7cd8aa1b, (q31_t)0x13d7a9d3,
- (q31_t)0x7cdc8e52, (q31_t)0x13cbb6f8, (q31_t)0x7ce07031, (q31_t)0x13bfc35b,
- (q31_t)0x7ce44fb7, (q31_t)0x13b3cefa, (q31_t)0x7ce82ce4, (q31_t)0x13a7d9d7,
- (q31_t)0x7cec07b8, (q31_t)0x139be3f2, (q31_t)0x7cefe032, (q31_t)0x138fed4b,
- (q31_t)0x7cf3b653, (q31_t)0x1383f5e3, (q31_t)0x7cf78a1b, (q31_t)0x1377fdbb,
- (q31_t)0x7cfb5b89, (q31_t)0x136c04d2, (q31_t)0x7cff2a9d, (q31_t)0x13600b2a,
- (q31_t)0x7d02f757, (q31_t)0x135410c3, (q31_t)0x7d06c1b6, (q31_t)0x1348159d,
- (q31_t)0x7d0a89bc, (q31_t)0x133c19b8, (q31_t)0x7d0e4f67, (q31_t)0x13301d16,
- (q31_t)0x7d1212b7, (q31_t)0x13241fb6, (q31_t)0x7d15d3ad, (q31_t)0x1318219a,
- (q31_t)0x7d199248, (q31_t)0x130c22c1, (q31_t)0x7d1d4e88, (q31_t)0x1300232c,
- (q31_t)0x7d21086c, (q31_t)0x12f422db, (q31_t)0x7d24bff6, (q31_t)0x12e821cf,
- (q31_t)0x7d287523, (q31_t)0x12dc2009, (q31_t)0x7d2c27f6, (q31_t)0x12d01d89,
- (q31_t)0x7d2fd86c, (q31_t)0x12c41a4f, (q31_t)0x7d338687, (q31_t)0x12b8165b,
- (q31_t)0x7d373245, (q31_t)0x12ac11af, (q31_t)0x7d3adba7, (q31_t)0x12a00c4b,
- (q31_t)0x7d3e82ae, (q31_t)0x1294062f, (q31_t)0x7d422757, (q31_t)0x1287ff5b,
- (q31_t)0x7d45c9a4, (q31_t)0x127bf7d1, (q31_t)0x7d496994, (q31_t)0x126fef90,
- (q31_t)0x7d4d0728, (q31_t)0x1263e699, (q31_t)0x7d50a25e, (q31_t)0x1257dced,
- (q31_t)0x7d543b37, (q31_t)0x124bd28c, (q31_t)0x7d57d1b3, (q31_t)0x123fc776,
- (q31_t)0x7d5b65d2, (q31_t)0x1233bbac, (q31_t)0x7d5ef793, (q31_t)0x1227af2e,
- (q31_t)0x7d6286f6, (q31_t)0x121ba1fd, (q31_t)0x7d6613fb, (q31_t)0x120f941a,
- (q31_t)0x7d699ea3, (q31_t)0x12038584, (q31_t)0x7d6d26ec, (q31_t)0x11f7763c,
- (q31_t)0x7d70acd7, (q31_t)0x11eb6643, (q31_t)0x7d743064, (q31_t)0x11df5599,
- (q31_t)0x7d77b192, (q31_t)0x11d3443f, (q31_t)0x7d7b3061, (q31_t)0x11c73235,
- (q31_t)0x7d7eacd2, (q31_t)0x11bb1f7c, (q31_t)0x7d8226e4, (q31_t)0x11af0c13,
- (q31_t)0x7d859e96, (q31_t)0x11a2f7fc, (q31_t)0x7d8913ea, (q31_t)0x1196e337,
- (q31_t)0x7d8c86de, (q31_t)0x118acdc4, (q31_t)0x7d8ff772, (q31_t)0x117eb7a4,
- (q31_t)0x7d9365a8, (q31_t)0x1172a0d7, (q31_t)0x7d96d17d, (q31_t)0x1166895f,
- (q31_t)0x7d9a3af2, (q31_t)0x115a713a, (q31_t)0x7d9da208, (q31_t)0x114e586a,
- (q31_t)0x7da106bd, (q31_t)0x11423ef0, (q31_t)0x7da46912, (q31_t)0x113624cb,
- (q31_t)0x7da7c907, (q31_t)0x112a09fc, (q31_t)0x7dab269b, (q31_t)0x111dee84,
- (q31_t)0x7dae81cf, (q31_t)0x1111d263, (q31_t)0x7db1daa2, (q31_t)0x1105b599,
- (q31_t)0x7db53113, (q31_t)0x10f99827, (q31_t)0x7db88524, (q31_t)0x10ed7a0e,
- (q31_t)0x7dbbd6d4, (q31_t)0x10e15b4e, (q31_t)0x7dbf2622, (q31_t)0x10d53be7,
- (q31_t)0x7dc2730f, (q31_t)0x10c91bda, (q31_t)0x7dc5bd9b, (q31_t)0x10bcfb28,
- (q31_t)0x7dc905c5, (q31_t)0x10b0d9d0, (q31_t)0x7dcc4b8d, (q31_t)0x10a4b7d3,
- (q31_t)0x7dcf8ef3, (q31_t)0x10989532, (q31_t)0x7dd2cff7, (q31_t)0x108c71ee,
- (q31_t)0x7dd60e99, (q31_t)0x10804e06, (q31_t)0x7dd94ad8, (q31_t)0x1074297b,
- (q31_t)0x7ddc84b5, (q31_t)0x1068044e, (q31_t)0x7ddfbc30, (q31_t)0x105bde7f,
- (q31_t)0x7de2f148, (q31_t)0x104fb80e, (q31_t)0x7de623fd, (q31_t)0x104390fd,
- (q31_t)0x7de9544f, (q31_t)0x1037694b, (q31_t)0x7dec823e, (q31_t)0x102b40f8,
- (q31_t)0x7defadca, (q31_t)0x101f1807, (q31_t)0x7df2d6f3, (q31_t)0x1012ee76,
- (q31_t)0x7df5fdb8, (q31_t)0x1006c446, (q31_t)0x7df9221a, (q31_t)0xffa9979,
- (q31_t)0x7dfc4418, (q31_t)0xfee6e0d, (q31_t)0x7dff63b2, (q31_t)0xfe24205,
- (q31_t)0x7e0280e9, (q31_t)0xfd6155f, (q31_t)0x7e059bbb, (q31_t)0xfc9e81e,
- (q31_t)0x7e08b42a, (q31_t)0xfbdba40, (q31_t)0x7e0bca34, (q31_t)0xfb18bc8,
- (q31_t)0x7e0eddd9, (q31_t)0xfa55cb4, (q31_t)0x7e11ef1b, (q31_t)0xf992d06,
- (q31_t)0x7e14fdf7, (q31_t)0xf8cfcbe, (q31_t)0x7e180a6f, (q31_t)0xf80cbdc,
- (q31_t)0x7e1b1482, (q31_t)0xf749a61, (q31_t)0x7e1e1c30, (q31_t)0xf68684e,
- (q31_t)0x7e212179, (q31_t)0xf5c35a3, (q31_t)0x7e24245d, (q31_t)0xf500260,
- (q31_t)0x7e2724db, (q31_t)0xf43ce86, (q31_t)0x7e2a22f4, (q31_t)0xf379a16,
- (q31_t)0x7e2d1ea8, (q31_t)0xf2b650f, (q31_t)0x7e3017f6, (q31_t)0xf1f2f73,
- (q31_t)0x7e330ede, (q31_t)0xf12f941, (q31_t)0x7e360360, (q31_t)0xf06c27a,
- (q31_t)0x7e38f57c, (q31_t)0xefa8b20, (q31_t)0x7e3be532, (q31_t)0xeee5331,
- (q31_t)0x7e3ed282, (q31_t)0xee21aaf, (q31_t)0x7e41bd6c, (q31_t)0xed5e19a,
- (q31_t)0x7e44a5ef, (q31_t)0xec9a7f3, (q31_t)0x7e478c0b, (q31_t)0xebd6db9,
- (q31_t)0x7e4a6fc1, (q31_t)0xeb132ef, (q31_t)0x7e4d5110, (q31_t)0xea4f793,
- (q31_t)0x7e502ff9, (q31_t)0xe98bba7, (q31_t)0x7e530c7a, (q31_t)0xe8c7f2a,
- (q31_t)0x7e55e694, (q31_t)0xe80421e, (q31_t)0x7e58be47, (q31_t)0xe740483,
- (q31_t)0x7e5b9392, (q31_t)0xe67c65a, (q31_t)0x7e5e6676, (q31_t)0xe5b87a2,
- (q31_t)0x7e6136f3, (q31_t)0xe4f485c, (q31_t)0x7e640507, (q31_t)0xe430889,
- (q31_t)0x7e66d0b4, (q31_t)0xe36c82a, (q31_t)0x7e6999fa, (q31_t)0xe2a873e,
- (q31_t)0x7e6c60d7, (q31_t)0xe1e45c6, (q31_t)0x7e6f254c, (q31_t)0xe1203c3,
- (q31_t)0x7e71e759, (q31_t)0xe05c135, (q31_t)0x7e74a6fd, (q31_t)0xdf97e1d,
- (q31_t)0x7e77643a, (q31_t)0xded3a7b, (q31_t)0x7e7a1f0d, (q31_t)0xde0f64f,
- (q31_t)0x7e7cd778, (q31_t)0xdd4b19a, (q31_t)0x7e7f8d7b, (q31_t)0xdc86c5d,
- (q31_t)0x7e824114, (q31_t)0xdbc2698, (q31_t)0x7e84f245, (q31_t)0xdafe04b,
- (q31_t)0x7e87a10c, (q31_t)0xda39978, (q31_t)0x7e8a4d6a, (q31_t)0xd97521d,
- (q31_t)0x7e8cf75f, (q31_t)0xd8b0a3d, (q31_t)0x7e8f9eeb, (q31_t)0xd7ec1d6,
- (q31_t)0x7e92440d, (q31_t)0xd7278eb, (q31_t)0x7e94e6c6, (q31_t)0xd662f7b,
- (q31_t)0x7e978715, (q31_t)0xd59e586, (q31_t)0x7e9a24fb, (q31_t)0xd4d9b0e,
- (q31_t)0x7e9cc076, (q31_t)0xd415013, (q31_t)0x7e9f5988, (q31_t)0xd350495,
- (q31_t)0x7ea1f02f, (q31_t)0xd28b894, (q31_t)0x7ea4846c, (q31_t)0xd1c6c11,
- (q31_t)0x7ea7163f, (q31_t)0xd101f0e, (q31_t)0x7ea9a5a8, (q31_t)0xd03d189,
- (q31_t)0x7eac32a6, (q31_t)0xcf78383, (q31_t)0x7eaebd3a, (q31_t)0xceb34fe,
- (q31_t)0x7eb14563, (q31_t)0xcdee5f9, (q31_t)0x7eb3cb21, (q31_t)0xcd29676,
- (q31_t)0x7eb64e75, (q31_t)0xcc64673, (q31_t)0x7eb8cf5d, (q31_t)0xcb9f5f3,
- (q31_t)0x7ebb4ddb, (q31_t)0xcada4f5, (q31_t)0x7ebdc9ed, (q31_t)0xca1537a,
- (q31_t)0x7ec04394, (q31_t)0xc950182, (q31_t)0x7ec2bad0, (q31_t)0xc88af0e,
- (q31_t)0x7ec52fa0, (q31_t)0xc7c5c1e, (q31_t)0x7ec7a205, (q31_t)0xc7008b3,
- (q31_t)0x7eca11fe, (q31_t)0xc63b4ce, (q31_t)0x7ecc7f8b, (q31_t)0xc57606e,
- (q31_t)0x7eceeaad, (q31_t)0xc4b0b94, (q31_t)0x7ed15363, (q31_t)0xc3eb641,
- (q31_t)0x7ed3b9ad, (q31_t)0xc326075, (q31_t)0x7ed61d8a, (q31_t)0xc260a31,
- (q31_t)0x7ed87efc, (q31_t)0xc19b374, (q31_t)0x7edade01, (q31_t)0xc0d5c41,
- (q31_t)0x7edd3a9a, (q31_t)0xc010496, (q31_t)0x7edf94c7, (q31_t)0xbf4ac75,
- (q31_t)0x7ee1ec87, (q31_t)0xbe853de, (q31_t)0x7ee441da, (q31_t)0xbdbfad1,
- (q31_t)0x7ee694c1, (q31_t)0xbcfa150, (q31_t)0x7ee8e53a, (q31_t)0xbc34759,
- (q31_t)0x7eeb3347, (q31_t)0xbb6ecef, (q31_t)0x7eed7ee7, (q31_t)0xbaa9211,
- (q31_t)0x7eefc81a, (q31_t)0xb9e36c0, (q31_t)0x7ef20ee0, (q31_t)0xb91dafc,
- (q31_t)0x7ef45338, (q31_t)0xb857ec7, (q31_t)0x7ef69523, (q31_t)0xb79221f,
- (q31_t)0x7ef8d4a1, (q31_t)0xb6cc506, (q31_t)0x7efb11b1, (q31_t)0xb60677c,
- (q31_t)0x7efd4c54, (q31_t)0xb540982, (q31_t)0x7eff8489, (q31_t)0xb47ab19,
- (q31_t)0x7f01ba50, (q31_t)0xb3b4c40, (q31_t)0x7f03eda9, (q31_t)0xb2eecf8,
- (q31_t)0x7f061e95, (q31_t)0xb228d42, (q31_t)0x7f084d12, (q31_t)0xb162d1d,
- (q31_t)0x7f0a7921, (q31_t)0xb09cc8c, (q31_t)0x7f0ca2c2, (q31_t)0xafd6b8d,
- (q31_t)0x7f0ec9f5, (q31_t)0xaf10a22, (q31_t)0x7f10eeb9, (q31_t)0xae4a84b,
- (q31_t)0x7f13110f, (q31_t)0xad84609, (q31_t)0x7f1530f7, (q31_t)0xacbe35b,
- (q31_t)0x7f174e70, (q31_t)0xabf8043, (q31_t)0x7f19697a, (q31_t)0xab31cc1,
- (q31_t)0x7f1b8215, (q31_t)0xaa6b8d5, (q31_t)0x7f1d9842, (q31_t)0xa9a5480,
- (q31_t)0x7f1fabff, (q31_t)0xa8defc3, (q31_t)0x7f21bd4e, (q31_t)0xa818a9d,
- (q31_t)0x7f23cc2e, (q31_t)0xa752510, (q31_t)0x7f25d89e, (q31_t)0xa68bf1b,
- (q31_t)0x7f27e29f, (q31_t)0xa5c58c0, (q31_t)0x7f29ea31, (q31_t)0xa4ff1fe,
- (q31_t)0x7f2bef53, (q31_t)0xa438ad7, (q31_t)0x7f2df206, (q31_t)0xa37234a,
- (q31_t)0x7f2ff24a, (q31_t)0xa2abb59, (q31_t)0x7f31f01d, (q31_t)0xa1e5303,
- (q31_t)0x7f33eb81, (q31_t)0xa11ea49, (q31_t)0x7f35e476, (q31_t)0xa05812c,
- (q31_t)0x7f37dafa, (q31_t)0x9f917ac, (q31_t)0x7f39cf0e, (q31_t)0x9ecadc9,
- (q31_t)0x7f3bc0b3, (q31_t)0x9e04385, (q31_t)0x7f3dafe7, (q31_t)0x9d3d8df,
- (q31_t)0x7f3f9cab, (q31_t)0x9c76dd8, (q31_t)0x7f4186ff, (q31_t)0x9bb0271,
- (q31_t)0x7f436ee3, (q31_t)0x9ae96aa, (q31_t)0x7f455456, (q31_t)0x9a22a83,
- (q31_t)0x7f473759, (q31_t)0x995bdfd, (q31_t)0x7f4917eb, (q31_t)0x9895118,
- (q31_t)0x7f4af60d, (q31_t)0x97ce3d5, (q31_t)0x7f4cd1be, (q31_t)0x9707635,
- (q31_t)0x7f4eaafe, (q31_t)0x9640837, (q31_t)0x7f5081cd, (q31_t)0x95799dd,
- (q31_t)0x7f52562c, (q31_t)0x94b2b27, (q31_t)0x7f54281a, (q31_t)0x93ebc14,
- (q31_t)0x7f55f796, (q31_t)0x9324ca7, (q31_t)0x7f57c4a2, (q31_t)0x925dcdf,
- (q31_t)0x7f598f3c, (q31_t)0x9196cbc, (q31_t)0x7f5b5765, (q31_t)0x90cfc40,
- (q31_t)0x7f5d1d1d, (q31_t)0x9008b6a, (q31_t)0x7f5ee063, (q31_t)0x8f41a3c,
- (q31_t)0x7f60a138, (q31_t)0x8e7a8b5, (q31_t)0x7f625f9b, (q31_t)0x8db36d6,
- (q31_t)0x7f641b8d, (q31_t)0x8cec4a0, (q31_t)0x7f65d50d, (q31_t)0x8c25213,
- (q31_t)0x7f678c1c, (q31_t)0x8b5df30, (q31_t)0x7f6940b8, (q31_t)0x8a96bf6,
- (q31_t)0x7f6af2e3, (q31_t)0x89cf867, (q31_t)0x7f6ca29c, (q31_t)0x8908483,
- (q31_t)0x7f6e4fe3, (q31_t)0x884104b, (q31_t)0x7f6ffab8, (q31_t)0x8779bbe,
- (q31_t)0x7f71a31b, (q31_t)0x86b26de, (q31_t)0x7f73490b, (q31_t)0x85eb1ab,
- (q31_t)0x7f74ec8a, (q31_t)0x8523c25, (q31_t)0x7f768d96, (q31_t)0x845c64d,
- (q31_t)0x7f782c30, (q31_t)0x8395024, (q31_t)0x7f79c857, (q31_t)0x82cd9a9,
- (q31_t)0x7f7b620c, (q31_t)0x82062de, (q31_t)0x7f7cf94e, (q31_t)0x813ebc2,
- (q31_t)0x7f7e8e1e, (q31_t)0x8077457, (q31_t)0x7f80207b, (q31_t)0x7fafc9c,
- (q31_t)0x7f81b065, (q31_t)0x7ee8493, (q31_t)0x7f833ddd, (q31_t)0x7e20c3b,
- (q31_t)0x7f84c8e2, (q31_t)0x7d59396, (q31_t)0x7f865174, (q31_t)0x7c91aa3,
- (q31_t)0x7f87d792, (q31_t)0x7bca163, (q31_t)0x7f895b3e, (q31_t)0x7b027d7,
- (q31_t)0x7f8adc77, (q31_t)0x7a3adff, (q31_t)0x7f8c5b3d, (q31_t)0x79733dc,
- (q31_t)0x7f8dd78f, (q31_t)0x78ab96e, (q31_t)0x7f8f516e, (q31_t)0x77e3eb5,
- (q31_t)0x7f90c8da, (q31_t)0x771c3b3, (q31_t)0x7f923dd2, (q31_t)0x7654867,
- (q31_t)0x7f93b058, (q31_t)0x758ccd2, (q31_t)0x7f952069, (q31_t)0x74c50f4,
- (q31_t)0x7f968e07, (q31_t)0x73fd4cf, (q31_t)0x7f97f932, (q31_t)0x7335862,
- (q31_t)0x7f9961e8, (q31_t)0x726dbae, (q31_t)0x7f9ac82c, (q31_t)0x71a5eb3,
- (q31_t)0x7f9c2bfb, (q31_t)0x70de172, (q31_t)0x7f9d8d56, (q31_t)0x70163eb,
- (q31_t)0x7f9eec3e, (q31_t)0x6f4e620, (q31_t)0x7fa048b2, (q31_t)0x6e86810,
- (q31_t)0x7fa1a2b2, (q31_t)0x6dbe9bb, (q31_t)0x7fa2fa3d, (q31_t)0x6cf6b23,
- (q31_t)0x7fa44f55, (q31_t)0x6c2ec48, (q31_t)0x7fa5a1f9, (q31_t)0x6b66d29,
- (q31_t)0x7fa6f228, (q31_t)0x6a9edc9, (q31_t)0x7fa83fe3, (q31_t)0x69d6e27,
- (q31_t)0x7fa98b2a, (q31_t)0x690ee44, (q31_t)0x7faad3fd, (q31_t)0x6846e1f,
- (q31_t)0x7fac1a5b, (q31_t)0x677edbb, (q31_t)0x7fad5e45, (q31_t)0x66b6d16,
- (q31_t)0x7fae9fbb, (q31_t)0x65eec33, (q31_t)0x7fafdebb, (q31_t)0x6526b10,
- (q31_t)0x7fb11b48, (q31_t)0x645e9af, (q31_t)0x7fb2555f, (q31_t)0x6396810,
- (q31_t)0x7fb38d02, (q31_t)0x62ce634, (q31_t)0x7fb4c231, (q31_t)0x620641a,
- (q31_t)0x7fb5f4ea, (q31_t)0x613e1c5, (q31_t)0x7fb7252f, (q31_t)0x6075f33,
- (q31_t)0x7fb852ff, (q31_t)0x5fadc66, (q31_t)0x7fb97e5a, (q31_t)0x5ee595d,
- (q31_t)0x7fbaa740, (q31_t)0x5e1d61b, (q31_t)0x7fbbcdb1, (q31_t)0x5d5529e,
- (q31_t)0x7fbcf1ad, (q31_t)0x5c8cee7, (q31_t)0x7fbe1334, (q31_t)0x5bc4af8,
- (q31_t)0x7fbf3246, (q31_t)0x5afc6d0, (q31_t)0x7fc04ee3, (q31_t)0x5a3426f,
- (q31_t)0x7fc1690a, (q31_t)0x596bdd7, (q31_t)0x7fc280bc, (q31_t)0x58a3908,
- (q31_t)0x7fc395f9, (q31_t)0x57db403, (q31_t)0x7fc4a8c1, (q31_t)0x5712ec7,
- (q31_t)0x7fc5b913, (q31_t)0x564a955, (q31_t)0x7fc6c6f0, (q31_t)0x55823ae,
- (q31_t)0x7fc7d258, (q31_t)0x54b9dd3, (q31_t)0x7fc8db4a, (q31_t)0x53f17c3,
- (q31_t)0x7fc9e1c6, (q31_t)0x532917f, (q31_t)0x7fcae5cd, (q31_t)0x5260b08,
- (q31_t)0x7fcbe75e, (q31_t)0x519845e, (q31_t)0x7fcce67a, (q31_t)0x50cfd82,
- (q31_t)0x7fcde320, (q31_t)0x5007674, (q31_t)0x7fcedd50, (q31_t)0x4f3ef35,
- (q31_t)0x7fcfd50b, (q31_t)0x4e767c5, (q31_t)0x7fd0ca4f, (q31_t)0x4dae024,
- (q31_t)0x7fd1bd1e, (q31_t)0x4ce5854, (q31_t)0x7fd2ad77, (q31_t)0x4c1d054,
- (q31_t)0x7fd39b5a, (q31_t)0x4b54825, (q31_t)0x7fd486c7, (q31_t)0x4a8bfc7,
- (q31_t)0x7fd56fbe, (q31_t)0x49c373c, (q31_t)0x7fd6563f, (q31_t)0x48fae83,
- (q31_t)0x7fd73a4a, (q31_t)0x483259d, (q31_t)0x7fd81bdf, (q31_t)0x4769c8b,
- (q31_t)0x7fd8fafe, (q31_t)0x46a134c, (q31_t)0x7fd9d7a7, (q31_t)0x45d89e2,
- (q31_t)0x7fdab1d9, (q31_t)0x451004d, (q31_t)0x7fdb8996, (q31_t)0x444768d,
- (q31_t)0x7fdc5edc, (q31_t)0x437eca4, (q31_t)0x7fdd31ac, (q31_t)0x42b6290,
- (q31_t)0x7fde0205, (q31_t)0x41ed854, (q31_t)0x7fdecfe8, (q31_t)0x4124dee,
- (q31_t)0x7fdf9b55, (q31_t)0x405c361, (q31_t)0x7fe0644b, (q31_t)0x3f938ac,
- (q31_t)0x7fe12acb, (q31_t)0x3ecadcf, (q31_t)0x7fe1eed5, (q31_t)0x3e022cc,
- (q31_t)0x7fe2b067, (q31_t)0x3d397a3, (q31_t)0x7fe36f84, (q31_t)0x3c70c54,
- (q31_t)0x7fe42c2a, (q31_t)0x3ba80df, (q31_t)0x7fe4e659, (q31_t)0x3adf546,
- (q31_t)0x7fe59e12, (q31_t)0x3a16988, (q31_t)0x7fe65354, (q31_t)0x394dda7,
- (q31_t)0x7fe7061f, (q31_t)0x38851a2, (q31_t)0x7fe7b674, (q31_t)0x37bc57b,
- (q31_t)0x7fe86452, (q31_t)0x36f3931, (q31_t)0x7fe90fb9, (q31_t)0x362acc5,
- (q31_t)0x7fe9b8a9, (q31_t)0x3562038, (q31_t)0x7fea5f23, (q31_t)0x3499389,
- (q31_t)0x7feb0326, (q31_t)0x33d06bb, (q31_t)0x7feba4b2, (q31_t)0x33079cc,
- (q31_t)0x7fec43c7, (q31_t)0x323ecbe, (q31_t)0x7fece065, (q31_t)0x3175f91,
- (q31_t)0x7fed7a8c, (q31_t)0x30ad245, (q31_t)0x7fee123d, (q31_t)0x2fe44dc,
- (q31_t)0x7feea776, (q31_t)0x2f1b755, (q31_t)0x7fef3a39, (q31_t)0x2e529b0,
- (q31_t)0x7fefca84, (q31_t)0x2d89bf0, (q31_t)0x7ff05858, (q31_t)0x2cc0e13,
- (q31_t)0x7ff0e3b6, (q31_t)0x2bf801a, (q31_t)0x7ff16c9c, (q31_t)0x2b2f207,
- (q31_t)0x7ff1f30b, (q31_t)0x2a663d8, (q31_t)0x7ff27703, (q31_t)0x299d590,
- (q31_t)0x7ff2f884, (q31_t)0x28d472e, (q31_t)0x7ff3778e, (q31_t)0x280b8b3,
- (q31_t)0x7ff3f420, (q31_t)0x2742a1f, (q31_t)0x7ff46e3c, (q31_t)0x2679b73,
- (q31_t)0x7ff4e5e0, (q31_t)0x25b0caf, (q31_t)0x7ff55b0d, (q31_t)0x24e7dd4,
- (q31_t)0x7ff5cdc3, (q31_t)0x241eee2, (q31_t)0x7ff63e01, (q31_t)0x2355fd9,
- (q31_t)0x7ff6abc8, (q31_t)0x228d0bb, (q31_t)0x7ff71718, (q31_t)0x21c4188,
- (q31_t)0x7ff77ff1, (q31_t)0x20fb240, (q31_t)0x7ff7e652, (q31_t)0x20322e3,
- (q31_t)0x7ff84a3c, (q31_t)0x1f69373, (q31_t)0x7ff8abae, (q31_t)0x1ea03ef,
- (q31_t)0x7ff90aaa, (q31_t)0x1dd7459, (q31_t)0x7ff9672d, (q31_t)0x1d0e4b0,
- (q31_t)0x7ff9c13a, (q31_t)0x1c454f5, (q31_t)0x7ffa18cf, (q31_t)0x1b7c528,
- (q31_t)0x7ffa6dec, (q31_t)0x1ab354b, (q31_t)0x7ffac092, (q31_t)0x19ea55d,
- (q31_t)0x7ffb10c1, (q31_t)0x192155f, (q31_t)0x7ffb5e78, (q31_t)0x1858552,
- (q31_t)0x7ffba9b8, (q31_t)0x178f536, (q31_t)0x7ffbf280, (q31_t)0x16c650b,
- (q31_t)0x7ffc38d1, (q31_t)0x15fd4d2, (q31_t)0x7ffc7caa, (q31_t)0x153448c,
- (q31_t)0x7ffcbe0c, (q31_t)0x146b438, (q31_t)0x7ffcfcf6, (q31_t)0x13a23d8,
- (q31_t)0x7ffd3969, (q31_t)0x12d936c, (q31_t)0x7ffd7364, (q31_t)0x12102f4,
- (q31_t)0x7ffdaae7, (q31_t)0x1147271, (q31_t)0x7ffddff3, (q31_t)0x107e1e3,
- (q31_t)0x7ffe1288, (q31_t)0xfb514b, (q31_t)0x7ffe42a4, (q31_t)0xeec0aa,
- (q31_t)0x7ffe704a, (q31_t)0xe22fff, (q31_t)0x7ffe9b77, (q31_t)0xd59f4c,
- (q31_t)0x7ffec42d, (q31_t)0xc90e90, (q31_t)0x7ffeea6c, (q31_t)0xbc7dcc,
- (q31_t)0x7fff0e32, (q31_t)0xafed02, (q31_t)0x7fff2f82, (q31_t)0xa35c30,
- (q31_t)0x7fff4e59, (q31_t)0x96cb58, (q31_t)0x7fff6ab9, (q31_t)0x8a3a7b,
- (q31_t)0x7fff84a1, (q31_t)0x7da998, (q31_t)0x7fff9c12, (q31_t)0x7118b0,
- (q31_t)0x7fffb10b, (q31_t)0x6487c4, (q31_t)0x7fffc38c, (q31_t)0x57f6d4,
- (q31_t)0x7fffd396, (q31_t)0x4b65e1, (q31_t)0x7fffe128, (q31_t)0x3ed4ea,
- (q31_t)0x7fffec43, (q31_t)0x3243f1, (q31_t)0x7ffff4e6, (q31_t)0x25b2f7,
- (q31_t)0x7ffffb11, (q31_t)0x1921fb, (q31_t)0x7ffffec4, (q31_t)0xc90fe,
- (q31_t)0x7fffffff, (q31_t)0x0, (q31_t)0x7ffffec4, (q31_t)0xfff36f02,
- (q31_t)0x7ffffb11, (q31_t)0xffe6de05, (q31_t)0x7ffff4e6, (q31_t)0xffda4d09,
- (q31_t)0x7fffec43, (q31_t)0xffcdbc0f, (q31_t)0x7fffe128, (q31_t)0xffc12b16,
- (q31_t)0x7fffd396, (q31_t)0xffb49a1f, (q31_t)0x7fffc38c, (q31_t)0xffa8092c,
- (q31_t)0x7fffb10b, (q31_t)0xff9b783c, (q31_t)0x7fff9c12, (q31_t)0xff8ee750,
- (q31_t)0x7fff84a1, (q31_t)0xff825668, (q31_t)0x7fff6ab9, (q31_t)0xff75c585,
- (q31_t)0x7fff4e59, (q31_t)0xff6934a8, (q31_t)0x7fff2f82, (q31_t)0xff5ca3d0,
- (q31_t)0x7fff0e32, (q31_t)0xff5012fe, (q31_t)0x7ffeea6c, (q31_t)0xff438234,
- (q31_t)0x7ffec42d, (q31_t)0xff36f170, (q31_t)0x7ffe9b77, (q31_t)0xff2a60b4,
- (q31_t)0x7ffe704a, (q31_t)0xff1dd001, (q31_t)0x7ffe42a4, (q31_t)0xff113f56,
- (q31_t)0x7ffe1288, (q31_t)0xff04aeb5, (q31_t)0x7ffddff3, (q31_t)0xfef81e1d,
- (q31_t)0x7ffdaae7, (q31_t)0xfeeb8d8f, (q31_t)0x7ffd7364, (q31_t)0xfedefd0c,
- (q31_t)0x7ffd3969, (q31_t)0xfed26c94, (q31_t)0x7ffcfcf6, (q31_t)0xfec5dc28,
- (q31_t)0x7ffcbe0c, (q31_t)0xfeb94bc8, (q31_t)0x7ffc7caa, (q31_t)0xfeacbb74,
- (q31_t)0x7ffc38d1, (q31_t)0xfea02b2e, (q31_t)0x7ffbf280, (q31_t)0xfe939af5,
- (q31_t)0x7ffba9b8, (q31_t)0xfe870aca, (q31_t)0x7ffb5e78, (q31_t)0xfe7a7aae,
- (q31_t)0x7ffb10c1, (q31_t)0xfe6deaa1, (q31_t)0x7ffac092, (q31_t)0xfe615aa3,
- (q31_t)0x7ffa6dec, (q31_t)0xfe54cab5, (q31_t)0x7ffa18cf, (q31_t)0xfe483ad8,
- (q31_t)0x7ff9c13a, (q31_t)0xfe3bab0b, (q31_t)0x7ff9672d, (q31_t)0xfe2f1b50,
- (q31_t)0x7ff90aaa, (q31_t)0xfe228ba7, (q31_t)0x7ff8abae, (q31_t)0xfe15fc11,
- (q31_t)0x7ff84a3c, (q31_t)0xfe096c8d, (q31_t)0x7ff7e652, (q31_t)0xfdfcdd1d,
- (q31_t)0x7ff77ff1, (q31_t)0xfdf04dc0, (q31_t)0x7ff71718, (q31_t)0xfde3be78,
- (q31_t)0x7ff6abc8, (q31_t)0xfdd72f45, (q31_t)0x7ff63e01, (q31_t)0xfdcaa027,
- (q31_t)0x7ff5cdc3, (q31_t)0xfdbe111e, (q31_t)0x7ff55b0d, (q31_t)0xfdb1822c,
- (q31_t)0x7ff4e5e0, (q31_t)0xfda4f351, (q31_t)0x7ff46e3c, (q31_t)0xfd98648d,
- (q31_t)0x7ff3f420, (q31_t)0xfd8bd5e1, (q31_t)0x7ff3778e, (q31_t)0xfd7f474d,
- (q31_t)0x7ff2f884, (q31_t)0xfd72b8d2, (q31_t)0x7ff27703, (q31_t)0xfd662a70,
- (q31_t)0x7ff1f30b, (q31_t)0xfd599c28, (q31_t)0x7ff16c9c, (q31_t)0xfd4d0df9,
- (q31_t)0x7ff0e3b6, (q31_t)0xfd407fe6, (q31_t)0x7ff05858, (q31_t)0xfd33f1ed,
- (q31_t)0x7fefca84, (q31_t)0xfd276410, (q31_t)0x7fef3a39, (q31_t)0xfd1ad650,
- (q31_t)0x7feea776, (q31_t)0xfd0e48ab, (q31_t)0x7fee123d, (q31_t)0xfd01bb24,
- (q31_t)0x7fed7a8c, (q31_t)0xfcf52dbb, (q31_t)0x7fece065, (q31_t)0xfce8a06f,
- (q31_t)0x7fec43c7, (q31_t)0xfcdc1342, (q31_t)0x7feba4b2, (q31_t)0xfccf8634,
- (q31_t)0x7feb0326, (q31_t)0xfcc2f945, (q31_t)0x7fea5f23, (q31_t)0xfcb66c77,
- (q31_t)0x7fe9b8a9, (q31_t)0xfca9dfc8, (q31_t)0x7fe90fb9, (q31_t)0xfc9d533b,
- (q31_t)0x7fe86452, (q31_t)0xfc90c6cf, (q31_t)0x7fe7b674, (q31_t)0xfc843a85,
- (q31_t)0x7fe7061f, (q31_t)0xfc77ae5e, (q31_t)0x7fe65354, (q31_t)0xfc6b2259,
- (q31_t)0x7fe59e12, (q31_t)0xfc5e9678, (q31_t)0x7fe4e659, (q31_t)0xfc520aba,
- (q31_t)0x7fe42c2a, (q31_t)0xfc457f21, (q31_t)0x7fe36f84, (q31_t)0xfc38f3ac,
- (q31_t)0x7fe2b067, (q31_t)0xfc2c685d, (q31_t)0x7fe1eed5, (q31_t)0xfc1fdd34,
- (q31_t)0x7fe12acb, (q31_t)0xfc135231, (q31_t)0x7fe0644b, (q31_t)0xfc06c754,
- (q31_t)0x7fdf9b55, (q31_t)0xfbfa3c9f, (q31_t)0x7fdecfe8, (q31_t)0xfbedb212,
- (q31_t)0x7fde0205, (q31_t)0xfbe127ac, (q31_t)0x7fdd31ac, (q31_t)0xfbd49d70,
- (q31_t)0x7fdc5edc, (q31_t)0xfbc8135c, (q31_t)0x7fdb8996, (q31_t)0xfbbb8973,
- (q31_t)0x7fdab1d9, (q31_t)0xfbaeffb3, (q31_t)0x7fd9d7a7, (q31_t)0xfba2761e,
- (q31_t)0x7fd8fafe, (q31_t)0xfb95ecb4, (q31_t)0x7fd81bdf, (q31_t)0xfb896375,
- (q31_t)0x7fd73a4a, (q31_t)0xfb7cda63, (q31_t)0x7fd6563f, (q31_t)0xfb70517d,
- (q31_t)0x7fd56fbe, (q31_t)0xfb63c8c4, (q31_t)0x7fd486c7, (q31_t)0xfb574039,
- (q31_t)0x7fd39b5a, (q31_t)0xfb4ab7db, (q31_t)0x7fd2ad77, (q31_t)0xfb3e2fac,
- (q31_t)0x7fd1bd1e, (q31_t)0xfb31a7ac, (q31_t)0x7fd0ca4f, (q31_t)0xfb251fdc,
- (q31_t)0x7fcfd50b, (q31_t)0xfb18983b, (q31_t)0x7fcedd50, (q31_t)0xfb0c10cb,
- (q31_t)0x7fcde320, (q31_t)0xfaff898c, (q31_t)0x7fcce67a, (q31_t)0xfaf3027e,
- (q31_t)0x7fcbe75e, (q31_t)0xfae67ba2, (q31_t)0x7fcae5cd, (q31_t)0xfad9f4f8,
- (q31_t)0x7fc9e1c6, (q31_t)0xfacd6e81, (q31_t)0x7fc8db4a, (q31_t)0xfac0e83d,
- (q31_t)0x7fc7d258, (q31_t)0xfab4622d, (q31_t)0x7fc6c6f0, (q31_t)0xfaa7dc52,
- (q31_t)0x7fc5b913, (q31_t)0xfa9b56ab, (q31_t)0x7fc4a8c1, (q31_t)0xfa8ed139,
- (q31_t)0x7fc395f9, (q31_t)0xfa824bfd, (q31_t)0x7fc280bc, (q31_t)0xfa75c6f8,
- (q31_t)0x7fc1690a, (q31_t)0xfa694229, (q31_t)0x7fc04ee3, (q31_t)0xfa5cbd91,
- (q31_t)0x7fbf3246, (q31_t)0xfa503930, (q31_t)0x7fbe1334, (q31_t)0xfa43b508,
- (q31_t)0x7fbcf1ad, (q31_t)0xfa373119, (q31_t)0x7fbbcdb1, (q31_t)0xfa2aad62,
- (q31_t)0x7fbaa740, (q31_t)0xfa1e29e5, (q31_t)0x7fb97e5a, (q31_t)0xfa11a6a3,
- (q31_t)0x7fb852ff, (q31_t)0xfa05239a, (q31_t)0x7fb7252f, (q31_t)0xf9f8a0cd,
- (q31_t)0x7fb5f4ea, (q31_t)0xf9ec1e3b, (q31_t)0x7fb4c231, (q31_t)0xf9df9be6,
- (q31_t)0x7fb38d02, (q31_t)0xf9d319cc, (q31_t)0x7fb2555f, (q31_t)0xf9c697f0,
- (q31_t)0x7fb11b48, (q31_t)0xf9ba1651, (q31_t)0x7fafdebb, (q31_t)0xf9ad94f0,
- (q31_t)0x7fae9fbb, (q31_t)0xf9a113cd, (q31_t)0x7fad5e45, (q31_t)0xf99492ea,
- (q31_t)0x7fac1a5b, (q31_t)0xf9881245, (q31_t)0x7faad3fd, (q31_t)0xf97b91e1,
- (q31_t)0x7fa98b2a, (q31_t)0xf96f11bc, (q31_t)0x7fa83fe3, (q31_t)0xf96291d9,
- (q31_t)0x7fa6f228, (q31_t)0xf9561237, (q31_t)0x7fa5a1f9, (q31_t)0xf94992d7,
- (q31_t)0x7fa44f55, (q31_t)0xf93d13b8, (q31_t)0x7fa2fa3d, (q31_t)0xf93094dd,
- (q31_t)0x7fa1a2b2, (q31_t)0xf9241645, (q31_t)0x7fa048b2, (q31_t)0xf91797f0,
- (q31_t)0x7f9eec3e, (q31_t)0xf90b19e0, (q31_t)0x7f9d8d56, (q31_t)0xf8fe9c15,
- (q31_t)0x7f9c2bfb, (q31_t)0xf8f21e8e, (q31_t)0x7f9ac82c, (q31_t)0xf8e5a14d,
- (q31_t)0x7f9961e8, (q31_t)0xf8d92452, (q31_t)0x7f97f932, (q31_t)0xf8cca79e,
- (q31_t)0x7f968e07, (q31_t)0xf8c02b31, (q31_t)0x7f952069, (q31_t)0xf8b3af0c,
- (q31_t)0x7f93b058, (q31_t)0xf8a7332e, (q31_t)0x7f923dd2, (q31_t)0xf89ab799,
- (q31_t)0x7f90c8da, (q31_t)0xf88e3c4d, (q31_t)0x7f8f516e, (q31_t)0xf881c14b,
- (q31_t)0x7f8dd78f, (q31_t)0xf8754692, (q31_t)0x7f8c5b3d, (q31_t)0xf868cc24,
- (q31_t)0x7f8adc77, (q31_t)0xf85c5201, (q31_t)0x7f895b3e, (q31_t)0xf84fd829,
- (q31_t)0x7f87d792, (q31_t)0xf8435e9d, (q31_t)0x7f865174, (q31_t)0xf836e55d,
- (q31_t)0x7f84c8e2, (q31_t)0xf82a6c6a, (q31_t)0x7f833ddd, (q31_t)0xf81df3c5,
- (q31_t)0x7f81b065, (q31_t)0xf8117b6d, (q31_t)0x7f80207b, (q31_t)0xf8050364,
- (q31_t)0x7f7e8e1e, (q31_t)0xf7f88ba9, (q31_t)0x7f7cf94e, (q31_t)0xf7ec143e,
- (q31_t)0x7f7b620c, (q31_t)0xf7df9d22, (q31_t)0x7f79c857, (q31_t)0xf7d32657,
- (q31_t)0x7f782c30, (q31_t)0xf7c6afdc, (q31_t)0x7f768d96, (q31_t)0xf7ba39b3,
- (q31_t)0x7f74ec8a, (q31_t)0xf7adc3db, (q31_t)0x7f73490b, (q31_t)0xf7a14e55,
- (q31_t)0x7f71a31b, (q31_t)0xf794d922, (q31_t)0x7f6ffab8, (q31_t)0xf7886442,
- (q31_t)0x7f6e4fe3, (q31_t)0xf77befb5, (q31_t)0x7f6ca29c, (q31_t)0xf76f7b7d,
- (q31_t)0x7f6af2e3, (q31_t)0xf7630799, (q31_t)0x7f6940b8, (q31_t)0xf756940a,
- (q31_t)0x7f678c1c, (q31_t)0xf74a20d0, (q31_t)0x7f65d50d, (q31_t)0xf73daded,
- (q31_t)0x7f641b8d, (q31_t)0xf7313b60, (q31_t)0x7f625f9b, (q31_t)0xf724c92a,
- (q31_t)0x7f60a138, (q31_t)0xf718574b, (q31_t)0x7f5ee063, (q31_t)0xf70be5c4,
- (q31_t)0x7f5d1d1d, (q31_t)0xf6ff7496, (q31_t)0x7f5b5765, (q31_t)0xf6f303c0,
- (q31_t)0x7f598f3c, (q31_t)0xf6e69344, (q31_t)0x7f57c4a2, (q31_t)0xf6da2321,
- (q31_t)0x7f55f796, (q31_t)0xf6cdb359, (q31_t)0x7f54281a, (q31_t)0xf6c143ec,
- (q31_t)0x7f52562c, (q31_t)0xf6b4d4d9, (q31_t)0x7f5081cd, (q31_t)0xf6a86623,
- (q31_t)0x7f4eaafe, (q31_t)0xf69bf7c9, (q31_t)0x7f4cd1be, (q31_t)0xf68f89cb,
- (q31_t)0x7f4af60d, (q31_t)0xf6831c2b, (q31_t)0x7f4917eb, (q31_t)0xf676aee8,
- (q31_t)0x7f473759, (q31_t)0xf66a4203, (q31_t)0x7f455456, (q31_t)0xf65dd57d,
- (q31_t)0x7f436ee3, (q31_t)0xf6516956, (q31_t)0x7f4186ff, (q31_t)0xf644fd8f,
- (q31_t)0x7f3f9cab, (q31_t)0xf6389228, (q31_t)0x7f3dafe7, (q31_t)0xf62c2721,
- (q31_t)0x7f3bc0b3, (q31_t)0xf61fbc7b, (q31_t)0x7f39cf0e, (q31_t)0xf6135237,
- (q31_t)0x7f37dafa, (q31_t)0xf606e854, (q31_t)0x7f35e476, (q31_t)0xf5fa7ed4,
- (q31_t)0x7f33eb81, (q31_t)0xf5ee15b7, (q31_t)0x7f31f01d, (q31_t)0xf5e1acfd,
- (q31_t)0x7f2ff24a, (q31_t)0xf5d544a7, (q31_t)0x7f2df206, (q31_t)0xf5c8dcb6,
- (q31_t)0x7f2bef53, (q31_t)0xf5bc7529, (q31_t)0x7f29ea31, (q31_t)0xf5b00e02,
- (q31_t)0x7f27e29f, (q31_t)0xf5a3a740, (q31_t)0x7f25d89e, (q31_t)0xf59740e5,
- (q31_t)0x7f23cc2e, (q31_t)0xf58adaf0, (q31_t)0x7f21bd4e, (q31_t)0xf57e7563,
- (q31_t)0x7f1fabff, (q31_t)0xf572103d, (q31_t)0x7f1d9842, (q31_t)0xf565ab80,
- (q31_t)0x7f1b8215, (q31_t)0xf559472b, (q31_t)0x7f19697a, (q31_t)0xf54ce33f,
- (q31_t)0x7f174e70, (q31_t)0xf5407fbd, (q31_t)0x7f1530f7, (q31_t)0xf5341ca5,
- (q31_t)0x7f13110f, (q31_t)0xf527b9f7, (q31_t)0x7f10eeb9, (q31_t)0xf51b57b5,
- (q31_t)0x7f0ec9f5, (q31_t)0xf50ef5de, (q31_t)0x7f0ca2c2, (q31_t)0xf5029473,
- (q31_t)0x7f0a7921, (q31_t)0xf4f63374, (q31_t)0x7f084d12, (q31_t)0xf4e9d2e3,
- (q31_t)0x7f061e95, (q31_t)0xf4dd72be, (q31_t)0x7f03eda9, (q31_t)0xf4d11308,
- (q31_t)0x7f01ba50, (q31_t)0xf4c4b3c0, (q31_t)0x7eff8489, (q31_t)0xf4b854e7,
- (q31_t)0x7efd4c54, (q31_t)0xf4abf67e, (q31_t)0x7efb11b1, (q31_t)0xf49f9884,
- (q31_t)0x7ef8d4a1, (q31_t)0xf4933afa, (q31_t)0x7ef69523, (q31_t)0xf486dde1,
- (q31_t)0x7ef45338, (q31_t)0xf47a8139, (q31_t)0x7ef20ee0, (q31_t)0xf46e2504,
- (q31_t)0x7eefc81a, (q31_t)0xf461c940, (q31_t)0x7eed7ee7, (q31_t)0xf4556def,
- (q31_t)0x7eeb3347, (q31_t)0xf4491311, (q31_t)0x7ee8e53a, (q31_t)0xf43cb8a7,
- (q31_t)0x7ee694c1, (q31_t)0xf4305eb0, (q31_t)0x7ee441da, (q31_t)0xf424052f,
- (q31_t)0x7ee1ec87, (q31_t)0xf417ac22, (q31_t)0x7edf94c7, (q31_t)0xf40b538b,
- (q31_t)0x7edd3a9a, (q31_t)0xf3fefb6a, (q31_t)0x7edade01, (q31_t)0xf3f2a3bf,
- (q31_t)0x7ed87efc, (q31_t)0xf3e64c8c, (q31_t)0x7ed61d8a, (q31_t)0xf3d9f5cf,
- (q31_t)0x7ed3b9ad, (q31_t)0xf3cd9f8b, (q31_t)0x7ed15363, (q31_t)0xf3c149bf,
- (q31_t)0x7eceeaad, (q31_t)0xf3b4f46c, (q31_t)0x7ecc7f8b, (q31_t)0xf3a89f92,
- (q31_t)0x7eca11fe, (q31_t)0xf39c4b32, (q31_t)0x7ec7a205, (q31_t)0xf38ff74d,
- (q31_t)0x7ec52fa0, (q31_t)0xf383a3e2, (q31_t)0x7ec2bad0, (q31_t)0xf37750f2,
- (q31_t)0x7ec04394, (q31_t)0xf36afe7e, (q31_t)0x7ebdc9ed, (q31_t)0xf35eac86,
- (q31_t)0x7ebb4ddb, (q31_t)0xf3525b0b, (q31_t)0x7eb8cf5d, (q31_t)0xf3460a0d,
- (q31_t)0x7eb64e75, (q31_t)0xf339b98d, (q31_t)0x7eb3cb21, (q31_t)0xf32d698a,
- (q31_t)0x7eb14563, (q31_t)0xf3211a07, (q31_t)0x7eaebd3a, (q31_t)0xf314cb02,
- (q31_t)0x7eac32a6, (q31_t)0xf3087c7d, (q31_t)0x7ea9a5a8, (q31_t)0xf2fc2e77,
- (q31_t)0x7ea7163f, (q31_t)0xf2efe0f2, (q31_t)0x7ea4846c, (q31_t)0xf2e393ef,
- (q31_t)0x7ea1f02f, (q31_t)0xf2d7476c, (q31_t)0x7e9f5988, (q31_t)0xf2cafb6b,
- (q31_t)0x7e9cc076, (q31_t)0xf2beafed, (q31_t)0x7e9a24fb, (q31_t)0xf2b264f2,
- (q31_t)0x7e978715, (q31_t)0xf2a61a7a, (q31_t)0x7e94e6c6, (q31_t)0xf299d085,
- (q31_t)0x7e92440d, (q31_t)0xf28d8715, (q31_t)0x7e8f9eeb, (q31_t)0xf2813e2a,
- (q31_t)0x7e8cf75f, (q31_t)0xf274f5c3, (q31_t)0x7e8a4d6a, (q31_t)0xf268ade3,
- (q31_t)0x7e87a10c, (q31_t)0xf25c6688, (q31_t)0x7e84f245, (q31_t)0xf2501fb5,
- (q31_t)0x7e824114, (q31_t)0xf243d968, (q31_t)0x7e7f8d7b, (q31_t)0xf23793a3,
- (q31_t)0x7e7cd778, (q31_t)0xf22b4e66, (q31_t)0x7e7a1f0d, (q31_t)0xf21f09b1,
- (q31_t)0x7e77643a, (q31_t)0xf212c585, (q31_t)0x7e74a6fd, (q31_t)0xf20681e3,
- (q31_t)0x7e71e759, (q31_t)0xf1fa3ecb, (q31_t)0x7e6f254c, (q31_t)0xf1edfc3d,
- (q31_t)0x7e6c60d7, (q31_t)0xf1e1ba3a, (q31_t)0x7e6999fa, (q31_t)0xf1d578c2,
- (q31_t)0x7e66d0b4, (q31_t)0xf1c937d6, (q31_t)0x7e640507, (q31_t)0xf1bcf777,
- (q31_t)0x7e6136f3, (q31_t)0xf1b0b7a4, (q31_t)0x7e5e6676, (q31_t)0xf1a4785e,
- (q31_t)0x7e5b9392, (q31_t)0xf19839a6, (q31_t)0x7e58be47, (q31_t)0xf18bfb7d,
- (q31_t)0x7e55e694, (q31_t)0xf17fbde2, (q31_t)0x7e530c7a, (q31_t)0xf17380d6,
- (q31_t)0x7e502ff9, (q31_t)0xf1674459, (q31_t)0x7e4d5110, (q31_t)0xf15b086d,
- (q31_t)0x7e4a6fc1, (q31_t)0xf14ecd11, (q31_t)0x7e478c0b, (q31_t)0xf1429247,
- (q31_t)0x7e44a5ef, (q31_t)0xf136580d, (q31_t)0x7e41bd6c, (q31_t)0xf12a1e66,
- (q31_t)0x7e3ed282, (q31_t)0xf11de551, (q31_t)0x7e3be532, (q31_t)0xf111accf,
- (q31_t)0x7e38f57c, (q31_t)0xf10574e0, (q31_t)0x7e360360, (q31_t)0xf0f93d86,
- (q31_t)0x7e330ede, (q31_t)0xf0ed06bf, (q31_t)0x7e3017f6, (q31_t)0xf0e0d08d,
- (q31_t)0x7e2d1ea8, (q31_t)0xf0d49af1, (q31_t)0x7e2a22f4, (q31_t)0xf0c865ea,
- (q31_t)0x7e2724db, (q31_t)0xf0bc317a, (q31_t)0x7e24245d, (q31_t)0xf0affda0,
- (q31_t)0x7e212179, (q31_t)0xf0a3ca5d, (q31_t)0x7e1e1c30, (q31_t)0xf09797b2,
- (q31_t)0x7e1b1482, (q31_t)0xf08b659f, (q31_t)0x7e180a6f, (q31_t)0xf07f3424,
- (q31_t)0x7e14fdf7, (q31_t)0xf0730342, (q31_t)0x7e11ef1b, (q31_t)0xf066d2fa,
- (q31_t)0x7e0eddd9, (q31_t)0xf05aa34c, (q31_t)0x7e0bca34, (q31_t)0xf04e7438,
- (q31_t)0x7e08b42a, (q31_t)0xf04245c0, (q31_t)0x7e059bbb, (q31_t)0xf03617e2,
- (q31_t)0x7e0280e9, (q31_t)0xf029eaa1, (q31_t)0x7dff63b2, (q31_t)0xf01dbdfb,
- (q31_t)0x7dfc4418, (q31_t)0xf01191f3, (q31_t)0x7df9221a, (q31_t)0xf0056687,
- (q31_t)0x7df5fdb8, (q31_t)0xeff93bba, (q31_t)0x7df2d6f3, (q31_t)0xefed118a,
- (q31_t)0x7defadca, (q31_t)0xefe0e7f9, (q31_t)0x7dec823e, (q31_t)0xefd4bf08,
- (q31_t)0x7de9544f, (q31_t)0xefc896b5, (q31_t)0x7de623fd, (q31_t)0xefbc6f03,
- (q31_t)0x7de2f148, (q31_t)0xefb047f2, (q31_t)0x7ddfbc30, (q31_t)0xefa42181,
- (q31_t)0x7ddc84b5, (q31_t)0xef97fbb2, (q31_t)0x7dd94ad8, (q31_t)0xef8bd685,
- (q31_t)0x7dd60e99, (q31_t)0xef7fb1fa, (q31_t)0x7dd2cff7, (q31_t)0xef738e12,
- (q31_t)0x7dcf8ef3, (q31_t)0xef676ace, (q31_t)0x7dcc4b8d, (q31_t)0xef5b482d,
- (q31_t)0x7dc905c5, (q31_t)0xef4f2630, (q31_t)0x7dc5bd9b, (q31_t)0xef4304d8,
- (q31_t)0x7dc2730f, (q31_t)0xef36e426, (q31_t)0x7dbf2622, (q31_t)0xef2ac419,
- (q31_t)0x7dbbd6d4, (q31_t)0xef1ea4b2, (q31_t)0x7db88524, (q31_t)0xef1285f2,
- (q31_t)0x7db53113, (q31_t)0xef0667d9, (q31_t)0x7db1daa2, (q31_t)0xeefa4a67,
- (q31_t)0x7dae81cf, (q31_t)0xeeee2d9d, (q31_t)0x7dab269b, (q31_t)0xeee2117c,
- (q31_t)0x7da7c907, (q31_t)0xeed5f604, (q31_t)0x7da46912, (q31_t)0xeec9db35,
- (q31_t)0x7da106bd, (q31_t)0xeebdc110, (q31_t)0x7d9da208, (q31_t)0xeeb1a796,
- (q31_t)0x7d9a3af2, (q31_t)0xeea58ec6, (q31_t)0x7d96d17d, (q31_t)0xee9976a1,
- (q31_t)0x7d9365a8, (q31_t)0xee8d5f29, (q31_t)0x7d8ff772, (q31_t)0xee81485c,
- (q31_t)0x7d8c86de, (q31_t)0xee75323c, (q31_t)0x7d8913ea, (q31_t)0xee691cc9,
- (q31_t)0x7d859e96, (q31_t)0xee5d0804, (q31_t)0x7d8226e4, (q31_t)0xee50f3ed,
- (q31_t)0x7d7eacd2, (q31_t)0xee44e084, (q31_t)0x7d7b3061, (q31_t)0xee38cdcb,
- (q31_t)0x7d77b192, (q31_t)0xee2cbbc1, (q31_t)0x7d743064, (q31_t)0xee20aa67,
- (q31_t)0x7d70acd7, (q31_t)0xee1499bd, (q31_t)0x7d6d26ec, (q31_t)0xee0889c4,
- (q31_t)0x7d699ea3, (q31_t)0xedfc7a7c, (q31_t)0x7d6613fb, (q31_t)0xedf06be6,
- (q31_t)0x7d6286f6, (q31_t)0xede45e03, (q31_t)0x7d5ef793, (q31_t)0xedd850d2,
- (q31_t)0x7d5b65d2, (q31_t)0xedcc4454, (q31_t)0x7d57d1b3, (q31_t)0xedc0388a,
- (q31_t)0x7d543b37, (q31_t)0xedb42d74, (q31_t)0x7d50a25e, (q31_t)0xeda82313,
- (q31_t)0x7d4d0728, (q31_t)0xed9c1967, (q31_t)0x7d496994, (q31_t)0xed901070,
- (q31_t)0x7d45c9a4, (q31_t)0xed84082f, (q31_t)0x7d422757, (q31_t)0xed7800a5,
- (q31_t)0x7d3e82ae, (q31_t)0xed6bf9d1, (q31_t)0x7d3adba7, (q31_t)0xed5ff3b5,
- (q31_t)0x7d373245, (q31_t)0xed53ee51, (q31_t)0x7d338687, (q31_t)0xed47e9a5,
- (q31_t)0x7d2fd86c, (q31_t)0xed3be5b1, (q31_t)0x7d2c27f6, (q31_t)0xed2fe277,
- (q31_t)0x7d287523, (q31_t)0xed23dff7, (q31_t)0x7d24bff6, (q31_t)0xed17de31,
- (q31_t)0x7d21086c, (q31_t)0xed0bdd25, (q31_t)0x7d1d4e88, (q31_t)0xecffdcd4,
- (q31_t)0x7d199248, (q31_t)0xecf3dd3f, (q31_t)0x7d15d3ad, (q31_t)0xece7de66,
- (q31_t)0x7d1212b7, (q31_t)0xecdbe04a, (q31_t)0x7d0e4f67, (q31_t)0xeccfe2ea,
- (q31_t)0x7d0a89bc, (q31_t)0xecc3e648, (q31_t)0x7d06c1b6, (q31_t)0xecb7ea63,
- (q31_t)0x7d02f757, (q31_t)0xecabef3d, (q31_t)0x7cff2a9d, (q31_t)0xec9ff4d6,
- (q31_t)0x7cfb5b89, (q31_t)0xec93fb2e, (q31_t)0x7cf78a1b, (q31_t)0xec880245,
- (q31_t)0x7cf3b653, (q31_t)0xec7c0a1d, (q31_t)0x7cefe032, (q31_t)0xec7012b5,
- (q31_t)0x7cec07b8, (q31_t)0xec641c0e, (q31_t)0x7ce82ce4, (q31_t)0xec582629,
- (q31_t)0x7ce44fb7, (q31_t)0xec4c3106, (q31_t)0x7ce07031, (q31_t)0xec403ca5,
- (q31_t)0x7cdc8e52, (q31_t)0xec344908, (q31_t)0x7cd8aa1b, (q31_t)0xec28562d,
- (q31_t)0x7cd4c38b, (q31_t)0xec1c6417, (q31_t)0x7cd0daa2, (q31_t)0xec1072c4,
- (q31_t)0x7cccef62, (q31_t)0xec048237, (q31_t)0x7cc901c9, (q31_t)0xebf8926f,
- (q31_t)0x7cc511d9, (q31_t)0xebeca36c, (q31_t)0x7cc11f90, (q31_t)0xebe0b52f,
- (q31_t)0x7cbd2af0, (q31_t)0xebd4c7ba, (q31_t)0x7cb933f9, (q31_t)0xebc8db0b,
- (q31_t)0x7cb53aaa, (q31_t)0xebbcef23, (q31_t)0x7cb13f04, (q31_t)0xebb10404,
- (q31_t)0x7cad4107, (q31_t)0xeba519ad, (q31_t)0x7ca940b3, (q31_t)0xeb99301f,
- (q31_t)0x7ca53e09, (q31_t)0xeb8d475b, (q31_t)0x7ca13908, (q31_t)0xeb815f60,
- (q31_t)0x7c9d31b0, (q31_t)0xeb75782f, (q31_t)0x7c992803, (q31_t)0xeb6991ca,
- (q31_t)0x7c951bff, (q31_t)0xeb5dac2f, (q31_t)0x7c910da5, (q31_t)0xeb51c760,
- (q31_t)0x7c8cfcf6, (q31_t)0xeb45e35d, (q31_t)0x7c88e9f1, (q31_t)0xeb3a0027,
- (q31_t)0x7c84d496, (q31_t)0xeb2e1dbe, (q31_t)0x7c80bce7, (q31_t)0xeb223c22,
- (q31_t)0x7c7ca2e2, (q31_t)0xeb165b54, (q31_t)0x7c788688, (q31_t)0xeb0a7b54,
- (q31_t)0x7c7467d9, (q31_t)0xeafe9c24, (q31_t)0x7c7046d6, (q31_t)0xeaf2bdc3,
- (q31_t)0x7c6c237e, (q31_t)0xeae6e031, (q31_t)0x7c67fdd1, (q31_t)0xeadb0370,
- (q31_t)0x7c63d5d1, (q31_t)0xeacf277f, (q31_t)0x7c5fab7c, (q31_t)0xeac34c60,
- (q31_t)0x7c5b7ed4, (q31_t)0xeab77212, (q31_t)0x7c574fd8, (q31_t)0xeaab9896,
- (q31_t)0x7c531e88, (q31_t)0xea9fbfed, (q31_t)0x7c4eeae5, (q31_t)0xea93e817,
- (q31_t)0x7c4ab4ef, (q31_t)0xea881114, (q31_t)0x7c467ca6, (q31_t)0xea7c3ae5,
- (q31_t)0x7c42420a, (q31_t)0xea70658a, (q31_t)0x7c3e051b, (q31_t)0xea649105,
- (q31_t)0x7c39c5da, (q31_t)0xea58bd54, (q31_t)0x7c358446, (q31_t)0xea4cea79,
- (q31_t)0x7c314060, (q31_t)0xea411874, (q31_t)0x7c2cfa28, (q31_t)0xea354746,
- (q31_t)0x7c28b19e, (q31_t)0xea2976ef, (q31_t)0x7c2466c2, (q31_t)0xea1da770,
- (q31_t)0x7c201994, (q31_t)0xea11d8c8, (q31_t)0x7c1bca16, (q31_t)0xea060af9,
- (q31_t)0x7c177845, (q31_t)0xe9fa3e03, (q31_t)0x7c132424, (q31_t)0xe9ee71e6,
- (q31_t)0x7c0ecdb2, (q31_t)0xe9e2a6a3, (q31_t)0x7c0a74f0, (q31_t)0xe9d6dc3b,
- (q31_t)0x7c0619dc, (q31_t)0xe9cb12ad, (q31_t)0x7c01bc78, (q31_t)0xe9bf49fa,
- (q31_t)0x7bfd5cc4, (q31_t)0xe9b38223, (q31_t)0x7bf8fac0, (q31_t)0xe9a7bb28,
- (q31_t)0x7bf4966c, (q31_t)0xe99bf509, (q31_t)0x7bf02fc9, (q31_t)0xe9902fc7,
- (q31_t)0x7bebc6d5, (q31_t)0xe9846b63, (q31_t)0x7be75b93, (q31_t)0xe978a7dd,
- (q31_t)0x7be2ee01, (q31_t)0xe96ce535, (q31_t)0x7bde7e20, (q31_t)0xe961236c,
- (q31_t)0x7bda0bf0, (q31_t)0xe9556282, (q31_t)0x7bd59771, (q31_t)0xe949a278,
- (q31_t)0x7bd120a4, (q31_t)0xe93de34e, (q31_t)0x7bcca789, (q31_t)0xe9322505,
- (q31_t)0x7bc82c1f, (q31_t)0xe926679c, (q31_t)0x7bc3ae67, (q31_t)0xe91aab16,
- (q31_t)0x7bbf2e62, (q31_t)0xe90eef71, (q31_t)0x7bbaac0e, (q31_t)0xe90334af,
- (q31_t)0x7bb6276e, (q31_t)0xe8f77acf, (q31_t)0x7bb1a080, (q31_t)0xe8ebc1d3,
- (q31_t)0x7bad1744, (q31_t)0xe8e009ba, (q31_t)0x7ba88bbc, (q31_t)0xe8d45286,
- (q31_t)0x7ba3fde7, (q31_t)0xe8c89c37, (q31_t)0x7b9f6dc5, (q31_t)0xe8bce6cd,
- (q31_t)0x7b9adb57, (q31_t)0xe8b13248, (q31_t)0x7b96469d, (q31_t)0xe8a57ea9,
- (q31_t)0x7b91af97, (q31_t)0xe899cbf1, (q31_t)0x7b8d1644, (q31_t)0xe88e1a20,
- (q31_t)0x7b887aa6, (q31_t)0xe8826936, (q31_t)0x7b83dcbc, (q31_t)0xe876b934,
- (q31_t)0x7b7f3c87, (q31_t)0xe86b0a1a, (q31_t)0x7b7a9a07, (q31_t)0xe85f5be9,
- (q31_t)0x7b75f53c, (q31_t)0xe853aea1, (q31_t)0x7b714e25, (q31_t)0xe8480243,
- (q31_t)0x7b6ca4c4, (q31_t)0xe83c56cf, (q31_t)0x7b67f919, (q31_t)0xe830ac45,
- (q31_t)0x7b634b23, (q31_t)0xe82502a7, (q31_t)0x7b5e9ae4, (q31_t)0xe81959f4,
- (q31_t)0x7b59e85a, (q31_t)0xe80db22d, (q31_t)0x7b553386, (q31_t)0xe8020b52,
- (q31_t)0x7b507c69, (q31_t)0xe7f66564, (q31_t)0x7b4bc303, (q31_t)0xe7eac063,
- (q31_t)0x7b470753, (q31_t)0xe7df1c50, (q31_t)0x7b42495a, (q31_t)0xe7d3792b,
- (q31_t)0x7b3d8918, (q31_t)0xe7c7d6f4, (q31_t)0x7b38c68e, (q31_t)0xe7bc35ad,
- (q31_t)0x7b3401bb, (q31_t)0xe7b09555, (q31_t)0x7b2f3aa0, (q31_t)0xe7a4f5ed,
- (q31_t)0x7b2a713d, (q31_t)0xe7995776, (q31_t)0x7b25a591, (q31_t)0xe78db9ef,
- (q31_t)0x7b20d79e, (q31_t)0xe7821d59, (q31_t)0x7b1c0764, (q31_t)0xe77681b6,
- (q31_t)0x7b1734e2, (q31_t)0xe76ae704, (q31_t)0x7b126019, (q31_t)0xe75f4d45,
- (q31_t)0x7b0d8909, (q31_t)0xe753b479, (q31_t)0x7b08afb2, (q31_t)0xe7481ca1,
- (q31_t)0x7b03d414, (q31_t)0xe73c85bc, (q31_t)0x7afef630, (q31_t)0xe730efcc,
- (q31_t)0x7afa1605, (q31_t)0xe7255ad1, (q31_t)0x7af53395, (q31_t)0xe719c6cb,
- (q31_t)0x7af04edf, (q31_t)0xe70e33bb, (q31_t)0x7aeb67e3, (q31_t)0xe702a1a1,
- (q31_t)0x7ae67ea1, (q31_t)0xe6f7107e, (q31_t)0x7ae1931a, (q31_t)0xe6eb8052,
- (q31_t)0x7adca54e, (q31_t)0xe6dff11d, (q31_t)0x7ad7b53d, (q31_t)0xe6d462e1,
- (q31_t)0x7ad2c2e8, (q31_t)0xe6c8d59c, (q31_t)0x7acdce4d, (q31_t)0xe6bd4951,
- (q31_t)0x7ac8d76f, (q31_t)0xe6b1bdff, (q31_t)0x7ac3de4c, (q31_t)0xe6a633a6,
- (q31_t)0x7abee2e5, (q31_t)0xe69aaa48, (q31_t)0x7ab9e53a, (q31_t)0xe68f21e5,
- (q31_t)0x7ab4e54c, (q31_t)0xe6839a7c, (q31_t)0x7aafe31b, (q31_t)0xe6781410,
- (q31_t)0x7aaadea6, (q31_t)0xe66c8e9f, (q31_t)0x7aa5d7ee, (q31_t)0xe6610a2a,
- (q31_t)0x7aa0cef3, (q31_t)0xe65586b3, (q31_t)0x7a9bc3b6, (q31_t)0xe64a0438,
- (q31_t)0x7a96b636, (q31_t)0xe63e82bc, (q31_t)0x7a91a674, (q31_t)0xe633023e,
- (q31_t)0x7a8c9470, (q31_t)0xe62782be, (q31_t)0x7a87802a, (q31_t)0xe61c043d,
- (q31_t)0x7a8269a3, (q31_t)0xe61086bc, (q31_t)0x7a7d50da, (q31_t)0xe6050a3b,
- (q31_t)0x7a7835cf, (q31_t)0xe5f98ebb, (q31_t)0x7a731884, (q31_t)0xe5ee143b,
- (q31_t)0x7a6df8f8, (q31_t)0xe5e29abc, (q31_t)0x7a68d72b, (q31_t)0xe5d72240,
- (q31_t)0x7a63b31d, (q31_t)0xe5cbaac5, (q31_t)0x7a5e8cd0, (q31_t)0xe5c0344d,
- (q31_t)0x7a596442, (q31_t)0xe5b4bed8, (q31_t)0x7a543974, (q31_t)0xe5a94a67,
- (q31_t)0x7a4f0c67, (q31_t)0xe59dd6f9, (q31_t)0x7a49dd1a, (q31_t)0xe5926490,
- (q31_t)0x7a44ab8e, (q31_t)0xe586f32c, (q31_t)0x7a3f77c3, (q31_t)0xe57b82cd,
- (q31_t)0x7a3a41b9, (q31_t)0xe5701374, (q31_t)0x7a350970, (q31_t)0xe564a521,
- (q31_t)0x7a2fcee8, (q31_t)0xe55937d5, (q31_t)0x7a2a9223, (q31_t)0xe54dcb8f,
- (q31_t)0x7a25531f, (q31_t)0xe5426051, (q31_t)0x7a2011de, (q31_t)0xe536f61b,
- (q31_t)0x7a1ace5f, (q31_t)0xe52b8cee, (q31_t)0x7a1588a2, (q31_t)0xe52024c9,
- (q31_t)0x7a1040a8, (q31_t)0xe514bdad, (q31_t)0x7a0af671, (q31_t)0xe509579b,
- (q31_t)0x7a05a9fd, (q31_t)0xe4fdf294, (q31_t)0x7a005b4d, (q31_t)0xe4f28e96,
- (q31_t)0x79fb0a60, (q31_t)0xe4e72ba4, (q31_t)0x79f5b737, (q31_t)0xe4dbc9bd,
- (q31_t)0x79f061d2, (q31_t)0xe4d068e2, (q31_t)0x79eb0a31, (q31_t)0xe4c50914,
- (q31_t)0x79e5b054, (q31_t)0xe4b9aa52, (q31_t)0x79e0543c, (q31_t)0xe4ae4c9d,
- (q31_t)0x79daf5e8, (q31_t)0xe4a2eff6, (q31_t)0x79d5955a, (q31_t)0xe497945d,
- (q31_t)0x79d03291, (q31_t)0xe48c39d3, (q31_t)0x79cacd8d, (q31_t)0xe480e057,
- (q31_t)0x79c5664f, (q31_t)0xe47587eb, (q31_t)0x79bffcd7, (q31_t)0xe46a308f,
- (q31_t)0x79ba9125, (q31_t)0xe45eda43, (q31_t)0x79b52339, (q31_t)0xe4538507,
- (q31_t)0x79afb313, (q31_t)0xe44830dd, (q31_t)0x79aa40b4, (q31_t)0xe43cddc4,
- (q31_t)0x79a4cc1c, (q31_t)0xe4318bbe, (q31_t)0x799f554b, (q31_t)0xe4263ac9,
- (q31_t)0x7999dc42, (q31_t)0xe41aeae8, (q31_t)0x799460ff, (q31_t)0xe40f9c1a,
- (q31_t)0x798ee385, (q31_t)0xe4044e60, (q31_t)0x798963d2, (q31_t)0xe3f901ba,
- (q31_t)0x7983e1e8, (q31_t)0xe3edb628, (q31_t)0x797e5dc6, (q31_t)0xe3e26bac,
- (q31_t)0x7978d76c, (q31_t)0xe3d72245, (q31_t)0x79734edc, (q31_t)0xe3cbd9f4,
- (q31_t)0x796dc414, (q31_t)0xe3c092b9, (q31_t)0x79683715, (q31_t)0xe3b54c95,
- (q31_t)0x7962a7e0, (q31_t)0xe3aa0788, (q31_t)0x795d1675, (q31_t)0xe39ec393,
- (q31_t)0x795782d3, (q31_t)0xe39380b6, (q31_t)0x7951ecfc, (q31_t)0xe3883ef2,
- (q31_t)0x794c54ee, (q31_t)0xe37cfe47, (q31_t)0x7946baac, (q31_t)0xe371beb5,
- (q31_t)0x79411e33, (q31_t)0xe366803c, (q31_t)0x793b7f86, (q31_t)0xe35b42df,
- (q31_t)0x7935dea4, (q31_t)0xe350069b, (q31_t)0x79303b8e, (q31_t)0xe344cb73,
- (q31_t)0x792a9642, (q31_t)0xe3399167, (q31_t)0x7924eec3, (q31_t)0xe32e5876,
- (q31_t)0x791f4510, (q31_t)0xe32320a2, (q31_t)0x79199929, (q31_t)0xe317e9eb,
- (q31_t)0x7913eb0e, (q31_t)0xe30cb451, (q31_t)0x790e3ac0, (q31_t)0xe3017fd5,
- (q31_t)0x7908883f, (q31_t)0xe2f64c77, (q31_t)0x7902d38b, (q31_t)0xe2eb1a37,
- (q31_t)0x78fd1ca4, (q31_t)0xe2dfe917, (q31_t)0x78f7638b, (q31_t)0xe2d4b916,
- (q31_t)0x78f1a840, (q31_t)0xe2c98a35, (q31_t)0x78ebeac2, (q31_t)0xe2be5c74,
- (q31_t)0x78e62b13, (q31_t)0xe2b32fd4, (q31_t)0x78e06932, (q31_t)0xe2a80456,
- (q31_t)0x78daa520, (q31_t)0xe29cd9f8, (q31_t)0x78d4dedd, (q31_t)0xe291b0bd,
- (q31_t)0x78cf1669, (q31_t)0xe28688a4, (q31_t)0x78c94bc4, (q31_t)0xe27b61af,
- (q31_t)0x78c37eef, (q31_t)0xe2703bdc, (q31_t)0x78bdafea, (q31_t)0xe265172e,
- (q31_t)0x78b7deb4, (q31_t)0xe259f3a3, (q31_t)0x78b20b4f, (q31_t)0xe24ed13d,
- (q31_t)0x78ac35ba, (q31_t)0xe243affc, (q31_t)0x78a65df6, (q31_t)0xe2388fe1,
- (q31_t)0x78a08402, (q31_t)0xe22d70eb, (q31_t)0x789aa7e0, (q31_t)0xe222531c,
- (q31_t)0x7894c98f, (q31_t)0xe2173674, (q31_t)0x788ee910, (q31_t)0xe20c1af3,
- (q31_t)0x78890663, (q31_t)0xe2010099, (q31_t)0x78832187, (q31_t)0xe1f5e768,
- (q31_t)0x787d3a7e, (q31_t)0xe1eacf5f, (q31_t)0x78775147, (q31_t)0xe1dfb87f,
- (q31_t)0x787165e3, (q31_t)0xe1d4a2c8, (q31_t)0x786b7852, (q31_t)0xe1c98e3b,
- (q31_t)0x78658894, (q31_t)0xe1be7ad8, (q31_t)0x785f96a9, (q31_t)0xe1b368a0,
- (q31_t)0x7859a292, (q31_t)0xe1a85793, (q31_t)0x7853ac4f, (q31_t)0xe19d47b1,
- (q31_t)0x784db3e0, (q31_t)0xe19238fb, (q31_t)0x7847b946, (q31_t)0xe1872b72,
- (q31_t)0x7841bc7f, (q31_t)0xe17c1f15, (q31_t)0x783bbd8e, (q31_t)0xe17113e5,
- (q31_t)0x7835bc71, (q31_t)0xe16609e3, (q31_t)0x782fb92a, (q31_t)0xe15b0110,
- (q31_t)0x7829b3b9, (q31_t)0xe14ff96a, (q31_t)0x7823ac1d, (q31_t)0xe144f2f3,
- (q31_t)0x781da256, (q31_t)0xe139edac, (q31_t)0x78179666, (q31_t)0xe12ee995,
- (q31_t)0x7811884d, (q31_t)0xe123e6ad, (q31_t)0x780b780a, (q31_t)0xe118e4f6,
- (q31_t)0x7805659e, (q31_t)0xe10de470, (q31_t)0x77ff5109, (q31_t)0xe102e51c,
- (q31_t)0x77f93a4b, (q31_t)0xe0f7e6f9, (q31_t)0x77f32165, (q31_t)0xe0ecea09,
- (q31_t)0x77ed0657, (q31_t)0xe0e1ee4b, (q31_t)0x77e6e921, (q31_t)0xe0d6f3c1,
- (q31_t)0x77e0c9c3, (q31_t)0xe0cbfa6a, (q31_t)0x77daa83d, (q31_t)0xe0c10247,
- (q31_t)0x77d48490, (q31_t)0xe0b60b58, (q31_t)0x77ce5ebd, (q31_t)0xe0ab159e,
- (q31_t)0x77c836c2, (q31_t)0xe0a0211a, (q31_t)0x77c20ca1, (q31_t)0xe0952dcb,
- (q31_t)0x77bbe05a, (q31_t)0xe08a3bb2, (q31_t)0x77b5b1ec, (q31_t)0xe07f4acf,
- (q31_t)0x77af8159, (q31_t)0xe0745b24, (q31_t)0x77a94ea0, (q31_t)0xe0696cb0,
- (q31_t)0x77a319c2, (q31_t)0xe05e7f74, (q31_t)0x779ce2be, (q31_t)0xe053936f,
- (q31_t)0x7796a996, (q31_t)0xe048a8a4, (q31_t)0x77906e49, (q31_t)0xe03dbf11,
- (q31_t)0x778a30d8, (q31_t)0xe032d6b8, (q31_t)0x7783f143, (q31_t)0xe027ef99,
- (q31_t)0x777daf89, (q31_t)0xe01d09b4, (q31_t)0x77776bac, (q31_t)0xe012250a,
- (q31_t)0x777125ac, (q31_t)0xe007419b, (q31_t)0x776add88, (q31_t)0xdffc5f67,
- (q31_t)0x77649341, (q31_t)0xdff17e70, (q31_t)0x775e46d8, (q31_t)0xdfe69eb4,
- (q31_t)0x7757f84c, (q31_t)0xdfdbc036, (q31_t)0x7751a79e, (q31_t)0xdfd0e2f5,
- (q31_t)0x774b54ce, (q31_t)0xdfc606f1, (q31_t)0x7744ffdd, (q31_t)0xdfbb2c2c,
- (q31_t)0x773ea8ca, (q31_t)0xdfb052a5, (q31_t)0x77384f95, (q31_t)0xdfa57a5d,
- (q31_t)0x7731f440, (q31_t)0xdf9aa354, (q31_t)0x772b96ca, (q31_t)0xdf8fcd8b,
- (q31_t)0x77253733, (q31_t)0xdf84f902, (q31_t)0x771ed57c, (q31_t)0xdf7a25ba,
- (q31_t)0x771871a5, (q31_t)0xdf6f53b3, (q31_t)0x77120bae, (q31_t)0xdf6482ed,
- (q31_t)0x770ba398, (q31_t)0xdf59b369, (q31_t)0x77053962, (q31_t)0xdf4ee527,
- (q31_t)0x76fecd0e, (q31_t)0xdf441828, (q31_t)0x76f85e9a, (q31_t)0xdf394c6b,
- (q31_t)0x76f1ee09, (q31_t)0xdf2e81f3, (q31_t)0x76eb7b58, (q31_t)0xdf23b8be,
- (q31_t)0x76e5068a, (q31_t)0xdf18f0ce, (q31_t)0x76de8f9e, (q31_t)0xdf0e2a22,
- (q31_t)0x76d81695, (q31_t)0xdf0364bc, (q31_t)0x76d19b6e, (q31_t)0xdef8a09b,
- (q31_t)0x76cb1e2a, (q31_t)0xdeedddc0, (q31_t)0x76c49ec9, (q31_t)0xdee31c2b,
- (q31_t)0x76be1d4c, (q31_t)0xded85bdd, (q31_t)0x76b799b3, (q31_t)0xdecd9cd7,
- (q31_t)0x76b113fd, (q31_t)0xdec2df18, (q31_t)0x76aa8c2c, (q31_t)0xdeb822a1,
- (q31_t)0x76a4023f, (q31_t)0xdead6773, (q31_t)0x769d7637, (q31_t)0xdea2ad8d,
- (q31_t)0x7696e814, (q31_t)0xde97f4f1, (q31_t)0x769057d6, (q31_t)0xde8d3d9e,
- (q31_t)0x7689c57d, (q31_t)0xde828796, (q31_t)0x7683310b, (q31_t)0xde77d2d8,
- (q31_t)0x767c9a7e, (q31_t)0xde6d1f65, (q31_t)0x767601d7, (q31_t)0xde626d3e,
- (q31_t)0x766f6717, (q31_t)0xde57bc62, (q31_t)0x7668ca3e, (q31_t)0xde4d0cd2,
- (q31_t)0x76622b4c, (q31_t)0xde425e8f, (q31_t)0x765b8a41, (q31_t)0xde37b199,
- (q31_t)0x7654e71d, (q31_t)0xde2d05f1, (q31_t)0x764e41e2, (q31_t)0xde225b96,
- (q31_t)0x76479a8e, (q31_t)0xde17b28a, (q31_t)0x7640f123, (q31_t)0xde0d0acc,
- (q31_t)0x763a45a0, (q31_t)0xde02645d, (q31_t)0x76339806, (q31_t)0xddf7bf3e,
- (q31_t)0x762ce855, (q31_t)0xdded1b6e, (q31_t)0x7626368d, (q31_t)0xdde278ef,
- (q31_t)0x761f82af, (q31_t)0xddd7d7c1, (q31_t)0x7618ccba, (q31_t)0xddcd37e4,
- (q31_t)0x761214b0, (q31_t)0xddc29958, (q31_t)0x760b5a90, (q31_t)0xddb7fc1e,
- (q31_t)0x76049e5b, (q31_t)0xddad6036, (q31_t)0x75fde011, (q31_t)0xdda2c5a2,
- (q31_t)0x75f71fb1, (q31_t)0xdd982c60, (q31_t)0x75f05d3d, (q31_t)0xdd8d9472,
- (q31_t)0x75e998b5, (q31_t)0xdd82fdd8, (q31_t)0x75e2d219, (q31_t)0xdd786892,
- (q31_t)0x75dc0968, (q31_t)0xdd6dd4a2, (q31_t)0x75d53ea5, (q31_t)0xdd634206,
- (q31_t)0x75ce71ce, (q31_t)0xdd58b0c0, (q31_t)0x75c7a2e3, (q31_t)0xdd4e20d0,
- (q31_t)0x75c0d1e7, (q31_t)0xdd439236, (q31_t)0x75b9fed7, (q31_t)0xdd3904f4,
- (q31_t)0x75b329b5, (q31_t)0xdd2e7908, (q31_t)0x75ac5282, (q31_t)0xdd23ee74,
- (q31_t)0x75a5793c, (q31_t)0xdd196538, (q31_t)0x759e9de5, (q31_t)0xdd0edd55,
- (q31_t)0x7597c07d, (q31_t)0xdd0456ca, (q31_t)0x7590e104, (q31_t)0xdcf9d199,
- (q31_t)0x7589ff7a, (q31_t)0xdcef4dc2, (q31_t)0x75831be0, (q31_t)0xdce4cb44,
- (q31_t)0x757c3636, (q31_t)0xdcda4a21, (q31_t)0x75754e7c, (q31_t)0xdccfca59,
- (q31_t)0x756e64b2, (q31_t)0xdcc54bec, (q31_t)0x756778d9, (q31_t)0xdcbacedb,
- (q31_t)0x75608af1, (q31_t)0xdcb05326, (q31_t)0x75599afa, (q31_t)0xdca5d8cd,
- (q31_t)0x7552a8f4, (q31_t)0xdc9b5fd2, (q31_t)0x754bb4e1, (q31_t)0xdc90e834,
- (q31_t)0x7544bebf, (q31_t)0xdc8671f3, (q31_t)0x753dc68f, (q31_t)0xdc7bfd11,
- (q31_t)0x7536cc52, (q31_t)0xdc71898d, (q31_t)0x752fd008, (q31_t)0xdc671768,
- (q31_t)0x7528d1b1, (q31_t)0xdc5ca6a2, (q31_t)0x7521d14d, (q31_t)0xdc52373c,
- (q31_t)0x751acedd, (q31_t)0xdc47c936, (q31_t)0x7513ca60, (q31_t)0xdc3d5c91,
- (q31_t)0x750cc3d8, (q31_t)0xdc32f14d, (q31_t)0x7505bb44, (q31_t)0xdc28876a,
- (q31_t)0x74feb0a5, (q31_t)0xdc1e1ee9, (q31_t)0x74f7a3fb, (q31_t)0xdc13b7c9,
- (q31_t)0x74f09546, (q31_t)0xdc09520d, (q31_t)0x74e98487, (q31_t)0xdbfeedb3,
- (q31_t)0x74e271bd, (q31_t)0xdbf48abd, (q31_t)0x74db5cea, (q31_t)0xdbea292b,
- (q31_t)0x74d4460c, (q31_t)0xdbdfc8fc, (q31_t)0x74cd2d26, (q31_t)0xdbd56a32,
- (q31_t)0x74c61236, (q31_t)0xdbcb0cce, (q31_t)0x74bef53d, (q31_t)0xdbc0b0ce,
- (q31_t)0x74b7d63c, (q31_t)0xdbb65634, (q31_t)0x74b0b533, (q31_t)0xdbabfd01,
- (q31_t)0x74a99221, (q31_t)0xdba1a534, (q31_t)0x74a26d08, (q31_t)0xdb974ece,
- (q31_t)0x749b45e7, (q31_t)0xdb8cf9cf, (q31_t)0x74941cbf, (q31_t)0xdb82a638,
- (q31_t)0x748cf190, (q31_t)0xdb785409, (q31_t)0x7485c45b, (q31_t)0xdb6e0342,
- (q31_t)0x747e951f, (q31_t)0xdb63b3e5, (q31_t)0x747763dd, (q31_t)0xdb5965f1,
- (q31_t)0x74703095, (q31_t)0xdb4f1967, (q31_t)0x7468fb47, (q31_t)0xdb44ce46,
- (q31_t)0x7461c3f5, (q31_t)0xdb3a8491, (q31_t)0x745a8a9d, (q31_t)0xdb303c46,
- (q31_t)0x74534f41, (q31_t)0xdb25f566, (q31_t)0x744c11e0, (q31_t)0xdb1baff2,
- (q31_t)0x7444d27b, (q31_t)0xdb116beb, (q31_t)0x743d9112, (q31_t)0xdb072950,
- (q31_t)0x74364da6, (q31_t)0xdafce821, (q31_t)0x742f0836, (q31_t)0xdaf2a860,
- (q31_t)0x7427c0c3, (q31_t)0xdae86a0d, (q31_t)0x7420774d, (q31_t)0xdade2d28,
- (q31_t)0x74192bd5, (q31_t)0xdad3f1b1, (q31_t)0x7411de5b, (q31_t)0xdac9b7a9,
- (q31_t)0x740a8edf, (q31_t)0xdabf7f11, (q31_t)0x74033d61, (q31_t)0xdab547e8,
- (q31_t)0x73fbe9e2, (q31_t)0xdaab122f, (q31_t)0x73f49462, (q31_t)0xdaa0dde7,
- (q31_t)0x73ed3ce1, (q31_t)0xda96ab0f, (q31_t)0x73e5e360, (q31_t)0xda8c79a9,
- (q31_t)0x73de87de, (q31_t)0xda8249b4, (q31_t)0x73d72a5d, (q31_t)0xda781b31,
- (q31_t)0x73cfcadc, (q31_t)0xda6dee21, (q31_t)0x73c8695b, (q31_t)0xda63c284,
- (q31_t)0x73c105db, (q31_t)0xda599859, (q31_t)0x73b9a05d, (q31_t)0xda4f6fa3,
- (q31_t)0x73b238e0, (q31_t)0xda454860, (q31_t)0x73aacf65, (q31_t)0xda3b2292,
- (q31_t)0x73a363ec, (q31_t)0xda30fe38, (q31_t)0x739bf675, (q31_t)0xda26db54,
- (q31_t)0x73948701, (q31_t)0xda1cb9e5, (q31_t)0x738d1590, (q31_t)0xda1299ec,
- (q31_t)0x7385a222, (q31_t)0xda087b69, (q31_t)0x737e2cb7, (q31_t)0xd9fe5e5e,
- (q31_t)0x7376b551, (q31_t)0xd9f442c9, (q31_t)0x736f3bee, (q31_t)0xd9ea28ac,
- (q31_t)0x7367c090, (q31_t)0xd9e01006, (q31_t)0x73604336, (q31_t)0xd9d5f8d9,
- (q31_t)0x7358c3e2, (q31_t)0xd9cbe325, (q31_t)0x73514292, (q31_t)0xd9c1cee9,
- (q31_t)0x7349bf48, (q31_t)0xd9b7bc27, (q31_t)0x73423a04, (q31_t)0xd9adaadf,
- (q31_t)0x733ab2c6, (q31_t)0xd9a39b11, (q31_t)0x7333298f, (q31_t)0xd9998cbe,
- (q31_t)0x732b9e5e, (q31_t)0xd98f7fe6, (q31_t)0x73241134, (q31_t)0xd9857489,
- (q31_t)0x731c8211, (q31_t)0xd97b6aa8, (q31_t)0x7314f0f6, (q31_t)0xd9716243,
- (q31_t)0x730d5de3, (q31_t)0xd9675b5a, (q31_t)0x7305c8d7, (q31_t)0xd95d55ef,
- (q31_t)0x72fe31d5, (q31_t)0xd9535201, (q31_t)0x72f698db, (q31_t)0xd9494f90,
- (q31_t)0x72eefdea, (q31_t)0xd93f4e9e, (q31_t)0x72e76102, (q31_t)0xd9354f2a,
- (q31_t)0x72dfc224, (q31_t)0xd92b5135, (q31_t)0x72d82150, (q31_t)0xd92154bf,
- (q31_t)0x72d07e85, (q31_t)0xd91759c9, (q31_t)0x72c8d9c6, (q31_t)0xd90d6053,
- (q31_t)0x72c13311, (q31_t)0xd903685d, (q31_t)0x72b98a67, (q31_t)0xd8f971e8,
- (q31_t)0x72b1dfc9, (q31_t)0xd8ef7cf4, (q31_t)0x72aa3336, (q31_t)0xd8e58982,
- (q31_t)0x72a284b0, (q31_t)0xd8db9792, (q31_t)0x729ad435, (q31_t)0xd8d1a724,
- (q31_t)0x729321c7, (q31_t)0xd8c7b838, (q31_t)0x728b6d66, (q31_t)0xd8bdcad0,
- (q31_t)0x7283b712, (q31_t)0xd8b3deeb, (q31_t)0x727bfecc, (q31_t)0xd8a9f48a,
- (q31_t)0x72744493, (q31_t)0xd8a00bae, (q31_t)0x726c8868, (q31_t)0xd8962456,
- (q31_t)0x7264ca4c, (q31_t)0xd88c3e83, (q31_t)0x725d0a3e, (q31_t)0xd8825a35,
- (q31_t)0x72554840, (q31_t)0xd878776d, (q31_t)0x724d8450, (q31_t)0xd86e962b,
- (q31_t)0x7245be70, (q31_t)0xd864b670, (q31_t)0x723df6a0, (q31_t)0xd85ad83c,
- (q31_t)0x72362ce0, (q31_t)0xd850fb8e, (q31_t)0x722e6130, (q31_t)0xd8472069,
- (q31_t)0x72269391, (q31_t)0xd83d46cc, (q31_t)0x721ec403, (q31_t)0xd8336eb7,
- (q31_t)0x7216f287, (q31_t)0xd829982b, (q31_t)0x720f1f1c, (q31_t)0xd81fc328,
- (q31_t)0x720749c3, (q31_t)0xd815efae, (q31_t)0x71ff727c, (q31_t)0xd80c1dbf,
- (q31_t)0x71f79948, (q31_t)0xd8024d59, (q31_t)0x71efbe27, (q31_t)0xd7f87e7f,
- (q31_t)0x71e7e118, (q31_t)0xd7eeb130, (q31_t)0x71e0021e, (q31_t)0xd7e4e56c,
- (q31_t)0x71d82137, (q31_t)0xd7db1b34, (q31_t)0x71d03e64, (q31_t)0xd7d15288,
- (q31_t)0x71c859a5, (q31_t)0xd7c78b68, (q31_t)0x71c072fb, (q31_t)0xd7bdc5d6,
- (q31_t)0x71b88a66, (q31_t)0xd7b401d1, (q31_t)0x71b09fe7, (q31_t)0xd7aa3f5a,
- (q31_t)0x71a8b37c, (q31_t)0xd7a07e70, (q31_t)0x71a0c528, (q31_t)0xd796bf16,
- (q31_t)0x7198d4ea, (q31_t)0xd78d014a, (q31_t)0x7190e2c3, (q31_t)0xd783450d,
- (q31_t)0x7188eeb2, (q31_t)0xd7798a60, (q31_t)0x7180f8b8, (q31_t)0xd76fd143,
- (q31_t)0x717900d6, (q31_t)0xd76619b6, (q31_t)0x7171070c, (q31_t)0xd75c63ba,
- (q31_t)0x71690b59, (q31_t)0xd752af4f, (q31_t)0x71610dbf, (q31_t)0xd748fc75,
- (q31_t)0x71590e3e, (q31_t)0xd73f4b2e, (q31_t)0x71510cd5, (q31_t)0xd7359b78,
- (q31_t)0x71490986, (q31_t)0xd72bed55, (q31_t)0x71410450, (q31_t)0xd72240c5,
- (q31_t)0x7138fd35, (q31_t)0xd71895c9, (q31_t)0x7130f433, (q31_t)0xd70eec60,
- (q31_t)0x7128e94c, (q31_t)0xd705448b, (q31_t)0x7120dc80, (q31_t)0xd6fb9e4b,
- (q31_t)0x7118cdcf, (q31_t)0xd6f1f99f, (q31_t)0x7110bd39, (q31_t)0xd6e85689,
- (q31_t)0x7108aabf, (q31_t)0xd6deb508, (q31_t)0x71009661, (q31_t)0xd6d5151d,
- (q31_t)0x70f8801f, (q31_t)0xd6cb76c9, (q31_t)0x70f067fb, (q31_t)0xd6c1da0b,
- (q31_t)0x70e84df3, (q31_t)0xd6b83ee4, (q31_t)0x70e03208, (q31_t)0xd6aea555,
- (q31_t)0x70d8143b, (q31_t)0xd6a50d5d, (q31_t)0x70cff48c, (q31_t)0xd69b76fe,
- (q31_t)0x70c7d2fb, (q31_t)0xd691e237, (q31_t)0x70bfaf89, (q31_t)0xd6884f09,
- (q31_t)0x70b78a36, (q31_t)0xd67ebd74, (q31_t)0x70af6302, (q31_t)0xd6752d79,
- (q31_t)0x70a739ed, (q31_t)0xd66b9f18, (q31_t)0x709f0ef8, (q31_t)0xd6621251,
- (q31_t)0x7096e223, (q31_t)0xd6588725, (q31_t)0x708eb36f, (q31_t)0xd64efd94,
- (q31_t)0x708682dc, (q31_t)0xd645759f, (q31_t)0x707e5069, (q31_t)0xd63bef46,
- (q31_t)0x70761c18, (q31_t)0xd6326a88, (q31_t)0x706de5e9, (q31_t)0xd628e767,
- (q31_t)0x7065addb, (q31_t)0xd61f65e4, (q31_t)0x705d73f0, (q31_t)0xd615e5fd,
- (q31_t)0x70553828, (q31_t)0xd60c67b4, (q31_t)0x704cfa83, (q31_t)0xd602eb0a,
- (q31_t)0x7044bb00, (q31_t)0xd5f96ffd, (q31_t)0x703c79a2, (q31_t)0xd5eff690,
- (q31_t)0x70343667, (q31_t)0xd5e67ec1, (q31_t)0x702bf151, (q31_t)0xd5dd0892,
- (q31_t)0x7023aa5f, (q31_t)0xd5d39403, (q31_t)0x701b6193, (q31_t)0xd5ca2115,
- (q31_t)0x701316eb, (q31_t)0xd5c0afc6, (q31_t)0x700aca69, (q31_t)0xd5b74019,
- (q31_t)0x70027c0c, (q31_t)0xd5add20d, (q31_t)0x6ffa2bd6, (q31_t)0xd5a465a3,
- (q31_t)0x6ff1d9c7, (q31_t)0xd59afadb, (q31_t)0x6fe985de, (q31_t)0xd59191b5,
- (q31_t)0x6fe1301c, (q31_t)0xd5882a32, (q31_t)0x6fd8d882, (q31_t)0xd57ec452,
- (q31_t)0x6fd07f0f, (q31_t)0xd5756016, (q31_t)0x6fc823c5, (q31_t)0xd56bfd7d,
- (q31_t)0x6fbfc6a3, (q31_t)0xd5629c89, (q31_t)0x6fb767aa, (q31_t)0xd5593d3a,
- (q31_t)0x6faf06da, (q31_t)0xd54fdf8f, (q31_t)0x6fa6a433, (q31_t)0xd5468389,
- (q31_t)0x6f9e3fb6, (q31_t)0xd53d292a, (q31_t)0x6f95d963, (q31_t)0xd533d070,
- (q31_t)0x6f8d713a, (q31_t)0xd52a795d, (q31_t)0x6f85073c, (q31_t)0xd52123f0,
- (q31_t)0x6f7c9b69, (q31_t)0xd517d02b, (q31_t)0x6f742dc1, (q31_t)0xd50e7e0d,
- (q31_t)0x6f6bbe45, (q31_t)0xd5052d97, (q31_t)0x6f634cf5, (q31_t)0xd4fbdec9,
- (q31_t)0x6f5ad9d1, (q31_t)0xd4f291a4, (q31_t)0x6f5264da, (q31_t)0xd4e94627,
- (q31_t)0x6f49ee0f, (q31_t)0xd4dffc54, (q31_t)0x6f417573, (q31_t)0xd4d6b42b,
- (q31_t)0x6f38fb03, (q31_t)0xd4cd6dab, (q31_t)0x6f307ec2, (q31_t)0xd4c428d6,
- (q31_t)0x6f2800af, (q31_t)0xd4bae5ab, (q31_t)0x6f1f80ca, (q31_t)0xd4b1a42c,
- (q31_t)0x6f16ff14, (q31_t)0xd4a86458, (q31_t)0x6f0e7b8e, (q31_t)0xd49f2630,
- (q31_t)0x6f05f637, (q31_t)0xd495e9b3, (q31_t)0x6efd6f10, (q31_t)0xd48caee4,
- (q31_t)0x6ef4e619, (q31_t)0xd48375c1, (q31_t)0x6eec5b53, (q31_t)0xd47a3e4b,
- (q31_t)0x6ee3cebe, (q31_t)0xd4710883, (q31_t)0x6edb405a, (q31_t)0xd467d469,
- (q31_t)0x6ed2b027, (q31_t)0xd45ea1fd, (q31_t)0x6eca1e27, (q31_t)0xd4557140,
- (q31_t)0x6ec18a58, (q31_t)0xd44c4232, (q31_t)0x6eb8f4bc, (q31_t)0xd44314d3,
- (q31_t)0x6eb05d53, (q31_t)0xd439e923, (q31_t)0x6ea7c41e, (q31_t)0xd430bf24,
- (q31_t)0x6e9f291b, (q31_t)0xd42796d5, (q31_t)0x6e968c4d, (q31_t)0xd41e7037,
- (q31_t)0x6e8dedb3, (q31_t)0xd4154b4a, (q31_t)0x6e854d4d, (q31_t)0xd40c280e,
- (q31_t)0x6e7cab1c, (q31_t)0xd4030684, (q31_t)0x6e740720, (q31_t)0xd3f9e6ad,
- (q31_t)0x6e6b615a, (q31_t)0xd3f0c887, (q31_t)0x6e62b9ca, (q31_t)0xd3e7ac15,
- (q31_t)0x6e5a1070, (q31_t)0xd3de9156, (q31_t)0x6e51654c, (q31_t)0xd3d5784a,
- (q31_t)0x6e48b860, (q31_t)0xd3cc60f2, (q31_t)0x6e4009aa, (q31_t)0xd3c34b4f,
- (q31_t)0x6e37592c, (q31_t)0xd3ba3760, (q31_t)0x6e2ea6e6, (q31_t)0xd3b12526,
- (q31_t)0x6e25f2d8, (q31_t)0xd3a814a2, (q31_t)0x6e1d3d03, (q31_t)0xd39f05d3,
- (q31_t)0x6e148566, (q31_t)0xd395f8ba, (q31_t)0x6e0bcc03, (q31_t)0xd38ced57,
- (q31_t)0x6e0310d9, (q31_t)0xd383e3ab, (q31_t)0x6dfa53e9, (q31_t)0xd37adbb6,
- (q31_t)0x6df19534, (q31_t)0xd371d579, (q31_t)0x6de8d4b8, (q31_t)0xd368d0f3,
- (q31_t)0x6de01278, (q31_t)0xd35fce26, (q31_t)0x6dd74e73, (q31_t)0xd356cd11,
- (q31_t)0x6dce88aa, (q31_t)0xd34dcdb4, (q31_t)0x6dc5c11c, (q31_t)0xd344d011,
- (q31_t)0x6dbcf7cb, (q31_t)0xd33bd427, (q31_t)0x6db42cb6, (q31_t)0xd332d9f7,
- (q31_t)0x6dab5fdf, (q31_t)0xd329e181, (q31_t)0x6da29144, (q31_t)0xd320eac6,
- (q31_t)0x6d99c0e7, (q31_t)0xd317f5c6, (q31_t)0x6d90eec8, (q31_t)0xd30f0280,
- (q31_t)0x6d881ae8, (q31_t)0xd30610f7, (q31_t)0x6d7f4545, (q31_t)0xd2fd2129,
- (q31_t)0x6d766de2, (q31_t)0xd2f43318, (q31_t)0x6d6d94bf, (q31_t)0xd2eb46c3,
- (q31_t)0x6d64b9da, (q31_t)0xd2e25c2b, (q31_t)0x6d5bdd36, (q31_t)0xd2d97350,
- (q31_t)0x6d52fed2, (q31_t)0xd2d08c33, (q31_t)0x6d4a1eaf, (q31_t)0xd2c7a6d4,
- (q31_t)0x6d413ccd, (q31_t)0xd2bec333, (q31_t)0x6d38592c, (q31_t)0xd2b5e151,
- (q31_t)0x6d2f73cd, (q31_t)0xd2ad012e, (q31_t)0x6d268cb0, (q31_t)0xd2a422ca,
- (q31_t)0x6d1da3d5, (q31_t)0xd29b4626, (q31_t)0x6d14b93d, (q31_t)0xd2926b41,
- (q31_t)0x6d0bcce8, (q31_t)0xd289921e, (q31_t)0x6d02ded7, (q31_t)0xd280babb,
- (q31_t)0x6cf9ef09, (q31_t)0xd277e518, (q31_t)0x6cf0fd80, (q31_t)0xd26f1138,
- (q31_t)0x6ce80a3a, (q31_t)0xd2663f19, (q31_t)0x6cdf153a, (q31_t)0xd25d6ebc,
- (q31_t)0x6cd61e7f, (q31_t)0xd254a021, (q31_t)0x6ccd2609, (q31_t)0xd24bd34a,
- (q31_t)0x6cc42bd9, (q31_t)0xd2430835, (q31_t)0x6cbb2fef, (q31_t)0xd23a3ee4,
- (q31_t)0x6cb2324c, (q31_t)0xd2317756, (q31_t)0x6ca932ef, (q31_t)0xd228b18d,
- (q31_t)0x6ca031da, (q31_t)0xd21fed88, (q31_t)0x6c972f0d, (q31_t)0xd2172b48,
- (q31_t)0x6c8e2a87, (q31_t)0xd20e6acc, (q31_t)0x6c85244a, (q31_t)0xd205ac17,
- (q31_t)0x6c7c1c55, (q31_t)0xd1fcef27, (q31_t)0x6c7312a9, (q31_t)0xd1f433fd,
- (q31_t)0x6c6a0746, (q31_t)0xd1eb7a9a, (q31_t)0x6c60fa2d, (q31_t)0xd1e2c2fd,
- (q31_t)0x6c57eb5e, (q31_t)0xd1da0d28, (q31_t)0x6c4edada, (q31_t)0xd1d1591a,
- (q31_t)0x6c45c8a0, (q31_t)0xd1c8a6d4, (q31_t)0x6c3cb4b1, (q31_t)0xd1bff656,
- (q31_t)0x6c339f0e, (q31_t)0xd1b747a0, (q31_t)0x6c2a87b6, (q31_t)0xd1ae9ab4,
- (q31_t)0x6c216eaa, (q31_t)0xd1a5ef90, (q31_t)0x6c1853eb, (q31_t)0xd19d4636,
- (q31_t)0x6c0f3779, (q31_t)0xd1949ea6, (q31_t)0x6c061953, (q31_t)0xd18bf8e0,
- (q31_t)0x6bfcf97c, (q31_t)0xd18354e4, (q31_t)0x6bf3d7f2, (q31_t)0xd17ab2b3,
- (q31_t)0x6beab4b6, (q31_t)0xd172124d, (q31_t)0x6be18fc9, (q31_t)0xd16973b3,
- (q31_t)0x6bd8692b, (q31_t)0xd160d6e5, (q31_t)0x6bcf40dc, (q31_t)0xd1583be2,
- (q31_t)0x6bc616dd, (q31_t)0xd14fa2ad, (q31_t)0x6bbceb2d, (q31_t)0xd1470b44,
- (q31_t)0x6bb3bdce, (q31_t)0xd13e75a8, (q31_t)0x6baa8ec0, (q31_t)0xd135e1d9,
- (q31_t)0x6ba15e03, (q31_t)0xd12d4fd9, (q31_t)0x6b982b97, (q31_t)0xd124bfa6,
- (q31_t)0x6b8ef77d, (q31_t)0xd11c3142, (q31_t)0x6b85c1b5, (q31_t)0xd113a4ad,
- (q31_t)0x6b7c8a3f, (q31_t)0xd10b19e7, (q31_t)0x6b73511c, (q31_t)0xd10290f0,
- (q31_t)0x6b6a164d, (q31_t)0xd0fa09c9, (q31_t)0x6b60d9d0, (q31_t)0xd0f18472,
- (q31_t)0x6b579ba8, (q31_t)0xd0e900ec, (q31_t)0x6b4e5bd4, (q31_t)0xd0e07f36,
- (q31_t)0x6b451a55, (q31_t)0xd0d7ff51, (q31_t)0x6b3bd72a, (q31_t)0xd0cf813e,
- (q31_t)0x6b329255, (q31_t)0xd0c704fd, (q31_t)0x6b294bd5, (q31_t)0xd0be8a8d,
- (q31_t)0x6b2003ac, (q31_t)0xd0b611f1, (q31_t)0x6b16b9d9, (q31_t)0xd0ad9b26,
- (q31_t)0x6b0d6e5c, (q31_t)0xd0a5262f, (q31_t)0x6b042137, (q31_t)0xd09cb30b,
- (q31_t)0x6afad269, (q31_t)0xd09441bb, (q31_t)0x6af181f3, (q31_t)0xd08bd23f,
- (q31_t)0x6ae82fd5, (q31_t)0xd0836497, (q31_t)0x6adedc10, (q31_t)0xd07af8c4,
- (q31_t)0x6ad586a3, (q31_t)0xd0728ec6, (q31_t)0x6acc2f90, (q31_t)0xd06a269d,
- (q31_t)0x6ac2d6d6, (q31_t)0xd061c04a, (q31_t)0x6ab97c77, (q31_t)0xd0595bcd,
- (q31_t)0x6ab02071, (q31_t)0xd050f926, (q31_t)0x6aa6c2c6, (q31_t)0xd0489856,
- (q31_t)0x6a9d6377, (q31_t)0xd040395d, (q31_t)0x6a940283, (q31_t)0xd037dc3b,
- (q31_t)0x6a8a9fea, (q31_t)0xd02f80f1, (q31_t)0x6a813bae, (q31_t)0xd027277e,
- (q31_t)0x6a77d5ce, (q31_t)0xd01ecfe4, (q31_t)0x6a6e6e4b, (q31_t)0xd0167a22,
- (q31_t)0x6a650525, (q31_t)0xd00e2639, (q31_t)0x6a5b9a5d, (q31_t)0xd005d42a,
- (q31_t)0x6a522df3, (q31_t)0xcffd83f4, (q31_t)0x6a48bfe7, (q31_t)0xcff53597,
- (q31_t)0x6a3f503a, (q31_t)0xcfece915, (q31_t)0x6a35deeb, (q31_t)0xcfe49e6d,
- (q31_t)0x6a2c6bfd, (q31_t)0xcfdc55a1, (q31_t)0x6a22f76e, (q31_t)0xcfd40eaf,
- (q31_t)0x6a19813f, (q31_t)0xcfcbc999, (q31_t)0x6a100970, (q31_t)0xcfc3865e,
- (q31_t)0x6a069003, (q31_t)0xcfbb4500, (q31_t)0x69fd14f6, (q31_t)0xcfb3057d,
- (q31_t)0x69f3984c, (q31_t)0xcfaac7d8, (q31_t)0x69ea1a03, (q31_t)0xcfa28c10,
- (q31_t)0x69e09a1c, (q31_t)0xcf9a5225, (q31_t)0x69d71899, (q31_t)0xcf921a17,
- (q31_t)0x69cd9578, (q31_t)0xcf89e3e8, (q31_t)0x69c410ba, (q31_t)0xcf81af97,
- (q31_t)0x69ba8a61, (q31_t)0xcf797d24, (q31_t)0x69b1026c, (q31_t)0xcf714c91,
- (q31_t)0x69a778db, (q31_t)0xcf691ddd, (q31_t)0x699dedaf, (q31_t)0xcf60f108,
- (q31_t)0x699460e8, (q31_t)0xcf58c613, (q31_t)0x698ad287, (q31_t)0xcf509cfe,
- (q31_t)0x6981428c, (q31_t)0xcf4875ca, (q31_t)0x6977b0f7, (q31_t)0xcf405077,
- (q31_t)0x696e1dc9, (q31_t)0xcf382d05, (q31_t)0x69648902, (q31_t)0xcf300b74,
- (q31_t)0x695af2a3, (q31_t)0xcf27ebc5, (q31_t)0x69515aab, (q31_t)0xcf1fcdf8,
- (q31_t)0x6947c11c, (q31_t)0xcf17b20d, (q31_t)0x693e25f5, (q31_t)0xcf0f9805,
- (q31_t)0x69348937, (q31_t)0xcf077fe1, (q31_t)0x692aeae3, (q31_t)0xceff699f,
- (q31_t)0x69214af8, (q31_t)0xcef75541, (q31_t)0x6917a977, (q31_t)0xceef42c7,
- (q31_t)0x690e0661, (q31_t)0xcee73231, (q31_t)0x690461b5, (q31_t)0xcedf2380,
- (q31_t)0x68fabb75, (q31_t)0xced716b4, (q31_t)0x68f113a0, (q31_t)0xcecf0bcd,
- (q31_t)0x68e76a37, (q31_t)0xcec702cb, (q31_t)0x68ddbf3b, (q31_t)0xcebefbb0,
- (q31_t)0x68d412ab, (q31_t)0xceb6f67a, (q31_t)0x68ca6488, (q31_t)0xceaef32b,
- (q31_t)0x68c0b4d2, (q31_t)0xcea6f1c2, (q31_t)0x68b7038b, (q31_t)0xce9ef241,
- (q31_t)0x68ad50b1, (q31_t)0xce96f4a7, (q31_t)0x68a39c46, (q31_t)0xce8ef8f4,
- (q31_t)0x6899e64a, (q31_t)0xce86ff2a, (q31_t)0x68902ebd, (q31_t)0xce7f0748,
- (q31_t)0x688675a0, (q31_t)0xce77114e, (q31_t)0x687cbaf3, (q31_t)0xce6f1d3d,
- (q31_t)0x6872feb6, (q31_t)0xce672b16, (q31_t)0x686940ea, (q31_t)0xce5f3ad8,
- (q31_t)0x685f8190, (q31_t)0xce574c84, (q31_t)0x6855c0a6, (q31_t)0xce4f6019,
- (q31_t)0x684bfe2f, (q31_t)0xce47759a, (q31_t)0x68423a2a, (q31_t)0xce3f8d05,
- (q31_t)0x68387498, (q31_t)0xce37a65b, (q31_t)0x682ead78, (q31_t)0xce2fc19c,
- (q31_t)0x6824e4cc, (q31_t)0xce27dec9, (q31_t)0x681b1a94, (q31_t)0xce1ffde2,
- (q31_t)0x68114ed0, (q31_t)0xce181ee8, (q31_t)0x68078181, (q31_t)0xce1041d9,
- (q31_t)0x67fdb2a7, (q31_t)0xce0866b8, (q31_t)0x67f3e241, (q31_t)0xce008d84,
- (q31_t)0x67ea1052, (q31_t)0xcdf8b63d, (q31_t)0x67e03cd8, (q31_t)0xcdf0e0e4,
- (q31_t)0x67d667d5, (q31_t)0xcde90d79, (q31_t)0x67cc9149, (q31_t)0xcde13bfd,
- (q31_t)0x67c2b934, (q31_t)0xcdd96c6f, (q31_t)0x67b8df97, (q31_t)0xcdd19ed0,
- (q31_t)0x67af0472, (q31_t)0xcdc9d320, (q31_t)0x67a527c4, (q31_t)0xcdc20960,
- (q31_t)0x679b4990, (q31_t)0xcdba4190, (q31_t)0x679169d5, (q31_t)0xcdb27bb0,
- (q31_t)0x67878893, (q31_t)0xcdaab7c0, (q31_t)0x677da5cb, (q31_t)0xcda2f5c2,
- (q31_t)0x6773c17d, (q31_t)0xcd9b35b4, (q31_t)0x6769dbaa, (q31_t)0xcd937798,
- (q31_t)0x675ff452, (q31_t)0xcd8bbb6d, (q31_t)0x67560b76, (q31_t)0xcd840134,
- (q31_t)0x674c2115, (q31_t)0xcd7c48ee, (q31_t)0x67423530, (q31_t)0xcd74929a,
- (q31_t)0x673847c8, (q31_t)0xcd6cde39, (q31_t)0x672e58dc, (q31_t)0xcd652bcb,
- (q31_t)0x6724686e, (q31_t)0xcd5d7b50, (q31_t)0x671a767e, (q31_t)0xcd55ccca,
- (q31_t)0x6710830c, (q31_t)0xcd4e2037, (q31_t)0x67068e18, (q31_t)0xcd467599,
- (q31_t)0x66fc97a3, (q31_t)0xcd3eccef, (q31_t)0x66f29fad, (q31_t)0xcd37263a,
- (q31_t)0x66e8a637, (q31_t)0xcd2f817b, (q31_t)0x66deab41, (q31_t)0xcd27deb0,
- (q31_t)0x66d4aecb, (q31_t)0xcd203ddc, (q31_t)0x66cab0d6, (q31_t)0xcd189efe,
- (q31_t)0x66c0b162, (q31_t)0xcd110216, (q31_t)0x66b6b070, (q31_t)0xcd096725,
- (q31_t)0x66acadff, (q31_t)0xcd01ce2b, (q31_t)0x66a2aa11, (q31_t)0xccfa3729,
- (q31_t)0x6698a4a6, (q31_t)0xccf2a21d, (q31_t)0x668e9dbd, (q31_t)0xcceb0f0a,
- (q31_t)0x66849558, (q31_t)0xcce37def, (q31_t)0x667a8b77, (q31_t)0xccdbeecc,
- (q31_t)0x6670801a, (q31_t)0xccd461a2, (q31_t)0x66667342, (q31_t)0xccccd671,
- (q31_t)0x665c64ef, (q31_t)0xccc54d3a, (q31_t)0x66525521, (q31_t)0xccbdc5fc,
- (q31_t)0x664843d9, (q31_t)0xccb640b8, (q31_t)0x663e3117, (q31_t)0xccaebd6e,
- (q31_t)0x66341cdb, (q31_t)0xcca73c1e, (q31_t)0x662a0727, (q31_t)0xcc9fbcca,
- (q31_t)0x661feffa, (q31_t)0xcc983f70, (q31_t)0x6615d754, (q31_t)0xcc90c412,
- (q31_t)0x660bbd37, (q31_t)0xcc894aaf, (q31_t)0x6601a1a2, (q31_t)0xcc81d349,
- (q31_t)0x65f78497, (q31_t)0xcc7a5dde, (q31_t)0x65ed6614, (q31_t)0xcc72ea70,
- (q31_t)0x65e3461b, (q31_t)0xcc6b78ff, (q31_t)0x65d924ac, (q31_t)0xcc64098b,
- (q31_t)0x65cf01c8, (q31_t)0xcc5c9c14, (q31_t)0x65c4dd6e, (q31_t)0xcc55309b,
- (q31_t)0x65bab7a0, (q31_t)0xcc4dc720, (q31_t)0x65b0905d, (q31_t)0xcc465fa3,
- (q31_t)0x65a667a7, (q31_t)0xcc3efa25, (q31_t)0x659c3d7c, (q31_t)0xcc3796a5,
- (q31_t)0x659211df, (q31_t)0xcc303524, (q31_t)0x6587e4cf, (q31_t)0xcc28d5a3,
- (q31_t)0x657db64c, (q31_t)0xcc217822, (q31_t)0x65738657, (q31_t)0xcc1a1ca0,
- (q31_t)0x656954f1, (q31_t)0xcc12c31f, (q31_t)0x655f2219, (q31_t)0xcc0b6b9e,
- (q31_t)0x6554edd1, (q31_t)0xcc04161e, (q31_t)0x654ab818, (q31_t)0xcbfcc29f,
- (q31_t)0x654080ef, (q31_t)0xcbf57121, (q31_t)0x65364857, (q31_t)0xcbee21a5,
- (q31_t)0x652c0e4f, (q31_t)0xcbe6d42b, (q31_t)0x6521d2d8, (q31_t)0xcbdf88b3,
- (q31_t)0x651795f3, (q31_t)0xcbd83f3d, (q31_t)0x650d57a0, (q31_t)0xcbd0f7ca,
- (q31_t)0x650317df, (q31_t)0xcbc9b25a, (q31_t)0x64f8d6b0, (q31_t)0xcbc26eee,
- (q31_t)0x64ee9415, (q31_t)0xcbbb2d85, (q31_t)0x64e4500e, (q31_t)0xcbb3ee20,
- (q31_t)0x64da0a9a, (q31_t)0xcbacb0bf, (q31_t)0x64cfc3ba, (q31_t)0xcba57563,
- (q31_t)0x64c57b6f, (q31_t)0xcb9e3c0b, (q31_t)0x64bb31ba, (q31_t)0xcb9704b9,
- (q31_t)0x64b0e699, (q31_t)0xcb8fcf6b, (q31_t)0x64a69a0f, (q31_t)0xcb889c23,
- (q31_t)0x649c4c1b, (q31_t)0xcb816ae1, (q31_t)0x6491fcbe, (q31_t)0xcb7a3ba5,
- (q31_t)0x6487abf7, (q31_t)0xcb730e70, (q31_t)0x647d59c8, (q31_t)0xcb6be341,
- (q31_t)0x64730631, (q31_t)0xcb64ba19, (q31_t)0x6468b132, (q31_t)0xcb5d92f8,
- (q31_t)0x645e5acc, (q31_t)0xcb566ddf, (q31_t)0x645402ff, (q31_t)0xcb4f4acd,
- (q31_t)0x6449a9cc, (q31_t)0xcb4829c4, (q31_t)0x643f4f32, (q31_t)0xcb410ac3,
- (q31_t)0x6434f332, (q31_t)0xcb39edca, (q31_t)0x642a95ce, (q31_t)0xcb32d2da,
- (q31_t)0x64203704, (q31_t)0xcb2bb9f4, (q31_t)0x6415d6d5, (q31_t)0xcb24a316,
- (q31_t)0x640b7543, (q31_t)0xcb1d8e43, (q31_t)0x6401124d, (q31_t)0xcb167b79,
- (q31_t)0x63f6adf3, (q31_t)0xcb0f6aba, (q31_t)0x63ec4837, (q31_t)0xcb085c05,
- (q31_t)0x63e1e117, (q31_t)0xcb014f5b, (q31_t)0x63d77896, (q31_t)0xcafa44bc,
- (q31_t)0x63cd0eb3, (q31_t)0xcaf33c28, (q31_t)0x63c2a36f, (q31_t)0xcaec35a0,
- (q31_t)0x63b836ca, (q31_t)0xcae53123, (q31_t)0x63adc8c4, (q31_t)0xcade2eb3,
- (q31_t)0x63a3595e, (q31_t)0xcad72e4f, (q31_t)0x6398e898, (q31_t)0xcad02ff8,
- (q31_t)0x638e7673, (q31_t)0xcac933ae, (q31_t)0x638402ef, (q31_t)0xcac23971,
- (q31_t)0x63798e0d, (q31_t)0xcabb4141, (q31_t)0x636f17cc, (q31_t)0xcab44b1f,
- (q31_t)0x6364a02e, (q31_t)0xcaad570c, (q31_t)0x635a2733, (q31_t)0xcaa66506,
- (q31_t)0x634facda, (q31_t)0xca9f750f, (q31_t)0x63453125, (q31_t)0xca988727,
- (q31_t)0x633ab414, (q31_t)0xca919b4e, (q31_t)0x633035a7, (q31_t)0xca8ab184,
- (q31_t)0x6325b5df, (q31_t)0xca83c9ca, (q31_t)0x631b34bc, (q31_t)0xca7ce420,
- (q31_t)0x6310b23e, (q31_t)0xca760086, (q31_t)0x63062e67, (q31_t)0xca6f1efc,
- (q31_t)0x62fba936, (q31_t)0xca683f83, (q31_t)0x62f122ab, (q31_t)0xca61621b,
- (q31_t)0x62e69ac8, (q31_t)0xca5a86c4, (q31_t)0x62dc118c, (q31_t)0xca53ad7e,
- (q31_t)0x62d186f8, (q31_t)0xca4cd64b, (q31_t)0x62c6fb0c, (q31_t)0xca460129,
- (q31_t)0x62bc6dca, (q31_t)0xca3f2e19, (q31_t)0x62b1df30, (q31_t)0xca385d1d,
- (q31_t)0x62a74f40, (q31_t)0xca318e32, (q31_t)0x629cbdfa, (q31_t)0xca2ac15b,
- (q31_t)0x62922b5e, (q31_t)0xca23f698, (q31_t)0x6287976e, (q31_t)0xca1d2de7,
- (q31_t)0x627d0228, (q31_t)0xca16674b, (q31_t)0x62726b8e, (q31_t)0xca0fa2c3,
- (q31_t)0x6267d3a0, (q31_t)0xca08e04f, (q31_t)0x625d3a5e, (q31_t)0xca021fef,
- (q31_t)0x62529fca, (q31_t)0xc9fb61a5, (q31_t)0x624803e2, (q31_t)0xc9f4a570,
- (q31_t)0x623d66a8, (q31_t)0xc9edeb50, (q31_t)0x6232c81c, (q31_t)0xc9e73346,
- (q31_t)0x6228283f, (q31_t)0xc9e07d51, (q31_t)0x621d8711, (q31_t)0xc9d9c973,
- (q31_t)0x6212e492, (q31_t)0xc9d317ab, (q31_t)0x620840c2, (q31_t)0xc9cc67fa,
- (q31_t)0x61fd9ba3, (q31_t)0xc9c5ba60, (q31_t)0x61f2f534, (q31_t)0xc9bf0edd,
- (q31_t)0x61e84d76, (q31_t)0xc9b86572, (q31_t)0x61dda46a, (q31_t)0xc9b1be1e,
- (q31_t)0x61d2fa0f, (q31_t)0xc9ab18e3, (q31_t)0x61c84e67, (q31_t)0xc9a475bf,
- (q31_t)0x61bda171, (q31_t)0xc99dd4b4, (q31_t)0x61b2f32e, (q31_t)0xc99735c2,
- (q31_t)0x61a8439e, (q31_t)0xc99098e9, (q31_t)0x619d92c2, (q31_t)0xc989fe29,
- (q31_t)0x6192e09b, (q31_t)0xc9836582, (q31_t)0x61882d28, (q31_t)0xc97ccef5,
- (q31_t)0x617d786a, (q31_t)0xc9763a83, (q31_t)0x6172c262, (q31_t)0xc96fa82a,
- (q31_t)0x61680b0f, (q31_t)0xc96917ec, (q31_t)0x615d5273, (q31_t)0xc96289c9,
- (q31_t)0x6152988d, (q31_t)0xc95bfdc1, (q31_t)0x6147dd5f, (q31_t)0xc95573d4,
- (q31_t)0x613d20e8, (q31_t)0xc94eec03, (q31_t)0x61326329, (q31_t)0xc948664d,
- (q31_t)0x6127a423, (q31_t)0xc941e2b4, (q31_t)0x611ce3d5, (q31_t)0xc93b6137,
- (q31_t)0x61122240, (q31_t)0xc934e1d6, (q31_t)0x61075f65, (q31_t)0xc92e6492,
- (q31_t)0x60fc9b44, (q31_t)0xc927e96b, (q31_t)0x60f1d5de, (q31_t)0xc9217062,
- (q31_t)0x60e70f32, (q31_t)0xc91af976, (q31_t)0x60dc4742, (q31_t)0xc91484a8,
- (q31_t)0x60d17e0d, (q31_t)0xc90e11f7, (q31_t)0x60c6b395, (q31_t)0xc907a166,
- (q31_t)0x60bbe7d8, (q31_t)0xc90132f2, (q31_t)0x60b11ad9, (q31_t)0xc8fac69e,
- (q31_t)0x60a64c97, (q31_t)0xc8f45c68, (q31_t)0x609b7d13, (q31_t)0xc8edf452,
- (q31_t)0x6090ac4d, (q31_t)0xc8e78e5b, (q31_t)0x6085da46, (q31_t)0xc8e12a84,
- (q31_t)0x607b06fe, (q31_t)0xc8dac8cd, (q31_t)0x60703275, (q31_t)0xc8d46936,
- (q31_t)0x60655cac, (q31_t)0xc8ce0bc0, (q31_t)0x605a85a3, (q31_t)0xc8c7b06b,
- (q31_t)0x604fad5b, (q31_t)0xc8c15736, (q31_t)0x6044d3d4, (q31_t)0xc8bb0023,
- (q31_t)0x6039f90f, (q31_t)0xc8b4ab32, (q31_t)0x602f1d0b, (q31_t)0xc8ae5862,
- (q31_t)0x60243fca, (q31_t)0xc8a807b4, (q31_t)0x6019614c, (q31_t)0xc8a1b928,
- (q31_t)0x600e8190, (q31_t)0xc89b6cbf, (q31_t)0x6003a099, (q31_t)0xc8952278,
- (q31_t)0x5ff8be65, (q31_t)0xc88eda54, (q31_t)0x5feddaf6, (q31_t)0xc8889454,
- (q31_t)0x5fe2f64c, (q31_t)0xc8825077, (q31_t)0x5fd81067, (q31_t)0xc87c0ebd,
- (q31_t)0x5fcd2948, (q31_t)0xc875cf28, (q31_t)0x5fc240ef, (q31_t)0xc86f91b7,
- (q31_t)0x5fb7575c, (q31_t)0xc869566a, (q31_t)0x5fac6c91, (q31_t)0xc8631d42,
- (q31_t)0x5fa1808c, (q31_t)0xc85ce63e, (q31_t)0x5f969350, (q31_t)0xc856b160,
- (q31_t)0x5f8ba4dc, (q31_t)0xc8507ea7, (q31_t)0x5f80b531, (q31_t)0xc84a4e14,
- (q31_t)0x5f75c44e, (q31_t)0xc8441fa6, (q31_t)0x5f6ad235, (q31_t)0xc83df35f,
- (q31_t)0x5f5fdee6, (q31_t)0xc837c93e, (q31_t)0x5f54ea62, (q31_t)0xc831a143,
- (q31_t)0x5f49f4a8, (q31_t)0xc82b7b70, (q31_t)0x5f3efdb9, (q31_t)0xc82557c3,
- (q31_t)0x5f340596, (q31_t)0xc81f363d, (q31_t)0x5f290c3f, (q31_t)0xc81916df,
- (q31_t)0x5f1e11b5, (q31_t)0xc812f9a9, (q31_t)0x5f1315f7, (q31_t)0xc80cde9b,
- (q31_t)0x5f081907, (q31_t)0xc806c5b5, (q31_t)0x5efd1ae4, (q31_t)0xc800aef7,
- (q31_t)0x5ef21b90, (q31_t)0xc7fa9a62, (q31_t)0x5ee71b0a, (q31_t)0xc7f487f6,
- (q31_t)0x5edc1953, (q31_t)0xc7ee77b3, (q31_t)0x5ed1166b, (q31_t)0xc7e8699a,
- (q31_t)0x5ec61254, (q31_t)0xc7e25daa, (q31_t)0x5ebb0d0d, (q31_t)0xc7dc53e3,
- (q31_t)0x5eb00696, (q31_t)0xc7d64c47, (q31_t)0x5ea4fef0, (q31_t)0xc7d046d6,
- (q31_t)0x5e99f61d, (q31_t)0xc7ca438f, (q31_t)0x5e8eec1b, (q31_t)0xc7c44272,
- (q31_t)0x5e83e0eb, (q31_t)0xc7be4381, (q31_t)0x5e78d48e, (q31_t)0xc7b846ba,
- (q31_t)0x5e6dc705, (q31_t)0xc7b24c20, (q31_t)0x5e62b84f, (q31_t)0xc7ac53b1,
- (q31_t)0x5e57a86d, (q31_t)0xc7a65d6e, (q31_t)0x5e4c9760, (q31_t)0xc7a06957,
- (q31_t)0x5e418528, (q31_t)0xc79a776c, (q31_t)0x5e3671c5, (q31_t)0xc79487ae,
- (q31_t)0x5e2b5d38, (q31_t)0xc78e9a1d, (q31_t)0x5e204781, (q31_t)0xc788aeb9,
- (q31_t)0x5e1530a1, (q31_t)0xc782c582, (q31_t)0x5e0a1898, (q31_t)0xc77cde79,
- (q31_t)0x5dfeff67, (q31_t)0xc776f99d, (q31_t)0x5df3e50d, (q31_t)0xc77116f0,
- (q31_t)0x5de8c98c, (q31_t)0xc76b3671, (q31_t)0x5dddace4, (q31_t)0xc7655820,
- (q31_t)0x5dd28f15, (q31_t)0xc75f7bfe, (q31_t)0x5dc7701f, (q31_t)0xc759a20a,
- (q31_t)0x5dbc5004, (q31_t)0xc753ca46, (q31_t)0x5db12ec3, (q31_t)0xc74df4b1,
- (q31_t)0x5da60c5d, (q31_t)0xc748214c, (q31_t)0x5d9ae8d2, (q31_t)0xc7425016,
- (q31_t)0x5d8fc424, (q31_t)0xc73c8111, (q31_t)0x5d849e51, (q31_t)0xc736b43c,
- (q31_t)0x5d79775c, (q31_t)0xc730e997, (q31_t)0x5d6e4f43, (q31_t)0xc72b2123,
- (q31_t)0x5d632608, (q31_t)0xc7255ae0, (q31_t)0x5d57fbaa, (q31_t)0xc71f96ce,
- (q31_t)0x5d4cd02c, (q31_t)0xc719d4ed, (q31_t)0x5d41a38c, (q31_t)0xc714153e,
- (q31_t)0x5d3675cb, (q31_t)0xc70e57c0, (q31_t)0x5d2b46ea, (q31_t)0xc7089c75,
- (q31_t)0x5d2016e9, (q31_t)0xc702e35c, (q31_t)0x5d14e5c9, (q31_t)0xc6fd2c75,
- (q31_t)0x5d09b389, (q31_t)0xc6f777c1, (q31_t)0x5cfe802b, (q31_t)0xc6f1c540,
- (q31_t)0x5cf34baf, (q31_t)0xc6ec14f2, (q31_t)0x5ce81615, (q31_t)0xc6e666d7,
- (q31_t)0x5cdcdf5e, (q31_t)0xc6e0baf0, (q31_t)0x5cd1a78a, (q31_t)0xc6db113d,
- (q31_t)0x5cc66e99, (q31_t)0xc6d569be, (q31_t)0x5cbb348d, (q31_t)0xc6cfc472,
- (q31_t)0x5caff965, (q31_t)0xc6ca215c, (q31_t)0x5ca4bd21, (q31_t)0xc6c4807a,
- (q31_t)0x5c997fc4, (q31_t)0xc6bee1cd, (q31_t)0x5c8e414b, (q31_t)0xc6b94554,
- (q31_t)0x5c8301b9, (q31_t)0xc6b3ab12, (q31_t)0x5c77c10e, (q31_t)0xc6ae1304,
- (q31_t)0x5c6c7f4a, (q31_t)0xc6a87d2d, (q31_t)0x5c613c6d, (q31_t)0xc6a2e98b,
- (q31_t)0x5c55f878, (q31_t)0xc69d5820, (q31_t)0x5c4ab36b, (q31_t)0xc697c8eb,
- (q31_t)0x5c3f6d47, (q31_t)0xc6923bec, (q31_t)0x5c34260c, (q31_t)0xc68cb124,
- (q31_t)0x5c28ddbb, (q31_t)0xc6872894, (q31_t)0x5c1d9454, (q31_t)0xc681a23a,
- (q31_t)0x5c1249d8, (q31_t)0xc67c1e18, (q31_t)0x5c06fe46, (q31_t)0xc6769c2e,
- (q31_t)0x5bfbb1a0, (q31_t)0xc6711c7b, (q31_t)0x5bf063e6, (q31_t)0xc66b9f01,
- (q31_t)0x5be51518, (q31_t)0xc66623be, (q31_t)0x5bd9c537, (q31_t)0xc660aab5,
- (q31_t)0x5bce7442, (q31_t)0xc65b33e4, (q31_t)0x5bc3223c, (q31_t)0xc655bf4c,
- (q31_t)0x5bb7cf23, (q31_t)0xc6504ced, (q31_t)0x5bac7af9, (q31_t)0xc64adcc7,
- (q31_t)0x5ba125bd, (q31_t)0xc6456edb, (q31_t)0x5b95cf71, (q31_t)0xc6400329,
- (q31_t)0x5b8a7815, (q31_t)0xc63a99b1, (q31_t)0x5b7f1fa9, (q31_t)0xc6353273,
- (q31_t)0x5b73c62d, (q31_t)0xc62fcd6f, (q31_t)0x5b686ba3, (q31_t)0xc62a6aa6,
- (q31_t)0x5b5d100a, (q31_t)0xc6250a18, (q31_t)0x5b51b363, (q31_t)0xc61fabc4,
- (q31_t)0x5b4655ae, (q31_t)0xc61a4fac, (q31_t)0x5b3af6ec, (q31_t)0xc614f5cf,
- (q31_t)0x5b2f971e, (q31_t)0xc60f9e2e, (q31_t)0x5b243643, (q31_t)0xc60a48c9,
- (q31_t)0x5b18d45c, (q31_t)0xc604f5a0, (q31_t)0x5b0d716a, (q31_t)0xc5ffa4b3,
- (q31_t)0x5b020d6c, (q31_t)0xc5fa5603, (q31_t)0x5af6a865, (q31_t)0xc5f5098f,
- (q31_t)0x5aeb4253, (q31_t)0xc5efbf58, (q31_t)0x5adfdb37, (q31_t)0xc5ea775e,
- (q31_t)0x5ad47312, (q31_t)0xc5e531a1, (q31_t)0x5ac909e5, (q31_t)0xc5dfee22,
- (q31_t)0x5abd9faf, (q31_t)0xc5daace1, (q31_t)0x5ab23471, (q31_t)0xc5d56ddd,
- (q31_t)0x5aa6c82b, (q31_t)0xc5d03118, (q31_t)0x5a9b5adf, (q31_t)0xc5caf690,
- (q31_t)0x5a8fec8c, (q31_t)0xc5c5be47, (q31_t)0x5a847d33, (q31_t)0xc5c0883d,
- (q31_t)0x5a790cd4, (q31_t)0xc5bb5472, (q31_t)0x5a6d9b70, (q31_t)0xc5b622e6,
- (q31_t)0x5a622907, (q31_t)0xc5b0f399, (q31_t)0x5a56b599, (q31_t)0xc5abc68c,
- (q31_t)0x5a4b4128, (q31_t)0xc5a69bbe, (q31_t)0x5a3fcbb3, (q31_t)0xc5a17330,
- (q31_t)0x5a34553b, (q31_t)0xc59c4ce3, (q31_t)0x5a28ddc0, (q31_t)0xc59728d5,
- (q31_t)0x5a1d6544, (q31_t)0xc5920708, (q31_t)0x5a11ebc5, (q31_t)0xc58ce77c,
- (q31_t)0x5a067145, (q31_t)0xc587ca31, (q31_t)0x59faf5c5, (q31_t)0xc582af26,
- (q31_t)0x59ef7944, (q31_t)0xc57d965d, (q31_t)0x59e3fbc3, (q31_t)0xc5787fd6,
- (q31_t)0x59d87d42, (q31_t)0xc5736b90, (q31_t)0x59ccfdc2, (q31_t)0xc56e598c,
- (q31_t)0x59c17d44, (q31_t)0xc56949ca, (q31_t)0x59b5fbc8, (q31_t)0xc5643c4a,
- (q31_t)0x59aa794d, (q31_t)0xc55f310d, (q31_t)0x599ef5d6, (q31_t)0xc55a2812,
- (q31_t)0x59937161, (q31_t)0xc555215a, (q31_t)0x5987ebf0, (q31_t)0xc5501ce5,
- (q31_t)0x597c6584, (q31_t)0xc54b1ab4, (q31_t)0x5970de1b, (q31_t)0xc5461ac6,
- (q31_t)0x596555b8, (q31_t)0xc5411d1b, (q31_t)0x5959cc5a, (q31_t)0xc53c21b4,
- (q31_t)0x594e4201, (q31_t)0xc5372891, (q31_t)0x5942b6af, (q31_t)0xc53231b3,
- (q31_t)0x59372a64, (q31_t)0xc52d3d18, (q31_t)0x592b9d1f, (q31_t)0xc5284ac3,
- (q31_t)0x59200ee3, (q31_t)0xc5235ab2, (q31_t)0x59147fae, (q31_t)0xc51e6ce6,
- (q31_t)0x5908ef82, (q31_t)0xc519815f, (q31_t)0x58fd5e5f, (q31_t)0xc514981d,
- (q31_t)0x58f1cc45, (q31_t)0xc50fb121, (q31_t)0x58e63935, (q31_t)0xc50acc6b,
- (q31_t)0x58daa52f, (q31_t)0xc505e9fb, (q31_t)0x58cf1034, (q31_t)0xc50109d0,
- (q31_t)0x58c37a44, (q31_t)0xc4fc2bec, (q31_t)0x58b7e35f, (q31_t)0xc4f7504e,
- (q31_t)0x58ac4b87, (q31_t)0xc4f276f7, (q31_t)0x58a0b2bb, (q31_t)0xc4ed9fe7,
- (q31_t)0x589518fc, (q31_t)0xc4e8cb1e, (q31_t)0x58897e4a, (q31_t)0xc4e3f89c,
- (q31_t)0x587de2a7, (q31_t)0xc4df2862, (q31_t)0x58724611, (q31_t)0xc4da5a6f,
- (q31_t)0x5866a88a, (q31_t)0xc4d58ec3, (q31_t)0x585b0a13, (q31_t)0xc4d0c560,
- (q31_t)0x584f6aab, (q31_t)0xc4cbfe45, (q31_t)0x5843ca53, (q31_t)0xc4c73972,
- (q31_t)0x5838290c, (q31_t)0xc4c276e8, (q31_t)0x582c86d5, (q31_t)0xc4bdb6a6,
- (q31_t)0x5820e3b0, (q31_t)0xc4b8f8ad, (q31_t)0x58153f9d, (q31_t)0xc4b43cfd,
- (q31_t)0x58099a9c, (q31_t)0xc4af8397, (q31_t)0x57fdf4ae, (q31_t)0xc4aacc7a,
- (q31_t)0x57f24dd3, (q31_t)0xc4a617a6, (q31_t)0x57e6a60c, (q31_t)0xc4a1651c,
- (q31_t)0x57dafd59, (q31_t)0xc49cb4dd, (q31_t)0x57cf53bb, (q31_t)0xc49806e7,
- (q31_t)0x57c3a931, (q31_t)0xc4935b3c, (q31_t)0x57b7fdbd, (q31_t)0xc48eb1db,
- (q31_t)0x57ac515f, (q31_t)0xc48a0ac4, (q31_t)0x57a0a417, (q31_t)0xc48565f9,
- (q31_t)0x5794f5e6, (q31_t)0xc480c379, (q31_t)0x578946cc, (q31_t)0xc47c2344,
- (q31_t)0x577d96ca, (q31_t)0xc477855a, (q31_t)0x5771e5e0, (q31_t)0xc472e9bc,
- (q31_t)0x5766340f, (q31_t)0xc46e5069, (q31_t)0x575a8157, (q31_t)0xc469b963,
- (q31_t)0x574ecdb8, (q31_t)0xc46524a9, (q31_t)0x57431933, (q31_t)0xc460923b,
- (q31_t)0x573763c9, (q31_t)0xc45c0219, (q31_t)0x572bad7a, (q31_t)0xc4577444,
- (q31_t)0x571ff646, (q31_t)0xc452e8bc, (q31_t)0x57143e2d, (q31_t)0xc44e5f80,
- (q31_t)0x57088531, (q31_t)0xc449d892, (q31_t)0x56fccb51, (q31_t)0xc44553f2,
- (q31_t)0x56f1108f, (q31_t)0xc440d19e, (q31_t)0x56e554ea, (q31_t)0xc43c5199,
- (q31_t)0x56d99864, (q31_t)0xc437d3e1, (q31_t)0x56cddafb, (q31_t)0xc4335877,
- (q31_t)0x56c21cb2, (q31_t)0xc42edf5c, (q31_t)0x56b65d88, (q31_t)0xc42a688f,
- (q31_t)0x56aa9d7e, (q31_t)0xc425f410, (q31_t)0x569edc94, (q31_t)0xc42181e0,
- (q31_t)0x56931acb, (q31_t)0xc41d11ff, (q31_t)0x56875823, (q31_t)0xc418a46d,
- (q31_t)0x567b949d, (q31_t)0xc414392b, (q31_t)0x566fd039, (q31_t)0xc40fd037,
- (q31_t)0x56640af7, (q31_t)0xc40b6994, (q31_t)0x565844d8, (q31_t)0xc4070540,
- (q31_t)0x564c7ddd, (q31_t)0xc402a33c, (q31_t)0x5640b606, (q31_t)0xc3fe4388,
- (q31_t)0x5634ed53, (q31_t)0xc3f9e624, (q31_t)0x562923c5, (q31_t)0xc3f58b10,
- (q31_t)0x561d595d, (q31_t)0xc3f1324e, (q31_t)0x56118e1a, (q31_t)0xc3ecdbdc,
- (q31_t)0x5605c1fd, (q31_t)0xc3e887bb, (q31_t)0x55f9f507, (q31_t)0xc3e435ea,
- (q31_t)0x55ee2738, (q31_t)0xc3dfe66c, (q31_t)0x55e25890, (q31_t)0xc3db993e,
- (q31_t)0x55d68911, (q31_t)0xc3d74e62, (q31_t)0x55cab8ba, (q31_t)0xc3d305d8,
- (q31_t)0x55bee78c, (q31_t)0xc3cebfa0, (q31_t)0x55b31587, (q31_t)0xc3ca7bba,
- (q31_t)0x55a742ac, (q31_t)0xc3c63a26, (q31_t)0x559b6efb, (q31_t)0xc3c1fae5,
- (q31_t)0x558f9a76, (q31_t)0xc3bdbdf6, (q31_t)0x5583c51b, (q31_t)0xc3b9835a,
- (q31_t)0x5577eeec, (q31_t)0xc3b54b11, (q31_t)0x556c17e9, (q31_t)0xc3b1151b,
- (q31_t)0x55604013, (q31_t)0xc3ace178, (q31_t)0x5554676a, (q31_t)0xc3a8b028,
- (q31_t)0x55488dee, (q31_t)0xc3a4812c, (q31_t)0x553cb3a0, (q31_t)0xc3a05484,
- (q31_t)0x5530d881, (q31_t)0xc39c2a2f, (q31_t)0x5524fc90, (q31_t)0xc398022f,
- (q31_t)0x55191fcf, (q31_t)0xc393dc82, (q31_t)0x550d423d, (q31_t)0xc38fb92a,
- (q31_t)0x550163dc, (q31_t)0xc38b9827, (q31_t)0x54f584ac, (q31_t)0xc3877978,
- (q31_t)0x54e9a4ac, (q31_t)0xc3835d1e, (q31_t)0x54ddc3de, (q31_t)0xc37f4319,
- (q31_t)0x54d1e242, (q31_t)0xc37b2b6a, (q31_t)0x54c5ffd9, (q31_t)0xc377160f,
- (q31_t)0x54ba1ca3, (q31_t)0xc373030a, (q31_t)0x54ae38a0, (q31_t)0xc36ef25b,
- (q31_t)0x54a253d1, (q31_t)0xc36ae401, (q31_t)0x54966e36, (q31_t)0xc366d7fd,
- (q31_t)0x548a87d1, (q31_t)0xc362ce50, (q31_t)0x547ea0a0, (q31_t)0xc35ec6f8,
- (q31_t)0x5472b8a5, (q31_t)0xc35ac1f7, (q31_t)0x5466cfe1, (q31_t)0xc356bf4d,
- (q31_t)0x545ae653, (q31_t)0xc352bef9, (q31_t)0x544efbfc, (q31_t)0xc34ec0fc,
- (q31_t)0x544310dd, (q31_t)0xc34ac556, (q31_t)0x543724f5, (q31_t)0xc346cc07,
- (q31_t)0x542b3846, (q31_t)0xc342d510, (q31_t)0x541f4ad1, (q31_t)0xc33ee070,
- (q31_t)0x54135c94, (q31_t)0xc33aee27, (q31_t)0x54076d91, (q31_t)0xc336fe37,
- (q31_t)0x53fb7dc9, (q31_t)0xc333109e, (q31_t)0x53ef8d3c, (q31_t)0xc32f255e,
- (q31_t)0x53e39be9, (q31_t)0xc32b3c75, (q31_t)0x53d7a9d3, (q31_t)0xc32755e5,
- (q31_t)0x53cbb6f8, (q31_t)0xc32371ae, (q31_t)0x53bfc35b, (q31_t)0xc31f8fcf,
- (q31_t)0x53b3cefa, (q31_t)0xc31bb049, (q31_t)0x53a7d9d7, (q31_t)0xc317d31c,
- (q31_t)0x539be3f2, (q31_t)0xc313f848, (q31_t)0x538fed4b, (q31_t)0xc3101fce,
- (q31_t)0x5383f5e3, (q31_t)0xc30c49ad, (q31_t)0x5377fdbb, (q31_t)0xc30875e5,
- (q31_t)0x536c04d2, (q31_t)0xc304a477, (q31_t)0x53600b2a, (q31_t)0xc300d563,
- (q31_t)0x535410c3, (q31_t)0xc2fd08a9, (q31_t)0x5348159d, (q31_t)0xc2f93e4a,
- (q31_t)0x533c19b8, (q31_t)0xc2f57644, (q31_t)0x53301d16, (q31_t)0xc2f1b099,
- (q31_t)0x53241fb6, (q31_t)0xc2eded49, (q31_t)0x5318219a, (q31_t)0xc2ea2c53,
- (q31_t)0x530c22c1, (q31_t)0xc2e66db8, (q31_t)0x5300232c, (q31_t)0xc2e2b178,
- (q31_t)0x52f422db, (q31_t)0xc2def794, (q31_t)0x52e821cf, (q31_t)0xc2db400a,
- (q31_t)0x52dc2009, (q31_t)0xc2d78add, (q31_t)0x52d01d89, (q31_t)0xc2d3d80a,
- (q31_t)0x52c41a4f, (q31_t)0xc2d02794, (q31_t)0x52b8165b, (q31_t)0xc2cc7979,
- (q31_t)0x52ac11af, (q31_t)0xc2c8cdbb, (q31_t)0x52a00c4b, (q31_t)0xc2c52459,
- (q31_t)0x5294062f, (q31_t)0xc2c17d52, (q31_t)0x5287ff5b, (q31_t)0xc2bdd8a9,
- (q31_t)0x527bf7d1, (q31_t)0xc2ba365c, (q31_t)0x526fef90, (q31_t)0xc2b6966c,
- (q31_t)0x5263e699, (q31_t)0xc2b2f8d8, (q31_t)0x5257dced, (q31_t)0xc2af5da2,
- (q31_t)0x524bd28c, (q31_t)0xc2abc4c9, (q31_t)0x523fc776, (q31_t)0xc2a82e4d,
- (q31_t)0x5233bbac, (q31_t)0xc2a49a2e, (q31_t)0x5227af2e, (q31_t)0xc2a1086d,
- (q31_t)0x521ba1fd, (q31_t)0xc29d790a, (q31_t)0x520f941a, (q31_t)0xc299ec05,
- (q31_t)0x52038584, (q31_t)0xc296615d, (q31_t)0x51f7763c, (q31_t)0xc292d914,
- (q31_t)0x51eb6643, (q31_t)0xc28f5329, (q31_t)0x51df5599, (q31_t)0xc28bcf9c,
- (q31_t)0x51d3443f, (q31_t)0xc2884e6e, (q31_t)0x51c73235, (q31_t)0xc284cf9f,
- (q31_t)0x51bb1f7c, (q31_t)0xc281532e, (q31_t)0x51af0c13, (q31_t)0xc27dd91c,
- (q31_t)0x51a2f7fc, (q31_t)0xc27a616a, (q31_t)0x5196e337, (q31_t)0xc276ec16,
- (q31_t)0x518acdc4, (q31_t)0xc2737922, (q31_t)0x517eb7a4, (q31_t)0xc270088e,
- (q31_t)0x5172a0d7, (q31_t)0xc26c9a58, (q31_t)0x5166895f, (q31_t)0xc2692e83,
- (q31_t)0x515a713a, (q31_t)0xc265c50e, (q31_t)0x514e586a, (q31_t)0xc2625df8,
- (q31_t)0x51423ef0, (q31_t)0xc25ef943, (q31_t)0x513624cb, (q31_t)0xc25b96ee,
- (q31_t)0x512a09fc, (q31_t)0xc25836f9, (q31_t)0x511dee84, (q31_t)0xc254d965,
- (q31_t)0x5111d263, (q31_t)0xc2517e31, (q31_t)0x5105b599, (q31_t)0xc24e255e,
- (q31_t)0x50f99827, (q31_t)0xc24aceed, (q31_t)0x50ed7a0e, (q31_t)0xc2477adc,
- (q31_t)0x50e15b4e, (q31_t)0xc244292c, (q31_t)0x50d53be7, (q31_t)0xc240d9de,
- (q31_t)0x50c91bda, (q31_t)0xc23d8cf1, (q31_t)0x50bcfb28, (q31_t)0xc23a4265,
- (q31_t)0x50b0d9d0, (q31_t)0xc236fa3b, (q31_t)0x50a4b7d3, (q31_t)0xc233b473,
- (q31_t)0x50989532, (q31_t)0xc230710d, (q31_t)0x508c71ee, (q31_t)0xc22d3009,
- (q31_t)0x50804e06, (q31_t)0xc229f167, (q31_t)0x5074297b, (q31_t)0xc226b528,
- (q31_t)0x5068044e, (q31_t)0xc2237b4b, (q31_t)0x505bde7f, (q31_t)0xc22043d0,
- (q31_t)0x504fb80e, (q31_t)0xc21d0eb8, (q31_t)0x504390fd, (q31_t)0xc219dc03,
- (q31_t)0x5037694b, (q31_t)0xc216abb1, (q31_t)0x502b40f8, (q31_t)0xc2137dc2,
- (q31_t)0x501f1807, (q31_t)0xc2105236, (q31_t)0x5012ee76, (q31_t)0xc20d290d,
- (q31_t)0x5006c446, (q31_t)0xc20a0248, (q31_t)0x4ffa9979, (q31_t)0xc206dde6,
- (q31_t)0x4fee6e0d, (q31_t)0xc203bbe8, (q31_t)0x4fe24205, (q31_t)0xc2009c4e,
- (q31_t)0x4fd6155f, (q31_t)0xc1fd7f17, (q31_t)0x4fc9e81e, (q31_t)0xc1fa6445,
- (q31_t)0x4fbdba40, (q31_t)0xc1f74bd6, (q31_t)0x4fb18bc8, (q31_t)0xc1f435cc,
- (q31_t)0x4fa55cb4, (q31_t)0xc1f12227, (q31_t)0x4f992d06, (q31_t)0xc1ee10e5,
- (q31_t)0x4f8cfcbe, (q31_t)0xc1eb0209, (q31_t)0x4f80cbdc, (q31_t)0xc1e7f591,
- (q31_t)0x4f749a61, (q31_t)0xc1e4eb7e, (q31_t)0x4f68684e, (q31_t)0xc1e1e3d0,
- (q31_t)0x4f5c35a3, (q31_t)0xc1dede87, (q31_t)0x4f500260, (q31_t)0xc1dbdba3,
- (q31_t)0x4f43ce86, (q31_t)0xc1d8db25, (q31_t)0x4f379a16, (q31_t)0xc1d5dd0c,
- (q31_t)0x4f2b650f, (q31_t)0xc1d2e158, (q31_t)0x4f1f2f73, (q31_t)0xc1cfe80a,
- (q31_t)0x4f12f941, (q31_t)0xc1ccf122, (q31_t)0x4f06c27a, (q31_t)0xc1c9fca0,
- (q31_t)0x4efa8b20, (q31_t)0xc1c70a84, (q31_t)0x4eee5331, (q31_t)0xc1c41ace,
- (q31_t)0x4ee21aaf, (q31_t)0xc1c12d7e, (q31_t)0x4ed5e19a, (q31_t)0xc1be4294,
- (q31_t)0x4ec9a7f3, (q31_t)0xc1bb5a11, (q31_t)0x4ebd6db9, (q31_t)0xc1b873f5,
- (q31_t)0x4eb132ef, (q31_t)0xc1b5903f, (q31_t)0x4ea4f793, (q31_t)0xc1b2aef0,
- (q31_t)0x4e98bba7, (q31_t)0xc1afd007, (q31_t)0x4e8c7f2a, (q31_t)0xc1acf386,
- (q31_t)0x4e80421e, (q31_t)0xc1aa196c, (q31_t)0x4e740483, (q31_t)0xc1a741b9,
- (q31_t)0x4e67c65a, (q31_t)0xc1a46c6e, (q31_t)0x4e5b87a2, (q31_t)0xc1a1998a,
- (q31_t)0x4e4f485c, (q31_t)0xc19ec90d, (q31_t)0x4e430889, (q31_t)0xc19bfaf9,
- (q31_t)0x4e36c82a, (q31_t)0xc1992f4c, (q31_t)0x4e2a873e, (q31_t)0xc1966606,
- (q31_t)0x4e1e45c6, (q31_t)0xc1939f29, (q31_t)0x4e1203c3, (q31_t)0xc190dab4,
- (q31_t)0x4e05c135, (q31_t)0xc18e18a7, (q31_t)0x4df97e1d, (q31_t)0xc18b5903,
- (q31_t)0x4ded3a7b, (q31_t)0xc1889bc6, (q31_t)0x4de0f64f, (q31_t)0xc185e0f3,
- (q31_t)0x4dd4b19a, (q31_t)0xc1832888, (q31_t)0x4dc86c5d, (q31_t)0xc1807285,
- (q31_t)0x4dbc2698, (q31_t)0xc17dbeec, (q31_t)0x4dafe04b, (q31_t)0xc17b0dbb,
- (q31_t)0x4da39978, (q31_t)0xc1785ef4, (q31_t)0x4d97521d, (q31_t)0xc175b296,
- (q31_t)0x4d8b0a3d, (q31_t)0xc17308a1, (q31_t)0x4d7ec1d6, (q31_t)0xc1706115,
- (q31_t)0x4d7278eb, (q31_t)0xc16dbbf3, (q31_t)0x4d662f7b, (q31_t)0xc16b193a,
- (q31_t)0x4d59e586, (q31_t)0xc16878eb, (q31_t)0x4d4d9b0e, (q31_t)0xc165db05,
- (q31_t)0x4d415013, (q31_t)0xc1633f8a, (q31_t)0x4d350495, (q31_t)0xc160a678,
- (q31_t)0x4d28b894, (q31_t)0xc15e0fd1, (q31_t)0x4d1c6c11, (q31_t)0xc15b7b94,
- (q31_t)0x4d101f0e, (q31_t)0xc158e9c1, (q31_t)0x4d03d189, (q31_t)0xc1565a58,
- (q31_t)0x4cf78383, (q31_t)0xc153cd5a, (q31_t)0x4ceb34fe, (q31_t)0xc15142c6,
- (q31_t)0x4cdee5f9, (q31_t)0xc14eba9d, (q31_t)0x4cd29676, (q31_t)0xc14c34df,
- (q31_t)0x4cc64673, (q31_t)0xc149b18b, (q31_t)0x4cb9f5f3, (q31_t)0xc14730a3,
- (q31_t)0x4cada4f5, (q31_t)0xc144b225, (q31_t)0x4ca1537a, (q31_t)0xc1423613,
- (q31_t)0x4c950182, (q31_t)0xc13fbc6c, (q31_t)0x4c88af0e, (q31_t)0xc13d4530,
- (q31_t)0x4c7c5c1e, (q31_t)0xc13ad060, (q31_t)0x4c7008b3, (q31_t)0xc1385dfb,
- (q31_t)0x4c63b4ce, (q31_t)0xc135ee02, (q31_t)0x4c57606e, (q31_t)0xc1338075,
- (q31_t)0x4c4b0b94, (q31_t)0xc1311553, (q31_t)0x4c3eb641, (q31_t)0xc12eac9d,
- (q31_t)0x4c326075, (q31_t)0xc12c4653, (q31_t)0x4c260a31, (q31_t)0xc129e276,
- (q31_t)0x4c19b374, (q31_t)0xc1278104, (q31_t)0x4c0d5c41, (q31_t)0xc12521ff,
- (q31_t)0x4c010496, (q31_t)0xc122c566, (q31_t)0x4bf4ac75, (q31_t)0xc1206b39,
- (q31_t)0x4be853de, (q31_t)0xc11e1379, (q31_t)0x4bdbfad1, (q31_t)0xc11bbe26,
- (q31_t)0x4bcfa150, (q31_t)0xc1196b3f, (q31_t)0x4bc34759, (q31_t)0xc1171ac6,
- (q31_t)0x4bb6ecef, (q31_t)0xc114ccb9, (q31_t)0x4baa9211, (q31_t)0xc1128119,
- (q31_t)0x4b9e36c0, (q31_t)0xc11037e6, (q31_t)0x4b91dafc, (q31_t)0xc10df120,
- (q31_t)0x4b857ec7, (q31_t)0xc10bacc8, (q31_t)0x4b79221f, (q31_t)0xc1096add,
- (q31_t)0x4b6cc506, (q31_t)0xc1072b5f, (q31_t)0x4b60677c, (q31_t)0xc104ee4f,
- (q31_t)0x4b540982, (q31_t)0xc102b3ac, (q31_t)0x4b47ab19, (q31_t)0xc1007b77,
- (q31_t)0x4b3b4c40, (q31_t)0xc0fe45b0, (q31_t)0x4b2eecf8, (q31_t)0xc0fc1257,
- (q31_t)0x4b228d42, (q31_t)0xc0f9e16b, (q31_t)0x4b162d1d, (q31_t)0xc0f7b2ee,
- (q31_t)0x4b09cc8c, (q31_t)0xc0f586df, (q31_t)0x4afd6b8d, (q31_t)0xc0f35d3e,
- (q31_t)0x4af10a22, (q31_t)0xc0f1360b, (q31_t)0x4ae4a84b, (q31_t)0xc0ef1147,
- (q31_t)0x4ad84609, (q31_t)0xc0eceef1, (q31_t)0x4acbe35b, (q31_t)0xc0eacf09,
- (q31_t)0x4abf8043, (q31_t)0xc0e8b190, (q31_t)0x4ab31cc1, (q31_t)0xc0e69686,
- (q31_t)0x4aa6b8d5, (q31_t)0xc0e47deb, (q31_t)0x4a9a5480, (q31_t)0xc0e267be,
- (q31_t)0x4a8defc3, (q31_t)0xc0e05401, (q31_t)0x4a818a9d, (q31_t)0xc0de42b2,
- (q31_t)0x4a752510, (q31_t)0xc0dc33d2, (q31_t)0x4a68bf1b, (q31_t)0xc0da2762,
- (q31_t)0x4a5c58c0, (q31_t)0xc0d81d61, (q31_t)0x4a4ff1fe, (q31_t)0xc0d615cf,
- (q31_t)0x4a438ad7, (q31_t)0xc0d410ad, (q31_t)0x4a37234a, (q31_t)0xc0d20dfa,
- (q31_t)0x4a2abb59, (q31_t)0xc0d00db6, (q31_t)0x4a1e5303, (q31_t)0xc0ce0fe3,
- (q31_t)0x4a11ea49, (q31_t)0xc0cc147f, (q31_t)0x4a05812c, (q31_t)0xc0ca1b8a,
- (q31_t)0x49f917ac, (q31_t)0xc0c82506, (q31_t)0x49ecadc9, (q31_t)0xc0c630f2,
- (q31_t)0x49e04385, (q31_t)0xc0c43f4d, (q31_t)0x49d3d8df, (q31_t)0xc0c25019,
- (q31_t)0x49c76dd8, (q31_t)0xc0c06355, (q31_t)0x49bb0271, (q31_t)0xc0be7901,
- (q31_t)0x49ae96aa, (q31_t)0xc0bc911d, (q31_t)0x49a22a83, (q31_t)0xc0baabaa,
- (q31_t)0x4995bdfd, (q31_t)0xc0b8c8a7, (q31_t)0x49895118, (q31_t)0xc0b6e815,
- (q31_t)0x497ce3d5, (q31_t)0xc0b509f3, (q31_t)0x49707635, (q31_t)0xc0b32e42,
- (q31_t)0x49640837, (q31_t)0xc0b15502, (q31_t)0x495799dd, (q31_t)0xc0af7e33,
- (q31_t)0x494b2b27, (q31_t)0xc0ada9d4, (q31_t)0x493ebc14, (q31_t)0xc0abd7e6,
- (q31_t)0x49324ca7, (q31_t)0xc0aa086a, (q31_t)0x4925dcdf, (q31_t)0xc0a83b5e,
- (q31_t)0x49196cbc, (q31_t)0xc0a670c4, (q31_t)0x490cfc40, (q31_t)0xc0a4a89b,
- (q31_t)0x49008b6a, (q31_t)0xc0a2e2e3, (q31_t)0x48f41a3c, (q31_t)0xc0a11f9d,
- (q31_t)0x48e7a8b5, (q31_t)0xc09f5ec8, (q31_t)0x48db36d6, (q31_t)0xc09da065,
- (q31_t)0x48cec4a0, (q31_t)0xc09be473, (q31_t)0x48c25213, (q31_t)0xc09a2af3,
- (q31_t)0x48b5df30, (q31_t)0xc09873e4, (q31_t)0x48a96bf6, (q31_t)0xc096bf48,
- (q31_t)0x489cf867, (q31_t)0xc0950d1d, (q31_t)0x48908483, (q31_t)0xc0935d64,
- (q31_t)0x4884104b, (q31_t)0xc091b01d, (q31_t)0x48779bbe, (q31_t)0xc0900548,
- (q31_t)0x486b26de, (q31_t)0xc08e5ce5, (q31_t)0x485eb1ab, (q31_t)0xc08cb6f5,
- (q31_t)0x48523c25, (q31_t)0xc08b1376, (q31_t)0x4845c64d, (q31_t)0xc089726a,
- (q31_t)0x48395024, (q31_t)0xc087d3d0, (q31_t)0x482cd9a9, (q31_t)0xc08637a9,
- (q31_t)0x482062de, (q31_t)0xc0849df4, (q31_t)0x4813ebc2, (q31_t)0xc08306b2,
- (q31_t)0x48077457, (q31_t)0xc08171e2, (q31_t)0x47fafc9c, (q31_t)0xc07fdf85,
- (q31_t)0x47ee8493, (q31_t)0xc07e4f9b, (q31_t)0x47e20c3b, (q31_t)0xc07cc223,
- (q31_t)0x47d59396, (q31_t)0xc07b371e, (q31_t)0x47c91aa3, (q31_t)0xc079ae8c,
- (q31_t)0x47bca163, (q31_t)0xc078286e, (q31_t)0x47b027d7, (q31_t)0xc076a4c2,
- (q31_t)0x47a3adff, (q31_t)0xc0752389, (q31_t)0x479733dc, (q31_t)0xc073a4c3,
- (q31_t)0x478ab96e, (q31_t)0xc0722871, (q31_t)0x477e3eb5, (q31_t)0xc070ae92,
- (q31_t)0x4771c3b3, (q31_t)0xc06f3726, (q31_t)0x47654867, (q31_t)0xc06dc22e,
- (q31_t)0x4758ccd2, (q31_t)0xc06c4fa8, (q31_t)0x474c50f4, (q31_t)0xc06adf97,
- (q31_t)0x473fd4cf, (q31_t)0xc06971f9, (q31_t)0x47335862, (q31_t)0xc06806ce,
- (q31_t)0x4726dbae, (q31_t)0xc0669e18, (q31_t)0x471a5eb3, (q31_t)0xc06537d4,
- (q31_t)0x470de172, (q31_t)0xc063d405, (q31_t)0x470163eb, (q31_t)0xc06272aa,
- (q31_t)0x46f4e620, (q31_t)0xc06113c2, (q31_t)0x46e86810, (q31_t)0xc05fb74e,
- (q31_t)0x46dbe9bb, (q31_t)0xc05e5d4e, (q31_t)0x46cf6b23, (q31_t)0xc05d05c3,
- (q31_t)0x46c2ec48, (q31_t)0xc05bb0ab, (q31_t)0x46b66d29, (q31_t)0xc05a5e07,
- (q31_t)0x46a9edc9, (q31_t)0xc0590dd8, (q31_t)0x469d6e27, (q31_t)0xc057c01d,
- (q31_t)0x4690ee44, (q31_t)0xc05674d6, (q31_t)0x46846e1f, (q31_t)0xc0552c03,
- (q31_t)0x4677edbb, (q31_t)0xc053e5a5, (q31_t)0x466b6d16, (q31_t)0xc052a1bb,
- (q31_t)0x465eec33, (q31_t)0xc0516045, (q31_t)0x46526b10, (q31_t)0xc0502145,
- (q31_t)0x4645e9af, (q31_t)0xc04ee4b8, (q31_t)0x46396810, (q31_t)0xc04daaa1,
- (q31_t)0x462ce634, (q31_t)0xc04c72fe, (q31_t)0x4620641a, (q31_t)0xc04b3dcf,
- (q31_t)0x4613e1c5, (q31_t)0xc04a0b16, (q31_t)0x46075f33, (q31_t)0xc048dad1,
- (q31_t)0x45fadc66, (q31_t)0xc047ad01, (q31_t)0x45ee595d, (q31_t)0xc04681a6,
- (q31_t)0x45e1d61b, (q31_t)0xc04558c0, (q31_t)0x45d5529e, (q31_t)0xc044324f,
- (q31_t)0x45c8cee7, (q31_t)0xc0430e53, (q31_t)0x45bc4af8, (q31_t)0xc041eccc,
- (q31_t)0x45afc6d0, (q31_t)0xc040cdba, (q31_t)0x45a3426f, (q31_t)0xc03fb11d,
- (q31_t)0x4596bdd7, (q31_t)0xc03e96f6, (q31_t)0x458a3908, (q31_t)0xc03d7f44,
- (q31_t)0x457db403, (q31_t)0xc03c6a07, (q31_t)0x45712ec7, (q31_t)0xc03b573f,
- (q31_t)0x4564a955, (q31_t)0xc03a46ed, (q31_t)0x455823ae, (q31_t)0xc0393910,
- (q31_t)0x454b9dd3, (q31_t)0xc0382da8, (q31_t)0x453f17c3, (q31_t)0xc03724b6,
- (q31_t)0x4532917f, (q31_t)0xc0361e3a, (q31_t)0x45260b08, (q31_t)0xc0351a33,
- (q31_t)0x4519845e, (q31_t)0xc03418a2, (q31_t)0x450cfd82, (q31_t)0xc0331986,
- (q31_t)0x45007674, (q31_t)0xc0321ce0, (q31_t)0x44f3ef35, (q31_t)0xc03122b0,
- (q31_t)0x44e767c5, (q31_t)0xc0302af5, (q31_t)0x44dae024, (q31_t)0xc02f35b1,
- (q31_t)0x44ce5854, (q31_t)0xc02e42e2, (q31_t)0x44c1d054, (q31_t)0xc02d5289,
- (q31_t)0x44b54825, (q31_t)0xc02c64a6, (q31_t)0x44a8bfc7, (q31_t)0xc02b7939,
- (q31_t)0x449c373c, (q31_t)0xc02a9042, (q31_t)0x448fae83, (q31_t)0xc029a9c1,
- (q31_t)0x4483259d, (q31_t)0xc028c5b6, (q31_t)0x44769c8b, (q31_t)0xc027e421,
- (q31_t)0x446a134c, (q31_t)0xc0270502, (q31_t)0x445d89e2, (q31_t)0xc0262859,
- (q31_t)0x4451004d, (q31_t)0xc0254e27, (q31_t)0x4444768d, (q31_t)0xc024766a,
- (q31_t)0x4437eca4, (q31_t)0xc023a124, (q31_t)0x442b6290, (q31_t)0xc022ce54,
- (q31_t)0x441ed854, (q31_t)0xc021fdfb, (q31_t)0x44124dee, (q31_t)0xc0213018,
- (q31_t)0x4405c361, (q31_t)0xc02064ab, (q31_t)0x43f938ac, (q31_t)0xc01f9bb5,
- (q31_t)0x43ecadcf, (q31_t)0xc01ed535, (q31_t)0x43e022cc, (q31_t)0xc01e112b,
- (q31_t)0x43d397a3, (q31_t)0xc01d4f99, (q31_t)0x43c70c54, (q31_t)0xc01c907c,
- (q31_t)0x43ba80df, (q31_t)0xc01bd3d6, (q31_t)0x43adf546, (q31_t)0xc01b19a7,
- (q31_t)0x43a16988, (q31_t)0xc01a61ee, (q31_t)0x4394dda7, (q31_t)0xc019acac,
- (q31_t)0x438851a2, (q31_t)0xc018f9e1, (q31_t)0x437bc57b, (q31_t)0xc018498c,
- (q31_t)0x436f3931, (q31_t)0xc0179bae, (q31_t)0x4362acc5, (q31_t)0xc016f047,
- (q31_t)0x43562038, (q31_t)0xc0164757, (q31_t)0x43499389, (q31_t)0xc015a0dd,
- (q31_t)0x433d06bb, (q31_t)0xc014fcda, (q31_t)0x433079cc, (q31_t)0xc0145b4e,
- (q31_t)0x4323ecbe, (q31_t)0xc013bc39, (q31_t)0x43175f91, (q31_t)0xc0131f9b,
- (q31_t)0x430ad245, (q31_t)0xc0128574, (q31_t)0x42fe44dc, (q31_t)0xc011edc3,
- (q31_t)0x42f1b755, (q31_t)0xc011588a, (q31_t)0x42e529b0, (q31_t)0xc010c5c7,
- (q31_t)0x42d89bf0, (q31_t)0xc010357c, (q31_t)0x42cc0e13, (q31_t)0xc00fa7a8,
- (q31_t)0x42bf801a, (q31_t)0xc00f1c4a, (q31_t)0x42b2f207, (q31_t)0xc00e9364,
- (q31_t)0x42a663d8, (q31_t)0xc00e0cf5, (q31_t)0x4299d590, (q31_t)0xc00d88fd,
- (q31_t)0x428d472e, (q31_t)0xc00d077c, (q31_t)0x4280b8b3, (q31_t)0xc00c8872,
- (q31_t)0x42742a1f, (q31_t)0xc00c0be0, (q31_t)0x42679b73, (q31_t)0xc00b91c4,
- (q31_t)0x425b0caf, (q31_t)0xc00b1a20, (q31_t)0x424e7dd4, (q31_t)0xc00aa4f3,
- (q31_t)0x4241eee2, (q31_t)0xc00a323d, (q31_t)0x42355fd9, (q31_t)0xc009c1ff,
- (q31_t)0x4228d0bb, (q31_t)0xc0095438, (q31_t)0x421c4188, (q31_t)0xc008e8e8,
- (q31_t)0x420fb240, (q31_t)0xc008800f, (q31_t)0x420322e3, (q31_t)0xc00819ae,
- (q31_t)0x41f69373, (q31_t)0xc007b5c4, (q31_t)0x41ea03ef, (q31_t)0xc0075452,
- (q31_t)0x41dd7459, (q31_t)0xc006f556, (q31_t)0x41d0e4b0, (q31_t)0xc00698d3,
- (q31_t)0x41c454f5, (q31_t)0xc0063ec6, (q31_t)0x41b7c528, (q31_t)0xc005e731,
- (q31_t)0x41ab354b, (q31_t)0xc0059214, (q31_t)0x419ea55d, (q31_t)0xc0053f6e,
- (q31_t)0x4192155f, (q31_t)0xc004ef3f, (q31_t)0x41858552, (q31_t)0xc004a188,
- (q31_t)0x4178f536, (q31_t)0xc0045648, (q31_t)0x416c650b, (q31_t)0xc0040d80,
- (q31_t)0x415fd4d2, (q31_t)0xc003c72f, (q31_t)0x4153448c, (q31_t)0xc0038356,
- (q31_t)0x4146b438, (q31_t)0xc00341f4, (q31_t)0x413a23d8, (q31_t)0xc003030a,
- (q31_t)0x412d936c, (q31_t)0xc002c697, (q31_t)0x412102f4, (q31_t)0xc0028c9c,
- (q31_t)0x41147271, (q31_t)0xc0025519, (q31_t)0x4107e1e3, (q31_t)0xc002200d,
- (q31_t)0x40fb514b, (q31_t)0xc001ed78, (q31_t)0x40eec0aa, (q31_t)0xc001bd5c,
- (q31_t)0x40e22fff, (q31_t)0xc0018fb6, (q31_t)0x40d59f4c, (q31_t)0xc0016489,
- (q31_t)0x40c90e90, (q31_t)0xc0013bd3, (q31_t)0x40bc7dcc, (q31_t)0xc0011594,
- (q31_t)0x40afed02, (q31_t)0xc000f1ce, (q31_t)0x40a35c30, (q31_t)0xc000d07e,
- (q31_t)0x4096cb58, (q31_t)0xc000b1a7, (q31_t)0x408a3a7b, (q31_t)0xc0009547,
- (q31_t)0x407da998, (q31_t)0xc0007b5f, (q31_t)0x407118b0, (q31_t)0xc00063ee,
- (q31_t)0x406487c4, (q31_t)0xc0004ef5, (q31_t)0x4057f6d4, (q31_t)0xc0003c74,
- (q31_t)0x404b65e1, (q31_t)0xc0002c6a, (q31_t)0x403ed4ea, (q31_t)0xc0001ed8,
- (q31_t)0x403243f1, (q31_t)0xc00013bd, (q31_t)0x4025b2f7, (q31_t)0xc0000b1a,
- (q31_t)0x401921fb, (q31_t)0xc00004ef, (q31_t)0x400c90fe, (q31_t)0xc000013c,
-};
+ @addtogroup RealFFT
+ @{
+ */
/**
-* @} end of RealFFT_Table group
-*/
+ @brief Initialization function for the Q31 RFFT/RIFFT.
+ @param[in,out] S points to an instance of the Q31 RFFT/RIFFT structure
+ @param[in] fftLenReal length of the FFT
+ @param[in] ifftFlagR flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLenReal is not a supported length
-/**
-* @addtogroup RealFFT
-* @{
-*/
-
-/**
-* @brief Initialization function for the Q31 RFFT/RIFFT.
-* @param[in, out] *S points to an instance of the Q31 RFFT/RIFFT structure.
-* @param[in] fftLenReal length of the FFT.
-* @param[in] ifftFlagR flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform.
-* @param[in] bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
-* @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if fftLenReal is not a supported value.
-*
-* \par Description:
-* \par
-* The parameter fftLenReal Specifies length of RFFT/RIFFT Process. Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192.
-* \par
-* The parameter ifftFlagR controls whether a forward or inverse transform is computed.
-* Set(=1) ifftFlagR to calculate RIFFT, otherwise RFFT is calculated.
-* \par
-* The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
-* Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
-* \par 7
-* This function also initializes Twiddle factor table.
+ @par Details
+ The parameter fftLenReal specifies length of RFFT/RIFFT Process.
+ Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192.
+ @par
+ The parameter ifftFlagR controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlagR to calculate RIFFT, otherwise RFFT is calculated.
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+ @par
+ This function also initializes Twiddle factor table.
*/
arm_status arm_rfft_init_q31(
@@ -4229,42 +91,60 @@ arm_status arm_rfft_init_q31(
/* Initialization of coef modifier depending on the FFT length */
switch (S->fftLenReal)
{
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
case 8192U:
S->twidCoefRModifier = 1U;
S->pCfft = &arm_cfft_sR_q31_len4096;
break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
case 4096U:
S->twidCoefRModifier = 2U;
S->pCfft = &arm_cfft_sR_q31_len2048;
break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
case 2048U:
S->twidCoefRModifier = 4U;
S->pCfft = &arm_cfft_sR_q31_len1024;
break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
case 1024U:
S->twidCoefRModifier = 8U;
S->pCfft = &arm_cfft_sR_q31_len512;
break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
case 512U:
S->twidCoefRModifier = 16U;
S->pCfft = &arm_cfft_sR_q31_len256;
break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
case 256U:
S->twidCoefRModifier = 32U;
S->pCfft = &arm_cfft_sR_q31_len128;
break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
case 128U:
S->twidCoefRModifier = 64U;
S->pCfft = &arm_cfft_sR_q31_len64;
break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
case 64U:
S->twidCoefRModifier = 128U;
S->pCfft = &arm_cfft_sR_q31_len32;
break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
case 32U:
S->twidCoefRModifier = 256U;
S->pCfft = &arm_cfft_sR_q31_len16;
break;
+#endif
default:
/* Reporting argument error if rfftSize is not valid value */
status = ARM_MATH_ARGUMENT_ERROR;
@@ -4276,5 +156,5 @@ arm_status arm_rfft_init_q31(
}
/**
-* @} end of RealFFT group
-*/
+ @} end of RealFFT group
+ */
diff --git a/DSP/Source/TransformFunctions/arm_rfft_q15.c b/DSP/Source/TransformFunctions/arm_rfft_q15.c
index f85cf30..fdc9bab 100644
--- a/DSP/Source/TransformFunctions/arm_rfft_q15.c
+++ b/DSP/Source/TransformFunctions/arm_rfft_q15.c
@@ -3,13 +3,13 @@
* Title: arm_rfft_q15.c
* Description: RFFT & RIFFT Q15 process function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -33,173 +33,161 @@
* -------------------------------------------------------------------- */
void arm_split_rfft_q15(
- q15_t * pSrc,
- uint32_t fftLen,
- q15_t * pATable,
- q15_t * pBTable,
- q15_t * pDst,
- uint32_t modifier);
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pATable,
+ const q15_t * pBTable,
+ q15_t * pDst,
+ uint32_t modifier);
void arm_split_rifft_q15(
- q15_t * pSrc,
- uint32_t fftLen,
- q15_t * pATable,
- q15_t * pBTable,
- q15_t * pDst,
- uint32_t modifier);
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pATable,
+ const q15_t * pBTable,
+ q15_t * pDst,
+ uint32_t modifier);
/**
-* @addtogroup RealFFT
-* @{
-*/
+ @addtogroup RealFFT
+ @{
+ */
/**
-* @brief Processing function for the Q15 RFFT/RIFFT.
-* @param[in] *S points to an instance of the Q15 RFFT/RIFFT structure.
-* @param[in] *pSrc points to the input buffer.
-* @param[out] *pDst points to the output buffer.
-* @return none.
-*
-* \par Input an output formats:
-* \par
-* Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
-* Hence the output format is different for different RFFT sizes.
-* The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT:
-* \par
-* \image html RFFTQ15.gif "Input and Output Formats for Q15 RFFT"
-* \par
-* \image html RIFFTQ15.gif "Input and Output Formats for Q15 RIFFT"
-*/
+ @brief Processing function for the Q15 RFFT/RIFFT.
+ @param[in] S points to an instance of the Q15 RFFT/RIFFT structure
+ @param[in] pSrc points to input buffer
+ @param[out] pDst points to output buffer
+ @return none
+
+ @par Input an output formats
+ Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
+ Hence the output format is different for different RFFT sizes.
+ The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT:
+ @par
+ \image html RFFTQ15.gif "Input and Output Formats for Q15 RFFT"
+ @par
+ \image html RIFFTQ15.gif "Input and Output Formats for Q15 RIFFT"
+ */
void arm_rfft_q15(
- const arm_rfft_instance_q15 * S,
- q15_t * pSrc,
- q15_t * pDst)
+ const arm_rfft_instance_q15 * S,
+ q15_t * pSrc,
+ q15_t * pDst)
{
- const arm_cfft_instance_q15 *S_CFFT = S->pCfft;
- uint32_t i;
- uint32_t L2 = S->fftLenReal >> 1;
+ const arm_cfft_instance_q15 *S_CFFT = S->pCfft;
+ uint32_t L2 = S->fftLenReal >> 1U;
+ uint32_t i;
- /* Calculation of RIFFT of input */
- if (S->ifftFlagR == 1U)
- {
- /* Real IFFT core process */
- arm_split_rifft_q15(pSrc, L2, S->pTwiddleAReal,
- S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+ /* Calculation of RIFFT of input */
+ if (S->ifftFlagR == 1U)
+ {
+ /* Real IFFT core process */
+ arm_split_rifft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
- /* Complex IFFT process */
- arm_cfft_q15(S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
+ /* Complex IFFT process */
+ arm_cfft_q15 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
- for(i=0;ifftLenReal;i++)
- {
- pDst[i] = pDst[i] << 1;
- }
- }
- else
- {
- /* Calculation of RFFT of input */
+ for(i = 0; i < S->fftLenReal; i++)
+ {
+ pDst[i] = pDst[i] << 1U;
+ }
+ }
+ else
+ {
+ /* Calculation of RFFT of input */
- /* Complex FFT process */
- arm_cfft_q15(S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR);
+ /* Complex FFT process */
+ arm_cfft_q15 (S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR);
+
+ /* Real FFT core process */
+ arm_split_rfft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+ }
- /* Real FFT core process */
- arm_split_rfft_q15(pSrc, L2, S->pTwiddleAReal,
- S->pTwiddleBReal, pDst, S->twidCoefRModifier);
- }
}
/**
-* @} end of RealFFT group
-*/
+ @} end of RealFFT group
+ */
/**
-* @brief Core Real FFT process
-* @param *pSrc points to the input buffer.
-* @param fftLen length of FFT.
-* @param *pATable points to the A twiddle Coef buffer.
-* @param *pBTable points to the B twiddle Coef buffer.
-* @param *pDst points to the output buffer.
-* @param modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
-* @return none.
-* The function implements a Real FFT
-*/
+ @brief Core Real FFT process
+ @param[in] pSrc points to input buffer
+ @param[in] fftLen length of FFT
+ @param[in] pATable points to twiddle Coef A buffer
+ @param[in] pBTable points to twiddle Coef B buffer
+ @param[out] pDst points to output buffer
+ @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+ @return none
+
+ @par
+ The function implements a Real FFT
+ */
void arm_split_rfft_q15(
- q15_t * pSrc,
- uint32_t fftLen,
- q15_t * pATable,
- q15_t * pBTable,
- q15_t * pDst,
- uint32_t modifier)
-{
- uint32_t i; /* Loop Counter */
- q31_t outR, outI; /* Temporary variables for output */
- q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
- q15_t *pSrc1, *pSrc2;
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pATable,
+ const q15_t * pBTable,
+ q15_t * pDst,
+ uint32_t modifier)
+{
+ uint32_t i; /* Loop Counter */
+ q31_t outR, outI; /* Temporary variables for output */
+ const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
+ q15_t *pSrc1, *pSrc2;
#if defined (ARM_MATH_DSP)
- q15_t *pD1, *pD2;
+ q15_t *pD1, *pD2;
#endif
- // pSrc[2U * fftLen] = pSrc[0];
- // pSrc[(2U * fftLen) + 1U] = pSrc[1];
+ /* Init coefficient pointers */
+ pCoefA = &pATable[modifier * 2];
+ pCoefB = &pBTable[modifier * 2];
- pCoefA = &pATable[modifier * 2U];
- pCoefB = &pBTable[modifier * 2U];
-
- pSrc1 = &pSrc[2];
- pSrc2 = &pSrc[(2U * fftLen) - 2U];
+ pSrc1 = &pSrc[2];
+ pSrc2 = &pSrc[(2U * fftLen) - 2U];
#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
i = 1U;
pD1 = pDst + 2;
pD2 = pDst + (4U * fftLen) - 2;
- for(i = fftLen - 1; i > 0; i--)
+ for (i = fftLen - 1; i > 0; i--)
{
/*
- outR = (pSrc[2 * i] * pATable[2 * i] - pSrc[2 * i + 1] * pATable[2 * i + 1]
- + pSrc[2 * n - 2 * i] * pBTable[2 * i] +
- pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
- */
+ outR = ( pSrc[2 * i] * pATable[2 * i]
+ - pSrc[2 * i + 1] * pATable[2 * i + 1]
+ + pSrc[2 * n - 2 * i] * pBTable[2 * i]
+ + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
- /* outI = (pIn[2 * i + 1] * pATable[2 * i] + pIn[2 * i] * pATable[2 * i + 1] +
- pIn[2 * n - 2 * i] * pBTable[2 * i + 1] -
- pIn[2 * n - 2 * i + 1] * pBTable[2 * i]); */
+ outI = ( pIn[2 * i + 1] * pATable[2 * i]
+ + pIn[2 * i] * pATable[2 * i + 1]
+ + pIn[2 * n - 2 * i] * pBTable[2 * i + 1]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i])
+ */
#ifndef ARM_MATH_BIG_ENDIAN
-
/* pSrc[2 * i] * pATable[2 * i] - pSrc[2 * i + 1] * pATable[2 * i + 1] */
- outR = __SMUSD(*__SIMD32(pSrc1), *__SIMD32(pCoefA));
-
+ outR = __SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA));
#else
-
/* -(pSrc[2 * i + 1] * pATable[2 * i + 1] - pSrc[2 * i] * pATable[2 * i]) */
- outR = -(__SMUSD(*__SIMD32(pSrc1), *__SIMD32(pCoefA)));
+ outR = -(__SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA)));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* pSrc[2 * n - 2 * i] * pBTable[2 * i] +
- pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
- outR = __SMLAD(*__SIMD32(pSrc2), *__SIMD32(pCoefB), outR) >> 16U;
-
- /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] -
- pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
+ /* pSrc[2 * n - 2 * i] * pBTable[2 * i] + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
+ outR = __SMLAD(read_q15x2 (pSrc2), read_q15x2((q15_t *) pCoefB), outR) >> 16U;
+ /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] - pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
#ifndef ARM_MATH_BIG_ENDIAN
-
- outI = __SMUSDX(*__SIMD32(pSrc2)--, *__SIMD32(pCoefB));
-
+ outI = __SMUSDX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB));
#else
-
- outI = __SMUSDX(*__SIMD32(pCoefB), *__SIMD32(pSrc2)--);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ outI = __SMUSDX(read_q15x2 ((q15_t *) pCoefB), read_q15x2_da (&pSrc2));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* (pIn[2 * i + 1] * pATable[2 * i] + pIn[2 * i] * pATable[2 * i + 1] */
- outI = __SMLADX(*__SIMD32(pSrc1)++, *__SIMD32(pCoefA), outI);
+ outI = __SMLADX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), outI);
/* write output */
*pD1++ = (q15_t) outR;
@@ -215,23 +203,23 @@ void arm_split_rfft_q15(
pCoefA = pCoefA + (2U * modifier);
}
- pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1;
- pDst[(2U * fftLen) + 1U] = 0;
+ pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
+ pDst[2U * fftLen + 1U] = 0;
- pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
+ pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
pDst[1] = 0;
#else
- /* Run the below code for Cortex-M0 */
i = 1U;
while (i < fftLen)
{
/*
- outR = (pSrc[2 * i] * pATable[2 * i] - pSrc[2 * i + 1] * pATable[2 * i + 1]
- + pSrc[2 * n - 2 * i] * pBTable[2 * i] +
- pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+ outR = ( pSrc[2 * i] * pATable[2 * i]
+ - pSrc[2 * i + 1] * pATable[2 * i + 1]
+ + pSrc[2 * n - 2 * i] * pBTable[2 * i]
+ + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
*/
outR = *pSrc1 * *pCoefA;
@@ -239,10 +227,11 @@ void arm_split_rfft_q15(
outR = outR + (*pSrc2 * *pCoefB);
outR = (outR + (*(pSrc2 + 1) * *(pCoefB + 1))) >> 16;
-
- /* outI = (pIn[2 * i + 1] * pATable[2 * i] + pIn[2 * i] * pATable[2 * i + 1] +
- pIn[2 * n - 2 * i] * pBTable[2 * i + 1] -
- pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
+ /*
+ outI = ( pIn[2 * i + 1] * pATable[2 * i]
+ + pIn[2 * i] * pATable[2 * i + 1]
+ + pIn[2 * n - 2 * i] * pBTable[2 * i + 1]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
*/
outI = *pSrc2 * *(pCoefB + 1);
@@ -256,7 +245,7 @@ void arm_split_rfft_q15(
/* write output */
pDst[2U * i] = (q15_t) outR;
- pDst[(2U * i) + 1U] = outI >> 16U;
+ pDst[2U * i + 1U] = outI >> 16U;
/* write complex conjugate output */
pDst[(4U * fftLen) - (2U * i)] = (q15_t) outR;
@@ -270,7 +259,7 @@ void arm_split_rfft_q15(
}
pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1;
- pDst[(2U * fftLen) + 1U] = 0;
+ pDst[2U * fftLen + 1U] = 0;
pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
pDst[1] = 0;
@@ -280,147 +269,112 @@ void arm_split_rfft_q15(
/**
-* @brief Core Real IFFT process
-* @param[in] *pSrc points to the input buffer.
-* @param[in] fftLen length of FFT.
-* @param[in] *pATable points to the twiddle Coef A buffer.
-* @param[in] *pBTable points to the twiddle Coef B buffer.
-* @param[out] *pDst points to the output buffer.
-* @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
-* @return none.
-* The function implements a Real IFFT
-*/
+ @brief Core Real IFFT process
+ @param[in] pSrc points to input buffer
+ @param[in] fftLen length of FFT
+ @param[in] pATable points to twiddle Coef A buffer
+ @param[in] pBTable points to twiddle Coef B buffer
+ @param[out] pDst points to output buffer
+ @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+ @return none
+
+ @par
+ The function implements a Real IFFT
+ */
+
void arm_split_rifft_q15(
- q15_t * pSrc,
- uint32_t fftLen,
- q15_t * pATable,
- q15_t * pBTable,
- q15_t * pDst,
- uint32_t modifier)
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pATable,
+ const q15_t * pBTable,
+ q15_t * pDst,
+ uint32_t modifier)
{
- uint32_t i; /* Loop Counter */
- q31_t outR, outI; /* Temporary variables for output */
- q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
- q15_t *pSrc1, *pSrc2;
- q15_t *pDst1 = &pDst[0];
+ uint32_t i; /* Loop Counter */
+ q31_t outR, outI; /* Temporary variables for output */
+ const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
+ q15_t *pSrc1, *pSrc2;
+ q15_t *pDst1 = &pDst[0];
- pCoefA = &pATable[0];
- pCoefB = &pBTable[0];
+ pCoefA = &pATable[0];
+ pCoefB = &pBTable[0];
- pSrc1 = &pSrc[0];
- pSrc2 = &pSrc[2U * fftLen];
+ pSrc1 = &pSrc[0];
+ pSrc2 = &pSrc[2 * fftLen];
+
+ i = fftLen;
+ while (i > 0U)
+ {
+ /*
+ outR = ( pIn[2 * i] * pATable[2 * i]
+ + pIn[2 * i + 1] * pATable[2 * i + 1]
+ + pIn[2 * n - 2 * i] * pBTable[2 * i]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+
+ outI = ( pIn[2 * i + 1] * pATable[2 * i]
+ - pIn[2 * i] * pATable[2 * i + 1]
+ - pIn[2 * n - 2 * i] * pBTable[2 * i + 1]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
+ */
#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- i = fftLen;
-
- while (i > 0U)
- {
- /*
- outR = (pIn[2 * i] * pATable[2 * i] + pIn[2 * i + 1] * pATable[2 * i + 1] +
- pIn[2 * n - 2 * i] * pBTable[2 * i] -
- pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
-
- outI = (pIn[2 * i + 1] * pATable[2 * i] - pIn[2 * i] * pATable[2 * i + 1] -
- pIn[2 * n - 2 * i] * pBTable[2 * i + 1] -
- pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
- */
-
-
#ifndef ARM_MATH_BIG_ENDIAN
-
- /* pIn[2 * n - 2 * i] * pBTable[2 * i] -
- pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
- outR = __SMUSD(*__SIMD32(pSrc2), *__SIMD32(pCoefB));
-
+ /* pIn[2 * n - 2 * i] * pBTable[2 * i] - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
+ outR = __SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB));
#else
+ /* -(-pIn[2 * n - 2 * i] * pBTable[2 * i] + pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1])) */
+ outR = -(__SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB)));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
- /* -(-pIn[2 * n - 2 * i] * pBTable[2 * i] +
- pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1])) */
- outR = -(__SMUSD(*__SIMD32(pSrc2), *__SIMD32(pCoefB)));
+ /* pIn[2 * i] * pATable[2 * i] + pIn[2 * i + 1] * pATable[2 * i + 1] + pIn[2 * n - 2 * i] * pBTable[2 * i] */
+ outR = __SMLAD(read_q15x2(pSrc1), read_q15x2 ((q15_t *) pCoefA), outR) >> 16U;
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* pIn[2 * i] * pATable[2 * i] + pIn[2 * i + 1] * pATable[2 * i + 1] +
- pIn[2 * n - 2 * i] * pBTable[2 * i] */
- outR = __SMLAD(*__SIMD32(pSrc1), *__SIMD32(pCoefA), outR) >> 16U;
-
- /*
- -pIn[2 * n - 2 * i] * pBTable[2 * i + 1] +
- pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
- outI = __SMUADX(*__SIMD32(pSrc2)--, *__SIMD32(pCoefB));
-
- /* pIn[2 * i + 1] * pATable[2 * i] - pIn[2 * i] * pATable[2 * i + 1] */
+ /* -pIn[2 * n - 2 * i] * pBTable[2 * i + 1] + pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
+ outI = __SMUADX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB));
+ /* pIn[2 * i + 1] * pATable[2 * i] - pIn[2 * i] * pATable[2 * i + 1] */
#ifndef ARM_MATH_BIG_ENDIAN
-
- outI = __SMLSDX(*__SIMD32(pCoefA), *__SIMD32(pSrc1)++, -outI);
-
+ outI = __SMLSDX(read_q15x2 ((q15_t *) pCoefA), read_q15x2_ia (&pSrc1), -outI);
#else
+ outI = __SMLSDX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), -outI);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
- outI = __SMLSDX(*__SIMD32(pSrc1)++, *__SIMD32(pCoefA), -outI);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
- /* write output */
-
+ /* write output */
#ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pDst1)++ = __PKHBT(outR, (outI >> 16U), 16);
-
+ write_q15x2_ia (&pDst1, __PKHBT(outR, (outI >> 16U), 16));
#else
+ write_q15x2_ia (&pDst1, __PKHBT((outI >> 16U), outR, 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
- *__SIMD32(pDst1)++ = __PKHBT((outI >> 16U), outR, 16);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+#else /* #if defined (ARM_MATH_DSP) */
- /* update coefficient pointer */
- pCoefB = pCoefB + (2U * modifier);
- pCoefA = pCoefA + (2U * modifier);
+ outR = *pSrc2 * *pCoefB;
+ outR = outR - (*(pSrc2 + 1) * *(pCoefB + 1));
+ outR = outR + (*pSrc1 * *pCoefA);
+ outR = (outR + (*(pSrc1 + 1) * *(pCoefA + 1))) >> 16;
- i--;
- }
-#else
- /* Run the below code for Cortex-M0 */
- i = fftLen;
+ outI = *(pSrc1 + 1) * *pCoefA;
+ outI = outI - (*pSrc1 * *(pCoefA + 1));
+ outI = outI - (*pSrc2 * *(pCoefB + 1));
+ outI = outI - (*(pSrc2 + 1) * *(pCoefB));
- while (i > 0U)
- {
- /*
- outR = (pIn[2 * i] * pATable[2 * i] + pIn[2 * i + 1] * pATable[2 * i + 1] +
- pIn[2 * n - 2 * i] * pBTable[2 * i] -
- pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
- */
+ /* update input pointers */
+ pSrc1 += 2U;
+ pSrc2 -= 2U;
- outR = *pSrc2 * *pCoefB;
- outR = outR - (*(pSrc2 + 1) * *(pCoefB + 1));
- outR = outR + (*pSrc1 * *pCoefA);
- outR = (outR + (*(pSrc1 + 1) * *(pCoefA + 1))) >> 16;
+ /* write output */
+ *pDst1++ = (q15_t) outR;
+ *pDst1++ = (q15_t) (outI >> 16);
- /*
- outI = (pIn[2 * i + 1] * pATable[2 * i] - pIn[2 * i] * pATable[2 * i + 1] -
- pIn[2 * n - 2 * i] * pBTable[2 * i + 1] -
- pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
- */
-
- outI = *(pSrc1 + 1) * *pCoefA;
- outI = outI - (*pSrc1 * *(pCoefA + 1));
- outI = outI - (*pSrc2 * *(pCoefB + 1));
- outI = outI - (*(pSrc2 + 1) * *(pCoefB));
-
- /* update input pointers */
- pSrc1 += 2U;
- pSrc2 -= 2U;
-
- /* write output */
- *pDst1++ = (q15_t) outR;
- *pDst1++ = (q15_t) (outI >> 16);
-
- /* update coefficient pointer */
- pCoefB = pCoefB + (2U * modifier);
- pCoefA = pCoefA + (2U * modifier);
-
- i--;
- }
#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* update coefficient pointer */
+ pCoefB = pCoefB + (2 * modifier);
+ pCoefA = pCoefA + (2 * modifier);
+
+ i--;
+ }
+
}
diff --git a/DSP/Source/TransformFunctions/arm_rfft_q31.c b/DSP/Source/TransformFunctions/arm_rfft_q31.c
index 5386140..d16600d 100644
--- a/DSP/Source/TransformFunctions/arm_rfft_q31.c
+++ b/DSP/Source/TransformFunctions/arm_rfft_q31.c
@@ -3,13 +3,13 @@
* Title: arm_rfft_q31.c
* Description: FFT & RIFFT Q31 process function
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -33,251 +33,260 @@
* -------------------------------------------------------------------- */
void arm_split_rfft_q31(
- q31_t * pSrc,
- uint32_t fftLen,
- q31_t * pATable,
- q31_t * pBTable,
- q31_t * pDst,
- uint32_t modifier);
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pATable,
+ const q31_t * pBTable,
+ q31_t * pDst,
+ uint32_t modifier);
void arm_split_rifft_q31(
- q31_t * pSrc,
- uint32_t fftLen,
- q31_t * pATable,
- q31_t * pBTable,
- q31_t * pDst,
- uint32_t modifier);
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pATable,
+ const q31_t * pBTable,
+ q31_t * pDst,
+ uint32_t modifier);
/**
-* @addtogroup RealFFT
-* @{
-*/
+ @addtogroup RealFFT
+ @{
+ */
/**
-* @brief Processing function for the Q31 RFFT/RIFFT.
-* @param[in] *S points to an instance of the Q31 RFFT/RIFFT structure.
-* @param[in] *pSrc points to the input buffer.
-* @param[out] *pDst points to the output buffer.
-* @return none.
-*
-* \par Input an output formats:
-* \par
-* Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
-* Hence the output format is different for different RFFT sizes.
-* The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT:
-* \par
-* \image html RFFTQ31.gif "Input and Output Formats for Q31 RFFT"
-*
-* \par
-* \image html RIFFTQ31.gif "Input and Output Formats for Q31 RIFFT"
-*/
+ @brief Processing function for the Q31 RFFT/RIFFT.
+ @param[in] S points to an instance of the Q31 RFFT/RIFFT structure
+ @param[in] pSrc points to input buffer
+ @param[out] pDst points to output buffer
+ @return none
+
+ @par Input an output formats
+ Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
+ Hence the output format is different for different RFFT sizes.
+ The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT:
+ @par
+ \image html RFFTQ31.gif "Input and Output Formats for Q31 RFFT"
+ @par
+ \image html RIFFTQ31.gif "Input and Output Formats for Q31 RIFFT"
+ */
+
void arm_rfft_q31(
- const arm_rfft_instance_q31 * S,
- q31_t * pSrc,
- q31_t * pDst)
+ const arm_rfft_instance_q31 * S,
+ q31_t * pSrc,
+ q31_t * pDst)
{
- const arm_cfft_instance_q31 *S_CFFT = S->pCfft;
- uint32_t i;
- uint32_t L2 = S->fftLenReal >> 1;
+ const arm_cfft_instance_q31 *S_CFFT = S->pCfft;
+ uint32_t L2 = S->fftLenReal >> 1U;
+ uint32_t i;
- /* Calculation of RIFFT of input */
- if (S->ifftFlagR == 1U)
- {
- /* Real IFFT core process */
- arm_split_rifft_q31(pSrc, L2, S->pTwiddleAReal,
- S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+ /* Calculation of RIFFT of input */
+ if (S->ifftFlagR == 1U)
+ {
+ /* Real IFFT core process */
+ arm_split_rifft_q31 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
- /* Complex IFFT process */
- arm_cfft_q31(S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
+ /* Complex IFFT process */
+ arm_cfft_q31 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
- for(i=0;ifftLenReal;i++)
- {
- pDst[i] = pDst[i] << 1;
- }
- }
- else
- {
- /* Calculation of RFFT of input */
+ for(i = 0; i < S->fftLenReal; i++)
+ {
+ pDst[i] = pDst[i] << 1U;
+ }
+ }
+ else
+ {
+ /* Calculation of RFFT of input */
- /* Complex FFT process */
- arm_cfft_q31(S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR);
+ /* Complex FFT process */
+ arm_cfft_q31 (S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR);
+
+ /* Real FFT core process */
+ arm_split_rfft_q31 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+ }
- /* Real FFT core process */
- arm_split_rfft_q31(pSrc, L2, S->pTwiddleAReal,
- S->pTwiddleBReal, pDst, S->twidCoefRModifier);
- }
}
/**
-* @} end of RealFFT group
-*/
+ @} end of RealFFT group
+ */
/**
-* @brief Core Real FFT process
-* @param[in] *pSrc points to the input buffer.
-* @param[in] fftLen length of FFT.
-* @param[in] *pATable points to the twiddle Coef A buffer.
-* @param[in] *pBTable points to the twiddle Coef B buffer.
-* @param[out] *pDst points to the output buffer.
-* @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
-* @return none.
-*/
+ @brief Core Real FFT process
+ @param[in] pSrc points to input buffer
+ @param[in] fftLen length of FFT
+ @param[in] pATable points to twiddle Coef A buffer
+ @param[in] pBTable points to twiddle Coef B buffer
+ @param[out] pDst points to output buffer
+ @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+ @return none
+ */
+
void arm_split_rfft_q31(
- q31_t * pSrc,
- uint32_t fftLen,
- q31_t * pATable,
- q31_t * pBTable,
- q31_t * pDst,
- uint32_t modifier)
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pATable,
+ const q31_t * pBTable,
+ q31_t * pDst,
+ uint32_t modifier)
{
- uint32_t i; /* Loop Counter */
- q31_t outR, outI; /* Temporary variables for output */
- q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
- q31_t CoefA1, CoefA2, CoefB1; /* Temporary variables for twiddle coefficients */
- q31_t *pOut1 = &pDst[2], *pOut2 = &pDst[(4U * fftLen) - 1U];
- q31_t *pIn1 = &pSrc[2], *pIn2 = &pSrc[(2U * fftLen) - 1U];
+ uint32_t i; /* Loop Counter */
+ q31_t outR, outI; /* Temporary variables for output */
+ const q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
+ q31_t CoefA1, CoefA2, CoefB1; /* Temporary variables for twiddle coefficients */
+ q31_t *pOut1 = &pDst[2], *pOut2 = &pDst[4 * fftLen - 1];
+ q31_t *pIn1 = &pSrc[2], *pIn2 = &pSrc[2 * fftLen - 1];
- /* Init coefficient pointers */
- pCoefA = &pATable[modifier * 2U];
- pCoefB = &pBTable[modifier * 2U];
+ /* Init coefficient pointers */
+ pCoefA = &pATable[modifier * 2];
+ pCoefB = &pBTable[modifier * 2];
- i = fftLen - 1U;
+ i = fftLen - 1U;
- while (i > 0U)
- {
- /*
- outR = (pSrc[2 * i] * pATable[2 * i] - pSrc[2 * i + 1] * pATable[2 * i + 1]
- + pSrc[2 * n - 2 * i] * pBTable[2 * i] +
- pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
- */
+ while (i > 0U)
+ {
+ /*
+ outR = ( pSrc[2 * i] * pATable[2 * i]
+ - pSrc[2 * i + 1] * pATable[2 * i + 1]
+ + pSrc[2 * n - 2 * i] * pBTable[2 * i]
+ + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
- /* outI = (pIn[2 * i + 1] * pATable[2 * i] + pIn[2 * i] * pATable[2 * i + 1] +
- pIn[2 * n - 2 * i] * pBTable[2 * i + 1] -
- pIn[2 * n - 2 * i + 1] * pBTable[2 * i]); */
+ outI = ( pIn[2 * i + 1] * pATable[2 * i]
+ + pIn[2 * i] * pATable[2 * i + 1]
+ + pIn[2 * n - 2 * i] * pBTable[2 * i + 1]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
+ */
- CoefA1 = *pCoefA++;
- CoefA2 = *pCoefA;
+ CoefA1 = *pCoefA++;
+ CoefA2 = *pCoefA;
- /* outR = (pSrc[2 * i] * pATable[2 * i] */
- mult_32x32_keep32_R(outR, *pIn1, CoefA1);
+ /* outR = (pSrc[2 * i] * pATable[2 * i] */
+ mult_32x32_keep32_R (outR, *pIn1, CoefA1);
- /* outI = pIn[2 * i] * pATable[2 * i + 1] */
- mult_32x32_keep32_R(outI, *pIn1++, CoefA2);
+ /* outI = pIn[2 * i] * pATable[2 * i + 1] */
+ mult_32x32_keep32_R (outI, *pIn1++, CoefA2);
- /* - pSrc[2 * i + 1] * pATable[2 * i + 1] */
- multSub_32x32_keep32_R(outR, *pIn1, CoefA2);
+ /* - pSrc[2 * i + 1] * pATable[2 * i + 1] */
+ multSub_32x32_keep32_R (outR, *pIn1, CoefA2);
- /* (pIn[2 * i + 1] * pATable[2 * i] */
- multAcc_32x32_keep32_R(outI, *pIn1++, CoefA1);
+ /* (pIn[2 * i + 1] * pATable[2 * i] */
+ multAcc_32x32_keep32_R (outI, *pIn1++, CoefA1);
- /* pSrc[2 * n - 2 * i] * pBTable[2 * i] */
- multSub_32x32_keep32_R(outR, *pIn2, CoefA2);
- CoefB1 = *pCoefB;
+ /* pSrc[2 * n - 2 * i] * pBTable[2 * i] */
+ multSub_32x32_keep32_R (outR, *pIn2, CoefA2);
+ CoefB1 = *pCoefB;
- /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] */
- multSub_32x32_keep32_R(outI, *pIn2--, CoefB1);
+ /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] */
+ multSub_32x32_keep32_R (outI, *pIn2--, CoefB1);
- /* pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */
- multAcc_32x32_keep32_R(outR, *pIn2, CoefB1);
+ /* pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */
+ multAcc_32x32_keep32_R (outR, *pIn2, CoefB1);
- /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
- multSub_32x32_keep32_R(outI, *pIn2--, CoefA2);
+ /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
+ multSub_32x32_keep32_R (outI, *pIn2--, CoefA2);
- /* write output */
- *pOut1++ = outR;
- *pOut1++ = outI;
+ /* write output */
+ *pOut1++ = outR;
+ *pOut1++ = outI;
- /* write complex conjugate output */
- *pOut2-- = -outI;
- *pOut2-- = outR;
+ /* write complex conjugate output */
+ *pOut2-- = -outI;
+ *pOut2-- = outR;
- /* update coefficient pointer */
- pCoefB = pCoefB + (modifier * 2U);
- pCoefA = pCoefA + ((modifier * 2U) - 1U);
+ /* update coefficient pointer */
+ pCoefB = pCoefB + (2 * modifier);
+ pCoefA = pCoefA + (2 * modifier - 1);
- i--;
- }
- pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1;
- pDst[(2U * fftLen) + 1U] = 0;
+ /* Decrement loop count */
+ i--;
+ }
- pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
- pDst[1] = 0;
+ pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
+ pDst[2 * fftLen + 1] = 0;
+
+ pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
+ pDst[1] = 0;
}
+
/**
-* @brief Core Real IFFT process
-* @param[in] *pSrc points to the input buffer.
-* @param[in] fftLen length of FFT.
-* @param[in] *pATable points to the twiddle Coef A buffer.
-* @param[in] *pBTable points to the twiddle Coef B buffer.
-* @param[out] *pDst points to the output buffer.
-* @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
-* @return none.
-*/
+ @brief Core Real IFFT process
+ @param[in] pSrc points to input buffer
+ @param[in] fftLen length of FFT
+ @param[in] pATable points to twiddle Coef A buffer
+ @param[in] pBTable points to twiddle Coef B buffer
+ @param[out] pDst points to output buffer
+ @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+ @return none
+ */
+
void arm_split_rifft_q31(
- q31_t * pSrc,
- uint32_t fftLen,
- q31_t * pATable,
- q31_t * pBTable,
- q31_t * pDst,
- uint32_t modifier)
-{
- q31_t outR, outI; /* Temporary variables for output */
- q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
- q31_t CoefA1, CoefA2, CoefB1; /* Temporary variables for twiddle coefficients */
- q31_t *pIn1 = &pSrc[0], *pIn2 = &pSrc[(2U * fftLen) + 1U];
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pATable,
+ const q31_t * pBTable,
+ q31_t * pDst,
+ uint32_t modifier)
+{
+ q31_t outR, outI; /* Temporary variables for output */
+ const q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
+ q31_t CoefA1, CoefA2, CoefB1; /* Temporary variables for twiddle coefficients */
+ q31_t *pIn1 = &pSrc[0], *pIn2 = &pSrc[2 * fftLen + 1];
- pCoefA = &pATable[0];
- pCoefB = &pBTable[0];
+ pCoefA = &pATable[0];
+ pCoefB = &pBTable[0];
- while (fftLen > 0U)
- {
- /*
- outR = (pIn[2 * i] * pATable[2 * i] + pIn[2 * i + 1] * pATable[2 * i + 1] +
- pIn[2 * n - 2 * i] * pBTable[2 * i] -
- pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+ while (fftLen > 0U)
+ {
+ /*
+ outR = ( pIn[2 * i] * pATable[2 * i]
+ + pIn[2 * i + 1] * pATable[2 * i + 1]
+ + pIn[2 * n - 2 * i] * pBTable[2 * i]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
- outI = (pIn[2 * i + 1] * pATable[2 * i] - pIn[2 * i] * pATable[2 * i + 1] -
- pIn[2 * n - 2 * i] * pBTable[2 * i + 1] -
- pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
- */
- CoefA1 = *pCoefA++;
- CoefA2 = *pCoefA;
+ outI = ( pIn[2 * i + 1] * pATable[2 * i]
+ - pIn[2 * i] * pATable[2 * i + 1]
+ - pIn[2 * n - 2 * i] * pBTable[2 * i + 1]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
+ */
- /* outR = (pIn[2 * i] * pATable[2 * i] */
- mult_32x32_keep32_R(outR, *pIn1, CoefA1);
+ CoefA1 = *pCoefA++;
+ CoefA2 = *pCoefA;
- /* - pIn[2 * i] * pATable[2 * i + 1] */
- mult_32x32_keep32_R(outI, *pIn1++, -CoefA2);
+ /* outR = (pIn[2 * i] * pATable[2 * i] */
+ mult_32x32_keep32_R (outR, *pIn1, CoefA1);
- /* pIn[2 * i + 1] * pATable[2 * i + 1] */
- multAcc_32x32_keep32_R(outR, *pIn1, CoefA2);
+ /* - pIn[2 * i] * pATable[2 * i + 1] */
+ mult_32x32_keep32_R (outI, *pIn1++, -CoefA2);
- /* pIn[2 * i + 1] * pATable[2 * i] */
- multAcc_32x32_keep32_R(outI, *pIn1++, CoefA1);
+ /* pIn[2 * i + 1] * pATable[2 * i + 1] */
+ multAcc_32x32_keep32_R (outR, *pIn1, CoefA2);
- /* pIn[2 * n - 2 * i] * pBTable[2 * i] */
- multAcc_32x32_keep32_R(outR, *pIn2, CoefA2);
- CoefB1 = *pCoefB;
+ /* pIn[2 * i + 1] * pATable[2 * i] */
+ multAcc_32x32_keep32_R (outI, *pIn1++, CoefA1);
- /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] */
- multSub_32x32_keep32_R(outI, *pIn2--, CoefB1);
+ /* pIn[2 * n - 2 * i] * pBTable[2 * i] */
+ multAcc_32x32_keep32_R (outR, *pIn2, CoefA2);
+ CoefB1 = *pCoefB;
- /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */
- multAcc_32x32_keep32_R(outR, *pIn2, CoefB1);
+ /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] */
+ multSub_32x32_keep32_R (outI, *pIn2--, CoefB1);
- /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
- multAcc_32x32_keep32_R(outI, *pIn2--, CoefA2);
+ /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */
+ multAcc_32x32_keep32_R (outR, *pIn2, CoefB1);
- /* write output */
- *pDst++ = outR;
- *pDst++ = outI;
+ /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
+ multAcc_32x32_keep32_R (outI, *pIn2--, CoefA2);
- /* update coefficient pointer */
- pCoefB = pCoefB + (modifier * 2U);
- pCoefA = pCoefA + ((modifier * 2U) - 1U);
+ /* write output */
+ *pDst++ = outR;
+ *pDst++ = outI;
+
+ /* update coefficient pointer */
+ pCoefB = pCoefB + (modifier * 2);
+ pCoefA = pCoefA + (modifier * 2 - 1);
+
+ /* Decrement loop count */
+ fftLen--;
+ }
- /* Decrement loop count */
- fftLen--;
- }
}
diff --git a/DSP/armcc.cmake b/DSP/armcc.cmake
new file mode 100644
index 0000000..544a834
--- /dev/null
+++ b/DSP/armcc.cmake
@@ -0,0 +1,48 @@
+# Setting Linux is forcing th extension to be .o instead of .obj when building on WIndows.
+# It is important because armlink is failing when files have .obj extensions (error with
+# scatter file section not found)
+SET(CMAKE_SYSTEM_NAME Linux)
+SET(CMAKE_SYSTEM_PROCESSOR arm)
+
+
+
+SET(tools "C:/PROGRA~1/ARM/DEVELO~1.0/sw/ARMCOM~1.12")
+
+SET(CMAKE_C_COMPILER "${tools}/bin/armclang.exe")
+SET(CMAKE_CXX_COMPILER "${tools}/bin/armclang.exe")
+
+SET(CMAKE_AR "${tools}/bin/armar.exe")
+SET(CMAKE_CXX_COMPILER_AR "${tools}/bin/armar.exe")
+SET(CMAKE_C_COMPILER_AR "${tools}/bin/armar.exe")
+SET(CMAKE_LINKER "${tools}/bin/armlink.exe")
+SET(CMAKE_C_LINK_EXECUTABLE " -o ")
+set(CMAKE_C_RESPONSE_FILE_LINK_FLAG "--via ")
+SET(CMAKE_C_OUTPUT_EXTENSION .o)
+SET(CMAKE_CXX_OUTPUT_EXTENSION .o)
+SET(CMAKE_ASM_OUTPUT_EXTENSION .o)
+# When library defined as STATIC, this line is needed to describe how the .a file must be
+# create. Some changes to the line may be needed.
+SET(CMAKE_C_CREATE_STATIC_LIBRARY "${tools}/bin/armar.exe -r -s --create " )
+set(ARMAC6 ON)
+# default core
+
+if(NOT ARM_CPU)
+ set(
+ ARM_CPU "cortex-a5"
+ CACHE STRING "Set ARM CPU. Default : cortex-a5"
+ )
+endif(NOT ARM_CPU)
+
+SET(CMAKE_C_FLAGS "-mcpu=${ARM_CPU} --target=arm-arm-none-eabi" CACHE INTERNAL "C compiler common flags")
+SET(CMAKE_CXX_FLAGS "-mcpu=${ARM_CPU} --target=arm-arm-none-eabi" CACHE INTERNAL "C compiler common flags")
+SET(CMAKE_ASM_FLAGS "-g -x assembler-with-cpp -masm=auto -mcpu=${ARM_CPU} --target=arm-arm-none-eabi" CACHE INTERNAL "ASM compiler common flags")
+#SET(CMAKE_EXE_LINKER_FLAGS "-flto" CACHE INTERNAL "linker flags")
+
+# Where is the target environment
+SET(CMAKE_FIND_ROOT_PATH "${tools}")
+# Search for programs in the build host directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+# For libraries and headers in the target directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
diff --git a/DSP/config.cmake b/DSP/config.cmake
new file mode 100644
index 0000000..b6c84be
--- /dev/null
+++ b/DSP/config.cmake
@@ -0,0 +1,141 @@
+include(CMakePrintHelpers)
+cmake_policy(SET CMP0077 NEW)
+
+SET(CORTEXM ON)
+option(FASTMATHCOMPUTATIONS "Fast Math enabled" ON)
+option(NEON "Neon acceleration" OFF)
+option(NEONEXPERIMENTAL "Neon experimental acceleration" OFF)
+option(LOOPUNROLL "Loop unrolling" ON)
+option(ROUNDING "Rounding" OFF)
+option(MATRIXCHECK "Matrix Checks" OFF)
+
+###################
+#
+# ALL CORTEX
+#
+
+function(configdsp PROJECTNAME DSP)
+ target_compile_options(${PROJECTNAME} PUBLIC "-mfloat-abi=hard;-mlittle-endian")
+
+ if (CONFIGTABLE)
+ # Public because initialization for FFT may be defined in client code
+ # and needs access to the table.
+ target_compile_definitions(${PROJECTNAME} PUBLIC ARM_DSP_CONFIG_TABLES)
+ endif()
+
+ if (FASTMATHCOMPUTATIONS)
+ target_compile_options(${PROJECTNAME} PUBLIC "-ffast-math")
+ endif()
+
+ if (LOOPUNROLL)
+ target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_LOOPUNROLL)
+ endif()
+
+ if (ROUNDING)
+ target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_ROUNDING)
+ endif()
+
+ if (MATRIXCHECK)
+ target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_MATRIX_CHECK)
+ endif()
+
+
+ ###################
+ #
+ # CORTEX-A
+ #
+
+ # CORTEX-A9
+ if (ARM_CPU STREQUAL "cortex-a9" )
+ target_include_directories(${PROJECTNAME} PUBLIC "${DSP}/../../Core_A/Include")
+ SET(CORTEXM OFF)
+
+ if (NOT (NEON OR NEONEXPERIMENTAL))
+ target_compile_options(${PROJECTNAME} PUBLIC "-mfpu=vfpv3-d16-fp16")
+ endif()
+
+ endif()
+
+ # CORTEX-A7
+ if (ARM_CPU STREQUAL "cortex-a7" )
+ target_include_directories(${PROJECTNAME} PUBLIC "${DSP}/../../Core_A/Include")
+ SET(CORTEXM OFF)
+
+ if (NOT (NEON OR NEONEXPERIMENTAL))
+ target_compile_options(${PROJECTNAME} PUBLIC "-mfpu=vfpv4-d16")
+ endif()
+
+ endif()
+
+ # CORTEX-A5
+ if (ARM_CPU STREQUAL "cortex-a5" )
+ target_include_directories(${PROJECTNAME} PUBLIC "${DSP}/../../Core_A/Include")
+ SET(CORTEXM OFF)
+
+ if ((NEON OR NEONEXPERIMENTAL))
+ target_compile_options(${PROJECTNAME} PUBLIC "-mfpu=neon-vfpv4")
+ else()
+ target_compile_options(${PROJECTNAME} PUBLIC "-mfpu=vfpv4-d16")
+ endif()
+ endif()
+
+
+ ###################
+ #
+ # CORTEX-M
+ #
+
+ # CORTEX-M35
+ if (ARM_CPU STREQUAL "cortex-m35")
+ target_include_directories(${PROJECTNAME} PUBLIC "${DSP}/../../Core/Include")
+ endif()
+
+ # CORTEX-M33
+ if (ARM_CPU STREQUAL "cortex-m33")
+ target_include_directories(${PROJECTNAME} PUBLIC "${DSP}/../../Core/Include")
+ endif()
+
+ # CORTEX-M23
+ if (ARM_CPU STREQUAL "cortex-m23")
+ target_include_directories(${PROJECTNAME} PUBLIC "${DSP}/../../Core/Include")
+ endif()
+
+ # CORTEX-M7
+ if (ARM_CPU STREQUAL "cortex-m7")
+ target_include_directories(${PROJECTNAME} PUBLIC "${DSP}/../../Core/Include")
+ endif()
+
+ # CORTEX-M4
+ if (ARM_CPU STREQUAL "cortex-m4")
+ target_include_directories(${PROJECTNAME} PUBLIC "${DSP}/../../Core/Include")
+
+ endif()
+
+ # CORTEX-M3
+ if (ARM_CPU STREQUAL "cortex-m3")
+ target_include_directories(${PROJECTNAME} PUBLIC "${DSP}/../../Core/Include")
+ endif()
+
+ # CORTEX-M0plus
+ if (ARM_CPU STREQUAL "cortex-m0p")
+ target_include_directories(${PROJECTNAME} PUBLIC "${DSP}/../../Core/Include")
+ endif()
+
+ # CORTEX-M0
+ if (ARM_CPU STREQUAL "cortex-m0")
+ target_include_directories(${PROJECTNAME} PUBLIC "${DSP}/../../Core/Include")
+ endif()
+
+ ###################
+ #
+ # FEATURES
+ #
+
+ if (NEON AND NOT CORTEXM)
+ target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_NEON __FPU_PRESENT)
+ endif()
+
+ if (NEONEXPERIMENTAL AND NOT CORTEXM)
+ target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_NEON_EXPERIMENTAL __FPU_PRESENT)
+ endif()
+endfunction()
\ No newline at end of file
diff --git a/DSP/configBoot.cmake b/DSP/configBoot.cmake
new file mode 100644
index 0000000..68a364c
--- /dev/null
+++ b/DSP/configBoot.cmake
@@ -0,0 +1,118 @@
+include(CMakePrintHelpers)
+include(configUtils)
+
+enable_language(C ASM)
+
+option(FILEIO "Test trace using printf" ON)
+
+# Otherwise there is a .obj on windows and it creates problems
+# with armlink.
+SET(CMAKE_C_OUTPUT_EXTENSION .o)
+SET(CMAKE_CXX_OUTPUT_EXTENSION .o)
+SET(CMAKE_ASM_OUTPUT_EXTENSION .o)
+
+
+get_filename_component(PROJECT_NAME ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+cmake_print_variables(PROJECT_NAME)
+
+#set(ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)
+
+if (ARMAC6)
+
+ ###################
+ #
+ # Cortex cortex-m7
+ #
+ if (ARM_CPU STREQUAL "cortex-m7")
+ cortexm(ARMCM7)
+
+ target_compile_definitions(${PROJECT_NAME} PRIVATE ARMCM7_DP)
+
+
+ endif()
+
+ ###################
+ #
+ # Cortex cortex-m4
+ #
+ if (ARM_CPU STREQUAL "cortex-m4")
+ cortexm(ARMCM4)
+ target_compile_definitions(${PROJECT_NAME} PRIVATE ARMCM4_FP)
+ endif()
+
+ ###################
+ #
+ # Cortex cortex-m35p
+ #
+ if (ARM_CPU STREQUAL "cortex-m35")
+ cortexm(ARMCM35P)
+ target_compile_definitions(${PROJECT_NAME} PRIVATE ARMCM35P)
+ endif()
+
+ ###################
+ #
+ # Cortex cortex-m33
+ #
+ if (ARM_CPU STREQUAL "cortex-m33")
+ cortexm(ARMCM33)
+ target_compile_definitions(${PROJECT_NAME} PRIVATE ARMCM33)
+ endif()
+
+ ###################
+ #
+ # Cortex cortex-m23
+ #
+ if (ARM_CPU STREQUAL "cortex-m23")
+ cortexm(ARMCM23)
+ target_compile_definitions(${PROJECT_NAME} PRIVATE ARMCM23)
+ endif()
+
+ ###################
+ #
+ # Cortex cortex-m0+
+ #
+ if (ARM_CPU STREQUAL "cortex-m0p")
+ cortexm(ARMCM0plus)
+ endif()
+
+ ###################
+ #
+ # Cortex cortex-m0
+ #
+ if (ARM_CPU STREQUAL "cortex-m0")
+ cortexm(ARMCM0)
+ endif()
+
+ ###################
+ #
+ # Cortex cortex-a5
+ #
+ if (ARM_CPU STREQUAL "cortex-a5")
+ cortexa(ARMCA5)
+ target_compile_definitions(${PROJECT_NAME} PRIVATE ARMv7A)
+ endif()
+
+ ###################
+ #
+ # Cortex cortex-a7
+ #
+ if (ARM_CPU STREQUAL "cortex-a7")
+ cortexa(ARMCA7)
+ target_compile_definitions(${PROJECT_NAME} PRIVATE ARMv7A)
+ endif()
+
+ ###################
+ #
+ # Cortex cortex-a9
+ #
+ if (ARM_CPU STREQUAL "cortex-a9")
+ cortexa(ARMCA9)
+ target_compile_definitions(${PROJECT_NAME} PRIVATE ARMv7A)
+ endif()
+
+endif()
+
+if (FILEIO)
+ target_compile_definitions(${PROJECT_NAME} PRIVATE FILEIO)
+endif()
\ No newline at end of file
diff --git a/DSP/configUtils.cmake b/DSP/configUtils.cmake
new file mode 100644
index 0000000..43b1c5f
--- /dev/null
+++ b/DSP/configUtils.cmake
@@ -0,0 +1,51 @@
+function(cortexm CORE)
+ target_sources(${PROJECT_NAME} PRIVATE ${ROOT}/Device/ARM/${CORE}/Source/ARM/startup_${CORE}.s)
+ target_sources(${PROJECT_NAME} PRIVATE ${ROOT}/Device/ARM/${CORE}/Source/system_${CORE}.c)
+ target_include_directories(${PROJECT_NAME} PRIVATE ${ROOT}/Device/ARM/${CORE}/Include)
+ target_include_directories(${PROJECT_NAME} PRIVATE ${ROOT}/CMSIS/Core/Include)
+
+ if (TESTFRAMEWORK)
+ # Need bigger sections for test framework
+ # So we use the test framework scatter file
+ set(SCATTERFILE "${ROOT}/CMSIS/DSP/DSP_Lib_TestSuite/Common/platform/ARMCLANG/armcc6_arm.sct")
+ else()
+ set(SCATTERFILE "${ROOT}/Device/ARM/${CORE}/Source/ARM/${CORE}_ac6.sct")
+ endif()
+
+ target_link_options(${PROJECT_NAME} PRIVATE "--info=sizes;--entry=Reset_Handler;--scatter=${SCATTERFILE}")
+
+endfunction()
+
+function(cortexa CORE)
+ target_sources(${PROJECT_NAME} PRIVATE ${ROOT}/Device/ARM/${CORE}/Source/mmu_${CORE}.c)
+ target_sources(${PROJECT_NAME} PRIVATE ${ROOT}/Device/ARM/${CORE}/Source/system_${CORE}.c)
+ target_sources(${PROJECT_NAME} PRIVATE ${ROOT}/CMSIS/Core_A/Source/irq_ctrl_gic.c)
+
+ target_include_directories(${PROJECT_NAME} PRIVATE ${ROOT}/Device/ARM/${CORE}/Include)
+ target_include_directories(${PROJECT_NAME} PRIVATE ${ROOT}/Device/ARM/${CORE}/Config)
+ target_include_directories(${PROJECT_NAME} PRIVATE ${ROOT}/CMSIS/Core_A/Include)
+
+ target_sources(${PROJECT_NAME} PRIVATE ${ROOT}/Device/ARM/${CORE}/Source/AC6/startup_${CORE}.c)
+
+ if (TESTFRAMEWORK)
+ # Test scatter file is missing some sections required by startup file for
+ # cortex-a
+ #set(SCATTERFILE "${ROOT}/CMSIS/DSP/DSP_Lib_TestSuite/Common/platform/ARMCLANG/armcc6_arm.sct")
+ target_include_directories(${PROJECT_NAME} PRIVATE ../Examples/ARM/boot)
+ else()
+ target_include_directories(${PROJECT_NAME} PRIVATE ../boot)
+ endif()
+
+ set(SCATTERFILE ${CMAKE_CURRENT_BINARY_DIR}/tempLink/${CORE}.sct)
+
+
+ # Copy the mem file to the build directory
+ # so that it can be find when preprocessing the scatter file
+ # since we cannot pass an include path to armlink
+ file(COPY ${ROOT}/Device/ARM/${CORE}/Config/mem_${CORE}.h DESTINATION tempLink)
+ file(COPY ${ROOT}/Device/ARM/${CORE}/Source/AC6/${CORE}.sct DESTINATION tempLink)
+
+ target_compile_definitions(${PROJECT_NAME} PRIVATE -DCMSIS_device_header="${CORE}.h")
+
+ target_link_options(${PROJECT_NAME} PRIVATE "--info=sizes;--entry=Vectors;--scatter=${SCATTERFILE}")
+endfunction()
\ No newline at end of file
diff --git a/DSP/fft.cmake b/DSP/fft.cmake
new file mode 100644
index 0000000..ddaade1
--- /dev/null
+++ b/DSP/fft.cmake
@@ -0,0 +1,523 @@
+function(fft PROJECT)
+#######################################
+#
+# CFFT F32
+#
+
+
+if (CONFIGTABLE AND CFFT_F32_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_16)
+endif()
+
+if (CONFIGTABLE AND CFFT_F32_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_32)
+endif()
+
+if (CONFIGTABLE AND CFFT_F32_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_64)
+endif()
+
+if (CONFIGTABLE AND CFFT_F32_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_128)
+endif()
+
+if (CONFIGTABLE AND CFFT_F32_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_256)
+endif()
+
+if (CONFIGTABLE AND CFFT_F32_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_512)
+endif()
+
+if (CONFIGTABLE AND CFFT_F32_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_1024)
+endif()
+
+if (CONFIGTABLE AND CFFT_F32_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_2048)
+endif()
+
+if (CONFIGTABLE AND CFFT_F32_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_4096)
+endif()
+
+#######################################
+#
+# CFFT Q31
+#
+
+if (CONFIGTABLE AND CFFT_Q31_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_16)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q31_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_32)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q31_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_64)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q31_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_128)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q31_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_256)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q31_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_512)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q31_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_1024)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q31_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_2048)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q31_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_4096)
+endif()
+
+#######################################
+#
+# CFFT Q15
+#
+
+if (CONFIGTABLE AND CFFT_Q15_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_16)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q15_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_32)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q15_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_64)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q15_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_128)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q15_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_256)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q15_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_512)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q15_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_1024)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q15_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_2048)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q15_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_4096)
+endif()
+
+#######################################
+#
+# RFFT FAST F32
+#
+
+if (CONFIGTABLE AND RFFT_FAST_F32_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F32_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_16)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F32_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F32_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_32)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F32_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F32_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_64)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F32_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F32_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_128)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F32_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F32_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_256)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F32_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_512)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F32_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_1024)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F32_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_2048)
+endif()
+
+#######################################
+#
+# RFFT F32
+#
+
+if (CONFIGTABLE AND RFFT_F32_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F32_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F32_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F32_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F32_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F32_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F32_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F32_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F32_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+#######################################
+#
+# RFFT Q31
+#
+
+if (CONFIGTABLE AND RFFT_Q31_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_16)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q31_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_32)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q31_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_64)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q31_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_128)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q31_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_256)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q31_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_512)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q31_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_1024)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q31_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_2048)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q31_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_4096)
+endif()
+
+#######################################
+#
+# RFFT FAST Q15
+#
+
+if (CONFIGTABLE AND RFFT_Q15_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_16)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q15_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_32)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q15_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_64)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q15_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_128)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q15_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_256)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q15_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_512)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q15_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_1024)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q15_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_2048)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q15_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_4096)
+endif()
+
+#######################################
+#
+# DCT4 F32
+#
+
+if (CONFIGTABLE AND DCT4_F32_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_F32_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_F32_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_F32_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_F32_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_F32_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_F32_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_F32_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+#######################################
+#
+# DCT4 Q31
+#
+
+if (CONFIGTABLE AND DCT4_Q31_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_Q31_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_Q31_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_Q31_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_Q31_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_Q31_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_Q31_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_Q31_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_4096)
+endif()
+
+#######################################
+#
+# DCT4 Q15
+#
+
+if (CONFIGTABLE AND DCT4_Q15_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_Q15_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_Q15_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_Q15_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_Q15_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_Q15_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_Q15_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_Q15_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_4096)
+endif()
+
+endfunction()
\ No newline at end of file
diff --git a/DSP/interpol.cmake b/DSP/interpol.cmake
new file mode 100644
index 0000000..80282cf
--- /dev/null
+++ b/DSP/interpol.cmake
@@ -0,0 +1,43 @@
+function(interpol PROJECT)
+
+if (CONFIGTABLE AND ARM_COS_F32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_SIN_F32)
+endif()
+
+if (CONFIGTABLE AND ARM_COS_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_SIN_Q31)
+endif()
+
+if (CONFIGTABLE AND ARM_COS_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_SIN_Q15)
+endif()
+
+if (CONFIGTABLE AND ARM_SIN_F32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_SIN_F32)
+endif()
+
+if (CONFIGTABLE AND ARM_SIN_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_SIN_Q31)
+endif()
+
+if (CONFIGTABLE AND ARM_SIN_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_SIN_Q15)
+endif()
+
+if (CONFIGTABLE AND ARM_SIN_COS_F32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_SIN_F32)
+endif()
+
+if (CONFIGTABLE AND ARM_SIN_COS_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_SIN_Q31)
+endif()
+
+if (CONFIGTABLE AND ARM_LMS_NORM_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_RECIP_Q31)
+endif()
+
+if (CONFIGTABLE AND ARM_LMS_NORM_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_RECIP_Q15)
+endif()
+
+endfunction()
\ No newline at end of file
diff --git a/Include/cmsis_armcc.h b/Include/cmsis_armcc.h
index 4d9d064..59f173a 100644
--- a/Include/cmsis_armcc.h
+++ b/Include/cmsis_armcc.h
@@ -1,11 +1,11 @@
/**************************************************************************//**
* @file cmsis_armcc.h
* @brief CMSIS compiler ARMCC (Arm Compiler 5) header file
- * @version V5.0.4
- * @date 10. January 2018
+ * @version V5.1.0
+ * @date 08. May 2019
******************************************************************************/
/*
- * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
+ * Copyright (c) 2009-2019 Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -47,6 +47,10 @@
/* __ARM_ARCH_8M_BASE__ not applicable */
/* __ARM_ARCH_8M_MAIN__ not applicable */
+/* CMSIS compiler control DSP macros */
+#if ((defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) )
+ #define __ARM_FEATURE_DSP 1
+#endif
/* CMSIS compiler specific defines */
#ifndef __ASM
@@ -100,6 +104,31 @@
#ifndef __RESTRICT
#define __RESTRICT __restrict
#endif
+#ifndef __COMPILER_BARRIER
+ #define __COMPILER_BARRIER() __memory_changed()
+#endif
+
+/* ######################### Startup and Lowlevel Init ######################## */
+
+#ifndef __PROGRAM_START
+#define __PROGRAM_START __main
+#endif
+
+#ifndef __INITIAL_SP
+#define __INITIAL_SP Image$$ARM_LIB_STACK$$ZI$$Limit
+#endif
+
+#ifndef __STACK_LIMIT
+#define __STACK_LIMIT Image$$ARM_LIB_STACK$$ZI$$Base
+#endif
+
+#ifndef __VECTOR_TABLE
+#define __VECTOR_TABLE __Vectors
+#endif
+
+#ifndef __VECTOR_TABLE_ATTRIBUTE
+#define __VECTOR_TABLE_ATTRIBUTE __attribute((used, section("RESET")))
+#endif
/* ########################### Core Function Access ########################### */
/** \ingroup CMSIS_Core_FunctionInterface
diff --git a/Include/cmsis_armclang.h b/Include/cmsis_armclang.h
index 162a400..e917f35 100644
--- a/Include/cmsis_armclang.h
+++ b/Include/cmsis_armclang.h
@@ -1,11 +1,11 @@
/**************************************************************************//**
* @file cmsis_armclang.h
* @brief CMSIS compiler armclang (Arm Compiler 6) header file
- * @version V5.0.4
- * @date 10. January 2018
+ * @version V5.2.0
+ * @date 08. May 2019
******************************************************************************/
/*
- * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
+ * Copyright (c) 2009-2019 Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -43,9 +43,9 @@
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static __inline
#endif
-#ifndef __STATIC_FORCEINLINE
+#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE __attribute__((always_inline)) static __inline
-#endif
+#endif
#ifndef __NO_RETURN
#define __NO_RETURN __attribute__((__noreturn__))
#endif
@@ -110,7 +110,31 @@
#ifndef __RESTRICT
#define __RESTRICT __restrict
#endif
+#ifndef __COMPILER_BARRIER
+ #define __COMPILER_BARRIER() __ASM volatile("":::"memory")
+#endif
+/* ######################### Startup and Lowlevel Init ######################## */
+
+#ifndef __PROGRAM_START
+#define __PROGRAM_START __main
+#endif
+
+#ifndef __INITIAL_SP
+#define __INITIAL_SP Image$$ARM_LIB_STACK$$ZI$$Limit
+#endif
+
+#ifndef __STACK_LIMIT
+#define __STACK_LIMIT Image$$ARM_LIB_STACK$$ZI$$Base
+#endif
+
+#ifndef __VECTOR_TABLE
+#define __VECTOR_TABLE __Vectors
+#endif
+
+#ifndef __VECTOR_TABLE_ATTRIBUTE
+#define __VECTOR_TABLE_ATTRIBUTE __attribute((used, section("RESET")))
+#endif
/* ########################### Core Function Access ########################### */
/** \ingroup CMSIS_Core_FunctionInterface
@@ -781,9 +805,11 @@ __STATIC_FORCEINLINE void __TZ_set_MSPLIM_NS(uint32_t MainStackPtrLimit)
* Otherwise, use general registers, specified by constraint "r" */
#if defined (__thumb__) && !defined (__thumb2__)
#define __CMSIS_GCC_OUT_REG(r) "=l" (r)
+#define __CMSIS_GCC_RW_REG(r) "+l" (r)
#define __CMSIS_GCC_USE_REG(r) "l" (r)
#else
#define __CMSIS_GCC_OUT_REG(r) "=r" (r)
+#define __CMSIS_GCC_RW_REG(r) "+r" (r)
#define __CMSIS_GCC_USE_REG(r) "r" (r)
#endif
@@ -821,14 +847,14 @@ __STATIC_FORCEINLINE void __TZ_set_MSPLIM_NS(uint32_t MainStackPtrLimit)
so that all instructions following the ISB are fetched from cache or memory,
after the instruction has been completed.
*/
-#define __ISB() __builtin_arm_isb(0xF);
+#define __ISB() __builtin_arm_isb(0xF)
/**
\brief Data Synchronization Barrier
\details Acts as a special kind of Data Memory Barrier.
It completes when all explicit memory accesses before this instruction complete.
*/
-#define __DSB() __builtin_arm_dsb(0xF);
+#define __DSB() __builtin_arm_dsb(0xF)
/**
@@ -836,7 +862,7 @@ __STATIC_FORCEINLINE void __TZ_set_MSPLIM_NS(uint32_t MainStackPtrLimit)
\details Ensures the apparent order of the explicit memory operations before
and after the instruction, without ensuring their completion.
*/
-#define __DMB() __builtin_arm_dmb(0xF);
+#define __DMB() __builtin_arm_dmb(0xF)
/**
@@ -908,7 +934,23 @@ __STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
\param [in] value Value to count the leading zeros
\return number of leading zeros in value
*/
-#define __CLZ (uint8_t)__builtin_clz
+__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t value)
+{
+ /* Even though __builtin_clz produces a CLZ instruction on ARM, formally
+ __builtin_clz(0) is undefined behaviour, so handle this case specially.
+ This guarantees ARM-compatible results if happening to compile on a non-ARM
+ target, and ensures the compiler doesn't decide to activate any
+ optimisations using the logic "value was passed to __builtin_clz, so it
+ is non-zero".
+ ARM Compiler 6.10 and possibly earlier will optimise this test away, leaving a
+ single CLZ instruction.
+ */
+ if (value == 0U)
+ {
+ return 32U;
+ }
+ return __builtin_clz(value);
+}
#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \
@@ -1321,532 +1363,65 @@ __STATIC_FORCEINLINE void __STL(uint32_t value, volatile uint32_t *ptr)
#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
-__STATIC_FORCEINLINE uint32_t __SADD8(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("sadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __QADD8(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("qadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SHADD8(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("shadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __UADD8(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("uadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __UQADD8(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("uqadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __UHADD8(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("uhadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-
-__STATIC_FORCEINLINE uint32_t __SSUB8(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("ssub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __QSUB8(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("qsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SHSUB8(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("shsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __USUB8(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("usub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __UQSUB8(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("uqsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __UHSUB8(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("uhsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-
-__STATIC_FORCEINLINE uint32_t __SADD16(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("sadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __QADD16(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("qadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SHADD16(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("shadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __UADD16(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("uadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __UQADD16(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("uqadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __UHADD16(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("uhadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SSUB16(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("ssub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __QSUB16(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("qsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SHSUB16(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("shsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __USUB16(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("usub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __UQSUB16(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("uqsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __UHSUB16(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("uhsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SASX(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("sasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __QASX(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("qasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SHASX(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("shasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __UASX(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("uasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __UQASX(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("uqasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __UHASX(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("uhasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SSAX(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("ssax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __QSAX(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("qsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SHSAX(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("shsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __USAX(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("usax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __UQSAX(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("uqsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __UHSAX(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("uhsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __USAD8(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("usad8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __USADA8(uint32_t op1, uint32_t op2, uint32_t op3)
-{
- uint32_t result;
-
- __ASM volatile ("usada8 %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
- return(result);
-}
-
-#define __SSAT16(ARG1,ARG2) \
-({ \
- int32_t __RES, __ARG1 = (ARG1); \
- __ASM ("ssat16 %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) ); \
- __RES; \
- })
-
-#define __USAT16(ARG1,ARG2) \
-({ \
- uint32_t __RES, __ARG1 = (ARG1); \
- __ASM ("usat16 %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) ); \
- __RES; \
- })
-
-__STATIC_FORCEINLINE uint32_t __UXTB16(uint32_t op1)
-{
- uint32_t result;
-
- __ASM volatile ("uxtb16 %0, %1" : "=r" (result) : "r" (op1));
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __UXTAB16(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("uxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SXTB16(uint32_t op1)
-{
- uint32_t result;
-
- __ASM volatile ("sxtb16 %0, %1" : "=r" (result) : "r" (op1));
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SXTAB16(uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("sxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SMUAD (uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("smuad %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SMUADX (uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("smuadx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SMLAD (uint32_t op1, uint32_t op2, uint32_t op3)
-{
- uint32_t result;
-
- __ASM volatile ("smlad %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SMLADX (uint32_t op1, uint32_t op2, uint32_t op3)
-{
- uint32_t result;
-
- __ASM volatile ("smladx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint64_t __SMLALD (uint32_t op1, uint32_t op2, uint64_t acc)
-{
- union llreg_u{
- uint32_t w32[2];
- uint64_t w64;
- } llr;
- llr.w64 = acc;
-
-#ifndef __ARMEB__ /* Little endian */
- __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
-#else /* Big endian */
- __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
-#endif
-
- return(llr.w64);
-}
-
-__STATIC_FORCEINLINE uint64_t __SMLALDX (uint32_t op1, uint32_t op2, uint64_t acc)
-{
- union llreg_u{
- uint32_t w32[2];
- uint64_t w64;
- } llr;
- llr.w64 = acc;
-
-#ifndef __ARMEB__ /* Little endian */
- __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
-#else /* Big endian */
- __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
-#endif
-
- return(llr.w64);
-}
-
-__STATIC_FORCEINLINE uint32_t __SMUSD (uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("smusd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SMUSDX (uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("smusdx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SMLSD (uint32_t op1, uint32_t op2, uint32_t op3)
-{
- uint32_t result;
-
- __ASM volatile ("smlsd %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint32_t __SMLSDX (uint32_t op1, uint32_t op2, uint32_t op3)
-{
- uint32_t result;
-
- __ASM volatile ("smlsdx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
- return(result);
-}
-
-__STATIC_FORCEINLINE uint64_t __SMLSLD (uint32_t op1, uint32_t op2, uint64_t acc)
-{
- union llreg_u{
- uint32_t w32[2];
- uint64_t w64;
- } llr;
- llr.w64 = acc;
-
-#ifndef __ARMEB__ /* Little endian */
- __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
-#else /* Big endian */
- __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
-#endif
-
- return(llr.w64);
-}
-
-__STATIC_FORCEINLINE uint64_t __SMLSLDX (uint32_t op1, uint32_t op2, uint64_t acc)
-{
- union llreg_u{
- uint32_t w32[2];
- uint64_t w64;
- } llr;
- llr.w64 = acc;
-
-#ifndef __ARMEB__ /* Little endian */
- __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
-#else /* Big endian */
- __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
-#endif
-
- return(llr.w64);
-}
-
-__STATIC_FORCEINLINE uint32_t __SEL (uint32_t op1, uint32_t op2)
-{
- uint32_t result;
-
- __ASM volatile ("sel %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE int32_t __QADD( int32_t op1, int32_t op2)
-{
- int32_t result;
-
- __ASM volatile ("qadd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-__STATIC_FORCEINLINE int32_t __QSUB( int32_t op1, int32_t op2)
-{
- int32_t result;
-
- __ASM volatile ("qsub %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
- return(result);
-}
-
-#if 0
-#define __PKHBT(ARG1,ARG2,ARG3) \
-({ \
- uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
- __ASM ("pkhbt %0, %1, %2, lsl %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \
- __RES; \
- })
-
-#define __PKHTB(ARG1,ARG2,ARG3) \
-({ \
- uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
- if (ARG3 == 0) \
- __ASM ("pkhtb %0, %1, %2" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2) ); \
- else \
- __ASM ("pkhtb %0, %1, %2, asr %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \
- __RES; \
- })
-#endif
+#define __SADD8 __builtin_arm_sadd8
+#define __QADD8 __builtin_arm_qadd8
+#define __SHADD8 __builtin_arm_shadd8
+#define __UADD8 __builtin_arm_uadd8
+#define __UQADD8 __builtin_arm_uqadd8
+#define __UHADD8 __builtin_arm_uhadd8
+#define __SSUB8 __builtin_arm_ssub8
+#define __QSUB8 __builtin_arm_qsub8
+#define __SHSUB8 __builtin_arm_shsub8
+#define __USUB8 __builtin_arm_usub8
+#define __UQSUB8 __builtin_arm_uqsub8
+#define __UHSUB8 __builtin_arm_uhsub8
+#define __SADD16 __builtin_arm_sadd16
+#define __QADD16 __builtin_arm_qadd16
+#define __SHADD16 __builtin_arm_shadd16
+#define __UADD16 __builtin_arm_uadd16
+#define __UQADD16 __builtin_arm_uqadd16
+#define __UHADD16 __builtin_arm_uhadd16
+#define __SSUB16 __builtin_arm_ssub16
+#define __QSUB16 __builtin_arm_qsub16
+#define __SHSUB16 __builtin_arm_shsub16
+#define __USUB16 __builtin_arm_usub16
+#define __UQSUB16 __builtin_arm_uqsub16
+#define __UHSUB16 __builtin_arm_uhsub16
+#define __SASX __builtin_arm_sasx
+#define __QASX __builtin_arm_qasx
+#define __SHASX __builtin_arm_shasx
+#define __UASX __builtin_arm_uasx
+#define __UQASX __builtin_arm_uqasx
+#define __UHASX __builtin_arm_uhasx
+#define __SSAX __builtin_arm_ssax
+#define __QSAX __builtin_arm_qsax
+#define __SHSAX __builtin_arm_shsax
+#define __USAX __builtin_arm_usax
+#define __UQSAX __builtin_arm_uqsax
+#define __UHSAX __builtin_arm_uhsax
+#define __USAD8 __builtin_arm_usad8
+#define __USADA8 __builtin_arm_usada8
+#define __SSAT16 __builtin_arm_ssat16
+#define __USAT16 __builtin_arm_usat16
+#define __UXTB16 __builtin_arm_uxtb16
+#define __UXTAB16 __builtin_arm_uxtab16
+#define __SXTB16 __builtin_arm_sxtb16
+#define __SXTAB16 __builtin_arm_sxtab16
+#define __SMUAD __builtin_arm_smuad
+#define __SMUADX __builtin_arm_smuadx
+#define __SMLAD __builtin_arm_smlad
+#define __SMLADX __builtin_arm_smladx
+#define __SMLALD __builtin_arm_smlald
+#define __SMLALDX __builtin_arm_smlaldx
+#define __SMUSD __builtin_arm_smusd
+#define __SMUSDX __builtin_arm_smusdx
+#define __SMLSD __builtin_arm_smlsd
+#define __SMLSDX __builtin_arm_smlsdx
+#define __SMLSLD __builtin_arm_smlsld
+#define __SMLSLDX __builtin_arm_smlsldx
+#define __SEL __builtin_arm_sel
+#define __QADD __builtin_arm_qadd
+#define __QSUB __builtin_arm_qsub
#define __PKHBT(ARG1,ARG2,ARG3) ( ((((uint32_t)(ARG1)) ) & 0x0000FFFFUL) | \
((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL) )
diff --git a/Include/cmsis_armclang_ltm.h b/Include/cmsis_armclang_ltm.h
new file mode 100644
index 0000000..feec324
--- /dev/null
+++ b/Include/cmsis_armclang_ltm.h
@@ -0,0 +1,1891 @@
+/**************************************************************************//**
+ * @file cmsis_armclang_ltm.h
+ * @brief CMSIS compiler armclang (Arm Compiler 6) header file
+ * @version V1.2.0
+ * @date 08. May 2019
+ ******************************************************************************/
+/*
+ * Copyright (c) 2018-2019 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*lint -esym(9058, IRQn)*/ /* disable MISRA 2012 Rule 2.4 for IRQn */
+
+#ifndef __CMSIS_ARMCLANG_H
+#define __CMSIS_ARMCLANG_H
+
+#pragma clang system_header /* treat file as system include file */
+
+#ifndef __ARM_COMPAT_H
+#include /* Compatibility header for Arm Compiler 5 intrinsics */
+#endif
+
+/* CMSIS compiler specific defines */
+#ifndef __ASM
+ #define __ASM __asm
+#endif
+#ifndef __INLINE
+ #define __INLINE __inline
+#endif
+#ifndef __STATIC_INLINE
+ #define __STATIC_INLINE static __inline
+#endif
+#ifndef __STATIC_FORCEINLINE
+ #define __STATIC_FORCEINLINE __attribute__((always_inline)) static __inline
+#endif
+#ifndef __NO_RETURN
+ #define __NO_RETURN __attribute__((__noreturn__))
+#endif
+#ifndef __USED
+ #define __USED __attribute__((used))
+#endif
+#ifndef __WEAK
+ #define __WEAK __attribute__((weak))
+#endif
+#ifndef __PACKED
+ #define __PACKED __attribute__((packed, aligned(1)))
+#endif
+#ifndef __PACKED_STRUCT
+ #define __PACKED_STRUCT struct __attribute__((packed, aligned(1)))
+#endif
+#ifndef __PACKED_UNION
+ #define __PACKED_UNION union __attribute__((packed, aligned(1)))
+#endif
+#ifndef __UNALIGNED_UINT32 /* deprecated */
+ #pragma clang diagnostic push
+ #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT32)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32 */
+ struct __attribute__((packed)) T_UINT32 { uint32_t v; };
+ #pragma clang diagnostic pop
+ #define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v)
+#endif
+#ifndef __UNALIGNED_UINT16_WRITE
+ #pragma clang diagnostic push
+ #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT16_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_WRITE */
+ __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+ #pragma clang diagnostic pop
+ #define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+#endif
+#ifndef __UNALIGNED_UINT16_READ
+ #pragma clang diagnostic push
+ #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT16_READ)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_READ */
+ __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+ #pragma clang diagnostic pop
+ #define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+#endif
+#ifndef __UNALIGNED_UINT32_WRITE
+ #pragma clang diagnostic push
+ #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT32_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32_WRITE */
+ __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+ #pragma clang diagnostic pop
+ #define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+#endif
+#ifndef __UNALIGNED_UINT32_READ
+ #pragma clang diagnostic push
+ #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT32_READ)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32_READ */
+ __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+ #pragma clang diagnostic pop
+ #define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+#endif
+#ifndef __ALIGNED
+ #define __ALIGNED(x) __attribute__((aligned(x)))
+#endif
+#ifndef __RESTRICT
+ #define __RESTRICT __restrict
+#endif
+#ifndef __COMPILER_BARRIER
+ #define __COMPILER_BARRIER() __ASM volatile("":::"memory")
+#endif
+
+/* ######################### Startup and Lowlevel Init ######################## */
+
+#ifndef __PROGRAM_START
+#define __PROGRAM_START __main
+#endif
+
+#ifndef __INITIAL_SP
+#define __INITIAL_SP Image$$ARM_LIB_STACK$$ZI$$Limit
+#endif
+
+#ifndef __STACK_LIMIT
+#define __STACK_LIMIT Image$$ARM_LIB_STACK$$ZI$$Base
+#endif
+
+#ifndef __VECTOR_TABLE
+#define __VECTOR_TABLE __Vectors
+#endif
+
+#ifndef __VECTOR_TABLE_ATTRIBUTE
+#define __VECTOR_TABLE_ATTRIBUTE __attribute((used, section("RESET")))
+#endif
+
+
+/* ########################### Core Function Access ########################### */
+/** \ingroup CMSIS_Core_FunctionInterface
+ \defgroup CMSIS_Core_RegAccFunctions CMSIS Core Register Access Functions
+ @{
+ */
+
+/**
+ \brief Enable IRQ Interrupts
+ \details Enables IRQ interrupts by clearing the I-bit in the CPSR.
+ Can only be executed in Privileged modes.
+ */
+/* intrinsic void __enable_irq(); see arm_compat.h */
+
+
+/**
+ \brief Disable IRQ Interrupts
+ \details Disables IRQ interrupts by setting the I-bit in the CPSR.
+ Can only be executed in Privileged modes.
+ */
+/* intrinsic void __disable_irq(); see arm_compat.h */
+
+
+/**
+ \brief Get Control Register
+ \details Returns the content of the Control Register.
+ \return Control Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_CONTROL(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, control" : "=r" (result) );
+ return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Get Control Register (non-secure)
+ \details Returns the content of the non-secure Control Register when in secure mode.
+ \return non-secure Control Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_CONTROL_NS(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, control_ns" : "=r" (result) );
+ return(result);
+}
+#endif
+
+
+/**
+ \brief Set Control Register
+ \details Writes the given value to the Control Register.
+ \param [in] control Control Register value to set
+ */
+__STATIC_FORCEINLINE void __set_CONTROL(uint32_t control)
+{
+ __ASM volatile ("MSR control, %0" : : "r" (control) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Set Control Register (non-secure)
+ \details Writes the given value to the non-secure Control Register when in secure state.
+ \param [in] control Control Register value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_CONTROL_NS(uint32_t control)
+{
+ __ASM volatile ("MSR control_ns, %0" : : "r" (control) : "memory");
+}
+#endif
+
+
+/**
+ \brief Get IPSR Register
+ \details Returns the content of the IPSR Register.
+ \return IPSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_IPSR(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, ipsr" : "=r" (result) );
+ return(result);
+}
+
+
+/**
+ \brief Get APSR Register
+ \details Returns the content of the APSR Register.
+ \return APSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_APSR(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, apsr" : "=r" (result) );
+ return(result);
+}
+
+
+/**
+ \brief Get xPSR Register
+ \details Returns the content of the xPSR Register.
+ \return xPSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_xPSR(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, xpsr" : "=r" (result) );
+ return(result);
+}
+
+
+/**
+ \brief Get Process Stack Pointer
+ \details Returns the current value of the Process Stack Pointer (PSP).
+ \return PSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PSP(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, psp" : "=r" (result) );
+ return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Get Process Stack Pointer (non-secure)
+ \details Returns the current value of the non-secure Process Stack Pointer (PSP) when in secure state.
+ \return PSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PSP_NS(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, psp_ns" : "=r" (result) );
+ return(result);
+}
+#endif
+
+
+/**
+ \brief Set Process Stack Pointer
+ \details Assigns the given value to the Process Stack Pointer (PSP).
+ \param [in] topOfProcStack Process Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __set_PSP(uint32_t topOfProcStack)
+{
+ __ASM volatile ("MSR psp, %0" : : "r" (topOfProcStack) : );
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Set Process Stack Pointer (non-secure)
+ \details Assigns the given value to the non-secure Process Stack Pointer (PSP) when in secure state.
+ \param [in] topOfProcStack Process Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_PSP_NS(uint32_t topOfProcStack)
+{
+ __ASM volatile ("MSR psp_ns, %0" : : "r" (topOfProcStack) : );
+}
+#endif
+
+
+/**
+ \brief Get Main Stack Pointer
+ \details Returns the current value of the Main Stack Pointer (MSP).
+ \return MSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_MSP(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, msp" : "=r" (result) );
+ return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Get Main Stack Pointer (non-secure)
+ \details Returns the current value of the non-secure Main Stack Pointer (MSP) when in secure state.
+ \return MSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_MSP_NS(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, msp_ns" : "=r" (result) );
+ return(result);
+}
+#endif
+
+
+/**
+ \brief Set Main Stack Pointer
+ \details Assigns the given value to the Main Stack Pointer (MSP).
+ \param [in] topOfMainStack Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __set_MSP(uint32_t topOfMainStack)
+{
+ __ASM volatile ("MSR msp, %0" : : "r" (topOfMainStack) : );
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Set Main Stack Pointer (non-secure)
+ \details Assigns the given value to the non-secure Main Stack Pointer (MSP) when in secure state.
+ \param [in] topOfMainStack Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_MSP_NS(uint32_t topOfMainStack)
+{
+ __ASM volatile ("MSR msp_ns, %0" : : "r" (topOfMainStack) : );
+}
+#endif
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Get Stack Pointer (non-secure)
+ \details Returns the current value of the non-secure Stack Pointer (SP) when in secure state.
+ \return SP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_SP_NS(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, sp_ns" : "=r" (result) );
+ return(result);
+}
+
+
+/**
+ \brief Set Stack Pointer (non-secure)
+ \details Assigns the given value to the non-secure Stack Pointer (SP) when in secure state.
+ \param [in] topOfStack Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_SP_NS(uint32_t topOfStack)
+{
+ __ASM volatile ("MSR sp_ns, %0" : : "r" (topOfStack) : );
+}
+#endif
+
+
+/**
+ \brief Get Priority Mask
+ \details Returns the current state of the priority mask bit from the Priority Mask Register.
+ \return Priority Mask value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PRIMASK(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, primask" : "=r" (result) );
+ return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Get Priority Mask (non-secure)
+ \details Returns the current state of the non-secure priority mask bit from the Priority Mask Register when in secure state.
+ \return Priority Mask value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PRIMASK_NS(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, primask_ns" : "=r" (result) );
+ return(result);
+}
+#endif
+
+
+/**
+ \brief Set Priority Mask
+ \details Assigns the given value to the Priority Mask Register.
+ \param [in] priMask Priority Mask
+ */
+__STATIC_FORCEINLINE void __set_PRIMASK(uint32_t priMask)
+{
+ __ASM volatile ("MSR primask, %0" : : "r" (priMask) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Set Priority Mask (non-secure)
+ \details Assigns the given value to the non-secure Priority Mask Register when in secure state.
+ \param [in] priMask Priority Mask
+ */
+__STATIC_FORCEINLINE void __TZ_set_PRIMASK_NS(uint32_t priMask)
+{
+ __ASM volatile ("MSR primask_ns, %0" : : "r" (priMask) : "memory");
+}
+#endif
+
+
+#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \
+ (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \
+ (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) )
+/**
+ \brief Enable FIQ
+ \details Enables FIQ interrupts by clearing the F-bit in the CPSR.
+ Can only be executed in Privileged modes.
+ */
+#define __enable_fault_irq __enable_fiq /* see arm_compat.h */
+
+
+/**
+ \brief Disable FIQ
+ \details Disables FIQ interrupts by setting the F-bit in the CPSR.
+ Can only be executed in Privileged modes.
+ */
+#define __disable_fault_irq __disable_fiq /* see arm_compat.h */
+
+
+/**
+ \brief Get Base Priority
+ \details Returns the current value of the Base Priority register.
+ \return Base Priority register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_BASEPRI(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, basepri" : "=r" (result) );
+ return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Get Base Priority (non-secure)
+ \details Returns the current value of the non-secure Base Priority register when in secure state.
+ \return Base Priority register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_BASEPRI_NS(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, basepri_ns" : "=r" (result) );
+ return(result);
+}
+#endif
+
+
+/**
+ \brief Set Base Priority
+ \details Assigns the given value to the Base Priority register.
+ \param [in] basePri Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __set_BASEPRI(uint32_t basePri)
+{
+ __ASM volatile ("MSR basepri, %0" : : "r" (basePri) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Set Base Priority (non-secure)
+ \details Assigns the given value to the non-secure Base Priority register when in secure state.
+ \param [in] basePri Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_BASEPRI_NS(uint32_t basePri)
+{
+ __ASM volatile ("MSR basepri_ns, %0" : : "r" (basePri) : "memory");
+}
+#endif
+
+
+/**
+ \brief Set Base Priority with condition
+ \details Assigns the given value to the Base Priority register only if BASEPRI masking is disabled,
+ or the new value increases the BASEPRI priority level.
+ \param [in] basePri Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __set_BASEPRI_MAX(uint32_t basePri)
+{
+ __ASM volatile ("MSR basepri_max, %0" : : "r" (basePri) : "memory");
+}
+
+
+/**
+ \brief Get Fault Mask
+ \details Returns the current value of the Fault Mask register.
+ \return Fault Mask register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_FAULTMASK(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, faultmask" : "=r" (result) );
+ return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Get Fault Mask (non-secure)
+ \details Returns the current value of the non-secure Fault Mask register when in secure state.
+ \return Fault Mask register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_FAULTMASK_NS(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, faultmask_ns" : "=r" (result) );
+ return(result);
+}
+#endif
+
+
+/**
+ \brief Set Fault Mask
+ \details Assigns the given value to the Fault Mask register.
+ \param [in] faultMask Fault Mask value to set
+ */
+__STATIC_FORCEINLINE void __set_FAULTMASK(uint32_t faultMask)
+{
+ __ASM volatile ("MSR faultmask, %0" : : "r" (faultMask) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Set Fault Mask (non-secure)
+ \details Assigns the given value to the non-secure Fault Mask register when in secure state.
+ \param [in] faultMask Fault Mask value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_FAULTMASK_NS(uint32_t faultMask)
+{
+ __ASM volatile ("MSR faultmask_ns, %0" : : "r" (faultMask) : "memory");
+}
+#endif
+
+#endif /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \
+ (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \
+ (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) ) */
+
+
+#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+ (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) )
+
+/**
+ \brief Get Process Stack Pointer Limit
+ Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+ Stack Pointer Limit register hence zero is returned always in non-secure
+ mode.
+
+ \details Returns the current value of the Process Stack Pointer Limit (PSPLIM).
+ \return PSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PSPLIM(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+ (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+ // without main extensions, the non-secure PSPLIM is RAZ/WI
+ return 0U;
+#else
+ uint32_t result;
+ __ASM volatile ("MRS %0, psplim" : "=r" (result) );
+ return result;
+#endif
+}
+
+#if (defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Get Process Stack Pointer Limit (non-secure)
+ Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+ Stack Pointer Limit register hence zero is returned always in non-secure
+ mode.
+
+ \details Returns the current value of the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state.
+ \return PSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PSPLIM_NS(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+ // without main extensions, the non-secure PSPLIM is RAZ/WI
+ return 0U;
+#else
+ uint32_t result;
+ __ASM volatile ("MRS %0, psplim_ns" : "=r" (result) );
+ return result;
+#endif
+}
+#endif
+
+
+/**
+ \brief Set Process Stack Pointer Limit
+ Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+ Stack Pointer Limit register hence the write is silently ignored in non-secure
+ mode.
+
+ \details Assigns the given value to the Process Stack Pointer Limit (PSPLIM).
+ \param [in] ProcStackPtrLimit Process Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __set_PSPLIM(uint32_t ProcStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+ (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+ // without main extensions, the non-secure PSPLIM is RAZ/WI
+ (void)ProcStackPtrLimit;
+#else
+ __ASM volatile ("MSR psplim, %0" : : "r" (ProcStackPtrLimit));
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Set Process Stack Pointer (non-secure)
+ Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+ Stack Pointer Limit register hence the write is silently ignored in non-secure
+ mode.
+
+ \details Assigns the given value to the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state.
+ \param [in] ProcStackPtrLimit Process Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_PSPLIM_NS(uint32_t ProcStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+ // without main extensions, the non-secure PSPLIM is RAZ/WI
+ (void)ProcStackPtrLimit;
+#else
+ __ASM volatile ("MSR psplim_ns, %0\n" : : "r" (ProcStackPtrLimit));
+#endif
+}
+#endif
+
+
+/**
+ \brief Get Main Stack Pointer Limit
+ Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+ Stack Pointer Limit register hence zero is returned always.
+
+ \details Returns the current value of the Main Stack Pointer Limit (MSPLIM).
+ \return MSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_MSPLIM(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+ (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+ // without main extensions, the non-secure MSPLIM is RAZ/WI
+ return 0U;
+#else
+ uint32_t result;
+ __ASM volatile ("MRS %0, msplim" : "=r" (result) );
+ return result;
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Get Main Stack Pointer Limit (non-secure)
+ Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+ Stack Pointer Limit register hence zero is returned always.
+
+ \details Returns the current value of the non-secure Main Stack Pointer Limit(MSPLIM) when in secure state.
+ \return MSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_MSPLIM_NS(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+ // without main extensions, the non-secure MSPLIM is RAZ/WI
+ return 0U;
+#else
+ uint32_t result;
+ __ASM volatile ("MRS %0, msplim_ns" : "=r" (result) );
+ return result;
+#endif
+}
+#endif
+
+
+/**
+ \brief Set Main Stack Pointer Limit
+ Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+ Stack Pointer Limit register hence the write is silently ignored.
+
+ \details Assigns the given value to the Main Stack Pointer Limit (MSPLIM).
+ \param [in] MainStackPtrLimit Main Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __set_MSPLIM(uint32_t MainStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+ (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+ // without main extensions, the non-secure MSPLIM is RAZ/WI
+ (void)MainStackPtrLimit;
+#else
+ __ASM volatile ("MSR msplim, %0" : : "r" (MainStackPtrLimit));
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+ \brief Set Main Stack Pointer Limit (non-secure)
+ Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+ Stack Pointer Limit register hence the write is silently ignored.
+
+ \details Assigns the given value to the non-secure Main Stack Pointer Limit (MSPLIM) when in secure state.
+ \param [in] MainStackPtrLimit Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_MSPLIM_NS(uint32_t MainStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+ // without main extensions, the non-secure MSPLIM is RAZ/WI
+ (void)MainStackPtrLimit;
+#else
+ __ASM volatile ("MSR msplim_ns, %0" : : "r" (MainStackPtrLimit));
+#endif
+}
+#endif
+
+#endif /* ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+ (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) ) */
+
+/**
+ \brief Get FPSCR
+ \details Returns the current value of the Floating Point Status/Control register.
+ \return Floating Point Status/Control register value
+ */
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+ (defined (__FPU_USED ) && (__FPU_USED == 1U)) )
+#define __get_FPSCR (uint32_t)__builtin_arm_get_fpscr
+#else
+#define __get_FPSCR() ((uint32_t)0U)
+#endif
+
+/**
+ \brief Set FPSCR
+ \details Assigns the given value to the Floating Point Status/Control register.
+ \param [in] fpscr Floating Point Status/Control value to set
+ */
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+ (defined (__FPU_USED ) && (__FPU_USED == 1U)) )
+#define __set_FPSCR __builtin_arm_set_fpscr
+#else
+#define __set_FPSCR(x) ((void)(x))
+#endif
+
+
+/*@} end of CMSIS_Core_RegAccFunctions */
+
+
+/* ########################## Core Instruction Access ######################### */
+/** \defgroup CMSIS_Core_InstructionInterface CMSIS Core Instruction Interface
+ Access to dedicated instructions
+ @{
+*/
+
+/* Define macros for porting to both thumb1 and thumb2.
+ * For thumb1, use low register (r0-r7), specified by constraint "l"
+ * Otherwise, use general registers, specified by constraint "r" */
+#if defined (__thumb__) && !defined (__thumb2__)
+#define __CMSIS_GCC_OUT_REG(r) "=l" (r)
+#define __CMSIS_GCC_USE_REG(r) "l" (r)
+#else
+#define __CMSIS_GCC_OUT_REG(r) "=r" (r)
+#define __CMSIS_GCC_USE_REG(r) "r" (r)
+#endif
+
+/**
+ \brief No Operation
+ \details No Operation does nothing. This instruction can be used for code alignment purposes.
+ */
+#define __NOP __builtin_arm_nop
+
+/**
+ \brief Wait For Interrupt
+ \details Wait For Interrupt is a hint instruction that suspends execution until one of a number of events occurs.
+ */
+#define __WFI __builtin_arm_wfi
+
+
+/**
+ \brief Wait For Event
+ \details Wait For Event is a hint instruction that permits the processor to enter
+ a low-power state until one of a number of events occurs.
+ */
+#define __WFE __builtin_arm_wfe
+
+
+/**
+ \brief Send Event
+ \details Send Event is a hint instruction. It causes an event to be signaled to the CPU.
+ */
+#define __SEV __builtin_arm_sev
+
+
+/**
+ \brief Instruction Synchronization Barrier
+ \details Instruction Synchronization Barrier flushes the pipeline in the processor,
+ so that all instructions following the ISB are fetched from cache or memory,
+ after the instruction has been completed.
+ */
+#define __ISB() __builtin_arm_isb(0xF)
+
+/**
+ \brief Data Synchronization Barrier
+ \details Acts as a special kind of Data Memory Barrier.
+ It completes when all explicit memory accesses before this instruction complete.
+ */
+#define __DSB() __builtin_arm_dsb(0xF)
+
+
+/**
+ \brief Data Memory Barrier
+ \details Ensures the apparent order of the explicit memory operations before
+ and after the instruction, without ensuring their completion.
+ */
+#define __DMB() __builtin_arm_dmb(0xF)
+
+
+/**
+ \brief Reverse byte order (32 bit)
+ \details Reverses the byte order in unsigned integer value. For example, 0x12345678 becomes 0x78563412.
+ \param [in] value Value to reverse
+ \return Reversed value
+ */
+#define __REV(value) __builtin_bswap32(value)
+
+
+/**
+ \brief Reverse byte order (16 bit)
+ \details Reverses the byte order within each halfword of a word. For example, 0x12345678 becomes 0x34127856.
+ \param [in] value Value to reverse
+ \return Reversed value
+ */
+#define __REV16(value) __ROR(__REV(value), 16)
+
+
+/**
+ \brief Reverse byte order (16 bit)
+ \details Reverses the byte order in a 16-bit value and returns the signed 16-bit result. For example, 0x0080 becomes 0x8000.
+ \param [in] value Value to reverse
+ \return Reversed value
+ */
+#define __REVSH(value) (int16_t)__builtin_bswap16(value)
+
+
+/**
+ \brief Rotate Right in unsigned value (32 bit)
+ \details Rotate Right (immediate) provides the value of the contents of a register rotated by a variable number of bits.
+ \param [in] op1 Value to rotate
+ \param [in] op2 Number of Bits to rotate
+ \return Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
+{
+ op2 %= 32U;
+ if (op2 == 0U)
+ {
+ return op1;
+ }
+ return (op1 >> op2) | (op1 << (32U - op2));
+}
+
+
+/**
+ \brief Breakpoint
+ \details Causes the processor to enter Debug state.
+ Debug tools can use this to investigate system state when the instruction at a particular address is reached.
+ \param [in] value is ignored by the processor.
+ If required, a debugger can use it to store additional information about the breakpoint.
+ */
+#define __BKPT(value) __ASM volatile ("bkpt "#value)
+
+
+/**
+ \brief Reverse bit order of value
+ \details Reverses the bit order of the given value.
+ \param [in] value Value to reverse
+ \return Reversed value
+ */
+#define __RBIT __builtin_arm_rbit
+
+/**
+ \brief Count leading zeros
+ \details Counts the number of leading zeros of a data value.
+ \param [in] value Value to count the leading zeros
+ \return number of leading zeros in value
+ */
+__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t value)
+{
+ /* Even though __builtin_clz produces a CLZ instruction on ARM, formally
+ __builtin_clz(0) is undefined behaviour, so handle this case specially.
+ This guarantees ARM-compatible results if happening to compile on a non-ARM
+ target, and ensures the compiler doesn't decide to activate any
+ optimisations using the logic "value was passed to __builtin_clz, so it
+ is non-zero".
+ ARM Compiler 6.10 and possibly earlier will optimise this test away, leaving a
+ single CLZ instruction.
+ */
+ if (value == 0U)
+ {
+ return 32U;
+ }
+ return __builtin_clz(value);
+}
+
+
+#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \
+ (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \
+ (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+ (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) )
+/**
+ \brief LDR Exclusive (8 bit)
+ \details Executes a exclusive LDR instruction for 8 bit value.
+ \param [in] ptr Pointer to data
+ \return value of type uint8_t at (*ptr)
+ */
+#define __LDREXB (uint8_t)__builtin_arm_ldrex
+
+
+/**
+ \brief LDR Exclusive (16 bit)
+ \details Executes a exclusive LDR instruction for 16 bit values.
+ \param [in] ptr Pointer to data
+ \return value of type uint16_t at (*ptr)
+ */
+#define __LDREXH (uint16_t)__builtin_arm_ldrex
+
+
+/**
+ \brief LDR Exclusive (32 bit)
+ \details Executes a exclusive LDR instruction for 32 bit values.
+ \param [in] ptr Pointer to data
+ \return value of type uint32_t at (*ptr)
+ */
+#define __LDREXW (uint32_t)__builtin_arm_ldrex
+
+
+/**
+ \brief STR Exclusive (8 bit)
+ \details Executes a exclusive STR instruction for 8 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ \return 0 Function succeeded
+ \return 1 Function failed
+ */
+#define __STREXB (uint32_t)__builtin_arm_strex
+
+
+/**
+ \brief STR Exclusive (16 bit)
+ \details Executes a exclusive STR instruction for 16 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ \return 0 Function succeeded
+ \return 1 Function failed
+ */
+#define __STREXH (uint32_t)__builtin_arm_strex
+
+
+/**
+ \brief STR Exclusive (32 bit)
+ \details Executes a exclusive STR instruction for 32 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ \return 0 Function succeeded
+ \return 1 Function failed
+ */
+#define __STREXW (uint32_t)__builtin_arm_strex
+
+
+/**
+ \brief Remove the exclusive lock
+ \details Removes the exclusive lock which is created by LDREX.
+ */
+#define __CLREX __builtin_arm_clrex
+
+#endif /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \
+ (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \
+ (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+ (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) ) */
+
+
+#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \
+ (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \
+ (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) )
+
+/**
+ \brief Signed Saturate
+ \details Saturates a signed value.
+ \param [in] value Value to be saturated
+ \param [in] sat Bit position to saturate to (1..32)
+ \return Saturated value
+ */
+#define __SSAT __builtin_arm_ssat
+
+
+/**
+ \brief Unsigned Saturate
+ \details Saturates an unsigned value.
+ \param [in] value Value to be saturated
+ \param [in] sat Bit position to saturate to (0..31)
+ \return Saturated value
+ */
+#define __USAT __builtin_arm_usat
+
+
+/**
+ \brief Rotate Right with Extend (32 bit)
+ \details Moves each bit of a bitstring right by one bit.
+ The carry input is shifted in at the left end of the bitstring.
+ \param [in] value Value to rotate
+ \return Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __RRX(uint32_t value)
+{
+ uint32_t result;
+
+ __ASM volatile ("rrx %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );
+ return(result);
+}
+
+
+/**
+ \brief LDRT Unprivileged (8 bit)
+ \details Executes a Unprivileged LDRT instruction for 8 bit value.
+ \param [in] ptr Pointer to data
+ \return value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDRBT(volatile uint8_t *ptr)
+{
+ uint32_t result;
+
+ __ASM volatile ("ldrbt %0, %1" : "=r" (result) : "Q" (*ptr) );
+ return ((uint8_t) result); /* Add explicit type cast here */
+}
+
+
+/**
+ \brief LDRT Unprivileged (16 bit)
+ \details Executes a Unprivileged LDRT instruction for 16 bit values.
+ \param [in] ptr Pointer to data
+ \return value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDRHT(volatile uint16_t *ptr)
+{
+ uint32_t result;
+
+ __ASM volatile ("ldrht %0, %1" : "=r" (result) : "Q" (*ptr) );
+ return ((uint16_t) result); /* Add explicit type cast here */
+}
+
+
+/**
+ \brief LDRT Unprivileged (32 bit)
+ \details Executes a Unprivileged LDRT instruction for 32 bit values.
+ \param [in] ptr Pointer to data
+ \return value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDRT(volatile uint32_t *ptr)
+{
+ uint32_t result;
+
+ __ASM volatile ("ldrt %0, %1" : "=r" (result) : "Q" (*ptr) );
+ return(result);
+}
+
+
+/**
+ \brief STRT Unprivileged (8 bit)
+ \details Executes a Unprivileged STRT instruction for 8 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRBT(uint8_t value, volatile uint8_t *ptr)
+{
+ __ASM volatile ("strbt %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );
+}
+
+
+/**
+ \brief STRT Unprivileged (16 bit)
+ \details Executes a Unprivileged STRT instruction for 16 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRHT(uint16_t value, volatile uint16_t *ptr)
+{
+ __ASM volatile ("strht %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );
+}
+
+
+/**
+ \brief STRT Unprivileged (32 bit)
+ \details Executes a Unprivileged STRT instruction for 32 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRT(uint32_t value, volatile uint32_t *ptr)
+{
+ __ASM volatile ("strt %1, %0" : "=Q" (*ptr) : "r" (value) );
+}
+
+#else /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \
+ (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \
+ (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) ) */
+
+/**
+ \brief Signed Saturate
+ \details Saturates a signed value.
+ \param [in] value Value to be saturated
+ \param [in] sat Bit position to saturate to (1..32)
+ \return Saturated value
+ */
+__STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat)
+{
+ if ((sat >= 1U) && (sat <= 32U))
+ {
+ const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
+ const int32_t min = -1 - max ;
+ if (val > max)
+ {
+ return max;
+ }
+ else if (val < min)
+ {
+ return min;
+ }
+ }
+ return val;
+}
+
+/**
+ \brief Unsigned Saturate
+ \details Saturates an unsigned value.
+ \param [in] value Value to be saturated
+ \param [in] sat Bit position to saturate to (0..31)
+ \return Saturated value
+ */
+__STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat)
+{
+ if (sat <= 31U)
+ {
+ const uint32_t max = ((1U << sat) - 1U);
+ if (val > (int32_t)max)
+ {
+ return max;
+ }
+ else if (val < 0)
+ {
+ return 0U;
+ }
+ }
+ return (uint32_t)val;
+}
+
+#endif /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \
+ (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \
+ (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) ) */
+
+
+#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+ (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) )
+/**
+ \brief Load-Acquire (8 bit)
+ \details Executes a LDAB instruction for 8 bit value.
+ \param [in] ptr Pointer to data
+ \return value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDAB(volatile uint8_t *ptr)
+{
+ uint32_t result;
+
+ __ASM volatile ("ldab %0, %1" : "=r" (result) : "Q" (*ptr) );
+ return ((uint8_t) result);
+}
+
+
+/**
+ \brief Load-Acquire (16 bit)
+ \details Executes a LDAH instruction for 16 bit values.
+ \param [in] ptr Pointer to data
+ \return value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDAH(volatile uint16_t *ptr)
+{
+ uint32_t result;
+
+ __ASM volatile ("ldah %0, %1" : "=r" (result) : "Q" (*ptr) );
+ return ((uint16_t) result);
+}
+
+
+/**
+ \brief Load-Acquire (32 bit)
+ \details Executes a LDA instruction for 32 bit values.
+ \param [in] ptr Pointer to data
+ \return value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDA(volatile uint32_t *ptr)
+{
+ uint32_t result;
+
+ __ASM volatile ("lda %0, %1" : "=r" (result) : "Q" (*ptr) );
+ return(result);
+}
+
+
+/**
+ \brief Store-Release (8 bit)
+ \details Executes a STLB instruction for 8 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ */
+__STATIC_FORCEINLINE void __STLB(uint8_t value, volatile uint8_t *ptr)
+{
+ __ASM volatile ("stlb %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );
+}
+
+
+/**
+ \brief Store-Release (16 bit)
+ \details Executes a STLH instruction for 16 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ */
+__STATIC_FORCEINLINE void __STLH(uint16_t value, volatile uint16_t *ptr)
+{
+ __ASM volatile ("stlh %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );
+}
+
+
+/**
+ \brief Store-Release (32 bit)
+ \details Executes a STL instruction for 32 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ */
+__STATIC_FORCEINLINE void __STL(uint32_t value, volatile uint32_t *ptr)
+{
+ __ASM volatile ("stl %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );
+}
+
+
+/**
+ \brief Load-Acquire Exclusive (8 bit)
+ \details Executes a LDAB exclusive instruction for 8 bit value.
+ \param [in] ptr Pointer to data
+ \return value of type uint8_t at (*ptr)
+ */
+#define __LDAEXB (uint8_t)__builtin_arm_ldaex
+
+
+/**
+ \brief Load-Acquire Exclusive (16 bit)
+ \details Executes a LDAH exclusive instruction for 16 bit values.
+ \param [in] ptr Pointer to data
+ \return value of type uint16_t at (*ptr)
+ */
+#define __LDAEXH (uint16_t)__builtin_arm_ldaex
+
+
+/**
+ \brief Load-Acquire Exclusive (32 bit)
+ \details Executes a LDA exclusive instruction for 32 bit values.
+ \param [in] ptr Pointer to data
+ \return value of type uint32_t at (*ptr)
+ */
+#define __LDAEX (uint32_t)__builtin_arm_ldaex
+
+
+/**
+ \brief Store-Release Exclusive (8 bit)
+ \details Executes a STLB exclusive instruction for 8 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ \return 0 Function succeeded
+ \return 1 Function failed
+ */
+#define __STLEXB (uint32_t)__builtin_arm_stlex
+
+
+/**
+ \brief Store-Release Exclusive (16 bit)
+ \details Executes a STLH exclusive instruction for 16 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ \return 0 Function succeeded
+ \return 1 Function failed
+ */
+#define __STLEXH (uint32_t)__builtin_arm_stlex
+
+
+/**
+ \brief Store-Release Exclusive (32 bit)
+ \details Executes a STL exclusive instruction for 32 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ \return 0 Function succeeded
+ \return 1 Function failed
+ */
+#define __STLEX (uint32_t)__builtin_arm_stlex
+
+#endif /* ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+ (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) ) */
+
+/*@}*/ /* end of group CMSIS_Core_InstructionInterface */
+
+
+/* ################### Compiler specific Intrinsics ########################### */
+/** \defgroup CMSIS_SIMD_intrinsics CMSIS SIMD Intrinsics
+ Access to dedicated SIMD instructions
+ @{
+*/
+
+#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
+
+__STATIC_FORCEINLINE uint32_t __SADD8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("sadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QADD8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("qadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHADD8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("shadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UADD8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQADD8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uqadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHADD8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uhadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+
+__STATIC_FORCEINLINE uint32_t __SSUB8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("ssub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QSUB8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("qsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHSUB8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("shsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USUB8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("usub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQSUB8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uqsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHSUB8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uhsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+
+__STATIC_FORCEINLINE uint32_t __SADD16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("sadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QADD16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("qadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHADD16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("shadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UADD16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQADD16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uqadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHADD16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uhadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SSUB16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("ssub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QSUB16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("qsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHSUB16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("shsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USUB16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("usub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQSUB16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uqsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHSUB16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uhsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SASX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("sasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QASX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("qasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHASX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("shasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UASX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQASX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uqasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHASX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uhasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SSAX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("ssax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QSAX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("qsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHSAX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("shsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USAX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("usax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQSAX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uqsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHSAX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uhsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USAD8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("usad8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USADA8(uint32_t op1, uint32_t op2, uint32_t op3)
+{
+ uint32_t result;
+
+ __ASM volatile ("usada8 %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+ return(result);
+}
+
+#define __SSAT16(ARG1,ARG2) \
+({ \
+ int32_t __RES, __ARG1 = (ARG1); \
+ __ASM ("ssat16 %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) ); \
+ __RES; \
+ })
+
+#define __USAT16(ARG1,ARG2) \
+({ \
+ uint32_t __RES, __ARG1 = (ARG1); \
+ __ASM ("usat16 %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) ); \
+ __RES; \
+ })
+
+__STATIC_FORCEINLINE uint32_t __UXTB16(uint32_t op1)
+{
+ uint32_t result;
+
+ __ASM volatile ("uxtb16 %0, %1" : "=r" (result) : "r" (op1));
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UXTAB16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SXTB16(uint32_t op1)
+{
+ uint32_t result;
+
+ __ASM volatile ("sxtb16 %0, %1" : "=r" (result) : "r" (op1));
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SXTAB16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("sxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUAD (uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("smuad %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUADX (uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("smuadx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLAD (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+ uint32_t result;
+
+ __ASM volatile ("smlad %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLADX (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+ uint32_t result;
+
+ __ASM volatile ("smladx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLALD (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+ union llreg_u{
+ uint32_t w32[2];
+ uint64_t w64;
+ } llr;
+ llr.w64 = acc;
+
+#ifndef __ARMEB__ /* Little endian */
+ __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else /* Big endian */
+ __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+ return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLALDX (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+ union llreg_u{
+ uint32_t w32[2];
+ uint64_t w64;
+ } llr;
+ llr.w64 = acc;
+
+#ifndef __ARMEB__ /* Little endian */
+ __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else /* Big endian */
+ __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+ return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUSD (uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("smusd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUSDX (uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("smusdx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLSD (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+ uint32_t result;
+
+ __ASM volatile ("smlsd %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLSDX (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+ uint32_t result;
+
+ __ASM volatile ("smlsdx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLSLD (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+ union llreg_u{
+ uint32_t w32[2];
+ uint64_t w64;
+ } llr;
+ llr.w64 = acc;
+
+#ifndef __ARMEB__ /* Little endian */
+ __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else /* Big endian */
+ __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+ return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLSLDX (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+ union llreg_u{
+ uint32_t w32[2];
+ uint64_t w64;
+ } llr;
+ llr.w64 = acc;
+
+#ifndef __ARMEB__ /* Little endian */
+ __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else /* Big endian */
+ __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+ return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint32_t __SEL (uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("sel %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE int32_t __QADD( int32_t op1, int32_t op2)
+{
+ int32_t result;
+
+ __ASM volatile ("qadd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__STATIC_FORCEINLINE int32_t __QSUB( int32_t op1, int32_t op2)
+{
+ int32_t result;
+
+ __ASM volatile ("qsub %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+#define __PKHBT(ARG1,ARG2,ARG3) ( ((((uint32_t)(ARG1)) ) & 0x0000FFFFUL) | \
+ ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL) )
+
+#define __PKHTB(ARG1,ARG2,ARG3) ( ((((uint32_t)(ARG1)) ) & 0xFFFF0000UL) | \
+ ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL) )
+
+__STATIC_FORCEINLINE int32_t __SMMLA (int32_t op1, int32_t op2, int32_t op3)
+{
+ int32_t result;
+
+ __ASM volatile ("smmla %0, %1, %2, %3" : "=r" (result): "r" (op1), "r" (op2), "r" (op3) );
+ return(result);
+}
+
+#endif /* (__ARM_FEATURE_DSP == 1) */
+/*@} end of group CMSIS_SIMD_intrinsics */
+
+
+#endif /* __CMSIS_ARMCLANG_H */
diff --git a/Include/cmsis_compiler.h b/Include/cmsis_compiler.h
index 94212eb..adbf296 100644
--- a/Include/cmsis_compiler.h
+++ b/Include/cmsis_compiler.h
@@ -1,8 +1,8 @@
/**************************************************************************//**
* @file cmsis_compiler.h
* @brief CMSIS compiler generic header file
- * @version V5.0.4
- * @date 10. January 2018
+ * @version V5.1.0
+ * @date 09. October 2018
******************************************************************************/
/*
* Copyright (c) 2009-2018 Arm Limited. All rights reserved.
@@ -35,9 +35,15 @@
/*
- * Arm Compiler 6 (armclang)
+ * Arm Compiler 6.6 LTM (armclang)
*/
-#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
+#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) && (__ARMCC_VERSION < 6100100)
+ #include "cmsis_armclang_ltm.h"
+
+ /*
+ * Arm Compiler above 6.10.1 (armclang)
+ */
+#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6100100)
#include "cmsis_armclang.h"
@@ -115,8 +121,11 @@
#define __ALIGNED(x) __attribute__((aligned(x)))
#endif
#ifndef __RESTRICT
- #warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored.
- #define __RESTRICT
+ #define __RESTRICT __restrict
+ #endif
+ #ifndef __COMPILER_BARRIER
+ #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+ #define __COMPILER_BARRIER() (void)0
#endif
@@ -187,6 +196,10 @@
#warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored.
#define __RESTRICT
#endif
+ #ifndef __COMPILER_BARRIER
+ #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+ #define __COMPILER_BARRIER() (void)0
+ #endif
/*
@@ -255,6 +268,10 @@
#warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored.
#define __RESTRICT
#endif
+ #ifndef __COMPILER_BARRIER
+ #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+ #define __COMPILER_BARRIER() (void)0
+ #endif
#else
diff --git a/Include/cmsis_gcc.h b/Include/cmsis_gcc.h
index 2d9db15..3ddcc58 100644
--- a/Include/cmsis_gcc.h
+++ b/Include/cmsis_gcc.h
@@ -1,11 +1,11 @@
/**************************************************************************//**
* @file cmsis_gcc.h
* @brief CMSIS compiler GCC header file
- * @version V5.0.4
- * @date 09. April 2018
+ * @version V5.2.0
+ * @date 08. May 2019
******************************************************************************/
/*
- * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
+ * Copyright (c) 2009-2019 Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -113,7 +113,74 @@
#ifndef __RESTRICT
#define __RESTRICT __restrict
#endif
+#ifndef __COMPILER_BARRIER
+ #define __COMPILER_BARRIER() __ASM volatile("":::"memory")
+#endif
+/* ######################### Startup and Lowlevel Init ######################## */
+
+#ifndef __PROGRAM_START
+
+/**
+ \brief Initializes data and bss sections
+ \details This default implementations initialized all data and additional bss
+ sections relying on .copy.table and .zero.table specified properly
+ in the used linker script.
+
+ */
+__STATIC_FORCEINLINE __NO_RETURN void __cmsis_start(void)
+{
+ extern void _start(void) __NO_RETURN;
+
+ typedef struct {
+ uint32_t const* src;
+ uint32_t* dest;
+ uint32_t wlen;
+ } __copy_table_t;
+
+ typedef struct {
+ uint32_t* dest;
+ uint32_t wlen;
+ } __zero_table_t;
+
+ extern const __copy_table_t __copy_table_start__;
+ extern const __copy_table_t __copy_table_end__;
+ extern const __zero_table_t __zero_table_start__;
+ extern const __zero_table_t __zero_table_end__;
+
+ for (__copy_table_t const* pTable = &__copy_table_start__; pTable < &__copy_table_end__; ++pTable) {
+ for(uint32_t i=0u; iwlen; ++i) {
+ pTable->dest[i] = pTable->src[i];
+ }
+ }
+
+ for (__zero_table_t const* pTable = &__zero_table_start__; pTable < &__zero_table_end__; ++pTable) {
+ for(uint32_t i=0u; iwlen; ++i) {
+ pTable->dest[i] = 0u;
+ }
+ }
+
+ _start();
+}
+
+#define __PROGRAM_START __cmsis_start
+#endif
+
+#ifndef __INITIAL_SP
+#define __INITIAL_SP __StackTop
+#endif
+
+#ifndef __STACK_LIMIT
+#define __STACK_LIMIT __StackLimit
+#endif
+
+#ifndef __VECTOR_TABLE
+#define __VECTOR_TABLE __Vectors
+#endif
+
+#ifndef __VECTOR_TABLE_ATTRIBUTE
+#define __VECTOR_TABLE_ATTRIBUTE __attribute((used, section(".vectors")))
+#endif
/* ########################### Core Function Access ########################### */
/** \ingroup CMSIS_Core_FunctionInterface
@@ -1008,7 +1075,23 @@ __STATIC_FORCEINLINE uint32_t __RBIT(uint32_t value)
\param [in] value Value to count the leading zeros
\return number of leading zeros in value
*/
-#define __CLZ (uint8_t)__builtin_clz
+__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t value)
+{
+ /* Even though __builtin_clz produces a CLZ instruction on ARM, formally
+ __builtin_clz(0) is undefined behaviour, so handle this case specially.
+ This guarantees ARM-compatible results if happening to compile on a non-ARM
+ target, and ensures the compiler doesn't decide to activate any
+ optimisations using the logic "value was passed to __builtin_clz, so it
+ is non-zero".
+ ARM GCC 7.3 and possibly earlier will optimise this test away, leaving a
+ single CLZ instruction.
+ */
+ if (value == 0U)
+ {
+ return 32U;
+ }
+ return __builtin_clz(value);
+}
#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \
diff --git a/Include/cmsis_iccarm.h b/Include/cmsis_iccarm.h
index 11c4af0..12d68fd 100644
--- a/Include/cmsis_iccarm.h
+++ b/Include/cmsis_iccarm.h
@@ -1,13 +1,14 @@
/**************************************************************************//**
* @file cmsis_iccarm.h
* @brief CMSIS compiler ICCARM (IAR Compiler for Arm) header file
- * @version V5.0.7
- * @date 19. June 2018
+ * @version V5.1.0
+ * @date 08. May 2019
******************************************************************************/
//------------------------------------------------------------------------------
//
-// Copyright (c) 2017-2018 IAR Systems
+// Copyright (c) 2017-2019 IAR Systems
+// Copyright (c) 2017-2019 Arm Limited. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License")
// you may not use this file except in compliance with the License.
@@ -110,6 +111,10 @@
#define __ASM __asm
#endif
+#ifndef __COMPILER_BARRIER
+ #define __COMPILER_BARRIER() __ASM volatile("":::"memory")
+#endif
+
#ifndef __INLINE
#define __INLINE inline
#endif
@@ -150,7 +155,12 @@
#endif
#ifndef __RESTRICT
- #define __RESTRICT __restrict
+ #if __ICCARM_V8
+ #define __RESTRICT __restrict
+ #else
+ /* Needs IAR language extensions */
+ #define __RESTRICT restrict
+ #endif
#endif
#ifndef __STATIC_INLINE
@@ -234,6 +244,25 @@ __packed struct __iar_u32 { uint32_t v; };
#endif
#endif
+#ifndef __PROGRAM_START
+#define __PROGRAM_START __iar_program_start
+#endif
+
+#ifndef __INITIAL_SP
+#define __INITIAL_SP CSTACK$$Limit
+#endif
+
+#ifndef __STACK_LIMIT
+#define __STACK_LIMIT CSTACK$$Base
+#endif
+
+#ifndef __VECTOR_TABLE
+#define __VECTOR_TABLE __vector_table
+#endif
+
+#ifndef __VECTOR_TABLE_ATTRIBUTE
+#define __VECTOR_TABLE_ATTRIBUTE @".intvec"
+#endif
#ifndef __ICCARM_INTRINSICS_VERSION__
#define __ICCARM_INTRINSICS_VERSION__ 0
diff --git a/Include/cmsis_version.h b/Include/cmsis_version.h
index 660f612..f2e2746 100644
--- a/Include/cmsis_version.h
+++ b/Include/cmsis_version.h
@@ -1,11 +1,11 @@
/**************************************************************************//**
* @file cmsis_version.h
* @brief CMSIS Core(M) Version definitions
- * @version V5.0.2
- * @date 19. April 2017
+ * @version V5.0.3
+ * @date 24. June 2019
******************************************************************************/
/*
- * Copyright (c) 2009-2017 ARM Limited. All rights reserved.
+ * Copyright (c) 2009-2019 ARM Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -33,7 +33,7 @@
/* CMSIS Version definitions */
#define __CM_CMSIS_VERSION_MAIN ( 5U) /*!< [31:16] CMSIS Core(M) main version */
-#define __CM_CMSIS_VERSION_SUB ( 1U) /*!< [15:0] CMSIS Core(M) sub version */
+#define __CM_CMSIS_VERSION_SUB ( 3U) /*!< [15:0] CMSIS Core(M) sub version */
#define __CM_CMSIS_VERSION ((__CM_CMSIS_VERSION_MAIN << 16U) | \
__CM_CMSIS_VERSION_SUB ) /*!< CMSIS Core(M) version number */
#endif
diff --git a/Include/core_armv81mml.h b/Include/core_armv81mml.h
new file mode 100644
index 0000000..8441e57
--- /dev/null
+++ b/Include/core_armv81mml.h
@@ -0,0 +1,2968 @@
+/**************************************************************************//**
+ * @file core_armv81mml.h
+ * @brief CMSIS Armv8.1-M Mainline Core Peripheral Access Layer Header File
+ * @version V1.0.0
+ * @date 15. March 2019
+ ******************************************************************************/
+/*
+ * Copyright (c) 2018-2019 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined ( __ICCARM__ )
+ #pragma system_include /* treat file as system include file for MISRA check */
+#elif defined (__clang__)
+ #pragma clang system_header /* treat file as system include file */
+#endif
+
+#ifndef __CORE_ARMV81MML_H_GENERIC
+#define __CORE_ARMV81MML_H_GENERIC
+
+#include
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/**
+ \page CMSIS_MISRA_Exceptions MISRA-C:2004 Compliance Exceptions
+ CMSIS violates the following MISRA-C:2004 rules:
+
+ \li Required Rule 8.5, object/function definition in header file.
+ Function definitions in header files are used to allow 'inlining'.
+
+ \li Required Rule 18.4, declaration of union type or object of union type: '{...}'.
+ Unions are used for effective representation of core registers.
+
+ \li Advisory Rule 19.7, Function-like macro defined.
+ Function-like macros are used to allow more efficient code.
+ */
+
+
+/*******************************************************************************
+ * CMSIS definitions
+ ******************************************************************************/
+/**
+ \ingroup Cortex_ARMV81MML
+ @{
+ */
+
+#include "cmsis_version.h"
+
+#define __ARM_ARCH_8M_MAIN__ 1 // patching for now
+/* CMSIS ARMV81MML definitions */
+#define __ARMv81MML_CMSIS_VERSION_MAIN (__CM_CMSIS_VERSION_MAIN) /*!< \deprecated [31:16] CMSIS HAL main version */
+#define __ARMv81MML_CMSIS_VERSION_SUB (__CM_CMSIS_VERSION_SUB) /*!< \deprecated [15:0] CMSIS HAL sub version */
+#define __ARMv81MML_CMSIS_VERSION ((__ARMv81MML_CMSIS_VERSION_MAIN << 16U) | \
+ __ARMv81MML_CMSIS_VERSION_SUB ) /*!< \deprecated CMSIS HAL version number */
+
+#define __CORTEX_M (81U) /*!< Cortex-M Core */
+
+/** __FPU_USED indicates whether an FPU is used or not.
+ For this, __FPU_PRESENT has to be checked prior to making use of FPU specific registers and functions.
+*/
+#if defined ( __CC_ARM )
+ #if defined __TARGET_FPU_VFP
+ #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+ #define __FPU_USED 1U
+ #else
+ #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0U
+ #endif
+ #else
+ #define __FPU_USED 0U
+ #endif
+
+ #if defined(__ARM_FEATURE_DSP)
+ #if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1U)
+ #define __DSP_USED 1U
+ #else
+ #error "Compiler generates DSP (SIMD) instructions for a devices without DSP extensions (check __DSP_PRESENT)"
+ #define __DSP_USED 0U
+ #endif
+ #else
+ #define __DSP_USED 0U
+ #endif
+
+#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
+ #if defined __ARM_FP
+ #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+ #define __FPU_USED 1U
+ #else
+ #warning "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0U
+ #endif
+ #else
+ #define __FPU_USED 0U
+ #endif
+
+ #if defined(__ARM_FEATURE_DSP)
+ #if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1U)
+ #define __DSP_USED 1U
+ #else
+ #error "Compiler generates DSP (SIMD) instructions for a devices without DSP extensions (check __DSP_PRESENT)"
+ #define __DSP_USED 0U
+ #endif
+ #else
+ #define __DSP_USED 0U
+ #endif
+
+#elif defined ( __GNUC__ )
+ #if defined (__VFP_FP__) && !defined(__SOFTFP__)
+ #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+ #define __FPU_USED 1U
+ #else
+ #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0U
+ #endif
+ #else
+ #define __FPU_USED 0U
+ #endif
+
+ #if defined(__ARM_FEATURE_DSP)
+ #if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1U)
+ #define __DSP_USED 1U
+ #else
+ #error "Compiler generates DSP (SIMD) instructions for a devices without DSP extensions (check __DSP_PRESENT)"
+ #define __DSP_USED 0U
+ #endif
+ #else
+ #define __DSP_USED 0U
+ #endif
+
+#elif defined ( __ICCARM__ )
+ #if defined __ARMVFP__
+ #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+ #define __FPU_USED 1U
+ #else
+ #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0U
+ #endif
+ #else
+ #define __FPU_USED 0U
+ #endif
+
+ #if defined(__ARM_FEATURE_DSP)
+ #if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1U)
+ #define __DSP_USED 1U
+ #else
+ #error "Compiler generates DSP (SIMD) instructions for a devices without DSP extensions (check __DSP_PRESENT)"
+ #define __DSP_USED 0U
+ #endif
+ #else
+ #define __DSP_USED 0U
+ #endif
+
+#elif defined ( __TI_ARM__ )
+ #if defined __TI_VFP_SUPPORT__
+ #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+ #define __FPU_USED 1U
+ #else
+ #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0U
+ #endif
+ #else
+ #define __FPU_USED 0U
+ #endif
+
+#elif defined ( __TASKING__ )
+ #if defined __FPU_VFP__
+ #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+ #define __FPU_USED 1U
+ #else
+ #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0U
+ #endif
+ #else
+ #define __FPU_USED 0U
+ #endif
+
+#elif defined ( __CSMC__ )
+ #if ( __CSMC__ & 0x400U)
+ #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+ #define __FPU_USED 1U
+ #else
+ #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0U
+ #endif
+ #else
+ #define __FPU_USED 0U
+ #endif
+
+#endif
+
+#include "cmsis_compiler.h" /* CMSIS compiler specific defines */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CORE_ARMV81MML_H_GENERIC */
+
+#ifndef __CMSIS_GENERIC
+
+#ifndef __CORE_ARMV81MML_H_DEPENDANT
+#define __CORE_ARMV81MML_H_DEPENDANT
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/* check device defines and use defaults */
+#if defined __CHECK_DEVICE_DEFINES
+ #ifndef __ARMv81MML_REV
+ #define __ARMv81MML_REV 0x0000U
+ #warning "__ARMv81MML_REV not defined in device header file; using default!"
+ #endif
+
+ #ifndef __FPU_PRESENT
+ #define __FPU_PRESENT 0U
+ #warning "__FPU_PRESENT not defined in device header file; using default!"
+ #endif
+
+ #ifndef __MPU_PRESENT
+ #define __MPU_PRESENT 0U
+ #warning "__MPU_PRESENT not defined in device header file; using default!"
+ #endif
+
+ #ifndef __SAUREGION_PRESENT
+ #define __SAUREGION_PRESENT 0U
+ #warning "__SAUREGION_PRESENT not defined in device header file; using default!"
+ #endif
+
+ #ifndef __DSP_PRESENT
+ #define __DSP_PRESENT 0U
+ #warning "__DSP_PRESENT not defined in device header file; using default!"
+ #endif
+
+ #ifndef __NVIC_PRIO_BITS
+ #define __NVIC_PRIO_BITS 3U
+ #warning "__NVIC_PRIO_BITS not defined in device header file; using default!"
+ #endif
+
+ #ifndef __Vendor_SysTickConfig
+ #define __Vendor_SysTickConfig 0U
+ #warning "__Vendor_SysTickConfig not defined in device header file; using default!"
+ #endif
+#endif
+
+/* IO definitions (access restrictions to peripheral registers) */
+/**
+ \defgroup CMSIS_glob_defs CMSIS Global Defines
+
+ IO Type Qualifiers are used
+ \li to specify the access to peripheral variables.
+ \li for automatic generation of peripheral register debug information.
+*/
+#ifdef __cplusplus
+ #define __I volatile /*!< Defines 'read only' permissions */
+#else
+ #define __I volatile const /*!< Defines 'read only' permissions */
+#endif
+#define __O volatile /*!< Defines 'write only' permissions */
+#define __IO volatile /*!< Defines 'read / write' permissions */
+
+/* following defines should be used for structure members */
+#define __IM volatile const /*! Defines 'read only' structure member permissions */
+#define __OM volatile /*! Defines 'write only' structure member permissions */
+#define __IOM volatile /*! Defines 'read / write' structure member permissions */
+
+/*@} end of group ARMv81MML */
+
+
+
+/*******************************************************************************
+ * Register Abstraction
+ Core Register contain:
+ - Core Register
+ - Core NVIC Register
+ - Core SCB Register
+ - Core SysTick Register
+ - Core Debug Register
+ - Core MPU Register
+ - Core SAU Register
+ - Core FPU Register
+ ******************************************************************************/
+/**
+ \defgroup CMSIS_core_register Defines and Type Definitions
+ \brief Type definitions and defines for Cortex-M processor based devices.
+*/
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_CORE Status and Control Registers
+ \brief Core Register type definitions.
+ @{
+ */
+
+/**
+ \brief Union type to access the Application Program Status Register (APSR).
+ */
+typedef union
+{
+ struct
+ {
+ uint32_t _reserved0:16; /*!< bit: 0..15 Reserved */
+ uint32_t GE:4; /*!< bit: 16..19 Greater than or Equal flags */
+ uint32_t _reserved1:7; /*!< bit: 20..26 Reserved */
+ uint32_t Q:1; /*!< bit: 27 Saturation condition flag */
+ uint32_t V:1; /*!< bit: 28 Overflow condition code flag */
+ uint32_t C:1; /*!< bit: 29 Carry condition code flag */
+ uint32_t Z:1; /*!< bit: 30 Zero condition code flag */
+ uint32_t N:1; /*!< bit: 31 Negative condition code flag */
+ } b; /*!< Structure used for bit access */
+ uint32_t w; /*!< Type used for word access */
+} APSR_Type;
+
+/* APSR Register Definitions */
+#define APSR_N_Pos 31U /*!< APSR: N Position */
+#define APSR_N_Msk (1UL << APSR_N_Pos) /*!< APSR: N Mask */
+
+#define APSR_Z_Pos 30U /*!< APSR: Z Position */
+#define APSR_Z_Msk (1UL << APSR_Z_Pos) /*!< APSR: Z Mask */
+
+#define APSR_C_Pos 29U /*!< APSR: C Position */
+#define APSR_C_Msk (1UL << APSR_C_Pos) /*!< APSR: C Mask */
+
+#define APSR_V_Pos 28U /*!< APSR: V Position */
+#define APSR_V_Msk (1UL << APSR_V_Pos) /*!< APSR: V Mask */
+
+#define APSR_Q_Pos 27U /*!< APSR: Q Position */
+#define APSR_Q_Msk (1UL << APSR_Q_Pos) /*!< APSR: Q Mask */
+
+#define APSR_GE_Pos 16U /*!< APSR: GE Position */
+#define APSR_GE_Msk (0xFUL << APSR_GE_Pos) /*!< APSR: GE Mask */
+
+
+/**
+ \brief Union type to access the Interrupt Program Status Register (IPSR).
+ */
+typedef union
+{
+ struct
+ {
+ uint32_t ISR:9; /*!< bit: 0.. 8 Exception number */
+ uint32_t _reserved0:23; /*!< bit: 9..31 Reserved */
+ } b; /*!< Structure used for bit access */
+ uint32_t w; /*!< Type used for word access */
+} IPSR_Type;
+
+/* IPSR Register Definitions */
+#define IPSR_ISR_Pos 0U /*!< IPSR: ISR Position */
+#define IPSR_ISR_Msk (0x1FFUL /*<< IPSR_ISR_Pos*/) /*!< IPSR: ISR Mask */
+
+
+/**
+ \brief Union type to access the Special-Purpose Program Status Registers (xPSR).
+ */
+typedef union
+{
+ struct
+ {
+ uint32_t ISR:9; /*!< bit: 0.. 8 Exception number */
+ uint32_t _reserved0:7; /*!< bit: 9..15 Reserved */
+ uint32_t GE:4; /*!< bit: 16..19 Greater than or Equal flags */
+ uint32_t _reserved1:4; /*!< bit: 20..23 Reserved */
+ uint32_t T:1; /*!< bit: 24 Thumb bit (read 0) */
+ uint32_t IT:2; /*!< bit: 25..26 saved IT state (read 0) */
+ uint32_t Q:1; /*!< bit: 27 Saturation condition flag */
+ uint32_t V:1; /*!< bit: 28 Overflow condition code flag */
+ uint32_t C:1; /*!< bit: 29 Carry condition code flag */
+ uint32_t Z:1; /*!< bit: 30 Zero condition code flag */
+ uint32_t N:1; /*!< bit: 31 Negative condition code flag */
+ } b; /*!< Structure used for bit access */
+ uint32_t w; /*!< Type used for word access */
+} xPSR_Type;
+
+/* xPSR Register Definitions */
+#define xPSR_N_Pos 31U /*!< xPSR: N Position */
+#define xPSR_N_Msk (1UL << xPSR_N_Pos) /*!< xPSR: N Mask */
+
+#define xPSR_Z_Pos 30U /*!< xPSR: Z Position */
+#define xPSR_Z_Msk (1UL << xPSR_Z_Pos) /*!< xPSR: Z Mask */
+
+#define xPSR_C_Pos 29U /*!< xPSR: C Position */
+#define xPSR_C_Msk (1UL << xPSR_C_Pos) /*!< xPSR: C Mask */
+
+#define xPSR_V_Pos 28U /*!< xPSR: V Position */
+#define xPSR_V_Msk (1UL << xPSR_V_Pos) /*!< xPSR: V Mask */
+
+#define xPSR_Q_Pos 27U /*!< xPSR: Q Position */
+#define xPSR_Q_Msk (1UL << xPSR_Q_Pos) /*!< xPSR: Q Mask */
+
+#define xPSR_IT_Pos 25U /*!< xPSR: IT Position */
+#define xPSR_IT_Msk (3UL << xPSR_IT_Pos) /*!< xPSR: IT Mask */
+
+#define xPSR_T_Pos 24U /*!< xPSR: T Position */
+#define xPSR_T_Msk (1UL << xPSR_T_Pos) /*!< xPSR: T Mask */
+
+#define xPSR_GE_Pos 16U /*!< xPSR: GE Position */
+#define xPSR_GE_Msk (0xFUL << xPSR_GE_Pos) /*!< xPSR: GE Mask */
+
+#define xPSR_ISR_Pos 0U /*!< xPSR: ISR Position */
+#define xPSR_ISR_Msk (0x1FFUL /*<< xPSR_ISR_Pos*/) /*!< xPSR: ISR Mask */
+
+
+/**
+ \brief Union type to access the Control Registers (CONTROL).
+ */
+typedef union
+{
+ struct
+ {
+ uint32_t nPRIV:1; /*!< bit: 0 Execution privilege in Thread mode */
+ uint32_t SPSEL:1; /*!< bit: 1 Stack-pointer select */
+ uint32_t FPCA:1; /*!< bit: 2 Floating-point context active */
+ uint32_t SFPA:1; /*!< bit: 3 Secure floating-point active */
+ uint32_t _reserved1:28; /*!< bit: 4..31 Reserved */
+ } b; /*!< Structure used for bit access */
+ uint32_t w; /*!< Type used for word access */
+} CONTROL_Type;
+
+/* CONTROL Register Definitions */
+#define CONTROL_SFPA_Pos 3U /*!< CONTROL: SFPA Position */
+#define CONTROL_SFPA_Msk (1UL << CONTROL_SFPA_Pos) /*!< CONTROL: SFPA Mask */
+
+#define CONTROL_FPCA_Pos 2U /*!< CONTROL: FPCA Position */
+#define CONTROL_FPCA_Msk (1UL << CONTROL_FPCA_Pos) /*!< CONTROL: FPCA Mask */
+
+#define CONTROL_SPSEL_Pos 1U /*!< CONTROL: SPSEL Position */
+#define CONTROL_SPSEL_Msk (1UL << CONTROL_SPSEL_Pos) /*!< CONTROL: SPSEL Mask */
+
+#define CONTROL_nPRIV_Pos 0U /*!< CONTROL: nPRIV Position */
+#define CONTROL_nPRIV_Msk (1UL /*<< CONTROL_nPRIV_Pos*/) /*!< CONTROL: nPRIV Mask */
+
+/*@} end of group CMSIS_CORE */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_NVIC Nested Vectored Interrupt Controller (NVIC)
+ \brief Type definitions for the NVIC Registers
+ @{
+ */
+
+/**
+ \brief Structure type to access the Nested Vectored Interrupt Controller (NVIC).
+ */
+typedef struct
+{
+ __IOM uint32_t ISER[16U]; /*!< Offset: 0x000 (R/W) Interrupt Set Enable Register */
+ uint32_t RESERVED0[16U];
+ __IOM uint32_t ICER[16U]; /*!< Offset: 0x080 (R/W) Interrupt Clear Enable Register */
+ uint32_t RSERVED1[16U];
+ __IOM uint32_t ISPR[16U]; /*!< Offset: 0x100 (R/W) Interrupt Set Pending Register */
+ uint32_t RESERVED2[16U];
+ __IOM uint32_t ICPR[16U]; /*!< Offset: 0x180 (R/W) Interrupt Clear Pending Register */
+ uint32_t RESERVED3[16U];
+ __IOM uint32_t IABR[16U]; /*!< Offset: 0x200 (R/W) Interrupt Active bit Register */
+ uint32_t RESERVED4[16U];
+ __IOM uint32_t ITNS[16U]; /*!< Offset: 0x280 (R/W) Interrupt Non-Secure State Register */
+ uint32_t RESERVED5[16U];
+ __IOM uint8_t IPR[496U]; /*!< Offset: 0x300 (R/W) Interrupt Priority Register (8Bit wide) */
+ uint32_t RESERVED6[580U];
+ __OM uint32_t STIR; /*!< Offset: 0xE00 ( /W) Software Trigger Interrupt Register */
+} NVIC_Type;
+
+/* Software Triggered Interrupt Register Definitions */
+#define NVIC_STIR_INTID_Pos 0U /*!< STIR: INTLINESNUM Position */
+#define NVIC_STIR_INTID_Msk (0x1FFUL /*<< NVIC_STIR_INTID_Pos*/) /*!< STIR: INTLINESNUM Mask */
+
+/*@} end of group CMSIS_NVIC */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_SCB System Control Block (SCB)
+ \brief Type definitions for the System Control Block Registers
+ @{
+ */
+
+/**
+ \brief Structure type to access the System Control Block (SCB).
+ */
+typedef struct
+{
+ __IM uint32_t CPUID; /*!< Offset: 0x000 (R/ ) CPUID Base Register */
+ __IOM uint32_t ICSR; /*!< Offset: 0x004 (R/W) Interrupt Control and State Register */
+ __IOM uint32_t VTOR; /*!< Offset: 0x008 (R/W) Vector Table Offset Register */
+ __IOM uint32_t AIRCR; /*!< Offset: 0x00C (R/W) Application Interrupt and Reset Control Register */
+ __IOM uint32_t SCR; /*!< Offset: 0x010 (R/W) System Control Register */
+ __IOM uint32_t CCR; /*!< Offset: 0x014 (R/W) Configuration Control Register */
+ __IOM uint8_t SHPR[12U]; /*!< Offset: 0x018 (R/W) System Handlers Priority Registers (4-7, 8-11, 12-15) */
+ __IOM uint32_t SHCSR; /*!< Offset: 0x024 (R/W) System Handler Control and State Register */
+ __IOM uint32_t CFSR; /*!< Offset: 0x028 (R/W) Configurable Fault Status Register */
+ __IOM uint32_t HFSR; /*!< Offset: 0x02C (R/W) HardFault Status Register */
+ __IOM uint32_t DFSR; /*!< Offset: 0x030 (R/W) Debug Fault Status Register */
+ __IOM uint32_t MMFAR; /*!< Offset: 0x034 (R/W) MemManage Fault Address Register */
+ __IOM uint32_t BFAR; /*!< Offset: 0x038 (R/W) BusFault Address Register */
+ __IOM uint32_t AFSR; /*!< Offset: 0x03C (R/W) Auxiliary Fault Status Register */
+ __IM uint32_t ID_PFR[2U]; /*!< Offset: 0x040 (R/ ) Processor Feature Register */
+ __IM uint32_t ID_DFR; /*!< Offset: 0x048 (R/ ) Debug Feature Register */
+ __IM uint32_t ID_ADR; /*!< Offset: 0x04C (R/ ) Auxiliary Feature Register */
+ __IM uint32_t ID_MMFR[4U]; /*!< Offset: 0x050 (R/ ) Memory Model Feature Register */
+ __IM uint32_t ID_ISAR[6U]; /*!< Offset: 0x060 (R/ ) Instruction Set Attributes Register */
+ __IM uint32_t CLIDR; /*!< Offset: 0x078 (R/ ) Cache Level ID register */
+ __IM uint32_t CTR; /*!< Offset: 0x07C (R/ ) Cache Type register */
+ __IM uint32_t CCSIDR; /*!< Offset: 0x080 (R/ ) Cache Size ID Register */
+ __IOM uint32_t CSSELR; /*!< Offset: 0x084 (R/W) Cache Size Selection Register */
+ __IOM uint32_t CPACR; /*!< Offset: 0x088 (R/W) Coprocessor Access Control Register */
+ __IOM uint32_t NSACR; /*!< Offset: 0x08C (R/W) Non-Secure Access Control Register */
+ uint32_t RESERVED3[92U];
+ __OM uint32_t STIR; /*!< Offset: 0x200 ( /W) Software Triggered Interrupt Register */
+ uint32_t RESERVED4[15U];
+ __IM uint32_t MVFR0; /*!< Offset: 0x240 (R/ ) Media and VFP Feature Register 0 */
+ __IM uint32_t MVFR1; /*!< Offset: 0x244 (R/ ) Media and VFP Feature Register 1 */
+ __IM uint32_t MVFR2; /*!< Offset: 0x248 (R/ ) Media and VFP Feature Register 2 */
+ uint32_t RESERVED5[1U];
+ __OM uint32_t ICIALLU; /*!< Offset: 0x250 ( /W) I-Cache Invalidate All to PoU */
+ uint32_t RESERVED6[1U];
+ __OM uint32_t ICIMVAU; /*!< Offset: 0x258 ( /W) I-Cache Invalidate by MVA to PoU */
+ __OM uint32_t DCIMVAC; /*!< Offset: 0x25C ( /W) D-Cache Invalidate by MVA to PoC */
+ __OM uint32_t DCISW; /*!< Offset: 0x260 ( /W) D-Cache Invalidate by Set-way */
+ __OM uint32_t DCCMVAU; /*!< Offset: 0x264 ( /W) D-Cache Clean by MVA to PoU */
+ __OM uint32_t DCCMVAC; /*!< Offset: 0x268 ( /W) D-Cache Clean by MVA to PoC */
+ __OM uint32_t DCCSW; /*!< Offset: 0x26C ( /W) D-Cache Clean by Set-way */
+ __OM uint32_t DCCIMVAC; /*!< Offset: 0x270 ( /W) D-Cache Clean and Invalidate by MVA to PoC */
+ __OM uint32_t DCCISW; /*!< Offset: 0x274 ( /W) D-Cache Clean and Invalidate by Set-way */
+ uint32_t RESERVED7[6U];
+ __IOM uint32_t ITCMCR; /*!< Offset: 0x290 (R/W) Instruction Tightly-Coupled Memory Control Register */
+ __IOM uint32_t DTCMCR; /*!< Offset: 0x294 (R/W) Data Tightly-Coupled Memory Control Registers */
+ __IOM uint32_t AHBPCR; /*!< Offset: 0x298 (R/W) AHBP Control Register */
+ __IOM uint32_t CACR; /*!< Offset: 0x29C (R/W) L1 Cache Control Register */
+ __IOM uint32_t AHBSCR; /*!< Offset: 0x2A0 (R/W) AHB Slave Control Register */
+ uint32_t RESERVED8[1U];
+ __IOM uint32_t ABFSR; /*!< Offset: 0x2A8 (R/W) Auxiliary Bus Fault Status Register */
+} SCB_Type;
+
+/* SCB CPUID Register Definitions */
+#define SCB_CPUID_IMPLEMENTER_Pos 24U /*!< SCB CPUID: IMPLEMENTER Position */
+#define SCB_CPUID_IMPLEMENTER_Msk (0xFFUL << SCB_CPUID_IMPLEMENTER_Pos) /*!< SCB CPUID: IMPLEMENTER Mask */
+
+#define SCB_CPUID_VARIANT_Pos 20U /*!< SCB CPUID: VARIANT Position */
+#define SCB_CPUID_VARIANT_Msk (0xFUL << SCB_CPUID_VARIANT_Pos) /*!< SCB CPUID: VARIANT Mask */
+
+#define SCB_CPUID_ARCHITECTURE_Pos 16U /*!< SCB CPUID: ARCHITECTURE Position */
+#define SCB_CPUID_ARCHITECTURE_Msk (0xFUL << SCB_CPUID_ARCHITECTURE_Pos) /*!< SCB CPUID: ARCHITECTURE Mask */
+
+#define SCB_CPUID_PARTNO_Pos 4U /*!< SCB CPUID: PARTNO Position */
+#define SCB_CPUID_PARTNO_Msk (0xFFFUL << SCB_CPUID_PARTNO_Pos) /*!< SCB CPUID: PARTNO Mask */
+
+#define SCB_CPUID_REVISION_Pos 0U /*!< SCB CPUID: REVISION Position */
+#define SCB_CPUID_REVISION_Msk (0xFUL /*<< SCB_CPUID_REVISION_Pos*/) /*!< SCB CPUID: REVISION Mask */
+
+/* SCB Interrupt Control State Register Definitions */
+#define SCB_ICSR_PENDNMISET_Pos 31U /*!< SCB ICSR: PENDNMISET Position */
+#define SCB_ICSR_PENDNMISET_Msk (1UL << SCB_ICSR_PENDNMISET_Pos) /*!< SCB ICSR: PENDNMISET Mask */
+
+#define SCB_ICSR_NMIPENDSET_Pos SCB_ICSR_PENDNMISET_Pos /*!< SCB ICSR: NMIPENDSET Position, backward compatibility */
+#define SCB_ICSR_NMIPENDSET_Msk SCB_ICSR_PENDNMISET_Msk /*!< SCB ICSR: NMIPENDSET Mask, backward compatibility */
+
+#define SCB_ICSR_PENDNMICLR_Pos 30U /*!< SCB ICSR: PENDNMICLR Position */
+#define SCB_ICSR_PENDNMICLR_Msk (1UL << SCB_ICSR_PENDNMICLR_Pos) /*!< SCB ICSR: PENDNMICLR Mask */
+
+#define SCB_ICSR_PENDSVSET_Pos 28U /*!< SCB ICSR: PENDSVSET Position */
+#define SCB_ICSR_PENDSVSET_Msk (1UL << SCB_ICSR_PENDSVSET_Pos) /*!< SCB ICSR: PENDSVSET Mask */
+
+#define SCB_ICSR_PENDSVCLR_Pos 27U /*!< SCB ICSR: PENDSVCLR Position */
+#define SCB_ICSR_PENDSVCLR_Msk (1UL << SCB_ICSR_PENDSVCLR_Pos) /*!< SCB ICSR: PENDSVCLR Mask */
+
+#define SCB_ICSR_PENDSTSET_Pos 26U /*!< SCB ICSR: PENDSTSET Position */
+#define SCB_ICSR_PENDSTSET_Msk (1UL << SCB_ICSR_PENDSTSET_Pos) /*!< SCB ICSR: PENDSTSET Mask */
+
+#define SCB_ICSR_PENDSTCLR_Pos 25U /*!< SCB ICSR: PENDSTCLR Position */
+#define SCB_ICSR_PENDSTCLR_Msk (1UL << SCB_ICSR_PENDSTCLR_Pos) /*!< SCB ICSR: PENDSTCLR Mask */
+
+#define SCB_ICSR_STTNS_Pos 24U /*!< SCB ICSR: STTNS Position (Security Extension) */
+#define SCB_ICSR_STTNS_Msk (1UL << SCB_ICSR_STTNS_Pos) /*!< SCB ICSR: STTNS Mask (Security Extension) */
+
+#define SCB_ICSR_ISRPREEMPT_Pos 23U /*!< SCB ICSR: ISRPREEMPT Position */
+#define SCB_ICSR_ISRPREEMPT_Msk (1UL << SCB_ICSR_ISRPREEMPT_Pos) /*!< SCB ICSR: ISRPREEMPT Mask */
+
+#define SCB_ICSR_ISRPENDING_Pos 22U /*!< SCB ICSR: ISRPENDING Position */
+#define SCB_ICSR_ISRPENDING_Msk (1UL << SCB_ICSR_ISRPENDING_Pos) /*!< SCB ICSR: ISRPENDING Mask */
+
+#define SCB_ICSR_VECTPENDING_Pos 12U /*!< SCB ICSR: VECTPENDING Position */
+#define SCB_ICSR_VECTPENDING_Msk (0x1FFUL << SCB_ICSR_VECTPENDING_Pos) /*!< SCB ICSR: VECTPENDING Mask */
+
+#define SCB_ICSR_RETTOBASE_Pos 11U /*!< SCB ICSR: RETTOBASE Position */
+#define SCB_ICSR_RETTOBASE_Msk (1UL << SCB_ICSR_RETTOBASE_Pos) /*!< SCB ICSR: RETTOBASE Mask */
+
+#define SCB_ICSR_VECTACTIVE_Pos 0U /*!< SCB ICSR: VECTACTIVE Position */
+#define SCB_ICSR_VECTACTIVE_Msk (0x1FFUL /*<< SCB_ICSR_VECTACTIVE_Pos*/) /*!< SCB ICSR: VECTACTIVE Mask */
+
+/* SCB Vector Table Offset Register Definitions */
+#define SCB_VTOR_TBLOFF_Pos 7U /*!< SCB VTOR: TBLOFF Position */
+#define SCB_VTOR_TBLOFF_Msk (0x1FFFFFFUL << SCB_VTOR_TBLOFF_Pos) /*!< SCB VTOR: TBLOFF Mask */
+
+/* SCB Application Interrupt and Reset Control Register Definitions */
+#define SCB_AIRCR_VECTKEY_Pos 16U /*!< SCB AIRCR: VECTKEY Position */
+#define SCB_AIRCR_VECTKEY_Msk (0xFFFFUL << SCB_AIRCR_VECTKEY_Pos) /*!< SCB AIRCR: VECTKEY Mask */
+
+#define SCB_AIRCR_VECTKEYSTAT_Pos 16U /*!< SCB AIRCR: VECTKEYSTAT Position */
+#define SCB_AIRCR_VECTKEYSTAT_Msk (0xFFFFUL << SCB_AIRCR_VECTKEYSTAT_Pos) /*!< SCB AIRCR: VECTKEYSTAT Mask */
+
+#define SCB_AIRCR_ENDIANESS_Pos 15U /*!< SCB AIRCR: ENDIANESS Position */
+#define SCB_AIRCR_ENDIANESS_Msk (1UL << SCB_AIRCR_ENDIANESS_Pos) /*!< SCB AIRCR: ENDIANESS Mask */
+
+#define SCB_AIRCR_PRIS_Pos 14U /*!< SCB AIRCR: PRIS Position */
+#define SCB_AIRCR_PRIS_Msk (1UL << SCB_AIRCR_PRIS_Pos) /*!< SCB AIRCR: PRIS Mask */
+
+#define SCB_AIRCR_BFHFNMINS_Pos 13U /*!< SCB AIRCR: BFHFNMINS Position */
+#define SCB_AIRCR_BFHFNMINS_Msk (1UL << SCB_AIRCR_BFHFNMINS_Pos) /*!< SCB AIRCR: BFHFNMINS Mask */
+
+#define SCB_AIRCR_PRIGROUP_Pos 8U /*!< SCB AIRCR: PRIGROUP Position */
+#define SCB_AIRCR_PRIGROUP_Msk (7UL << SCB_AIRCR_PRIGROUP_Pos) /*!< SCB AIRCR: PRIGROUP Mask */
+
+#define SCB_AIRCR_SYSRESETREQS_Pos 3U /*!< SCB AIRCR: SYSRESETREQS Position */
+#define SCB_AIRCR_SYSRESETREQS_Msk (1UL << SCB_AIRCR_SYSRESETREQS_Pos) /*!< SCB AIRCR: SYSRESETREQS Mask */
+
+#define SCB_AIRCR_SYSRESETREQ_Pos 2U /*!< SCB AIRCR: SYSRESETREQ Position */
+#define SCB_AIRCR_SYSRESETREQ_Msk (1UL << SCB_AIRCR_SYSRESETREQ_Pos) /*!< SCB AIRCR: SYSRESETREQ Mask */
+
+#define SCB_AIRCR_VECTCLRACTIVE_Pos 1U /*!< SCB AIRCR: VECTCLRACTIVE Position */
+#define SCB_AIRCR_VECTCLRACTIVE_Msk (1UL << SCB_AIRCR_VECTCLRACTIVE_Pos) /*!< SCB AIRCR: VECTCLRACTIVE Mask */
+
+/* SCB System Control Register Definitions */
+#define SCB_SCR_SEVONPEND_Pos 4U /*!< SCB SCR: SEVONPEND Position */
+#define SCB_SCR_SEVONPEND_Msk (1UL << SCB_SCR_SEVONPEND_Pos) /*!< SCB SCR: SEVONPEND Mask */
+
+#define SCB_SCR_SLEEPDEEPS_Pos 3U /*!< SCB SCR: SLEEPDEEPS Position */
+#define SCB_SCR_SLEEPDEEPS_Msk (1UL << SCB_SCR_SLEEPDEEPS_Pos) /*!< SCB SCR: SLEEPDEEPS Mask */
+
+#define SCB_SCR_SLEEPDEEP_Pos 2U /*!< SCB SCR: SLEEPDEEP Position */
+#define SCB_SCR_SLEEPDEEP_Msk (1UL << SCB_SCR_SLEEPDEEP_Pos) /*!< SCB SCR: SLEEPDEEP Mask */
+
+#define SCB_SCR_SLEEPONEXIT_Pos 1U /*!< SCB SCR: SLEEPONEXIT Position */
+#define SCB_SCR_SLEEPONEXIT_Msk (1UL << SCB_SCR_SLEEPONEXIT_Pos) /*!< SCB SCR: SLEEPONEXIT Mask */
+
+/* SCB Configuration Control Register Definitions */
+#define SCB_CCR_BP_Pos 18U /*!< SCB CCR: BP Position */
+#define SCB_CCR_BP_Msk (1UL << SCB_CCR_BP_Pos) /*!< SCB CCR: BP Mask */
+
+#define SCB_CCR_IC_Pos 17U /*!< SCB CCR: IC Position */
+#define SCB_CCR_IC_Msk (1UL << SCB_CCR_IC_Pos) /*!< SCB CCR: IC Mask */
+
+#define SCB_CCR_DC_Pos 16U /*!< SCB CCR: DC Position */
+#define SCB_CCR_DC_Msk (1UL << SCB_CCR_DC_Pos) /*!< SCB CCR: DC Mask */
+
+#define SCB_CCR_STKOFHFNMIGN_Pos 10U /*!< SCB CCR: STKOFHFNMIGN Position */
+#define SCB_CCR_STKOFHFNMIGN_Msk (1UL << SCB_CCR_STKOFHFNMIGN_Pos) /*!< SCB CCR: STKOFHFNMIGN Mask */
+
+#define SCB_CCR_BFHFNMIGN_Pos 8U /*!< SCB CCR: BFHFNMIGN Position */
+#define SCB_CCR_BFHFNMIGN_Msk (1UL << SCB_CCR_BFHFNMIGN_Pos) /*!< SCB CCR: BFHFNMIGN Mask */
+
+#define SCB_CCR_DIV_0_TRP_Pos 4U /*!< SCB CCR: DIV_0_TRP Position */
+#define SCB_CCR_DIV_0_TRP_Msk (1UL << SCB_CCR_DIV_0_TRP_Pos) /*!< SCB CCR: DIV_0_TRP Mask */
+
+#define SCB_CCR_UNALIGN_TRP_Pos 3U /*!< SCB CCR: UNALIGN_TRP Position */
+#define SCB_CCR_UNALIGN_TRP_Msk (1UL << SCB_CCR_UNALIGN_TRP_Pos) /*!< SCB CCR: UNALIGN_TRP Mask */
+
+#define SCB_CCR_USERSETMPEND_Pos 1U /*!< SCB CCR: USERSETMPEND Position */
+#define SCB_CCR_USERSETMPEND_Msk (1UL << SCB_CCR_USERSETMPEND_Pos) /*!< SCB CCR: USERSETMPEND Mask */
+
+/* SCB System Handler Control and State Register Definitions */
+#define SCB_SHCSR_HARDFAULTPENDED_Pos 21U /*!< SCB SHCSR: HARDFAULTPENDED Position */
+#define SCB_SHCSR_HARDFAULTPENDED_Msk (1UL << SCB_SHCSR_HARDFAULTPENDED_Pos) /*!< SCB SHCSR: HARDFAULTPENDED Mask */
+
+#define SCB_SHCSR_SECUREFAULTPENDED_Pos 20U /*!< SCB SHCSR: SECUREFAULTPENDED Position */
+#define SCB_SHCSR_SECUREFAULTPENDED_Msk (1UL << SCB_SHCSR_SECUREFAULTPENDED_Pos) /*!< SCB SHCSR: SECUREFAULTPENDED Mask */
+
+#define SCB_SHCSR_SECUREFAULTENA_Pos 19U /*!< SCB SHCSR: SECUREFAULTENA Position */
+#define SCB_SHCSR_SECUREFAULTENA_Msk (1UL << SCB_SHCSR_SECUREFAULTENA_Pos) /*!< SCB SHCSR: SECUREFAULTENA Mask */
+
+#define SCB_SHCSR_USGFAULTENA_Pos 18U /*!< SCB SHCSR: USGFAULTENA Position */
+#define SCB_SHCSR_USGFAULTENA_Msk (1UL << SCB_SHCSR_USGFAULTENA_Pos) /*!< SCB SHCSR: USGFAULTENA Mask */
+
+#define SCB_SHCSR_BUSFAULTENA_Pos 17U /*!< SCB SHCSR: BUSFAULTENA Position */
+#define SCB_SHCSR_BUSFAULTENA_Msk (1UL << SCB_SHCSR_BUSFAULTENA_Pos) /*!< SCB SHCSR: BUSFAULTENA Mask */
+
+#define SCB_SHCSR_MEMFAULTENA_Pos 16U /*!< SCB SHCSR: MEMFAULTENA Position */
+#define SCB_SHCSR_MEMFAULTENA_Msk (1UL << SCB_SHCSR_MEMFAULTENA_Pos) /*!< SCB SHCSR: MEMFAULTENA Mask */
+
+#define SCB_SHCSR_SVCALLPENDED_Pos 15U /*!< SCB SHCSR: SVCALLPENDED Position */
+#define SCB_SHCSR_SVCALLPENDED_Msk (1UL << SCB_SHCSR_SVCALLPENDED_Pos) /*!< SCB SHCSR: SVCALLPENDED Mask */
+
+#define SCB_SHCSR_BUSFAULTPENDED_Pos 14U /*!< SCB SHCSR: BUSFAULTPENDED Position */
+#define SCB_SHCSR_BUSFAULTPENDED_Msk (1UL << SCB_SHCSR_BUSFAULTPENDED_Pos) /*!< SCB SHCSR: BUSFAULTPENDED Mask */
+
+#define SCB_SHCSR_MEMFAULTPENDED_Pos 13U /*!< SCB SHCSR: MEMFAULTPENDED Position */
+#define SCB_SHCSR_MEMFAULTPENDED_Msk (1UL << SCB_SHCSR_MEMFAULTPENDED_Pos) /*!< SCB SHCSR: MEMFAULTPENDED Mask */
+
+#define SCB_SHCSR_USGFAULTPENDED_Pos 12U /*!< SCB SHCSR: USGFAULTPENDED Position */
+#define SCB_SHCSR_USGFAULTPENDED_Msk (1UL << SCB_SHCSR_USGFAULTPENDED_Pos) /*!< SCB SHCSR: USGFAULTPENDED Mask */
+
+#define SCB_SHCSR_SYSTICKACT_Pos 11U /*!< SCB SHCSR: SYSTICKACT Position */
+#define SCB_SHCSR_SYSTICKACT_Msk (1UL << SCB_SHCSR_SYSTICKACT_Pos) /*!< SCB SHCSR: SYSTICKACT Mask */
+
+#define SCB_SHCSR_PENDSVACT_Pos 10U /*!< SCB SHCSR: PENDSVACT Position */
+#define SCB_SHCSR_PENDSVACT_Msk (1UL << SCB_SHCSR_PENDSVACT_Pos) /*!< SCB SHCSR: PENDSVACT Mask */
+
+#define SCB_SHCSR_MONITORACT_Pos 8U /*!< SCB SHCSR: MONITORACT Position */
+#define SCB_SHCSR_MONITORACT_Msk (1UL << SCB_SHCSR_MONITORACT_Pos) /*!< SCB SHCSR: MONITORACT Mask */
+
+#define SCB_SHCSR_SVCALLACT_Pos 7U /*!< SCB SHCSR: SVCALLACT Position */
+#define SCB_SHCSR_SVCALLACT_Msk (1UL << SCB_SHCSR_SVCALLACT_Pos) /*!< SCB SHCSR: SVCALLACT Mask */
+
+#define SCB_SHCSR_NMIACT_Pos 5U /*!< SCB SHCSR: NMIACT Position */
+#define SCB_SHCSR_NMIACT_Msk (1UL << SCB_SHCSR_NMIACT_Pos) /*!< SCB SHCSR: NMIACT Mask */
+
+#define SCB_SHCSR_SECUREFAULTACT_Pos 4U /*!< SCB SHCSR: SECUREFAULTACT Position */
+#define SCB_SHCSR_SECUREFAULTACT_Msk (1UL << SCB_SHCSR_SECUREFAULTACT_Pos) /*!< SCB SHCSR: SECUREFAULTACT Mask */
+
+#define SCB_SHCSR_USGFAULTACT_Pos 3U /*!< SCB SHCSR: USGFAULTACT Position */
+#define SCB_SHCSR_USGFAULTACT_Msk (1UL << SCB_SHCSR_USGFAULTACT_Pos) /*!< SCB SHCSR: USGFAULTACT Mask */
+
+#define SCB_SHCSR_HARDFAULTACT_Pos 2U /*!< SCB SHCSR: HARDFAULTACT Position */
+#define SCB_SHCSR_HARDFAULTACT_Msk (1UL << SCB_SHCSR_HARDFAULTACT_Pos) /*!< SCB SHCSR: HARDFAULTACT Mask */
+
+#define SCB_SHCSR_BUSFAULTACT_Pos 1U /*!< SCB SHCSR: BUSFAULTACT Position */
+#define SCB_SHCSR_BUSFAULTACT_Msk (1UL << SCB_SHCSR_BUSFAULTACT_Pos) /*!< SCB SHCSR: BUSFAULTACT Mask */
+
+#define SCB_SHCSR_MEMFAULTACT_Pos 0U /*!< SCB SHCSR: MEMFAULTACT Position */
+#define SCB_SHCSR_MEMFAULTACT_Msk (1UL /*<< SCB_SHCSR_MEMFAULTACT_Pos*/) /*!< SCB SHCSR: MEMFAULTACT Mask */
+
+/* SCB Configurable Fault Status Register Definitions */
+#define SCB_CFSR_USGFAULTSR_Pos 16U /*!< SCB CFSR: Usage Fault Status Register Position */
+#define SCB_CFSR_USGFAULTSR_Msk (0xFFFFUL << SCB_CFSR_USGFAULTSR_Pos) /*!< SCB CFSR: Usage Fault Status Register Mask */
+
+#define SCB_CFSR_BUSFAULTSR_Pos 8U /*!< SCB CFSR: Bus Fault Status Register Position */
+#define SCB_CFSR_BUSFAULTSR_Msk (0xFFUL << SCB_CFSR_BUSFAULTSR_Pos) /*!< SCB CFSR: Bus Fault Status Register Mask */
+
+#define SCB_CFSR_MEMFAULTSR_Pos 0U /*!< SCB CFSR: Memory Manage Fault Status Register Position */
+#define SCB_CFSR_MEMFAULTSR_Msk (0xFFUL /*<< SCB_CFSR_MEMFAULTSR_Pos*/) /*!< SCB CFSR: Memory Manage Fault Status Register Mask */
+
+/* MemManage Fault Status Register (part of SCB Configurable Fault Status Register) */
+#define SCB_CFSR_MMARVALID_Pos (SCB_SHCSR_MEMFAULTACT_Pos + 7U) /*!< SCB CFSR (MMFSR): MMARVALID Position */
+#define SCB_CFSR_MMARVALID_Msk (1UL << SCB_CFSR_MMARVALID_Pos) /*!< SCB CFSR (MMFSR): MMARVALID Mask */
+
+#define SCB_CFSR_MLSPERR_Pos (SCB_SHCSR_MEMFAULTACT_Pos + 5U) /*!< SCB CFSR (MMFSR): MLSPERR Position */
+#define SCB_CFSR_MLSPERR_Msk (1UL << SCB_CFSR_MLSPERR_Pos) /*!< SCB CFSR (MMFSR): MLSPERR Mask */
+
+#define SCB_CFSR_MSTKERR_Pos (SCB_SHCSR_MEMFAULTACT_Pos + 4U) /*!< SCB CFSR (MMFSR): MSTKERR Position */
+#define SCB_CFSR_MSTKERR_Msk (1UL << SCB_CFSR_MSTKERR_Pos) /*!< SCB CFSR (MMFSR): MSTKERR Mask */
+
+#define SCB_CFSR_MUNSTKERR_Pos (SCB_SHCSR_MEMFAULTACT_Pos + 3U) /*!< SCB CFSR (MMFSR): MUNSTKERR Position */
+#define SCB_CFSR_MUNSTKERR_Msk (1UL << SCB_CFSR_MUNSTKERR_Pos) /*!< SCB CFSR (MMFSR): MUNSTKERR Mask */
+
+#define SCB_CFSR_DACCVIOL_Pos (SCB_SHCSR_MEMFAULTACT_Pos + 1U) /*!< SCB CFSR (MMFSR): DACCVIOL Position */
+#define SCB_CFSR_DACCVIOL_Msk (1UL << SCB_CFSR_DACCVIOL_Pos) /*!< SCB CFSR (MMFSR): DACCVIOL Mask */
+
+#define SCB_CFSR_IACCVIOL_Pos (SCB_SHCSR_MEMFAULTACT_Pos + 0U) /*!< SCB CFSR (MMFSR): IACCVIOL Position */
+#define SCB_CFSR_IACCVIOL_Msk (1UL /*<< SCB_CFSR_IACCVIOL_Pos*/) /*!< SCB CFSR (MMFSR): IACCVIOL Mask */
+
+/* BusFault Status Register (part of SCB Configurable Fault Status Register) */
+#define SCB_CFSR_BFARVALID_Pos (SCB_CFSR_BUSFAULTSR_Pos + 7U) /*!< SCB CFSR (BFSR): BFARVALID Position */
+#define SCB_CFSR_BFARVALID_Msk (1UL << SCB_CFSR_BFARVALID_Pos) /*!< SCB CFSR (BFSR): BFARVALID Mask */
+
+#define SCB_CFSR_LSPERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 5U) /*!< SCB CFSR (BFSR): LSPERR Position */
+#define SCB_CFSR_LSPERR_Msk (1UL << SCB_CFSR_LSPERR_Pos) /*!< SCB CFSR (BFSR): LSPERR Mask */
+
+#define SCB_CFSR_STKERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 4U) /*!< SCB CFSR (BFSR): STKERR Position */
+#define SCB_CFSR_STKERR_Msk (1UL << SCB_CFSR_STKERR_Pos) /*!< SCB CFSR (BFSR): STKERR Mask */
+
+#define SCB_CFSR_UNSTKERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 3U) /*!< SCB CFSR (BFSR): UNSTKERR Position */
+#define SCB_CFSR_UNSTKERR_Msk (1UL << SCB_CFSR_UNSTKERR_Pos) /*!< SCB CFSR (BFSR): UNSTKERR Mask */
+
+#define SCB_CFSR_IMPRECISERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 2U) /*!< SCB CFSR (BFSR): IMPRECISERR Position */
+#define SCB_CFSR_IMPRECISERR_Msk (1UL << SCB_CFSR_IMPRECISERR_Pos) /*!< SCB CFSR (BFSR): IMPRECISERR Mask */
+
+#define SCB_CFSR_PRECISERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 1U) /*!< SCB CFSR (BFSR): PRECISERR Position */
+#define SCB_CFSR_PRECISERR_Msk (1UL << SCB_CFSR_PRECISERR_Pos) /*!< SCB CFSR (BFSR): PRECISERR Mask */
+
+#define SCB_CFSR_IBUSERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 0U) /*!< SCB CFSR (BFSR): IBUSERR Position */
+#define SCB_CFSR_IBUSERR_Msk (1UL << SCB_CFSR_IBUSERR_Pos) /*!< SCB CFSR (BFSR): IBUSERR Mask */
+
+/* UsageFault Status Register (part of SCB Configurable Fault Status Register) */
+#define SCB_CFSR_DIVBYZERO_Pos (SCB_CFSR_USGFAULTSR_Pos + 9U) /*!< SCB CFSR (UFSR): DIVBYZERO Position */
+#define SCB_CFSR_DIVBYZERO_Msk (1UL << SCB_CFSR_DIVBYZERO_Pos) /*!< SCB CFSR (UFSR): DIVBYZERO Mask */
+
+#define SCB_CFSR_UNALIGNED_Pos (SCB_CFSR_USGFAULTSR_Pos + 8U) /*!< SCB CFSR (UFSR): UNALIGNED Position */
+#define SCB_CFSR_UNALIGNED_Msk (1UL << SCB_CFSR_UNALIGNED_Pos) /*!< SCB CFSR (UFSR): UNALIGNED Mask */
+
+#define SCB_CFSR_STKOF_Pos (SCB_CFSR_USGFAULTSR_Pos + 4U) /*!< SCB CFSR (UFSR): STKOF Position */
+#define SCB_CFSR_STKOF_Msk (1UL << SCB_CFSR_STKOF_Pos) /*!< SCB CFSR (UFSR): STKOF Mask */
+
+#define SCB_CFSR_NOCP_Pos (SCB_CFSR_USGFAULTSR_Pos + 3U) /*!< SCB CFSR (UFSR): NOCP Position */
+#define SCB_CFSR_NOCP_Msk (1UL << SCB_CFSR_NOCP_Pos) /*!< SCB CFSR (UFSR): NOCP Mask */
+
+#define SCB_CFSR_INVPC_Pos (SCB_CFSR_USGFAULTSR_Pos + 2U) /*!< SCB CFSR (UFSR): INVPC Position */
+#define SCB_CFSR_INVPC_Msk (1UL << SCB_CFSR_INVPC_Pos) /*!< SCB CFSR (UFSR): INVPC Mask */
+
+#define SCB_CFSR_INVSTATE_Pos (SCB_CFSR_USGFAULTSR_Pos + 1U) /*!< SCB CFSR (UFSR): INVSTATE Position */
+#define SCB_CFSR_INVSTATE_Msk (1UL << SCB_CFSR_INVSTATE_Pos) /*!< SCB CFSR (UFSR): INVSTATE Mask */
+
+#define SCB_CFSR_UNDEFINSTR_Pos (SCB_CFSR_USGFAULTSR_Pos + 0U) /*!< SCB CFSR (UFSR): UNDEFINSTR Position */
+#define SCB_CFSR_UNDEFINSTR_Msk (1UL << SCB_CFSR_UNDEFINSTR_Pos) /*!< SCB CFSR (UFSR): UNDEFINSTR Mask */
+
+/* SCB Hard Fault Status Register Definitions */
+#define SCB_HFSR_DEBUGEVT_Pos 31U /*!< SCB HFSR: DEBUGEVT Position */
+#define SCB_HFSR_DEBUGEVT_Msk (1UL << SCB_HFSR_DEBUGEVT_Pos) /*!< SCB HFSR: DEBUGEVT Mask */
+
+#define SCB_HFSR_FORCED_Pos 30U /*!< SCB HFSR: FORCED Position */
+#define SCB_HFSR_FORCED_Msk (1UL << SCB_HFSR_FORCED_Pos) /*!< SCB HFSR: FORCED Mask */
+
+#define SCB_HFSR_VECTTBL_Pos 1U /*!< SCB HFSR: VECTTBL Position */
+#define SCB_HFSR_VECTTBL_Msk (1UL << SCB_HFSR_VECTTBL_Pos) /*!< SCB HFSR: VECTTBL Mask */
+
+/* SCB Debug Fault Status Register Definitions */
+#define SCB_DFSR_EXTERNAL_Pos 4U /*!< SCB DFSR: EXTERNAL Position */
+#define SCB_DFSR_EXTERNAL_Msk (1UL << SCB_DFSR_EXTERNAL_Pos) /*!< SCB DFSR: EXTERNAL Mask */
+
+#define SCB_DFSR_VCATCH_Pos 3U /*!< SCB DFSR: VCATCH Position */
+#define SCB_DFSR_VCATCH_Msk (1UL << SCB_DFSR_VCATCH_Pos) /*!< SCB DFSR: VCATCH Mask */
+
+#define SCB_DFSR_DWTTRAP_Pos 2U /*!< SCB DFSR: DWTTRAP Position */
+#define SCB_DFSR_DWTTRAP_Msk (1UL << SCB_DFSR_DWTTRAP_Pos) /*!< SCB DFSR: DWTTRAP Mask */
+
+#define SCB_DFSR_BKPT_Pos 1U /*!< SCB DFSR: BKPT Position */
+#define SCB_DFSR_BKPT_Msk (1UL << SCB_DFSR_BKPT_Pos) /*!< SCB DFSR: BKPT Mask */
+
+#define SCB_DFSR_HALTED_Pos 0U /*!< SCB DFSR: HALTED Position */
+#define SCB_DFSR_HALTED_Msk (1UL /*<< SCB_DFSR_HALTED_Pos*/) /*!< SCB DFSR: HALTED Mask */
+
+/* SCB Non-Secure Access Control Register Definitions */
+#define SCB_NSACR_CP11_Pos 11U /*!< SCB NSACR: CP11 Position */
+#define SCB_NSACR_CP11_Msk (1UL << SCB_NSACR_CP11_Pos) /*!< SCB NSACR: CP11 Mask */
+
+#define SCB_NSACR_CP10_Pos 10U /*!< SCB NSACR: CP10 Position */
+#define SCB_NSACR_CP10_Msk (1UL << SCB_NSACR_CP10_Pos) /*!< SCB NSACR: CP10 Mask */
+
+#define SCB_NSACR_CPn_Pos 0U /*!< SCB NSACR: CPn Position */
+#define SCB_NSACR_CPn_Msk (1UL /*<< SCB_NSACR_CPn_Pos*/) /*!< SCB NSACR: CPn Mask */
+
+/* SCB Cache Level ID Register Definitions */
+#define SCB_CLIDR_LOUU_Pos 27U /*!< SCB CLIDR: LoUU Position */
+#define SCB_CLIDR_LOUU_Msk (7UL << SCB_CLIDR_LOUU_Pos) /*!< SCB CLIDR: LoUU Mask */
+
+#define SCB_CLIDR_LOC_Pos 24U /*!< SCB CLIDR: LoC Position */
+#define SCB_CLIDR_LOC_Msk (7UL << SCB_CLIDR_LOC_Pos) /*!< SCB CLIDR: LoC Mask */
+
+/* SCB Cache Type Register Definitions */
+#define SCB_CTR_FORMAT_Pos 29U /*!< SCB CTR: Format Position */
+#define SCB_CTR_FORMAT_Msk (7UL << SCB_CTR_FORMAT_Pos) /*!< SCB CTR: Format Mask */
+
+#define SCB_CTR_CWG_Pos 24U /*!< SCB CTR: CWG Position */
+#define SCB_CTR_CWG_Msk (0xFUL << SCB_CTR_CWG_Pos) /*!< SCB CTR: CWG Mask */
+
+#define SCB_CTR_ERG_Pos 20U /*!< SCB CTR: ERG Position */
+#define SCB_CTR_ERG_Msk (0xFUL << SCB_CTR_ERG_Pos) /*!< SCB CTR: ERG Mask */
+
+#define SCB_CTR_DMINLINE_Pos 16U /*!< SCB CTR: DminLine Position */
+#define SCB_CTR_DMINLINE_Msk (0xFUL << SCB_CTR_DMINLINE_Pos) /*!< SCB CTR: DminLine Mask */
+
+#define SCB_CTR_IMINLINE_Pos 0U /*!< SCB CTR: ImInLine Position */
+#define SCB_CTR_IMINLINE_Msk (0xFUL /*<< SCB_CTR_IMINLINE_Pos*/) /*!< SCB CTR: ImInLine Mask */
+
+/* SCB Cache Size ID Register Definitions */
+#define SCB_CCSIDR_WT_Pos 31U /*!< SCB CCSIDR: WT Position */
+#define SCB_CCSIDR_WT_Msk (1UL << SCB_CCSIDR_WT_Pos) /*!< SCB CCSIDR: WT Mask */
+
+#define SCB_CCSIDR_WB_Pos 30U /*!< SCB CCSIDR: WB Position */
+#define SCB_CCSIDR_WB_Msk (1UL << SCB_CCSIDR_WB_Pos) /*!< SCB CCSIDR: WB Mask */
+
+#define SCB_CCSIDR_RA_Pos 29U /*!< SCB CCSIDR: RA Position */
+#define SCB_CCSIDR_RA_Msk (1UL << SCB_CCSIDR_RA_Pos) /*!< SCB CCSIDR: RA Mask */
+
+#define SCB_CCSIDR_WA_Pos 28U /*!< SCB CCSIDR: WA Position */
+#define SCB_CCSIDR_WA_Msk (1UL << SCB_CCSIDR_WA_Pos) /*!< SCB CCSIDR: WA Mask */
+
+#define SCB_CCSIDR_NUMSETS_Pos 13U /*!< SCB CCSIDR: NumSets Position */
+#define SCB_CCSIDR_NUMSETS_Msk (0x7FFFUL << SCB_CCSIDR_NUMSETS_Pos) /*!< SCB CCSIDR: NumSets Mask */
+
+#define SCB_CCSIDR_ASSOCIATIVITY_Pos 3U /*!< SCB CCSIDR: Associativity Position */
+#define SCB_CCSIDR_ASSOCIATIVITY_Msk (0x3FFUL << SCB_CCSIDR_ASSOCIATIVITY_Pos) /*!< SCB CCSIDR: Associativity Mask */
+
+#define SCB_CCSIDR_LINESIZE_Pos 0U /*!< SCB CCSIDR: LineSize Position */
+#define SCB_CCSIDR_LINESIZE_Msk (7UL /*<< SCB_CCSIDR_LINESIZE_Pos*/) /*!< SCB CCSIDR: LineSize Mask */
+
+/* SCB Cache Size Selection Register Definitions */
+#define SCB_CSSELR_LEVEL_Pos 1U /*!< SCB CSSELR: Level Position */
+#define SCB_CSSELR_LEVEL_Msk (7UL << SCB_CSSELR_LEVEL_Pos) /*!< SCB CSSELR: Level Mask */
+
+#define SCB_CSSELR_IND_Pos 0U /*!< SCB CSSELR: InD Position */
+#define SCB_CSSELR_IND_Msk (1UL /*<< SCB_CSSELR_IND_Pos*/) /*!< SCB CSSELR: InD Mask */
+
+/* SCB Software Triggered Interrupt Register Definitions */
+#define SCB_STIR_INTID_Pos 0U /*!< SCB STIR: INTID Position */
+#define SCB_STIR_INTID_Msk (0x1FFUL /*<< SCB_STIR_INTID_Pos*/) /*!< SCB STIR: INTID Mask */
+
+/* SCB D-Cache Invalidate by Set-way Register Definitions */
+#define SCB_DCISW_WAY_Pos 30U /*!< SCB DCISW: Way Position */
+#define SCB_DCISW_WAY_Msk (3UL << SCB_DCISW_WAY_Pos) /*!< SCB DCISW: Way Mask */
+
+#define SCB_DCISW_SET_Pos 5U /*!< SCB DCISW: Set Position */
+#define SCB_DCISW_SET_Msk (0x1FFUL << SCB_DCISW_SET_Pos) /*!< SCB DCISW: Set Mask */
+
+/* SCB D-Cache Clean by Set-way Register Definitions */
+#define SCB_DCCSW_WAY_Pos 30U /*!< SCB DCCSW: Way Position */
+#define SCB_DCCSW_WAY_Msk (3UL << SCB_DCCSW_WAY_Pos) /*!< SCB DCCSW: Way Mask */
+
+#define SCB_DCCSW_SET_Pos 5U /*!< SCB DCCSW: Set Position */
+#define SCB_DCCSW_SET_Msk (0x1FFUL << SCB_DCCSW_SET_Pos) /*!< SCB DCCSW: Set Mask */
+
+/* SCB D-Cache Clean and Invalidate by Set-way Register Definitions */
+#define SCB_DCCISW_WAY_Pos 30U /*!< SCB DCCISW: Way Position */
+#define SCB_DCCISW_WAY_Msk (3UL << SCB_DCCISW_WAY_Pos) /*!< SCB DCCISW: Way Mask */
+
+#define SCB_DCCISW_SET_Pos 5U /*!< SCB DCCISW: Set Position */
+#define SCB_DCCISW_SET_Msk (0x1FFUL << SCB_DCCISW_SET_Pos) /*!< SCB DCCISW: Set Mask */
+
+/* Instruction Tightly-Coupled Memory Control Register Definitions */
+#define SCB_ITCMCR_SZ_Pos 3U /*!< SCB ITCMCR: SZ Position */
+#define SCB_ITCMCR_SZ_Msk (0xFUL << SCB_ITCMCR_SZ_Pos) /*!< SCB ITCMCR: SZ Mask */
+
+#define SCB_ITCMCR_RETEN_Pos 2U /*!< SCB ITCMCR: RETEN Position */
+#define SCB_ITCMCR_RETEN_Msk (1UL << SCB_ITCMCR_RETEN_Pos) /*!< SCB ITCMCR: RETEN Mask */
+
+#define SCB_ITCMCR_RMW_Pos 1U /*!< SCB ITCMCR: RMW Position */
+#define SCB_ITCMCR_RMW_Msk (1UL << SCB_ITCMCR_RMW_Pos) /*!< SCB ITCMCR: RMW Mask */
+
+#define SCB_ITCMCR_EN_Pos 0U /*!< SCB ITCMCR: EN Position */
+#define SCB_ITCMCR_EN_Msk (1UL /*<< SCB_ITCMCR_EN_Pos*/) /*!< SCB ITCMCR: EN Mask */
+
+/* Data Tightly-Coupled Memory Control Register Definitions */
+#define SCB_DTCMCR_SZ_Pos 3U /*!< SCB DTCMCR: SZ Position */
+#define SCB_DTCMCR_SZ_Msk (0xFUL << SCB_DTCMCR_SZ_Pos) /*!< SCB DTCMCR: SZ Mask */
+
+#define SCB_DTCMCR_RETEN_Pos 2U /*!< SCB DTCMCR: RETEN Position */
+#define SCB_DTCMCR_RETEN_Msk (1UL << SCB_DTCMCR_RETEN_Pos) /*!< SCB DTCMCR: RETEN Mask */
+
+#define SCB_DTCMCR_RMW_Pos 1U /*!< SCB DTCMCR: RMW Position */
+#define SCB_DTCMCR_RMW_Msk (1UL << SCB_DTCMCR_RMW_Pos) /*!< SCB DTCMCR: RMW Mask */
+
+#define SCB_DTCMCR_EN_Pos 0U /*!< SCB DTCMCR: EN Position */
+#define SCB_DTCMCR_EN_Msk (1UL /*<< SCB_DTCMCR_EN_Pos*/) /*!< SCB DTCMCR: EN Mask */
+
+/* AHBP Control Register Definitions */
+#define SCB_AHBPCR_SZ_Pos 1U /*!< SCB AHBPCR: SZ Position */
+#define SCB_AHBPCR_SZ_Msk (7UL << SCB_AHBPCR_SZ_Pos) /*!< SCB AHBPCR: SZ Mask */
+
+#define SCB_AHBPCR_EN_Pos 0U /*!< SCB AHBPCR: EN Position */
+#define SCB_AHBPCR_EN_Msk (1UL /*<< SCB_AHBPCR_EN_Pos*/) /*!< SCB AHBPCR: EN Mask */
+
+/* L1 Cache Control Register Definitions */
+#define SCB_CACR_FORCEWT_Pos 2U /*!< SCB CACR: FORCEWT Position */
+#define SCB_CACR_FORCEWT_Msk (1UL << SCB_CACR_FORCEWT_Pos) /*!< SCB CACR: FORCEWT Mask */
+
+#define SCB_CACR_ECCEN_Pos 1U /*!< SCB CACR: ECCEN Position */
+#define SCB_CACR_ECCEN_Msk (1UL << SCB_CACR_ECCEN_Pos) /*!< SCB CACR: ECCEN Mask */
+
+#define SCB_CACR_SIWT_Pos 0U /*!< SCB CACR: SIWT Position */
+#define SCB_CACR_SIWT_Msk (1UL /*<< SCB_CACR_SIWT_Pos*/) /*!< SCB CACR: SIWT Mask */
+
+/* AHBS Control Register Definitions */
+#define SCB_AHBSCR_INITCOUNT_Pos 11U /*!< SCB AHBSCR: INITCOUNT Position */
+#define SCB_AHBSCR_INITCOUNT_Msk (0x1FUL << SCB_AHBPCR_INITCOUNT_Pos) /*!< SCB AHBSCR: INITCOUNT Mask */
+
+#define SCB_AHBSCR_TPRI_Pos 2U /*!< SCB AHBSCR: TPRI Position */
+#define SCB_AHBSCR_TPRI_Msk (0x1FFUL << SCB_AHBPCR_TPRI_Pos) /*!< SCB AHBSCR: TPRI Mask */
+
+#define SCB_AHBSCR_CTL_Pos 0U /*!< SCB AHBSCR: CTL Position*/
+#define SCB_AHBSCR_CTL_Msk (3UL /*<< SCB_AHBPCR_CTL_Pos*/) /*!< SCB AHBSCR: CTL Mask */
+
+/* Auxiliary Bus Fault Status Register Definitions */
+#define SCB_ABFSR_AXIMTYPE_Pos 8U /*!< SCB ABFSR: AXIMTYPE Position*/
+#define SCB_ABFSR_AXIMTYPE_Msk (3UL << SCB_ABFSR_AXIMTYPE_Pos) /*!< SCB ABFSR: AXIMTYPE Mask */
+
+#define SCB_ABFSR_EPPB_Pos 4U /*!< SCB ABFSR: EPPB Position*/
+#define SCB_ABFSR_EPPB_Msk (1UL << SCB_ABFSR_EPPB_Pos) /*!< SCB ABFSR: EPPB Mask */
+
+#define SCB_ABFSR_AXIM_Pos 3U /*!< SCB ABFSR: AXIM Position*/
+#define SCB_ABFSR_AXIM_Msk (1UL << SCB_ABFSR_AXIM_Pos) /*!< SCB ABFSR: AXIM Mask */
+
+#define SCB_ABFSR_AHBP_Pos 2U /*!< SCB ABFSR: AHBP Position*/
+#define SCB_ABFSR_AHBP_Msk (1UL << SCB_ABFSR_AHBP_Pos) /*!< SCB ABFSR: AHBP Mask */
+
+#define SCB_ABFSR_DTCM_Pos 1U /*!< SCB ABFSR: DTCM Position*/
+#define SCB_ABFSR_DTCM_Msk (1UL << SCB_ABFSR_DTCM_Pos) /*!< SCB ABFSR: DTCM Mask */
+
+#define SCB_ABFSR_ITCM_Pos 0U /*!< SCB ABFSR: ITCM Position*/
+#define SCB_ABFSR_ITCM_Msk (1UL /*<< SCB_ABFSR_ITCM_Pos*/) /*!< SCB ABFSR: ITCM Mask */
+
+/*@} end of group CMSIS_SCB */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_SCnSCB System Controls not in SCB (SCnSCB)
+ \brief Type definitions for the System Control and ID Register not in the SCB
+ @{
+ */
+
+/**
+ \brief Structure type to access the System Control and ID Register not in the SCB.
+ */
+typedef struct
+{
+ uint32_t RESERVED0[1U];
+ __IM uint32_t ICTR; /*!< Offset: 0x004 (R/ ) Interrupt Controller Type Register */
+ __IOM uint32_t ACTLR; /*!< Offset: 0x008 (R/W) Auxiliary Control Register */
+ __IOM uint32_t CPPWR; /*!< Offset: 0x00C (R/W) Coprocessor Power Control Register */
+} SCnSCB_Type;
+
+/* Interrupt Controller Type Register Definitions */
+#define SCnSCB_ICTR_INTLINESNUM_Pos 0U /*!< ICTR: INTLINESNUM Position */
+#define SCnSCB_ICTR_INTLINESNUM_Msk (0xFUL /*<< SCnSCB_ICTR_INTLINESNUM_Pos*/) /*!< ICTR: INTLINESNUM Mask */
+
+/*@} end of group CMSIS_SCnotSCB */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_SysTick System Tick Timer (SysTick)
+ \brief Type definitions for the System Timer Registers.
+ @{
+ */
+
+/**
+ \brief Structure type to access the System Timer (SysTick).
+ */
+typedef struct
+{
+ __IOM uint32_t CTRL; /*!< Offset: 0x000 (R/W) SysTick Control and Status Register */
+ __IOM uint32_t LOAD; /*!< Offset: 0x004 (R/W) SysTick Reload Value Register */
+ __IOM uint32_t VAL; /*!< Offset: 0x008 (R/W) SysTick Current Value Register */
+ __IM uint32_t CALIB; /*!< Offset: 0x00C (R/ ) SysTick Calibration Register */
+} SysTick_Type;
+
+/* SysTick Control / Status Register Definitions */
+#define SysTick_CTRL_COUNTFLAG_Pos 16U /*!< SysTick CTRL: COUNTFLAG Position */
+#define SysTick_CTRL_COUNTFLAG_Msk (1UL << SysTick_CTRL_COUNTFLAG_Pos) /*!< SysTick CTRL: COUNTFLAG Mask */
+
+#define SysTick_CTRL_CLKSOURCE_Pos 2U /*!< SysTick CTRL: CLKSOURCE Position */
+#define SysTick_CTRL_CLKSOURCE_Msk (1UL << SysTick_CTRL_CLKSOURCE_Pos) /*!< SysTick CTRL: CLKSOURCE Mask */
+
+#define SysTick_CTRL_TICKINT_Pos 1U /*!< SysTick CTRL: TICKINT Position */
+#define SysTick_CTRL_TICKINT_Msk (1UL << SysTick_CTRL_TICKINT_Pos) /*!< SysTick CTRL: TICKINT Mask */
+
+#define SysTick_CTRL_ENABLE_Pos 0U /*!< SysTick CTRL: ENABLE Position */
+#define SysTick_CTRL_ENABLE_Msk (1UL /*<< SysTick_CTRL_ENABLE_Pos*/) /*!< SysTick CTRL: ENABLE Mask */
+
+/* SysTick Reload Register Definitions */
+#define SysTick_LOAD_RELOAD_Pos 0U /*!< SysTick LOAD: RELOAD Position */
+#define SysTick_LOAD_RELOAD_Msk (0xFFFFFFUL /*<< SysTick_LOAD_RELOAD_Pos*/) /*!< SysTick LOAD: RELOAD Mask */
+
+/* SysTick Current Register Definitions */
+#define SysTick_VAL_CURRENT_Pos 0U /*!< SysTick VAL: CURRENT Position */
+#define SysTick_VAL_CURRENT_Msk (0xFFFFFFUL /*<< SysTick_VAL_CURRENT_Pos*/) /*!< SysTick VAL: CURRENT Mask */
+
+/* SysTick Calibration Register Definitions */
+#define SysTick_CALIB_NOREF_Pos 31U /*!< SysTick CALIB: NOREF Position */
+#define SysTick_CALIB_NOREF_Msk (1UL << SysTick_CALIB_NOREF_Pos) /*!< SysTick CALIB: NOREF Mask */
+
+#define SysTick_CALIB_SKEW_Pos 30U /*!< SysTick CALIB: SKEW Position */
+#define SysTick_CALIB_SKEW_Msk (1UL << SysTick_CALIB_SKEW_Pos) /*!< SysTick CALIB: SKEW Mask */
+
+#define SysTick_CALIB_TENMS_Pos 0U /*!< SysTick CALIB: TENMS Position */
+#define SysTick_CALIB_TENMS_Msk (0xFFFFFFUL /*<< SysTick_CALIB_TENMS_Pos*/) /*!< SysTick CALIB: TENMS Mask */
+
+/*@} end of group CMSIS_SysTick */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_ITM Instrumentation Trace Macrocell (ITM)
+ \brief Type definitions for the Instrumentation Trace Macrocell (ITM)
+ @{
+ */
+
+/**
+ \brief Structure type to access the Instrumentation Trace Macrocell Register (ITM).
+ */
+typedef struct
+{
+ __OM union
+ {
+ __OM uint8_t u8; /*!< Offset: 0x000 ( /W) ITM Stimulus Port 8-bit */
+ __OM uint16_t u16; /*!< Offset: 0x000 ( /W) ITM Stimulus Port 16-bit */
+ __OM uint32_t u32; /*!< Offset: 0x000 ( /W) ITM Stimulus Port 32-bit */
+ } PORT [32U]; /*!< Offset: 0x000 ( /W) ITM Stimulus Port Registers */
+ uint32_t RESERVED0[864U];
+ __IOM uint32_t TER; /*!< Offset: 0xE00 (R/W) ITM Trace Enable Register */
+ uint32_t RESERVED1[15U];
+ __IOM uint32_t TPR; /*!< Offset: 0xE40 (R/W) ITM Trace Privilege Register */
+ uint32_t RESERVED2[15U];
+ __IOM uint32_t TCR; /*!< Offset: 0xE80 (R/W) ITM Trace Control Register */
+ uint32_t RESERVED3[29U];
+ __OM uint32_t IWR; /*!< Offset: 0xEF8 ( /W) ITM Integration Write Register */
+ __IM uint32_t IRR; /*!< Offset: 0xEFC (R/ ) ITM Integration Read Register */
+ __IOM uint32_t IMCR; /*!< Offset: 0xF00 (R/W) ITM Integration Mode Control Register */
+ uint32_t RESERVED4[43U];
+ __OM uint32_t LAR; /*!< Offset: 0xFB0 ( /W) ITM Lock Access Register */
+ __IM uint32_t LSR; /*!< Offset: 0xFB4 (R/ ) ITM Lock Status Register */
+ uint32_t RESERVED5[1U];
+ __IM uint32_t DEVARCH; /*!< Offset: 0xFBC (R/ ) ITM Device Architecture Register */
+ uint32_t RESERVED6[4U];
+ __IM uint32_t PID4; /*!< Offset: 0xFD0 (R/ ) ITM Peripheral Identification Register #4 */
+ __IM uint32_t PID5; /*!< Offset: 0xFD4 (R/ ) ITM Peripheral Identification Register #5 */
+ __IM uint32_t PID6; /*!< Offset: 0xFD8 (R/ ) ITM Peripheral Identification Register #6 */
+ __IM uint32_t PID7; /*!< Offset: 0xFDC (R/ ) ITM Peripheral Identification Register #7 */
+ __IM uint32_t PID0; /*!< Offset: 0xFE0 (R/ ) ITM Peripheral Identification Register #0 */
+ __IM uint32_t PID1; /*!< Offset: 0xFE4 (R/ ) ITM Peripheral Identification Register #1 */
+ __IM uint32_t PID2; /*!< Offset: 0xFE8 (R/ ) ITM Peripheral Identification Register #2 */
+ __IM uint32_t PID3; /*!< Offset: 0xFEC (R/ ) ITM Peripheral Identification Register #3 */
+ __IM uint32_t CID0; /*!< Offset: 0xFF0 (R/ ) ITM Component Identification Register #0 */
+ __IM uint32_t CID1; /*!< Offset: 0xFF4 (R/ ) ITM Component Identification Register #1 */
+ __IM uint32_t CID2; /*!< Offset: 0xFF8 (R/ ) ITM Component Identification Register #2 */
+ __IM uint32_t CID3; /*!< Offset: 0xFFC (R/ ) ITM Component Identification Register #3 */
+} ITM_Type;
+
+/* ITM Stimulus Port Register Definitions */
+#define ITM_STIM_DISABLED_Pos 1U /*!< ITM STIM: DISABLED Position */
+#define ITM_STIM_DISABLED_Msk (0x1UL << ITM_STIM_DISABLED_Pos) /*!< ITM STIM: DISABLED Mask */
+
+#define ITM_STIM_FIFOREADY_Pos 0U /*!< ITM STIM: FIFOREADY Position */
+#define ITM_STIM_FIFOREADY_Msk (0x1UL /*<< ITM_STIM_FIFOREADY_Pos*/) /*!< ITM STIM: FIFOREADY Mask */
+
+/* ITM Trace Privilege Register Definitions */
+#define ITM_TPR_PRIVMASK_Pos 0U /*!< ITM TPR: PRIVMASK Position */
+#define ITM_TPR_PRIVMASK_Msk (0xFUL /*<< ITM_TPR_PRIVMASK_Pos*/) /*!< ITM TPR: PRIVMASK Mask */
+
+/* ITM Trace Control Register Definitions */
+#define ITM_TCR_BUSY_Pos 23U /*!< ITM TCR: BUSY Position */
+#define ITM_TCR_BUSY_Msk (1UL << ITM_TCR_BUSY_Pos) /*!< ITM TCR: BUSY Mask */
+
+#define ITM_TCR_TRACEBUSID_Pos 16U /*!< ITM TCR: ATBID Position */
+#define ITM_TCR_TRACEBUSID_Msk (0x7FUL << ITM_TCR_TRACEBUSID_Pos) /*!< ITM TCR: ATBID Mask */
+
+#define ITM_TCR_GTSFREQ_Pos 10U /*!< ITM TCR: Global timestamp frequency Position */
+#define ITM_TCR_GTSFREQ_Msk (3UL << ITM_TCR_GTSFREQ_Pos) /*!< ITM TCR: Global timestamp frequency Mask */
+
+#define ITM_TCR_TSPRESCALE_Pos 8U /*!< ITM TCR: TSPRESCALE Position */
+#define ITM_TCR_TSPRESCALE_Msk (3UL << ITM_TCR_TSPRESCALE_Pos) /*!< ITM TCR: TSPRESCALE Mask */
+
+#define ITM_TCR_STALLENA_Pos 5U /*!< ITM TCR: STALLENA Position */
+#define ITM_TCR_STALLENA_Msk (1UL << ITM_TCR_STALLENA_Pos) /*!< ITM TCR: STALLENA Mask */
+
+#define ITM_TCR_SWOENA_Pos 4U /*!< ITM TCR: SWOENA Position */
+#define ITM_TCR_SWOENA_Msk (1UL << ITM_TCR_SWOENA_Pos) /*!< ITM TCR: SWOENA Mask */
+
+#define ITM_TCR_DWTENA_Pos 3U /*!< ITM TCR: DWTENA Position */
+#define ITM_TCR_DWTENA_Msk (1UL << ITM_TCR_DWTENA_Pos) /*!< ITM TCR: DWTENA Mask */
+
+#define ITM_TCR_SYNCENA_Pos 2U /*!< ITM TCR: SYNCENA Position */
+#define ITM_TCR_SYNCENA_Msk (1UL << ITM_TCR_SYNCENA_Pos) /*!< ITM TCR: SYNCENA Mask */
+
+#define ITM_TCR_TSENA_Pos 1U /*!< ITM TCR: TSENA Position */
+#define ITM_TCR_TSENA_Msk (1UL << ITM_TCR_TSENA_Pos) /*!< ITM TCR: TSENA Mask */
+
+#define ITM_TCR_ITMENA_Pos 0U /*!< ITM TCR: ITM Enable bit Position */
+#define ITM_TCR_ITMENA_Msk (1UL /*<< ITM_TCR_ITMENA_Pos*/) /*!< ITM TCR: ITM Enable bit Mask */
+
+/* ITM Integration Write Register Definitions */
+#define ITM_IWR_ATVALIDM_Pos 0U /*!< ITM IWR: ATVALIDM Position */
+#define ITM_IWR_ATVALIDM_Msk (1UL /*<< ITM_IWR_ATVALIDM_Pos*/) /*!< ITM IWR: ATVALIDM Mask */
+
+/* ITM Integration Read Register Definitions */
+#define ITM_IRR_ATREADYM_Pos 0U /*!< ITM IRR: ATREADYM Position */
+#define ITM_IRR_ATREADYM_Msk (1UL /*<< ITM_IRR_ATREADYM_Pos*/) /*!< ITM IRR: ATREADYM Mask */
+
+/* ITM Integration Mode Control Register Definitions */
+#define ITM_IMCR_INTEGRATION_Pos 0U /*!< ITM IMCR: INTEGRATION Position */
+#define ITM_IMCR_INTEGRATION_Msk (1UL /*<< ITM_IMCR_INTEGRATION_Pos*/) /*!< ITM IMCR: INTEGRATION Mask */
+
+/* ITM Lock Status Register Definitions */
+#define ITM_LSR_ByteAcc_Pos 2U /*!< ITM LSR: ByteAcc Position */
+#define ITM_LSR_ByteAcc_Msk (1UL << ITM_LSR_ByteAcc_Pos) /*!< ITM LSR: ByteAcc Mask */
+
+#define ITM_LSR_Access_Pos 1U /*!< ITM LSR: Access Position */
+#define ITM_LSR_Access_Msk (1UL << ITM_LSR_Access_Pos) /*!< ITM LSR: Access Mask */
+
+#define ITM_LSR_Present_Pos 0U /*!< ITM LSR: Present Position */
+#define ITM_LSR_Present_Msk (1UL /*<< ITM_LSR_Present_Pos*/) /*!< ITM LSR: Present Mask */
+
+/*@}*/ /* end of group CMSIS_ITM */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_DWT Data Watchpoint and Trace (DWT)
+ \brief Type definitions for the Data Watchpoint and Trace (DWT)
+ @{
+ */
+
+/**
+ \brief Structure type to access the Data Watchpoint and Trace Register (DWT).
+ */
+typedef struct
+{
+ __IOM uint32_t CTRL; /*!< Offset: 0x000 (R/W) Control Register */
+ __IOM uint32_t CYCCNT; /*!< Offset: 0x004 (R/W) Cycle Count Register */
+ __IOM uint32_t CPICNT; /*!< Offset: 0x008 (R/W) CPI Count Register */
+ __IOM uint32_t EXCCNT; /*!< Offset: 0x00C (R/W) Exception Overhead Count Register */
+ __IOM uint32_t SLEEPCNT; /*!< Offset: 0x010 (R/W) Sleep Count Register */
+ __IOM uint32_t LSUCNT; /*!< Offset: 0x014 (R/W) LSU Count Register */
+ __IOM uint32_t FOLDCNT; /*!< Offset: 0x018 (R/W) Folded-instruction Count Register */
+ __IM uint32_t PCSR; /*!< Offset: 0x01C (R/ ) Program Counter Sample Register */
+ __IOM uint32_t COMP0; /*!< Offset: 0x020 (R/W) Comparator Register 0 */
+ uint32_t RESERVED1[1U];
+ __IOM uint32_t FUNCTION0; /*!< Offset: 0x028 (R/W) Function Register 0 */
+ uint32_t RESERVED2[1U];
+ __IOM uint32_t COMP1; /*!< Offset: 0x030 (R/W) Comparator Register 1 */
+ uint32_t RESERVED3[1U];
+ __IOM uint32_t FUNCTION1; /*!< Offset: 0x038 (R/W) Function Register 1 */
+ uint32_t RESERVED4[1U];
+ __IOM uint32_t COMP2; /*!< Offset: 0x040 (R/W) Comparator Register 2 */
+ uint32_t RESERVED5[1U];
+ __IOM uint32_t FUNCTION2; /*!< Offset: 0x048 (R/W) Function Register 2 */
+ uint32_t RESERVED6[1U];
+ __IOM uint32_t COMP3; /*!< Offset: 0x050 (R/W) Comparator Register 3 */
+ uint32_t RESERVED7[1U];
+ __IOM uint32_t FUNCTION3; /*!< Offset: 0x058 (R/W) Function Register 3 */
+ uint32_t RESERVED8[1U];
+ __IOM uint32_t COMP4; /*!< Offset: 0x060 (R/W) Comparator Register 4 */
+ uint32_t RESERVED9[1U];
+ __IOM uint32_t FUNCTION4; /*!< Offset: 0x068 (R/W) Function Register 4 */
+ uint32_t RESERVED10[1U];
+ __IOM uint32_t COMP5; /*!< Offset: 0x070 (R/W) Comparator Register 5 */
+ uint32_t RESERVED11[1U];
+ __IOM uint32_t FUNCTION5; /*!< Offset: 0x078 (R/W) Function Register 5 */
+ uint32_t RESERVED12[1U];
+ __IOM uint32_t COMP6; /*!< Offset: 0x080 (R/W) Comparator Register 6 */
+ uint32_t RESERVED13[1U];
+ __IOM uint32_t FUNCTION6; /*!< Offset: 0x088 (R/W) Function Register 6 */
+ uint32_t RESERVED14[1U];
+ __IOM uint32_t COMP7; /*!< Offset: 0x090 (R/W) Comparator Register 7 */
+ uint32_t RESERVED15[1U];
+ __IOM uint32_t FUNCTION7; /*!< Offset: 0x098 (R/W) Function Register 7 */
+ uint32_t RESERVED16[1U];
+ __IOM uint32_t COMP8; /*!< Offset: 0x0A0 (R/W) Comparator Register 8 */
+ uint32_t RESERVED17[1U];
+ __IOM uint32_t FUNCTION8; /*!< Offset: 0x0A8 (R/W) Function Register 8 */
+ uint32_t RESERVED18[1U];
+ __IOM uint32_t COMP9; /*!< Offset: 0x0B0 (R/W) Comparator Register 9 */
+ uint32_t RESERVED19[1U];
+ __IOM uint32_t FUNCTION9; /*!< Offset: 0x0B8 (R/W) Function Register 9 */
+ uint32_t RESERVED20[1U];
+ __IOM uint32_t COMP10; /*!< Offset: 0x0C0 (R/W) Comparator Register 10 */
+ uint32_t RESERVED21[1U];
+ __IOM uint32_t FUNCTION10; /*!< Offset: 0x0C8 (R/W) Function Register 10 */
+ uint32_t RESERVED22[1U];
+ __IOM uint32_t COMP11; /*!< Offset: 0x0D0 (R/W) Comparator Register 11 */
+ uint32_t RESERVED23[1U];
+ __IOM uint32_t FUNCTION11; /*!< Offset: 0x0D8 (R/W) Function Register 11 */
+ uint32_t RESERVED24[1U];
+ __IOM uint32_t COMP12; /*!< Offset: 0x0E0 (R/W) Comparator Register 12 */
+ uint32_t RESERVED25[1U];
+ __IOM uint32_t FUNCTION12; /*!< Offset: 0x0E8 (R/W) Function Register 12 */
+ uint32_t RESERVED26[1U];
+ __IOM uint32_t COMP13; /*!< Offset: 0x0F0 (R/W) Comparator Register 13 */
+ uint32_t RESERVED27[1U];
+ __IOM uint32_t FUNCTION13; /*!< Offset: 0x0F8 (R/W) Function Register 13 */
+ uint32_t RESERVED28[1U];
+ __IOM uint32_t COMP14; /*!< Offset: 0x100 (R/W) Comparator Register 14 */
+ uint32_t RESERVED29[1U];
+ __IOM uint32_t FUNCTION14; /*!< Offset: 0x108 (R/W) Function Register 14 */
+ uint32_t RESERVED30[1U];
+ __IOM uint32_t COMP15; /*!< Offset: 0x110 (R/W) Comparator Register 15 */
+ uint32_t RESERVED31[1U];
+ __IOM uint32_t FUNCTION15; /*!< Offset: 0x118 (R/W) Function Register 15 */
+ uint32_t RESERVED32[934U];
+ __IM uint32_t LSR; /*!< Offset: 0xFB4 (R ) Lock Status Register */
+ uint32_t RESERVED33[1U];
+ __IM uint32_t DEVARCH; /*!< Offset: 0xFBC (R/ ) Device Architecture Register */
+} DWT_Type;
+
+/* DWT Control Register Definitions */
+#define DWT_CTRL_NUMCOMP_Pos 28U /*!< DWT CTRL: NUMCOMP Position */
+#define DWT_CTRL_NUMCOMP_Msk (0xFUL << DWT_CTRL_NUMCOMP_Pos) /*!< DWT CTRL: NUMCOMP Mask */
+
+#define DWT_CTRL_NOTRCPKT_Pos 27U /*!< DWT CTRL: NOTRCPKT Position */
+#define DWT_CTRL_NOTRCPKT_Msk (0x1UL << DWT_CTRL_NOTRCPKT_Pos) /*!< DWT CTRL: NOTRCPKT Mask */
+
+#define DWT_CTRL_NOEXTTRIG_Pos 26U /*!< DWT CTRL: NOEXTTRIG Position */
+#define DWT_CTRL_NOEXTTRIG_Msk (0x1UL << DWT_CTRL_NOEXTTRIG_Pos) /*!< DWT CTRL: NOEXTTRIG Mask */
+
+#define DWT_CTRL_NOCYCCNT_Pos 25U /*!< DWT CTRL: NOCYCCNT Position */
+#define DWT_CTRL_NOCYCCNT_Msk (0x1UL << DWT_CTRL_NOCYCCNT_Pos) /*!< DWT CTRL: NOCYCCNT Mask */
+
+#define DWT_CTRL_NOPRFCNT_Pos 24U /*!< DWT CTRL: NOPRFCNT Position */
+#define DWT_CTRL_NOPRFCNT_Msk (0x1UL << DWT_CTRL_NOPRFCNT_Pos) /*!< DWT CTRL: NOPRFCNT Mask */
+
+#define DWT_CTRL_CYCDISS_Pos 23U /*!< DWT CTRL: CYCDISS Position */
+#define DWT_CTRL_CYCDISS_Msk (0x1UL << DWT_CTRL_CYCDISS_Pos) /*!< DWT CTRL: CYCDISS Mask */
+
+#define DWT_CTRL_CYCEVTENA_Pos 22U /*!< DWT CTRL: CYCEVTENA Position */
+#define DWT_CTRL_CYCEVTENA_Msk (0x1UL << DWT_CTRL_CYCEVTENA_Pos) /*!< DWT CTRL: CYCEVTENA Mask */
+
+#define DWT_CTRL_FOLDEVTENA_Pos 21U /*!< DWT CTRL: FOLDEVTENA Position */
+#define DWT_CTRL_FOLDEVTENA_Msk (0x1UL << DWT_CTRL_FOLDEVTENA_Pos) /*!< DWT CTRL: FOLDEVTENA Mask */
+
+#define DWT_CTRL_LSUEVTENA_Pos 20U /*!< DWT CTRL: LSUEVTENA Position */
+#define DWT_CTRL_LSUEVTENA_Msk (0x1UL << DWT_CTRL_LSUEVTENA_Pos) /*!< DWT CTRL: LSUEVTENA Mask */
+
+#define DWT_CTRL_SLEEPEVTENA_Pos 19U /*!< DWT CTRL: SLEEPEVTENA Position */
+#define DWT_CTRL_SLEEPEVTENA_Msk (0x1UL << DWT_CTRL_SLEEPEVTENA_Pos) /*!< DWT CTRL: SLEEPEVTENA Mask */
+
+#define DWT_CTRL_EXCEVTENA_Pos 18U /*!< DWT CTRL: EXCEVTENA Position */
+#define DWT_CTRL_EXCEVTENA_Msk (0x1UL << DWT_CTRL_EXCEVTENA_Pos) /*!< DWT CTRL: EXCEVTENA Mask */
+
+#define DWT_CTRL_CPIEVTENA_Pos 17U /*!< DWT CTRL: CPIEVTENA Position */
+#define DWT_CTRL_CPIEVTENA_Msk (0x1UL << DWT_CTRL_CPIEVTENA_Pos) /*!< DWT CTRL: CPIEVTENA Mask */
+
+#define DWT_CTRL_EXCTRCENA_Pos 16U /*!< DWT CTRL: EXCTRCENA Position */
+#define DWT_CTRL_EXCTRCENA_Msk (0x1UL << DWT_CTRL_EXCTRCENA_Pos) /*!< DWT CTRL: EXCTRCENA Mask */
+
+#define DWT_CTRL_PCSAMPLENA_Pos 12U /*!< DWT CTRL: PCSAMPLENA Position */
+#define DWT_CTRL_PCSAMPLENA_Msk (0x1UL << DWT_CTRL_PCSAMPLENA_Pos) /*!< DWT CTRL: PCSAMPLENA Mask */
+
+#define DWT_CTRL_SYNCTAP_Pos 10U /*!< DWT CTRL: SYNCTAP Position */
+#define DWT_CTRL_SYNCTAP_Msk (0x3UL << DWT_CTRL_SYNCTAP_Pos) /*!< DWT CTRL: SYNCTAP Mask */
+
+#define DWT_CTRL_CYCTAP_Pos 9U /*!< DWT CTRL: CYCTAP Position */
+#define DWT_CTRL_CYCTAP_Msk (0x1UL << DWT_CTRL_CYCTAP_Pos) /*!< DWT CTRL: CYCTAP Mask */
+
+#define DWT_CTRL_POSTINIT_Pos 5U /*!< DWT CTRL: POSTINIT Position */
+#define DWT_CTRL_POSTINIT_Msk (0xFUL << DWT_CTRL_POSTINIT_Pos) /*!< DWT CTRL: POSTINIT Mask */
+
+#define DWT_CTRL_POSTPRESET_Pos 1U /*!< DWT CTRL: POSTPRESET Position */
+#define DWT_CTRL_POSTPRESET_Msk (0xFUL << DWT_CTRL_POSTPRESET_Pos) /*!< DWT CTRL: POSTPRESET Mask */
+
+#define DWT_CTRL_CYCCNTENA_Pos 0U /*!< DWT CTRL: CYCCNTENA Position */
+#define DWT_CTRL_CYCCNTENA_Msk (0x1UL /*<< DWT_CTRL_CYCCNTENA_Pos*/) /*!< DWT CTRL: CYCCNTENA Mask */
+
+/* DWT CPI Count Register Definitions */
+#define DWT_CPICNT_CPICNT_Pos 0U /*!< DWT CPICNT: CPICNT Position */
+#define DWT_CPICNT_CPICNT_Msk (0xFFUL /*<< DWT_CPICNT_CPICNT_Pos*/) /*!< DWT CPICNT: CPICNT Mask */
+
+/* DWT Exception Overhead Count Register Definitions */
+#define DWT_EXCCNT_EXCCNT_Pos 0U /*!< DWT EXCCNT: EXCCNT Position */
+#define DWT_EXCCNT_EXCCNT_Msk (0xFFUL /*<< DWT_EXCCNT_EXCCNT_Pos*/) /*!< DWT EXCCNT: EXCCNT Mask */
+
+/* DWT Sleep Count Register Definitions */
+#define DWT_SLEEPCNT_SLEEPCNT_Pos 0U /*!< DWT SLEEPCNT: SLEEPCNT Position */
+#define DWT_SLEEPCNT_SLEEPCNT_Msk (0xFFUL /*<< DWT_SLEEPCNT_SLEEPCNT_Pos*/) /*!< DWT SLEEPCNT: SLEEPCNT Mask */
+
+/* DWT LSU Count Register Definitions */
+#define DWT_LSUCNT_LSUCNT_Pos 0U /*!< DWT LSUCNT: LSUCNT Position */
+#define DWT_LSUCNT_LSUCNT_Msk (0xFFUL /*<< DWT_LSUCNT_LSUCNT_Pos*/) /*!< DWT LSUCNT: LSUCNT Mask */
+
+/* DWT Folded-instruction Count Register Definitions */
+#define DWT_FOLDCNT_FOLDCNT_Pos 0U /*!< DWT FOLDCNT: FOLDCNT Position */
+#define DWT_FOLDCNT_FOLDCNT_Msk (0xFFUL /*<< DWT_FOLDCNT_FOLDCNT_Pos*/) /*!< DWT FOLDCNT: FOLDCNT Mask */
+
+/* DWT Comparator Function Register Definitions */
+#define DWT_FUNCTION_ID_Pos 27U /*!< DWT FUNCTION: ID Position */
+#define DWT_FUNCTION_ID_Msk (0x1FUL << DWT_FUNCTION_ID_Pos) /*!< DWT FUNCTION: ID Mask */
+
+#define DWT_FUNCTION_MATCHED_Pos 24U /*!< DWT FUNCTION: MATCHED Position */
+#define DWT_FUNCTION_MATCHED_Msk (0x1UL << DWT_FUNCTION_MATCHED_Pos) /*!< DWT FUNCTION: MATCHED Mask */
+
+#define DWT_FUNCTION_DATAVSIZE_Pos 10U /*!< DWT FUNCTION: DATAVSIZE Position */
+#define DWT_FUNCTION_DATAVSIZE_Msk (0x3UL << DWT_FUNCTION_DATAVSIZE_Pos) /*!< DWT FUNCTION: DATAVSIZE Mask */
+
+#define DWT_FUNCTION_ACTION_Pos 4U /*!< DWT FUNCTION: ACTION Position */
+#define DWT_FUNCTION_ACTION_Msk (0x1UL << DWT_FUNCTION_ACTION_Pos) /*!< DWT FUNCTION: ACTION Mask */
+
+#define DWT_FUNCTION_MATCH_Pos 0U /*!< DWT FUNCTION: MATCH Position */
+#define DWT_FUNCTION_MATCH_Msk (0xFUL /*<< DWT_FUNCTION_MATCH_Pos*/) /*!< DWT FUNCTION: MATCH Mask */
+
+/*@}*/ /* end of group CMSIS_DWT */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_TPI Trace Port Interface (TPI)
+ \brief Type definitions for the Trace Port Interface (TPI)
+ @{
+ */
+
+/**
+ \brief Structure type to access the Trace Port Interface Register (TPI).
+ */
+typedef struct
+{
+ __IM uint32_t SSPSR; /*!< Offset: 0x000 (R/ ) Supported Parallel Port Sizes Register */
+ __IOM uint32_t CSPSR; /*!< Offset: 0x004 (R/W) Current Parallel Port Sizes Register */
+ uint32_t RESERVED0[2U];
+ __IOM uint32_t ACPR; /*!< Offset: 0x010 (R/W) Asynchronous Clock Prescaler Register */
+ uint32_t RESERVED1[55U];
+ __IOM uint32_t SPPR; /*!< Offset: 0x0F0 (R/W) Selected Pin Protocol Register */
+ uint32_t RESERVED2[131U];
+ __IM uint32_t FFSR; /*!< Offset: 0x300 (R/ ) Formatter and Flush Status Register */
+ __IOM uint32_t FFCR; /*!< Offset: 0x304 (R/W) Formatter and Flush Control Register */
+ __IM uint32_t FSCR; /*!< Offset: 0x308 (R/ ) Formatter Synchronization Counter Register */
+ uint32_t RESERVED3[759U];
+ __IM uint32_t TRIGGER; /*!< Offset: 0xEE8 (R/ ) TRIGGER */
+ __IM uint32_t FIFO0; /*!< Offset: 0xEEC (R/ ) Integration ETM Data */
+ __IM uint32_t ITATBCTR2; /*!< Offset: 0xEF0 (R/ ) ITATBCTR2 */
+ uint32_t RESERVED4[1U];
+ __IM uint32_t ITATBCTR0; /*!< Offset: 0xEF8 (R/ ) ITATBCTR0 */
+ __IM uint32_t FIFO1; /*!< Offset: 0xEFC (R/ ) Integration ITM Data */
+ __IOM uint32_t ITCTRL; /*!< Offset: 0xF00 (R/W) Integration Mode Control */
+ uint32_t RESERVED5[39U];
+ __IOM uint32_t CLAIMSET; /*!< Offset: 0xFA0 (R/W) Claim tag set */
+ __IOM uint32_t CLAIMCLR; /*!< Offset: 0xFA4 (R/W) Claim tag clear */
+ uint32_t RESERVED7[8U];
+ __IM uint32_t DEVID; /*!< Offset: 0xFC8 (R/ ) TPIU_DEVID */
+ __IM uint32_t DEVTYPE; /*!< Offset: 0xFCC (R/ ) TPIU_DEVTYPE */
+} TPI_Type;
+
+/* TPI Asynchronous Clock Prescaler Register Definitions */
+#define TPI_ACPR_PRESCALER_Pos 0U /*!< TPI ACPR: PRESCALER Position */
+#define TPI_ACPR_PRESCALER_Msk (0x1FFFUL /*<< TPI_ACPR_PRESCALER_Pos*/) /*!< TPI ACPR: PRESCALER Mask */
+
+/* TPI Selected Pin Protocol Register Definitions */
+#define TPI_SPPR_TXMODE_Pos 0U /*!< TPI SPPR: TXMODE Position */
+#define TPI_SPPR_TXMODE_Msk (0x3UL /*<< TPI_SPPR_TXMODE_Pos*/) /*!< TPI SPPR: TXMODE Mask */
+
+/* TPI Formatter and Flush Status Register Definitions */
+#define TPI_FFSR_FtNonStop_Pos 3U /*!< TPI FFSR: FtNonStop Position */
+#define TPI_FFSR_FtNonStop_Msk (0x1UL << TPI_FFSR_FtNonStop_Pos) /*!< TPI FFSR: FtNonStop Mask */
+
+#define TPI_FFSR_TCPresent_Pos 2U /*!< TPI FFSR: TCPresent Position */
+#define TPI_FFSR_TCPresent_Msk (0x1UL << TPI_FFSR_TCPresent_Pos) /*!< TPI FFSR: TCPresent Mask */
+
+#define TPI_FFSR_FtStopped_Pos 1U /*!< TPI FFSR: FtStopped Position */
+#define TPI_FFSR_FtStopped_Msk (0x1UL << TPI_FFSR_FtStopped_Pos) /*!< TPI FFSR: FtStopped Mask */
+
+#define TPI_FFSR_FlInProg_Pos 0U /*!< TPI FFSR: FlInProg Position */
+#define TPI_FFSR_FlInProg_Msk (0x1UL /*<< TPI_FFSR_FlInProg_Pos*/) /*!< TPI FFSR: FlInProg Mask */
+
+/* TPI Formatter and Flush Control Register Definitions */
+#define TPI_FFCR_TrigIn_Pos 8U /*!< TPI FFCR: TrigIn Position */
+#define TPI_FFCR_TrigIn_Msk (0x1UL << TPI_FFCR_TrigIn_Pos) /*!< TPI FFCR: TrigIn Mask */
+
+#define TPI_FFCR_EnFCont_Pos 1U /*!< TPI FFCR: EnFCont Position */
+#define TPI_FFCR_EnFCont_Msk (0x1UL << TPI_FFCR_EnFCont_Pos) /*!< TPI FFCR: EnFCont Mask */
+
+/* TPI TRIGGER Register Definitions */
+#define TPI_TRIGGER_TRIGGER_Pos 0U /*!< TPI TRIGGER: TRIGGER Position */
+#define TPI_TRIGGER_TRIGGER_Msk (0x1UL /*<< TPI_TRIGGER_TRIGGER_Pos*/) /*!< TPI TRIGGER: TRIGGER Mask */
+
+/* TPI Integration ETM Data Register Definitions (FIFO0) */
+#define TPI_FIFO0_ITM_ATVALID_Pos 29U /*!< TPI FIFO0: ITM_ATVALID Position */
+#define TPI_FIFO0_ITM_ATVALID_Msk (0x3UL << TPI_FIFO0_ITM_ATVALID_Pos) /*!< TPI FIFO0: ITM_ATVALID Mask */
+
+#define TPI_FIFO0_ITM_bytecount_Pos 27U /*!< TPI FIFO0: ITM_bytecount Position */
+#define TPI_FIFO0_ITM_bytecount_Msk (0x3UL << TPI_FIFO0_ITM_bytecount_Pos) /*!< TPI FIFO0: ITM_bytecount Mask */
+
+#define TPI_FIFO0_ETM_ATVALID_Pos 26U /*!< TPI FIFO0: ETM_ATVALID Position */
+#define TPI_FIFO0_ETM_ATVALID_Msk (0x3UL << TPI_FIFO0_ETM_ATVALID_Pos) /*!< TPI FIFO0: ETM_ATVALID Mask */
+
+#define TPI_FIFO0_ETM_bytecount_Pos 24U /*!< TPI FIFO0: ETM_bytecount Position */
+#define TPI_FIFO0_ETM_bytecount_Msk (0x3UL << TPI_FIFO0_ETM_bytecount_Pos) /*!< TPI FIFO0: ETM_bytecount Mask */
+
+#define TPI_FIFO0_ETM2_Pos 16U /*!< TPI FIFO0: ETM2 Position */
+#define TPI_FIFO0_ETM2_Msk (0xFFUL << TPI_FIFO0_ETM2_Pos) /*!< TPI FIFO0: ETM2 Mask */
+
+#define TPI_FIFO0_ETM1_Pos 8U /*!< TPI FIFO0: ETM1 Position */
+#define TPI_FIFO0_ETM1_Msk (0xFFUL << TPI_FIFO0_ETM1_Pos) /*!< TPI FIFO0: ETM1 Mask */
+
+#define TPI_FIFO0_ETM0_Pos 0U /*!< TPI FIFO0: ETM0 Position */
+#define TPI_FIFO0_ETM0_Msk (0xFFUL /*<< TPI_FIFO0_ETM0_Pos*/) /*!< TPI FIFO0: ETM0 Mask */
+
+/* TPI ITATBCTR2 Register Definitions */
+#define TPI_ITATBCTR2_ATREADY_Pos 0U /*!< TPI ITATBCTR2: ATREADY Position */
+#define TPI_ITATBCTR2_ATREADY_Msk (0x1UL /*<< TPI_ITATBCTR2_ATREADY_Pos*/) /*!< TPI ITATBCTR2: ATREADY Mask */
+
+/* TPI Integration ITM Data Register Definitions (FIFO1) */
+#define TPI_FIFO1_ITM_ATVALID_Pos 29U /*!< TPI FIFO1: ITM_ATVALID Position */
+#define TPI_FIFO1_ITM_ATVALID_Msk (0x3UL << TPI_FIFO1_ITM_ATVALID_Pos) /*!< TPI FIFO1: ITM_ATVALID Mask */
+
+#define TPI_FIFO1_ITM_bytecount_Pos 27U /*!< TPI FIFO1: ITM_bytecount Position */
+#define TPI_FIFO1_ITM_bytecount_Msk (0x3UL << TPI_FIFO1_ITM_bytecount_Pos) /*!< TPI FIFO1: ITM_bytecount Mask */
+
+#define TPI_FIFO1_ETM_ATVALID_Pos 26U /*!< TPI FIFO1: ETM_ATVALID Position */
+#define TPI_FIFO1_ETM_ATVALID_Msk (0x3UL << TPI_FIFO1_ETM_ATVALID_Pos) /*!< TPI FIFO1: ETM_ATVALID Mask */
+
+#define TPI_FIFO1_ETM_bytecount_Pos 24U /*!< TPI FIFO1: ETM_bytecount Position */
+#define TPI_FIFO1_ETM_bytecount_Msk (0x3UL << TPI_FIFO1_ETM_bytecount_Pos) /*!< TPI FIFO1: ETM_bytecount Mask */
+
+#define TPI_FIFO1_ITM2_Pos 16U /*!< TPI FIFO1: ITM2 Position */
+#define TPI_FIFO1_ITM2_Msk (0xFFUL << TPI_FIFO1_ITM2_Pos) /*!< TPI FIFO1: ITM2 Mask */
+
+#define TPI_FIFO1_ITM1_Pos 8U /*!< TPI FIFO1: ITM1 Position */
+#define TPI_FIFO1_ITM1_Msk (0xFFUL << TPI_FIFO1_ITM1_Pos) /*!< TPI FIFO1: ITM1 Mask */
+
+#define TPI_FIFO1_ITM0_Pos 0U /*!< TPI FIFO1: ITM0 Position */
+#define TPI_FIFO1_ITM0_Msk (0xFFUL /*<< TPI_FIFO1_ITM0_Pos*/) /*!< TPI FIFO1: ITM0 Mask */
+
+/* TPI ITATBCTR0 Register Definitions */
+#define TPI_ITATBCTR0_ATREADY_Pos 0U /*!< TPI ITATBCTR0: ATREADY Position */
+#define TPI_ITATBCTR0_ATREADY_Msk (0x1UL /*<< TPI_ITATBCTR0_ATREADY_Pos*/) /*!< TPI ITATBCTR0: ATREADY Mask */
+
+/* TPI Integration Mode Control Register Definitions */
+#define TPI_ITCTRL_Mode_Pos 0U /*!< TPI ITCTRL: Mode Position */
+#define TPI_ITCTRL_Mode_Msk (0x1UL /*<< TPI_ITCTRL_Mode_Pos*/) /*!< TPI ITCTRL: Mode Mask */
+
+/* TPI DEVID Register Definitions */
+#define TPI_DEVID_NRZVALID_Pos 11U /*!< TPI DEVID: NRZVALID Position */
+#define TPI_DEVID_NRZVALID_Msk (0x1UL << TPI_DEVID_NRZVALID_Pos) /*!< TPI DEVID: NRZVALID Mask */
+
+#define TPI_DEVID_MANCVALID_Pos 10U /*!< TPI DEVID: MANCVALID Position */
+#define TPI_DEVID_MANCVALID_Msk (0x1UL << TPI_DEVID_MANCVALID_Pos) /*!< TPI DEVID: MANCVALID Mask */
+
+#define TPI_DEVID_PTINVALID_Pos 9U /*!< TPI DEVID: PTINVALID Position */
+#define TPI_DEVID_PTINVALID_Msk (0x1UL << TPI_DEVID_PTINVALID_Pos) /*!< TPI DEVID: PTINVALID Mask */
+
+#define TPI_DEVID_MinBufSz_Pos 6U /*!< TPI DEVID: MinBufSz Position */
+#define TPI_DEVID_MinBufSz_Msk (0x7UL << TPI_DEVID_MinBufSz_Pos) /*!< TPI DEVID: MinBufSz Mask */
+
+#define TPI_DEVID_AsynClkIn_Pos 5U /*!< TPI DEVID: AsynClkIn Position */
+#define TPI_DEVID_AsynClkIn_Msk (0x1UL << TPI_DEVID_AsynClkIn_Pos) /*!< TPI DEVID: AsynClkIn Mask */
+
+#define TPI_DEVID_NrTraceInput_Pos 0U /*!< TPI DEVID: NrTraceInput Position */
+#define TPI_DEVID_NrTraceInput_Msk (0x1FUL /*<< TPI_DEVID_NrTraceInput_Pos*/) /*!< TPI DEVID: NrTraceInput Mask */
+
+/* TPI DEVTYPE Register Definitions */
+#define TPI_DEVTYPE_MajorType_Pos 4U /*!< TPI DEVTYPE: MajorType Position */
+#define TPI_DEVTYPE_MajorType_Msk (0xFUL << TPI_DEVTYPE_MajorType_Pos) /*!< TPI DEVTYPE: MajorType Mask */
+
+#define TPI_DEVTYPE_SubType_Pos 0U /*!< TPI DEVTYPE: SubType Position */
+#define TPI_DEVTYPE_SubType_Msk (0xFUL /*<< TPI_DEVTYPE_SubType_Pos*/) /*!< TPI DEVTYPE: SubType Mask */
+
+/*@}*/ /* end of group CMSIS_TPI */
+
+
+#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_MPU Memory Protection Unit (MPU)
+ \brief Type definitions for the Memory Protection Unit (MPU)
+ @{
+ */
+
+/**
+ \brief Structure type to access the Memory Protection Unit (MPU).
+ */
+typedef struct
+{
+ __IM uint32_t TYPE; /*!< Offset: 0x000 (R/ ) MPU Type Register */
+ __IOM uint32_t CTRL; /*!< Offset: 0x004 (R/W) MPU Control Register */
+ __IOM uint32_t RNR; /*!< Offset: 0x008 (R/W) MPU Region Number Register */
+ __IOM uint32_t RBAR; /*!< Offset: 0x00C (R/W) MPU Region Base Address Register */
+ __IOM uint32_t RLAR; /*!< Offset: 0x010 (R/W) MPU Region Limit Address Register */
+ __IOM uint32_t RBAR_A1; /*!< Offset: 0x014 (R/W) MPU Region Base Address Register Alias 1 */
+ __IOM uint32_t RLAR_A1; /*!< Offset: 0x018 (R/W) MPU Region Limit Address Register Alias 1 */
+ __IOM uint32_t RBAR_A2; /*!< Offset: 0x01C (R/W) MPU Region Base Address Register Alias 2 */
+ __IOM uint32_t RLAR_A2; /*!< Offset: 0x020 (R/W) MPU Region Limit Address Register Alias 2 */
+ __IOM uint32_t RBAR_A3; /*!< Offset: 0x024 (R/W) MPU Region Base Address Register Alias 3 */
+ __IOM uint32_t RLAR_A3; /*!< Offset: 0x028 (R/W) MPU Region Limit Address Register Alias 3 */
+ uint32_t RESERVED0[1];
+ union {
+ __IOM uint32_t MAIR[2];
+ struct {
+ __IOM uint32_t MAIR0; /*!< Offset: 0x030 (R/W) MPU Memory Attribute Indirection Register 0 */
+ __IOM uint32_t MAIR1; /*!< Offset: 0x034 (R/W) MPU Memory Attribute Indirection Register 1 */
+ };
+ };
+} MPU_Type;
+
+#define MPU_TYPE_RALIASES 4U
+
+/* MPU Type Register Definitions */
+#define MPU_TYPE_IREGION_Pos 16U /*!< MPU TYPE: IREGION Position */
+#define MPU_TYPE_IREGION_Msk (0xFFUL << MPU_TYPE_IREGION_Pos) /*!< MPU TYPE: IREGION Mask */
+
+#define MPU_TYPE_DREGION_Pos 8U /*!< MPU TYPE: DREGION Position */
+#define MPU_TYPE_DREGION_Msk (0xFFUL << MPU_TYPE_DREGION_Pos) /*!< MPU TYPE: DREGION Mask */
+
+#define MPU_TYPE_SEPARATE_Pos 0U /*!< MPU TYPE: SEPARATE Position */
+#define MPU_TYPE_SEPARATE_Msk (1UL /*<< MPU_TYPE_SEPARATE_Pos*/) /*!< MPU TYPE: SEPARATE Mask */
+
+/* MPU Control Register Definitions */
+#define MPU_CTRL_PRIVDEFENA_Pos 2U /*!< MPU CTRL: PRIVDEFENA Position */
+#define MPU_CTRL_PRIVDEFENA_Msk (1UL << MPU_CTRL_PRIVDEFENA_Pos) /*!< MPU CTRL: PRIVDEFENA Mask */
+
+#define MPU_CTRL_HFNMIENA_Pos 1U /*!< MPU CTRL: HFNMIENA Position */
+#define MPU_CTRL_HFNMIENA_Msk (1UL << MPU_CTRL_HFNMIENA_Pos) /*!< MPU CTRL: HFNMIENA Mask */
+
+#define MPU_CTRL_ENABLE_Pos 0U /*!< MPU CTRL: ENABLE Position */
+#define MPU_CTRL_ENABLE_Msk (1UL /*<< MPU_CTRL_ENABLE_Pos*/) /*!< MPU CTRL: ENABLE Mask */
+
+/* MPU Region Number Register Definitions */
+#define MPU_RNR_REGION_Pos 0U /*!< MPU RNR: REGION Position */
+#define MPU_RNR_REGION_Msk (0xFFUL /*<< MPU_RNR_REGION_Pos*/) /*!< MPU RNR: REGION Mask */
+
+/* MPU Region Base Address Register Definitions */
+#define MPU_RBAR_ADDR_Pos 5U /*!< MPU RBAR: ADDR Position */
+#define MPU_RBAR_ADDR_Msk (0x7FFFFFFUL << MPU_RBAR_ADDR_Pos) /*!< MPU RBAR: ADDR Mask */
+
+#define MPU_RBAR_SH_Pos 3U /*!< MPU RBAR: SH Position */
+#define MPU_RBAR_SH_Msk (0x3UL << MPU_RBAR_SH_Pos) /*!< MPU RBAR: SH Mask */
+
+#define MPU_RBAR_AP_Pos 1U /*!< MPU RBAR: AP Position */
+#define MPU_RBAR_AP_Msk (0x3UL << MPU_RBAR_AP_Pos) /*!< MPU RBAR: AP Mask */
+
+#define MPU_RBAR_XN_Pos 0U /*!< MPU RBAR: XN Position */
+#define MPU_RBAR_XN_Msk (01UL /*<< MPU_RBAR_XN_Pos*/) /*!< MPU RBAR: XN Mask */
+
+/* MPU Region Limit Address Register Definitions */
+#define MPU_RLAR_LIMIT_Pos 5U /*!< MPU RLAR: LIMIT Position */
+#define MPU_RLAR_LIMIT_Msk (0x7FFFFFFUL << MPU_RLAR_LIMIT_Pos) /*!< MPU RLAR: LIMIT Mask */
+
+#define MPU_RLAR_PXN_Pos 4U /*!< MPU RLAR: PXN Position */
+#define MPU_RLAR_PXN_Msk (0x1UL << MPU_RLAR_PXN_Pos) /*!< MPU RLAR: PXN Mask */
+
+#define MPU_RLAR_AttrIndx_Pos 1U /*!< MPU RLAR: AttrIndx Position */
+#define MPU_RLAR_AttrIndx_Msk (0x7UL << MPU_RLAR_AttrIndx_Pos) /*!< MPU RLAR: AttrIndx Mask */
+
+#define MPU_RLAR_EN_Pos 0U /*!< MPU RLAR: Region enable bit Position */
+#define MPU_RLAR_EN_Msk (1UL /*<< MPU_RLAR_EN_Pos*/) /*!< MPU RLAR: Region enable bit Disable Mask */
+
+/* MPU Memory Attribute Indirection Register 0 Definitions */
+#define MPU_MAIR0_Attr3_Pos 24U /*!< MPU MAIR0: Attr3 Position */
+#define MPU_MAIR0_Attr3_Msk (0xFFUL << MPU_MAIR0_Attr3_Pos) /*!< MPU MAIR0: Attr3 Mask */
+
+#define MPU_MAIR0_Attr2_Pos 16U /*!< MPU MAIR0: Attr2 Position */
+#define MPU_MAIR0_Attr2_Msk (0xFFUL << MPU_MAIR0_Attr2_Pos) /*!< MPU MAIR0: Attr2 Mask */
+
+#define MPU_MAIR0_Attr1_Pos 8U /*!< MPU MAIR0: Attr1 Position */
+#define MPU_MAIR0_Attr1_Msk (0xFFUL << MPU_MAIR0_Attr1_Pos) /*!< MPU MAIR0: Attr1 Mask */
+
+#define MPU_MAIR0_Attr0_Pos 0U /*!< MPU MAIR0: Attr0 Position */
+#define MPU_MAIR0_Attr0_Msk (0xFFUL /*<< MPU_MAIR0_Attr0_Pos*/) /*!< MPU MAIR0: Attr0 Mask */
+
+/* MPU Memory Attribute Indirection Register 1 Definitions */
+#define MPU_MAIR1_Attr7_Pos 24U /*!< MPU MAIR1: Attr7 Position */
+#define MPU_MAIR1_Attr7_Msk (0xFFUL << MPU_MAIR1_Attr7_Pos) /*!< MPU MAIR1: Attr7 Mask */
+
+#define MPU_MAIR1_Attr6_Pos 16U /*!< MPU MAIR1: Attr6 Position */
+#define MPU_MAIR1_Attr6_Msk (0xFFUL << MPU_MAIR1_Attr6_Pos) /*!< MPU MAIR1: Attr6 Mask */
+
+#define MPU_MAIR1_Attr5_Pos 8U /*!< MPU MAIR1: Attr5 Position */
+#define MPU_MAIR1_Attr5_Msk (0xFFUL << MPU_MAIR1_Attr5_Pos) /*!< MPU MAIR1: Attr5 Mask */
+
+#define MPU_MAIR1_Attr4_Pos 0U /*!< MPU MAIR1: Attr4 Position */
+#define MPU_MAIR1_Attr4_Msk (0xFFUL /*<< MPU_MAIR1_Attr4_Pos*/) /*!< MPU MAIR1: Attr4 Mask */
+
+/*@} end of group CMSIS_MPU */
+#endif
+
+
+#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U)
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_SAU Security Attribution Unit (SAU)
+ \brief Type definitions for the Security Attribution Unit (SAU)
+ @{
+ */
+
+/**
+ \brief Structure type to access the Security Attribution Unit (SAU).
+ */
+typedef struct
+{
+ __IOM uint32_t CTRL; /*!< Offset: 0x000 (R/W) SAU Control Register */
+ __IM uint32_t TYPE; /*!< Offset: 0x004 (R/ ) SAU Type Register */
+#if defined (__SAUREGION_PRESENT) && (__SAUREGION_PRESENT == 1U)
+ __IOM uint32_t RNR; /*!< Offset: 0x008 (R/W) SAU Region Number Register */
+ __IOM uint32_t RBAR; /*!< Offset: 0x00C (R/W) SAU Region Base Address Register */
+ __IOM uint32_t RLAR; /*!< Offset: 0x010 (R/W) SAU Region Limit Address Register */
+#else
+ uint32_t RESERVED0[3];
+#endif
+ __IOM uint32_t SFSR; /*!< Offset: 0x014 (R/W) Secure Fault Status Register */
+ __IOM uint32_t SFAR; /*!< Offset: 0x018 (R/W) Secure Fault Address Register */
+} SAU_Type;
+
+/* SAU Control Register Definitions */
+#define SAU_CTRL_ALLNS_Pos 1U /*!< SAU CTRL: ALLNS Position */
+#define SAU_CTRL_ALLNS_Msk (1UL << SAU_CTRL_ALLNS_Pos) /*!< SAU CTRL: ALLNS Mask */
+
+#define SAU_CTRL_ENABLE_Pos 0U /*!< SAU CTRL: ENABLE Position */
+#define SAU_CTRL_ENABLE_Msk (1UL /*<< SAU_CTRL_ENABLE_Pos*/) /*!< SAU CTRL: ENABLE Mask */
+
+/* SAU Type Register Definitions */
+#define SAU_TYPE_SREGION_Pos 0U /*!< SAU TYPE: SREGION Position */
+#define SAU_TYPE_SREGION_Msk (0xFFUL /*<< SAU_TYPE_SREGION_Pos*/) /*!< SAU TYPE: SREGION Mask */
+
+#if defined (__SAUREGION_PRESENT) && (__SAUREGION_PRESENT == 1U)
+/* SAU Region Number Register Definitions */
+#define SAU_RNR_REGION_Pos 0U /*!< SAU RNR: REGION Position */
+#define SAU_RNR_REGION_Msk (0xFFUL /*<< SAU_RNR_REGION_Pos*/) /*!< SAU RNR: REGION Mask */
+
+/* SAU Region Base Address Register Definitions */
+#define SAU_RBAR_BADDR_Pos 5U /*!< SAU RBAR: BADDR Position */
+#define SAU_RBAR_BADDR_Msk (0x7FFFFFFUL << SAU_RBAR_BADDR_Pos) /*!< SAU RBAR: BADDR Mask */
+
+/* SAU Region Limit Address Register Definitions */
+#define SAU_RLAR_LADDR_Pos 5U /*!< SAU RLAR: LADDR Position */
+#define SAU_RLAR_LADDR_Msk (0x7FFFFFFUL << SAU_RLAR_LADDR_Pos) /*!< SAU RLAR: LADDR Mask */
+
+#define SAU_RLAR_NSC_Pos 1U /*!< SAU RLAR: NSC Position */
+#define SAU_RLAR_NSC_Msk (1UL << SAU_RLAR_NSC_Pos) /*!< SAU RLAR: NSC Mask */
+
+#define SAU_RLAR_ENABLE_Pos 0U /*!< SAU RLAR: ENABLE Position */
+#define SAU_RLAR_ENABLE_Msk (1UL /*<< SAU_RLAR_ENABLE_Pos*/) /*!< SAU RLAR: ENABLE Mask */
+
+#endif /* defined (__SAUREGION_PRESENT) && (__SAUREGION_PRESENT == 1U) */
+
+/* Secure Fault Status Register Definitions */
+#define SAU_SFSR_LSERR_Pos 7U /*!< SAU SFSR: LSERR Position */
+#define SAU_SFSR_LSERR_Msk (1UL << SAU_SFSR_LSERR_Pos) /*!< SAU SFSR: LSERR Mask */
+
+#define SAU_SFSR_SFARVALID_Pos 6U /*!< SAU SFSR: SFARVALID Position */
+#define SAU_SFSR_SFARVALID_Msk (1UL << SAU_SFSR_SFARVALID_Pos) /*!< SAU SFSR: SFARVALID Mask */
+
+#define SAU_SFSR_LSPERR_Pos 5U /*!< SAU SFSR: LSPERR Position */
+#define SAU_SFSR_LSPERR_Msk (1UL << SAU_SFSR_LSPERR_Pos) /*!< SAU SFSR: LSPERR Mask */
+
+#define SAU_SFSR_INVTRAN_Pos 4U /*!< SAU SFSR: INVTRAN Position */
+#define SAU_SFSR_INVTRAN_Msk (1UL << SAU_SFSR_INVTRAN_Pos) /*!< SAU SFSR: INVTRAN Mask */
+
+#define SAU_SFSR_AUVIOL_Pos 3U /*!< SAU SFSR: AUVIOL Position */
+#define SAU_SFSR_AUVIOL_Msk (1UL << SAU_SFSR_AUVIOL_Pos) /*!< SAU SFSR: AUVIOL Mask */
+
+#define SAU_SFSR_INVER_Pos 2U /*!< SAU SFSR: INVER Position */
+#define SAU_SFSR_INVER_Msk (1UL << SAU_SFSR_INVER_Pos) /*!< SAU SFSR: INVER Mask */
+
+#define SAU_SFSR_INVIS_Pos 1U /*!< SAU SFSR: INVIS Position */
+#define SAU_SFSR_INVIS_Msk (1UL << SAU_SFSR_INVIS_Pos) /*!< SAU SFSR: INVIS Mask */
+
+#define SAU_SFSR_INVEP_Pos 0U /*!< SAU SFSR: INVEP Position */
+#define SAU_SFSR_INVEP_Msk (1UL /*<< SAU_SFSR_INVEP_Pos*/) /*!< SAU SFSR: INVEP Mask */
+
+/*@} end of group CMSIS_SAU */
+#endif /* defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_FPU Floating Point Unit (FPU)
+ \brief Type definitions for the Floating Point Unit (FPU)
+ @{
+ */
+
+/**
+ \brief Structure type to access the Floating Point Unit (FPU).
+ */
+typedef struct
+{
+ uint32_t RESERVED0[1U];
+ __IOM uint32_t FPCCR; /*!< Offset: 0x004 (R/W) Floating-Point Context Control Register */
+ __IOM uint32_t FPCAR; /*!< Offset: 0x008 (R/W) Floating-Point Context Address Register */
+ __IOM uint32_t FPDSCR; /*!< Offset: 0x00C (R/W) Floating-Point Default Status Control Register */
+ __IM uint32_t MVFR0; /*!< Offset: 0x010 (R/ ) Media and FP Feature Register 0 */
+ __IM uint32_t MVFR1; /*!< Offset: 0x014 (R/ ) Media and FP Feature Register 1 */
+} FPU_Type;
+
+/* Floating-Point Context Control Register Definitions */
+#define FPU_FPCCR_ASPEN_Pos 31U /*!< FPCCR: ASPEN bit Position */
+#define FPU_FPCCR_ASPEN_Msk (1UL << FPU_FPCCR_ASPEN_Pos) /*!< FPCCR: ASPEN bit Mask */
+
+#define FPU_FPCCR_LSPEN_Pos 30U /*!< FPCCR: LSPEN Position */
+#define FPU_FPCCR_LSPEN_Msk (1UL << FPU_FPCCR_LSPEN_Pos) /*!< FPCCR: LSPEN bit Mask */
+
+#define FPU_FPCCR_LSPENS_Pos 29U /*!< FPCCR: LSPENS Position */
+#define FPU_FPCCR_LSPENS_Msk (1UL << FPU_FPCCR_LSPENS_Pos) /*!< FPCCR: LSPENS bit Mask */
+
+#define FPU_FPCCR_CLRONRET_Pos 28U /*!< FPCCR: CLRONRET Position */
+#define FPU_FPCCR_CLRONRET_Msk (1UL << FPU_FPCCR_CLRONRET_Pos) /*!< FPCCR: CLRONRET bit Mask */
+
+#define FPU_FPCCR_CLRONRETS_Pos 27U /*!< FPCCR: CLRONRETS Position */
+#define FPU_FPCCR_CLRONRETS_Msk (1UL << FPU_FPCCR_CLRONRETS_Pos) /*!< FPCCR: CLRONRETS bit Mask */
+
+#define FPU_FPCCR_TS_Pos 26U /*!< FPCCR: TS Position */
+#define FPU_FPCCR_TS_Msk (1UL << FPU_FPCCR_TS_Pos) /*!< FPCCR: TS bit Mask */
+
+#define FPU_FPCCR_UFRDY_Pos 10U /*!< FPCCR: UFRDY Position */
+#define FPU_FPCCR_UFRDY_Msk (1UL << FPU_FPCCR_UFRDY_Pos) /*!< FPCCR: UFRDY bit Mask */
+
+#define FPU_FPCCR_SPLIMVIOL_Pos 9U /*!< FPCCR: SPLIMVIOL Position */
+#define FPU_FPCCR_SPLIMVIOL_Msk (1UL << FPU_FPCCR_SPLIMVIOL_Pos) /*!< FPCCR: SPLIMVIOL bit Mask */
+
+#define FPU_FPCCR_MONRDY_Pos 8U /*!< FPCCR: MONRDY Position */
+#define FPU_FPCCR_MONRDY_Msk (1UL << FPU_FPCCR_MONRDY_Pos) /*!< FPCCR: MONRDY bit Mask */
+
+#define FPU_FPCCR_SFRDY_Pos 7U /*!< FPCCR: SFRDY Position */
+#define FPU_FPCCR_SFRDY_Msk (1UL << FPU_FPCCR_SFRDY_Pos) /*!< FPCCR: SFRDY bit Mask */
+
+#define FPU_FPCCR_BFRDY_Pos 6U /*!< FPCCR: BFRDY Position */
+#define FPU_FPCCR_BFRDY_Msk (1UL << FPU_FPCCR_BFRDY_Pos) /*!< FPCCR: BFRDY bit Mask */
+
+#define FPU_FPCCR_MMRDY_Pos 5U /*!< FPCCR: MMRDY Position */
+#define FPU_FPCCR_MMRDY_Msk (1UL << FPU_FPCCR_MMRDY_Pos) /*!< FPCCR: MMRDY bit Mask */
+
+#define FPU_FPCCR_HFRDY_Pos 4U /*!< FPCCR: HFRDY Position */
+#define FPU_FPCCR_HFRDY_Msk (1UL << FPU_FPCCR_HFRDY_Pos) /*!< FPCCR: HFRDY bit Mask */
+
+#define FPU_FPCCR_THREAD_Pos 3U /*!< FPCCR: processor mode bit Position */
+#define FPU_FPCCR_THREAD_Msk (1UL << FPU_FPCCR_THREAD_Pos) /*!< FPCCR: processor mode active bit Mask */
+
+#define FPU_FPCCR_S_Pos 2U /*!< FPCCR: Security status of the FP context bit Position */
+#define FPU_FPCCR_S_Msk (1UL << FPU_FPCCR_S_Pos) /*!< FPCCR: Security status of the FP context bit Mask */
+
+#define FPU_FPCCR_USER_Pos 1U /*!< FPCCR: privilege level bit Position */
+#define FPU_FPCCR_USER_Msk (1UL << FPU_FPCCR_USER_Pos) /*!< FPCCR: privilege level bit Mask */
+
+#define FPU_FPCCR_LSPACT_Pos 0U /*!< FPCCR: Lazy state preservation active bit Position */
+#define FPU_FPCCR_LSPACT_Msk (1UL /*<< FPU_FPCCR_LSPACT_Pos*/) /*!< FPCCR: Lazy state preservation active bit Mask */
+
+/* Floating-Point Context Address Register Definitions */
+#define FPU_FPCAR_ADDRESS_Pos 3U /*!< FPCAR: ADDRESS bit Position */
+#define FPU_FPCAR_ADDRESS_Msk (0x1FFFFFFFUL << FPU_FPCAR_ADDRESS_Pos) /*!< FPCAR: ADDRESS bit Mask */
+
+/* Floating-Point Default Status Control Register Definitions */
+#define FPU_FPDSCR_AHP_Pos 26U /*!< FPDSCR: AHP bit Position */
+#define FPU_FPDSCR_AHP_Msk (1UL << FPU_FPDSCR_AHP_Pos) /*!< FPDSCR: AHP bit Mask */
+
+#define FPU_FPDSCR_DN_Pos 25U /*!< FPDSCR: DN bit Position */
+#define FPU_FPDSCR_DN_Msk (1UL << FPU_FPDSCR_DN_Pos) /*!< FPDSCR: DN bit Mask */
+
+#define FPU_FPDSCR_FZ_Pos 24U /*!< FPDSCR: FZ bit Position */
+#define FPU_FPDSCR_FZ_Msk (1UL << FPU_FPDSCR_FZ_Pos) /*!< FPDSCR: FZ bit Mask */
+
+#define FPU_FPDSCR_RMode_Pos 22U /*!< FPDSCR: RMode bit Position */
+#define FPU_FPDSCR_RMode_Msk (3UL << FPU_FPDSCR_RMode_Pos) /*!< FPDSCR: RMode bit Mask */
+
+/* Media and FP Feature Register 0 Definitions */
+#define FPU_MVFR0_FP_rounding_modes_Pos 28U /*!< MVFR0: FP rounding modes bits Position */
+#define FPU_MVFR0_FP_rounding_modes_Msk (0xFUL << FPU_MVFR0_FP_rounding_modes_Pos) /*!< MVFR0: FP rounding modes bits Mask */
+
+#define FPU_MVFR0_Short_vectors_Pos 24U /*!< MVFR0: Short vectors bits Position */
+#define FPU_MVFR0_Short_vectors_Msk (0xFUL << FPU_MVFR0_Short_vectors_Pos) /*!< MVFR0: Short vectors bits Mask */
+
+#define FPU_MVFR0_Square_root_Pos 20U /*!< MVFR0: Square root bits Position */
+#define FPU_MVFR0_Square_root_Msk (0xFUL << FPU_MVFR0_Square_root_Pos) /*!< MVFR0: Square root bits Mask */
+
+#define FPU_MVFR0_Divide_Pos 16U /*!< MVFR0: Divide bits Position */
+#define FPU_MVFR0_Divide_Msk (0xFUL << FPU_MVFR0_Divide_Pos) /*!< MVFR0: Divide bits Mask */
+
+#define FPU_MVFR0_FP_excep_trapping_Pos 12U /*!< MVFR0: FP exception trapping bits Position */
+#define FPU_MVFR0_FP_excep_trapping_Msk (0xFUL << FPU_MVFR0_FP_excep_trapping_Pos) /*!< MVFR0: FP exception trapping bits Mask */
+
+#define FPU_MVFR0_Double_precision_Pos 8U /*!< MVFR0: Double-precision bits Position */
+#define FPU_MVFR0_Double_precision_Msk (0xFUL << FPU_MVFR0_Double_precision_Pos) /*!< MVFR0: Double-precision bits Mask */
+
+#define FPU_MVFR0_Single_precision_Pos 4U /*!< MVFR0: Single-precision bits Position */
+#define FPU_MVFR0_Single_precision_Msk (0xFUL << FPU_MVFR0_Single_precision_Pos) /*!< MVFR0: Single-precision bits Mask */
+
+#define FPU_MVFR0_A_SIMD_registers_Pos 0U /*!< MVFR0: A_SIMD registers bits Position */
+#define FPU_MVFR0_A_SIMD_registers_Msk (0xFUL /*<< FPU_MVFR0_A_SIMD_registers_Pos*/) /*!< MVFR0: A_SIMD registers bits Mask */
+
+/* Media and FP Feature Register 1 Definitions */
+#define FPU_MVFR1_FP_fused_MAC_Pos 28U /*!< MVFR1: FP fused MAC bits Position */
+#define FPU_MVFR1_FP_fused_MAC_Msk (0xFUL << FPU_MVFR1_FP_fused_MAC_Pos) /*!< MVFR1: FP fused MAC bits Mask */
+
+#define FPU_MVFR1_FP_HPFP_Pos 24U /*!< MVFR1: FP HPFP bits Position */
+#define FPU_MVFR1_FP_HPFP_Msk (0xFUL << FPU_MVFR1_FP_HPFP_Pos) /*!< MVFR1: FP HPFP bits Mask */
+
+#define FPU_MVFR1_D_NaN_mode_Pos 4U /*!< MVFR1: D_NaN mode bits Position */
+#define FPU_MVFR1_D_NaN_mode_Msk (0xFUL << FPU_MVFR1_D_NaN_mode_Pos) /*!< MVFR1: D_NaN mode bits Mask */
+
+#define FPU_MVFR1_FtZ_mode_Pos 0U /*!< MVFR1: FtZ mode bits Position */
+#define FPU_MVFR1_FtZ_mode_Msk (0xFUL /*<< FPU_MVFR1_FtZ_mode_Pos*/) /*!< MVFR1: FtZ mode bits Mask */
+
+/*@} end of group CMSIS_FPU */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_CoreDebug Core Debug Registers (CoreDebug)
+ \brief Type definitions for the Core Debug Registers
+ @{
+ */
+
+/**
+ \brief Structure type to access the Core Debug Register (CoreDebug).
+ */
+typedef struct
+{
+ __IOM uint32_t DHCSR; /*!< Offset: 0x000 (R/W) Debug Halting Control and Status Register */
+ __OM uint32_t DCRSR; /*!< Offset: 0x004 ( /W) Debug Core Register Selector Register */
+ __IOM uint32_t DCRDR; /*!< Offset: 0x008 (R/W) Debug Core Register Data Register */
+ __IOM uint32_t DEMCR; /*!< Offset: 0x00C (R/W) Debug Exception and Monitor Control Register */
+ uint32_t RESERVED4[1U];
+ __IOM uint32_t DAUTHCTRL; /*!< Offset: 0x014 (R/W) Debug Authentication Control Register */
+ __IOM uint32_t DSCSR; /*!< Offset: 0x018 (R/W) Debug Security Control and Status Register */
+} CoreDebug_Type;
+
+/* Debug Halting Control and Status Register Definitions */
+#define CoreDebug_DHCSR_DBGKEY_Pos 16U /*!< CoreDebug DHCSR: DBGKEY Position */
+#define CoreDebug_DHCSR_DBGKEY_Msk (0xFFFFUL << CoreDebug_DHCSR_DBGKEY_Pos) /*!< CoreDebug DHCSR: DBGKEY Mask */
+
+#define CoreDebug_DHCSR_S_RESTART_ST_Pos 26U /*!< CoreDebug DHCSR: S_RESTART_ST Position */
+#define CoreDebug_DHCSR_S_RESTART_ST_Msk (1UL << CoreDebug_DHCSR_S_RESTART_ST_Pos) /*!< CoreDebug DHCSR: S_RESTART_ST Mask */
+
+#define CoreDebug_DHCSR_S_RESET_ST_Pos 25U /*!< CoreDebug DHCSR: S_RESET_ST Position */
+#define CoreDebug_DHCSR_S_RESET_ST_Msk (1UL << CoreDebug_DHCSR_S_RESET_ST_Pos) /*!< CoreDebug DHCSR: S_RESET_ST Mask */
+
+#define CoreDebug_DHCSR_S_RETIRE_ST_Pos 24U /*!< CoreDebug DHCSR: S_RETIRE_ST Position */
+#define CoreDebug_DHCSR_S_RETIRE_ST_Msk (1UL << CoreDebug_DHCSR_S_RETIRE_ST_Pos) /*!< CoreDebug DHCSR: S_RETIRE_ST Mask */
+
+#define CoreDebug_DHCSR_S_LOCKUP_Pos 19U /*!< CoreDebug DHCSR: S_LOCKUP Position */
+#define CoreDebug_DHCSR_S_LOCKUP_Msk (1UL << CoreDebug_DHCSR_S_LOCKUP_Pos) /*!< CoreDebug DHCSR: S_LOCKUP Mask */
+
+#define CoreDebug_DHCSR_S_SLEEP_Pos 18U /*!< CoreDebug DHCSR: S_SLEEP Position */
+#define CoreDebug_DHCSR_S_SLEEP_Msk (1UL << CoreDebug_DHCSR_S_SLEEP_Pos) /*!< CoreDebug DHCSR: S_SLEEP Mask */
+
+#define CoreDebug_DHCSR_S_HALT_Pos 17U /*!< CoreDebug DHCSR: S_HALT Position */
+#define CoreDebug_DHCSR_S_HALT_Msk (1UL << CoreDebug_DHCSR_S_HALT_Pos) /*!< CoreDebug DHCSR: S_HALT Mask */
+
+#define CoreDebug_DHCSR_S_REGRDY_Pos 16U /*!< CoreDebug DHCSR: S_REGRDY Position */
+#define CoreDebug_DHCSR_S_REGRDY_Msk (1UL << CoreDebug_DHCSR_S_REGRDY_Pos) /*!< CoreDebug DHCSR: S_REGRDY Mask */
+
+#define CoreDebug_DHCSR_C_SNAPSTALL_Pos 5U /*!< CoreDebug DHCSR: C_SNAPSTALL Position */
+#define CoreDebug_DHCSR_C_SNAPSTALL_Msk (1UL << CoreDebug_DHCSR_C_SNAPSTALL_Pos) /*!< CoreDebug DHCSR: C_SNAPSTALL Mask */
+
+#define CoreDebug_DHCSR_C_MASKINTS_Pos 3U /*!< CoreDebug DHCSR: C_MASKINTS Position */
+#define CoreDebug_DHCSR_C_MASKINTS_Msk (1UL << CoreDebug_DHCSR_C_MASKINTS_Pos) /*!< CoreDebug DHCSR: C_MASKINTS Mask */
+
+#define CoreDebug_DHCSR_C_STEP_Pos 2U /*!< CoreDebug DHCSR: C_STEP Position */
+#define CoreDebug_DHCSR_C_STEP_Msk (1UL << CoreDebug_DHCSR_C_STEP_Pos) /*!< CoreDebug DHCSR: C_STEP Mask */
+
+#define CoreDebug_DHCSR_C_HALT_Pos 1U /*!< CoreDebug DHCSR: C_HALT Position */
+#define CoreDebug_DHCSR_C_HALT_Msk (1UL << CoreDebug_DHCSR_C_HALT_Pos) /*!< CoreDebug DHCSR: C_HALT Mask */
+
+#define CoreDebug_DHCSR_C_DEBUGEN_Pos 0U /*!< CoreDebug DHCSR: C_DEBUGEN Position */
+#define CoreDebug_DHCSR_C_DEBUGEN_Msk (1UL /*<< CoreDebug_DHCSR_C_DEBUGEN_Pos*/) /*!< CoreDebug DHCSR: C_DEBUGEN Mask */
+
+/* Debug Core Register Selector Register Definitions */
+#define CoreDebug_DCRSR_REGWnR_Pos 16U /*!< CoreDebug DCRSR: REGWnR Position */
+#define CoreDebug_DCRSR_REGWnR_Msk (1UL << CoreDebug_DCRSR_REGWnR_Pos) /*!< CoreDebug DCRSR: REGWnR Mask */
+
+#define CoreDebug_DCRSR_REGSEL_Pos 0U /*!< CoreDebug DCRSR: REGSEL Position */
+#define CoreDebug_DCRSR_REGSEL_Msk (0x1FUL /*<< CoreDebug_DCRSR_REGSEL_Pos*/) /*!< CoreDebug DCRSR: REGSEL Mask */
+
+/* Debug Exception and Monitor Control Register Definitions */
+#define CoreDebug_DEMCR_TRCENA_Pos 24U /*!< CoreDebug DEMCR: TRCENA Position */
+#define CoreDebug_DEMCR_TRCENA_Msk (1UL << CoreDebug_DEMCR_TRCENA_Pos) /*!< CoreDebug DEMCR: TRCENA Mask */
+
+#define CoreDebug_DEMCR_MON_REQ_Pos 19U /*!< CoreDebug DEMCR: MON_REQ Position */
+#define CoreDebug_DEMCR_MON_REQ_Msk (1UL << CoreDebug_DEMCR_MON_REQ_Pos) /*!< CoreDebug DEMCR: MON_REQ Mask */
+
+#define CoreDebug_DEMCR_MON_STEP_Pos 18U /*!< CoreDebug DEMCR: MON_STEP Position */
+#define CoreDebug_DEMCR_MON_STEP_Msk (1UL << CoreDebug_DEMCR_MON_STEP_Pos) /*!< CoreDebug DEMCR: MON_STEP Mask */
+
+#define CoreDebug_DEMCR_MON_PEND_Pos 17U /*!< CoreDebug DEMCR: MON_PEND Position */
+#define CoreDebug_DEMCR_MON_PEND_Msk (1UL << CoreDebug_DEMCR_MON_PEND_Pos) /*!< CoreDebug DEMCR: MON_PEND Mask */
+
+#define CoreDebug_DEMCR_MON_EN_Pos 16U /*!< CoreDebug DEMCR: MON_EN Position */
+#define CoreDebug_DEMCR_MON_EN_Msk (1UL << CoreDebug_DEMCR_MON_EN_Pos) /*!< CoreDebug DEMCR: MON_EN Mask */
+
+#define CoreDebug_DEMCR_VC_HARDERR_Pos 10U /*!< CoreDebug DEMCR: VC_HARDERR Position */
+#define CoreDebug_DEMCR_VC_HARDERR_Msk (1UL << CoreDebug_DEMCR_VC_HARDERR_Pos) /*!< CoreDebug DEMCR: VC_HARDERR Mask */
+
+#define CoreDebug_DEMCR_VC_INTERR_Pos 9U /*!< CoreDebug DEMCR: VC_INTERR Position */
+#define CoreDebug_DEMCR_VC_INTERR_Msk (1UL << CoreDebug_DEMCR_VC_INTERR_Pos) /*!< CoreDebug DEMCR: VC_INTERR Mask */
+
+#define CoreDebug_DEMCR_VC_BUSERR_Pos 8U /*!< CoreDebug DEMCR: VC_BUSERR Position */
+#define CoreDebug_DEMCR_VC_BUSERR_Msk (1UL << CoreDebug_DEMCR_VC_BUSERR_Pos) /*!< CoreDebug DEMCR: VC_BUSERR Mask */
+
+#define CoreDebug_DEMCR_VC_STATERR_Pos 7U /*!< CoreDebug DEMCR: VC_STATERR Position */
+#define CoreDebug_DEMCR_VC_STATERR_Msk (1UL << CoreDebug_DEMCR_VC_STATERR_Pos) /*!< CoreDebug DEMCR: VC_STATERR Mask */
+
+#define CoreDebug_DEMCR_VC_CHKERR_Pos 6U /*!< CoreDebug DEMCR: VC_CHKERR Position */
+#define CoreDebug_DEMCR_VC_CHKERR_Msk (1UL << CoreDebug_DEMCR_VC_CHKERR_Pos) /*!< CoreDebug DEMCR: VC_CHKERR Mask */
+
+#define CoreDebug_DEMCR_VC_NOCPERR_Pos 5U /*!< CoreDebug DEMCR: VC_NOCPERR Position */
+#define CoreDebug_DEMCR_VC_NOCPERR_Msk (1UL << CoreDebug_DEMCR_VC_NOCPERR_Pos) /*!< CoreDebug DEMCR: VC_NOCPERR Mask */
+
+#define CoreDebug_DEMCR_VC_MMERR_Pos 4U /*!< CoreDebug DEMCR: VC_MMERR Position */
+#define CoreDebug_DEMCR_VC_MMERR_Msk (1UL << CoreDebug_DEMCR_VC_MMERR_Pos) /*!< CoreDebug DEMCR: VC_MMERR Mask */
+
+#define CoreDebug_DEMCR_VC_CORERESET_Pos 0U /*!< CoreDebug DEMCR: VC_CORERESET Position */
+#define CoreDebug_DEMCR_VC_CORERESET_Msk (1UL /*<< CoreDebug_DEMCR_VC_CORERESET_Pos*/) /*!< CoreDebug DEMCR: VC_CORERESET Mask */
+
+/* Debug Authentication Control Register Definitions */
+#define CoreDebug_DAUTHCTRL_INTSPNIDEN_Pos 3U /*!< CoreDebug DAUTHCTRL: INTSPNIDEN, Position */
+#define CoreDebug_DAUTHCTRL_INTSPNIDEN_Msk (1UL << CoreDebug_DAUTHCTRL_INTSPNIDEN_Pos) /*!< CoreDebug DAUTHCTRL: INTSPNIDEN, Mask */
+
+#define CoreDebug_DAUTHCTRL_SPNIDENSEL_Pos 2U /*!< CoreDebug DAUTHCTRL: SPNIDENSEL Position */
+#define CoreDebug_DAUTHCTRL_SPNIDENSEL_Msk (1UL << CoreDebug_DAUTHCTRL_SPNIDENSEL_Pos) /*!< CoreDebug DAUTHCTRL: SPNIDENSEL Mask */
+
+#define CoreDebug_DAUTHCTRL_INTSPIDEN_Pos 1U /*!< CoreDebug DAUTHCTRL: INTSPIDEN Position */
+#define CoreDebug_DAUTHCTRL_INTSPIDEN_Msk (1UL << CoreDebug_DAUTHCTRL_INTSPIDEN_Pos) /*!< CoreDebug DAUTHCTRL: INTSPIDEN Mask */
+
+#define CoreDebug_DAUTHCTRL_SPIDENSEL_Pos 0U /*!< CoreDebug DAUTHCTRL: SPIDENSEL Position */
+#define CoreDebug_DAUTHCTRL_SPIDENSEL_Msk (1UL /*<< CoreDebug_DAUTHCTRL_SPIDENSEL_Pos*/) /*!< CoreDebug DAUTHCTRL: SPIDENSEL Mask */
+
+/* Debug Security Control and Status Register Definitions */
+#define CoreDebug_DSCSR_CDS_Pos 16U /*!< CoreDebug DSCSR: CDS Position */
+#define CoreDebug_DSCSR_CDS_Msk (1UL << CoreDebug_DSCSR_CDS_Pos) /*!< CoreDebug DSCSR: CDS Mask */
+
+#define CoreDebug_DSCSR_SBRSEL_Pos 1U /*!< CoreDebug DSCSR: SBRSEL Position */
+#define CoreDebug_DSCSR_SBRSEL_Msk (1UL << CoreDebug_DSCSR_SBRSEL_Pos) /*!< CoreDebug DSCSR: SBRSEL Mask */
+
+#define CoreDebug_DSCSR_SBRSELEN_Pos 0U /*!< CoreDebug DSCSR: SBRSELEN Position */
+#define CoreDebug_DSCSR_SBRSELEN_Msk (1UL /*<< CoreDebug_DSCSR_SBRSELEN_Pos*/) /*!< CoreDebug DSCSR: SBRSELEN Mask */
+
+/*@} end of group CMSIS_CoreDebug */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_core_bitfield Core register bit field macros
+ \brief Macros for use with bit field definitions (xxx_Pos, xxx_Msk).
+ @{
+ */
+
+/**
+ \brief Mask and shift a bit field value for use in a register bit range.
+ \param[in] field Name of the register bit field.
+ \param[in] value Value of the bit field. This parameter is interpreted as an uint32_t type.
+ \return Masked and shifted value.
+*/
+#define _VAL2FLD(field, value) (((uint32_t)(value) << field ## _Pos) & field ## _Msk)
+
+/**
+ \brief Mask and shift a register value to extract a bit filed value.
+ \param[in] field Name of the register bit field.
+ \param[in] value Value of register. This parameter is interpreted as an uint32_t type.
+ \return Masked and shifted bit field value.
+*/
+#define _FLD2VAL(field, value) (((uint32_t)(value) & field ## _Msk) >> field ## _Pos)
+
+/*@} end of group CMSIS_core_bitfield */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_core_base Core Definitions
+ \brief Definitions for base addresses, unions, and structures.
+ @{
+ */
+
+/* Memory mapping of Core Hardware */
+ #define SCS_BASE (0xE000E000UL) /*!< System Control Space Base Address */
+ #define ITM_BASE (0xE0000000UL) /*!< ITM Base Address */
+ #define DWT_BASE (0xE0001000UL) /*!< DWT Base Address */
+ #define TPI_BASE (0xE0040000UL) /*!< TPI Base Address */
+ #define CoreDebug_BASE (0xE000EDF0UL) /*!< Core Debug Base Address */
+ #define SysTick_BASE (SCS_BASE + 0x0010UL) /*!< SysTick Base Address */
+ #define NVIC_BASE (SCS_BASE + 0x0100UL) /*!< NVIC Base Address */
+ #define SCB_BASE (SCS_BASE + 0x0D00UL) /*!< System Control Block Base Address */
+
+ #define SCnSCB ((SCnSCB_Type *) SCS_BASE ) /*!< System control Register not in SCB */
+ #define SCB ((SCB_Type *) SCB_BASE ) /*!< SCB configuration struct */
+ #define SysTick ((SysTick_Type *) SysTick_BASE ) /*!< SysTick configuration struct */
+ #define NVIC ((NVIC_Type *) NVIC_BASE ) /*!< NVIC configuration struct */
+ #define ITM ((ITM_Type *) ITM_BASE ) /*!< ITM configuration struct */
+ #define DWT ((DWT_Type *) DWT_BASE ) /*!< DWT configuration struct */
+ #define TPI ((TPI_Type *) TPI_BASE ) /*!< TPI configuration struct */
+ #define CoreDebug ((CoreDebug_Type *) CoreDebug_BASE ) /*!< Core Debug configuration struct */
+
+ #if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
+ #define MPU_BASE (SCS_BASE + 0x0D90UL) /*!< Memory Protection Unit */
+ #define MPU ((MPU_Type *) MPU_BASE ) /*!< Memory Protection Unit */
+ #endif
+
+ #if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U)
+ #define SAU_BASE (SCS_BASE + 0x0DD0UL) /*!< Security Attribution Unit */
+ #define SAU ((SAU_Type *) SAU_BASE ) /*!< Security Attribution Unit */
+ #endif
+
+ #define FPU_BASE (SCS_BASE + 0x0F30UL) /*!< Floating Point Unit */
+ #define FPU ((FPU_Type *) FPU_BASE ) /*!< Floating Point Unit */
+
+#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U)
+ #define SCS_BASE_NS (0xE002E000UL) /*!< System Control Space Base Address (non-secure address space) */
+ #define CoreDebug_BASE_NS (0xE002EDF0UL) /*!< Core Debug Base Address (non-secure address space) */
+ #define SysTick_BASE_NS (SCS_BASE_NS + 0x0010UL) /*!< SysTick Base Address (non-secure address space) */
+ #define NVIC_BASE_NS (SCS_BASE_NS + 0x0100UL) /*!< NVIC Base Address (non-secure address space) */
+ #define SCB_BASE_NS (SCS_BASE_NS + 0x0D00UL) /*!< System Control Block Base Address (non-secure address space) */
+
+ #define SCnSCB_NS ((SCnSCB_Type *) SCS_BASE_NS ) /*!< System control Register not in SCB(non-secure address space) */
+ #define SCB_NS ((SCB_Type *) SCB_BASE_NS ) /*!< SCB configuration struct (non-secure address space) */
+ #define SysTick_NS ((SysTick_Type *) SysTick_BASE_NS ) /*!< SysTick configuration struct (non-secure address space) */
+ #define NVIC_NS ((NVIC_Type *) NVIC_BASE_NS ) /*!< NVIC configuration struct (non-secure address space) */
+ #define CoreDebug_NS ((CoreDebug_Type *) CoreDebug_BASE_NS) /*!< Core Debug configuration struct (non-secure address space) */
+
+ #if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
+ #define MPU_BASE_NS (SCS_BASE_NS + 0x0D90UL) /*!< Memory Protection Unit (non-secure address space) */
+ #define MPU_NS ((MPU_Type *) MPU_BASE_NS ) /*!< Memory Protection Unit (non-secure address space) */
+ #endif
+
+ #define FPU_BASE_NS (SCS_BASE_NS + 0x0F30UL) /*!< Floating Point Unit (non-secure address space) */
+ #define FPU_NS ((FPU_Type *) FPU_BASE_NS ) /*!< Floating Point Unit (non-secure address space) */
+
+#endif /* defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) */
+/*@} */
+
+
+
+/*******************************************************************************
+ * Hardware Abstraction Layer
+ Core Function Interface contains:
+ - Core NVIC Functions
+ - Core SysTick Functions
+ - Core Debug Functions
+ - Core Register Access Functions
+ ******************************************************************************/
+/**
+ \defgroup CMSIS_Core_FunctionInterface Functions and Instructions Reference
+*/
+
+
+
+/* ########################## NVIC functions #################################### */
+/**
+ \ingroup CMSIS_Core_FunctionInterface
+ \defgroup CMSIS_Core_NVICFunctions NVIC Functions
+ \brief Functions that manage interrupts and exceptions via the NVIC.
+ @{
+ */
+
+#ifdef CMSIS_NVIC_VIRTUAL
+ #ifndef CMSIS_NVIC_VIRTUAL_HEADER_FILE
+ #define CMSIS_NVIC_VIRTUAL_HEADER_FILE "cmsis_nvic_virtual.h"
+ #endif
+ #include CMSIS_NVIC_VIRTUAL_HEADER_FILE
+#else
+ #define NVIC_SetPriorityGrouping __NVIC_SetPriorityGrouping
+ #define NVIC_GetPriorityGrouping __NVIC_GetPriorityGrouping
+ #define NVIC_EnableIRQ __NVIC_EnableIRQ
+ #define NVIC_GetEnableIRQ __NVIC_GetEnableIRQ
+ #define NVIC_DisableIRQ __NVIC_DisableIRQ
+ #define NVIC_GetPendingIRQ __NVIC_GetPendingIRQ
+ #define NVIC_SetPendingIRQ __NVIC_SetPendingIRQ
+ #define NVIC_ClearPendingIRQ __NVIC_ClearPendingIRQ
+ #define NVIC_GetActive __NVIC_GetActive
+ #define NVIC_SetPriority __NVIC_SetPriority
+ #define NVIC_GetPriority __NVIC_GetPriority
+ #define NVIC_SystemReset __NVIC_SystemReset
+#endif /* CMSIS_NVIC_VIRTUAL */
+
+#ifdef CMSIS_VECTAB_VIRTUAL
+ #ifndef CMSIS_VECTAB_VIRTUAL_HEADER_FILE
+ #define CMSIS_VECTAB_VIRTUAL_HEADER_FILE "cmsis_vectab_virtual.h"
+ #endif
+ #include CMSIS_VECTAB_VIRTUAL_HEADER_FILE
+#else
+ #define NVIC_SetVector __NVIC_SetVector
+ #define NVIC_GetVector __NVIC_GetVector
+#endif /* (CMSIS_VECTAB_VIRTUAL) */
+
+#define NVIC_USER_IRQ_OFFSET 16
+
+
+
+/**
+ \brief Set Priority Grouping
+ \details Sets the priority grouping field using the required unlock sequence.
+ The parameter PriorityGroup is assigned to the field SCB->AIRCR [10:8] PRIGROUP field.
+ Only values from 0..7 are used.
+ In case of a conflict between priority grouping and available
+ priority bits (__NVIC_PRIO_BITS), the smallest possible priority group is set.
+ \param [in] PriorityGroup Priority grouping field.
+ */
+__STATIC_INLINE void __NVIC_SetPriorityGrouping(uint32_t PriorityGroup)
+{
+ uint32_t reg_value;
+ uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL); /* only values 0..7 are used */
+
+ reg_value = SCB->AIRCR; /* read old register configuration */
+ reg_value &= ~((uint32_t)(SCB_AIRCR_VECTKEY_Msk | SCB_AIRCR_PRIGROUP_Msk)); /* clear bits to change */
+ reg_value = (reg_value |
+ ((uint32_t)0x5FAUL << SCB_AIRCR_VECTKEY_Pos) |
+ (PriorityGroupTmp << 8U) ); /* Insert write key and priorty group */
+ SCB->AIRCR = reg_value;
+}
+
+
+/**
+ \brief Get Priority Grouping
+ \details Reads the priority grouping field from the NVIC Interrupt Controller.
+ \return Priority grouping field (SCB->AIRCR [10:8] PRIGROUP field).
+ */
+__STATIC_INLINE uint32_t __NVIC_GetPriorityGrouping(void)
+{
+ return ((uint32_t)((SCB->AIRCR & SCB_AIRCR_PRIGROUP_Msk) >> SCB_AIRCR_PRIGROUP_Pos));
+}
+
+
+/**
+ \brief Enable Interrupt
+ \details Enables a device specific interrupt in the NVIC interrupt controller.
+ \param [in] IRQn Device specific interrupt number.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ }
+}
+
+
+/**
+ \brief Get Interrupt Enable status
+ \details Returns a device specific interrupt enable status from the NVIC interrupt controller.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 Interrupt is not enabled.
+ \return 1 Interrupt is enabled.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t __NVIC_GetEnableIRQ(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return((uint32_t)(((NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+
+
+/**
+ \brief Disable Interrupt
+ \details Disables a device specific interrupt in the NVIC interrupt controller.
+ \param [in] IRQn Device specific interrupt number.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE void __NVIC_DisableIRQ(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC->ICER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ __DSB();
+ __ISB();
+ }
+}
+
+
+/**
+ \brief Get Pending Interrupt
+ \details Reads the NVIC pending register and returns the pending bit for the specified device specific interrupt.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 Interrupt status is not pending.
+ \return 1 Interrupt status is pending.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t __NVIC_GetPendingIRQ(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return((uint32_t)(((NVIC->ISPR[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+
+
+/**
+ \brief Set Pending Interrupt
+ \details Sets the pending bit of a device specific interrupt in the NVIC pending register.
+ \param [in] IRQn Device specific interrupt number.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE void __NVIC_SetPendingIRQ(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC->ISPR[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ }
+}
+
+
+/**
+ \brief Clear Pending Interrupt
+ \details Clears the pending bit of a device specific interrupt in the NVIC pending register.
+ \param [in] IRQn Device specific interrupt number.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE void __NVIC_ClearPendingIRQ(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC->ICPR[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ }
+}
+
+
+/**
+ \brief Get Active Interrupt
+ \details Reads the active register in the NVIC and returns the active bit for the device specific interrupt.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 Interrupt status is not active.
+ \return 1 Interrupt status is active.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t __NVIC_GetActive(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return((uint32_t)(((NVIC->IABR[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+
+
+#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U)
+/**
+ \brief Get Interrupt Target State
+ \details Reads the interrupt target field in the NVIC and returns the interrupt target bit for the device specific interrupt.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 if interrupt is assigned to Secure
+ \return 1 if interrupt is assigned to Non Secure
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t NVIC_GetTargetState(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return((uint32_t)(((NVIC->ITNS[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+
+
+/**
+ \brief Set Interrupt Target State
+ \details Sets the interrupt target field in the NVIC and returns the interrupt target bit for the device specific interrupt.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 if interrupt is assigned to Secure
+ 1 if interrupt is assigned to Non Secure
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t NVIC_SetTargetState(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC->ITNS[(((uint32_t)IRQn) >> 5UL)] |= ((uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL)));
+ return((uint32_t)(((NVIC->ITNS[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+
+
+/**
+ \brief Clear Interrupt Target State
+ \details Clears the interrupt target field in the NVIC and returns the interrupt target bit for the device specific interrupt.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 if interrupt is assigned to Secure
+ 1 if interrupt is assigned to Non Secure
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t NVIC_ClearTargetState(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC->ITNS[(((uint32_t)IRQn) >> 5UL)] &= ~((uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL)));
+ return((uint32_t)(((NVIC->ITNS[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+#endif /* defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) */
+
+
+/**
+ \brief Set Interrupt Priority
+ \details Sets the priority of a device specific interrupt or a processor exception.
+ The interrupt number can be positive to specify a device specific interrupt,
+ or negative to specify a processor exception.
+ \param [in] IRQn Interrupt number.
+ \param [in] priority Priority to set.
+ \note The priority cannot be set for every processor exception.
+ */
+__STATIC_INLINE void __NVIC_SetPriority(IRQn_Type IRQn, uint32_t priority)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC->IPR[((uint32_t)IRQn)] = (uint8_t)((priority << (8U - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL);
+ }
+ else
+ {
+ SCB->SHPR[(((uint32_t)IRQn) & 0xFUL)-4UL] = (uint8_t)((priority << (8U - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL);
+ }
+}
+
+
+/**
+ \brief Get Interrupt Priority
+ \details Reads the priority of a device specific interrupt or a processor exception.
+ The interrupt number can be positive to specify a device specific interrupt,
+ or negative to specify a processor exception.
+ \param [in] IRQn Interrupt number.
+ \return Interrupt Priority.
+ Value is aligned automatically to the implemented priority bits of the microcontroller.
+ */
+__STATIC_INLINE uint32_t __NVIC_GetPriority(IRQn_Type IRQn)
+{
+
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return(((uint32_t)NVIC->IPR[((uint32_t)IRQn)] >> (8U - __NVIC_PRIO_BITS)));
+ }
+ else
+ {
+ return(((uint32_t)SCB->SHPR[(((uint32_t)IRQn) & 0xFUL)-4UL] >> (8U - __NVIC_PRIO_BITS)));
+ }
+}
+
+
+/**
+ \brief Encode Priority
+ \details Encodes the priority for an interrupt with the given priority group,
+ preemptive priority value, and subpriority value.
+ In case of a conflict between priority grouping and available
+ priority bits (__NVIC_PRIO_BITS), the smallest possible priority group is set.
+ \param [in] PriorityGroup Used priority group.
+ \param [in] PreemptPriority Preemptive priority value (starting from 0).
+ \param [in] SubPriority Subpriority value (starting from 0).
+ \return Encoded priority. Value can be used in the function \ref NVIC_SetPriority().
+ */
+__STATIC_INLINE uint32_t NVIC_EncodePriority (uint32_t PriorityGroup, uint32_t PreemptPriority, uint32_t SubPriority)
+{
+ uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL); /* only values 0..7 are used */
+ uint32_t PreemptPriorityBits;
+ uint32_t SubPriorityBits;
+
+ PreemptPriorityBits = ((7UL - PriorityGroupTmp) > (uint32_t)(__NVIC_PRIO_BITS)) ? (uint32_t)(__NVIC_PRIO_BITS) : (uint32_t)(7UL - PriorityGroupTmp);
+ SubPriorityBits = ((PriorityGroupTmp + (uint32_t)(__NVIC_PRIO_BITS)) < (uint32_t)7UL) ? (uint32_t)0UL : (uint32_t)((PriorityGroupTmp - 7UL) + (uint32_t)(__NVIC_PRIO_BITS));
+
+ return (
+ ((PreemptPriority & (uint32_t)((1UL << (PreemptPriorityBits)) - 1UL)) << SubPriorityBits) |
+ ((SubPriority & (uint32_t)((1UL << (SubPriorityBits )) - 1UL)))
+ );
+}
+
+
+/**
+ \brief Decode Priority
+ \details Decodes an interrupt priority value with a given priority group to
+ preemptive priority value and subpriority value.
+ In case of a conflict between priority grouping and available
+ priority bits (__NVIC_PRIO_BITS) the smallest possible priority group is set.
+ \param [in] Priority Priority value, which can be retrieved with the function \ref NVIC_GetPriority().
+ \param [in] PriorityGroup Used priority group.
+ \param [out] pPreemptPriority Preemptive priority value (starting from 0).
+ \param [out] pSubPriority Subpriority value (starting from 0).
+ */
+__STATIC_INLINE void NVIC_DecodePriority (uint32_t Priority, uint32_t PriorityGroup, uint32_t* const pPreemptPriority, uint32_t* const pSubPriority)
+{
+ uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL); /* only values 0..7 are used */
+ uint32_t PreemptPriorityBits;
+ uint32_t SubPriorityBits;
+
+ PreemptPriorityBits = ((7UL - PriorityGroupTmp) > (uint32_t)(__NVIC_PRIO_BITS)) ? (uint32_t)(__NVIC_PRIO_BITS) : (uint32_t)(7UL - PriorityGroupTmp);
+ SubPriorityBits = ((PriorityGroupTmp + (uint32_t)(__NVIC_PRIO_BITS)) < (uint32_t)7UL) ? (uint32_t)0UL : (uint32_t)((PriorityGroupTmp - 7UL) + (uint32_t)(__NVIC_PRIO_BITS));
+
+ *pPreemptPriority = (Priority >> SubPriorityBits) & (uint32_t)((1UL << (PreemptPriorityBits)) - 1UL);
+ *pSubPriority = (Priority ) & (uint32_t)((1UL << (SubPriorityBits )) - 1UL);
+}
+
+
+/**
+ \brief Set Interrupt Vector
+ \details Sets an interrupt vector in SRAM based interrupt vector table.
+ The interrupt number can be positive to specify a device specific interrupt,
+ or negative to specify a processor exception.
+ VTOR must been relocated to SRAM before.
+ \param [in] IRQn Interrupt number
+ \param [in] vector Address of interrupt handler function
+ */
+__STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
+{
+ uint32_t *vectors = (uint32_t *)SCB->VTOR;
+ vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;
+ __DSB();
+}
+
+
+/**
+ \brief Get Interrupt Vector
+ \details Reads an interrupt vector from interrupt vector table.
+ The interrupt number can be positive to specify a device specific interrupt,
+ or negative to specify a processor exception.
+ \param [in] IRQn Interrupt number.
+ \return Address of interrupt handler function
+ */
+__STATIC_INLINE uint32_t __NVIC_GetVector(IRQn_Type IRQn)
+{
+ uint32_t *vectors = (uint32_t *)SCB->VTOR;
+ return vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET];
+}
+
+
+/**
+ \brief System Reset
+ \details Initiates a system reset request to reset the MCU.
+ */
+__NO_RETURN __STATIC_INLINE void __NVIC_SystemReset(void)
+{
+ __DSB(); /* Ensure all outstanding memory accesses included
+ buffered write are completed before reset */
+ SCB->AIRCR = (uint32_t)((0x5FAUL << SCB_AIRCR_VECTKEY_Pos) |
+ (SCB->AIRCR & SCB_AIRCR_PRIGROUP_Msk) |
+ SCB_AIRCR_SYSRESETREQ_Msk ); /* Keep priority group unchanged */
+ __DSB(); /* Ensure completion of memory access */
+
+ for(;;) /* wait until reset */
+ {
+ __NOP();
+ }
+}
+
+#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U)
+/**
+ \brief Set Priority Grouping (non-secure)
+ \details Sets the non-secure priority grouping field when in secure state using the required unlock sequence.
+ The parameter PriorityGroup is assigned to the field SCB->AIRCR [10:8] PRIGROUP field.
+ Only values from 0..7 are used.
+ In case of a conflict between priority grouping and available
+ priority bits (__NVIC_PRIO_BITS), the smallest possible priority group is set.
+ \param [in] PriorityGroup Priority grouping field.
+ */
+__STATIC_INLINE void TZ_NVIC_SetPriorityGrouping_NS(uint32_t PriorityGroup)
+{
+ uint32_t reg_value;
+ uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL); /* only values 0..7 are used */
+
+ reg_value = SCB_NS->AIRCR; /* read old register configuration */
+ reg_value &= ~((uint32_t)(SCB_AIRCR_VECTKEY_Msk | SCB_AIRCR_PRIGROUP_Msk)); /* clear bits to change */
+ reg_value = (reg_value |
+ ((uint32_t)0x5FAUL << SCB_AIRCR_VECTKEY_Pos) |
+ (PriorityGroupTmp << 8U) ); /* Insert write key and priorty group */
+ SCB_NS->AIRCR = reg_value;
+}
+
+
+/**
+ \brief Get Priority Grouping (non-secure)
+ \details Reads the priority grouping field from the non-secure NVIC when in secure state.
+ \return Priority grouping field (SCB->AIRCR [10:8] PRIGROUP field).
+ */
+__STATIC_INLINE uint32_t TZ_NVIC_GetPriorityGrouping_NS(void)
+{
+ return ((uint32_t)((SCB_NS->AIRCR & SCB_AIRCR_PRIGROUP_Msk) >> SCB_AIRCR_PRIGROUP_Pos));
+}
+
+
+/**
+ \brief Enable Interrupt (non-secure)
+ \details Enables a device specific interrupt in the non-secure NVIC interrupt controller when in secure state.
+ \param [in] IRQn Device specific interrupt number.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE void TZ_NVIC_EnableIRQ_NS(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC_NS->ISER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ }
+}
+
+
+/**
+ \brief Get Interrupt Enable status (non-secure)
+ \details Returns a device specific interrupt enable status from the non-secure NVIC interrupt controller when in secure state.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 Interrupt is not enabled.
+ \return 1 Interrupt is enabled.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t TZ_NVIC_GetEnableIRQ_NS(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return((uint32_t)(((NVIC_NS->ISER[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+
+
+/**
+ \brief Disable Interrupt (non-secure)
+ \details Disables a device specific interrupt in the non-secure NVIC interrupt controller when in secure state.
+ \param [in] IRQn Device specific interrupt number.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE void TZ_NVIC_DisableIRQ_NS(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC_NS->ICER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ }
+}
+
+
+/**
+ \brief Get Pending Interrupt (non-secure)
+ \details Reads the NVIC pending register in the non-secure NVIC when in secure state and returns the pending bit for the specified device specific interrupt.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 Interrupt status is not pending.
+ \return 1 Interrupt status is pending.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t TZ_NVIC_GetPendingIRQ_NS(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return((uint32_t)(((NVIC_NS->ISPR[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+
+
+/**
+ \brief Set Pending Interrupt (non-secure)
+ \details Sets the pending bit of a device specific interrupt in the non-secure NVIC pending register when in secure state.
+ \param [in] IRQn Device specific interrupt number.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE void TZ_NVIC_SetPendingIRQ_NS(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC_NS->ISPR[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ }
+}
+
+
+/**
+ \brief Clear Pending Interrupt (non-secure)
+ \details Clears the pending bit of a device specific interrupt in the non-secure NVIC pending register when in secure state.
+ \param [in] IRQn Device specific interrupt number.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE void TZ_NVIC_ClearPendingIRQ_NS(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC_NS->ICPR[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ }
+}
+
+
+/**
+ \brief Get Active Interrupt (non-secure)
+ \details Reads the active register in non-secure NVIC when in secure state and returns the active bit for the device specific interrupt.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 Interrupt status is not active.
+ \return 1 Interrupt status is active.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t TZ_NVIC_GetActive_NS(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return((uint32_t)(((NVIC_NS->IABR[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+
+
+/**
+ \brief Set Interrupt Priority (non-secure)
+ \details Sets the priority of a non-secure device specific interrupt or a non-secure processor exception when in secure state.
+ The interrupt number can be positive to specify a device specific interrupt,
+ or negative to specify a processor exception.
+ \param [in] IRQn Interrupt number.
+ \param [in] priority Priority to set.
+ \note The priority cannot be set for every non-secure processor exception.
+ */
+__STATIC_INLINE void TZ_NVIC_SetPriority_NS(IRQn_Type IRQn, uint32_t priority)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC_NS->IPR[((uint32_t)IRQn)] = (uint8_t)((priority << (8U - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL);
+ }
+ else
+ {
+ SCB_NS->SHPR[(((uint32_t)IRQn) & 0xFUL)-4UL] = (uint8_t)((priority << (8U - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL);
+ }
+}
+
+
+/**
+ \brief Get Interrupt Priority (non-secure)
+ \details Reads the priority of a non-secure device specific interrupt or a non-secure processor exception when in secure state.
+ The interrupt number can be positive to specify a device specific interrupt,
+ or negative to specify a processor exception.
+ \param [in] IRQn Interrupt number.
+ \return Interrupt Priority. Value is aligned automatically to the implemented priority bits of the microcontroller.
+ */
+__STATIC_INLINE uint32_t TZ_NVIC_GetPriority_NS(IRQn_Type IRQn)
+{
+
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return(((uint32_t)NVIC_NS->IPR[((uint32_t)IRQn)] >> (8U - __NVIC_PRIO_BITS)));
+ }
+ else
+ {
+ return(((uint32_t)SCB_NS->SHPR[(((uint32_t)IRQn) & 0xFUL)-4UL] >> (8U - __NVIC_PRIO_BITS)));
+ }
+}
+#endif /* defined (__ARM_FEATURE_CMSE) &&(__ARM_FEATURE_CMSE == 3U) */
+
+/*@} end of CMSIS_Core_NVICFunctions */
+
+/* ########################## MPU functions #################################### */
+
+#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
+
+#include "mpu_armv8.h"
+
+#endif
+
+/* ########################## FPU functions #################################### */
+/**
+ \ingroup CMSIS_Core_FunctionInterface
+ \defgroup CMSIS_Core_FpuFunctions FPU Functions
+ \brief Function that provides FPU type.
+ @{
+ */
+
+/**
+ \brief get FPU type
+ \details returns the FPU type
+ \returns
+ - \b 0: No FPU
+ - \b 1: Single precision FPU
+ - \b 2: Double + Single precision FPU
+ */
+__STATIC_INLINE uint32_t SCB_GetFPUType(void)
+{
+ uint32_t mvfr0;
+
+ mvfr0 = FPU->MVFR0;
+ if ((mvfr0 & (FPU_MVFR0_Single_precision_Msk | FPU_MVFR0_Double_precision_Msk)) == 0x220U)
+ {
+ return 2U; /* Double + Single precision FPU */
+ }
+ else if ((mvfr0 & (FPU_MVFR0_Single_precision_Msk | FPU_MVFR0_Double_precision_Msk)) == 0x020U)
+ {
+ return 1U; /* Single precision FPU */
+ }
+ else
+ {
+ return 0U; /* No FPU */
+ }
+}
+
+
+/*@} end of CMSIS_Core_FpuFunctions */
+
+
+
+/* ########################## SAU functions #################################### */
+/**
+ \ingroup CMSIS_Core_FunctionInterface
+ \defgroup CMSIS_Core_SAUFunctions SAU Functions
+ \brief Functions that configure the SAU.
+ @{
+ */
+
+#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U)
+
+/**
+ \brief Enable SAU
+ \details Enables the Security Attribution Unit (SAU).
+ */
+__STATIC_INLINE void TZ_SAU_Enable(void)
+{
+ SAU->CTRL |= (SAU_CTRL_ENABLE_Msk);
+}
+
+
+
+/**
+ \brief Disable SAU
+ \details Disables the Security Attribution Unit (SAU).
+ */
+__STATIC_INLINE void TZ_SAU_Disable(void)
+{
+ SAU->CTRL &= ~(SAU_CTRL_ENABLE_Msk);
+}
+
+#endif /* defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) */
+
+/*@} end of CMSIS_Core_SAUFunctions */
+
+
+
+
+/* ################################## SysTick function ############################################ */
+/**
+ \ingroup CMSIS_Core_FunctionInterface
+ \defgroup CMSIS_Core_SysTickFunctions SysTick Functions
+ \brief Functions that configure the System.
+ @{
+ */
+
+#if defined (__Vendor_SysTickConfig) && (__Vendor_SysTickConfig == 0U)
+
+/**
+ \brief System Tick Configuration
+ \details Initializes the System Timer and its interrupt, and starts the System Tick Timer.
+ Counter is in free running mode to generate periodic interrupts.
+ \param [in] ticks Number of ticks between two interrupts.
+ \return 0 Function succeeded.
+ \return 1 Function failed.
+ \note When the variable __Vendor_SysTickConfig is set to 1, then the
+ function SysTick_Config is not included. In this case, the file device.h
+ must contain a vendor-specific implementation of this function.
+ */
+__STATIC_INLINE uint32_t SysTick_Config(uint32_t ticks)
+{
+ if ((ticks - 1UL) > SysTick_LOAD_RELOAD_Msk)
+ {
+ return (1UL); /* Reload value impossible */
+ }
+
+ SysTick->LOAD = (uint32_t)(ticks - 1UL); /* set reload register */
+ NVIC_SetPriority (SysTick_IRQn, (1UL << __NVIC_PRIO_BITS) - 1UL); /* set Priority for Systick Interrupt */
+ SysTick->VAL = 0UL; /* Load the SysTick Counter Value */
+ SysTick->CTRL = SysTick_CTRL_CLKSOURCE_Msk |
+ SysTick_CTRL_TICKINT_Msk |
+ SysTick_CTRL_ENABLE_Msk; /* Enable SysTick IRQ and SysTick Timer */
+ return (0UL); /* Function successful */
+}
+
+#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U)
+/**
+ \brief System Tick Configuration (non-secure)
+ \details Initializes the non-secure System Timer and its interrupt when in secure state, and starts the System Tick Timer.
+ Counter is in free running mode to generate periodic interrupts.
+ \param [in] ticks Number of ticks between two interrupts.
+ \return 0 Function succeeded.
+ \return 1 Function failed.
+ \note When the variable __Vendor_SysTickConfig is set to 1, then the
+ function TZ_SysTick_Config_NS is not included. In this case, the file device.h
+ must contain a vendor-specific implementation of this function.
+
+ */
+__STATIC_INLINE uint32_t TZ_SysTick_Config_NS(uint32_t ticks)
+{
+ if ((ticks - 1UL) > SysTick_LOAD_RELOAD_Msk)
+ {
+ return (1UL); /* Reload value impossible */
+ }
+
+ SysTick_NS->LOAD = (uint32_t)(ticks - 1UL); /* set reload register */
+ TZ_NVIC_SetPriority_NS (SysTick_IRQn, (1UL << __NVIC_PRIO_BITS) - 1UL); /* set Priority for Systick Interrupt */
+ SysTick_NS->VAL = 0UL; /* Load the SysTick Counter Value */
+ SysTick_NS->CTRL = SysTick_CTRL_CLKSOURCE_Msk |
+ SysTick_CTRL_TICKINT_Msk |
+ SysTick_CTRL_ENABLE_Msk; /* Enable SysTick IRQ and SysTick Timer */
+ return (0UL); /* Function successful */
+}
+#endif /* defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) */
+
+#endif
+
+/*@} end of CMSIS_Core_SysTickFunctions */
+
+
+
+/* ##################################### Debug In/Output function ########################################### */
+/**
+ \ingroup CMSIS_Core_FunctionInterface
+ \defgroup CMSIS_core_DebugFunctions ITM Functions
+ \brief Functions that access the ITM debug interface.
+ @{
+ */
+
+extern volatile int32_t ITM_RxBuffer; /*!< External variable to receive characters. */
+#define ITM_RXBUFFER_EMPTY ((int32_t)0x5AA55AA5U) /*!< Value identifying \ref ITM_RxBuffer is ready for next character. */
+
+
+/**
+ \brief ITM Send Character
+ \details Transmits a character via the ITM channel 0, and
+ \li Just returns when no debugger is connected that has booked the output.
+ \li Is blocking when a debugger is connected, but the previous character sent has not been transmitted.
+ \param [in] ch Character to transmit.
+ \returns Character to transmit.
+ */
+__STATIC_INLINE uint32_t ITM_SendChar (uint32_t ch)
+{
+ if (((ITM->TCR & ITM_TCR_ITMENA_Msk) != 0UL) && /* ITM enabled */
+ ((ITM->TER & 1UL ) != 0UL) ) /* ITM Port #0 enabled */
+ {
+ while (ITM->PORT[0U].u32 == 0UL)
+ {
+ __NOP();
+ }
+ ITM->PORT[0U].u8 = (uint8_t)ch;
+ }
+ return (ch);
+}
+
+
+/**
+ \brief ITM Receive Character
+ \details Inputs a character via the external variable \ref ITM_RxBuffer.
+ \return Received character.
+ \return -1 No character pending.
+ */
+__STATIC_INLINE int32_t ITM_ReceiveChar (void)
+{
+ int32_t ch = -1; /* no character available */
+
+ if (ITM_RxBuffer != ITM_RXBUFFER_EMPTY)
+ {
+ ch = ITM_RxBuffer;
+ ITM_RxBuffer = ITM_RXBUFFER_EMPTY; /* ready for next character */
+ }
+
+ return (ch);
+}
+
+
+/**
+ \brief ITM Check Character
+ \details Checks whether a character is pending for reading in the variable \ref ITM_RxBuffer.
+ \return 0 No character available.
+ \return 1 Character available.
+ */
+__STATIC_INLINE int32_t ITM_CheckChar (void)
+{
+
+ if (ITM_RxBuffer == ITM_RXBUFFER_EMPTY)
+ {
+ return (0); /* no character available */
+ }
+ else
+ {
+ return (1); /* character available */
+ }
+}
+
+/*@} end of CMSIS_core_DebugFunctions */
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CORE_ARMV81MML_H_DEPENDANT */
+
+#endif /* __CMSIS_GENERIC */
diff --git a/Include/core_armv8mbl.h b/Include/core_armv8mbl.h
index 251e4ed..344dca5 100644
--- a/Include/core_armv8mbl.h
+++ b/Include/core_armv8mbl.h
@@ -1,8 +1,8 @@
/**************************************************************************//**
* @file core_armv8mbl.h
* @brief CMSIS Armv8-M Baseline Core Peripheral Access Layer Header File
- * @version V5.0.7
- * @date 22. June 2018
+ * @version V5.0.8
+ * @date 12. November 2018
******************************************************************************/
/*
* Copyright (c) 2009-2018 Arm Limited. All rights reserved.
@@ -81,7 +81,7 @@
#endif
#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
- #if defined __ARM_PCS_VFP
+ #if defined __ARM_FP
#error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
#endif
@@ -1223,7 +1223,7 @@ typedef struct
#define EXC_RETURN_DCRS (0x00000020UL) /* bit [5] stacking rules for called registers: 0=skipped 1=saved */
#define EXC_RETURN_FTYPE (0x00000010UL) /* bit [4] allocate stack for floating-point context: 0=done 1=skipped */
#define EXC_RETURN_MODE (0x00000008UL) /* bit [3] processor mode for return: 0=Handler mode 1=Thread mode */
-#define EXC_RETURN_SPSEL (0x00000002UL) /* bit [1] stack pointer used to restore context: 0=MSP 1=PSP */
+#define EXC_RETURN_SPSEL (0x00000004UL) /* bit [2] stack pointer used to restore context: 0=MSP 1=PSP */
#define EXC_RETURN_ES (0x00000001UL) /* bit [0] security state exception was taken to: 0=Non-secure 1=Secure */
/* Integrity Signature (from ARMv8-M Architecture Reference Manual) for exception context stacking */
@@ -1253,7 +1253,9 @@ __STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn)
{
if ((int32_t)(IRQn) >= 0)
{
+ __COMPILER_BARRIER();
NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ __COMPILER_BARRIER();
}
}
@@ -1552,6 +1554,7 @@ __STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
uint32_t *vectors = (uint32_t *)0x0U;
#endif
vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;
+ __DSB();
}
diff --git a/Include/core_armv8mml.h b/Include/core_armv8mml.h
index 3a3148e..5ddb8ae 100644
--- a/Include/core_armv8mml.h
+++ b/Include/core_armv8mml.h
@@ -1,8 +1,8 @@
/**************************************************************************//**
* @file core_armv8mml.h
* @brief CMSIS Armv8-M Mainline Core Peripheral Access Layer Header File
- * @version V5.0.7
- * @date 06. July 2018
+ * @version V5.1.0
+ * @date 12. September 2018
******************************************************************************/
/*
* Copyright (c) 2009-2018 Arm Limited. All rights reserved.
@@ -97,7 +97,7 @@
#endif
#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
- #if defined __ARM_PCS_VFP
+ #if defined __ARM_FP
#if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
#define __FPU_USED 1U
#else
@@ -538,14 +538,6 @@ typedef struct
__OM uint32_t DCCSW; /*!< Offset: 0x26C ( /W) D-Cache Clean by Set-way */
__OM uint32_t DCCIMVAC; /*!< Offset: 0x270 ( /W) D-Cache Clean and Invalidate by MVA to PoC */
__OM uint32_t DCCISW; /*!< Offset: 0x274 ( /W) D-Cache Clean and Invalidate by Set-way */
- uint32_t RESERVED7[6U];
- __IOM uint32_t ITCMCR; /*!< Offset: 0x290 (R/W) Instruction Tightly-Coupled Memory Control Register */
- __IOM uint32_t DTCMCR; /*!< Offset: 0x294 (R/W) Data Tightly-Coupled Memory Control Registers */
- __IOM uint32_t AHBPCR; /*!< Offset: 0x298 (R/W) AHBP Control Register */
- __IOM uint32_t CACR; /*!< Offset: 0x29C (R/W) L1 Cache Control Register */
- __IOM uint32_t AHBSCR; /*!< Offset: 0x2A0 (R/W) AHB Slave Control Register */
- uint32_t RESERVED8[1U];
- __IOM uint32_t ABFSR; /*!< Offset: 0x2A8 (R/W) Auxiliary Bus Fault Status Register */
} SCB_Type;
/* SCB CPUID Register Definitions */
@@ -921,78 +913,6 @@ typedef struct
#define SCB_DCCISW_SET_Pos 5U /*!< SCB DCCISW: Set Position */
#define SCB_DCCISW_SET_Msk (0x1FFUL << SCB_DCCISW_SET_Pos) /*!< SCB DCCISW: Set Mask */
-/* Instruction Tightly-Coupled Memory Control Register Definitions */
-#define SCB_ITCMCR_SZ_Pos 3U /*!< SCB ITCMCR: SZ Position */
-#define SCB_ITCMCR_SZ_Msk (0xFUL << SCB_ITCMCR_SZ_Pos) /*!< SCB ITCMCR: SZ Mask */
-
-#define SCB_ITCMCR_RETEN_Pos 2U /*!< SCB ITCMCR: RETEN Position */
-#define SCB_ITCMCR_RETEN_Msk (1UL << SCB_ITCMCR_RETEN_Pos) /*!< SCB ITCMCR: RETEN Mask */
-
-#define SCB_ITCMCR_RMW_Pos 1U /*!< SCB ITCMCR: RMW Position */
-#define SCB_ITCMCR_RMW_Msk (1UL << SCB_ITCMCR_RMW_Pos) /*!< SCB ITCMCR: RMW Mask */
-
-#define SCB_ITCMCR_EN_Pos 0U /*!< SCB ITCMCR: EN Position */
-#define SCB_ITCMCR_EN_Msk (1UL /*<< SCB_ITCMCR_EN_Pos*/) /*!< SCB ITCMCR: EN Mask */
-
-/* Data Tightly-Coupled Memory Control Register Definitions */
-#define SCB_DTCMCR_SZ_Pos 3U /*!< SCB DTCMCR: SZ Position */
-#define SCB_DTCMCR_SZ_Msk (0xFUL << SCB_DTCMCR_SZ_Pos) /*!< SCB DTCMCR: SZ Mask */
-
-#define SCB_DTCMCR_RETEN_Pos 2U /*!< SCB DTCMCR: RETEN Position */
-#define SCB_DTCMCR_RETEN_Msk (1UL << SCB_DTCMCR_RETEN_Pos) /*!< SCB DTCMCR: RETEN Mask */
-
-#define SCB_DTCMCR_RMW_Pos 1U /*!< SCB DTCMCR: RMW Position */
-#define SCB_DTCMCR_RMW_Msk (1UL << SCB_DTCMCR_RMW_Pos) /*!< SCB DTCMCR: RMW Mask */
-
-#define SCB_DTCMCR_EN_Pos 0U /*!< SCB DTCMCR: EN Position */
-#define SCB_DTCMCR_EN_Msk (1UL /*<< SCB_DTCMCR_EN_Pos*/) /*!< SCB DTCMCR: EN Mask */
-
-/* AHBP Control Register Definitions */
-#define SCB_AHBPCR_SZ_Pos 1U /*!< SCB AHBPCR: SZ Position */
-#define SCB_AHBPCR_SZ_Msk (7UL << SCB_AHBPCR_SZ_Pos) /*!< SCB AHBPCR: SZ Mask */
-
-#define SCB_AHBPCR_EN_Pos 0U /*!< SCB AHBPCR: EN Position */
-#define SCB_AHBPCR_EN_Msk (1UL /*<< SCB_AHBPCR_EN_Pos*/) /*!< SCB AHBPCR: EN Mask */
-
-/* L1 Cache Control Register Definitions */
-#define SCB_CACR_FORCEWT_Pos 2U /*!< SCB CACR: FORCEWT Position */
-#define SCB_CACR_FORCEWT_Msk (1UL << SCB_CACR_FORCEWT_Pos) /*!< SCB CACR: FORCEWT Mask */
-
-#define SCB_CACR_ECCEN_Pos 1U /*!< SCB CACR: ECCEN Position */
-#define SCB_CACR_ECCEN_Msk (1UL << SCB_CACR_ECCEN_Pos) /*!< SCB CACR: ECCEN Mask */
-
-#define SCB_CACR_SIWT_Pos 0U /*!< SCB CACR: SIWT Position */
-#define SCB_CACR_SIWT_Msk (1UL /*<< SCB_CACR_SIWT_Pos*/) /*!< SCB CACR: SIWT Mask */
-
-/* AHBS Control Register Definitions */
-#define SCB_AHBSCR_INITCOUNT_Pos 11U /*!< SCB AHBSCR: INITCOUNT Position */
-#define SCB_AHBSCR_INITCOUNT_Msk (0x1FUL << SCB_AHBPCR_INITCOUNT_Pos) /*!< SCB AHBSCR: INITCOUNT Mask */
-
-#define SCB_AHBSCR_TPRI_Pos 2U /*!< SCB AHBSCR: TPRI Position */
-#define SCB_AHBSCR_TPRI_Msk (0x1FFUL << SCB_AHBPCR_TPRI_Pos) /*!< SCB AHBSCR: TPRI Mask */
-
-#define SCB_AHBSCR_CTL_Pos 0U /*!< SCB AHBSCR: CTL Position*/
-#define SCB_AHBSCR_CTL_Msk (3UL /*<< SCB_AHBPCR_CTL_Pos*/) /*!< SCB AHBSCR: CTL Mask */
-
-/* Auxiliary Bus Fault Status Register Definitions */
-#define SCB_ABFSR_AXIMTYPE_Pos 8U /*!< SCB ABFSR: AXIMTYPE Position*/
-#define SCB_ABFSR_AXIMTYPE_Msk (3UL << SCB_ABFSR_AXIMTYPE_Pos) /*!< SCB ABFSR: AXIMTYPE Mask */
-
-#define SCB_ABFSR_EPPB_Pos 4U /*!< SCB ABFSR: EPPB Position*/
-#define SCB_ABFSR_EPPB_Msk (1UL << SCB_ABFSR_EPPB_Pos) /*!< SCB ABFSR: EPPB Mask */
-
-#define SCB_ABFSR_AXIM_Pos 3U /*!< SCB ABFSR: AXIM Position*/
-#define SCB_ABFSR_AXIM_Msk (1UL << SCB_ABFSR_AXIM_Pos) /*!< SCB ABFSR: AXIM Mask */
-
-#define SCB_ABFSR_AHBP_Pos 2U /*!< SCB ABFSR: AHBP Position*/
-#define SCB_ABFSR_AHBP_Msk (1UL << SCB_ABFSR_AHBP_Pos) /*!< SCB ABFSR: AHBP Mask */
-
-#define SCB_ABFSR_DTCM_Pos 1U /*!< SCB ABFSR: DTCM Position*/
-#define SCB_ABFSR_DTCM_Msk (1UL << SCB_ABFSR_DTCM_Pos) /*!< SCB ABFSR: DTCM Mask */
-
-#define SCB_ABFSR_ITCM_Pos 0U /*!< SCB ABFSR: ITCM Position*/
-#define SCB_ABFSR_ITCM_Msk (1UL /*<< SCB_ABFSR_ITCM_Pos*/) /*!< SCB ABFSR: ITCM Mask */
-
/*@} end of group CMSIS_SCB */
@@ -1097,10 +1017,7 @@ typedef struct
__IOM uint32_t TPR; /*!< Offset: 0xE40 (R/W) ITM Trace Privilege Register */
uint32_t RESERVED2[15U];
__IOM uint32_t TCR; /*!< Offset: 0xE80 (R/W) ITM Trace Control Register */
- uint32_t RESERVED3[29U];
- __OM uint32_t IWR; /*!< Offset: 0xEF8 ( /W) ITM Integration Write Register */
- __IM uint32_t IRR; /*!< Offset: 0xEFC (R/ ) ITM Integration Read Register */
- __IOM uint32_t IMCR; /*!< Offset: 0xF00 (R/W) ITM Integration Mode Control Register */
+ uint32_t RESERVED3[32U];
uint32_t RESERVED4[43U];
__OM uint32_t LAR; /*!< Offset: 0xFB0 ( /W) ITM Lock Access Register */
__IM uint32_t LSR; /*!< Offset: 0xFB4 (R/ ) ITM Lock Status Register */
@@ -1163,18 +1080,6 @@ typedef struct
#define ITM_TCR_ITMENA_Pos 0U /*!< ITM TCR: ITM Enable bit Position */
#define ITM_TCR_ITMENA_Msk (1UL /*<< ITM_TCR_ITMENA_Pos*/) /*!< ITM TCR: ITM Enable bit Mask */
-/* ITM Integration Write Register Definitions */
-#define ITM_IWR_ATVALIDM_Pos 0U /*!< ITM IWR: ATVALIDM Position */
-#define ITM_IWR_ATVALIDM_Msk (1UL /*<< ITM_IWR_ATVALIDM_Pos*/) /*!< ITM IWR: ATVALIDM Mask */
-
-/* ITM Integration Read Register Definitions */
-#define ITM_IRR_ATREADYM_Pos 0U /*!< ITM IRR: ATREADYM Position */
-#define ITM_IRR_ATREADYM_Msk (1UL /*<< ITM_IRR_ATREADYM_Pos*/) /*!< ITM IRR: ATREADYM Mask */
-
-/* ITM Integration Mode Control Register Definitions */
-#define ITM_IMCR_INTEGRATION_Pos 0U /*!< ITM IMCR: INTEGRATION Position */
-#define ITM_IMCR_INTEGRATION_Msk (1UL /*<< ITM_IMCR_INTEGRATION_Pos*/) /*!< ITM IMCR: INTEGRATION Mask */
-
/* ITM Lock Status Register Definitions */
#define ITM_LSR_ByteAcc_Pos 2U /*!< ITM LSR: ByteAcc Position */
#define ITM_LSR_ByteAcc_Msk (1UL << ITM_LSR_ByteAcc_Pos) /*!< ITM LSR: ByteAcc Mask */
@@ -2093,7 +1998,7 @@ typedef struct
#define EXC_RETURN_DCRS (0x00000020UL) /* bit [5] stacking rules for called registers: 0=skipped 1=saved */
#define EXC_RETURN_FTYPE (0x00000010UL) /* bit [4] allocate stack for floating-point context: 0=done 1=skipped */
#define EXC_RETURN_MODE (0x00000008UL) /* bit [3] processor mode for return: 0=Handler mode 1=Thread mode */
-#define EXC_RETURN_SPSEL (0x00000002UL) /* bit [1] stack pointer used to restore context: 0=MSP 1=PSP */
+#define EXC_RETURN_SPSEL (0x00000004UL) /* bit [2] stack pointer used to restore context: 0=MSP 1=PSP */
#define EXC_RETURN_ES (0x00000001UL) /* bit [0] security state exception was taken to: 0=Non-secure 1=Secure */
/* Integrity Signature (from ARMv8-M Architecture Reference Manual) for exception context stacking */
@@ -2122,7 +2027,7 @@ __STATIC_INLINE void __NVIC_SetPriorityGrouping(uint32_t PriorityGroup)
reg_value &= ~((uint32_t)(SCB_AIRCR_VECTKEY_Msk | SCB_AIRCR_PRIGROUP_Msk)); /* clear bits to change */
reg_value = (reg_value |
((uint32_t)0x5FAUL << SCB_AIRCR_VECTKEY_Pos) |
- (PriorityGroupTmp << 8U) ); /* Insert write key and priorty group */
+ (PriorityGroupTmp << SCB_AIRCR_PRIGROUP_Pos) ); /* Insert write key and priority group */
SCB->AIRCR = reg_value;
}
@@ -2148,7 +2053,9 @@ __STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn)
{
if ((int32_t)(IRQn) >= 0)
{
+ __COMPILER_BARRIER();
NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ __COMPILER_BARRIER();
}
}
@@ -2440,6 +2347,7 @@ __STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
{
uint32_t *vectors = (uint32_t *)SCB->VTOR;
vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;
+ __DSB();
}
@@ -2496,7 +2404,7 @@ __STATIC_INLINE void TZ_NVIC_SetPriorityGrouping_NS(uint32_t PriorityGroup)
reg_value &= ~((uint32_t)(SCB_AIRCR_VECTKEY_Msk | SCB_AIRCR_PRIGROUP_Msk)); /* clear bits to change */
reg_value = (reg_value |
((uint32_t)0x5FAUL << SCB_AIRCR_VECTKEY_Pos) |
- (PriorityGroupTmp << 8U) ); /* Insert write key and priorty group */
+ (PriorityGroupTmp << SCB_AIRCR_PRIGROUP_Pos) ); /* Insert write key and priority group */
SCB_NS->AIRCR = reg_value;
}
diff --git a/Include/core_cm0.h b/Include/core_cm0.h
index f929bba..cafae5a 100644
--- a/Include/core_cm0.h
+++ b/Include/core_cm0.h
@@ -1,11 +1,11 @@
/**************************************************************************//**
* @file core_cm0.h
* @brief CMSIS Cortex-M0 Core Peripheral Access Layer Header File
- * @version V5.0.5
- * @date 28. May 2018
+ * @version V5.0.6
+ * @date 13. March 2019
******************************************************************************/
/*
- * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
+ * Copyright (c) 2009-2019 Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -81,7 +81,7 @@
#endif
#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
- #if defined __ARM_PCS_VFP
+ #if defined __ARM_FP
#error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
#endif
@@ -316,7 +316,7 @@ typedef struct
__IOM uint32_t ISER[1U]; /*!< Offset: 0x000 (R/W) Interrupt Set Enable Register */
uint32_t RESERVED0[31U];
__IOM uint32_t ICER[1U]; /*!< Offset: 0x080 (R/W) Interrupt Clear Enable Register */
- uint32_t RSERVED1[31U];
+ uint32_t RESERVED1[31U];
__IOM uint32_t ISPR[1U]; /*!< Offset: 0x100 (R/W) Interrupt Set Pending Register */
uint32_t RESERVED2[31U];
__IOM uint32_t ICPR[1U]; /*!< Offset: 0x180 (R/W) Interrupt Clear Pending Register */
@@ -624,7 +624,9 @@ __STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn)
{
if ((int32_t)(IRQn) >= 0)
{
+ __COMPILER_BARRIER();
NVIC->ISER[0U] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ __COMPILER_BARRIER();
}
}
@@ -829,8 +831,9 @@ __STATIC_INLINE void NVIC_DecodePriority (uint32_t Priority, uint32_t PriorityGr
*/
__STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
{
- uint32_t *vectors = (uint32_t *)0x0U;
- vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;
+ uint32_t vectors = 0x0U;
+ (* (int *) (vectors + ((int32_t)IRQn + NVIC_USER_IRQ_OFFSET) * 4)) = vector;
+ /* ARM Application Note 321 states that the M0 does not require the architectural barrier */
}
@@ -844,8 +847,8 @@ __STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
*/
__STATIC_INLINE uint32_t __NVIC_GetVector(IRQn_Type IRQn)
{
- uint32_t *vectors = (uint32_t *)0x0U;
- return vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET];
+ uint32_t vectors = 0x0U;
+ return (uint32_t)(* (int *) (vectors + ((int32_t)IRQn + NVIC_USER_IRQ_OFFSET) * 4));
}
diff --git a/Include/core_cm0plus.h b/Include/core_cm0plus.h
index 424011a..d104965 100644
--- a/Include/core_cm0plus.h
+++ b/Include/core_cm0plus.h
@@ -1,11 +1,11 @@
/**************************************************************************//**
* @file core_cm0plus.h
* @brief CMSIS Cortex-M0+ Core Peripheral Access Layer Header File
- * @version V5.0.6
- * @date 28. May 2018
+ * @version V5.0.7
+ * @date 13. March 2019
******************************************************************************/
/*
- * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
+ * Copyright (c) 2009-2019 Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -81,7 +81,7 @@
#endif
#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
- #if defined __ARM_PCS_VFP
+ #if defined __ARM_FP
#error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
#endif
@@ -330,7 +330,7 @@ typedef struct
__IOM uint32_t ISER[1U]; /*!< Offset: 0x000 (R/W) Interrupt Set Enable Register */
uint32_t RESERVED0[31U];
__IOM uint32_t ICER[1U]; /*!< Offset: 0x080 (R/W) Interrupt Clear Enable Register */
- uint32_t RSERVED1[31U];
+ uint32_t RESERVED1[31U];
__IOM uint32_t ISPR[1U]; /*!< Offset: 0x100 (R/W) Interrupt Set Pending Register */
uint32_t RESERVED2[31U];
__IOM uint32_t ICPR[1U]; /*!< Offset: 0x180 (R/W) Interrupt Clear Pending Register */
@@ -742,7 +742,9 @@ __STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn)
{
if ((int32_t)(IRQn) >= 0)
{
+ __COMPILER_BARRIER();
NVIC->ISER[0U] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ __COMPILER_BARRIER();
}
}
@@ -948,11 +950,12 @@ __STATIC_INLINE void NVIC_DecodePriority (uint32_t Priority, uint32_t PriorityGr
__STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
{
#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1U)
- uint32_t *vectors = (uint32_t *)SCB->VTOR;
+ uint32_t vectors = SCB->VTOR;
#else
- uint32_t *vectors = (uint32_t *)0x0U;
+ uint32_t vectors = 0x0U;
#endif
- vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;
+ (* (int *) (vectors + ((int32_t)IRQn + NVIC_USER_IRQ_OFFSET) * 4)) = vector;
+ /* ARM Application Note 321 states that the M0+ does not require the architectural barrier */
}
@@ -967,12 +970,11 @@ __STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
__STATIC_INLINE uint32_t __NVIC_GetVector(IRQn_Type IRQn)
{
#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1U)
- uint32_t *vectors = (uint32_t *)SCB->VTOR;
+ uint32_t vectors = SCB->VTOR;
#else
- uint32_t *vectors = (uint32_t *)0x0U;
+ uint32_t vectors = 0x0U;
#endif
- return vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET];
-
+ return (uint32_t)(* (int *) (vectors + ((int32_t)IRQn + NVIC_USER_IRQ_OFFSET) * 4));
}
diff --git a/Include/core_cm1.h b/Include/core_cm1.h
index 0ed678e..76b4569 100644
--- a/Include/core_cm1.h
+++ b/Include/core_cm1.h
@@ -1,8 +1,8 @@
/**************************************************************************//**
* @file core_cm1.h
* @brief CMSIS Cortex-M1 Core Peripheral Access Layer Header File
- * @version V1.0.0
- * @date 23. July 2018
+ * @version V1.0.1
+ * @date 12. November 2018
******************************************************************************/
/*
* Copyright (c) 2009-2018 Arm Limited. All rights reserved.
@@ -81,7 +81,7 @@
#endif
#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
- #if defined __ARM_PCS_VFP
+ #if defined __ARM_FP
#error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
#endif
@@ -651,7 +651,9 @@ __STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn)
{
if ((int32_t)(IRQn) >= 0)
{
+ __COMPILER_BARRIER();
NVIC->ISER[0U] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ __COMPILER_BARRIER();
}
}
@@ -858,6 +860,7 @@ __STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
{
uint32_t *vectors = (uint32_t *)0x0U;
vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;
+ /* ARM Application Note 321 states that the M1 does not require the architectural barrier */
}
diff --git a/Include/core_cm23.h b/Include/core_cm23.h
index acbc5df..b79c6af 100644
--- a/Include/core_cm23.h
+++ b/Include/core_cm23.h
@@ -1,8 +1,8 @@
/**************************************************************************//**
* @file core_cm23.h
* @brief CMSIS Cortex-M23 Core Peripheral Access Layer Header File
- * @version V5.0.7
- * @date 22. June 2018
+ * @version V5.0.8
+ * @date 12. November 2018
******************************************************************************/
/*
* Copyright (c) 2009-2018 Arm Limited. All rights reserved.
@@ -81,7 +81,7 @@
#endif
#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
- #if defined __ARM_PCS_VFP
+ #if defined __ARM_FP
#error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
#endif
@@ -1298,7 +1298,7 @@ typedef struct
#define EXC_RETURN_DCRS (0x00000020UL) /* bit [5] stacking rules for called registers: 0=skipped 1=saved */
#define EXC_RETURN_FTYPE (0x00000010UL) /* bit [4] allocate stack for floating-point context: 0=done 1=skipped */
#define EXC_RETURN_MODE (0x00000008UL) /* bit [3] processor mode for return: 0=Handler mode 1=Thread mode */
-#define EXC_RETURN_SPSEL (0x00000002UL) /* bit [1] stack pointer used to restore context: 0=MSP 1=PSP */
+#define EXC_RETURN_SPSEL (0x00000004UL) /* bit [2] stack pointer used to restore context: 0=MSP 1=PSP */
#define EXC_RETURN_ES (0x00000001UL) /* bit [0] security state exception was taken to: 0=Non-secure 1=Secure */
/* Integrity Signature (from ARMv8-M Architecture Reference Manual) for exception context stacking */
@@ -1328,7 +1328,9 @@ __STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn)
{
if ((int32_t)(IRQn) >= 0)
{
+ __COMPILER_BARRIER();
NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ __COMPILER_BARRIER();
}
}
@@ -1627,6 +1629,7 @@ __STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
uint32_t *vectors = (uint32_t *)0x0U;
#endif
vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;
+ __DSB();
}
diff --git a/Include/core_cm3.h b/Include/core_cm3.h
index 74bff64..8157ca7 100644
--- a/Include/core_cm3.h
+++ b/Include/core_cm3.h
@@ -1,11 +1,11 @@
/**************************************************************************//**
* @file core_cm3.h
* @brief CMSIS Cortex-M3 Core Peripheral Access Layer Header File
- * @version V5.0.8
- * @date 04. June 2018
+ * @version V5.1.0
+ * @date 13. March 2019
******************************************************************************/
/*
- * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
+ * Copyright (c) 2009-2019 Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -81,7 +81,7 @@
#endif
#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
- #if defined __ARM_PCS_VFP
+ #if defined __ARM_FP
#error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
#endif
@@ -342,7 +342,7 @@ typedef struct
__IOM uint32_t ISER[8U]; /*!< Offset: 0x000 (R/W) Interrupt Set Enable Register */
uint32_t RESERVED0[24U];
__IOM uint32_t ICER[8U]; /*!< Offset: 0x080 (R/W) Interrupt Clear Enable Register */
- uint32_t RSERVED1[24U];
+ uint32_t RESERVED1[24U];
__IOM uint32_t ISPR[8U]; /*!< Offset: 0x100 (R/W) Interrupt Set Pending Register */
uint32_t RESERVED2[24U];
__IOM uint32_t ICPR[8U]; /*!< Offset: 0x180 (R/W) Interrupt Clear Pending Register */
@@ -668,6 +668,12 @@ typedef struct
#define SCnSCB_ICTR_INTLINESNUM_Msk (0xFUL /*<< SCnSCB_ICTR_INTLINESNUM_Pos*/) /*!< ICTR: INTLINESNUM Mask */
/* Auxiliary Control Register Definitions */
+#if defined (__CM3_REV) && (__CM3_REV >= 0x200U)
+#define SCnSCB_ACTLR_DISOOFP_Pos 9U /*!< ACTLR: DISOOFP Position */
+#define SCnSCB_ACTLR_DISOOFP_Msk (1UL << SCnSCB_ACTLR_DISOOFP_Pos) /*!< ACTLR: DISOOFP Mask */
+
+#define SCnSCB_ACTLR_DISFPCA_Pos 8U /*!< ACTLR: DISFPCA Position */
+#define SCnSCB_ACTLR_DISFPCA_Msk (1UL << SCnSCB_ACTLR_DISFPCA_Pos) /*!< ACTLR: DISFPCA Mask */
#define SCnSCB_ACTLR_DISFOLD_Pos 2U /*!< ACTLR: DISFOLD Position */
#define SCnSCB_ACTLR_DISFOLD_Msk (1UL << SCnSCB_ACTLR_DISFOLD_Pos) /*!< ACTLR: DISFOLD Mask */
@@ -677,6 +683,7 @@ typedef struct
#define SCnSCB_ACTLR_DISMCYCINT_Pos 0U /*!< ACTLR: DISMCYCINT Position */
#define SCnSCB_ACTLR_DISMCYCINT_Msk (1UL /*<< SCnSCB_ACTLR_DISMCYCINT_Pos*/) /*!< ACTLR: DISMCYCINT Mask */
+#endif
/*@} end of group CMSIS_SCnotSCB */
@@ -757,10 +764,7 @@ typedef struct
__IOM uint32_t TPR; /*!< Offset: 0xE40 (R/W) ITM Trace Privilege Register */
uint32_t RESERVED2[15U];
__IOM uint32_t TCR; /*!< Offset: 0xE80 (R/W) ITM Trace Control Register */
- uint32_t RESERVED3[29U];
- __OM uint32_t IWR; /*!< Offset: 0xEF8 ( /W) ITM Integration Write Register */
- __IM uint32_t IRR; /*!< Offset: 0xEFC (R/ ) ITM Integration Read Register */
- __IOM uint32_t IMCR; /*!< Offset: 0xF00 (R/W) ITM Integration Mode Control Register */
+ uint32_t RESERVED3[32U];
uint32_t RESERVED4[43U];
__OM uint32_t LAR; /*!< Offset: 0xFB0 ( /W) ITM Lock Access Register */
__IM uint32_t LSR; /*!< Offset: 0xFB4 (R/ ) ITM Lock Status Register */
@@ -811,18 +815,6 @@ typedef struct
#define ITM_TCR_ITMENA_Pos 0U /*!< ITM TCR: ITM Enable bit Position */
#define ITM_TCR_ITMENA_Msk (1UL /*<< ITM_TCR_ITMENA_Pos*/) /*!< ITM TCR: ITM Enable bit Mask */
-/* ITM Integration Write Register Definitions */
-#define ITM_IWR_ATVALIDM_Pos 0U /*!< ITM IWR: ATVALIDM Position */
-#define ITM_IWR_ATVALIDM_Msk (1UL /*<< ITM_IWR_ATVALIDM_Pos*/) /*!< ITM IWR: ATVALIDM Mask */
-
-/* ITM Integration Read Register Definitions */
-#define ITM_IRR_ATREADYM_Pos 0U /*!< ITM IRR: ATREADYM Position */
-#define ITM_IRR_ATREADYM_Msk (1UL /*<< ITM_IRR_ATREADYM_Pos*/) /*!< ITM IRR: ATREADYM Mask */
-
-/* ITM Integration Mode Control Register Definitions */
-#define ITM_IMCR_INTEGRATION_Pos 0U /*!< ITM IMCR: INTEGRATION Position */
-#define ITM_IMCR_INTEGRATION_Msk (1UL /*<< ITM_IMCR_INTEGRATION_Pos*/) /*!< ITM IMCR: INTEGRATION Mask */
-
/* ITM Lock Status Register Definitions */
#define ITM_LSR_ByteAcc_Pos 2U /*!< ITM LSR: ByteAcc Position */
#define ITM_LSR_ByteAcc_Msk (1UL << ITM_LSR_ByteAcc_Pos) /*!< ITM LSR: ByteAcc Mask */
@@ -1055,13 +1047,13 @@ typedef struct
/* TPI Integration ETM Data Register Definitions (FIFO0) */
#define TPI_FIFO0_ITM_ATVALID_Pos 29U /*!< TPI FIFO0: ITM_ATVALID Position */
-#define TPI_FIFO0_ITM_ATVALID_Msk (0x3UL << TPI_FIFO0_ITM_ATVALID_Pos) /*!< TPI FIFO0: ITM_ATVALID Mask */
+#define TPI_FIFO0_ITM_ATVALID_Msk (0x1UL << TPI_FIFO0_ITM_ATVALID_Pos) /*!< TPI FIFO0: ITM_ATVALID Mask */
#define TPI_FIFO0_ITM_bytecount_Pos 27U /*!< TPI FIFO0: ITM_bytecount Position */
#define TPI_FIFO0_ITM_bytecount_Msk (0x3UL << TPI_FIFO0_ITM_bytecount_Pos) /*!< TPI FIFO0: ITM_bytecount Mask */
#define TPI_FIFO0_ETM_ATVALID_Pos 26U /*!< TPI FIFO0: ETM_ATVALID Position */
-#define TPI_FIFO0_ETM_ATVALID_Msk (0x3UL << TPI_FIFO0_ETM_ATVALID_Pos) /*!< TPI FIFO0: ETM_ATVALID Mask */
+#define TPI_FIFO0_ETM_ATVALID_Msk (0x1UL << TPI_FIFO0_ETM_ATVALID_Pos) /*!< TPI FIFO0: ETM_ATVALID Mask */
#define TPI_FIFO0_ETM_bytecount_Pos 24U /*!< TPI FIFO0: ETM_bytecount Position */
#define TPI_FIFO0_ETM_bytecount_Msk (0x3UL << TPI_FIFO0_ETM_bytecount_Pos) /*!< TPI FIFO0: ETM_bytecount Mask */
@@ -1084,13 +1076,13 @@ typedef struct
/* TPI Integration ITM Data Register Definitions (FIFO1) */
#define TPI_FIFO1_ITM_ATVALID_Pos 29U /*!< TPI FIFO1: ITM_ATVALID Position */
-#define TPI_FIFO1_ITM_ATVALID_Msk (0x3UL << TPI_FIFO1_ITM_ATVALID_Pos) /*!< TPI FIFO1: ITM_ATVALID Mask */
+#define TPI_FIFO1_ITM_ATVALID_Msk (0x1UL << TPI_FIFO1_ITM_ATVALID_Pos) /*!< TPI FIFO1: ITM_ATVALID Mask */
#define TPI_FIFO1_ITM_bytecount_Pos 27U /*!< TPI FIFO1: ITM_bytecount Position */
#define TPI_FIFO1_ITM_bytecount_Msk (0x3UL << TPI_FIFO1_ITM_bytecount_Pos) /*!< TPI FIFO1: ITM_bytecount Mask */
#define TPI_FIFO1_ETM_ATVALID_Pos 26U /*!< TPI FIFO1: ETM_ATVALID Position */
-#define TPI_FIFO1_ETM_ATVALID_Msk (0x3UL << TPI_FIFO1_ETM_ATVALID_Pos) /*!< TPI FIFO1: ETM_ATVALID Mask */
+#define TPI_FIFO1_ETM_ATVALID_Msk (0x1UL << TPI_FIFO1_ETM_ATVALID_Pos) /*!< TPI FIFO1: ETM_ATVALID Mask */
#define TPI_FIFO1_ETM_bytecount_Pos 24U /*!< TPI FIFO1: ETM_bytecount Position */
#define TPI_FIFO1_ETM_bytecount_Msk (0x3UL << TPI_FIFO1_ETM_bytecount_Pos) /*!< TPI FIFO1: ETM_bytecount Mask */
@@ -1512,7 +1504,9 @@ __STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn)
{
if ((int32_t)(IRQn) >= 0)
{
+ __COMPILER_BARRIER();
NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ __COMPILER_BARRIER();
}
}
@@ -1735,8 +1729,9 @@ __STATIC_INLINE void NVIC_DecodePriority (uint32_t Priority, uint32_t PriorityGr
*/
__STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
{
- uint32_t *vectors = (uint32_t *)SCB->VTOR;
- vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;
+ uint32_t vectors = (uint32_t )SCB->VTOR;
+ (* (int *) (vectors + ((int32_t)IRQn + NVIC_USER_IRQ_OFFSET) * 4)) = vector;
+ /* ARM Application Note 321 states that the M3 does not require the architectural barrier */
}
@@ -1750,8 +1745,8 @@ __STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
*/
__STATIC_INLINE uint32_t __NVIC_GetVector(IRQn_Type IRQn)
{
- uint32_t *vectors = (uint32_t *)SCB->VTOR;
- return vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET];
+ uint32_t vectors = (uint32_t )SCB->VTOR;
+ return (uint32_t)(* (int *) (vectors + ((int32_t)IRQn + NVIC_USER_IRQ_OFFSET) * 4));
}
@@ -1784,6 +1779,7 @@ __NO_RETURN __STATIC_INLINE void __NVIC_SystemReset(void)
#endif
+
/* ########################## FPU functions #################################### */
/**
\ingroup CMSIS_Core_FunctionInterface
diff --git a/Include/core_cm33.h b/Include/core_cm33.h
index 6cd2db7..7fed59a 100644
--- a/Include/core_cm33.h
+++ b/Include/core_cm33.h
@@ -1,8 +1,8 @@
/**************************************************************************//**
* @file core_cm33.h
* @brief CMSIS Cortex-M33 Core Peripheral Access Layer Header File
- * @version V5.0.9
- * @date 06. July 2018
+ * @version V5.1.0
+ * @date 12. November 2018
******************************************************************************/
/*
* Copyright (c) 2009-2018 Arm Limited. All rights reserved.
@@ -97,7 +97,7 @@
#endif
#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
- #if defined (__ARM_PCS_VFP)
+ #if defined (__ARM_FP)
#if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
#define __FPU_USED 1U
#else
@@ -538,14 +538,6 @@ typedef struct
__OM uint32_t DCCSW; /*!< Offset: 0x26C ( /W) D-Cache Clean by Set-way */
__OM uint32_t DCCIMVAC; /*!< Offset: 0x270 ( /W) D-Cache Clean and Invalidate by MVA to PoC */
__OM uint32_t DCCISW; /*!< Offset: 0x274 ( /W) D-Cache Clean and Invalidate by Set-way */
- uint32_t RESERVED7[6U];
- __IOM uint32_t ITCMCR; /*!< Offset: 0x290 (R/W) Instruction Tightly-Coupled Memory Control Register */
- __IOM uint32_t DTCMCR; /*!< Offset: 0x294 (R/W) Data Tightly-Coupled Memory Control Registers */
- __IOM uint32_t AHBPCR; /*!< Offset: 0x298 (R/W) AHBP Control Register */
- __IOM uint32_t CACR; /*!< Offset: 0x29C (R/W) L1 Cache Control Register */
- __IOM uint32_t AHBSCR; /*!< Offset: 0x2A0 (R/W) AHB Slave Control Register */
- uint32_t RESERVED8[1U];
- __IOM uint32_t ABFSR; /*!< Offset: 0x2A8 (R/W) Auxiliary Bus Fault Status Register */
} SCB_Type;
/* SCB CPUID Register Definitions */
@@ -921,78 +913,6 @@ typedef struct
#define SCB_DCCISW_SET_Pos 5U /*!< SCB DCCISW: Set Position */
#define SCB_DCCISW_SET_Msk (0x1FFUL << SCB_DCCISW_SET_Pos) /*!< SCB DCCISW: Set Mask */
-/* Instruction Tightly-Coupled Memory Control Register Definitions */
-#define SCB_ITCMCR_SZ_Pos 3U /*!< SCB ITCMCR: SZ Position */
-#define SCB_ITCMCR_SZ_Msk (0xFUL << SCB_ITCMCR_SZ_Pos) /*!< SCB ITCMCR: SZ Mask */
-
-#define SCB_ITCMCR_RETEN_Pos 2U /*!< SCB ITCMCR: RETEN Position */
-#define SCB_ITCMCR_RETEN_Msk (1UL << SCB_ITCMCR_RETEN_Pos) /*!< SCB ITCMCR: RETEN Mask */
-
-#define SCB_ITCMCR_RMW_Pos 1U /*!< SCB ITCMCR: RMW Position */
-#define SCB_ITCMCR_RMW_Msk (1UL << SCB_ITCMCR_RMW_Pos) /*!< SCB ITCMCR: RMW Mask */
-
-#define SCB_ITCMCR_EN_Pos 0U /*!< SCB ITCMCR: EN Position */
-#define SCB_ITCMCR_EN_Msk (1UL /*<< SCB_ITCMCR_EN_Pos*/) /*!< SCB ITCMCR: EN Mask */
-
-/* Data Tightly-Coupled Memory Control Register Definitions */
-#define SCB_DTCMCR_SZ_Pos 3U /*!< SCB DTCMCR: SZ Position */
-#define SCB_DTCMCR_SZ_Msk (0xFUL << SCB_DTCMCR_SZ_Pos) /*!< SCB DTCMCR: SZ Mask */
-
-#define SCB_DTCMCR_RETEN_Pos 2U /*!< SCB DTCMCR: RETEN Position */
-#define SCB_DTCMCR_RETEN_Msk (1UL << SCB_DTCMCR_RETEN_Pos) /*!< SCB DTCMCR: RETEN Mask */
-
-#define SCB_DTCMCR_RMW_Pos 1U /*!< SCB DTCMCR: RMW Position */
-#define SCB_DTCMCR_RMW_Msk (1UL << SCB_DTCMCR_RMW_Pos) /*!< SCB DTCMCR: RMW Mask */
-
-#define SCB_DTCMCR_EN_Pos 0U /*!< SCB DTCMCR: EN Position */
-#define SCB_DTCMCR_EN_Msk (1UL /*<< SCB_DTCMCR_EN_Pos*/) /*!< SCB DTCMCR: EN Mask */
-
-/* AHBP Control Register Definitions */
-#define SCB_AHBPCR_SZ_Pos 1U /*!< SCB AHBPCR: SZ Position */
-#define SCB_AHBPCR_SZ_Msk (7UL << SCB_AHBPCR_SZ_Pos) /*!< SCB AHBPCR: SZ Mask */
-
-#define SCB_AHBPCR_EN_Pos 0U /*!< SCB AHBPCR: EN Position */
-#define SCB_AHBPCR_EN_Msk (1UL /*<< SCB_AHBPCR_EN_Pos*/) /*!< SCB AHBPCR: EN Mask */
-
-/* L1 Cache Control Register Definitions */
-#define SCB_CACR_FORCEWT_Pos 2U /*!< SCB CACR: FORCEWT Position */
-#define SCB_CACR_FORCEWT_Msk (1UL << SCB_CACR_FORCEWT_Pos) /*!< SCB CACR: FORCEWT Mask */
-
-#define SCB_CACR_ECCEN_Pos 1U /*!< SCB CACR: ECCEN Position */
-#define SCB_CACR_ECCEN_Msk (1UL << SCB_CACR_ECCEN_Pos) /*!< SCB CACR: ECCEN Mask */
-
-#define SCB_CACR_SIWT_Pos 0U /*!< SCB CACR: SIWT Position */
-#define SCB_CACR_SIWT_Msk (1UL /*<< SCB_CACR_SIWT_Pos*/) /*!< SCB CACR: SIWT Mask */
-
-/* AHBS Control Register Definitions */
-#define SCB_AHBSCR_INITCOUNT_Pos 11U /*!< SCB AHBSCR: INITCOUNT Position */
-#define SCB_AHBSCR_INITCOUNT_Msk (0x1FUL << SCB_AHBPCR_INITCOUNT_Pos) /*!< SCB AHBSCR: INITCOUNT Mask */
-
-#define SCB_AHBSCR_TPRI_Pos 2U /*!< SCB AHBSCR: TPRI Position */
-#define SCB_AHBSCR_TPRI_Msk (0x1FFUL << SCB_AHBPCR_TPRI_Pos) /*!< SCB AHBSCR: TPRI Mask */
-
-#define SCB_AHBSCR_CTL_Pos 0U /*!< SCB AHBSCR: CTL Position*/
-#define SCB_AHBSCR_CTL_Msk (3UL /*<< SCB_AHBPCR_CTL_Pos*/) /*!< SCB AHBSCR: CTL Mask */
-
-/* Auxiliary Bus Fault Status Register Definitions */
-#define SCB_ABFSR_AXIMTYPE_Pos 8U /*!< SCB ABFSR: AXIMTYPE Position*/
-#define SCB_ABFSR_AXIMTYPE_Msk (3UL << SCB_ABFSR_AXIMTYPE_Pos) /*!< SCB ABFSR: AXIMTYPE Mask */
-
-#define SCB_ABFSR_EPPB_Pos 4U /*!< SCB ABFSR: EPPB Position*/
-#define SCB_ABFSR_EPPB_Msk (1UL << SCB_ABFSR_EPPB_Pos) /*!< SCB ABFSR: EPPB Mask */
-
-#define SCB_ABFSR_AXIM_Pos 3U /*!< SCB ABFSR: AXIM Position*/
-#define SCB_ABFSR_AXIM_Msk (1UL << SCB_ABFSR_AXIM_Pos) /*!< SCB ABFSR: AXIM Mask */
-
-#define SCB_ABFSR_AHBP_Pos 2U /*!< SCB ABFSR: AHBP Position*/
-#define SCB_ABFSR_AHBP_Msk (1UL << SCB_ABFSR_AHBP_Pos) /*!< SCB ABFSR: AHBP Mask */
-
-#define SCB_ABFSR_DTCM_Pos 1U /*!< SCB ABFSR: DTCM Position*/
-#define SCB_ABFSR_DTCM_Msk (1UL << SCB_ABFSR_DTCM_Pos) /*!< SCB ABFSR: DTCM Mask */
-
-#define SCB_ABFSR_ITCM_Pos 0U /*!< SCB ABFSR: ITCM Position*/
-#define SCB_ABFSR_ITCM_Msk (1UL /*<< SCB_ABFSR_ITCM_Pos*/) /*!< SCB ABFSR: ITCM Mask */
-
/*@} end of group CMSIS_SCB */
@@ -1097,10 +1017,7 @@ typedef struct
__IOM uint32_t TPR; /*!< Offset: 0xE40 (R/W) ITM Trace Privilege Register */
uint32_t RESERVED2[15U];
__IOM uint32_t TCR; /*!< Offset: 0xE80 (R/W) ITM Trace Control Register */
- uint32_t RESERVED3[29U];
- __OM uint32_t IWR; /*!< Offset: 0xEF8 ( /W) ITM Integration Write Register */
- __IM uint32_t IRR; /*!< Offset: 0xEFC (R/ ) ITM Integration Read Register */
- __IOM uint32_t IMCR; /*!< Offset: 0xF00 (R/W) ITM Integration Mode Control Register */
+ uint32_t RESERVED3[32U];
uint32_t RESERVED4[43U];
__OM uint32_t LAR; /*!< Offset: 0xFB0 ( /W) ITM Lock Access Register */
__IM uint32_t LSR; /*!< Offset: 0xFB4 (R/ ) ITM Lock Status Register */
@@ -1163,18 +1080,6 @@ typedef struct
#define ITM_TCR_ITMENA_Pos 0U /*!< ITM TCR: ITM Enable bit Position */
#define ITM_TCR_ITMENA_Msk (1UL /*<< ITM_TCR_ITMENA_Pos*/) /*!< ITM TCR: ITM Enable bit Mask */
-/* ITM Integration Write Register Definitions */
-#define ITM_IWR_ATVALIDM_Pos 0U /*!< ITM IWR: ATVALIDM Position */
-#define ITM_IWR_ATVALIDM_Msk (1UL /*<< ITM_IWR_ATVALIDM_Pos*/) /*!< ITM IWR: ATVALIDM Mask */
-
-/* ITM Integration Read Register Definitions */
-#define ITM_IRR_ATREADYM_Pos 0U /*!< ITM IRR: ATREADYM Position */
-#define ITM_IRR_ATREADYM_Msk (1UL /*<< ITM_IRR_ATREADYM_Pos*/) /*!< ITM IRR: ATREADYM Mask */
-
-/* ITM Integration Mode Control Register Definitions */
-#define ITM_IMCR_INTEGRATION_Pos 0U /*!< ITM IMCR: INTEGRATION Position */
-#define ITM_IMCR_INTEGRATION_Msk (1UL /*<< ITM_IMCR_INTEGRATION_Pos*/) /*!< ITM IMCR: INTEGRATION Mask */
-
/* ITM Lock Status Register Definitions */
#define ITM_LSR_ByteAcc_Pos 2U /*!< ITM LSR: ByteAcc Position */
#define ITM_LSR_ByteAcc_Msk (1UL << ITM_LSR_ByteAcc_Pos) /*!< ITM LSR: ByteAcc Mask */
@@ -2168,7 +2073,7 @@ typedef struct
#define EXC_RETURN_DCRS (0x00000020UL) /* bit [5] stacking rules for called registers: 0=skipped 1=saved */
#define EXC_RETURN_FTYPE (0x00000010UL) /* bit [4] allocate stack for floating-point context: 0=done 1=skipped */
#define EXC_RETURN_MODE (0x00000008UL) /* bit [3] processor mode for return: 0=Handler mode 1=Thread mode */
-#define EXC_RETURN_SPSEL (0x00000002UL) /* bit [1] stack pointer used to restore context: 0=MSP 1=PSP */
+#define EXC_RETURN_SPSEL (0x00000004UL) /* bit [2] stack pointer used to restore context: 0=MSP 1=PSP */
#define EXC_RETURN_ES (0x00000001UL) /* bit [0] security state exception was taken to: 0=Non-secure 1=Secure */
/* Integrity Signature (from ARMv8-M Architecture Reference Manual) for exception context stacking */
@@ -2197,7 +2102,7 @@ __STATIC_INLINE void __NVIC_SetPriorityGrouping(uint32_t PriorityGroup)
reg_value &= ~((uint32_t)(SCB_AIRCR_VECTKEY_Msk | SCB_AIRCR_PRIGROUP_Msk)); /* clear bits to change */
reg_value = (reg_value |
((uint32_t)0x5FAUL << SCB_AIRCR_VECTKEY_Pos) |
- (PriorityGroupTmp << 8U) ); /* Insert write key and priority group */
+ (PriorityGroupTmp << SCB_AIRCR_PRIGROUP_Pos) ); /* Insert write key and priority group */
SCB->AIRCR = reg_value;
}
@@ -2223,7 +2128,9 @@ __STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn)
{
if ((int32_t)(IRQn) >= 0)
{
+ __COMPILER_BARRIER();
NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ __COMPILER_BARRIER();
}
}
@@ -2515,6 +2422,7 @@ __STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
{
uint32_t *vectors = (uint32_t *)SCB->VTOR;
vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;
+ __DSB();
}
diff --git a/Include/core_cm35p.h b/Include/core_cm35p.h
new file mode 100644
index 0000000..5579c82
--- /dev/null
+++ b/Include/core_cm35p.h
@@ -0,0 +1,2910 @@
+/**************************************************************************//**
+ * @file core_cm35p.h
+ * @brief CMSIS Cortex-M35P Core Peripheral Access Layer Header File
+ * @version V1.0.0
+ * @date 12. November 2018
+ ******************************************************************************/
+/*
+ * Copyright (c) 2018 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined ( __ICCARM__ )
+ #pragma system_include /* treat file as system include file for MISRA check */
+#elif defined (__clang__)
+ #pragma clang system_header /* treat file as system include file */
+#endif
+
+#ifndef __CORE_CM35P_H_GENERIC
+#define __CORE_CM35P_H_GENERIC
+
+#include
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/**
+ \page CMSIS_MISRA_Exceptions MISRA-C:2004 Compliance Exceptions
+ CMSIS violates the following MISRA-C:2004 rules:
+
+ \li Required Rule 8.5, object/function definition in header file.
+ Function definitions in header files are used to allow 'inlining'.
+
+ \li Required Rule 18.4, declaration of union type or object of union type: '{...}'.
+ Unions are used for effective representation of core registers.
+
+ \li Advisory Rule 19.7, Function-like macro defined.
+ Function-like macros are used to allow more efficient code.
+ */
+
+
+/*******************************************************************************
+ * CMSIS definitions
+ ******************************************************************************/
+/**
+ \ingroup Cortex_M35P
+ @{
+ */
+
+#include "cmsis_version.h"
+
+/* CMSIS CM35P definitions */
+#define __CM35P_CMSIS_VERSION_MAIN (__CM_CMSIS_VERSION_MAIN) /*!< \deprecated [31:16] CMSIS HAL main version */
+#define __CM35P_CMSIS_VERSION_SUB (__CM_CMSIS_VERSION_SUB) /*!< \deprecated [15:0] CMSIS HAL sub version */
+#define __CM35P_CMSIS_VERSION ((__CM35P_CMSIS_VERSION_MAIN << 16U) | \
+ __CM35P_CMSIS_VERSION_SUB ) /*!< \deprecated CMSIS HAL version number */
+
+#define __CORTEX_M (35U) /*!< Cortex-M Core */
+
+/** __FPU_USED indicates whether an FPU is used or not.
+ For this, __FPU_PRESENT has to be checked prior to making use of FPU specific registers and functions.
+*/
+#if defined ( __CC_ARM )
+ #if defined (__TARGET_FPU_VFP)
+ #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+ #define __FPU_USED 1U
+ #else
+ #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0U
+ #endif
+ #else
+ #define __FPU_USED 0U
+ #endif
+
+ #if defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1U)
+ #if defined (__DSP_PRESENT) && (__DSP_PRESENT == 1U)
+ #define __DSP_USED 1U
+ #else
+ #error "Compiler generates DSP (SIMD) instructions for a devices without DSP extensions (check __DSP_PRESENT)"
+ #define __DSP_USED 0U
+ #endif
+ #else
+ #define __DSP_USED 0U
+ #endif
+
+#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
+ #if defined (__ARM_FP)
+ #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+ #define __FPU_USED 1U
+ #else
+ #warning "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0U
+ #endif
+ #else
+ #define __FPU_USED 0U
+ #endif
+
+ #if defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1U)
+ #if defined (__DSP_PRESENT) && (__DSP_PRESENT == 1U)
+ #define __DSP_USED 1U
+ #else
+ #error "Compiler generates DSP (SIMD) instructions for a devices without DSP extensions (check __DSP_PRESENT)"
+ #define __DSP_USED 0U
+ #endif
+ #else
+ #define __DSP_USED 0U
+ #endif
+
+#elif defined ( __GNUC__ )
+ #if defined (__VFP_FP__) && !defined(__SOFTFP__)
+ #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+ #define __FPU_USED 1U
+ #else
+ #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0U
+ #endif
+ #else
+ #define __FPU_USED 0U
+ #endif
+
+ #if defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1U)
+ #if defined (__DSP_PRESENT) && (__DSP_PRESENT == 1U)
+ #define __DSP_USED 1U
+ #else
+ #error "Compiler generates DSP (SIMD) instructions for a devices without DSP extensions (check __DSP_PRESENT)"
+ #define __DSP_USED 0U
+ #endif
+ #else
+ #define __DSP_USED 0U
+ #endif
+
+#elif defined ( __ICCARM__ )
+ #if defined (__ARMVFP__)
+ #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+ #define __FPU_USED 1U
+ #else
+ #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0U
+ #endif
+ #else
+ #define __FPU_USED 0U
+ #endif
+
+ #if defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1U)
+ #if defined (__DSP_PRESENT) && (__DSP_PRESENT == 1U)
+ #define __DSP_USED 1U
+ #else
+ #error "Compiler generates DSP (SIMD) instructions for a devices without DSP extensions (check __DSP_PRESENT)"
+ #define __DSP_USED 0U
+ #endif
+ #else
+ #define __DSP_USED 0U
+ #endif
+
+#elif defined ( __TI_ARM__ )
+ #if defined (__TI_VFP_SUPPORT__)
+ #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+ #define __FPU_USED 1U
+ #else
+ #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0U
+ #endif
+ #else
+ #define __FPU_USED 0U
+ #endif
+
+#elif defined ( __TASKING__ )
+ #if defined (__FPU_VFP__)
+ #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+ #define __FPU_USED 1U
+ #else
+ #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0U
+ #endif
+ #else
+ #define __FPU_USED 0U
+ #endif
+
+#elif defined ( __CSMC__ )
+ #if ( __CSMC__ & 0x400U)
+ #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+ #define __FPU_USED 1U
+ #else
+ #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0U
+ #endif
+ #else
+ #define __FPU_USED 0U
+ #endif
+
+#endif
+
+#include "cmsis_compiler.h" /* CMSIS compiler specific defines */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CORE_CM35P_H_GENERIC */
+
+#ifndef __CMSIS_GENERIC
+
+#ifndef __CORE_CM35P_H_DEPENDANT
+#define __CORE_CM35P_H_DEPENDANT
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/* check device defines and use defaults */
+#if defined __CHECK_DEVICE_DEFINES
+ #ifndef __CM35P_REV
+ #define __CM35P_REV 0x0000U
+ #warning "__CM35P_REV not defined in device header file; using default!"
+ #endif
+
+ #ifndef __FPU_PRESENT
+ #define __FPU_PRESENT 0U
+ #warning "__FPU_PRESENT not defined in device header file; using default!"
+ #endif
+
+ #ifndef __MPU_PRESENT
+ #define __MPU_PRESENT 0U
+ #warning "__MPU_PRESENT not defined in device header file; using default!"
+ #endif
+
+ #ifndef __SAUREGION_PRESENT
+ #define __SAUREGION_PRESENT 0U
+ #warning "__SAUREGION_PRESENT not defined in device header file; using default!"
+ #endif
+
+ #ifndef __DSP_PRESENT
+ #define __DSP_PRESENT 0U
+ #warning "__DSP_PRESENT not defined in device header file; using default!"
+ #endif
+
+ #ifndef __NVIC_PRIO_BITS
+ #define __NVIC_PRIO_BITS 3U
+ #warning "__NVIC_PRIO_BITS not defined in device header file; using default!"
+ #endif
+
+ #ifndef __Vendor_SysTickConfig
+ #define __Vendor_SysTickConfig 0U
+ #warning "__Vendor_SysTickConfig not defined in device header file; using default!"
+ #endif
+#endif
+
+/* IO definitions (access restrictions to peripheral registers) */
+/**
+ \defgroup CMSIS_glob_defs CMSIS Global Defines
+
+ IO Type Qualifiers are used
+ \li to specify the access to peripheral variables.
+ \li for automatic generation of peripheral register debug information.
+*/
+#ifdef __cplusplus
+ #define __I volatile /*!< Defines 'read only' permissions */
+#else
+ #define __I volatile const /*!< Defines 'read only' permissions */
+#endif
+#define __O volatile /*!< Defines 'write only' permissions */
+#define __IO volatile /*!< Defines 'read / write' permissions */
+
+/* following defines should be used for structure members */
+#define __IM volatile const /*! Defines 'read only' structure member permissions */
+#define __OM volatile /*! Defines 'write only' structure member permissions */
+#define __IOM volatile /*! Defines 'read / write' structure member permissions */
+
+/*@} end of group Cortex_M35P */
+
+
+
+/*******************************************************************************
+ * Register Abstraction
+ Core Register contain:
+ - Core Register
+ - Core NVIC Register
+ - Core SCB Register
+ - Core SysTick Register
+ - Core Debug Register
+ - Core MPU Register
+ - Core SAU Register
+ - Core FPU Register
+ ******************************************************************************/
+/**
+ \defgroup CMSIS_core_register Defines and Type Definitions
+ \brief Type definitions and defines for Cortex-M processor based devices.
+*/
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_CORE Status and Control Registers
+ \brief Core Register type definitions.
+ @{
+ */
+
+/**
+ \brief Union type to access the Application Program Status Register (APSR).
+ */
+typedef union
+{
+ struct
+ {
+ uint32_t _reserved0:16; /*!< bit: 0..15 Reserved */
+ uint32_t GE:4; /*!< bit: 16..19 Greater than or Equal flags */
+ uint32_t _reserved1:7; /*!< bit: 20..26 Reserved */
+ uint32_t Q:1; /*!< bit: 27 Saturation condition flag */
+ uint32_t V:1; /*!< bit: 28 Overflow condition code flag */
+ uint32_t C:1; /*!< bit: 29 Carry condition code flag */
+ uint32_t Z:1; /*!< bit: 30 Zero condition code flag */
+ uint32_t N:1; /*!< bit: 31 Negative condition code flag */
+ } b; /*!< Structure used for bit access */
+ uint32_t w; /*!< Type used for word access */
+} APSR_Type;
+
+/* APSR Register Definitions */
+#define APSR_N_Pos 31U /*!< APSR: N Position */
+#define APSR_N_Msk (1UL << APSR_N_Pos) /*!< APSR: N Mask */
+
+#define APSR_Z_Pos 30U /*!< APSR: Z Position */
+#define APSR_Z_Msk (1UL << APSR_Z_Pos) /*!< APSR: Z Mask */
+
+#define APSR_C_Pos 29U /*!< APSR: C Position */
+#define APSR_C_Msk (1UL << APSR_C_Pos) /*!< APSR: C Mask */
+
+#define APSR_V_Pos 28U /*!< APSR: V Position */
+#define APSR_V_Msk (1UL << APSR_V_Pos) /*!< APSR: V Mask */
+
+#define APSR_Q_Pos 27U /*!< APSR: Q Position */
+#define APSR_Q_Msk (1UL << APSR_Q_Pos) /*!< APSR: Q Mask */
+
+#define APSR_GE_Pos 16U /*!< APSR: GE Position */
+#define APSR_GE_Msk (0xFUL << APSR_GE_Pos) /*!< APSR: GE Mask */
+
+
+/**
+ \brief Union type to access the Interrupt Program Status Register (IPSR).
+ */
+typedef union
+{
+ struct
+ {
+ uint32_t ISR:9; /*!< bit: 0.. 8 Exception number */
+ uint32_t _reserved0:23; /*!< bit: 9..31 Reserved */
+ } b; /*!< Structure used for bit access */
+ uint32_t w; /*!< Type used for word access */
+} IPSR_Type;
+
+/* IPSR Register Definitions */
+#define IPSR_ISR_Pos 0U /*!< IPSR: ISR Position */
+#define IPSR_ISR_Msk (0x1FFUL /*<< IPSR_ISR_Pos*/) /*!< IPSR: ISR Mask */
+
+
+/**
+ \brief Union type to access the Special-Purpose Program Status Registers (xPSR).
+ */
+typedef union
+{
+ struct
+ {
+ uint32_t ISR:9; /*!< bit: 0.. 8 Exception number */
+ uint32_t _reserved0:7; /*!< bit: 9..15 Reserved */
+ uint32_t GE:4; /*!< bit: 16..19 Greater than or Equal flags */
+ uint32_t _reserved1:4; /*!< bit: 20..23 Reserved */
+ uint32_t T:1; /*!< bit: 24 Thumb bit (read 0) */
+ uint32_t IT:2; /*!< bit: 25..26 saved IT state (read 0) */
+ uint32_t Q:1; /*!< bit: 27 Saturation condition flag */
+ uint32_t V:1; /*!< bit: 28 Overflow condition code flag */
+ uint32_t C:1; /*!< bit: 29 Carry condition code flag */
+ uint32_t Z:1; /*!< bit: 30 Zero condition code flag */
+ uint32_t N:1; /*!< bit: 31 Negative condition code flag */
+ } b; /*!< Structure used for bit access */
+ uint32_t w; /*!< Type used for word access */
+} xPSR_Type;
+
+/* xPSR Register Definitions */
+#define xPSR_N_Pos 31U /*!< xPSR: N Position */
+#define xPSR_N_Msk (1UL << xPSR_N_Pos) /*!< xPSR: N Mask */
+
+#define xPSR_Z_Pos 30U /*!< xPSR: Z Position */
+#define xPSR_Z_Msk (1UL << xPSR_Z_Pos) /*!< xPSR: Z Mask */
+
+#define xPSR_C_Pos 29U /*!< xPSR: C Position */
+#define xPSR_C_Msk (1UL << xPSR_C_Pos) /*!< xPSR: C Mask */
+
+#define xPSR_V_Pos 28U /*!< xPSR: V Position */
+#define xPSR_V_Msk (1UL << xPSR_V_Pos) /*!< xPSR: V Mask */
+
+#define xPSR_Q_Pos 27U /*!< xPSR: Q Position */
+#define xPSR_Q_Msk (1UL << xPSR_Q_Pos) /*!< xPSR: Q Mask */
+
+#define xPSR_IT_Pos 25U /*!< xPSR: IT Position */
+#define xPSR_IT_Msk (3UL << xPSR_IT_Pos) /*!< xPSR: IT Mask */
+
+#define xPSR_T_Pos 24U /*!< xPSR: T Position */
+#define xPSR_T_Msk (1UL << xPSR_T_Pos) /*!< xPSR: T Mask */
+
+#define xPSR_GE_Pos 16U /*!< xPSR: GE Position */
+#define xPSR_GE_Msk (0xFUL << xPSR_GE_Pos) /*!< xPSR: GE Mask */
+
+#define xPSR_ISR_Pos 0U /*!< xPSR: ISR Position */
+#define xPSR_ISR_Msk (0x1FFUL /*<< xPSR_ISR_Pos*/) /*!< xPSR: ISR Mask */
+
+
+/**
+ \brief Union type to access the Control Registers (CONTROL).
+ */
+typedef union
+{
+ struct
+ {
+ uint32_t nPRIV:1; /*!< bit: 0 Execution privilege in Thread mode */
+ uint32_t SPSEL:1; /*!< bit: 1 Stack-pointer select */
+ uint32_t FPCA:1; /*!< bit: 2 Floating-point context active */
+ uint32_t SFPA:1; /*!< bit: 3 Secure floating-point active */
+ uint32_t _reserved1:28; /*!< bit: 4..31 Reserved */
+ } b; /*!< Structure used for bit access */
+ uint32_t w; /*!< Type used for word access */
+} CONTROL_Type;
+
+/* CONTROL Register Definitions */
+#define CONTROL_SFPA_Pos 3U /*!< CONTROL: SFPA Position */
+#define CONTROL_SFPA_Msk (1UL << CONTROL_SFPA_Pos) /*!< CONTROL: SFPA Mask */
+
+#define CONTROL_FPCA_Pos 2U /*!< CONTROL: FPCA Position */
+#define CONTROL_FPCA_Msk (1UL << CONTROL_FPCA_Pos) /*!< CONTROL: FPCA Mask */
+
+#define CONTROL_SPSEL_Pos 1U /*!< CONTROL: SPSEL Position */
+#define CONTROL_SPSEL_Msk (1UL << CONTROL_SPSEL_Pos) /*!< CONTROL: SPSEL Mask */
+
+#define CONTROL_nPRIV_Pos 0U /*!< CONTROL: nPRIV Position */
+#define CONTROL_nPRIV_Msk (1UL /*<< CONTROL_nPRIV_Pos*/) /*!< CONTROL: nPRIV Mask */
+
+/*@} end of group CMSIS_CORE */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_NVIC Nested Vectored Interrupt Controller (NVIC)
+ \brief Type definitions for the NVIC Registers
+ @{
+ */
+
+/**
+ \brief Structure type to access the Nested Vectored Interrupt Controller (NVIC).
+ */
+typedef struct
+{
+ __IOM uint32_t ISER[16U]; /*!< Offset: 0x000 (R/W) Interrupt Set Enable Register */
+ uint32_t RESERVED0[16U];
+ __IOM uint32_t ICER[16U]; /*!< Offset: 0x080 (R/W) Interrupt Clear Enable Register */
+ uint32_t RSERVED1[16U];
+ __IOM uint32_t ISPR[16U]; /*!< Offset: 0x100 (R/W) Interrupt Set Pending Register */
+ uint32_t RESERVED2[16U];
+ __IOM uint32_t ICPR[16U]; /*!< Offset: 0x180 (R/W) Interrupt Clear Pending Register */
+ uint32_t RESERVED3[16U];
+ __IOM uint32_t IABR[16U]; /*!< Offset: 0x200 (R/W) Interrupt Active bit Register */
+ uint32_t RESERVED4[16U];
+ __IOM uint32_t ITNS[16U]; /*!< Offset: 0x280 (R/W) Interrupt Non-Secure State Register */
+ uint32_t RESERVED5[16U];
+ __IOM uint8_t IPR[496U]; /*!< Offset: 0x300 (R/W) Interrupt Priority Register (8Bit wide) */
+ uint32_t RESERVED6[580U];
+ __OM uint32_t STIR; /*!< Offset: 0xE00 ( /W) Software Trigger Interrupt Register */
+} NVIC_Type;
+
+/* Software Triggered Interrupt Register Definitions */
+#define NVIC_STIR_INTID_Pos 0U /*!< STIR: INTLINESNUM Position */
+#define NVIC_STIR_INTID_Msk (0x1FFUL /*<< NVIC_STIR_INTID_Pos*/) /*!< STIR: INTLINESNUM Mask */
+
+/*@} end of group CMSIS_NVIC */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_SCB System Control Block (SCB)
+ \brief Type definitions for the System Control Block Registers
+ @{
+ */
+
+/**
+ \brief Structure type to access the System Control Block (SCB).
+ */
+typedef struct
+{
+ __IM uint32_t CPUID; /*!< Offset: 0x000 (R/ ) CPUID Base Register */
+ __IOM uint32_t ICSR; /*!< Offset: 0x004 (R/W) Interrupt Control and State Register */
+ __IOM uint32_t VTOR; /*!< Offset: 0x008 (R/W) Vector Table Offset Register */
+ __IOM uint32_t AIRCR; /*!< Offset: 0x00C (R/W) Application Interrupt and Reset Control Register */
+ __IOM uint32_t SCR; /*!< Offset: 0x010 (R/W) System Control Register */
+ __IOM uint32_t CCR; /*!< Offset: 0x014 (R/W) Configuration Control Register */
+ __IOM uint8_t SHPR[12U]; /*!< Offset: 0x018 (R/W) System Handlers Priority Registers (4-7, 8-11, 12-15) */
+ __IOM uint32_t SHCSR; /*!< Offset: 0x024 (R/W) System Handler Control and State Register */
+ __IOM uint32_t CFSR; /*!< Offset: 0x028 (R/W) Configurable Fault Status Register */
+ __IOM uint32_t HFSR; /*!< Offset: 0x02C (R/W) HardFault Status Register */
+ __IOM uint32_t DFSR; /*!< Offset: 0x030 (R/W) Debug Fault Status Register */
+ __IOM uint32_t MMFAR; /*!< Offset: 0x034 (R/W) MemManage Fault Address Register */
+ __IOM uint32_t BFAR; /*!< Offset: 0x038 (R/W) BusFault Address Register */
+ __IOM uint32_t AFSR; /*!< Offset: 0x03C (R/W) Auxiliary Fault Status Register */
+ __IM uint32_t ID_PFR[2U]; /*!< Offset: 0x040 (R/ ) Processor Feature Register */
+ __IM uint32_t ID_DFR; /*!< Offset: 0x048 (R/ ) Debug Feature Register */
+ __IM uint32_t ID_ADR; /*!< Offset: 0x04C (R/ ) Auxiliary Feature Register */
+ __IM uint32_t ID_MMFR[4U]; /*!< Offset: 0x050 (R/ ) Memory Model Feature Register */
+ __IM uint32_t ID_ISAR[6U]; /*!< Offset: 0x060 (R/ ) Instruction Set Attributes Register */
+ __IM uint32_t CLIDR; /*!< Offset: 0x078 (R/ ) Cache Level ID register */
+ __IM uint32_t CTR; /*!< Offset: 0x07C (R/ ) Cache Type register */
+ __IM uint32_t CCSIDR; /*!< Offset: 0x080 (R/ ) Cache Size ID Register */
+ __IOM uint32_t CSSELR; /*!< Offset: 0x084 (R/W) Cache Size Selection Register */
+ __IOM uint32_t CPACR; /*!< Offset: 0x088 (R/W) Coprocessor Access Control Register */
+ __IOM uint32_t NSACR; /*!< Offset: 0x08C (R/W) Non-Secure Access Control Register */
+ uint32_t RESERVED3[92U];
+ __OM uint32_t STIR; /*!< Offset: 0x200 ( /W) Software Triggered Interrupt Register */
+ uint32_t RESERVED4[15U];
+ __IM uint32_t MVFR0; /*!< Offset: 0x240 (R/ ) Media and VFP Feature Register 0 */
+ __IM uint32_t MVFR1; /*!< Offset: 0x244 (R/ ) Media and VFP Feature Register 1 */
+ __IM uint32_t MVFR2; /*!< Offset: 0x248 (R/ ) Media and VFP Feature Register 2 */
+ uint32_t RESERVED5[1U];
+ __OM uint32_t ICIALLU; /*!< Offset: 0x250 ( /W) I-Cache Invalidate All to PoU */
+ uint32_t RESERVED6[1U];
+ __OM uint32_t ICIMVAU; /*!< Offset: 0x258 ( /W) I-Cache Invalidate by MVA to PoU */
+ __OM uint32_t DCIMVAC; /*!< Offset: 0x25C ( /W) D-Cache Invalidate by MVA to PoC */
+ __OM uint32_t DCISW; /*!< Offset: 0x260 ( /W) D-Cache Invalidate by Set-way */
+ __OM uint32_t DCCMVAU; /*!< Offset: 0x264 ( /W) D-Cache Clean by MVA to PoU */
+ __OM uint32_t DCCMVAC; /*!< Offset: 0x268 ( /W) D-Cache Clean by MVA to PoC */
+ __OM uint32_t DCCSW; /*!< Offset: 0x26C ( /W) D-Cache Clean by Set-way */
+ __OM uint32_t DCCIMVAC; /*!< Offset: 0x270 ( /W) D-Cache Clean and Invalidate by MVA to PoC */
+ __OM uint32_t DCCISW; /*!< Offset: 0x274 ( /W) D-Cache Clean and Invalidate by Set-way */
+} SCB_Type;
+
+/* SCB CPUID Register Definitions */
+#define SCB_CPUID_IMPLEMENTER_Pos 24U /*!< SCB CPUID: IMPLEMENTER Position */
+#define SCB_CPUID_IMPLEMENTER_Msk (0xFFUL << SCB_CPUID_IMPLEMENTER_Pos) /*!< SCB CPUID: IMPLEMENTER Mask */
+
+#define SCB_CPUID_VARIANT_Pos 20U /*!< SCB CPUID: VARIANT Position */
+#define SCB_CPUID_VARIANT_Msk (0xFUL << SCB_CPUID_VARIANT_Pos) /*!< SCB CPUID: VARIANT Mask */
+
+#define SCB_CPUID_ARCHITECTURE_Pos 16U /*!< SCB CPUID: ARCHITECTURE Position */
+#define SCB_CPUID_ARCHITECTURE_Msk (0xFUL << SCB_CPUID_ARCHITECTURE_Pos) /*!< SCB CPUID: ARCHITECTURE Mask */
+
+#define SCB_CPUID_PARTNO_Pos 4U /*!< SCB CPUID: PARTNO Position */
+#define SCB_CPUID_PARTNO_Msk (0xFFFUL << SCB_CPUID_PARTNO_Pos) /*!< SCB CPUID: PARTNO Mask */
+
+#define SCB_CPUID_REVISION_Pos 0U /*!< SCB CPUID: REVISION Position */
+#define SCB_CPUID_REVISION_Msk (0xFUL /*<< SCB_CPUID_REVISION_Pos*/) /*!< SCB CPUID: REVISION Mask */
+
+/* SCB Interrupt Control State Register Definitions */
+#define SCB_ICSR_PENDNMISET_Pos 31U /*!< SCB ICSR: PENDNMISET Position */
+#define SCB_ICSR_PENDNMISET_Msk (1UL << SCB_ICSR_PENDNMISET_Pos) /*!< SCB ICSR: PENDNMISET Mask */
+
+#define SCB_ICSR_NMIPENDSET_Pos SCB_ICSR_PENDNMISET_Pos /*!< SCB ICSR: NMIPENDSET Position, backward compatibility */
+#define SCB_ICSR_NMIPENDSET_Msk SCB_ICSR_PENDNMISET_Msk /*!< SCB ICSR: NMIPENDSET Mask, backward compatibility */
+
+#define SCB_ICSR_PENDNMICLR_Pos 30U /*!< SCB ICSR: PENDNMICLR Position */
+#define SCB_ICSR_PENDNMICLR_Msk (1UL << SCB_ICSR_PENDNMICLR_Pos) /*!< SCB ICSR: PENDNMICLR Mask */
+
+#define SCB_ICSR_PENDSVSET_Pos 28U /*!< SCB ICSR: PENDSVSET Position */
+#define SCB_ICSR_PENDSVSET_Msk (1UL << SCB_ICSR_PENDSVSET_Pos) /*!< SCB ICSR: PENDSVSET Mask */
+
+#define SCB_ICSR_PENDSVCLR_Pos 27U /*!< SCB ICSR: PENDSVCLR Position */
+#define SCB_ICSR_PENDSVCLR_Msk (1UL << SCB_ICSR_PENDSVCLR_Pos) /*!< SCB ICSR: PENDSVCLR Mask */
+
+#define SCB_ICSR_PENDSTSET_Pos 26U /*!< SCB ICSR: PENDSTSET Position */
+#define SCB_ICSR_PENDSTSET_Msk (1UL << SCB_ICSR_PENDSTSET_Pos) /*!< SCB ICSR: PENDSTSET Mask */
+
+#define SCB_ICSR_PENDSTCLR_Pos 25U /*!< SCB ICSR: PENDSTCLR Position */
+#define SCB_ICSR_PENDSTCLR_Msk (1UL << SCB_ICSR_PENDSTCLR_Pos) /*!< SCB ICSR: PENDSTCLR Mask */
+
+#define SCB_ICSR_STTNS_Pos 24U /*!< SCB ICSR: STTNS Position (Security Extension) */
+#define SCB_ICSR_STTNS_Msk (1UL << SCB_ICSR_STTNS_Pos) /*!< SCB ICSR: STTNS Mask (Security Extension) */
+
+#define SCB_ICSR_ISRPREEMPT_Pos 23U /*!< SCB ICSR: ISRPREEMPT Position */
+#define SCB_ICSR_ISRPREEMPT_Msk (1UL << SCB_ICSR_ISRPREEMPT_Pos) /*!< SCB ICSR: ISRPREEMPT Mask */
+
+#define SCB_ICSR_ISRPENDING_Pos 22U /*!< SCB ICSR: ISRPENDING Position */
+#define SCB_ICSR_ISRPENDING_Msk (1UL << SCB_ICSR_ISRPENDING_Pos) /*!< SCB ICSR: ISRPENDING Mask */
+
+#define SCB_ICSR_VECTPENDING_Pos 12U /*!< SCB ICSR: VECTPENDING Position */
+#define SCB_ICSR_VECTPENDING_Msk (0x1FFUL << SCB_ICSR_VECTPENDING_Pos) /*!< SCB ICSR: VECTPENDING Mask */
+
+#define SCB_ICSR_RETTOBASE_Pos 11U /*!< SCB ICSR: RETTOBASE Position */
+#define SCB_ICSR_RETTOBASE_Msk (1UL << SCB_ICSR_RETTOBASE_Pos) /*!< SCB ICSR: RETTOBASE Mask */
+
+#define SCB_ICSR_VECTACTIVE_Pos 0U /*!< SCB ICSR: VECTACTIVE Position */
+#define SCB_ICSR_VECTACTIVE_Msk (0x1FFUL /*<< SCB_ICSR_VECTACTIVE_Pos*/) /*!< SCB ICSR: VECTACTIVE Mask */
+
+/* SCB Vector Table Offset Register Definitions */
+#define SCB_VTOR_TBLOFF_Pos 7U /*!< SCB VTOR: TBLOFF Position */
+#define SCB_VTOR_TBLOFF_Msk (0x1FFFFFFUL << SCB_VTOR_TBLOFF_Pos) /*!< SCB VTOR: TBLOFF Mask */
+
+/* SCB Application Interrupt and Reset Control Register Definitions */
+#define SCB_AIRCR_VECTKEY_Pos 16U /*!< SCB AIRCR: VECTKEY Position */
+#define SCB_AIRCR_VECTKEY_Msk (0xFFFFUL << SCB_AIRCR_VECTKEY_Pos) /*!< SCB AIRCR: VECTKEY Mask */
+
+#define SCB_AIRCR_VECTKEYSTAT_Pos 16U /*!< SCB AIRCR: VECTKEYSTAT Position */
+#define SCB_AIRCR_VECTKEYSTAT_Msk (0xFFFFUL << SCB_AIRCR_VECTKEYSTAT_Pos) /*!< SCB AIRCR: VECTKEYSTAT Mask */
+
+#define SCB_AIRCR_ENDIANESS_Pos 15U /*!< SCB AIRCR: ENDIANESS Position */
+#define SCB_AIRCR_ENDIANESS_Msk (1UL << SCB_AIRCR_ENDIANESS_Pos) /*!< SCB AIRCR: ENDIANESS Mask */
+
+#define SCB_AIRCR_PRIS_Pos 14U /*!< SCB AIRCR: PRIS Position */
+#define SCB_AIRCR_PRIS_Msk (1UL << SCB_AIRCR_PRIS_Pos) /*!< SCB AIRCR: PRIS Mask */
+
+#define SCB_AIRCR_BFHFNMINS_Pos 13U /*!< SCB AIRCR: BFHFNMINS Position */
+#define SCB_AIRCR_BFHFNMINS_Msk (1UL << SCB_AIRCR_BFHFNMINS_Pos) /*!< SCB AIRCR: BFHFNMINS Mask */
+
+#define SCB_AIRCR_PRIGROUP_Pos 8U /*!< SCB AIRCR: PRIGROUP Position */
+#define SCB_AIRCR_PRIGROUP_Msk (7UL << SCB_AIRCR_PRIGROUP_Pos) /*!< SCB AIRCR: PRIGROUP Mask */
+
+#define SCB_AIRCR_SYSRESETREQS_Pos 3U /*!< SCB AIRCR: SYSRESETREQS Position */
+#define SCB_AIRCR_SYSRESETREQS_Msk (1UL << SCB_AIRCR_SYSRESETREQS_Pos) /*!< SCB AIRCR: SYSRESETREQS Mask */
+
+#define SCB_AIRCR_SYSRESETREQ_Pos 2U /*!< SCB AIRCR: SYSRESETREQ Position */
+#define SCB_AIRCR_SYSRESETREQ_Msk (1UL << SCB_AIRCR_SYSRESETREQ_Pos) /*!< SCB AIRCR: SYSRESETREQ Mask */
+
+#define SCB_AIRCR_VECTCLRACTIVE_Pos 1U /*!< SCB AIRCR: VECTCLRACTIVE Position */
+#define SCB_AIRCR_VECTCLRACTIVE_Msk (1UL << SCB_AIRCR_VECTCLRACTIVE_Pos) /*!< SCB AIRCR: VECTCLRACTIVE Mask */
+
+/* SCB System Control Register Definitions */
+#define SCB_SCR_SEVONPEND_Pos 4U /*!< SCB SCR: SEVONPEND Position */
+#define SCB_SCR_SEVONPEND_Msk (1UL << SCB_SCR_SEVONPEND_Pos) /*!< SCB SCR: SEVONPEND Mask */
+
+#define SCB_SCR_SLEEPDEEPS_Pos 3U /*!< SCB SCR: SLEEPDEEPS Position */
+#define SCB_SCR_SLEEPDEEPS_Msk (1UL << SCB_SCR_SLEEPDEEPS_Pos) /*!< SCB SCR: SLEEPDEEPS Mask */
+
+#define SCB_SCR_SLEEPDEEP_Pos 2U /*!< SCB SCR: SLEEPDEEP Position */
+#define SCB_SCR_SLEEPDEEP_Msk (1UL << SCB_SCR_SLEEPDEEP_Pos) /*!< SCB SCR: SLEEPDEEP Mask */
+
+#define SCB_SCR_SLEEPONEXIT_Pos 1U /*!< SCB SCR: SLEEPONEXIT Position */
+#define SCB_SCR_SLEEPONEXIT_Msk (1UL << SCB_SCR_SLEEPONEXIT_Pos) /*!< SCB SCR: SLEEPONEXIT Mask */
+
+/* SCB Configuration Control Register Definitions */
+#define SCB_CCR_BP_Pos 18U /*!< SCB CCR: BP Position */
+#define SCB_CCR_BP_Msk (1UL << SCB_CCR_BP_Pos) /*!< SCB CCR: BP Mask */
+
+#define SCB_CCR_IC_Pos 17U /*!< SCB CCR: IC Position */
+#define SCB_CCR_IC_Msk (1UL << SCB_CCR_IC_Pos) /*!< SCB CCR: IC Mask */
+
+#define SCB_CCR_DC_Pos 16U /*!< SCB CCR: DC Position */
+#define SCB_CCR_DC_Msk (1UL << SCB_CCR_DC_Pos) /*!< SCB CCR: DC Mask */
+
+#define SCB_CCR_STKOFHFNMIGN_Pos 10U /*!< SCB CCR: STKOFHFNMIGN Position */
+#define SCB_CCR_STKOFHFNMIGN_Msk (1UL << SCB_CCR_STKOFHFNMIGN_Pos) /*!< SCB CCR: STKOFHFNMIGN Mask */
+
+#define SCB_CCR_BFHFNMIGN_Pos 8U /*!< SCB CCR: BFHFNMIGN Position */
+#define SCB_CCR_BFHFNMIGN_Msk (1UL << SCB_CCR_BFHFNMIGN_Pos) /*!< SCB CCR: BFHFNMIGN Mask */
+
+#define SCB_CCR_DIV_0_TRP_Pos 4U /*!< SCB CCR: DIV_0_TRP Position */
+#define SCB_CCR_DIV_0_TRP_Msk (1UL << SCB_CCR_DIV_0_TRP_Pos) /*!< SCB CCR: DIV_0_TRP Mask */
+
+#define SCB_CCR_UNALIGN_TRP_Pos 3U /*!< SCB CCR: UNALIGN_TRP Position */
+#define SCB_CCR_UNALIGN_TRP_Msk (1UL << SCB_CCR_UNALIGN_TRP_Pos) /*!< SCB CCR: UNALIGN_TRP Mask */
+
+#define SCB_CCR_USERSETMPEND_Pos 1U /*!< SCB CCR: USERSETMPEND Position */
+#define SCB_CCR_USERSETMPEND_Msk (1UL << SCB_CCR_USERSETMPEND_Pos) /*!< SCB CCR: USERSETMPEND Mask */
+
+/* SCB System Handler Control and State Register Definitions */
+#define SCB_SHCSR_HARDFAULTPENDED_Pos 21U /*!< SCB SHCSR: HARDFAULTPENDED Position */
+#define SCB_SHCSR_HARDFAULTPENDED_Msk (1UL << SCB_SHCSR_HARDFAULTPENDED_Pos) /*!< SCB SHCSR: HARDFAULTPENDED Mask */
+
+#define SCB_SHCSR_SECUREFAULTPENDED_Pos 20U /*!< SCB SHCSR: SECUREFAULTPENDED Position */
+#define SCB_SHCSR_SECUREFAULTPENDED_Msk (1UL << SCB_SHCSR_SECUREFAULTPENDED_Pos) /*!< SCB SHCSR: SECUREFAULTPENDED Mask */
+
+#define SCB_SHCSR_SECUREFAULTENA_Pos 19U /*!< SCB SHCSR: SECUREFAULTENA Position */
+#define SCB_SHCSR_SECUREFAULTENA_Msk (1UL << SCB_SHCSR_SECUREFAULTENA_Pos) /*!< SCB SHCSR: SECUREFAULTENA Mask */
+
+#define SCB_SHCSR_USGFAULTENA_Pos 18U /*!< SCB SHCSR: USGFAULTENA Position */
+#define SCB_SHCSR_USGFAULTENA_Msk (1UL << SCB_SHCSR_USGFAULTENA_Pos) /*!< SCB SHCSR: USGFAULTENA Mask */
+
+#define SCB_SHCSR_BUSFAULTENA_Pos 17U /*!< SCB SHCSR: BUSFAULTENA Position */
+#define SCB_SHCSR_BUSFAULTENA_Msk (1UL << SCB_SHCSR_BUSFAULTENA_Pos) /*!< SCB SHCSR: BUSFAULTENA Mask */
+
+#define SCB_SHCSR_MEMFAULTENA_Pos 16U /*!< SCB SHCSR: MEMFAULTENA Position */
+#define SCB_SHCSR_MEMFAULTENA_Msk (1UL << SCB_SHCSR_MEMFAULTENA_Pos) /*!< SCB SHCSR: MEMFAULTENA Mask */
+
+#define SCB_SHCSR_SVCALLPENDED_Pos 15U /*!< SCB SHCSR: SVCALLPENDED Position */
+#define SCB_SHCSR_SVCALLPENDED_Msk (1UL << SCB_SHCSR_SVCALLPENDED_Pos) /*!< SCB SHCSR: SVCALLPENDED Mask */
+
+#define SCB_SHCSR_BUSFAULTPENDED_Pos 14U /*!< SCB SHCSR: BUSFAULTPENDED Position */
+#define SCB_SHCSR_BUSFAULTPENDED_Msk (1UL << SCB_SHCSR_BUSFAULTPENDED_Pos) /*!< SCB SHCSR: BUSFAULTPENDED Mask */
+
+#define SCB_SHCSR_MEMFAULTPENDED_Pos 13U /*!< SCB SHCSR: MEMFAULTPENDED Position */
+#define SCB_SHCSR_MEMFAULTPENDED_Msk (1UL << SCB_SHCSR_MEMFAULTPENDED_Pos) /*!< SCB SHCSR: MEMFAULTPENDED Mask */
+
+#define SCB_SHCSR_USGFAULTPENDED_Pos 12U /*!< SCB SHCSR: USGFAULTPENDED Position */
+#define SCB_SHCSR_USGFAULTPENDED_Msk (1UL << SCB_SHCSR_USGFAULTPENDED_Pos) /*!< SCB SHCSR: USGFAULTPENDED Mask */
+
+#define SCB_SHCSR_SYSTICKACT_Pos 11U /*!< SCB SHCSR: SYSTICKACT Position */
+#define SCB_SHCSR_SYSTICKACT_Msk (1UL << SCB_SHCSR_SYSTICKACT_Pos) /*!< SCB SHCSR: SYSTICKACT Mask */
+
+#define SCB_SHCSR_PENDSVACT_Pos 10U /*!< SCB SHCSR: PENDSVACT Position */
+#define SCB_SHCSR_PENDSVACT_Msk (1UL << SCB_SHCSR_PENDSVACT_Pos) /*!< SCB SHCSR: PENDSVACT Mask */
+
+#define SCB_SHCSR_MONITORACT_Pos 8U /*!< SCB SHCSR: MONITORACT Position */
+#define SCB_SHCSR_MONITORACT_Msk (1UL << SCB_SHCSR_MONITORACT_Pos) /*!< SCB SHCSR: MONITORACT Mask */
+
+#define SCB_SHCSR_SVCALLACT_Pos 7U /*!< SCB SHCSR: SVCALLACT Position */
+#define SCB_SHCSR_SVCALLACT_Msk (1UL << SCB_SHCSR_SVCALLACT_Pos) /*!< SCB SHCSR: SVCALLACT Mask */
+
+#define SCB_SHCSR_NMIACT_Pos 5U /*!< SCB SHCSR: NMIACT Position */
+#define SCB_SHCSR_NMIACT_Msk (1UL << SCB_SHCSR_NMIACT_Pos) /*!< SCB SHCSR: NMIACT Mask */
+
+#define SCB_SHCSR_SECUREFAULTACT_Pos 4U /*!< SCB SHCSR: SECUREFAULTACT Position */
+#define SCB_SHCSR_SECUREFAULTACT_Msk (1UL << SCB_SHCSR_SECUREFAULTACT_Pos) /*!< SCB SHCSR: SECUREFAULTACT Mask */
+
+#define SCB_SHCSR_USGFAULTACT_Pos 3U /*!< SCB SHCSR: USGFAULTACT Position */
+#define SCB_SHCSR_USGFAULTACT_Msk (1UL << SCB_SHCSR_USGFAULTACT_Pos) /*!< SCB SHCSR: USGFAULTACT Mask */
+
+#define SCB_SHCSR_HARDFAULTACT_Pos 2U /*!< SCB SHCSR: HARDFAULTACT Position */
+#define SCB_SHCSR_HARDFAULTACT_Msk (1UL << SCB_SHCSR_HARDFAULTACT_Pos) /*!< SCB SHCSR: HARDFAULTACT Mask */
+
+#define SCB_SHCSR_BUSFAULTACT_Pos 1U /*!< SCB SHCSR: BUSFAULTACT Position */
+#define SCB_SHCSR_BUSFAULTACT_Msk (1UL << SCB_SHCSR_BUSFAULTACT_Pos) /*!< SCB SHCSR: BUSFAULTACT Mask */
+
+#define SCB_SHCSR_MEMFAULTACT_Pos 0U /*!< SCB SHCSR: MEMFAULTACT Position */
+#define SCB_SHCSR_MEMFAULTACT_Msk (1UL /*<< SCB_SHCSR_MEMFAULTACT_Pos*/) /*!< SCB SHCSR: MEMFAULTACT Mask */
+
+/* SCB Configurable Fault Status Register Definitions */
+#define SCB_CFSR_USGFAULTSR_Pos 16U /*!< SCB CFSR: Usage Fault Status Register Position */
+#define SCB_CFSR_USGFAULTSR_Msk (0xFFFFUL << SCB_CFSR_USGFAULTSR_Pos) /*!< SCB CFSR: Usage Fault Status Register Mask */
+
+#define SCB_CFSR_BUSFAULTSR_Pos 8U /*!< SCB CFSR: Bus Fault Status Register Position */
+#define SCB_CFSR_BUSFAULTSR_Msk (0xFFUL << SCB_CFSR_BUSFAULTSR_Pos) /*!< SCB CFSR: Bus Fault Status Register Mask */
+
+#define SCB_CFSR_MEMFAULTSR_Pos 0U /*!< SCB CFSR: Memory Manage Fault Status Register Position */
+#define SCB_CFSR_MEMFAULTSR_Msk (0xFFUL /*<< SCB_CFSR_MEMFAULTSR_Pos*/) /*!< SCB CFSR: Memory Manage Fault Status Register Mask */
+
+/* MemManage Fault Status Register (part of SCB Configurable Fault Status Register) */
+#define SCB_CFSR_MMARVALID_Pos (SCB_SHCSR_MEMFAULTACT_Pos + 7U) /*!< SCB CFSR (MMFSR): MMARVALID Position */
+#define SCB_CFSR_MMARVALID_Msk (1UL << SCB_CFSR_MMARVALID_Pos) /*!< SCB CFSR (MMFSR): MMARVALID Mask */
+
+#define SCB_CFSR_MLSPERR_Pos (SCB_SHCSR_MEMFAULTACT_Pos + 5U) /*!< SCB CFSR (MMFSR): MLSPERR Position */
+#define SCB_CFSR_MLSPERR_Msk (1UL << SCB_CFSR_MLSPERR_Pos) /*!< SCB CFSR (MMFSR): MLSPERR Mask */
+
+#define SCB_CFSR_MSTKERR_Pos (SCB_SHCSR_MEMFAULTACT_Pos + 4U) /*!< SCB CFSR (MMFSR): MSTKERR Position */
+#define SCB_CFSR_MSTKERR_Msk (1UL << SCB_CFSR_MSTKERR_Pos) /*!< SCB CFSR (MMFSR): MSTKERR Mask */
+
+#define SCB_CFSR_MUNSTKERR_Pos (SCB_SHCSR_MEMFAULTACT_Pos + 3U) /*!< SCB CFSR (MMFSR): MUNSTKERR Position */
+#define SCB_CFSR_MUNSTKERR_Msk (1UL << SCB_CFSR_MUNSTKERR_Pos) /*!< SCB CFSR (MMFSR): MUNSTKERR Mask */
+
+#define SCB_CFSR_DACCVIOL_Pos (SCB_SHCSR_MEMFAULTACT_Pos + 1U) /*!< SCB CFSR (MMFSR): DACCVIOL Position */
+#define SCB_CFSR_DACCVIOL_Msk (1UL << SCB_CFSR_DACCVIOL_Pos) /*!< SCB CFSR (MMFSR): DACCVIOL Mask */
+
+#define SCB_CFSR_IACCVIOL_Pos (SCB_SHCSR_MEMFAULTACT_Pos + 0U) /*!< SCB CFSR (MMFSR): IACCVIOL Position */
+#define SCB_CFSR_IACCVIOL_Msk (1UL /*<< SCB_CFSR_IACCVIOL_Pos*/) /*!< SCB CFSR (MMFSR): IACCVIOL Mask */
+
+/* BusFault Status Register (part of SCB Configurable Fault Status Register) */
+#define SCB_CFSR_BFARVALID_Pos (SCB_CFSR_BUSFAULTSR_Pos + 7U) /*!< SCB CFSR (BFSR): BFARVALID Position */
+#define SCB_CFSR_BFARVALID_Msk (1UL << SCB_CFSR_BFARVALID_Pos) /*!< SCB CFSR (BFSR): BFARVALID Mask */
+
+#define SCB_CFSR_LSPERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 5U) /*!< SCB CFSR (BFSR): LSPERR Position */
+#define SCB_CFSR_LSPERR_Msk (1UL << SCB_CFSR_LSPERR_Pos) /*!< SCB CFSR (BFSR): LSPERR Mask */
+
+#define SCB_CFSR_STKERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 4U) /*!< SCB CFSR (BFSR): STKERR Position */
+#define SCB_CFSR_STKERR_Msk (1UL << SCB_CFSR_STKERR_Pos) /*!< SCB CFSR (BFSR): STKERR Mask */
+
+#define SCB_CFSR_UNSTKERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 3U) /*!< SCB CFSR (BFSR): UNSTKERR Position */
+#define SCB_CFSR_UNSTKERR_Msk (1UL << SCB_CFSR_UNSTKERR_Pos) /*!< SCB CFSR (BFSR): UNSTKERR Mask */
+
+#define SCB_CFSR_IMPRECISERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 2U) /*!< SCB CFSR (BFSR): IMPRECISERR Position */
+#define SCB_CFSR_IMPRECISERR_Msk (1UL << SCB_CFSR_IMPRECISERR_Pos) /*!< SCB CFSR (BFSR): IMPRECISERR Mask */
+
+#define SCB_CFSR_PRECISERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 1U) /*!< SCB CFSR (BFSR): PRECISERR Position */
+#define SCB_CFSR_PRECISERR_Msk (1UL << SCB_CFSR_PRECISERR_Pos) /*!< SCB CFSR (BFSR): PRECISERR Mask */
+
+#define SCB_CFSR_IBUSERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 0U) /*!< SCB CFSR (BFSR): IBUSERR Position */
+#define SCB_CFSR_IBUSERR_Msk (1UL << SCB_CFSR_IBUSERR_Pos) /*!< SCB CFSR (BFSR): IBUSERR Mask */
+
+/* UsageFault Status Register (part of SCB Configurable Fault Status Register) */
+#define SCB_CFSR_DIVBYZERO_Pos (SCB_CFSR_USGFAULTSR_Pos + 9U) /*!< SCB CFSR (UFSR): DIVBYZERO Position */
+#define SCB_CFSR_DIVBYZERO_Msk (1UL << SCB_CFSR_DIVBYZERO_Pos) /*!< SCB CFSR (UFSR): DIVBYZERO Mask */
+
+#define SCB_CFSR_UNALIGNED_Pos (SCB_CFSR_USGFAULTSR_Pos + 8U) /*!< SCB CFSR (UFSR): UNALIGNED Position */
+#define SCB_CFSR_UNALIGNED_Msk (1UL << SCB_CFSR_UNALIGNED_Pos) /*!< SCB CFSR (UFSR): UNALIGNED Mask */
+
+#define SCB_CFSR_STKOF_Pos (SCB_CFSR_USGFAULTSR_Pos + 4U) /*!< SCB CFSR (UFSR): STKOF Position */
+#define SCB_CFSR_STKOF_Msk (1UL << SCB_CFSR_STKOF_Pos) /*!< SCB CFSR (UFSR): STKOF Mask */
+
+#define SCB_CFSR_NOCP_Pos (SCB_CFSR_USGFAULTSR_Pos + 3U) /*!< SCB CFSR (UFSR): NOCP Position */
+#define SCB_CFSR_NOCP_Msk (1UL << SCB_CFSR_NOCP_Pos) /*!< SCB CFSR (UFSR): NOCP Mask */
+
+#define SCB_CFSR_INVPC_Pos (SCB_CFSR_USGFAULTSR_Pos + 2U) /*!< SCB CFSR (UFSR): INVPC Position */
+#define SCB_CFSR_INVPC_Msk (1UL << SCB_CFSR_INVPC_Pos) /*!< SCB CFSR (UFSR): INVPC Mask */
+
+#define SCB_CFSR_INVSTATE_Pos (SCB_CFSR_USGFAULTSR_Pos + 1U) /*!< SCB CFSR (UFSR): INVSTATE Position */
+#define SCB_CFSR_INVSTATE_Msk (1UL << SCB_CFSR_INVSTATE_Pos) /*!< SCB CFSR (UFSR): INVSTATE Mask */
+
+#define SCB_CFSR_UNDEFINSTR_Pos (SCB_CFSR_USGFAULTSR_Pos + 0U) /*!< SCB CFSR (UFSR): UNDEFINSTR Position */
+#define SCB_CFSR_UNDEFINSTR_Msk (1UL << SCB_CFSR_UNDEFINSTR_Pos) /*!< SCB CFSR (UFSR): UNDEFINSTR Mask */
+
+/* SCB Hard Fault Status Register Definitions */
+#define SCB_HFSR_DEBUGEVT_Pos 31U /*!< SCB HFSR: DEBUGEVT Position */
+#define SCB_HFSR_DEBUGEVT_Msk (1UL << SCB_HFSR_DEBUGEVT_Pos) /*!< SCB HFSR: DEBUGEVT Mask */
+
+#define SCB_HFSR_FORCED_Pos 30U /*!< SCB HFSR: FORCED Position */
+#define SCB_HFSR_FORCED_Msk (1UL << SCB_HFSR_FORCED_Pos) /*!< SCB HFSR: FORCED Mask */
+
+#define SCB_HFSR_VECTTBL_Pos 1U /*!< SCB HFSR: VECTTBL Position */
+#define SCB_HFSR_VECTTBL_Msk (1UL << SCB_HFSR_VECTTBL_Pos) /*!< SCB HFSR: VECTTBL Mask */
+
+/* SCB Debug Fault Status Register Definitions */
+#define SCB_DFSR_EXTERNAL_Pos 4U /*!< SCB DFSR: EXTERNAL Position */
+#define SCB_DFSR_EXTERNAL_Msk (1UL << SCB_DFSR_EXTERNAL_Pos) /*!< SCB DFSR: EXTERNAL Mask */
+
+#define SCB_DFSR_VCATCH_Pos 3U /*!< SCB DFSR: VCATCH Position */
+#define SCB_DFSR_VCATCH_Msk (1UL << SCB_DFSR_VCATCH_Pos) /*!< SCB DFSR: VCATCH Mask */
+
+#define SCB_DFSR_DWTTRAP_Pos 2U /*!< SCB DFSR: DWTTRAP Position */
+#define SCB_DFSR_DWTTRAP_Msk (1UL << SCB_DFSR_DWTTRAP_Pos) /*!< SCB DFSR: DWTTRAP Mask */
+
+#define SCB_DFSR_BKPT_Pos 1U /*!< SCB DFSR: BKPT Position */
+#define SCB_DFSR_BKPT_Msk (1UL << SCB_DFSR_BKPT_Pos) /*!< SCB DFSR: BKPT Mask */
+
+#define SCB_DFSR_HALTED_Pos 0U /*!< SCB DFSR: HALTED Position */
+#define SCB_DFSR_HALTED_Msk (1UL /*<< SCB_DFSR_HALTED_Pos*/) /*!< SCB DFSR: HALTED Mask */
+
+/* SCB Non-Secure Access Control Register Definitions */
+#define SCB_NSACR_CP11_Pos 11U /*!< SCB NSACR: CP11 Position */
+#define SCB_NSACR_CP11_Msk (1UL << SCB_NSACR_CP11_Pos) /*!< SCB NSACR: CP11 Mask */
+
+#define SCB_NSACR_CP10_Pos 10U /*!< SCB NSACR: CP10 Position */
+#define SCB_NSACR_CP10_Msk (1UL << SCB_NSACR_CP10_Pos) /*!< SCB NSACR: CP10 Mask */
+
+#define SCB_NSACR_CPn_Pos 0U /*!< SCB NSACR: CPn Position */
+#define SCB_NSACR_CPn_Msk (1UL /*<< SCB_NSACR_CPn_Pos*/) /*!< SCB NSACR: CPn Mask */
+
+/* SCB Cache Level ID Register Definitions */
+#define SCB_CLIDR_LOUU_Pos 27U /*!< SCB CLIDR: LoUU Position */
+#define SCB_CLIDR_LOUU_Msk (7UL << SCB_CLIDR_LOUU_Pos) /*!< SCB CLIDR: LoUU Mask */
+
+#define SCB_CLIDR_LOC_Pos 24U /*!< SCB CLIDR: LoC Position */
+#define SCB_CLIDR_LOC_Msk (7UL << SCB_CLIDR_LOC_Pos) /*!< SCB CLIDR: LoC Mask */
+
+/* SCB Cache Type Register Definitions */
+#define SCB_CTR_FORMAT_Pos 29U /*!< SCB CTR: Format Position */
+#define SCB_CTR_FORMAT_Msk (7UL << SCB_CTR_FORMAT_Pos) /*!< SCB CTR: Format Mask */
+
+#define SCB_CTR_CWG_Pos 24U /*!< SCB CTR: CWG Position */
+#define SCB_CTR_CWG_Msk (0xFUL << SCB_CTR_CWG_Pos) /*!< SCB CTR: CWG Mask */
+
+#define SCB_CTR_ERG_Pos 20U /*!< SCB CTR: ERG Position */
+#define SCB_CTR_ERG_Msk (0xFUL << SCB_CTR_ERG_Pos) /*!< SCB CTR: ERG Mask */
+
+#define SCB_CTR_DMINLINE_Pos 16U /*!< SCB CTR: DminLine Position */
+#define SCB_CTR_DMINLINE_Msk (0xFUL << SCB_CTR_DMINLINE_Pos) /*!< SCB CTR: DminLine Mask */
+
+#define SCB_CTR_IMINLINE_Pos 0U /*!< SCB CTR: ImInLine Position */
+#define SCB_CTR_IMINLINE_Msk (0xFUL /*<< SCB_CTR_IMINLINE_Pos*/) /*!< SCB CTR: ImInLine Mask */
+
+/* SCB Cache Size ID Register Definitions */
+#define SCB_CCSIDR_WT_Pos 31U /*!< SCB CCSIDR: WT Position */
+#define SCB_CCSIDR_WT_Msk (1UL << SCB_CCSIDR_WT_Pos) /*!< SCB CCSIDR: WT Mask */
+
+#define SCB_CCSIDR_WB_Pos 30U /*!< SCB CCSIDR: WB Position */
+#define SCB_CCSIDR_WB_Msk (1UL << SCB_CCSIDR_WB_Pos) /*!< SCB CCSIDR: WB Mask */
+
+#define SCB_CCSIDR_RA_Pos 29U /*!< SCB CCSIDR: RA Position */
+#define SCB_CCSIDR_RA_Msk (1UL << SCB_CCSIDR_RA_Pos) /*!< SCB CCSIDR: RA Mask */
+
+#define SCB_CCSIDR_WA_Pos 28U /*!< SCB CCSIDR: WA Position */
+#define SCB_CCSIDR_WA_Msk (1UL << SCB_CCSIDR_WA_Pos) /*!< SCB CCSIDR: WA Mask */
+
+#define SCB_CCSIDR_NUMSETS_Pos 13U /*!< SCB CCSIDR: NumSets Position */
+#define SCB_CCSIDR_NUMSETS_Msk (0x7FFFUL << SCB_CCSIDR_NUMSETS_Pos) /*!< SCB CCSIDR: NumSets Mask */
+
+#define SCB_CCSIDR_ASSOCIATIVITY_Pos 3U /*!< SCB CCSIDR: Associativity Position */
+#define SCB_CCSIDR_ASSOCIATIVITY_Msk (0x3FFUL << SCB_CCSIDR_ASSOCIATIVITY_Pos) /*!< SCB CCSIDR: Associativity Mask */
+
+#define SCB_CCSIDR_LINESIZE_Pos 0U /*!< SCB CCSIDR: LineSize Position */
+#define SCB_CCSIDR_LINESIZE_Msk (7UL /*<< SCB_CCSIDR_LINESIZE_Pos*/) /*!< SCB CCSIDR: LineSize Mask */
+
+/* SCB Cache Size Selection Register Definitions */
+#define SCB_CSSELR_LEVEL_Pos 1U /*!< SCB CSSELR: Level Position */
+#define SCB_CSSELR_LEVEL_Msk (7UL << SCB_CSSELR_LEVEL_Pos) /*!< SCB CSSELR: Level Mask */
+
+#define SCB_CSSELR_IND_Pos 0U /*!< SCB CSSELR: InD Position */
+#define SCB_CSSELR_IND_Msk (1UL /*<< SCB_CSSELR_IND_Pos*/) /*!< SCB CSSELR: InD Mask */
+
+/* SCB Software Triggered Interrupt Register Definitions */
+#define SCB_STIR_INTID_Pos 0U /*!< SCB STIR: INTID Position */
+#define SCB_STIR_INTID_Msk (0x1FFUL /*<< SCB_STIR_INTID_Pos*/) /*!< SCB STIR: INTID Mask */
+
+/* SCB D-Cache Invalidate by Set-way Register Definitions */
+#define SCB_DCISW_WAY_Pos 30U /*!< SCB DCISW: Way Position */
+#define SCB_DCISW_WAY_Msk (3UL << SCB_DCISW_WAY_Pos) /*!< SCB DCISW: Way Mask */
+
+#define SCB_DCISW_SET_Pos 5U /*!< SCB DCISW: Set Position */
+#define SCB_DCISW_SET_Msk (0x1FFUL << SCB_DCISW_SET_Pos) /*!< SCB DCISW: Set Mask */
+
+/* SCB D-Cache Clean by Set-way Register Definitions */
+#define SCB_DCCSW_WAY_Pos 30U /*!< SCB DCCSW: Way Position */
+#define SCB_DCCSW_WAY_Msk (3UL << SCB_DCCSW_WAY_Pos) /*!< SCB DCCSW: Way Mask */
+
+#define SCB_DCCSW_SET_Pos 5U /*!< SCB DCCSW: Set Position */
+#define SCB_DCCSW_SET_Msk (0x1FFUL << SCB_DCCSW_SET_Pos) /*!< SCB DCCSW: Set Mask */
+
+/* SCB D-Cache Clean and Invalidate by Set-way Register Definitions */
+#define SCB_DCCISW_WAY_Pos 30U /*!< SCB DCCISW: Way Position */
+#define SCB_DCCISW_WAY_Msk (3UL << SCB_DCCISW_WAY_Pos) /*!< SCB DCCISW: Way Mask */
+
+#define SCB_DCCISW_SET_Pos 5U /*!< SCB DCCISW: Set Position */
+#define SCB_DCCISW_SET_Msk (0x1FFUL << SCB_DCCISW_SET_Pos) /*!< SCB DCCISW: Set Mask */
+
+/*@} end of group CMSIS_SCB */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_SCnSCB System Controls not in SCB (SCnSCB)
+ \brief Type definitions for the System Control and ID Register not in the SCB
+ @{
+ */
+
+/**
+ \brief Structure type to access the System Control and ID Register not in the SCB.
+ */
+typedef struct
+{
+ uint32_t RESERVED0[1U];
+ __IM uint32_t ICTR; /*!< Offset: 0x004 (R/ ) Interrupt Controller Type Register */
+ __IOM uint32_t ACTLR; /*!< Offset: 0x008 (R/W) Auxiliary Control Register */
+ __IOM uint32_t CPPWR; /*!< Offset: 0x00C (R/W) Coprocessor Power Control Register */
+} SCnSCB_Type;
+
+/* Interrupt Controller Type Register Definitions */
+#define SCnSCB_ICTR_INTLINESNUM_Pos 0U /*!< ICTR: INTLINESNUM Position */
+#define SCnSCB_ICTR_INTLINESNUM_Msk (0xFUL /*<< SCnSCB_ICTR_INTLINESNUM_Pos*/) /*!< ICTR: INTLINESNUM Mask */
+
+/*@} end of group CMSIS_SCnotSCB */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_SysTick System Tick Timer (SysTick)
+ \brief Type definitions for the System Timer Registers.
+ @{
+ */
+
+/**
+ \brief Structure type to access the System Timer (SysTick).
+ */
+typedef struct
+{
+ __IOM uint32_t CTRL; /*!< Offset: 0x000 (R/W) SysTick Control and Status Register */
+ __IOM uint32_t LOAD; /*!< Offset: 0x004 (R/W) SysTick Reload Value Register */
+ __IOM uint32_t VAL; /*!< Offset: 0x008 (R/W) SysTick Current Value Register */
+ __IM uint32_t CALIB; /*!< Offset: 0x00C (R/ ) SysTick Calibration Register */
+} SysTick_Type;
+
+/* SysTick Control / Status Register Definitions */
+#define SysTick_CTRL_COUNTFLAG_Pos 16U /*!< SysTick CTRL: COUNTFLAG Position */
+#define SysTick_CTRL_COUNTFLAG_Msk (1UL << SysTick_CTRL_COUNTFLAG_Pos) /*!< SysTick CTRL: COUNTFLAG Mask */
+
+#define SysTick_CTRL_CLKSOURCE_Pos 2U /*!< SysTick CTRL: CLKSOURCE Position */
+#define SysTick_CTRL_CLKSOURCE_Msk (1UL << SysTick_CTRL_CLKSOURCE_Pos) /*!< SysTick CTRL: CLKSOURCE Mask */
+
+#define SysTick_CTRL_TICKINT_Pos 1U /*!< SysTick CTRL: TICKINT Position */
+#define SysTick_CTRL_TICKINT_Msk (1UL << SysTick_CTRL_TICKINT_Pos) /*!< SysTick CTRL: TICKINT Mask */
+
+#define SysTick_CTRL_ENABLE_Pos 0U /*!< SysTick CTRL: ENABLE Position */
+#define SysTick_CTRL_ENABLE_Msk (1UL /*<< SysTick_CTRL_ENABLE_Pos*/) /*!< SysTick CTRL: ENABLE Mask */
+
+/* SysTick Reload Register Definitions */
+#define SysTick_LOAD_RELOAD_Pos 0U /*!< SysTick LOAD: RELOAD Position */
+#define SysTick_LOAD_RELOAD_Msk (0xFFFFFFUL /*<< SysTick_LOAD_RELOAD_Pos*/) /*!< SysTick LOAD: RELOAD Mask */
+
+/* SysTick Current Register Definitions */
+#define SysTick_VAL_CURRENT_Pos 0U /*!< SysTick VAL: CURRENT Position */
+#define SysTick_VAL_CURRENT_Msk (0xFFFFFFUL /*<< SysTick_VAL_CURRENT_Pos*/) /*!< SysTick VAL: CURRENT Mask */
+
+/* SysTick Calibration Register Definitions */
+#define SysTick_CALIB_NOREF_Pos 31U /*!< SysTick CALIB: NOREF Position */
+#define SysTick_CALIB_NOREF_Msk (1UL << SysTick_CALIB_NOREF_Pos) /*!< SysTick CALIB: NOREF Mask */
+
+#define SysTick_CALIB_SKEW_Pos 30U /*!< SysTick CALIB: SKEW Position */
+#define SysTick_CALIB_SKEW_Msk (1UL << SysTick_CALIB_SKEW_Pos) /*!< SysTick CALIB: SKEW Mask */
+
+#define SysTick_CALIB_TENMS_Pos 0U /*!< SysTick CALIB: TENMS Position */
+#define SysTick_CALIB_TENMS_Msk (0xFFFFFFUL /*<< SysTick_CALIB_TENMS_Pos*/) /*!< SysTick CALIB: TENMS Mask */
+
+/*@} end of group CMSIS_SysTick */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_ITM Instrumentation Trace Macrocell (ITM)
+ \brief Type definitions for the Instrumentation Trace Macrocell (ITM)
+ @{
+ */
+
+/**
+ \brief Structure type to access the Instrumentation Trace Macrocell Register (ITM).
+ */
+typedef struct
+{
+ __OM union
+ {
+ __OM uint8_t u8; /*!< Offset: 0x000 ( /W) ITM Stimulus Port 8-bit */
+ __OM uint16_t u16; /*!< Offset: 0x000 ( /W) ITM Stimulus Port 16-bit */
+ __OM uint32_t u32; /*!< Offset: 0x000 ( /W) ITM Stimulus Port 32-bit */
+ } PORT [32U]; /*!< Offset: 0x000 ( /W) ITM Stimulus Port Registers */
+ uint32_t RESERVED0[864U];
+ __IOM uint32_t TER; /*!< Offset: 0xE00 (R/W) ITM Trace Enable Register */
+ uint32_t RESERVED1[15U];
+ __IOM uint32_t TPR; /*!< Offset: 0xE40 (R/W) ITM Trace Privilege Register */
+ uint32_t RESERVED2[15U];
+ __IOM uint32_t TCR; /*!< Offset: 0xE80 (R/W) ITM Trace Control Register */
+ uint32_t RESERVED3[32U];
+ uint32_t RESERVED4[43U];
+ __OM uint32_t LAR; /*!< Offset: 0xFB0 ( /W) ITM Lock Access Register */
+ __IM uint32_t LSR; /*!< Offset: 0xFB4 (R/ ) ITM Lock Status Register */
+ uint32_t RESERVED5[1U];
+ __IM uint32_t DEVARCH; /*!< Offset: 0xFBC (R/ ) ITM Device Architecture Register */
+ uint32_t RESERVED6[4U];
+ __IM uint32_t PID4; /*!< Offset: 0xFD0 (R/ ) ITM Peripheral Identification Register #4 */
+ __IM uint32_t PID5; /*!< Offset: 0xFD4 (R/ ) ITM Peripheral Identification Register #5 */
+ __IM uint32_t PID6; /*!< Offset: 0xFD8 (R/ ) ITM Peripheral Identification Register #6 */
+ __IM uint32_t PID7; /*!< Offset: 0xFDC (R/ ) ITM Peripheral Identification Register #7 */
+ __IM uint32_t PID0; /*!< Offset: 0xFE0 (R/ ) ITM Peripheral Identification Register #0 */
+ __IM uint32_t PID1; /*!< Offset: 0xFE4 (R/ ) ITM Peripheral Identification Register #1 */
+ __IM uint32_t PID2; /*!< Offset: 0xFE8 (R/ ) ITM Peripheral Identification Register #2 */
+ __IM uint32_t PID3; /*!< Offset: 0xFEC (R/ ) ITM Peripheral Identification Register #3 */
+ __IM uint32_t CID0; /*!< Offset: 0xFF0 (R/ ) ITM Component Identification Register #0 */
+ __IM uint32_t CID1; /*!< Offset: 0xFF4 (R/ ) ITM Component Identification Register #1 */
+ __IM uint32_t CID2; /*!< Offset: 0xFF8 (R/ ) ITM Component Identification Register #2 */
+ __IM uint32_t CID3; /*!< Offset: 0xFFC (R/ ) ITM Component Identification Register #3 */
+} ITM_Type;
+
+/* ITM Stimulus Port Register Definitions */
+#define ITM_STIM_DISABLED_Pos 1U /*!< ITM STIM: DISABLED Position */
+#define ITM_STIM_DISABLED_Msk (0x1UL << ITM_STIM_DISABLED_Pos) /*!< ITM STIM: DISABLED Mask */
+
+#define ITM_STIM_FIFOREADY_Pos 0U /*!< ITM STIM: FIFOREADY Position */
+#define ITM_STIM_FIFOREADY_Msk (0x1UL /*<< ITM_STIM_FIFOREADY_Pos*/) /*!< ITM STIM: FIFOREADY Mask */
+
+/* ITM Trace Privilege Register Definitions */
+#define ITM_TPR_PRIVMASK_Pos 0U /*!< ITM TPR: PRIVMASK Position */
+#define ITM_TPR_PRIVMASK_Msk (0xFFFFFFFFUL /*<< ITM_TPR_PRIVMASK_Pos*/) /*!< ITM TPR: PRIVMASK Mask */
+
+/* ITM Trace Control Register Definitions */
+#define ITM_TCR_BUSY_Pos 23U /*!< ITM TCR: BUSY Position */
+#define ITM_TCR_BUSY_Msk (1UL << ITM_TCR_BUSY_Pos) /*!< ITM TCR: BUSY Mask */
+
+#define ITM_TCR_TRACEBUSID_Pos 16U /*!< ITM TCR: ATBID Position */
+#define ITM_TCR_TRACEBUSID_Msk (0x7FUL << ITM_TCR_TRACEBUSID_Pos) /*!< ITM TCR: ATBID Mask */
+
+#define ITM_TCR_GTSFREQ_Pos 10U /*!< ITM TCR: Global timestamp frequency Position */
+#define ITM_TCR_GTSFREQ_Msk (3UL << ITM_TCR_GTSFREQ_Pos) /*!< ITM TCR: Global timestamp frequency Mask */
+
+#define ITM_TCR_TSPRESCALE_Pos 8U /*!< ITM TCR: TSPRESCALE Position */
+#define ITM_TCR_TSPRESCALE_Msk (3UL << ITM_TCR_TSPRESCALE_Pos) /*!< ITM TCR: TSPRESCALE Mask */
+
+#define ITM_TCR_STALLENA_Pos 5U /*!< ITM TCR: STALLENA Position */
+#define ITM_TCR_STALLENA_Msk (1UL << ITM_TCR_STALLENA_Pos) /*!< ITM TCR: STALLENA Mask */
+
+#define ITM_TCR_SWOENA_Pos 4U /*!< ITM TCR: SWOENA Position */
+#define ITM_TCR_SWOENA_Msk (1UL << ITM_TCR_SWOENA_Pos) /*!< ITM TCR: SWOENA Mask */
+
+#define ITM_TCR_DWTENA_Pos 3U /*!< ITM TCR: DWTENA Position */
+#define ITM_TCR_DWTENA_Msk (1UL << ITM_TCR_DWTENA_Pos) /*!< ITM TCR: DWTENA Mask */
+
+#define ITM_TCR_SYNCENA_Pos 2U /*!< ITM TCR: SYNCENA Position */
+#define ITM_TCR_SYNCENA_Msk (1UL << ITM_TCR_SYNCENA_Pos) /*!< ITM TCR: SYNCENA Mask */
+
+#define ITM_TCR_TSENA_Pos 1U /*!< ITM TCR: TSENA Position */
+#define ITM_TCR_TSENA_Msk (1UL << ITM_TCR_TSENA_Pos) /*!< ITM TCR: TSENA Mask */
+
+#define ITM_TCR_ITMENA_Pos 0U /*!< ITM TCR: ITM Enable bit Position */
+#define ITM_TCR_ITMENA_Msk (1UL /*<< ITM_TCR_ITMENA_Pos*/) /*!< ITM TCR: ITM Enable bit Mask */
+
+/* ITM Lock Status Register Definitions */
+#define ITM_LSR_ByteAcc_Pos 2U /*!< ITM LSR: ByteAcc Position */
+#define ITM_LSR_ByteAcc_Msk (1UL << ITM_LSR_ByteAcc_Pos) /*!< ITM LSR: ByteAcc Mask */
+
+#define ITM_LSR_Access_Pos 1U /*!< ITM LSR: Access Position */
+#define ITM_LSR_Access_Msk (1UL << ITM_LSR_Access_Pos) /*!< ITM LSR: Access Mask */
+
+#define ITM_LSR_Present_Pos 0U /*!< ITM LSR: Present Position */
+#define ITM_LSR_Present_Msk (1UL /*<< ITM_LSR_Present_Pos*/) /*!< ITM LSR: Present Mask */
+
+/*@}*/ /* end of group CMSIS_ITM */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_DWT Data Watchpoint and Trace (DWT)
+ \brief Type definitions for the Data Watchpoint and Trace (DWT)
+ @{
+ */
+
+/**
+ \brief Structure type to access the Data Watchpoint and Trace Register (DWT).
+ */
+typedef struct
+{
+ __IOM uint32_t CTRL; /*!< Offset: 0x000 (R/W) Control Register */
+ __IOM uint32_t CYCCNT; /*!< Offset: 0x004 (R/W) Cycle Count Register */
+ __IOM uint32_t CPICNT; /*!< Offset: 0x008 (R/W) CPI Count Register */
+ __IOM uint32_t EXCCNT; /*!< Offset: 0x00C (R/W) Exception Overhead Count Register */
+ __IOM uint32_t SLEEPCNT; /*!< Offset: 0x010 (R/W) Sleep Count Register */
+ __IOM uint32_t LSUCNT; /*!< Offset: 0x014 (R/W) LSU Count Register */
+ __IOM uint32_t FOLDCNT; /*!< Offset: 0x018 (R/W) Folded-instruction Count Register */
+ __IM uint32_t PCSR; /*!< Offset: 0x01C (R/ ) Program Counter Sample Register */
+ __IOM uint32_t COMP0; /*!< Offset: 0x020 (R/W) Comparator Register 0 */
+ uint32_t RESERVED1[1U];
+ __IOM uint32_t FUNCTION0; /*!< Offset: 0x028 (R/W) Function Register 0 */
+ uint32_t RESERVED2[1U];
+ __IOM uint32_t COMP1; /*!< Offset: 0x030 (R/W) Comparator Register 1 */
+ uint32_t RESERVED3[1U];
+ __IOM uint32_t FUNCTION1; /*!< Offset: 0x038 (R/W) Function Register 1 */
+ uint32_t RESERVED4[1U];
+ __IOM uint32_t COMP2; /*!< Offset: 0x040 (R/W) Comparator Register 2 */
+ uint32_t RESERVED5[1U];
+ __IOM uint32_t FUNCTION2; /*!< Offset: 0x048 (R/W) Function Register 2 */
+ uint32_t RESERVED6[1U];
+ __IOM uint32_t COMP3; /*!< Offset: 0x050 (R/W) Comparator Register 3 */
+ uint32_t RESERVED7[1U];
+ __IOM uint32_t FUNCTION3; /*!< Offset: 0x058 (R/W) Function Register 3 */
+ uint32_t RESERVED8[1U];
+ __IOM uint32_t COMP4; /*!< Offset: 0x060 (R/W) Comparator Register 4 */
+ uint32_t RESERVED9[1U];
+ __IOM uint32_t FUNCTION4; /*!< Offset: 0x068 (R/W) Function Register 4 */
+ uint32_t RESERVED10[1U];
+ __IOM uint32_t COMP5; /*!< Offset: 0x070 (R/W) Comparator Register 5 */
+ uint32_t RESERVED11[1U];
+ __IOM uint32_t FUNCTION5; /*!< Offset: 0x078 (R/W) Function Register 5 */
+ uint32_t RESERVED12[1U];
+ __IOM uint32_t COMP6; /*!< Offset: 0x080 (R/W) Comparator Register 6 */
+ uint32_t RESERVED13[1U];
+ __IOM uint32_t FUNCTION6; /*!< Offset: 0x088 (R/W) Function Register 6 */
+ uint32_t RESERVED14[1U];
+ __IOM uint32_t COMP7; /*!< Offset: 0x090 (R/W) Comparator Register 7 */
+ uint32_t RESERVED15[1U];
+ __IOM uint32_t FUNCTION7; /*!< Offset: 0x098 (R/W) Function Register 7 */
+ uint32_t RESERVED16[1U];
+ __IOM uint32_t COMP8; /*!< Offset: 0x0A0 (R/W) Comparator Register 8 */
+ uint32_t RESERVED17[1U];
+ __IOM uint32_t FUNCTION8; /*!< Offset: 0x0A8 (R/W) Function Register 8 */
+ uint32_t RESERVED18[1U];
+ __IOM uint32_t COMP9; /*!< Offset: 0x0B0 (R/W) Comparator Register 9 */
+ uint32_t RESERVED19[1U];
+ __IOM uint32_t FUNCTION9; /*!< Offset: 0x0B8 (R/W) Function Register 9 */
+ uint32_t RESERVED20[1U];
+ __IOM uint32_t COMP10; /*!< Offset: 0x0C0 (R/W) Comparator Register 10 */
+ uint32_t RESERVED21[1U];
+ __IOM uint32_t FUNCTION10; /*!< Offset: 0x0C8 (R/W) Function Register 10 */
+ uint32_t RESERVED22[1U];
+ __IOM uint32_t COMP11; /*!< Offset: 0x0D0 (R/W) Comparator Register 11 */
+ uint32_t RESERVED23[1U];
+ __IOM uint32_t FUNCTION11; /*!< Offset: 0x0D8 (R/W) Function Register 11 */
+ uint32_t RESERVED24[1U];
+ __IOM uint32_t COMP12; /*!< Offset: 0x0E0 (R/W) Comparator Register 12 */
+ uint32_t RESERVED25[1U];
+ __IOM uint32_t FUNCTION12; /*!< Offset: 0x0E8 (R/W) Function Register 12 */
+ uint32_t RESERVED26[1U];
+ __IOM uint32_t COMP13; /*!< Offset: 0x0F0 (R/W) Comparator Register 13 */
+ uint32_t RESERVED27[1U];
+ __IOM uint32_t FUNCTION13; /*!< Offset: 0x0F8 (R/W) Function Register 13 */
+ uint32_t RESERVED28[1U];
+ __IOM uint32_t COMP14; /*!< Offset: 0x100 (R/W) Comparator Register 14 */
+ uint32_t RESERVED29[1U];
+ __IOM uint32_t FUNCTION14; /*!< Offset: 0x108 (R/W) Function Register 14 */
+ uint32_t RESERVED30[1U];
+ __IOM uint32_t COMP15; /*!< Offset: 0x110 (R/W) Comparator Register 15 */
+ uint32_t RESERVED31[1U];
+ __IOM uint32_t FUNCTION15; /*!< Offset: 0x118 (R/W) Function Register 15 */
+ uint32_t RESERVED32[934U];
+ __IM uint32_t LSR; /*!< Offset: 0xFB4 (R ) Lock Status Register */
+ uint32_t RESERVED33[1U];
+ __IM uint32_t DEVARCH; /*!< Offset: 0xFBC (R/ ) Device Architecture Register */
+} DWT_Type;
+
+/* DWT Control Register Definitions */
+#define DWT_CTRL_NUMCOMP_Pos 28U /*!< DWT CTRL: NUMCOMP Position */
+#define DWT_CTRL_NUMCOMP_Msk (0xFUL << DWT_CTRL_NUMCOMP_Pos) /*!< DWT CTRL: NUMCOMP Mask */
+
+#define DWT_CTRL_NOTRCPKT_Pos 27U /*!< DWT CTRL: NOTRCPKT Position */
+#define DWT_CTRL_NOTRCPKT_Msk (0x1UL << DWT_CTRL_NOTRCPKT_Pos) /*!< DWT CTRL: NOTRCPKT Mask */
+
+#define DWT_CTRL_NOEXTTRIG_Pos 26U /*!< DWT CTRL: NOEXTTRIG Position */
+#define DWT_CTRL_NOEXTTRIG_Msk (0x1UL << DWT_CTRL_NOEXTTRIG_Pos) /*!< DWT CTRL: NOEXTTRIG Mask */
+
+#define DWT_CTRL_NOCYCCNT_Pos 25U /*!< DWT CTRL: NOCYCCNT Position */
+#define DWT_CTRL_NOCYCCNT_Msk (0x1UL << DWT_CTRL_NOCYCCNT_Pos) /*!< DWT CTRL: NOCYCCNT Mask */
+
+#define DWT_CTRL_NOPRFCNT_Pos 24U /*!< DWT CTRL: NOPRFCNT Position */
+#define DWT_CTRL_NOPRFCNT_Msk (0x1UL << DWT_CTRL_NOPRFCNT_Pos) /*!< DWT CTRL: NOPRFCNT Mask */
+
+#define DWT_CTRL_CYCDISS_Pos 23U /*!< DWT CTRL: CYCDISS Position */
+#define DWT_CTRL_CYCDISS_Msk (0x1UL << DWT_CTRL_CYCDISS_Pos) /*!< DWT CTRL: CYCDISS Mask */
+
+#define DWT_CTRL_CYCEVTENA_Pos 22U /*!< DWT CTRL: CYCEVTENA Position */
+#define DWT_CTRL_CYCEVTENA_Msk (0x1UL << DWT_CTRL_CYCEVTENA_Pos) /*!< DWT CTRL: CYCEVTENA Mask */
+
+#define DWT_CTRL_FOLDEVTENA_Pos 21U /*!< DWT CTRL: FOLDEVTENA Position */
+#define DWT_CTRL_FOLDEVTENA_Msk (0x1UL << DWT_CTRL_FOLDEVTENA_Pos) /*!< DWT CTRL: FOLDEVTENA Mask */
+
+#define DWT_CTRL_LSUEVTENA_Pos 20U /*!< DWT CTRL: LSUEVTENA Position */
+#define DWT_CTRL_LSUEVTENA_Msk (0x1UL << DWT_CTRL_LSUEVTENA_Pos) /*!< DWT CTRL: LSUEVTENA Mask */
+
+#define DWT_CTRL_SLEEPEVTENA_Pos 19U /*!< DWT CTRL: SLEEPEVTENA Position */
+#define DWT_CTRL_SLEEPEVTENA_Msk (0x1UL << DWT_CTRL_SLEEPEVTENA_Pos) /*!< DWT CTRL: SLEEPEVTENA Mask */
+
+#define DWT_CTRL_EXCEVTENA_Pos 18U /*!< DWT CTRL: EXCEVTENA Position */
+#define DWT_CTRL_EXCEVTENA_Msk (0x1UL << DWT_CTRL_EXCEVTENA_Pos) /*!< DWT CTRL: EXCEVTENA Mask */
+
+#define DWT_CTRL_CPIEVTENA_Pos 17U /*!< DWT CTRL: CPIEVTENA Position */
+#define DWT_CTRL_CPIEVTENA_Msk (0x1UL << DWT_CTRL_CPIEVTENA_Pos) /*!< DWT CTRL: CPIEVTENA Mask */
+
+#define DWT_CTRL_EXCTRCENA_Pos 16U /*!< DWT CTRL: EXCTRCENA Position */
+#define DWT_CTRL_EXCTRCENA_Msk (0x1UL << DWT_CTRL_EXCTRCENA_Pos) /*!< DWT CTRL: EXCTRCENA Mask */
+
+#define DWT_CTRL_PCSAMPLENA_Pos 12U /*!< DWT CTRL: PCSAMPLENA Position */
+#define DWT_CTRL_PCSAMPLENA_Msk (0x1UL << DWT_CTRL_PCSAMPLENA_Pos) /*!< DWT CTRL: PCSAMPLENA Mask */
+
+#define DWT_CTRL_SYNCTAP_Pos 10U /*!< DWT CTRL: SYNCTAP Position */
+#define DWT_CTRL_SYNCTAP_Msk (0x3UL << DWT_CTRL_SYNCTAP_Pos) /*!< DWT CTRL: SYNCTAP Mask */
+
+#define DWT_CTRL_CYCTAP_Pos 9U /*!< DWT CTRL: CYCTAP Position */
+#define DWT_CTRL_CYCTAP_Msk (0x1UL << DWT_CTRL_CYCTAP_Pos) /*!< DWT CTRL: CYCTAP Mask */
+
+#define DWT_CTRL_POSTINIT_Pos 5U /*!< DWT CTRL: POSTINIT Position */
+#define DWT_CTRL_POSTINIT_Msk (0xFUL << DWT_CTRL_POSTINIT_Pos) /*!< DWT CTRL: POSTINIT Mask */
+
+#define DWT_CTRL_POSTPRESET_Pos 1U /*!< DWT CTRL: POSTPRESET Position */
+#define DWT_CTRL_POSTPRESET_Msk (0xFUL << DWT_CTRL_POSTPRESET_Pos) /*!< DWT CTRL: POSTPRESET Mask */
+
+#define DWT_CTRL_CYCCNTENA_Pos 0U /*!< DWT CTRL: CYCCNTENA Position */
+#define DWT_CTRL_CYCCNTENA_Msk (0x1UL /*<< DWT_CTRL_CYCCNTENA_Pos*/) /*!< DWT CTRL: CYCCNTENA Mask */
+
+/* DWT CPI Count Register Definitions */
+#define DWT_CPICNT_CPICNT_Pos 0U /*!< DWT CPICNT: CPICNT Position */
+#define DWT_CPICNT_CPICNT_Msk (0xFFUL /*<< DWT_CPICNT_CPICNT_Pos*/) /*!< DWT CPICNT: CPICNT Mask */
+
+/* DWT Exception Overhead Count Register Definitions */
+#define DWT_EXCCNT_EXCCNT_Pos 0U /*!< DWT EXCCNT: EXCCNT Position */
+#define DWT_EXCCNT_EXCCNT_Msk (0xFFUL /*<< DWT_EXCCNT_EXCCNT_Pos*/) /*!< DWT EXCCNT: EXCCNT Mask */
+
+/* DWT Sleep Count Register Definitions */
+#define DWT_SLEEPCNT_SLEEPCNT_Pos 0U /*!< DWT SLEEPCNT: SLEEPCNT Position */
+#define DWT_SLEEPCNT_SLEEPCNT_Msk (0xFFUL /*<< DWT_SLEEPCNT_SLEEPCNT_Pos*/) /*!< DWT SLEEPCNT: SLEEPCNT Mask */
+
+/* DWT LSU Count Register Definitions */
+#define DWT_LSUCNT_LSUCNT_Pos 0U /*!< DWT LSUCNT: LSUCNT Position */
+#define DWT_LSUCNT_LSUCNT_Msk (0xFFUL /*<< DWT_LSUCNT_LSUCNT_Pos*/) /*!< DWT LSUCNT: LSUCNT Mask */
+
+/* DWT Folded-instruction Count Register Definitions */
+#define DWT_FOLDCNT_FOLDCNT_Pos 0U /*!< DWT FOLDCNT: FOLDCNT Position */
+#define DWT_FOLDCNT_FOLDCNT_Msk (0xFFUL /*<< DWT_FOLDCNT_FOLDCNT_Pos*/) /*!< DWT FOLDCNT: FOLDCNT Mask */
+
+/* DWT Comparator Function Register Definitions */
+#define DWT_FUNCTION_ID_Pos 27U /*!< DWT FUNCTION: ID Position */
+#define DWT_FUNCTION_ID_Msk (0x1FUL << DWT_FUNCTION_ID_Pos) /*!< DWT FUNCTION: ID Mask */
+
+#define DWT_FUNCTION_MATCHED_Pos 24U /*!< DWT FUNCTION: MATCHED Position */
+#define DWT_FUNCTION_MATCHED_Msk (0x1UL << DWT_FUNCTION_MATCHED_Pos) /*!< DWT FUNCTION: MATCHED Mask */
+
+#define DWT_FUNCTION_DATAVSIZE_Pos 10U /*!< DWT FUNCTION: DATAVSIZE Position */
+#define DWT_FUNCTION_DATAVSIZE_Msk (0x3UL << DWT_FUNCTION_DATAVSIZE_Pos) /*!< DWT FUNCTION: DATAVSIZE Mask */
+
+#define DWT_FUNCTION_ACTION_Pos 4U /*!< DWT FUNCTION: ACTION Position */
+#define DWT_FUNCTION_ACTION_Msk (0x1UL << DWT_FUNCTION_ACTION_Pos) /*!< DWT FUNCTION: ACTION Mask */
+
+#define DWT_FUNCTION_MATCH_Pos 0U /*!< DWT FUNCTION: MATCH Position */
+#define DWT_FUNCTION_MATCH_Msk (0xFUL /*<< DWT_FUNCTION_MATCH_Pos*/) /*!< DWT FUNCTION: MATCH Mask */
+
+/*@}*/ /* end of group CMSIS_DWT */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_TPI Trace Port Interface (TPI)
+ \brief Type definitions for the Trace Port Interface (TPI)
+ @{
+ */
+
+/**
+ \brief Structure type to access the Trace Port Interface Register (TPI).
+ */
+typedef struct
+{
+ __IM uint32_t SSPSR; /*!< Offset: 0x000 (R/ ) Supported Parallel Port Size Register */
+ __IOM uint32_t CSPSR; /*!< Offset: 0x004 (R/W) Current Parallel Port Size Register */
+ uint32_t RESERVED0[2U];
+ __IOM uint32_t ACPR; /*!< Offset: 0x010 (R/W) Asynchronous Clock Prescaler Register */
+ uint32_t RESERVED1[55U];
+ __IOM uint32_t SPPR; /*!< Offset: 0x0F0 (R/W) Selected Pin Protocol Register */
+ uint32_t RESERVED2[131U];
+ __IM uint32_t FFSR; /*!< Offset: 0x300 (R/ ) Formatter and Flush Status Register */
+ __IOM uint32_t FFCR; /*!< Offset: 0x304 (R/W) Formatter and Flush Control Register */
+ __IOM uint32_t PSCR; /*!< Offset: 0x308 (R/W) Periodic Synchronization Control Register */
+ uint32_t RESERVED3[759U];
+ __IM uint32_t TRIGGER; /*!< Offset: 0xEE8 (R/ ) TRIGGER Register */
+ __IM uint32_t ITFTTD0; /*!< Offset: 0xEEC (R/ ) Integration Test FIFO Test Data 0 Register */
+ __IOM uint32_t ITATBCTR2; /*!< Offset: 0xEF0 (R/W) Integration Test ATB Control Register 2 */
+ uint32_t RESERVED4[1U];
+ __IM uint32_t ITATBCTR0; /*!< Offset: 0xEF8 (R/ ) Integration Test ATB Control Register 0 */
+ __IM uint32_t ITFTTD1; /*!< Offset: 0xEFC (R/ ) Integration Test FIFO Test Data 1 Register */
+ __IOM uint32_t ITCTRL; /*!< Offset: 0xF00 (R/W) Integration Mode Control */
+ uint32_t RESERVED5[39U];
+ __IOM uint32_t CLAIMSET; /*!< Offset: 0xFA0 (R/W) Claim tag set */
+ __IOM uint32_t CLAIMCLR; /*!< Offset: 0xFA4 (R/W) Claim tag clear */
+ uint32_t RESERVED7[8U];
+ __IM uint32_t DEVID; /*!< Offset: 0xFC8 (R/ ) Device Configuration Register */
+ __IM uint32_t DEVTYPE; /*!< Offset: 0xFCC (R/ ) Device Type Identifier Register */
+} TPI_Type;
+
+/* TPI Asynchronous Clock Prescaler Register Definitions */
+#define TPI_ACPR_PRESCALER_Pos 0U /*!< TPI ACPR: PRESCALER Position */
+#define TPI_ACPR_PRESCALER_Msk (0x1FFFUL /*<< TPI_ACPR_PRESCALER_Pos*/) /*!< TPI ACPR: PRESCALER Mask */
+
+/* TPI Selected Pin Protocol Register Definitions */
+#define TPI_SPPR_TXMODE_Pos 0U /*!< TPI SPPR: TXMODE Position */
+#define TPI_SPPR_TXMODE_Msk (0x3UL /*<< TPI_SPPR_TXMODE_Pos*/) /*!< TPI SPPR: TXMODE Mask */
+
+/* TPI Formatter and Flush Status Register Definitions */
+#define TPI_FFSR_FtNonStop_Pos 3U /*!< TPI FFSR: FtNonStop Position */
+#define TPI_FFSR_FtNonStop_Msk (0x1UL << TPI_FFSR_FtNonStop_Pos) /*!< TPI FFSR: FtNonStop Mask */
+
+#define TPI_FFSR_TCPresent_Pos 2U /*!< TPI FFSR: TCPresent Position */
+#define TPI_FFSR_TCPresent_Msk (0x1UL << TPI_FFSR_TCPresent_Pos) /*!< TPI FFSR: TCPresent Mask */
+
+#define TPI_FFSR_FtStopped_Pos 1U /*!< TPI FFSR: FtStopped Position */
+#define TPI_FFSR_FtStopped_Msk (0x1UL << TPI_FFSR_FtStopped_Pos) /*!< TPI FFSR: FtStopped Mask */
+
+#define TPI_FFSR_FlInProg_Pos 0U /*!< TPI FFSR: FlInProg Position */
+#define TPI_FFSR_FlInProg_Msk (0x1UL /*<< TPI_FFSR_FlInProg_Pos*/) /*!< TPI FFSR: FlInProg Mask */
+
+/* TPI Formatter and Flush Control Register Definitions */
+#define TPI_FFCR_TrigIn_Pos 8U /*!< TPI FFCR: TrigIn Position */
+#define TPI_FFCR_TrigIn_Msk (0x1UL << TPI_FFCR_TrigIn_Pos) /*!< TPI FFCR: TrigIn Mask */
+
+#define TPI_FFCR_FOnMan_Pos 6U /*!< TPI FFCR: FOnMan Position */
+#define TPI_FFCR_FOnMan_Msk (0x1UL << TPI_FFCR_FOnMan_Pos) /*!< TPI FFCR: FOnMan Mask */
+
+#define TPI_FFCR_EnFCont_Pos 1U /*!< TPI FFCR: EnFCont Position */
+#define TPI_FFCR_EnFCont_Msk (0x1UL << TPI_FFCR_EnFCont_Pos) /*!< TPI FFCR: EnFCont Mask */
+
+/* TPI TRIGGER Register Definitions */
+#define TPI_TRIGGER_TRIGGER_Pos 0U /*!< TPI TRIGGER: TRIGGER Position */
+#define TPI_TRIGGER_TRIGGER_Msk (0x1UL /*<< TPI_TRIGGER_TRIGGER_Pos*/) /*!< TPI TRIGGER: TRIGGER Mask */
+
+/* TPI Integration Test FIFO Test Data 0 Register Definitions */
+#define TPI_ITFTTD0_ATB_IF2_ATVALID_Pos 29U /*!< TPI ITFTTD0: ATB Interface 2 ATVALIDPosition */
+#define TPI_ITFTTD0_ATB_IF2_ATVALID_Msk (0x3UL << TPI_ITFTTD0_ATB_IF2_ATVALID_Pos) /*!< TPI ITFTTD0: ATB Interface 2 ATVALID Mask */
+
+#define TPI_ITFTTD0_ATB_IF2_bytecount_Pos 27U /*!< TPI ITFTTD0: ATB Interface 2 byte count Position */
+#define TPI_ITFTTD0_ATB_IF2_bytecount_Msk (0x3UL << TPI_ITFTTD0_ATB_IF2_bytecount_Pos) /*!< TPI ITFTTD0: ATB Interface 2 byte count Mask */
+
+#define TPI_ITFTTD0_ATB_IF1_ATVALID_Pos 26U /*!< TPI ITFTTD0: ATB Interface 1 ATVALID Position */
+#define TPI_ITFTTD0_ATB_IF1_ATVALID_Msk (0x3UL << TPI_ITFTTD0_ATB_IF1_ATVALID_Pos) /*!< TPI ITFTTD0: ATB Interface 1 ATVALID Mask */
+
+#define TPI_ITFTTD0_ATB_IF1_bytecount_Pos 24U /*!< TPI ITFTTD0: ATB Interface 1 byte count Position */
+#define TPI_ITFTTD0_ATB_IF1_bytecount_Msk (0x3UL << TPI_ITFTTD0_ATB_IF1_bytecount_Pos) /*!< TPI ITFTTD0: ATB Interface 1 byte countt Mask */
+
+#define TPI_ITFTTD0_ATB_IF1_data2_Pos 16U /*!< TPI ITFTTD0: ATB Interface 1 data2 Position */
+#define TPI_ITFTTD0_ATB_IF1_data2_Msk (0xFFUL << TPI_ITFTTD0_ATB_IF1_data1_Pos) /*!< TPI ITFTTD0: ATB Interface 1 data2 Mask */
+
+#define TPI_ITFTTD0_ATB_IF1_data1_Pos 8U /*!< TPI ITFTTD0: ATB Interface 1 data1 Position */
+#define TPI_ITFTTD0_ATB_IF1_data1_Msk (0xFFUL << TPI_ITFTTD0_ATB_IF1_data1_Pos) /*!< TPI ITFTTD0: ATB Interface 1 data1 Mask */
+
+#define TPI_ITFTTD0_ATB_IF1_data0_Pos 0U /*!< TPI ITFTTD0: ATB Interface 1 data0 Position */
+#define TPI_ITFTTD0_ATB_IF1_data0_Msk (0xFFUL /*<< TPI_ITFTTD0_ATB_IF1_data0_Pos*/) /*!< TPI ITFTTD0: ATB Interface 1 data0 Mask */
+
+/* TPI Integration Test ATB Control Register 2 Register Definitions */
+#define TPI_ITATBCTR2_AFVALID2S_Pos 1U /*!< TPI ITATBCTR2: AFVALID2S Position */
+#define TPI_ITATBCTR2_AFVALID2S_Msk (0x1UL << TPI_ITATBCTR2_AFVALID2S_Pos) /*!< TPI ITATBCTR2: AFVALID2SS Mask */
+
+#define TPI_ITATBCTR2_AFVALID1S_Pos 1U /*!< TPI ITATBCTR2: AFVALID1S Position */
+#define TPI_ITATBCTR2_AFVALID1S_Msk (0x1UL << TPI_ITATBCTR2_AFVALID1S_Pos) /*!< TPI ITATBCTR2: AFVALID1SS Mask */
+
+#define TPI_ITATBCTR2_ATREADY2S_Pos 0U /*!< TPI ITATBCTR2: ATREADY2S Position */
+#define TPI_ITATBCTR2_ATREADY2S_Msk (0x1UL /*<< TPI_ITATBCTR2_ATREADY2S_Pos*/) /*!< TPI ITATBCTR2: ATREADY2S Mask */
+
+#define TPI_ITATBCTR2_ATREADY1S_Pos 0U /*!< TPI ITATBCTR2: ATREADY1S Position */
+#define TPI_ITATBCTR2_ATREADY1S_Msk (0x1UL /*<< TPI_ITATBCTR2_ATREADY1S_Pos*/) /*!< TPI ITATBCTR2: ATREADY1S Mask */
+
+/* TPI Integration Test FIFO Test Data 1 Register Definitions */
+#define TPI_ITFTTD1_ATB_IF2_ATVALID_Pos 29U /*!< TPI ITFTTD1: ATB Interface 2 ATVALID Position */
+#define TPI_ITFTTD1_ATB_IF2_ATVALID_Msk (0x3UL << TPI_ITFTTD1_ATB_IF2_ATVALID_Pos) /*!< TPI ITFTTD1: ATB Interface 2 ATVALID Mask */
+
+#define TPI_ITFTTD1_ATB_IF2_bytecount_Pos 27U /*!< TPI ITFTTD1: ATB Interface 2 byte count Position */
+#define TPI_ITFTTD1_ATB_IF2_bytecount_Msk (0x3UL << TPI_ITFTTD1_ATB_IF2_bytecount_Pos) /*!< TPI ITFTTD1: ATB Interface 2 byte count Mask */
+
+#define TPI_ITFTTD1_ATB_IF1_ATVALID_Pos 26U /*!< TPI ITFTTD1: ATB Interface 1 ATVALID Position */
+#define TPI_ITFTTD1_ATB_IF1_ATVALID_Msk (0x3UL << TPI_ITFTTD1_ATB_IF1_ATVALID_Pos) /*!< TPI ITFTTD1: ATB Interface 1 ATVALID Mask */
+
+#define TPI_ITFTTD1_ATB_IF1_bytecount_Pos 24U /*!< TPI ITFTTD1: ATB Interface 1 byte count Position */
+#define TPI_ITFTTD1_ATB_IF1_bytecount_Msk (0x3UL << TPI_ITFTTD1_ATB_IF1_bytecount_Pos) /*!< TPI ITFTTD1: ATB Interface 1 byte countt Mask */
+
+#define TPI_ITFTTD1_ATB_IF2_data2_Pos 16U /*!< TPI ITFTTD1: ATB Interface 2 data2 Position */
+#define TPI_ITFTTD1_ATB_IF2_data2_Msk (0xFFUL << TPI_ITFTTD1_ATB_IF2_data1_Pos) /*!< TPI ITFTTD1: ATB Interface 2 data2 Mask */
+
+#define TPI_ITFTTD1_ATB_IF2_data1_Pos 8U /*!< TPI ITFTTD1: ATB Interface 2 data1 Position */
+#define TPI_ITFTTD1_ATB_IF2_data1_Msk (0xFFUL << TPI_ITFTTD1_ATB_IF2_data1_Pos) /*!< TPI ITFTTD1: ATB Interface 2 data1 Mask */
+
+#define TPI_ITFTTD1_ATB_IF2_data0_Pos 0U /*!< TPI ITFTTD1: ATB Interface 2 data0 Position */
+#define TPI_ITFTTD1_ATB_IF2_data0_Msk (0xFFUL /*<< TPI_ITFTTD1_ATB_IF2_data0_Pos*/) /*!< TPI ITFTTD1: ATB Interface 2 data0 Mask */
+
+/* TPI Integration Test ATB Control Register 0 Definitions */
+#define TPI_ITATBCTR0_AFVALID2S_Pos 1U /*!< TPI ITATBCTR0: AFVALID2S Position */
+#define TPI_ITATBCTR0_AFVALID2S_Msk (0x1UL << TPI_ITATBCTR0_AFVALID2S_Pos) /*!< TPI ITATBCTR0: AFVALID2SS Mask */
+
+#define TPI_ITATBCTR0_AFVALID1S_Pos 1U /*!< TPI ITATBCTR0: AFVALID1S Position */
+#define TPI_ITATBCTR0_AFVALID1S_Msk (0x1UL << TPI_ITATBCTR0_AFVALID1S_Pos) /*!< TPI ITATBCTR0: AFVALID1SS Mask */
+
+#define TPI_ITATBCTR0_ATREADY2S_Pos 0U /*!< TPI ITATBCTR0: ATREADY2S Position */
+#define TPI_ITATBCTR0_ATREADY2S_Msk (0x1UL /*<< TPI_ITATBCTR0_ATREADY2S_Pos*/) /*!< TPI ITATBCTR0: ATREADY2S Mask */
+
+#define TPI_ITATBCTR0_ATREADY1S_Pos 0U /*!< TPI ITATBCTR0: ATREADY1S Position */
+#define TPI_ITATBCTR0_ATREADY1S_Msk (0x1UL /*<< TPI_ITATBCTR0_ATREADY1S_Pos*/) /*!< TPI ITATBCTR0: ATREADY1S Mask */
+
+/* TPI Integration Mode Control Register Definitions */
+#define TPI_ITCTRL_Mode_Pos 0U /*!< TPI ITCTRL: Mode Position */
+#define TPI_ITCTRL_Mode_Msk (0x3UL /*<< TPI_ITCTRL_Mode_Pos*/) /*!< TPI ITCTRL: Mode Mask */
+
+/* TPI DEVID Register Definitions */
+#define TPI_DEVID_NRZVALID_Pos 11U /*!< TPI DEVID: NRZVALID Position */
+#define TPI_DEVID_NRZVALID_Msk (0x1UL << TPI_DEVID_NRZVALID_Pos) /*!< TPI DEVID: NRZVALID Mask */
+
+#define TPI_DEVID_MANCVALID_Pos 10U /*!< TPI DEVID: MANCVALID Position */
+#define TPI_DEVID_MANCVALID_Msk (0x1UL << TPI_DEVID_MANCVALID_Pos) /*!< TPI DEVID: MANCVALID Mask */
+
+#define TPI_DEVID_PTINVALID_Pos 9U /*!< TPI DEVID: PTINVALID Position */
+#define TPI_DEVID_PTINVALID_Msk (0x1UL << TPI_DEVID_PTINVALID_Pos) /*!< TPI DEVID: PTINVALID Mask */
+
+#define TPI_DEVID_FIFOSZ_Pos 6U /*!< TPI DEVID: FIFOSZ Position */
+#define TPI_DEVID_FIFOSZ_Msk (0x7UL << TPI_DEVID_FIFOSZ_Pos) /*!< TPI DEVID: FIFOSZ Mask */
+
+#define TPI_DEVID_NrTraceInput_Pos 0U /*!< TPI DEVID: NrTraceInput Position */
+#define TPI_DEVID_NrTraceInput_Msk (0x3FUL /*<< TPI_DEVID_NrTraceInput_Pos*/) /*!< TPI DEVID: NrTraceInput Mask */
+
+/* TPI DEVTYPE Register Definitions */
+#define TPI_DEVTYPE_SubType_Pos 4U /*!< TPI DEVTYPE: SubType Position */
+#define TPI_DEVTYPE_SubType_Msk (0xFUL /*<< TPI_DEVTYPE_SubType_Pos*/) /*!< TPI DEVTYPE: SubType Mask */
+
+#define TPI_DEVTYPE_MajorType_Pos 0U /*!< TPI DEVTYPE: MajorType Position */
+#define TPI_DEVTYPE_MajorType_Msk (0xFUL << TPI_DEVTYPE_MajorType_Pos) /*!< TPI DEVTYPE: MajorType Mask */
+
+/*@}*/ /* end of group CMSIS_TPI */
+
+
+#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_MPU Memory Protection Unit (MPU)
+ \brief Type definitions for the Memory Protection Unit (MPU)
+ @{
+ */
+
+/**
+ \brief Structure type to access the Memory Protection Unit (MPU).
+ */
+typedef struct
+{
+ __IM uint32_t TYPE; /*!< Offset: 0x000 (R/ ) MPU Type Register */
+ __IOM uint32_t CTRL; /*!< Offset: 0x004 (R/W) MPU Control Register */
+ __IOM uint32_t RNR; /*!< Offset: 0x008 (R/W) MPU Region Number Register */
+ __IOM uint32_t RBAR; /*!< Offset: 0x00C (R/W) MPU Region Base Address Register */
+ __IOM uint32_t RLAR; /*!< Offset: 0x010 (R/W) MPU Region Limit Address Register */
+ __IOM uint32_t RBAR_A1; /*!< Offset: 0x014 (R/W) MPU Region Base Address Register Alias 1 */
+ __IOM uint32_t RLAR_A1; /*!< Offset: 0x018 (R/W) MPU Region Limit Address Register Alias 1 */
+ __IOM uint32_t RBAR_A2; /*!< Offset: 0x01C (R/W) MPU Region Base Address Register Alias 2 */
+ __IOM uint32_t RLAR_A2; /*!< Offset: 0x020 (R/W) MPU Region Limit Address Register Alias 2 */
+ __IOM uint32_t RBAR_A3; /*!< Offset: 0x024 (R/W) MPU Region Base Address Register Alias 3 */
+ __IOM uint32_t RLAR_A3; /*!< Offset: 0x028 (R/W) MPU Region Limit Address Register Alias 3 */
+ uint32_t RESERVED0[1];
+ union {
+ __IOM uint32_t MAIR[2];
+ struct {
+ __IOM uint32_t MAIR0; /*!< Offset: 0x030 (R/W) MPU Memory Attribute Indirection Register 0 */
+ __IOM uint32_t MAIR1; /*!< Offset: 0x034 (R/W) MPU Memory Attribute Indirection Register 1 */
+ };
+ };
+} MPU_Type;
+
+#define MPU_TYPE_RALIASES 4U
+
+/* MPU Type Register Definitions */
+#define MPU_TYPE_IREGION_Pos 16U /*!< MPU TYPE: IREGION Position */
+#define MPU_TYPE_IREGION_Msk (0xFFUL << MPU_TYPE_IREGION_Pos) /*!< MPU TYPE: IREGION Mask */
+
+#define MPU_TYPE_DREGION_Pos 8U /*!< MPU TYPE: DREGION Position */
+#define MPU_TYPE_DREGION_Msk (0xFFUL << MPU_TYPE_DREGION_Pos) /*!< MPU TYPE: DREGION Mask */
+
+#define MPU_TYPE_SEPARATE_Pos 0U /*!< MPU TYPE: SEPARATE Position */
+#define MPU_TYPE_SEPARATE_Msk (1UL /*<< MPU_TYPE_SEPARATE_Pos*/) /*!< MPU TYPE: SEPARATE Mask */
+
+/* MPU Control Register Definitions */
+#define MPU_CTRL_PRIVDEFENA_Pos 2U /*!< MPU CTRL: PRIVDEFENA Position */
+#define MPU_CTRL_PRIVDEFENA_Msk (1UL << MPU_CTRL_PRIVDEFENA_Pos) /*!< MPU CTRL: PRIVDEFENA Mask */
+
+#define MPU_CTRL_HFNMIENA_Pos 1U /*!< MPU CTRL: HFNMIENA Position */
+#define MPU_CTRL_HFNMIENA_Msk (1UL << MPU_CTRL_HFNMIENA_Pos) /*!< MPU CTRL: HFNMIENA Mask */
+
+#define MPU_CTRL_ENABLE_Pos 0U /*!< MPU CTRL: ENABLE Position */
+#define MPU_CTRL_ENABLE_Msk (1UL /*<< MPU_CTRL_ENABLE_Pos*/) /*!< MPU CTRL: ENABLE Mask */
+
+/* MPU Region Number Register Definitions */
+#define MPU_RNR_REGION_Pos 0U /*!< MPU RNR: REGION Position */
+#define MPU_RNR_REGION_Msk (0xFFUL /*<< MPU_RNR_REGION_Pos*/) /*!< MPU RNR: REGION Mask */
+
+/* MPU Region Base Address Register Definitions */
+#define MPU_RBAR_BASE_Pos 5U /*!< MPU RBAR: BASE Position */
+#define MPU_RBAR_BASE_Msk (0x7FFFFFFUL << MPU_RBAR_BASE_Pos) /*!< MPU RBAR: BASE Mask */
+
+#define MPU_RBAR_SH_Pos 3U /*!< MPU RBAR: SH Position */
+#define MPU_RBAR_SH_Msk (0x3UL << MPU_RBAR_SH_Pos) /*!< MPU RBAR: SH Mask */
+
+#define MPU_RBAR_AP_Pos 1U /*!< MPU RBAR: AP Position */
+#define MPU_RBAR_AP_Msk (0x3UL << MPU_RBAR_AP_Pos) /*!< MPU RBAR: AP Mask */
+
+#define MPU_RBAR_XN_Pos 0U /*!< MPU RBAR: XN Position */
+#define MPU_RBAR_XN_Msk (01UL /*<< MPU_RBAR_XN_Pos*/) /*!< MPU RBAR: XN Mask */
+
+/* MPU Region Limit Address Register Definitions */
+#define MPU_RLAR_LIMIT_Pos 5U /*!< MPU RLAR: LIMIT Position */
+#define MPU_RLAR_LIMIT_Msk (0x7FFFFFFUL << MPU_RLAR_LIMIT_Pos) /*!< MPU RLAR: LIMIT Mask */
+
+#define MPU_RLAR_AttrIndx_Pos 1U /*!< MPU RLAR: AttrIndx Position */
+#define MPU_RLAR_AttrIndx_Msk (0x7UL << MPU_RLAR_AttrIndx_Pos) /*!< MPU RLAR: AttrIndx Mask */
+
+#define MPU_RLAR_EN_Pos 0U /*!< MPU RLAR: Region enable bit Position */
+#define MPU_RLAR_EN_Msk (1UL /*<< MPU_RLAR_EN_Pos*/) /*!< MPU RLAR: Region enable bit Disable Mask */
+
+/* MPU Memory Attribute Indirection Register 0 Definitions */
+#define MPU_MAIR0_Attr3_Pos 24U /*!< MPU MAIR0: Attr3 Position */
+#define MPU_MAIR0_Attr3_Msk (0xFFUL << MPU_MAIR0_Attr3_Pos) /*!< MPU MAIR0: Attr3 Mask */
+
+#define MPU_MAIR0_Attr2_Pos 16U /*!< MPU MAIR0: Attr2 Position */
+#define MPU_MAIR0_Attr2_Msk (0xFFUL << MPU_MAIR0_Attr2_Pos) /*!< MPU MAIR0: Attr2 Mask */
+
+#define MPU_MAIR0_Attr1_Pos 8U /*!< MPU MAIR0: Attr1 Position */
+#define MPU_MAIR0_Attr1_Msk (0xFFUL << MPU_MAIR0_Attr1_Pos) /*!< MPU MAIR0: Attr1 Mask */
+
+#define MPU_MAIR0_Attr0_Pos 0U /*!< MPU MAIR0: Attr0 Position */
+#define MPU_MAIR0_Attr0_Msk (0xFFUL /*<< MPU_MAIR0_Attr0_Pos*/) /*!< MPU MAIR0: Attr0 Mask */
+
+/* MPU Memory Attribute Indirection Register 1 Definitions */
+#define MPU_MAIR1_Attr7_Pos 24U /*!< MPU MAIR1: Attr7 Position */
+#define MPU_MAIR1_Attr7_Msk (0xFFUL << MPU_MAIR1_Attr7_Pos) /*!< MPU MAIR1: Attr7 Mask */
+
+#define MPU_MAIR1_Attr6_Pos 16U /*!< MPU MAIR1: Attr6 Position */
+#define MPU_MAIR1_Attr6_Msk (0xFFUL << MPU_MAIR1_Attr6_Pos) /*!< MPU MAIR1: Attr6 Mask */
+
+#define MPU_MAIR1_Attr5_Pos 8U /*!< MPU MAIR1: Attr5 Position */
+#define MPU_MAIR1_Attr5_Msk (0xFFUL << MPU_MAIR1_Attr5_Pos) /*!< MPU MAIR1: Attr5 Mask */
+
+#define MPU_MAIR1_Attr4_Pos 0U /*!< MPU MAIR1: Attr4 Position */
+#define MPU_MAIR1_Attr4_Msk (0xFFUL /*<< MPU_MAIR1_Attr4_Pos*/) /*!< MPU MAIR1: Attr4 Mask */
+
+/*@} end of group CMSIS_MPU */
+#endif
+
+
+#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U)
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_SAU Security Attribution Unit (SAU)
+ \brief Type definitions for the Security Attribution Unit (SAU)
+ @{
+ */
+
+/**
+ \brief Structure type to access the Security Attribution Unit (SAU).
+ */
+typedef struct
+{
+ __IOM uint32_t CTRL; /*!< Offset: 0x000 (R/W) SAU Control Register */
+ __IM uint32_t TYPE; /*!< Offset: 0x004 (R/ ) SAU Type Register */
+#if defined (__SAUREGION_PRESENT) && (__SAUREGION_PRESENT == 1U)
+ __IOM uint32_t RNR; /*!< Offset: 0x008 (R/W) SAU Region Number Register */
+ __IOM uint32_t RBAR; /*!< Offset: 0x00C (R/W) SAU Region Base Address Register */
+ __IOM uint32_t RLAR; /*!< Offset: 0x010 (R/W) SAU Region Limit Address Register */
+#else
+ uint32_t RESERVED0[3];
+#endif
+ __IOM uint32_t SFSR; /*!< Offset: 0x014 (R/W) Secure Fault Status Register */
+ __IOM uint32_t SFAR; /*!< Offset: 0x018 (R/W) Secure Fault Address Register */
+} SAU_Type;
+
+/* SAU Control Register Definitions */
+#define SAU_CTRL_ALLNS_Pos 1U /*!< SAU CTRL: ALLNS Position */
+#define SAU_CTRL_ALLNS_Msk (1UL << SAU_CTRL_ALLNS_Pos) /*!< SAU CTRL: ALLNS Mask */
+
+#define SAU_CTRL_ENABLE_Pos 0U /*!< SAU CTRL: ENABLE Position */
+#define SAU_CTRL_ENABLE_Msk (1UL /*<< SAU_CTRL_ENABLE_Pos*/) /*!< SAU CTRL: ENABLE Mask */
+
+/* SAU Type Register Definitions */
+#define SAU_TYPE_SREGION_Pos 0U /*!< SAU TYPE: SREGION Position */
+#define SAU_TYPE_SREGION_Msk (0xFFUL /*<< SAU_TYPE_SREGION_Pos*/) /*!< SAU TYPE: SREGION Mask */
+
+#if defined (__SAUREGION_PRESENT) && (__SAUREGION_PRESENT == 1U)
+/* SAU Region Number Register Definitions */
+#define SAU_RNR_REGION_Pos 0U /*!< SAU RNR: REGION Position */
+#define SAU_RNR_REGION_Msk (0xFFUL /*<< SAU_RNR_REGION_Pos*/) /*!< SAU RNR: REGION Mask */
+
+/* SAU Region Base Address Register Definitions */
+#define SAU_RBAR_BADDR_Pos 5U /*!< SAU RBAR: BADDR Position */
+#define SAU_RBAR_BADDR_Msk (0x7FFFFFFUL << SAU_RBAR_BADDR_Pos) /*!< SAU RBAR: BADDR Mask */
+
+/* SAU Region Limit Address Register Definitions */
+#define SAU_RLAR_LADDR_Pos 5U /*!< SAU RLAR: LADDR Position */
+#define SAU_RLAR_LADDR_Msk (0x7FFFFFFUL << SAU_RLAR_LADDR_Pos) /*!< SAU RLAR: LADDR Mask */
+
+#define SAU_RLAR_NSC_Pos 1U /*!< SAU RLAR: NSC Position */
+#define SAU_RLAR_NSC_Msk (1UL << SAU_RLAR_NSC_Pos) /*!< SAU RLAR: NSC Mask */
+
+#define SAU_RLAR_ENABLE_Pos 0U /*!< SAU RLAR: ENABLE Position */
+#define SAU_RLAR_ENABLE_Msk (1UL /*<< SAU_RLAR_ENABLE_Pos*/) /*!< SAU RLAR: ENABLE Mask */
+
+#endif /* defined (__SAUREGION_PRESENT) && (__SAUREGION_PRESENT == 1U) */
+
+/* Secure Fault Status Register Definitions */
+#define SAU_SFSR_LSERR_Pos 7U /*!< SAU SFSR: LSERR Position */
+#define SAU_SFSR_LSERR_Msk (1UL << SAU_SFSR_LSERR_Pos) /*!< SAU SFSR: LSERR Mask */
+
+#define SAU_SFSR_SFARVALID_Pos 6U /*!< SAU SFSR: SFARVALID Position */
+#define SAU_SFSR_SFARVALID_Msk (1UL << SAU_SFSR_SFARVALID_Pos) /*!< SAU SFSR: SFARVALID Mask */
+
+#define SAU_SFSR_LSPERR_Pos 5U /*!< SAU SFSR: LSPERR Position */
+#define SAU_SFSR_LSPERR_Msk (1UL << SAU_SFSR_LSPERR_Pos) /*!< SAU SFSR: LSPERR Mask */
+
+#define SAU_SFSR_INVTRAN_Pos 4U /*!< SAU SFSR: INVTRAN Position */
+#define SAU_SFSR_INVTRAN_Msk (1UL << SAU_SFSR_INVTRAN_Pos) /*!< SAU SFSR: INVTRAN Mask */
+
+#define SAU_SFSR_AUVIOL_Pos 3U /*!< SAU SFSR: AUVIOL Position */
+#define SAU_SFSR_AUVIOL_Msk (1UL << SAU_SFSR_AUVIOL_Pos) /*!< SAU SFSR: AUVIOL Mask */
+
+#define SAU_SFSR_INVER_Pos 2U /*!< SAU SFSR: INVER Position */
+#define SAU_SFSR_INVER_Msk (1UL << SAU_SFSR_INVER_Pos) /*!< SAU SFSR: INVER Mask */
+
+#define SAU_SFSR_INVIS_Pos 1U /*!< SAU SFSR: INVIS Position */
+#define SAU_SFSR_INVIS_Msk (1UL << SAU_SFSR_INVIS_Pos) /*!< SAU SFSR: INVIS Mask */
+
+#define SAU_SFSR_INVEP_Pos 0U /*!< SAU SFSR: INVEP Position */
+#define SAU_SFSR_INVEP_Msk (1UL /*<< SAU_SFSR_INVEP_Pos*/) /*!< SAU SFSR: INVEP Mask */
+
+/*@} end of group CMSIS_SAU */
+#endif /* defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_FPU Floating Point Unit (FPU)
+ \brief Type definitions for the Floating Point Unit (FPU)
+ @{
+ */
+
+/**
+ \brief Structure type to access the Floating Point Unit (FPU).
+ */
+typedef struct
+{
+ uint32_t RESERVED0[1U];
+ __IOM uint32_t FPCCR; /*!< Offset: 0x004 (R/W) Floating-Point Context Control Register */
+ __IOM uint32_t FPCAR; /*!< Offset: 0x008 (R/W) Floating-Point Context Address Register */
+ __IOM uint32_t FPDSCR; /*!< Offset: 0x00C (R/W) Floating-Point Default Status Control Register */
+ __IM uint32_t MVFR0; /*!< Offset: 0x010 (R/ ) Media and FP Feature Register 0 */
+ __IM uint32_t MVFR1; /*!< Offset: 0x014 (R/ ) Media and FP Feature Register 1 */
+} FPU_Type;
+
+/* Floating-Point Context Control Register Definitions */
+#define FPU_FPCCR_ASPEN_Pos 31U /*!< FPCCR: ASPEN bit Position */
+#define FPU_FPCCR_ASPEN_Msk (1UL << FPU_FPCCR_ASPEN_Pos) /*!< FPCCR: ASPEN bit Mask */
+
+#define FPU_FPCCR_LSPEN_Pos 30U /*!< FPCCR: LSPEN Position */
+#define FPU_FPCCR_LSPEN_Msk (1UL << FPU_FPCCR_LSPEN_Pos) /*!< FPCCR: LSPEN bit Mask */
+
+#define FPU_FPCCR_LSPENS_Pos 29U /*!< FPCCR: LSPENS Position */
+#define FPU_FPCCR_LSPENS_Msk (1UL << FPU_FPCCR_LSPENS_Pos) /*!< FPCCR: LSPENS bit Mask */
+
+#define FPU_FPCCR_CLRONRET_Pos 28U /*!< FPCCR: CLRONRET Position */
+#define FPU_FPCCR_CLRONRET_Msk (1UL << FPU_FPCCR_CLRONRET_Pos) /*!< FPCCR: CLRONRET bit Mask */
+
+#define FPU_FPCCR_CLRONRETS_Pos 27U /*!< FPCCR: CLRONRETS Position */
+#define FPU_FPCCR_CLRONRETS_Msk (1UL << FPU_FPCCR_CLRONRETS_Pos) /*!< FPCCR: CLRONRETS bit Mask */
+
+#define FPU_FPCCR_TS_Pos 26U /*!< FPCCR: TS Position */
+#define FPU_FPCCR_TS_Msk (1UL << FPU_FPCCR_TS_Pos) /*!< FPCCR: TS bit Mask */
+
+#define FPU_FPCCR_UFRDY_Pos 10U /*!< FPCCR: UFRDY Position */
+#define FPU_FPCCR_UFRDY_Msk (1UL << FPU_FPCCR_UFRDY_Pos) /*!< FPCCR: UFRDY bit Mask */
+
+#define FPU_FPCCR_SPLIMVIOL_Pos 9U /*!< FPCCR: SPLIMVIOL Position */
+#define FPU_FPCCR_SPLIMVIOL_Msk (1UL << FPU_FPCCR_SPLIMVIOL_Pos) /*!< FPCCR: SPLIMVIOL bit Mask */
+
+#define FPU_FPCCR_MONRDY_Pos 8U /*!< FPCCR: MONRDY Position */
+#define FPU_FPCCR_MONRDY_Msk (1UL << FPU_FPCCR_MONRDY_Pos) /*!< FPCCR: MONRDY bit Mask */
+
+#define FPU_FPCCR_SFRDY_Pos 7U /*!< FPCCR: SFRDY Position */
+#define FPU_FPCCR_SFRDY_Msk (1UL << FPU_FPCCR_SFRDY_Pos) /*!< FPCCR: SFRDY bit Mask */
+
+#define FPU_FPCCR_BFRDY_Pos 6U /*!< FPCCR: BFRDY Position */
+#define FPU_FPCCR_BFRDY_Msk (1UL << FPU_FPCCR_BFRDY_Pos) /*!< FPCCR: BFRDY bit Mask */
+
+#define FPU_FPCCR_MMRDY_Pos 5U /*!< FPCCR: MMRDY Position */
+#define FPU_FPCCR_MMRDY_Msk (1UL << FPU_FPCCR_MMRDY_Pos) /*!< FPCCR: MMRDY bit Mask */
+
+#define FPU_FPCCR_HFRDY_Pos 4U /*!< FPCCR: HFRDY Position */
+#define FPU_FPCCR_HFRDY_Msk (1UL << FPU_FPCCR_HFRDY_Pos) /*!< FPCCR: HFRDY bit Mask */
+
+#define FPU_FPCCR_THREAD_Pos 3U /*!< FPCCR: processor mode bit Position */
+#define FPU_FPCCR_THREAD_Msk (1UL << FPU_FPCCR_THREAD_Pos) /*!< FPCCR: processor mode active bit Mask */
+
+#define FPU_FPCCR_S_Pos 2U /*!< FPCCR: Security status of the FP context bit Position */
+#define FPU_FPCCR_S_Msk (1UL << FPU_FPCCR_S_Pos) /*!< FPCCR: Security status of the FP context bit Mask */
+
+#define FPU_FPCCR_USER_Pos 1U /*!< FPCCR: privilege level bit Position */
+#define FPU_FPCCR_USER_Msk (1UL << FPU_FPCCR_USER_Pos) /*!< FPCCR: privilege level bit Mask */
+
+#define FPU_FPCCR_LSPACT_Pos 0U /*!< FPCCR: Lazy state preservation active bit Position */
+#define FPU_FPCCR_LSPACT_Msk (1UL /*<< FPU_FPCCR_LSPACT_Pos*/) /*!< FPCCR: Lazy state preservation active bit Mask */
+
+/* Floating-Point Context Address Register Definitions */
+#define FPU_FPCAR_ADDRESS_Pos 3U /*!< FPCAR: ADDRESS bit Position */
+#define FPU_FPCAR_ADDRESS_Msk (0x1FFFFFFFUL << FPU_FPCAR_ADDRESS_Pos) /*!< FPCAR: ADDRESS bit Mask */
+
+/* Floating-Point Default Status Control Register Definitions */
+#define FPU_FPDSCR_AHP_Pos 26U /*!< FPDSCR: AHP bit Position */
+#define FPU_FPDSCR_AHP_Msk (1UL << FPU_FPDSCR_AHP_Pos) /*!< FPDSCR: AHP bit Mask */
+
+#define FPU_FPDSCR_DN_Pos 25U /*!< FPDSCR: DN bit Position */
+#define FPU_FPDSCR_DN_Msk (1UL << FPU_FPDSCR_DN_Pos) /*!< FPDSCR: DN bit Mask */
+
+#define FPU_FPDSCR_FZ_Pos 24U /*!< FPDSCR: FZ bit Position */
+#define FPU_FPDSCR_FZ_Msk (1UL << FPU_FPDSCR_FZ_Pos) /*!< FPDSCR: FZ bit Mask */
+
+#define FPU_FPDSCR_RMode_Pos 22U /*!< FPDSCR: RMode bit Position */
+#define FPU_FPDSCR_RMode_Msk (3UL << FPU_FPDSCR_RMode_Pos) /*!< FPDSCR: RMode bit Mask */
+
+/* Media and FP Feature Register 0 Definitions */
+#define FPU_MVFR0_FP_rounding_modes_Pos 28U /*!< MVFR0: FP rounding modes bits Position */
+#define FPU_MVFR0_FP_rounding_modes_Msk (0xFUL << FPU_MVFR0_FP_rounding_modes_Pos) /*!< MVFR0: FP rounding modes bits Mask */
+
+#define FPU_MVFR0_Short_vectors_Pos 24U /*!< MVFR0: Short vectors bits Position */
+#define FPU_MVFR0_Short_vectors_Msk (0xFUL << FPU_MVFR0_Short_vectors_Pos) /*!< MVFR0: Short vectors bits Mask */
+
+#define FPU_MVFR0_Square_root_Pos 20U /*!< MVFR0: Square root bits Position */
+#define FPU_MVFR0_Square_root_Msk (0xFUL << FPU_MVFR0_Square_root_Pos) /*!< MVFR0: Square root bits Mask */
+
+#define FPU_MVFR0_Divide_Pos 16U /*!< MVFR0: Divide bits Position */
+#define FPU_MVFR0_Divide_Msk (0xFUL << FPU_MVFR0_Divide_Pos) /*!< MVFR0: Divide bits Mask */
+
+#define FPU_MVFR0_FP_excep_trapping_Pos 12U /*!< MVFR0: FP exception trapping bits Position */
+#define FPU_MVFR0_FP_excep_trapping_Msk (0xFUL << FPU_MVFR0_FP_excep_trapping_Pos) /*!< MVFR0: FP exception trapping bits Mask */
+
+#define FPU_MVFR0_Double_precision_Pos 8U /*!< MVFR0: Double-precision bits Position */
+#define FPU_MVFR0_Double_precision_Msk (0xFUL << FPU_MVFR0_Double_precision_Pos) /*!< MVFR0: Double-precision bits Mask */
+
+#define FPU_MVFR0_Single_precision_Pos 4U /*!< MVFR0: Single-precision bits Position */
+#define FPU_MVFR0_Single_precision_Msk (0xFUL << FPU_MVFR0_Single_precision_Pos) /*!< MVFR0: Single-precision bits Mask */
+
+#define FPU_MVFR0_A_SIMD_registers_Pos 0U /*!< MVFR0: A_SIMD registers bits Position */
+#define FPU_MVFR0_A_SIMD_registers_Msk (0xFUL /*<< FPU_MVFR0_A_SIMD_registers_Pos*/) /*!< MVFR0: A_SIMD registers bits Mask */
+
+/* Media and FP Feature Register 1 Definitions */
+#define FPU_MVFR1_FP_fused_MAC_Pos 28U /*!< MVFR1: FP fused MAC bits Position */
+#define FPU_MVFR1_FP_fused_MAC_Msk (0xFUL << FPU_MVFR1_FP_fused_MAC_Pos) /*!< MVFR1: FP fused MAC bits Mask */
+
+#define FPU_MVFR1_FP_HPFP_Pos 24U /*!< MVFR1: FP HPFP bits Position */
+#define FPU_MVFR1_FP_HPFP_Msk (0xFUL << FPU_MVFR1_FP_HPFP_Pos) /*!< MVFR1: FP HPFP bits Mask */
+
+#define FPU_MVFR1_D_NaN_mode_Pos 4U /*!< MVFR1: D_NaN mode bits Position */
+#define FPU_MVFR1_D_NaN_mode_Msk (0xFUL << FPU_MVFR1_D_NaN_mode_Pos) /*!< MVFR1: D_NaN mode bits Mask */
+
+#define FPU_MVFR1_FtZ_mode_Pos 0U /*!< MVFR1: FtZ mode bits Position */
+#define FPU_MVFR1_FtZ_mode_Msk (0xFUL /*<< FPU_MVFR1_FtZ_mode_Pos*/) /*!< MVFR1: FtZ mode bits Mask */
+
+/*@} end of group CMSIS_FPU */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_CoreDebug Core Debug Registers (CoreDebug)
+ \brief Type definitions for the Core Debug Registers
+ @{
+ */
+
+/**
+ \brief Structure type to access the Core Debug Register (CoreDebug).
+ */
+typedef struct
+{
+ __IOM uint32_t DHCSR; /*!< Offset: 0x000 (R/W) Debug Halting Control and Status Register */
+ __OM uint32_t DCRSR; /*!< Offset: 0x004 ( /W) Debug Core Register Selector Register */
+ __IOM uint32_t DCRDR; /*!< Offset: 0x008 (R/W) Debug Core Register Data Register */
+ __IOM uint32_t DEMCR; /*!< Offset: 0x00C (R/W) Debug Exception and Monitor Control Register */
+ uint32_t RESERVED4[1U];
+ __IOM uint32_t DAUTHCTRL; /*!< Offset: 0x014 (R/W) Debug Authentication Control Register */
+ __IOM uint32_t DSCSR; /*!< Offset: 0x018 (R/W) Debug Security Control and Status Register */
+} CoreDebug_Type;
+
+/* Debug Halting Control and Status Register Definitions */
+#define CoreDebug_DHCSR_DBGKEY_Pos 16U /*!< CoreDebug DHCSR: DBGKEY Position */
+#define CoreDebug_DHCSR_DBGKEY_Msk (0xFFFFUL << CoreDebug_DHCSR_DBGKEY_Pos) /*!< CoreDebug DHCSR: DBGKEY Mask */
+
+#define CoreDebug_DHCSR_S_RESTART_ST_Pos 26U /*!< CoreDebug DHCSR: S_RESTART_ST Position */
+#define CoreDebug_DHCSR_S_RESTART_ST_Msk (1UL << CoreDebug_DHCSR_S_RESTART_ST_Pos) /*!< CoreDebug DHCSR: S_RESTART_ST Mask */
+
+#define CoreDebug_DHCSR_S_RESET_ST_Pos 25U /*!< CoreDebug DHCSR: S_RESET_ST Position */
+#define CoreDebug_DHCSR_S_RESET_ST_Msk (1UL << CoreDebug_DHCSR_S_RESET_ST_Pos) /*!< CoreDebug DHCSR: S_RESET_ST Mask */
+
+#define CoreDebug_DHCSR_S_RETIRE_ST_Pos 24U /*!< CoreDebug DHCSR: S_RETIRE_ST Position */
+#define CoreDebug_DHCSR_S_RETIRE_ST_Msk (1UL << CoreDebug_DHCSR_S_RETIRE_ST_Pos) /*!< CoreDebug DHCSR: S_RETIRE_ST Mask */
+
+#define CoreDebug_DHCSR_S_LOCKUP_Pos 19U /*!< CoreDebug DHCSR: S_LOCKUP Position */
+#define CoreDebug_DHCSR_S_LOCKUP_Msk (1UL << CoreDebug_DHCSR_S_LOCKUP_Pos) /*!< CoreDebug DHCSR: S_LOCKUP Mask */
+
+#define CoreDebug_DHCSR_S_SLEEP_Pos 18U /*!< CoreDebug DHCSR: S_SLEEP Position */
+#define CoreDebug_DHCSR_S_SLEEP_Msk (1UL << CoreDebug_DHCSR_S_SLEEP_Pos) /*!< CoreDebug DHCSR: S_SLEEP Mask */
+
+#define CoreDebug_DHCSR_S_HALT_Pos 17U /*!< CoreDebug DHCSR: S_HALT Position */
+#define CoreDebug_DHCSR_S_HALT_Msk (1UL << CoreDebug_DHCSR_S_HALT_Pos) /*!< CoreDebug DHCSR: S_HALT Mask */
+
+#define CoreDebug_DHCSR_S_REGRDY_Pos 16U /*!< CoreDebug DHCSR: S_REGRDY Position */
+#define CoreDebug_DHCSR_S_REGRDY_Msk (1UL << CoreDebug_DHCSR_S_REGRDY_Pos) /*!< CoreDebug DHCSR: S_REGRDY Mask */
+
+#define CoreDebug_DHCSR_C_SNAPSTALL_Pos 5U /*!< CoreDebug DHCSR: C_SNAPSTALL Position */
+#define CoreDebug_DHCSR_C_SNAPSTALL_Msk (1UL << CoreDebug_DHCSR_C_SNAPSTALL_Pos) /*!< CoreDebug DHCSR: C_SNAPSTALL Mask */
+
+#define CoreDebug_DHCSR_C_MASKINTS_Pos 3U /*!< CoreDebug DHCSR: C_MASKINTS Position */
+#define CoreDebug_DHCSR_C_MASKINTS_Msk (1UL << CoreDebug_DHCSR_C_MASKINTS_Pos) /*!< CoreDebug DHCSR: C_MASKINTS Mask */
+
+#define CoreDebug_DHCSR_C_STEP_Pos 2U /*!< CoreDebug DHCSR: C_STEP Position */
+#define CoreDebug_DHCSR_C_STEP_Msk (1UL << CoreDebug_DHCSR_C_STEP_Pos) /*!< CoreDebug DHCSR: C_STEP Mask */
+
+#define CoreDebug_DHCSR_C_HALT_Pos 1U /*!< CoreDebug DHCSR: C_HALT Position */
+#define CoreDebug_DHCSR_C_HALT_Msk (1UL << CoreDebug_DHCSR_C_HALT_Pos) /*!< CoreDebug DHCSR: C_HALT Mask */
+
+#define CoreDebug_DHCSR_C_DEBUGEN_Pos 0U /*!< CoreDebug DHCSR: C_DEBUGEN Position */
+#define CoreDebug_DHCSR_C_DEBUGEN_Msk (1UL /*<< CoreDebug_DHCSR_C_DEBUGEN_Pos*/) /*!< CoreDebug DHCSR: C_DEBUGEN Mask */
+
+/* Debug Core Register Selector Register Definitions */
+#define CoreDebug_DCRSR_REGWnR_Pos 16U /*!< CoreDebug DCRSR: REGWnR Position */
+#define CoreDebug_DCRSR_REGWnR_Msk (1UL << CoreDebug_DCRSR_REGWnR_Pos) /*!< CoreDebug DCRSR: REGWnR Mask */
+
+#define CoreDebug_DCRSR_REGSEL_Pos 0U /*!< CoreDebug DCRSR: REGSEL Position */
+#define CoreDebug_DCRSR_REGSEL_Msk (0x1FUL /*<< CoreDebug_DCRSR_REGSEL_Pos*/) /*!< CoreDebug DCRSR: REGSEL Mask */
+
+/* Debug Exception and Monitor Control Register Definitions */
+#define CoreDebug_DEMCR_TRCENA_Pos 24U /*!< CoreDebug DEMCR: TRCENA Position */
+#define CoreDebug_DEMCR_TRCENA_Msk (1UL << CoreDebug_DEMCR_TRCENA_Pos) /*!< CoreDebug DEMCR: TRCENA Mask */
+
+#define CoreDebug_DEMCR_MON_REQ_Pos 19U /*!< CoreDebug DEMCR: MON_REQ Position */
+#define CoreDebug_DEMCR_MON_REQ_Msk (1UL << CoreDebug_DEMCR_MON_REQ_Pos) /*!< CoreDebug DEMCR: MON_REQ Mask */
+
+#define CoreDebug_DEMCR_MON_STEP_Pos 18U /*!< CoreDebug DEMCR: MON_STEP Position */
+#define CoreDebug_DEMCR_MON_STEP_Msk (1UL << CoreDebug_DEMCR_MON_STEP_Pos) /*!< CoreDebug DEMCR: MON_STEP Mask */
+
+#define CoreDebug_DEMCR_MON_PEND_Pos 17U /*!< CoreDebug DEMCR: MON_PEND Position */
+#define CoreDebug_DEMCR_MON_PEND_Msk (1UL << CoreDebug_DEMCR_MON_PEND_Pos) /*!< CoreDebug DEMCR: MON_PEND Mask */
+
+#define CoreDebug_DEMCR_MON_EN_Pos 16U /*!< CoreDebug DEMCR: MON_EN Position */
+#define CoreDebug_DEMCR_MON_EN_Msk (1UL << CoreDebug_DEMCR_MON_EN_Pos) /*!< CoreDebug DEMCR: MON_EN Mask */
+
+#define CoreDebug_DEMCR_VC_HARDERR_Pos 10U /*!< CoreDebug DEMCR: VC_HARDERR Position */
+#define CoreDebug_DEMCR_VC_HARDERR_Msk (1UL << CoreDebug_DEMCR_VC_HARDERR_Pos) /*!< CoreDebug DEMCR: VC_HARDERR Mask */
+
+#define CoreDebug_DEMCR_VC_INTERR_Pos 9U /*!< CoreDebug DEMCR: VC_INTERR Position */
+#define CoreDebug_DEMCR_VC_INTERR_Msk (1UL << CoreDebug_DEMCR_VC_INTERR_Pos) /*!< CoreDebug DEMCR: VC_INTERR Mask */
+
+#define CoreDebug_DEMCR_VC_BUSERR_Pos 8U /*!< CoreDebug DEMCR: VC_BUSERR Position */
+#define CoreDebug_DEMCR_VC_BUSERR_Msk (1UL << CoreDebug_DEMCR_VC_BUSERR_Pos) /*!< CoreDebug DEMCR: VC_BUSERR Mask */
+
+#define CoreDebug_DEMCR_VC_STATERR_Pos 7U /*!< CoreDebug DEMCR: VC_STATERR Position */
+#define CoreDebug_DEMCR_VC_STATERR_Msk (1UL << CoreDebug_DEMCR_VC_STATERR_Pos) /*!< CoreDebug DEMCR: VC_STATERR Mask */
+
+#define CoreDebug_DEMCR_VC_CHKERR_Pos 6U /*!< CoreDebug DEMCR: VC_CHKERR Position */
+#define CoreDebug_DEMCR_VC_CHKERR_Msk (1UL << CoreDebug_DEMCR_VC_CHKERR_Pos) /*!< CoreDebug DEMCR: VC_CHKERR Mask */
+
+#define CoreDebug_DEMCR_VC_NOCPERR_Pos 5U /*!< CoreDebug DEMCR: VC_NOCPERR Position */
+#define CoreDebug_DEMCR_VC_NOCPERR_Msk (1UL << CoreDebug_DEMCR_VC_NOCPERR_Pos) /*!< CoreDebug DEMCR: VC_NOCPERR Mask */
+
+#define CoreDebug_DEMCR_VC_MMERR_Pos 4U /*!< CoreDebug DEMCR: VC_MMERR Position */
+#define CoreDebug_DEMCR_VC_MMERR_Msk (1UL << CoreDebug_DEMCR_VC_MMERR_Pos) /*!< CoreDebug DEMCR: VC_MMERR Mask */
+
+#define CoreDebug_DEMCR_VC_CORERESET_Pos 0U /*!< CoreDebug DEMCR: VC_CORERESET Position */
+#define CoreDebug_DEMCR_VC_CORERESET_Msk (1UL /*<< CoreDebug_DEMCR_VC_CORERESET_Pos*/) /*!< CoreDebug DEMCR: VC_CORERESET Mask */
+
+/* Debug Authentication Control Register Definitions */
+#define CoreDebug_DAUTHCTRL_INTSPNIDEN_Pos 3U /*!< CoreDebug DAUTHCTRL: INTSPNIDEN, Position */
+#define CoreDebug_DAUTHCTRL_INTSPNIDEN_Msk (1UL << CoreDebug_DAUTHCTRL_INTSPNIDEN_Pos) /*!< CoreDebug DAUTHCTRL: INTSPNIDEN, Mask */
+
+#define CoreDebug_DAUTHCTRL_SPNIDENSEL_Pos 2U /*!< CoreDebug DAUTHCTRL: SPNIDENSEL Position */
+#define CoreDebug_DAUTHCTRL_SPNIDENSEL_Msk (1UL << CoreDebug_DAUTHCTRL_SPNIDENSEL_Pos) /*!< CoreDebug DAUTHCTRL: SPNIDENSEL Mask */
+
+#define CoreDebug_DAUTHCTRL_INTSPIDEN_Pos 1U /*!< CoreDebug DAUTHCTRL: INTSPIDEN Position */
+#define CoreDebug_DAUTHCTRL_INTSPIDEN_Msk (1UL << CoreDebug_DAUTHCTRL_INTSPIDEN_Pos) /*!< CoreDebug DAUTHCTRL: INTSPIDEN Mask */
+
+#define CoreDebug_DAUTHCTRL_SPIDENSEL_Pos 0U /*!< CoreDebug DAUTHCTRL: SPIDENSEL Position */
+#define CoreDebug_DAUTHCTRL_SPIDENSEL_Msk (1UL /*<< CoreDebug_DAUTHCTRL_SPIDENSEL_Pos*/) /*!< CoreDebug DAUTHCTRL: SPIDENSEL Mask */
+
+/* Debug Security Control and Status Register Definitions */
+#define CoreDebug_DSCSR_CDS_Pos 16U /*!< CoreDebug DSCSR: CDS Position */
+#define CoreDebug_DSCSR_CDS_Msk (1UL << CoreDebug_DSCSR_CDS_Pos) /*!< CoreDebug DSCSR: CDS Mask */
+
+#define CoreDebug_DSCSR_SBRSEL_Pos 1U /*!< CoreDebug DSCSR: SBRSEL Position */
+#define CoreDebug_DSCSR_SBRSEL_Msk (1UL << CoreDebug_DSCSR_SBRSEL_Pos) /*!< CoreDebug DSCSR: SBRSEL Mask */
+
+#define CoreDebug_DSCSR_SBRSELEN_Pos 0U /*!< CoreDebug DSCSR: SBRSELEN Position */
+#define CoreDebug_DSCSR_SBRSELEN_Msk (1UL /*<< CoreDebug_DSCSR_SBRSELEN_Pos*/) /*!< CoreDebug DSCSR: SBRSELEN Mask */
+
+/*@} end of group CMSIS_CoreDebug */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_core_bitfield Core register bit field macros
+ \brief Macros for use with bit field definitions (xxx_Pos, xxx_Msk).
+ @{
+ */
+
+/**
+ \brief Mask and shift a bit field value for use in a register bit range.
+ \param[in] field Name of the register bit field.
+ \param[in] value Value of the bit field. This parameter is interpreted as an uint32_t type.
+ \return Masked and shifted value.
+*/
+#define _VAL2FLD(field, value) (((uint32_t)(value) << field ## _Pos) & field ## _Msk)
+
+/**
+ \brief Mask and shift a register value to extract a bit filed value.
+ \param[in] field Name of the register bit field.
+ \param[in] value Value of register. This parameter is interpreted as an uint32_t type.
+ \return Masked and shifted bit field value.
+*/
+#define _FLD2VAL(field, value) (((uint32_t)(value) & field ## _Msk) >> field ## _Pos)
+
+/*@} end of group CMSIS_core_bitfield */
+
+
+/**
+ \ingroup CMSIS_core_register
+ \defgroup CMSIS_core_base Core Definitions
+ \brief Definitions for base addresses, unions, and structures.
+ @{
+ */
+
+/* Memory mapping of Core Hardware */
+ #define SCS_BASE (0xE000E000UL) /*!< System Control Space Base Address */
+ #define ITM_BASE (0xE0000000UL) /*!< ITM Base Address */
+ #define DWT_BASE (0xE0001000UL) /*!< DWT Base Address */
+ #define TPI_BASE (0xE0040000UL) /*!< TPI Base Address */
+ #define CoreDebug_BASE (0xE000EDF0UL) /*!< Core Debug Base Address */
+ #define SysTick_BASE (SCS_BASE + 0x0010UL) /*!< SysTick Base Address */
+ #define NVIC_BASE (SCS_BASE + 0x0100UL) /*!< NVIC Base Address */
+ #define SCB_BASE (SCS_BASE + 0x0D00UL) /*!< System Control Block Base Address */
+
+ #define SCnSCB ((SCnSCB_Type *) SCS_BASE ) /*!< System control Register not in SCB */
+ #define SCB ((SCB_Type *) SCB_BASE ) /*!< SCB configuration struct */
+ #define SysTick ((SysTick_Type *) SysTick_BASE ) /*!< SysTick configuration struct */
+ #define NVIC ((NVIC_Type *) NVIC_BASE ) /*!< NVIC configuration struct */
+ #define ITM ((ITM_Type *) ITM_BASE ) /*!< ITM configuration struct */
+ #define DWT ((DWT_Type *) DWT_BASE ) /*!< DWT configuration struct */
+ #define TPI ((TPI_Type *) TPI_BASE ) /*!< TPI configuration struct */
+ #define CoreDebug ((CoreDebug_Type *) CoreDebug_BASE ) /*!< Core Debug configuration struct */
+
+ #if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
+ #define MPU_BASE (SCS_BASE + 0x0D90UL) /*!< Memory Protection Unit */
+ #define MPU ((MPU_Type *) MPU_BASE ) /*!< Memory Protection Unit */
+ #endif
+
+ #if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U)
+ #define SAU_BASE (SCS_BASE + 0x0DD0UL) /*!< Security Attribution Unit */
+ #define SAU ((SAU_Type *) SAU_BASE ) /*!< Security Attribution Unit */
+ #endif
+
+ #define FPU_BASE (SCS_BASE + 0x0F30UL) /*!< Floating Point Unit */
+ #define FPU ((FPU_Type *) FPU_BASE ) /*!< Floating Point Unit */
+
+#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U)
+ #define SCS_BASE_NS (0xE002E000UL) /*!< System Control Space Base Address (non-secure address space) */
+ #define CoreDebug_BASE_NS (0xE002EDF0UL) /*!< Core Debug Base Address (non-secure address space) */
+ #define SysTick_BASE_NS (SCS_BASE_NS + 0x0010UL) /*!< SysTick Base Address (non-secure address space) */
+ #define NVIC_BASE_NS (SCS_BASE_NS + 0x0100UL) /*!< NVIC Base Address (non-secure address space) */
+ #define SCB_BASE_NS (SCS_BASE_NS + 0x0D00UL) /*!< System Control Block Base Address (non-secure address space) */
+
+ #define SCnSCB_NS ((SCnSCB_Type *) SCS_BASE_NS ) /*!< System control Register not in SCB(non-secure address space) */
+ #define SCB_NS ((SCB_Type *) SCB_BASE_NS ) /*!< SCB configuration struct (non-secure address space) */
+ #define SysTick_NS ((SysTick_Type *) SysTick_BASE_NS ) /*!< SysTick configuration struct (non-secure address space) */
+ #define NVIC_NS ((NVIC_Type *) NVIC_BASE_NS ) /*!< NVIC configuration struct (non-secure address space) */
+ #define CoreDebug_NS ((CoreDebug_Type *) CoreDebug_BASE_NS) /*!< Core Debug configuration struct (non-secure address space) */
+
+ #if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
+ #define MPU_BASE_NS (SCS_BASE_NS + 0x0D90UL) /*!< Memory Protection Unit (non-secure address space) */
+ #define MPU_NS ((MPU_Type *) MPU_BASE_NS ) /*!< Memory Protection Unit (non-secure address space) */
+ #endif
+
+ #define FPU_BASE_NS (SCS_BASE_NS + 0x0F30UL) /*!< Floating Point Unit (non-secure address space) */
+ #define FPU_NS ((FPU_Type *) FPU_BASE_NS ) /*!< Floating Point Unit (non-secure address space) */
+
+#endif /* defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) */
+/*@} */
+
+
+
+/*******************************************************************************
+ * Hardware Abstraction Layer
+ Core Function Interface contains:
+ - Core NVIC Functions
+ - Core SysTick Functions
+ - Core Debug Functions
+ - Core Register Access Functions
+ ******************************************************************************/
+/**
+ \defgroup CMSIS_Core_FunctionInterface Functions and Instructions Reference
+*/
+
+
+
+/* ########################## NVIC functions #################################### */
+/**
+ \ingroup CMSIS_Core_FunctionInterface
+ \defgroup CMSIS_Core_NVICFunctions NVIC Functions
+ \brief Functions that manage interrupts and exceptions via the NVIC.
+ @{
+ */
+
+#ifdef CMSIS_NVIC_VIRTUAL
+ #ifndef CMSIS_NVIC_VIRTUAL_HEADER_FILE
+ #define CMSIS_NVIC_VIRTUAL_HEADER_FILE "cmsis_nvic_virtual.h"
+ #endif
+ #include CMSIS_NVIC_VIRTUAL_HEADER_FILE
+#else
+ #define NVIC_SetPriorityGrouping __NVIC_SetPriorityGrouping
+ #define NVIC_GetPriorityGrouping __NVIC_GetPriorityGrouping
+ #define NVIC_EnableIRQ __NVIC_EnableIRQ
+ #define NVIC_GetEnableIRQ __NVIC_GetEnableIRQ
+ #define NVIC_DisableIRQ __NVIC_DisableIRQ
+ #define NVIC_GetPendingIRQ __NVIC_GetPendingIRQ
+ #define NVIC_SetPendingIRQ __NVIC_SetPendingIRQ
+ #define NVIC_ClearPendingIRQ __NVIC_ClearPendingIRQ
+ #define NVIC_GetActive __NVIC_GetActive
+ #define NVIC_SetPriority __NVIC_SetPriority
+ #define NVIC_GetPriority __NVIC_GetPriority
+ #define NVIC_SystemReset __NVIC_SystemReset
+#endif /* CMSIS_NVIC_VIRTUAL */
+
+#ifdef CMSIS_VECTAB_VIRTUAL
+ #ifndef CMSIS_VECTAB_VIRTUAL_HEADER_FILE
+ #define CMSIS_VECTAB_VIRTUAL_HEADER_FILE "cmsis_vectab_virtual.h"
+ #endif
+ #include CMSIS_VECTAB_VIRTUAL_HEADER_FILE
+#else
+ #define NVIC_SetVector __NVIC_SetVector
+ #define NVIC_GetVector __NVIC_GetVector
+#endif /* (CMSIS_VECTAB_VIRTUAL) */
+
+#define NVIC_USER_IRQ_OFFSET 16
+
+
+/* Special LR values for Secure/Non-Secure call handling and exception handling */
+
+/* Function Return Payload (from ARMv8-M Architecture Reference Manual) LR value on entry from Secure BLXNS */
+#define FNC_RETURN (0xFEFFFFFFUL) /* bit [0] ignored when processing a branch */
+
+/* The following EXC_RETURN mask values are used to evaluate the LR on exception entry */
+#define EXC_RETURN_PREFIX (0xFF000000UL) /* bits [31:24] set to indicate an EXC_RETURN value */
+#define EXC_RETURN_S (0x00000040UL) /* bit [6] stack used to push registers: 0=Non-secure 1=Secure */
+#define EXC_RETURN_DCRS (0x00000020UL) /* bit [5] stacking rules for called registers: 0=skipped 1=saved */
+#define EXC_RETURN_FTYPE (0x00000010UL) /* bit [4] allocate stack for floating-point context: 0=done 1=skipped */
+#define EXC_RETURN_MODE (0x00000008UL) /* bit [3] processor mode for return: 0=Handler mode 1=Thread mode */
+#define EXC_RETURN_SPSEL (0x00000004UL) /* bit [2] stack pointer used to restore context: 0=MSP 1=PSP */
+#define EXC_RETURN_ES (0x00000001UL) /* bit [0] security state exception was taken to: 0=Non-secure 1=Secure */
+
+/* Integrity Signature (from ARMv8-M Architecture Reference Manual) for exception context stacking */
+#if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U) /* Value for processors with floating-point extension: */
+#define EXC_INTEGRITY_SIGNATURE (0xFEFA125AUL) /* bit [0] SFTC must match LR bit[4] EXC_RETURN_FTYPE */
+#else
+#define EXC_INTEGRITY_SIGNATURE (0xFEFA125BUL) /* Value for processors without floating-point extension */
+#endif
+
+
+/**
+ \brief Set Priority Grouping
+ \details Sets the priority grouping field using the required unlock sequence.
+ The parameter PriorityGroup is assigned to the field SCB->AIRCR [10:8] PRIGROUP field.
+ Only values from 0..7 are used.
+ In case of a conflict between priority grouping and available
+ priority bits (__NVIC_PRIO_BITS), the smallest possible priority group is set.
+ \param [in] PriorityGroup Priority grouping field.
+ */
+__STATIC_INLINE void __NVIC_SetPriorityGrouping(uint32_t PriorityGroup)
+{
+ uint32_t reg_value;
+ uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL); /* only values 0..7 are used */
+
+ reg_value = SCB->AIRCR; /* read old register configuration */
+ reg_value &= ~((uint32_t)(SCB_AIRCR_VECTKEY_Msk | SCB_AIRCR_PRIGROUP_Msk)); /* clear bits to change */
+ reg_value = (reg_value |
+ ((uint32_t)0x5FAUL << SCB_AIRCR_VECTKEY_Pos) |
+ (PriorityGroupTmp << SCB_AIRCR_PRIGROUP_Pos) ); /* Insert write key and priority group */
+ SCB->AIRCR = reg_value;
+}
+
+
+/**
+ \brief Get Priority Grouping
+ \details Reads the priority grouping field from the NVIC Interrupt Controller.
+ \return Priority grouping field (SCB->AIRCR [10:8] PRIGROUP field).
+ */
+__STATIC_INLINE uint32_t __NVIC_GetPriorityGrouping(void)
+{
+ return ((uint32_t)((SCB->AIRCR & SCB_AIRCR_PRIGROUP_Msk) >> SCB_AIRCR_PRIGROUP_Pos));
+}
+
+
+/**
+ \brief Enable Interrupt
+ \details Enables a device specific interrupt in the NVIC interrupt controller.
+ \param [in] IRQn Device specific interrupt number.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ __COMPILER_BARRIER();
+ NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ __COMPILER_BARRIER();
+ }
+}
+
+
+/**
+ \brief Get Interrupt Enable status
+ \details Returns a device specific interrupt enable status from the NVIC interrupt controller.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 Interrupt is not enabled.
+ \return 1 Interrupt is enabled.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t __NVIC_GetEnableIRQ(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return((uint32_t)(((NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+
+
+/**
+ \brief Disable Interrupt
+ \details Disables a device specific interrupt in the NVIC interrupt controller.
+ \param [in] IRQn Device specific interrupt number.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE void __NVIC_DisableIRQ(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC->ICER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ __DSB();
+ __ISB();
+ }
+}
+
+
+/**
+ \brief Get Pending Interrupt
+ \details Reads the NVIC pending register and returns the pending bit for the specified device specific interrupt.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 Interrupt status is not pending.
+ \return 1 Interrupt status is pending.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t __NVIC_GetPendingIRQ(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return((uint32_t)(((NVIC->ISPR[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+
+
+/**
+ \brief Set Pending Interrupt
+ \details Sets the pending bit of a device specific interrupt in the NVIC pending register.
+ \param [in] IRQn Device specific interrupt number.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE void __NVIC_SetPendingIRQ(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC->ISPR[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ }
+}
+
+
+/**
+ \brief Clear Pending Interrupt
+ \details Clears the pending bit of a device specific interrupt in the NVIC pending register.
+ \param [in] IRQn Device specific interrupt number.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE void __NVIC_ClearPendingIRQ(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC->ICPR[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ }
+}
+
+
+/**
+ \brief Get Active Interrupt
+ \details Reads the active register in the NVIC and returns the active bit for the device specific interrupt.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 Interrupt status is not active.
+ \return 1 Interrupt status is active.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t __NVIC_GetActive(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return((uint32_t)(((NVIC->IABR[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+
+
+#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U)
+/**
+ \brief Get Interrupt Target State
+ \details Reads the interrupt target field in the NVIC and returns the interrupt target bit for the device specific interrupt.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 if interrupt is assigned to Secure
+ \return 1 if interrupt is assigned to Non Secure
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t NVIC_GetTargetState(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return((uint32_t)(((NVIC->ITNS[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+
+
+/**
+ \brief Set Interrupt Target State
+ \details Sets the interrupt target field in the NVIC and returns the interrupt target bit for the device specific interrupt.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 if interrupt is assigned to Secure
+ 1 if interrupt is assigned to Non Secure
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t NVIC_SetTargetState(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC->ITNS[(((uint32_t)IRQn) >> 5UL)] |= ((uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL)));
+ return((uint32_t)(((NVIC->ITNS[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+
+
+/**
+ \brief Clear Interrupt Target State
+ \details Clears the interrupt target field in the NVIC and returns the interrupt target bit for the device specific interrupt.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 if interrupt is assigned to Secure
+ 1 if interrupt is assigned to Non Secure
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t NVIC_ClearTargetState(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC->ITNS[(((uint32_t)IRQn) >> 5UL)] &= ~((uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL)));
+ return((uint32_t)(((NVIC->ITNS[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+#endif /* defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) */
+
+
+/**
+ \brief Set Interrupt Priority
+ \details Sets the priority of a device specific interrupt or a processor exception.
+ The interrupt number can be positive to specify a device specific interrupt,
+ or negative to specify a processor exception.
+ \param [in] IRQn Interrupt number.
+ \param [in] priority Priority to set.
+ \note The priority cannot be set for every processor exception.
+ */
+__STATIC_INLINE void __NVIC_SetPriority(IRQn_Type IRQn, uint32_t priority)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC->IPR[((uint32_t)IRQn)] = (uint8_t)((priority << (8U - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL);
+ }
+ else
+ {
+ SCB->SHPR[(((uint32_t)IRQn) & 0xFUL)-4UL] = (uint8_t)((priority << (8U - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL);
+ }
+}
+
+
+/**
+ \brief Get Interrupt Priority
+ \details Reads the priority of a device specific interrupt or a processor exception.
+ The interrupt number can be positive to specify a device specific interrupt,
+ or negative to specify a processor exception.
+ \param [in] IRQn Interrupt number.
+ \return Interrupt Priority.
+ Value is aligned automatically to the implemented priority bits of the microcontroller.
+ */
+__STATIC_INLINE uint32_t __NVIC_GetPriority(IRQn_Type IRQn)
+{
+
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return(((uint32_t)NVIC->IPR[((uint32_t)IRQn)] >> (8U - __NVIC_PRIO_BITS)));
+ }
+ else
+ {
+ return(((uint32_t)SCB->SHPR[(((uint32_t)IRQn) & 0xFUL)-4UL] >> (8U - __NVIC_PRIO_BITS)));
+ }
+}
+
+
+/**
+ \brief Encode Priority
+ \details Encodes the priority for an interrupt with the given priority group,
+ preemptive priority value, and subpriority value.
+ In case of a conflict between priority grouping and available
+ priority bits (__NVIC_PRIO_BITS), the smallest possible priority group is set.
+ \param [in] PriorityGroup Used priority group.
+ \param [in] PreemptPriority Preemptive priority value (starting from 0).
+ \param [in] SubPriority Subpriority value (starting from 0).
+ \return Encoded priority. Value can be used in the function \ref NVIC_SetPriority().
+ */
+__STATIC_INLINE uint32_t NVIC_EncodePriority (uint32_t PriorityGroup, uint32_t PreemptPriority, uint32_t SubPriority)
+{
+ uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL); /* only values 0..7 are used */
+ uint32_t PreemptPriorityBits;
+ uint32_t SubPriorityBits;
+
+ PreemptPriorityBits = ((7UL - PriorityGroupTmp) > (uint32_t)(__NVIC_PRIO_BITS)) ? (uint32_t)(__NVIC_PRIO_BITS) : (uint32_t)(7UL - PriorityGroupTmp);
+ SubPriorityBits = ((PriorityGroupTmp + (uint32_t)(__NVIC_PRIO_BITS)) < (uint32_t)7UL) ? (uint32_t)0UL : (uint32_t)((PriorityGroupTmp - 7UL) + (uint32_t)(__NVIC_PRIO_BITS));
+
+ return (
+ ((PreemptPriority & (uint32_t)((1UL << (PreemptPriorityBits)) - 1UL)) << SubPriorityBits) |
+ ((SubPriority & (uint32_t)((1UL << (SubPriorityBits )) - 1UL)))
+ );
+}
+
+
+/**
+ \brief Decode Priority
+ \details Decodes an interrupt priority value with a given priority group to
+ preemptive priority value and subpriority value.
+ In case of a conflict between priority grouping and available
+ priority bits (__NVIC_PRIO_BITS) the smallest possible priority group is set.
+ \param [in] Priority Priority value, which can be retrieved with the function \ref NVIC_GetPriority().
+ \param [in] PriorityGroup Used priority group.
+ \param [out] pPreemptPriority Preemptive priority value (starting from 0).
+ \param [out] pSubPriority Subpriority value (starting from 0).
+ */
+__STATIC_INLINE void NVIC_DecodePriority (uint32_t Priority, uint32_t PriorityGroup, uint32_t* const pPreemptPriority, uint32_t* const pSubPriority)
+{
+ uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL); /* only values 0..7 are used */
+ uint32_t PreemptPriorityBits;
+ uint32_t SubPriorityBits;
+
+ PreemptPriorityBits = ((7UL - PriorityGroupTmp) > (uint32_t)(__NVIC_PRIO_BITS)) ? (uint32_t)(__NVIC_PRIO_BITS) : (uint32_t)(7UL - PriorityGroupTmp);
+ SubPriorityBits = ((PriorityGroupTmp + (uint32_t)(__NVIC_PRIO_BITS)) < (uint32_t)7UL) ? (uint32_t)0UL : (uint32_t)((PriorityGroupTmp - 7UL) + (uint32_t)(__NVIC_PRIO_BITS));
+
+ *pPreemptPriority = (Priority >> SubPriorityBits) & (uint32_t)((1UL << (PreemptPriorityBits)) - 1UL);
+ *pSubPriority = (Priority ) & (uint32_t)((1UL << (SubPriorityBits )) - 1UL);
+}
+
+
+/**
+ \brief Set Interrupt Vector
+ \details Sets an interrupt vector in SRAM based interrupt vector table.
+ The interrupt number can be positive to specify a device specific interrupt,
+ or negative to specify a processor exception.
+ VTOR must been relocated to SRAM before.
+ \param [in] IRQn Interrupt number
+ \param [in] vector Address of interrupt handler function
+ */
+__STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
+{
+ uint32_t *vectors = (uint32_t *)SCB->VTOR;
+ vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;
+ __DSB();
+}
+
+
+/**
+ \brief Get Interrupt Vector
+ \details Reads an interrupt vector from interrupt vector table.
+ The interrupt number can be positive to specify a device specific interrupt,
+ or negative to specify a processor exception.
+ \param [in] IRQn Interrupt number.
+ \return Address of interrupt handler function
+ */
+__STATIC_INLINE uint32_t __NVIC_GetVector(IRQn_Type IRQn)
+{
+ uint32_t *vectors = (uint32_t *)SCB->VTOR;
+ return vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET];
+}
+
+
+/**
+ \brief System Reset
+ \details Initiates a system reset request to reset the MCU.
+ */
+__NO_RETURN __STATIC_INLINE void __NVIC_SystemReset(void)
+{
+ __DSB(); /* Ensure all outstanding memory accesses included
+ buffered write are completed before reset */
+ SCB->AIRCR = (uint32_t)((0x5FAUL << SCB_AIRCR_VECTKEY_Pos) |
+ (SCB->AIRCR & SCB_AIRCR_PRIGROUP_Msk) |
+ SCB_AIRCR_SYSRESETREQ_Msk ); /* Keep priority group unchanged */
+ __DSB(); /* Ensure completion of memory access */
+
+ for(;;) /* wait until reset */
+ {
+ __NOP();
+ }
+}
+
+#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U)
+/**
+ \brief Set Priority Grouping (non-secure)
+ \details Sets the non-secure priority grouping field when in secure state using the required unlock sequence.
+ The parameter PriorityGroup is assigned to the field SCB->AIRCR [10:8] PRIGROUP field.
+ Only values from 0..7 are used.
+ In case of a conflict between priority grouping and available
+ priority bits (__NVIC_PRIO_BITS), the smallest possible priority group is set.
+ \param [in] PriorityGroup Priority grouping field.
+ */
+__STATIC_INLINE void TZ_NVIC_SetPriorityGrouping_NS(uint32_t PriorityGroup)
+{
+ uint32_t reg_value;
+ uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL); /* only values 0..7 are used */
+
+ reg_value = SCB_NS->AIRCR; /* read old register configuration */
+ reg_value &= ~((uint32_t)(SCB_AIRCR_VECTKEY_Msk | SCB_AIRCR_PRIGROUP_Msk)); /* clear bits to change */
+ reg_value = (reg_value |
+ ((uint32_t)0x5FAUL << SCB_AIRCR_VECTKEY_Pos) |
+ (PriorityGroupTmp << SCB_AIRCR_PRIGROUP_Pos) ); /* Insert write key and priority group */
+ SCB_NS->AIRCR = reg_value;
+}
+
+
+/**
+ \brief Get Priority Grouping (non-secure)
+ \details Reads the priority grouping field from the non-secure NVIC when in secure state.
+ \return Priority grouping field (SCB->AIRCR [10:8] PRIGROUP field).
+ */
+__STATIC_INLINE uint32_t TZ_NVIC_GetPriorityGrouping_NS(void)
+{
+ return ((uint32_t)((SCB_NS->AIRCR & SCB_AIRCR_PRIGROUP_Msk) >> SCB_AIRCR_PRIGROUP_Pos));
+}
+
+
+/**
+ \brief Enable Interrupt (non-secure)
+ \details Enables a device specific interrupt in the non-secure NVIC interrupt controller when in secure state.
+ \param [in] IRQn Device specific interrupt number.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE void TZ_NVIC_EnableIRQ_NS(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC_NS->ISER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ }
+}
+
+
+/**
+ \brief Get Interrupt Enable status (non-secure)
+ \details Returns a device specific interrupt enable status from the non-secure NVIC interrupt controller when in secure state.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 Interrupt is not enabled.
+ \return 1 Interrupt is enabled.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t TZ_NVIC_GetEnableIRQ_NS(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return((uint32_t)(((NVIC_NS->ISER[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+
+
+/**
+ \brief Disable Interrupt (non-secure)
+ \details Disables a device specific interrupt in the non-secure NVIC interrupt controller when in secure state.
+ \param [in] IRQn Device specific interrupt number.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE void TZ_NVIC_DisableIRQ_NS(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC_NS->ICER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ }
+}
+
+
+/**
+ \brief Get Pending Interrupt (non-secure)
+ \details Reads the NVIC pending register in the non-secure NVIC when in secure state and returns the pending bit for the specified device specific interrupt.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 Interrupt status is not pending.
+ \return 1 Interrupt status is pending.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t TZ_NVIC_GetPendingIRQ_NS(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return((uint32_t)(((NVIC_NS->ISPR[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+
+
+/**
+ \brief Set Pending Interrupt (non-secure)
+ \details Sets the pending bit of a device specific interrupt in the non-secure NVIC pending register when in secure state.
+ \param [in] IRQn Device specific interrupt number.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE void TZ_NVIC_SetPendingIRQ_NS(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC_NS->ISPR[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ }
+}
+
+
+/**
+ \brief Clear Pending Interrupt (non-secure)
+ \details Clears the pending bit of a device specific interrupt in the non-secure NVIC pending register when in secure state.
+ \param [in] IRQn Device specific interrupt number.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE void TZ_NVIC_ClearPendingIRQ_NS(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC_NS->ICPR[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ }
+}
+
+
+/**
+ \brief Get Active Interrupt (non-secure)
+ \details Reads the active register in non-secure NVIC when in secure state and returns the active bit for the device specific interrupt.
+ \param [in] IRQn Device specific interrupt number.
+ \return 0 Interrupt status is not active.
+ \return 1 Interrupt status is active.
+ \note IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t TZ_NVIC_GetActive_NS(IRQn_Type IRQn)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return((uint32_t)(((NVIC_NS->IABR[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL));
+ }
+ else
+ {
+ return(0U);
+ }
+}
+
+
+/**
+ \brief Set Interrupt Priority (non-secure)
+ \details Sets the priority of a non-secure device specific interrupt or a non-secure processor exception when in secure state.
+ The interrupt number can be positive to specify a device specific interrupt,
+ or negative to specify a processor exception.
+ \param [in] IRQn Interrupt number.
+ \param [in] priority Priority to set.
+ \note The priority cannot be set for every non-secure processor exception.
+ */
+__STATIC_INLINE void TZ_NVIC_SetPriority_NS(IRQn_Type IRQn, uint32_t priority)
+{
+ if ((int32_t)(IRQn) >= 0)
+ {
+ NVIC_NS->IPR[((uint32_t)IRQn)] = (uint8_t)((priority << (8U - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL);
+ }
+ else
+ {
+ SCB_NS->SHPR[(((uint32_t)IRQn) & 0xFUL)-4UL] = (uint8_t)((priority << (8U - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL);
+ }
+}
+
+
+/**
+ \brief Get Interrupt Priority (non-secure)
+ \details Reads the priority of a non-secure device specific interrupt or a non-secure processor exception when in secure state.
+ The interrupt number can be positive to specify a device specific interrupt,
+ or negative to specify a processor exception.
+ \param [in] IRQn Interrupt number.
+ \return Interrupt Priority. Value is aligned automatically to the implemented priority bits of the microcontroller.
+ */
+__STATIC_INLINE uint32_t TZ_NVIC_GetPriority_NS(IRQn_Type IRQn)
+{
+
+ if ((int32_t)(IRQn) >= 0)
+ {
+ return(((uint32_t)NVIC_NS->IPR[((uint32_t)IRQn)] >> (8U - __NVIC_PRIO_BITS)));
+ }
+ else
+ {
+ return(((uint32_t)SCB_NS->SHPR[(((uint32_t)IRQn) & 0xFUL)-4UL] >> (8U - __NVIC_PRIO_BITS)));
+ }
+}
+#endif /* defined (__ARM_FEATURE_CMSE) &&(__ARM_FEATURE_CMSE == 3U) */
+
+/*@} end of CMSIS_Core_NVICFunctions */
+
+/* ########################## MPU functions #################################### */
+
+#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
+
+#include "mpu_armv8.h"
+
+#endif
+
+/* ########################## FPU functions #################################### */
+/**
+ \ingroup CMSIS_Core_FunctionInterface
+ \defgroup CMSIS_Core_FpuFunctions FPU Functions
+ \brief Function that provides FPU type.
+ @{
+ */
+
+/**
+ \brief get FPU type
+ \details returns the FPU type
+ \returns
+ - \b 0: No FPU
+ - \b 1: Single precision FPU
+ - \b 2: Double + Single precision FPU
+ */
+__STATIC_INLINE uint32_t SCB_GetFPUType(void)
+{
+ uint32_t mvfr0;
+
+ mvfr0 = FPU->MVFR0;
+ if ((mvfr0 & (FPU_MVFR0_Single_precision_Msk | FPU_MVFR0_Double_precision_Msk)) == 0x220U)
+ {
+ return 2U; /* Double + Single precision FPU */
+ }
+ else if ((mvfr0 & (FPU_MVFR0_Single_precision_Msk | FPU_MVFR0_Double_precision_Msk)) == 0x020U)
+ {
+ return 1U; /* Single precision FPU */
+ }
+ else
+ {
+ return 0U; /* No FPU */
+ }
+}
+
+
+/*@} end of CMSIS_Core_FpuFunctions */
+
+
+
+/* ########################## SAU functions #################################### */
+/**
+ \ingroup CMSIS_Core_FunctionInterface
+ \defgroup CMSIS_Core_SAUFunctions SAU Functions
+ \brief Functions that configure the SAU.
+ @{
+ */
+
+#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U)
+
+/**
+ \brief Enable SAU
+ \details Enables the Security Attribution Unit (SAU).
+ */
+__STATIC_INLINE void TZ_SAU_Enable(void)
+{
+ SAU->CTRL |= (SAU_CTRL_ENABLE_Msk);
+}
+
+
+
+/**
+ \brief Disable SAU
+ \details Disables the Security Attribution Unit (SAU).
+ */
+__STATIC_INLINE void TZ_SAU_Disable(void)
+{
+ SAU->CTRL &= ~(SAU_CTRL_ENABLE_Msk);
+}
+
+#endif /* defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) */
+
+/*@} end of CMSIS_Core_SAUFunctions */
+
+
+
+
+/* ################################## SysTick function ############################################ */
+/**
+ \ingroup CMSIS_Core_FunctionInterface
+ \defgroup CMSIS_Core_SysTickFunctions SysTick Functions
+ \brief Functions that configure the System.
+ @{
+ */
+
+#if defined (__Vendor_SysTickConfig) && (__Vendor_SysTickConfig == 0U)
+
+/**
+ \brief System Tick Configuration
+ \details Initializes the System Timer and its interrupt, and starts the System Tick Timer.
+ Counter is in free running mode to generate periodic interrupts.
+ \param [in] ticks Number of ticks between two interrupts.
+ \return 0 Function succeeded.
+ \return 1 Function failed.
+ \note When the variable __Vendor_SysTickConfig is set to 1, then the
+ function SysTick_Config is not included. In this case, the file device.h
+ must contain a vendor-specific implementation of this function.
+ */
+__STATIC_INLINE uint32_t SysTick_Config(uint32_t ticks)
+{
+ if ((ticks - 1UL) > SysTick_LOAD_RELOAD_Msk)
+ {
+ return (1UL); /* Reload value impossible */
+ }
+
+ SysTick->LOAD = (uint32_t)(ticks - 1UL); /* set reload register */
+ NVIC_SetPriority (SysTick_IRQn, (1UL << __NVIC_PRIO_BITS) - 1UL); /* set Priority for Systick Interrupt */
+ SysTick->VAL = 0UL; /* Load the SysTick Counter Value */
+ SysTick->CTRL = SysTick_CTRL_CLKSOURCE_Msk |
+ SysTick_CTRL_TICKINT_Msk |
+ SysTick_CTRL_ENABLE_Msk; /* Enable SysTick IRQ and SysTick Timer */
+ return (0UL); /* Function successful */
+}
+
+#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U)
+/**
+ \brief System Tick Configuration (non-secure)
+ \details Initializes the non-secure System Timer and its interrupt when in secure state, and starts the System Tick Timer.
+ Counter is in free running mode to generate periodic interrupts.
+ \param [in] ticks Number of ticks between two interrupts.
+ \return 0 Function succeeded.
+ \return 1 Function failed.
+ \note When the variable __Vendor_SysTickConfig is set to 1, then the
+ function TZ_SysTick_Config_NS is not included. In this case, the file device.h
+ must contain a vendor-specific implementation of this function.
+
+ */
+__STATIC_INLINE uint32_t TZ_SysTick_Config_NS(uint32_t ticks)
+{
+ if ((ticks - 1UL) > SysTick_LOAD_RELOAD_Msk)
+ {
+ return (1UL); /* Reload value impossible */
+ }
+
+ SysTick_NS->LOAD = (uint32_t)(ticks - 1UL); /* set reload register */
+ TZ_NVIC_SetPriority_NS (SysTick_IRQn, (1UL << __NVIC_PRIO_BITS) - 1UL); /* set Priority for Systick Interrupt */
+ SysTick_NS->VAL = 0UL; /* Load the SysTick Counter Value */
+ SysTick_NS->CTRL = SysTick_CTRL_CLKSOURCE_Msk |
+ SysTick_CTRL_TICKINT_Msk |
+ SysTick_CTRL_ENABLE_Msk; /* Enable SysTick IRQ and SysTick Timer */
+ return (0UL); /* Function successful */
+}
+#endif /* defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) */
+
+#endif
+
+/*@} end of CMSIS_Core_SysTickFunctions */
+
+
+
+/* ##################################### Debug In/Output function ########################################### */
+/**
+ \ingroup CMSIS_Core_FunctionInterface
+ \defgroup CMSIS_core_DebugFunctions ITM Functions
+ \brief Functions that access the ITM debug interface.
+ @{
+ */
+
+extern volatile int32_t ITM_RxBuffer; /*!< External variable to receive characters. */
+#define ITM_RXBUFFER_EMPTY ((int32_t)0x5AA55AA5U) /*!< Value identifying \ref ITM_RxBuffer is ready for next character. */
+
+
+/**
+ \brief ITM Send Character
+ \details Transmits a character via the ITM channel 0, and
+ \li Just returns when no debugger is connected that has booked the output.
+ \li Is blocking when a debugger is connected, but the previous character sent has not been transmitted.
+ \param [in] ch Character to transmit.
+ \returns Character to transmit.
+ */
+__STATIC_INLINE uint32_t ITM_SendChar (uint32_t ch)
+{
+ if (((ITM->TCR & ITM_TCR_ITMENA_Msk) != 0UL) && /* ITM enabled */
+ ((ITM->TER & 1UL ) != 0UL) ) /* ITM Port #0 enabled */
+ {
+ while (ITM->PORT[0U].u32 == 0UL)
+ {
+ __NOP();
+ }
+ ITM->PORT[0U].u8 = (uint8_t)ch;
+ }
+ return (ch);
+}
+
+
+/**
+ \brief ITM Receive Character
+ \details Inputs a character via the external variable \ref ITM_RxBuffer.
+ \return Received character.
+ \return -1 No character pending.
+ */
+__STATIC_INLINE int32_t ITM_ReceiveChar (void)
+{
+ int32_t ch = -1; /* no character available */
+
+ if (ITM_RxBuffer != ITM_RXBUFFER_EMPTY)
+ {
+ ch = ITM_RxBuffer;
+ ITM_RxBuffer = ITM_RXBUFFER_EMPTY; /* ready for next character */
+ }
+
+ return (ch);
+}
+
+
+/**
+ \brief ITM Check Character
+ \details Checks whether a character is pending for reading in the variable \ref ITM_RxBuffer.
+ \return 0 No character available.
+ \return 1 Character available.
+ */
+__STATIC_INLINE int32_t ITM_CheckChar (void)
+{
+
+ if (ITM_RxBuffer == ITM_RXBUFFER_EMPTY)
+ {
+ return (0); /* no character available */
+ }
+ else
+ {
+ return (1); /* character available */
+ }
+}
+
+/*@} end of CMSIS_core_DebugFunctions */
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CORE_CM35P_H_DEPENDANT */
+
+#endif /* __CMSIS_GENERIC */
diff --git a/Include/core_cm4.h b/Include/core_cm4.h
index 7d56873..12c023b 100644
--- a/Include/core_cm4.h
+++ b/Include/core_cm4.h
@@ -1,11 +1,11 @@
/**************************************************************************//**
* @file core_cm4.h
* @brief CMSIS Cortex-M4 Core Peripheral Access Layer Header File
- * @version V5.0.8
- * @date 04. June 2018
+ * @version V5.1.0
+ * @date 13. March 2019
******************************************************************************/
/*
- * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
+ * Copyright (c) 2009-2019 Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -86,7 +86,7 @@
#endif
#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
- #if defined __ARM_PCS_VFP
+ #if defined __ARM_FP
#if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
#define __FPU_USED 1U
#else
@@ -408,7 +408,7 @@ typedef struct
__IOM uint32_t ISER[8U]; /*!< Offset: 0x000 (R/W) Interrupt Set Enable Register */
uint32_t RESERVED0[24U];
__IOM uint32_t ICER[8U]; /*!< Offset: 0x080 (R/W) Interrupt Clear Enable Register */
- uint32_t RSERVED1[24U];
+ uint32_t RESERVED1[24U];
__IOM uint32_t ISPR[8U]; /*!< Offset: 0x100 (R/W) Interrupt Set Pending Register */
uint32_t RESERVED2[24U];
__IOM uint32_t ICPR[8U]; /*!< Offset: 0x180 (R/W) Interrupt Clear Pending Register */
@@ -822,10 +822,7 @@ typedef struct
__IOM uint32_t TPR; /*!< Offset: 0xE40 (R/W) ITM Trace Privilege Register */
uint32_t RESERVED2[15U];
__IOM uint32_t TCR; /*!< Offset: 0xE80 (R/W) ITM Trace Control Register */
- uint32_t RESERVED3[29U];
- __OM uint32_t IWR; /*!< Offset: 0xEF8 ( /W) ITM Integration Write Register */
- __IM uint32_t IRR; /*!< Offset: 0xEFC (R/ ) ITM Integration Read Register */
- __IOM uint32_t IMCR; /*!< Offset: 0xF00 (R/W) ITM Integration Mode Control Register */
+ uint32_t RESERVED3[32U];
uint32_t RESERVED4[43U];
__OM uint32_t LAR; /*!< Offset: 0xFB0 ( /W) ITM Lock Access Register */
__IM uint32_t LSR; /*!< Offset: 0xFB4 (R/ ) ITM Lock Status Register */
@@ -876,18 +873,6 @@ typedef struct
#define ITM_TCR_ITMENA_Pos 0U /*!< ITM TCR: ITM Enable bit Position */
#define ITM_TCR_ITMENA_Msk (1UL /*<< ITM_TCR_ITMENA_Pos*/) /*!< ITM TCR: ITM Enable bit Mask */
-/* ITM Integration Write Register Definitions */
-#define ITM_IWR_ATVALIDM_Pos 0U /*!< ITM IWR: ATVALIDM Position */
-#define ITM_IWR_ATVALIDM_Msk (1UL /*<< ITM_IWR_ATVALIDM_Pos*/) /*!< ITM IWR: ATVALIDM Mask */
-
-/* ITM Integration Read Register Definitions */
-#define ITM_IRR_ATREADYM_Pos 0U /*!< ITM IRR: ATREADYM Position */
-#define ITM_IRR_ATREADYM_Msk (1UL /*<< ITM_IRR_ATREADYM_Pos*/) /*!< ITM IRR: ATREADYM Mask */
-
-/* ITM Integration Mode Control Register Definitions */
-#define ITM_IMCR_INTEGRATION_Pos 0U /*!< ITM IMCR: INTEGRATION Position */
-#define ITM_IMCR_INTEGRATION_Msk (1UL /*<< ITM_IMCR_INTEGRATION_Pos*/) /*!< ITM IMCR: INTEGRATION Mask */
-
/* ITM Lock Status Register Definitions */
#define ITM_LSR_ByteAcc_Pos 2U /*!< ITM LSR: ByteAcc Position */
#define ITM_LSR_ByteAcc_Msk (1UL << ITM_LSR_ByteAcc_Pos) /*!< ITM LSR: ByteAcc Mask */
@@ -1120,13 +1105,13 @@ typedef struct
/* TPI Integration ETM Data Register Definitions (FIFO0) */
#define TPI_FIFO0_ITM_ATVALID_Pos 29U /*!< TPI FIFO0: ITM_ATVALID Position */
-#define TPI_FIFO0_ITM_ATVALID_Msk (0x3UL << TPI_FIFO0_ITM_ATVALID_Pos) /*!< TPI FIFO0: ITM_ATVALID Mask */
+#define TPI_FIFO0_ITM_ATVALID_Msk (0x1UL << TPI_FIFO0_ITM_ATVALID_Pos) /*!< TPI FIFO0: ITM_ATVALID Mask */
#define TPI_FIFO0_ITM_bytecount_Pos 27U /*!< TPI FIFO0: ITM_bytecount Position */
#define TPI_FIFO0_ITM_bytecount_Msk (0x3UL << TPI_FIFO0_ITM_bytecount_Pos) /*!< TPI FIFO0: ITM_bytecount Mask */
#define TPI_FIFO0_ETM_ATVALID_Pos 26U /*!< TPI FIFO0: ETM_ATVALID Position */
-#define TPI_FIFO0_ETM_ATVALID_Msk (0x3UL << TPI_FIFO0_ETM_ATVALID_Pos) /*!< TPI FIFO0: ETM_ATVALID Mask */
+#define TPI_FIFO0_ETM_ATVALID_Msk (0x1UL << TPI_FIFO0_ETM_ATVALID_Pos) /*!< TPI FIFO0: ETM_ATVALID Mask */
#define TPI_FIFO0_ETM_bytecount_Pos 24U /*!< TPI FIFO0: ETM_bytecount Position */
#define TPI_FIFO0_ETM_bytecount_Msk (0x3UL << TPI_FIFO0_ETM_bytecount_Pos) /*!< TPI FIFO0: ETM_bytecount Mask */
@@ -1149,13 +1134,13 @@ typedef struct
/* TPI Integration ITM Data Register Definitions (FIFO1) */
#define TPI_FIFO1_ITM_ATVALID_Pos 29U /*!< TPI FIFO1: ITM_ATVALID Position */
-#define TPI_FIFO1_ITM_ATVALID_Msk (0x3UL << TPI_FIFO1_ITM_ATVALID_Pos) /*!< TPI FIFO1: ITM_ATVALID Mask */
+#define TPI_FIFO1_ITM_ATVALID_Msk (0x1UL << TPI_FIFO1_ITM_ATVALID_Pos) /*!< TPI FIFO1: ITM_ATVALID Mask */
#define TPI_FIFO1_ITM_bytecount_Pos 27U /*!< TPI FIFO1: ITM_bytecount Position */
#define TPI_FIFO1_ITM_bytecount_Msk (0x3UL << TPI_FIFO1_ITM_bytecount_Pos) /*!< TPI FIFO1: ITM_bytecount Mask */
#define TPI_FIFO1_ETM_ATVALID_Pos 26U /*!< TPI FIFO1: ETM_ATVALID Position */
-#define TPI_FIFO1_ETM_ATVALID_Msk (0x3UL << TPI_FIFO1_ETM_ATVALID_Pos) /*!< TPI FIFO1: ETM_ATVALID Mask */
+#define TPI_FIFO1_ETM_ATVALID_Msk (0x1UL << TPI_FIFO1_ETM_ATVALID_Pos) /*!< TPI FIFO1: ETM_ATVALID Mask */
#define TPI_FIFO1_ETM_bytecount_Pos 24U /*!< TPI FIFO1: ETM_bytecount Position */
#define TPI_FIFO1_ETM_bytecount_Msk (0x3UL << TPI_FIFO1_ETM_bytecount_Pos) /*!< TPI FIFO1: ETM_bytecount Mask */
@@ -1324,6 +1309,7 @@ typedef struct
__IOM uint32_t FPDSCR; /*!< Offset: 0x00C (R/W) Floating-Point Default Status Control Register */
__IM uint32_t MVFR0; /*!< Offset: 0x010 (R/ ) Media and FP Feature Register 0 */
__IM uint32_t MVFR1; /*!< Offset: 0x014 (R/ ) Media and FP Feature Register 1 */
+ __IM uint32_t MVFR2; /*!< Offset: 0x018 (R/ ) Media and FP Feature Register 2 */
} FPU_Type;
/* Floating-Point Context Control Register Definitions */
@@ -1409,6 +1395,11 @@ typedef struct
#define FPU_MVFR1_FtZ_mode_Pos 0U /*!< MVFR1: FtZ mode bits Position */
#define FPU_MVFR1_FtZ_mode_Msk (0xFUL /*<< FPU_MVFR1_FtZ_mode_Pos*/) /*!< MVFR1: FtZ mode bits Mask */
+/* Media and FP Feature Register 2 Definitions */
+
+#define FPU_MVFR2_VFP_Misc_Pos 4U /*!< MVFR2: VFP Misc bits Position */
+#define FPU_MVFR2_VFP_Misc_Msk (0xFUL << FPU_MVFR2_VFP_Misc_Pos) /*!< MVFR2: VFP Misc bits Mask */
+
/*@} end of group CMSIS_FPU */
@@ -1625,7 +1616,7 @@ typedef struct
#ifdef CMSIS_VECTAB_VIRTUAL
#ifndef CMSIS_VECTAB_VIRTUAL_HEADER_FILE
- #define CMSIS_VECTAB_VIRTUAL_HEADER_FILE "cmsis_vectab_virtual.h"
+ #define CMSIS_VECTAB_VIRTUAL_HEADER_FILE "cmsis_vectab_virtual.h"
#endif
#include CMSIS_VECTAB_VIRTUAL_HEADER_FILE
#else
@@ -1689,7 +1680,9 @@ __STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn)
{
if ((int32_t)(IRQn) >= 0)
{
+ __COMPILER_BARRIER();
NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ __COMPILER_BARRIER();
}
}
@@ -1912,8 +1905,9 @@ __STATIC_INLINE void NVIC_DecodePriority (uint32_t Priority, uint32_t PriorityGr
*/
__STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
{
- uint32_t *vectors = (uint32_t *)SCB->VTOR;
- vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;
+ uint32_t vectors = (uint32_t )SCB->VTOR;
+ (* (int *) (vectors + ((int32_t)IRQn + NVIC_USER_IRQ_OFFSET) * 4)) = vector;
+ /* ARM Application Note 321 states that the M4 does not require the architectural barrier */
}
@@ -1927,8 +1921,8 @@ __STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
*/
__STATIC_INLINE uint32_t __NVIC_GetVector(IRQn_Type IRQn)
{
- uint32_t *vectors = (uint32_t *)SCB->VTOR;
- return vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET];
+ uint32_t vectors = (uint32_t )SCB->VTOR;
+ return (uint32_t)(* (int *) (vectors + ((int32_t)IRQn + NVIC_USER_IRQ_OFFSET) * 4));
}
@@ -1953,6 +1947,7 @@ __NO_RETURN __STATIC_INLINE void __NVIC_SystemReset(void)
/*@} end of CMSIS_Core_NVICFunctions */
+
/* ########################## MPU functions #################################### */
#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
diff --git a/Include/core_cm7.h b/Include/core_cm7.h
index a14dc62..c4515d8 100644
--- a/Include/core_cm7.h
+++ b/Include/core_cm7.h
@@ -1,11 +1,11 @@
/**************************************************************************//**
* @file core_cm7.h
* @brief CMSIS Cortex-M7 Core Peripheral Access Layer Header File
- * @version V5.0.8
- * @date 04. June 2018
+ * @version V5.1.1
+ * @date 28. March 2019
******************************************************************************/
/*
- * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
+ * Copyright (c) 2009-2019 Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -86,7 +86,7 @@
#endif
#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
- #if defined __ARM_PCS_VFP
+ #if defined __ARM_FP
#if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
#define __FPU_USED 1U
#else
@@ -423,7 +423,7 @@ typedef struct
__IOM uint32_t ISER[8U]; /*!< Offset: 0x000 (R/W) Interrupt Set Enable Register */
uint32_t RESERVED0[24U];
__IOM uint32_t ICER[8U]; /*!< Offset: 0x080 (R/W) Interrupt Clear Enable Register */
- uint32_t RSERVED1[24U];
+ uint32_t RESERVED1[24U];
__IOM uint32_t ISPR[8U]; /*!< Offset: 0x100 (R/W) Interrupt Set Pending Register */
uint32_t RESERVED2[24U];
__IOM uint32_t ICPR[8U]; /*!< Offset: 0x180 (R/W) Interrupt Clear Pending Register */
@@ -930,6 +930,24 @@ typedef struct
#define SCnSCB_ICTR_INTLINESNUM_Msk (0xFUL /*<< SCnSCB_ICTR_INTLINESNUM_Pos*/) /*!< ICTR: INTLINESNUM Mask */
/* Auxiliary Control Register Definitions */
+#define SCnSCB_ACTLR_DISDYNADD_Pos 26U /*!< ACTLR: DISDYNADD Position */
+#define SCnSCB_ACTLR_DISDYNADD_Msk (1UL << SCnSCB_ACTLR_DISDYNADD_Pos) /*!< ACTLR: DISDYNADD Mask */
+
+#define SCnSCB_ACTLR_DISISSCH1_Pos 21U /*!< ACTLR: DISISSCH1 Position */
+#define SCnSCB_ACTLR_DISISSCH1_Msk (0x1FUL << SCnSCB_ACTLR_DISISSCH1_Pos) /*!< ACTLR: DISISSCH1 Mask */
+
+#define SCnSCB_ACTLR_DISDI_Pos 16U /*!< ACTLR: DISDI Position */
+#define SCnSCB_ACTLR_DISDI_Msk (0x1FUL << SCnSCB_ACTLR_DISDI_Pos) /*!< ACTLR: DISDI Mask */
+
+#define SCnSCB_ACTLR_DISCRITAXIRUR_Pos 15U /*!< ACTLR: DISCRITAXIRUR Position */
+#define SCnSCB_ACTLR_DISCRITAXIRUR_Msk (1UL << SCnSCB_ACTLR_DISCRITAXIRUR_Pos) /*!< ACTLR: DISCRITAXIRUR Mask */
+
+#define SCnSCB_ACTLR_DISBTACALLOC_Pos 14U /*!< ACTLR: DISBTACALLOC Position */
+#define SCnSCB_ACTLR_DISBTACALLOC_Msk (1UL << SCnSCB_ACTLR_DISBTACALLOC_Pos) /*!< ACTLR: DISBTACALLOC Mask */
+
+#define SCnSCB_ACTLR_DISBTACREAD_Pos 13U /*!< ACTLR: DISBTACREAD Position */
+#define SCnSCB_ACTLR_DISBTACREAD_Msk (1UL << SCnSCB_ACTLR_DISBTACREAD_Pos) /*!< ACTLR: DISBTACREAD Mask */
+
#define SCnSCB_ACTLR_DISITMATBFLUSH_Pos 12U /*!< ACTLR: DISITMATBFLUSH Position */
#define SCnSCB_ACTLR_DISITMATBFLUSH_Msk (1UL << SCnSCB_ACTLR_DISITMATBFLUSH_Pos) /*!< ACTLR: DISITMATBFLUSH Mask */
@@ -1024,10 +1042,7 @@ typedef struct
__IOM uint32_t TPR; /*!< Offset: 0xE40 (R/W) ITM Trace Privilege Register */
uint32_t RESERVED2[15U];
__IOM uint32_t TCR; /*!< Offset: 0xE80 (R/W) ITM Trace Control Register */
- uint32_t RESERVED3[29U];
- __OM uint32_t IWR; /*!< Offset: 0xEF8 ( /W) ITM Integration Write Register */
- __IM uint32_t IRR; /*!< Offset: 0xEFC (R/ ) ITM Integration Read Register */
- __IOM uint32_t IMCR; /*!< Offset: 0xF00 (R/W) ITM Integration Mode Control Register */
+ uint32_t RESERVED3[32U];
uint32_t RESERVED4[43U];
__OM uint32_t LAR; /*!< Offset: 0xFB0 ( /W) ITM Lock Access Register */
__IM uint32_t LSR; /*!< Offset: 0xFB4 (R/ ) ITM Lock Status Register */
@@ -1078,18 +1093,6 @@ typedef struct
#define ITM_TCR_ITMENA_Pos 0U /*!< ITM TCR: ITM Enable bit Position */
#define ITM_TCR_ITMENA_Msk (1UL /*<< ITM_TCR_ITMENA_Pos*/) /*!< ITM TCR: ITM Enable bit Mask */
-/* ITM Integration Write Register Definitions */
-#define ITM_IWR_ATVALIDM_Pos 0U /*!< ITM IWR: ATVALIDM Position */
-#define ITM_IWR_ATVALIDM_Msk (1UL /*<< ITM_IWR_ATVALIDM_Pos*/) /*!< ITM IWR: ATVALIDM Mask */
-
-/* ITM Integration Read Register Definitions */
-#define ITM_IRR_ATREADYM_Pos 0U /*!< ITM IRR: ATREADYM Position */
-#define ITM_IRR_ATREADYM_Msk (1UL /*<< ITM_IRR_ATREADYM_Pos*/) /*!< ITM IRR: ATREADYM Mask */
-
-/* ITM Integration Mode Control Register Definitions */
-#define ITM_IMCR_INTEGRATION_Pos 0U /*!< ITM IMCR: INTEGRATION Position */
-#define ITM_IMCR_INTEGRATION_Msk (1UL /*<< ITM_IMCR_INTEGRATION_Pos*/) /*!< ITM IMCR: INTEGRATION Mask */
-
/* ITM Lock Status Register Definitions */
#define ITM_LSR_ByteAcc_Pos 2U /*!< ITM LSR: ByteAcc Position */
#define ITM_LSR_ByteAcc_Msk (1UL << ITM_LSR_ByteAcc_Pos) /*!< ITM LSR: ByteAcc Mask */
@@ -1325,13 +1328,13 @@ typedef struct
/* TPI Integration ETM Data Register Definitions (FIFO0) */
#define TPI_FIFO0_ITM_ATVALID_Pos 29U /*!< TPI FIFO0: ITM_ATVALID Position */
-#define TPI_FIFO0_ITM_ATVALID_Msk (0x3UL << TPI_FIFO0_ITM_ATVALID_Pos) /*!< TPI FIFO0: ITM_ATVALID Mask */
+#define TPI_FIFO0_ITM_ATVALID_Msk (0x1UL << TPI_FIFO0_ITM_ATVALID_Pos) /*!< TPI FIFO0: ITM_ATVALID Mask */
#define TPI_FIFO0_ITM_bytecount_Pos 27U /*!< TPI FIFO0: ITM_bytecount Position */
#define TPI_FIFO0_ITM_bytecount_Msk (0x3UL << TPI_FIFO0_ITM_bytecount_Pos) /*!< TPI FIFO0: ITM_bytecount Mask */
#define TPI_FIFO0_ETM_ATVALID_Pos 26U /*!< TPI FIFO0: ETM_ATVALID Position */
-#define TPI_FIFO0_ETM_ATVALID_Msk (0x3UL << TPI_FIFO0_ETM_ATVALID_Pos) /*!< TPI FIFO0: ETM_ATVALID Mask */
+#define TPI_FIFO0_ETM_ATVALID_Msk (0x1UL << TPI_FIFO0_ETM_ATVALID_Pos) /*!< TPI FIFO0: ETM_ATVALID Mask */
#define TPI_FIFO0_ETM_bytecount_Pos 24U /*!< TPI FIFO0: ETM_bytecount Position */
#define TPI_FIFO0_ETM_bytecount_Msk (0x3UL << TPI_FIFO0_ETM_bytecount_Pos) /*!< TPI FIFO0: ETM_bytecount Mask */
@@ -1354,13 +1357,13 @@ typedef struct
/* TPI Integration ITM Data Register Definitions (FIFO1) */
#define TPI_FIFO1_ITM_ATVALID_Pos 29U /*!< TPI FIFO1: ITM_ATVALID Position */
-#define TPI_FIFO1_ITM_ATVALID_Msk (0x3UL << TPI_FIFO1_ITM_ATVALID_Pos) /*!< TPI FIFO1: ITM_ATVALID Mask */
+#define TPI_FIFO1_ITM_ATVALID_Msk (0x1UL << TPI_FIFO1_ITM_ATVALID_Pos) /*!< TPI FIFO1: ITM_ATVALID Mask */
#define TPI_FIFO1_ITM_bytecount_Pos 27U /*!< TPI FIFO1: ITM_bytecount Position */
#define TPI_FIFO1_ITM_bytecount_Msk (0x3UL << TPI_FIFO1_ITM_bytecount_Pos) /*!< TPI FIFO1: ITM_bytecount Mask */
#define TPI_FIFO1_ETM_ATVALID_Pos 26U /*!< TPI FIFO1: ETM_ATVALID Position */
-#define TPI_FIFO1_ETM_ATVALID_Msk (0x3UL << TPI_FIFO1_ETM_ATVALID_Pos) /*!< TPI FIFO1: ETM_ATVALID Mask */
+#define TPI_FIFO1_ETM_ATVALID_Msk (0x1UL << TPI_FIFO1_ETM_ATVALID_Pos) /*!< TPI FIFO1: ETM_ATVALID Mask */
#define TPI_FIFO1_ETM_bytecount_Pos 24U /*!< TPI FIFO1: ETM_bytecount Position */
#define TPI_FIFO1_ETM_bytecount_Msk (0x3UL << TPI_FIFO1_ETM_bytecount_Pos) /*!< TPI FIFO1: ETM_bytecount Mask */
@@ -1617,6 +1620,9 @@ typedef struct
/* Media and FP Feature Register 2 Definitions */
+#define FPU_MVFR2_VFP_Misc_Pos 4U /*!< MVFR2: VFP Misc bits Position */
+#define FPU_MVFR2_VFP_Misc_Msk (0xFUL << FPU_MVFR2_VFP_Misc_Pos) /*!< MVFR2: VFP Misc bits Mask */
+
/*@} end of group CMSIS_FPU */
@@ -1897,7 +1903,9 @@ __STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn)
{
if ((int32_t)(IRQn) >= 0)
{
+ __COMPILER_BARRIER();
NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ __COMPILER_BARRIER();
}
}
@@ -2120,8 +2128,9 @@ __STATIC_INLINE void NVIC_DecodePriority (uint32_t Priority, uint32_t PriorityGr
*/
__STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
{
- uint32_t *vectors = (uint32_t *)SCB->VTOR;
- vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;
+ uint32_t vectors = (uint32_t )SCB->VTOR;
+ (* (int *) (vectors + ((int32_t)IRQn + NVIC_USER_IRQ_OFFSET) * 4)) = vector;
+ __DSB();
}
@@ -2135,8 +2144,8 @@ __STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
*/
__STATIC_INLINE uint32_t __NVIC_GetVector(IRQn_Type IRQn)
{
- uint32_t *vectors = (uint32_t *)SCB->VTOR;
- return vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET];
+ uint32_t vectors = (uint32_t )SCB->VTOR;
+ return (uint32_t)(* (int *) (vectors + ((int32_t)IRQn + NVIC_USER_IRQ_OFFSET) * 4));
}
@@ -2161,6 +2170,7 @@ __NO_RETURN __STATIC_INLINE void __NVIC_SystemReset(void)
/*@} end of CMSIS_Core_NVICFunctions */
+
/* ########################## MPU functions #################################### */
#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
@@ -2169,6 +2179,7 @@ __NO_RETURN __STATIC_INLINE void __NVIC_SystemReset(void)
#endif
+
/* ########################## FPU functions #################################### */
/**
\ingroup CMSIS_Core_FunctionInterface
@@ -2204,7 +2215,6 @@ __STATIC_INLINE uint32_t SCB_GetFPUType(void)
}
}
-
/*@} end of CMSIS_Core_FpuFunctions */
@@ -2221,14 +2231,18 @@ __STATIC_INLINE uint32_t SCB_GetFPUType(void)
#define CCSIDR_WAYS(x) (((x) & SCB_CCSIDR_ASSOCIATIVITY_Msk) >> SCB_CCSIDR_ASSOCIATIVITY_Pos)
#define CCSIDR_SETS(x) (((x) & SCB_CCSIDR_NUMSETS_Msk ) >> SCB_CCSIDR_NUMSETS_Pos )
+#define __SCB_DCACHE_LINE_SIZE 32U /*!< Cortex-M7 cache line size is fixed to 32 bytes (8 words). See also register SCB_CCSIDR */
+#define __SCB_ICACHE_LINE_SIZE 32U /*!< Cortex-M7 cache line size is fixed to 32 bytes (8 words). See also register SCB_CCSIDR */
/**
\brief Enable I-Cache
\details Turns on I-Cache
*/
-__STATIC_INLINE void SCB_EnableICache (void)
+__STATIC_FORCEINLINE void SCB_EnableICache (void)
{
#if defined (__ICACHE_PRESENT) && (__ICACHE_PRESENT == 1U)
+ if (SCB->CCR & SCB_CCR_IC_Msk) return; /* return if ICache is already enabled */
+
__DSB();
__ISB();
SCB->ICIALLU = 0UL; /* invalidate I-Cache */
@@ -2245,7 +2259,7 @@ __STATIC_INLINE void SCB_EnableICache (void)
\brief Disable I-Cache
\details Turns off I-Cache
*/
-__STATIC_INLINE void SCB_DisableICache (void)
+__STATIC_FORCEINLINE void SCB_DisableICache (void)
{
#if defined (__ICACHE_PRESENT) && (__ICACHE_PRESENT == 1U)
__DSB();
@@ -2262,7 +2276,7 @@ __STATIC_INLINE void SCB_DisableICache (void)
\brief Invalidate I-Cache
\details Invalidates I-Cache
*/
-__STATIC_INLINE void SCB_InvalidateICache (void)
+__STATIC_FORCEINLINE void SCB_InvalidateICache (void)
{
#if defined (__ICACHE_PRESENT) && (__ICACHE_PRESENT == 1U)
__DSB();
@@ -2274,18 +2288,50 @@ __STATIC_INLINE void SCB_InvalidateICache (void)
}
+/**
+ \brief I-Cache Invalidate by address
+ \details Invalidates I-Cache for the given address.
+ I-Cache is invalidated starting from a 32 byte aligned address in 32 byte granularity.
+ I-Cache memory blocks which are part of given address + given size are invalidated.
+ \param[in] addr address
+ \param[in] isize size of memory block (in number of bytes)
+*/
+__STATIC_FORCEINLINE void SCB_InvalidateICache_by_Addr (void *addr, int32_t isize)
+{
+ #if defined (__ICACHE_PRESENT) && (__ICACHE_PRESENT == 1U)
+ if ( isize > 0 ) {
+ int32_t op_size = isize + (((uint32_t)addr) & (__SCB_ICACHE_LINE_SIZE - 1U));
+ uint32_t op_addr = (uint32_t)addr /* & ~(__SCB_ICACHE_LINE_SIZE - 1U) */;
+
+ __DSB();
+
+ do {
+ SCB->ICIMVAU = op_addr; /* register accepts only 32byte aligned values, only bits 31..5 are valid */
+ op_addr += __SCB_ICACHE_LINE_SIZE;
+ op_size -= __SCB_ICACHE_LINE_SIZE;
+ } while ( op_size > 0 );
+
+ __DSB();
+ __ISB();
+ }
+ #endif
+}
+
+
/**
\brief Enable D-Cache
\details Turns on D-Cache
*/
-__STATIC_INLINE void SCB_EnableDCache (void)
+__STATIC_FORCEINLINE void SCB_EnableDCache (void)
{
#if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
uint32_t ccsidr;
uint32_t sets;
uint32_t ways;
- SCB->CSSELR = 0U; /*(0U << 1U) | 0U;*/ /* Level 1 data cache */
+ if (SCB->CCR & SCB_CCR_DC_Msk) return; /* return if DCache is already enabled */
+
+ SCB->CSSELR = 0U; /* select Level 1 data cache */
__DSB();
ccsidr = SCB->CCSIDR;
@@ -2316,14 +2362,14 @@ __STATIC_INLINE void SCB_EnableDCache (void)
\brief Disable D-Cache
\details Turns off D-Cache
*/
-__STATIC_INLINE void SCB_DisableDCache (void)
+__STATIC_FORCEINLINE void SCB_DisableDCache (void)
{
#if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
uint32_t ccsidr;
uint32_t sets;
uint32_t ways;
- SCB->CSSELR = 0U; /*(0U << 1U) | 0U;*/ /* Level 1 data cache */
+ SCB->CSSELR = 0U; /* select Level 1 data cache */
__DSB();
SCB->CCR &= ~(uint32_t)SCB_CCR_DC_Msk; /* disable D-Cache */
@@ -2354,14 +2400,14 @@ __STATIC_INLINE void SCB_DisableDCache (void)
\brief Invalidate D-Cache
\details Invalidates D-Cache
*/
-__STATIC_INLINE void SCB_InvalidateDCache (void)
+__STATIC_FORCEINLINE void SCB_InvalidateDCache (void)
{
#if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
uint32_t ccsidr;
uint32_t sets;
uint32_t ways;
- SCB->CSSELR = 0U; /*(0U << 1U) | 0U;*/ /* Level 1 data cache */
+ SCB->CSSELR = 0U; /* select Level 1 data cache */
__DSB();
ccsidr = SCB->CCSIDR;
@@ -2389,15 +2435,15 @@ __STATIC_INLINE void SCB_InvalidateDCache (void)
\brief Clean D-Cache
\details Cleans D-Cache
*/
-__STATIC_INLINE void SCB_CleanDCache (void)
+__STATIC_FORCEINLINE void SCB_CleanDCache (void)
{
#if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
uint32_t ccsidr;
uint32_t sets;
uint32_t ways;
- SCB->CSSELR = 0U; /*(0U << 1U) | 0U;*/ /* Level 1 data cache */
- __DSB();
+ SCB->CSSELR = 0U; /* select Level 1 data cache */
+ __DSB();
ccsidr = SCB->CCSIDR;
@@ -2424,14 +2470,14 @@ __STATIC_INLINE void SCB_CleanDCache (void)
\brief Clean & Invalidate D-Cache
\details Cleans and Invalidates D-Cache
*/
-__STATIC_INLINE void SCB_CleanInvalidateDCache (void)
+__STATIC_FORCEINLINE void SCB_CleanInvalidateDCache (void)
{
#if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
uint32_t ccsidr;
uint32_t sets;
uint32_t ways;
- SCB->CSSELR = 0U; /*(0U << 1U) | 0U;*/ /* Level 1 data cache */
+ SCB->CSSELR = 0U; /* select Level 1 data cache */
__DSB();
ccsidr = SCB->CCSIDR;
@@ -2457,27 +2503,30 @@ __STATIC_INLINE void SCB_CleanInvalidateDCache (void)
/**
\brief D-Cache Invalidate by address
- \details Invalidates D-Cache for the given address
- \param[in] addr address (aligned to 32-byte boundary)
+ \details Invalidates D-Cache for the given address.
+ D-Cache is invalidated starting from a 32 byte aligned address in 32 byte granularity.
+ D-Cache memory blocks which are part of given address + given size are invalidated.
+ \param[in] addr address
\param[in] dsize size of memory block (in number of bytes)
*/
-__STATIC_INLINE void SCB_InvalidateDCache_by_Addr (uint32_t *addr, int32_t dsize)
+__STATIC_FORCEINLINE void SCB_InvalidateDCache_by_Addr (void *addr, int32_t dsize)
{
#if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
- int32_t op_size = dsize;
- uint32_t op_addr = (uint32_t)addr;
- int32_t linesize = 32; /* in Cortex-M7 size of cache line is fixed to 8 words (32 bytes) */
+ if ( dsize > 0 ) {
+ int32_t op_size = dsize + (((uint32_t)addr) & (__SCB_DCACHE_LINE_SIZE - 1U));
+ uint32_t op_addr = (uint32_t)addr /* & ~(__SCB_DCACHE_LINE_SIZE - 1U) */;
+
+ __DSB();
- __DSB();
+ do {
+ SCB->DCIMVAC = op_addr; /* register accepts only 32byte aligned values, only bits 31..5 are valid */
+ op_addr += __SCB_DCACHE_LINE_SIZE;
+ op_size -= __SCB_DCACHE_LINE_SIZE;
+ } while ( op_size > 0 );
- while (op_size > 0) {
- SCB->DCIMVAC = op_addr;
- op_addr += (uint32_t)linesize;
- op_size -= linesize;
+ __DSB();
+ __ISB();
}
-
- __DSB();
- __ISB();
#endif
}
@@ -2485,26 +2534,29 @@ __STATIC_INLINE void SCB_InvalidateDCache_by_Addr (uint32_t *addr, int32_t dsize
/**
\brief D-Cache Clean by address
\details Cleans D-Cache for the given address
- \param[in] addr address (aligned to 32-byte boundary)
+ D-Cache is cleaned starting from a 32 byte aligned address in 32 byte granularity.
+ D-Cache memory blocks which are part of given address + given size are cleaned.
+ \param[in] addr address
\param[in] dsize size of memory block (in number of bytes)
*/
-__STATIC_INLINE void SCB_CleanDCache_by_Addr (uint32_t *addr, int32_t dsize)
+__STATIC_FORCEINLINE void SCB_CleanDCache_by_Addr (uint32_t *addr, int32_t dsize)
{
#if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
- int32_t op_size = dsize;
- uint32_t op_addr = (uint32_t) addr;
- int32_t linesize = 32; /* in Cortex-M7 size of cache line is fixed to 8 words (32 bytes) */
+ if ( dsize > 0 ) {
+ int32_t op_size = dsize + (((uint32_t)addr) & (__SCB_DCACHE_LINE_SIZE - 1U));
+ uint32_t op_addr = (uint32_t)addr /* & ~(__SCB_DCACHE_LINE_SIZE - 1U) */;
+
+ __DSB();
- __DSB();
+ do {
+ SCB->DCCMVAC = op_addr; /* register accepts only 32byte aligned values, only bits 31..5 are valid */
+ op_addr += __SCB_DCACHE_LINE_SIZE;
+ op_size -= __SCB_DCACHE_LINE_SIZE;
+ } while ( op_size > 0 );
- while (op_size > 0) {
- SCB->DCCMVAC = op_addr;
- op_addr += (uint32_t)linesize;
- op_size -= linesize;
+ __DSB();
+ __ISB();
}
-
- __DSB();
- __ISB();
#endif
}
@@ -2512,30 +2564,32 @@ __STATIC_INLINE void SCB_CleanDCache_by_Addr (uint32_t *addr, int32_t dsize)
/**
\brief D-Cache Clean and Invalidate by address
\details Cleans and invalidates D_Cache for the given address
+ D-Cache is cleaned and invalidated starting from a 32 byte aligned address in 32 byte granularity.
+ D-Cache memory blocks which are part of given address + given size are cleaned and invalidated.
\param[in] addr address (aligned to 32-byte boundary)
\param[in] dsize size of memory block (in number of bytes)
*/
-__STATIC_INLINE void SCB_CleanInvalidateDCache_by_Addr (uint32_t *addr, int32_t dsize)
+__STATIC_FORCEINLINE void SCB_CleanInvalidateDCache_by_Addr (uint32_t *addr, int32_t dsize)
{
#if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
- int32_t op_size = dsize;
- uint32_t op_addr = (uint32_t) addr;
- int32_t linesize = 32; /* in Cortex-M7 size of cache line is fixed to 8 words (32 bytes) */
+ if ( dsize > 0 ) {
+ int32_t op_size = dsize + (((uint32_t)addr) & (__SCB_DCACHE_LINE_SIZE - 1U));
+ uint32_t op_addr = (uint32_t)addr /* & ~(__SCB_DCACHE_LINE_SIZE - 1U) */;
+
+ __DSB();
- __DSB();
+ do {
+ SCB->DCCIMVAC = op_addr; /* register accepts only 32byte aligned values, only bits 31..5 are valid */
+ op_addr += __SCB_DCACHE_LINE_SIZE;
+ op_size -= __SCB_DCACHE_LINE_SIZE;
+ } while ( op_size > 0 );
- while (op_size > 0) {
- SCB->DCCIMVAC = op_addr;
- op_addr += (uint32_t)linesize;
- op_size -= linesize;
+ __DSB();
+ __ISB();
}
-
- __DSB();
- __ISB();
#endif
}
-
/*@} end of CMSIS_Core_CacheFunctions */
diff --git a/Include/core_sc000.h b/Include/core_sc000.h
index 9b67c92..cf92577 100644
--- a/Include/core_sc000.h
+++ b/Include/core_sc000.h
@@ -1,8 +1,8 @@
/**************************************************************************//**
* @file core_sc000.h
* @brief CMSIS SC000 Core Peripheral Access Layer Header File
- * @version V5.0.5
- * @date 28. May 2018
+ * @version V5.0.6
+ * @date 12. November 2018
******************************************************************************/
/*
* Copyright (c) 2009-2018 Arm Limited. All rights reserved.
@@ -81,7 +81,7 @@
#endif
#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
- #if defined __ARM_PCS_VFP
+ #if defined __ARM_FP
#error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
#endif
@@ -750,7 +750,9 @@ __STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn)
{
if ((int32_t)(IRQn) >= 0)
{
+ __COMPILER_BARRIER();
NVIC->ISER[0U] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ __COMPILER_BARRIER();
}
}
@@ -904,6 +906,7 @@ __STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
{
uint32_t *vectors = (uint32_t *)SCB->VTOR;
vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;
+ /* ARM Application Note 321 states that the M0 and M0+ do not require the architectural barrier - assume SC000 is the same */
}
diff --git a/Include/core_sc300.h b/Include/core_sc300.h
index 3e8a471..40f3af8 100644
--- a/Include/core_sc300.h
+++ b/Include/core_sc300.h
@@ -1,11 +1,11 @@
/**************************************************************************//**
* @file core_sc300.h
* @brief CMSIS SC300 Core Peripheral Access Layer Header File
- * @version V5.0.6
- * @date 04. June 2018
+ * @version V5.0.8
+ * @date 31. May 2019
******************************************************************************/
/*
- * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
+ * Copyright (c) 2009-2019 Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -81,7 +81,7 @@
#endif
#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
- #if defined __ARM_PCS_VFP
+ #if defined __ARM_FP
#error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
#endif
@@ -342,7 +342,7 @@ typedef struct
__IOM uint32_t ISER[8U]; /*!< Offset: 0x000 (R/W) Interrupt Set Enable Register */
uint32_t RESERVED0[24U];
__IOM uint32_t ICER[8U]; /*!< Offset: 0x080 (R/W) Interrupt Clear Enable Register */
- uint32_t RSERVED1[24U];
+ uint32_t RESERVED1[24U];
__IOM uint32_t ISPR[8U]; /*!< Offset: 0x100 (R/W) Interrupt Set Pending Register */
uint32_t RESERVED2[24U];
__IOM uint32_t ICPR[8U]; /*!< Offset: 0x180 (R/W) Interrupt Clear Pending Register */
@@ -653,13 +653,23 @@ typedef struct
{
uint32_t RESERVED0[1U];
__IM uint32_t ICTR; /*!< Offset: 0x004 (R/ ) Interrupt Controller Type Register */
- uint32_t RESERVED1[1U];
+ __IOM uint32_t ACTLR; /*!< Offset: 0x008 (R/W) Auxiliary Control Register */
} SCnSCB_Type;
/* Interrupt Controller Type Register Definitions */
#define SCnSCB_ICTR_INTLINESNUM_Pos 0U /*!< ICTR: INTLINESNUM Position */
#define SCnSCB_ICTR_INTLINESNUM_Msk (0xFUL /*<< SCnSCB_ICTR_INTLINESNUM_Pos*/) /*!< ICTR: INTLINESNUM Mask */
+/* Auxiliary Control Register Definitions */
+#define SCnSCB_ACTLR_DISFOLD_Pos 2U /*!< ACTLR: DISFOLD Position */
+#define SCnSCB_ACTLR_DISFOLD_Msk (1UL << SCnSCB_ACTLR_DISFOLD_Pos) /*!< ACTLR: DISFOLD Mask */
+
+#define SCnSCB_ACTLR_DISDEFWBUF_Pos 1U /*!< ACTLR: DISDEFWBUF Position */
+#define SCnSCB_ACTLR_DISDEFWBUF_Msk (1UL << SCnSCB_ACTLR_DISDEFWBUF_Pos) /*!< ACTLR: DISDEFWBUF Mask */
+
+#define SCnSCB_ACTLR_DISMCYCINT_Pos 0U /*!< ACTLR: DISMCYCINT Position */
+#define SCnSCB_ACTLR_DISMCYCINT_Msk (1UL /*<< SCnSCB_ACTLR_DISMCYCINT_Pos*/) /*!< ACTLR: DISMCYCINT Mask */
+
/*@} end of group CMSIS_SCnotSCB */
@@ -739,10 +749,7 @@ typedef struct
__IOM uint32_t TPR; /*!< Offset: 0xE40 (R/W) ITM Trace Privilege Register */
uint32_t RESERVED2[15U];
__IOM uint32_t TCR; /*!< Offset: 0xE80 (R/W) ITM Trace Control Register */
- uint32_t RESERVED3[29U];
- __OM uint32_t IWR; /*!< Offset: 0xEF8 ( /W) ITM Integration Write Register */
- __IM uint32_t IRR; /*!< Offset: 0xEFC (R/ ) ITM Integration Read Register */
- __IOM uint32_t IMCR; /*!< Offset: 0xF00 (R/W) ITM Integration Mode Control Register */
+ uint32_t RESERVED3[32U];
uint32_t RESERVED4[43U];
__OM uint32_t LAR; /*!< Offset: 0xFB0 ( /W) ITM Lock Access Register */
__IM uint32_t LSR; /*!< Offset: 0xFB4 (R/ ) ITM Lock Status Register */
@@ -793,18 +800,6 @@ typedef struct
#define ITM_TCR_ITMENA_Pos 0U /*!< ITM TCR: ITM Enable bit Position */
#define ITM_TCR_ITMENA_Msk (1UL /*<< ITM_TCR_ITMENA_Pos*/) /*!< ITM TCR: ITM Enable bit Mask */
-/* ITM Integration Write Register Definitions */
-#define ITM_IWR_ATVALIDM_Pos 0U /*!< ITM IWR: ATVALIDM Position */
-#define ITM_IWR_ATVALIDM_Msk (1UL /*<< ITM_IWR_ATVALIDM_Pos*/) /*!< ITM IWR: ATVALIDM Mask */
-
-/* ITM Integration Read Register Definitions */
-#define ITM_IRR_ATREADYM_Pos 0U /*!< ITM IRR: ATREADYM Position */
-#define ITM_IRR_ATREADYM_Msk (1UL /*<< ITM_IRR_ATREADYM_Pos*/) /*!< ITM IRR: ATREADYM Mask */
-
-/* ITM Integration Mode Control Register Definitions */
-#define ITM_IMCR_INTEGRATION_Pos 0U /*!< ITM IMCR: INTEGRATION Position */
-#define ITM_IMCR_INTEGRATION_Msk (1UL /*<< ITM_IMCR_INTEGRATION_Pos*/) /*!< ITM IMCR: INTEGRATION Mask */
-
/* ITM Lock Status Register Definitions */
#define ITM_LSR_ByteAcc_Pos 2U /*!< ITM LSR: ByteAcc Position */
#define ITM_LSR_ByteAcc_Msk (1UL << ITM_LSR_ByteAcc_Pos) /*!< ITM LSR: ByteAcc Mask */
@@ -1037,13 +1032,13 @@ typedef struct
/* TPI Integration ETM Data Register Definitions (FIFO0) */
#define TPI_FIFO0_ITM_ATVALID_Pos 29U /*!< TPI FIFO0: ITM_ATVALID Position */
-#define TPI_FIFO0_ITM_ATVALID_Msk (0x3UL << TPI_FIFO0_ITM_ATVALID_Pos) /*!< TPI FIFO0: ITM_ATVALID Mask */
+#define TPI_FIFO0_ITM_ATVALID_Msk (0x1UL << TPI_FIFO0_ITM_ATVALID_Pos) /*!< TPI FIFO0: ITM_ATVALID Mask */
#define TPI_FIFO0_ITM_bytecount_Pos 27U /*!< TPI FIFO0: ITM_bytecount Position */
#define TPI_FIFO0_ITM_bytecount_Msk (0x3UL << TPI_FIFO0_ITM_bytecount_Pos) /*!< TPI FIFO0: ITM_bytecount Mask */
#define TPI_FIFO0_ETM_ATVALID_Pos 26U /*!< TPI FIFO0: ETM_ATVALID Position */
-#define TPI_FIFO0_ETM_ATVALID_Msk (0x3UL << TPI_FIFO0_ETM_ATVALID_Pos) /*!< TPI FIFO0: ETM_ATVALID Mask */
+#define TPI_FIFO0_ETM_ATVALID_Msk (0x1UL << TPI_FIFO0_ETM_ATVALID_Pos) /*!< TPI FIFO0: ETM_ATVALID Mask */
#define TPI_FIFO0_ETM_bytecount_Pos 24U /*!< TPI FIFO0: ETM_bytecount Position */
#define TPI_FIFO0_ETM_bytecount_Msk (0x3UL << TPI_FIFO0_ETM_bytecount_Pos) /*!< TPI FIFO0: ETM_bytecount Mask */
@@ -1066,13 +1061,13 @@ typedef struct
/* TPI Integration ITM Data Register Definitions (FIFO1) */
#define TPI_FIFO1_ITM_ATVALID_Pos 29U /*!< TPI FIFO1: ITM_ATVALID Position */
-#define TPI_FIFO1_ITM_ATVALID_Msk (0x3UL << TPI_FIFO1_ITM_ATVALID_Pos) /*!< TPI FIFO1: ITM_ATVALID Mask */
+#define TPI_FIFO1_ITM_ATVALID_Msk (0x1UL << TPI_FIFO1_ITM_ATVALID_Pos) /*!< TPI FIFO1: ITM_ATVALID Mask */
#define TPI_FIFO1_ITM_bytecount_Pos 27U /*!< TPI FIFO1: ITM_bytecount Position */
#define TPI_FIFO1_ITM_bytecount_Msk (0x3UL << TPI_FIFO1_ITM_bytecount_Pos) /*!< TPI FIFO1: ITM_bytecount Mask */
#define TPI_FIFO1_ETM_ATVALID_Pos 26U /*!< TPI FIFO1: ETM_ATVALID Position */
-#define TPI_FIFO1_ETM_ATVALID_Msk (0x3UL << TPI_FIFO1_ETM_ATVALID_Pos) /*!< TPI FIFO1: ETM_ATVALID Mask */
+#define TPI_FIFO1_ETM_ATVALID_Msk (0x1UL << TPI_FIFO1_ETM_ATVALID_Pos) /*!< TPI FIFO1: ETM_ATVALID Mask */
#define TPI_FIFO1_ETM_bytecount_Pos 24U /*!< TPI FIFO1: ETM_bytecount Position */
#define TPI_FIFO1_ETM_bytecount_Msk (0x3UL << TPI_FIFO1_ETM_bytecount_Pos) /*!< TPI FIFO1: ETM_bytecount Mask */
@@ -1448,7 +1443,6 @@ typedef struct
#define EXC_RETURN_THREAD_PSP (0xFFFFFFFDUL) /* return to Thread mode, uses PSP after return */
-
/**
\brief Set Priority Grouping
\details Sets the priority grouping field using the required unlock sequence.
@@ -1467,7 +1461,7 @@ __STATIC_INLINE void __NVIC_SetPriorityGrouping(uint32_t PriorityGroup)
reg_value &= ~((uint32_t)(SCB_AIRCR_VECTKEY_Msk | SCB_AIRCR_PRIGROUP_Msk)); /* clear bits to change */
reg_value = (reg_value |
((uint32_t)0x5FAUL << SCB_AIRCR_VECTKEY_Pos) |
- (PriorityGroupTmp << 8U) ); /* Insert write key and priorty group */
+ (PriorityGroupTmp << SCB_AIRCR_PRIGROUP_Pos) ); /* Insert write key and priority group */
SCB->AIRCR = reg_value;
}
@@ -1493,7 +1487,9 @@ __STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn)
{
if ((int32_t)(IRQn) >= 0)
{
+ __COMPILER_BARRIER();
NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL));
+ __COMPILER_BARRIER();
}
}
@@ -1716,8 +1712,9 @@ __STATIC_INLINE void NVIC_DecodePriority (uint32_t Priority, uint32_t PriorityGr
*/
__STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
{
- uint32_t *vectors = (uint32_t *)SCB->VTOR;
- vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;
+ uint32_t vectors = (uint32_t )SCB->VTOR;
+ (* (int *) (vectors + ((int32_t)IRQn + NVIC_USER_IRQ_OFFSET) * 4)) = vector;
+ /* ARM Application Note 321 states that the M3 does not require the architectural barrier */
}
@@ -1731,8 +1728,8 @@ __STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
*/
__STATIC_INLINE uint32_t __NVIC_GetVector(IRQn_Type IRQn)
{
- uint32_t *vectors = (uint32_t *)SCB->VTOR;
- return vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET];
+ uint32_t vectors = (uint32_t )SCB->VTOR;
+ return (uint32_t)(* (int *) (vectors + ((int32_t)IRQn + NVIC_USER_IRQ_OFFSET) * 4));
}
diff --git a/Include/mpu_armv7.h b/Include/mpu_armv7.h
index 0142203..66ef59b 100644
--- a/Include/mpu_armv7.h
+++ b/Include/mpu_armv7.h
@@ -1,11 +1,11 @@
/******************************************************************************
* @file mpu_armv7.h
* @brief CMSIS MPU API for Armv7-M MPU
- * @version V5.0.4
- * @date 10. January 2018
+ * @version V5.1.0
+ * @date 08. March 2019
******************************************************************************/
/*
- * Copyright (c) 2017-2018 Arm Limited. All rights reserved.
+ * Copyright (c) 2017-2019 Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -86,10 +86,10 @@
* \param IsBufferable Region is bufferable, i.e. using write-back caching. Cacheable but non-bufferable regions use write-through policy.
*/
#define ARM_MPU_ACCESS_(TypeExtField, IsShareable, IsCacheable, IsBufferable) \
- ((((TypeExtField ) << MPU_RASR_TEX_Pos) & MPU_RASR_TEX_Msk) | \
- (((IsShareable ) << MPU_RASR_S_Pos) & MPU_RASR_S_Msk) | \
- (((IsCacheable ) << MPU_RASR_C_Pos) & MPU_RASR_C_Msk) | \
- (((IsBufferable ) << MPU_RASR_B_Pos) & MPU_RASR_B_Msk))
+ ((((TypeExtField) << MPU_RASR_TEX_Pos) & MPU_RASR_TEX_Msk) | \
+ (((IsShareable) << MPU_RASR_S_Pos) & MPU_RASR_S_Msk) | \
+ (((IsCacheable) << MPU_RASR_C_Pos) & MPU_RASR_C_Msk) | \
+ (((IsBufferable) << MPU_RASR_B_Pos) & MPU_RASR_B_Msk))
/**
* MPU Region Attribute and Size Register Value
@@ -100,11 +100,14 @@
* \param SubRegionDisable Sub-region disable field.
* \param Size Region size of the region to be configured, for example 4K, 8K.
*/
-#define ARM_MPU_RASR_EX(DisableExec, AccessPermission, AccessAttributes, SubRegionDisable, Size) \
- ((((DisableExec ) << MPU_RASR_XN_Pos) & MPU_RASR_XN_Msk) | \
- (((AccessPermission) << MPU_RASR_AP_Pos) & MPU_RASR_AP_Msk) | \
- (((AccessAttributes) ) & (MPU_RASR_TEX_Msk | MPU_RASR_S_Msk | MPU_RASR_C_Msk | MPU_RASR_B_Msk)))
-
+#define ARM_MPU_RASR_EX(DisableExec, AccessPermission, AccessAttributes, SubRegionDisable, Size) \
+ ((((DisableExec) << MPU_RASR_XN_Pos) & MPU_RASR_XN_Msk) | \
+ (((AccessPermission) << MPU_RASR_AP_Pos) & MPU_RASR_AP_Msk) | \
+ (((AccessAttributes) & (MPU_RASR_TEX_Msk | MPU_RASR_S_Msk | MPU_RASR_C_Msk | MPU_RASR_B_Msk))) | \
+ (((SubRegionDisable) << MPU_RASR_SRD_Pos) & MPU_RASR_SRD_Msk) | \
+ (((Size) << MPU_RASR_SIZE_Pos) & MPU_RASR_SIZE_Msk) | \
+ (((MPU_RASR_ENABLE_Msk))))
+
/**
* MPU Region Attribute and Size Register Value
*
@@ -131,7 +134,7 @@
/**
* MPU Memory Access Attribute for device memory.
-* - TEX: 000b (if non-shareable) or 010b (if shareable)
+* - TEX: 000b (if shareable) or 010b (if non-shareable)
* - Shareable or non-shareable
* - Non-cacheable
* - Bufferable (if shareable) or non-bufferable (if non-shareable)
@@ -187,20 +190,19 @@ typedef struct {
*/
__STATIC_INLINE void ARM_MPU_Enable(uint32_t MPU_Control)
{
- __DSB();
- __ISB();
MPU->CTRL = MPU_Control | MPU_CTRL_ENABLE_Msk;
#ifdef SCB_SHCSR_MEMFAULTENA_Msk
SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk;
#endif
+ __DSB();
+ __ISB();
}
/** Disable the MPU.
*/
__STATIC_INLINE void ARM_MPU_Disable(void)
{
- __DSB();
- __ISB();
+ __DMB();
#ifdef SCB_SHCSR_MEMFAULTENA_Msk
SCB->SHCSR &= ~SCB_SHCSR_MEMFAULTENA_Msk;
#endif
@@ -243,7 +245,7 @@ __STATIC_INLINE void ARM_MPU_SetRegionEx(uint32_t rnr, uint32_t rbar, uint32_t r
* \param src Source data is copied from.
* \param len Amount of data words to be copied.
*/
-__STATIC_INLINE void orderedCpy(volatile uint32_t* dst, const uint32_t* __RESTRICT src, uint32_t len)
+__STATIC_INLINE void ARM_MPU_OrderedMemcpy(volatile uint32_t* dst, const uint32_t* __RESTRICT src, uint32_t len)
{
uint32_t i;
for (i = 0U; i < len; ++i)
@@ -260,11 +262,11 @@ __STATIC_INLINE void ARM_MPU_Load(ARM_MPU_Region_t const* table, uint32_t cnt)
{
const uint32_t rowWordSize = sizeof(ARM_MPU_Region_t)/4U;
while (cnt > MPU_TYPE_RALIASES) {
- orderedCpy(&(MPU->RBAR), &(table->RBAR), MPU_TYPE_RALIASES*rowWordSize);
+ ARM_MPU_OrderedMemcpy(&(MPU->RBAR), &(table->RBAR), MPU_TYPE_RALIASES*rowWordSize);
table += MPU_TYPE_RALIASES;
cnt -= MPU_TYPE_RALIASES;
}
- orderedCpy(&(MPU->RBAR), &(table->RBAR), cnt*rowWordSize);
+ ARM_MPU_OrderedMemcpy(&(MPU->RBAR), &(table->RBAR), cnt*rowWordSize);
}
#endif
diff --git a/Include/mpu_armv8.h b/Include/mpu_armv8.h
index 62571da..0041d4d 100644
--- a/Include/mpu_armv8.h
+++ b/Include/mpu_armv8.h
@@ -1,11 +1,11 @@
/******************************************************************************
* @file mpu_armv8.h
- * @brief CMSIS MPU API for Armv8-M MPU
- * @version V5.0.4
- * @date 10. January 2018
+ * @brief CMSIS MPU API for Armv8-M and Armv8.1-M MPU
+ * @version V5.1.0
+ * @date 08. March 2019
******************************************************************************/
/*
- * Copyright (c) 2017-2018 Arm Limited. All rights reserved.
+ * Copyright (c) 2017-2019 Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -101,6 +101,21 @@
((IDX << MPU_RLAR_AttrIndx_Pos) & MPU_RLAR_AttrIndx_Msk) | \
(MPU_RLAR_EN_Msk))
+#if defined(MPU_RLAR_PXN_Pos)
+
+/** \brief Region Limit Address Register with PXN value
+* \param LIMIT The limit address bits [31:5] for this memory region. The value is one extended.
+* \param PXN Privileged execute never. Defines whether code can be executed from this privileged region.
+* \param IDX The attribute index to be associated with this memory region.
+*/
+#define ARM_MPU_RLAR_PXN(LIMIT, PXN, IDX) \
+ ((LIMIT & MPU_RLAR_LIMIT_Msk) | \
+ ((PXN << MPU_RLAR_PXN_Pos) & MPU_RLAR_PXN_Msk) | \
+ ((IDX << MPU_RLAR_AttrIndx_Pos) & MPU_RLAR_AttrIndx_Msk) | \
+ (MPU_RLAR_EN_Msk))
+
+#endif
+
/**
* Struct for a single MPU Region
*/
@@ -114,20 +129,19 @@ typedef struct {
*/
__STATIC_INLINE void ARM_MPU_Enable(uint32_t MPU_Control)
{
- __DSB();
- __ISB();
MPU->CTRL = MPU_Control | MPU_CTRL_ENABLE_Msk;
#ifdef SCB_SHCSR_MEMFAULTENA_Msk
SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk;
#endif
+ __DSB();
+ __ISB();
}
/** Disable the MPU.
*/
__STATIC_INLINE void ARM_MPU_Disable(void)
{
- __DSB();
- __ISB();
+ __DMB();
#ifdef SCB_SHCSR_MEMFAULTENA_Msk
SCB->SHCSR &= ~SCB_SHCSR_MEMFAULTENA_Msk;
#endif
@@ -140,20 +154,19 @@ __STATIC_INLINE void ARM_MPU_Disable(void)
*/
__STATIC_INLINE void ARM_MPU_Enable_NS(uint32_t MPU_Control)
{
- __DSB();
- __ISB();
MPU_NS->CTRL = MPU_Control | MPU_CTRL_ENABLE_Msk;
#ifdef SCB_SHCSR_MEMFAULTENA_Msk
SCB_NS->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk;
#endif
+ __DSB();
+ __ISB();
}
/** Disable the Non-secure MPU.
*/
__STATIC_INLINE void ARM_MPU_Disable_NS(void)
{
- __DSB();
- __ISB();
+ __DMB();
#ifdef SCB_SHCSR_MEMFAULTENA_Msk
SCB_NS->SHCSR &= ~SCB_SHCSR_MEMFAULTENA_Msk;
#endif
@@ -267,7 +280,7 @@ __STATIC_INLINE void ARM_MPU_SetRegion_NS(uint32_t rnr, uint32_t rbar, uint32_t
* \param src Source data is copied from.
* \param len Amount of data words to be copied.
*/
-__STATIC_INLINE void orderedCpy(volatile uint32_t* dst, const uint32_t* __RESTRICT src, uint32_t len)
+__STATIC_INLINE void ARM_MPU_OrderedMemcpy(volatile uint32_t* dst, const uint32_t* __RESTRICT src, uint32_t len)
{
uint32_t i;
for (i = 0U; i < len; ++i)
@@ -287,7 +300,7 @@ __STATIC_INLINE void ARM_MPU_LoadEx(MPU_Type* mpu, uint32_t rnr, ARM_MPU_Region_
const uint32_t rowWordSize = sizeof(ARM_MPU_Region_t)/4U;
if (cnt == 1U) {
mpu->RNR = rnr;
- orderedCpy(&(mpu->RBAR), &(table->RBAR), rowWordSize);
+ ARM_MPU_OrderedMemcpy(&(mpu->RBAR), &(table->RBAR), rowWordSize);
} else {
uint32_t rnrBase = rnr & ~(MPU_TYPE_RALIASES-1U);
uint32_t rnrOffset = rnr % MPU_TYPE_RALIASES;
@@ -295,7 +308,7 @@ __STATIC_INLINE void ARM_MPU_LoadEx(MPU_Type* mpu, uint32_t rnr, ARM_MPU_Region_
mpu->RNR = rnrBase;
while ((rnrOffset + cnt) > MPU_TYPE_RALIASES) {
uint32_t c = MPU_TYPE_RALIASES - rnrOffset;
- orderedCpy(&(mpu->RBAR)+(rnrOffset*2U), &(table->RBAR), c*rowWordSize);
+ ARM_MPU_OrderedMemcpy(&(mpu->RBAR)+(rnrOffset*2U), &(table->RBAR), c*rowWordSize);
table += c;
cnt -= c;
rnrOffset = 0U;
@@ -303,7 +316,7 @@ __STATIC_INLINE void ARM_MPU_LoadEx(MPU_Type* mpu, uint32_t rnr, ARM_MPU_Region_
mpu->RNR = rnrBase;
}
- orderedCpy(&(mpu->RBAR)+(rnrOffset*2U), &(table->RBAR), cnt*rowWordSize);
+ ARM_MPU_OrderedMemcpy(&(mpu->RBAR)+(rnrOffset*2U), &(table->RBAR), cnt*rowWordSize);
}
}
diff --git a/Lib/ARM/arm_ARMv8MBLl_math.lib b/Lib/ARM/arm_ARMv8MBLl_math.lib
deleted file mode 100644
index 181b876..0000000
Binary files a/Lib/ARM/arm_ARMv8MBLl_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_ARMv8MMLl_math.lib b/Lib/ARM/arm_ARMv8MMLl_math.lib
deleted file mode 100644
index 6acdc24..0000000
Binary files a/Lib/ARM/arm_ARMv8MMLl_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_ARMv8MMLld_math.lib b/Lib/ARM/arm_ARMv8MMLld_math.lib
deleted file mode 100644
index 9dae573..0000000
Binary files a/Lib/ARM/arm_ARMv8MMLld_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_ARMv8MMLldfsp_math.lib b/Lib/ARM/arm_ARMv8MMLldfsp_math.lib
deleted file mode 100644
index ce90a77..0000000
Binary files a/Lib/ARM/arm_ARMv8MMLldfsp_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_ARMv8MMLlfsp_math.lib b/Lib/ARM/arm_ARMv8MMLlfsp_math.lib
deleted file mode 100644
index b3b5200..0000000
Binary files a/Lib/ARM/arm_ARMv8MMLlfsp_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_cortexM0b_math.lib b/Lib/ARM/arm_cortexM0b_math.lib
deleted file mode 100644
index beec252..0000000
Binary files a/Lib/ARM/arm_cortexM0b_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_cortexM0l_math.lib b/Lib/ARM/arm_cortexM0l_math.lib
deleted file mode 100644
index 73aef8a..0000000
Binary files a/Lib/ARM/arm_cortexM0l_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_cortexM3b_math.lib b/Lib/ARM/arm_cortexM3b_math.lib
deleted file mode 100644
index 1a8f571..0000000
Binary files a/Lib/ARM/arm_cortexM3b_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_cortexM3l_math.lib b/Lib/ARM/arm_cortexM3l_math.lib
deleted file mode 100644
index 52677b7..0000000
Binary files a/Lib/ARM/arm_cortexM3l_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_cortexM4b_math.lib b/Lib/ARM/arm_cortexM4b_math.lib
deleted file mode 100644
index 5a4b86f..0000000
Binary files a/Lib/ARM/arm_cortexM4b_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_cortexM4bf_math.lib b/Lib/ARM/arm_cortexM4bf_math.lib
deleted file mode 100644
index fe135ae..0000000
Binary files a/Lib/ARM/arm_cortexM4bf_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_cortexM4l_math.lib b/Lib/ARM/arm_cortexM4l_math.lib
deleted file mode 100644
index 593d42f..0000000
Binary files a/Lib/ARM/arm_cortexM4l_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_cortexM4lf_math.lib b/Lib/ARM/arm_cortexM4lf_math.lib
deleted file mode 100644
index 5e8d842..0000000
Binary files a/Lib/ARM/arm_cortexM4lf_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_cortexM7b_math.lib b/Lib/ARM/arm_cortexM7b_math.lib
deleted file mode 100644
index b17ef6a..0000000
Binary files a/Lib/ARM/arm_cortexM7b_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_cortexM7bfdp_math.lib b/Lib/ARM/arm_cortexM7bfdp_math.lib
deleted file mode 100644
index 9f5712a..0000000
Binary files a/Lib/ARM/arm_cortexM7bfdp_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_cortexM7bfsp_math.lib b/Lib/ARM/arm_cortexM7bfsp_math.lib
deleted file mode 100644
index 4ec7d63..0000000
Binary files a/Lib/ARM/arm_cortexM7bfsp_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_cortexM7l_math.lib b/Lib/ARM/arm_cortexM7l_math.lib
deleted file mode 100644
index bed5c16..0000000
Binary files a/Lib/ARM/arm_cortexM7l_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_cortexM7lfdp_math.lib b/Lib/ARM/arm_cortexM7lfdp_math.lib
deleted file mode 100644
index 47f0121..0000000
Binary files a/Lib/ARM/arm_cortexM7lfdp_math.lib and /dev/null differ
diff --git a/Lib/ARM/arm_cortexM7lfsp_math.lib b/Lib/ARM/arm_cortexM7lfsp_math.lib
deleted file mode 100644
index d16578a..0000000
Binary files a/Lib/ARM/arm_cortexM7lfsp_math.lib and /dev/null differ
diff --git a/Lib/GCC/libarm_ARMv8MBLl_math.a b/Lib/GCC/libarm_ARMv8MBLl_math.a
deleted file mode 100644
index d70be0f..0000000
Binary files a/Lib/GCC/libarm_ARMv8MBLl_math.a and /dev/null differ
diff --git a/Lib/GCC/libarm_ARMv8MMLl_math.a b/Lib/GCC/libarm_ARMv8MMLl_math.a
deleted file mode 100644
index d01c450..0000000
Binary files a/Lib/GCC/libarm_ARMv8MMLl_math.a and /dev/null differ
diff --git a/Lib/GCC/libarm_ARMv8MMLld_math.a b/Lib/GCC/libarm_ARMv8MMLld_math.a
deleted file mode 100644
index e6e06c7..0000000
Binary files a/Lib/GCC/libarm_ARMv8MMLld_math.a and /dev/null differ
diff --git a/Lib/GCC/libarm_ARMv8MMLldfsp_math.a b/Lib/GCC/libarm_ARMv8MMLldfsp_math.a
deleted file mode 100644
index e284e9e..0000000
Binary files a/Lib/GCC/libarm_ARMv8MMLldfsp_math.a and /dev/null differ
diff --git a/Lib/GCC/libarm_ARMv8MMLlfsp_math.a b/Lib/GCC/libarm_ARMv8MMLlfsp_math.a
deleted file mode 100644
index d1691fc..0000000
Binary files a/Lib/GCC/libarm_ARMv8MMLlfsp_math.a and /dev/null differ
diff --git a/Lib/GCC/libarm_cortexM0l_math.a b/Lib/GCC/libarm_cortexM0l_math.a
deleted file mode 100644
index 98d1f29..0000000
Binary files a/Lib/GCC/libarm_cortexM0l_math.a and /dev/null differ
diff --git a/Lib/GCC/libarm_cortexM3l_math.a b/Lib/GCC/libarm_cortexM3l_math.a
deleted file mode 100644
index 12c644a..0000000
Binary files a/Lib/GCC/libarm_cortexM3l_math.a and /dev/null differ
diff --git a/Lib/GCC/libarm_cortexM4l_math.a b/Lib/GCC/libarm_cortexM4l_math.a
deleted file mode 100644
index ca10192..0000000
Binary files a/Lib/GCC/libarm_cortexM4l_math.a and /dev/null differ
diff --git a/Lib/GCC/libarm_cortexM4lf_math.a b/Lib/GCC/libarm_cortexM4lf_math.a
deleted file mode 100644
index d8c3658..0000000
Binary files a/Lib/GCC/libarm_cortexM4lf_math.a and /dev/null differ
diff --git a/Lib/GCC/libarm_cortexM7l_math.a b/Lib/GCC/libarm_cortexM7l_math.a
deleted file mode 100644
index 376ab84..0000000
Binary files a/Lib/GCC/libarm_cortexM7l_math.a and /dev/null differ
diff --git a/Lib/GCC/libarm_cortexM7lfdp_math.a b/Lib/GCC/libarm_cortexM7lfdp_math.a
deleted file mode 100644
index 170233a..0000000
Binary files a/Lib/GCC/libarm_cortexM7lfdp_math.a and /dev/null differ
diff --git a/Lib/GCC/libarm_cortexM7lfsp_math.a b/Lib/GCC/libarm_cortexM7lfsp_math.a
deleted file mode 100644
index 36c7461..0000000
Binary files a/Lib/GCC/libarm_cortexM7lfsp_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_ARMv8MBLl_math.a b/Lib/IAR/iar_ARMv8MBLl_math.a
deleted file mode 100644
index b5e74a7..0000000
Binary files a/Lib/IAR/iar_ARMv8MBLl_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_ARMv8MMLl_math.a b/Lib/IAR/iar_ARMv8MMLl_math.a
deleted file mode 100644
index 77b627e..0000000
Binary files a/Lib/IAR/iar_ARMv8MMLl_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_ARMv8MMLld_math.a b/Lib/IAR/iar_ARMv8MMLld_math.a
deleted file mode 100644
index 2a9e6c0..0000000
Binary files a/Lib/IAR/iar_ARMv8MMLld_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_ARMv8MMLldfdp_math.a b/Lib/IAR/iar_ARMv8MMLldfdp_math.a
deleted file mode 100644
index 44304ba..0000000
Binary files a/Lib/IAR/iar_ARMv8MMLldfdp_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_ARMv8MMLldfsp_math.a b/Lib/IAR/iar_ARMv8MMLldfsp_math.a
deleted file mode 100644
index 8ac4412..0000000
Binary files a/Lib/IAR/iar_ARMv8MMLldfsp_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_ARMv8MMLlfdp_math.a b/Lib/IAR/iar_ARMv8MMLlfdp_math.a
deleted file mode 100644
index 1df5659..0000000
Binary files a/Lib/IAR/iar_ARMv8MMLlfdp_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_ARMv8MMLlfsp_math.a b/Lib/IAR/iar_ARMv8MMLlfsp_math.a
deleted file mode 100644
index 606a16c..0000000
Binary files a/Lib/IAR/iar_ARMv8MMLlfsp_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_cortexM0b_math.a b/Lib/IAR/iar_cortexM0b_math.a
deleted file mode 100644
index 4e9fd75..0000000
Binary files a/Lib/IAR/iar_cortexM0b_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_cortexM0l_math.a b/Lib/IAR/iar_cortexM0l_math.a
deleted file mode 100644
index 35f0fcd..0000000
Binary files a/Lib/IAR/iar_cortexM0l_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_cortexM3b_math.a b/Lib/IAR/iar_cortexM3b_math.a
deleted file mode 100644
index 694862f..0000000
Binary files a/Lib/IAR/iar_cortexM3b_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_cortexM3l_math.a b/Lib/IAR/iar_cortexM3l_math.a
deleted file mode 100644
index 9be18bf..0000000
Binary files a/Lib/IAR/iar_cortexM3l_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_cortexM4b_math.a b/Lib/IAR/iar_cortexM4b_math.a
deleted file mode 100644
index d17c8c8..0000000
Binary files a/Lib/IAR/iar_cortexM4b_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_cortexM4bf_math.a b/Lib/IAR/iar_cortexM4bf_math.a
deleted file mode 100644
index fe2a42e..0000000
Binary files a/Lib/IAR/iar_cortexM4bf_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_cortexM4l_math.a b/Lib/IAR/iar_cortexM4l_math.a
deleted file mode 100644
index 805e7ea..0000000
Binary files a/Lib/IAR/iar_cortexM4l_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_cortexM4lf_math.a b/Lib/IAR/iar_cortexM4lf_math.a
deleted file mode 100644
index 2260b2e..0000000
Binary files a/Lib/IAR/iar_cortexM4lf_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_cortexM7b_math.a b/Lib/IAR/iar_cortexM7b_math.a
deleted file mode 100644
index b1f01fe..0000000
Binary files a/Lib/IAR/iar_cortexM7b_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_cortexM7bf_math.a b/Lib/IAR/iar_cortexM7bf_math.a
deleted file mode 100644
index 8b49543..0000000
Binary files a/Lib/IAR/iar_cortexM7bf_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_cortexM7bs_math.a b/Lib/IAR/iar_cortexM7bs_math.a
deleted file mode 100644
index 7bdbbc9..0000000
Binary files a/Lib/IAR/iar_cortexM7bs_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_cortexM7l_math.a b/Lib/IAR/iar_cortexM7l_math.a
deleted file mode 100644
index fa8fd7b..0000000
Binary files a/Lib/IAR/iar_cortexM7l_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_cortexM7lf_math.a b/Lib/IAR/iar_cortexM7lf_math.a
deleted file mode 100644
index 8d85dc8..0000000
Binary files a/Lib/IAR/iar_cortexM7lf_math.a and /dev/null differ
diff --git a/Lib/IAR/iar_cortexM7ls_math.a b/Lib/IAR/iar_cortexM7ls_math.a
deleted file mode 100644
index 97ee91e..0000000
Binary files a/Lib/IAR/iar_cortexM7ls_math.a and /dev/null differ
diff --git a/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM0/startup_ARMCM0.s b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM0/startup_ARMCM0.s
new file mode 100644
index 0000000..9fc447d
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM0/startup_ARMCM0.s
@@ -0,0 +1,242 @@
+;/**************************************************************************//**
+; * @file startup_ARMCM0.s
+; * @brief CMSIS Core Device Startup File for
+; * ARMCM0 Device Series
+; * @version V5.00
+; * @date 02. March 2016
+; ******************************************************************************/
+;/*
+; * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+; *
+; * SPDX-License-Identifier: Apache-2.0
+; *
+; * Licensed under the Apache License, Version 2.0 (the License); you may
+; * not use this file except in compliance with the License.
+; * You may obtain a copy of the License at
+; *
+; * www.apache.org/licenses/LICENSE-2.0
+; *
+; * Unless required by applicable law or agreed to in writing, software
+; * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+; * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the License for the specific language governing permissions and
+; * limitations under the License.
+; */
+
+;/*
+;//-------- <<< Use Configuration Wizard in Context Menu >>> ------------------
+;*/
+
+
+; Stack Configuration
+; Stack Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Stack_Size EQU 0x00000400
+
+ AREA STACK, NOINIT, READWRITE, ALIGN=3
+Stack_Mem SPACE Stack_Size
+__initial_sp
+
+
+; Heap Configuration
+; Heap Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Heap_Size EQU 0x00000C00
+
+ AREA HEAP, NOINIT, READWRITE, ALIGN=3
+__heap_base
+Heap_Mem SPACE Heap_Size
+__heap_limit
+
+
+ PRESERVE8
+ THUMB
+
+
+; Vector Table Mapped to Address 0 at Reset
+
+ AREA RESET, DATA, READONLY
+ EXPORT __Vectors
+ EXPORT __Vectors_End
+ EXPORT __Vectors_Size
+
+__Vectors DCD __initial_sp ; Top of Stack
+ DCD Reset_Handler ; Reset Handler
+ DCD NMI_Handler ; NMI Handler
+ DCD HardFault_Handler ; Hard Fault Handler
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD SVC_Handler ; SVCall Handler
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD PendSV_Handler ; PendSV Handler
+ DCD SysTick_Handler ; SysTick Handler
+
+ ; External Interrupts
+ DCD WDT_IRQHandler ; 0: Watchdog Timer
+ DCD RTC_IRQHandler ; 1: Real Time Clock
+ DCD TIM0_IRQHandler ; 2: Timer0 / Timer1
+ DCD TIM2_IRQHandler ; 3: Timer2 / Timer3
+ DCD MCIA_IRQHandler ; 4: MCIa
+ DCD MCIB_IRQHandler ; 5: MCIb
+ DCD UART0_IRQHandler ; 6: UART0 - DUT FPGA
+ DCD UART1_IRQHandler ; 7: UART1 - DUT FPGA
+ DCD UART2_IRQHandler ; 8: UART2 - DUT FPGA
+ DCD UART4_IRQHandler ; 9: UART4 - not connected
+ DCD AACI_IRQHandler ; 10: AACI / AC97
+ DCD CLCD_IRQHandler ; 11: CLCD Combined Interrupt
+ DCD ENET_IRQHandler ; 12: Ethernet
+ DCD USBDC_IRQHandler ; 13: USB Device
+ DCD USBHC_IRQHandler ; 14: USB Host Controller
+ DCD CHLCD_IRQHandler ; 15: Character LCD
+ DCD FLEXRAY_IRQHandler ; 16: Flexray
+ DCD CAN_IRQHandler ; 17: CAN
+ DCD LIN_IRQHandler ; 18: LIN
+ DCD I2C_IRQHandler ; 19: I2C ADC/DAC
+ DCD 0 ; 20: Reserved
+ DCD 0 ; 21: Reserved
+ DCD 0 ; 22: Reserved
+ DCD 0 ; 23: Reserved
+ DCD 0 ; 24: Reserved
+ DCD 0 ; 25: Reserved
+ DCD 0 ; 26: Reserved
+ DCD 0 ; 27: Reserved
+ DCD CPU_CLCD_IRQHandler ; 28: Reserved - CPU FPGA CLCD
+ DCD 0 ; 29: Reserved - CPU FPGA
+ DCD UART3_IRQHandler ; 30: UART3 - CPU FPGA
+ DCD SPI_IRQHandler ; 31: SPI Touchscreen - CPU FPGA
+__Vectors_End
+
+__Vectors_Size EQU __Vectors_End - __Vectors
+
+ AREA |.text|, CODE, READONLY
+
+
+; Reset Handler
+
+Reset_Handler PROC
+ EXPORT Reset_Handler [WEAK]
+ IMPORT SystemInit
+ IMPORT __main
+ LDR R0, =SystemInit
+ BLX R0
+ LDR R0, =__main
+ BX R0
+ ENDP
+
+
+; Dummy Exception Handlers (infinite loops which can be modified)
+
+NMI_Handler PROC
+ EXPORT NMI_Handler [WEAK]
+ B .
+ ENDP
+HardFault_Handler\
+ PROC
+ EXPORT HardFault_Handler [WEAK]
+ B .
+ ENDP
+SVC_Handler PROC
+ EXPORT SVC_Handler [WEAK]
+ B .
+ ENDP
+PendSV_Handler PROC
+ EXPORT PendSV_Handler [WEAK]
+ B .
+ ENDP
+SysTick_Handler PROC
+ EXPORT SysTick_Handler [WEAK]
+ B .
+ ENDP
+
+Default_Handler PROC
+
+ EXPORT WDT_IRQHandler [WEAK]
+ EXPORT RTC_IRQHandler [WEAK]
+ EXPORT TIM0_IRQHandler [WEAK]
+ EXPORT TIM2_IRQHandler [WEAK]
+ EXPORT MCIA_IRQHandler [WEAK]
+ EXPORT MCIB_IRQHandler [WEAK]
+ EXPORT UART0_IRQHandler [WEAK]
+ EXPORT UART1_IRQHandler [WEAK]
+ EXPORT UART2_IRQHandler [WEAK]
+ EXPORT UART3_IRQHandler [WEAK]
+ EXPORT UART4_IRQHandler [WEAK]
+ EXPORT AACI_IRQHandler [WEAK]
+ EXPORT CLCD_IRQHandler [WEAK]
+ EXPORT ENET_IRQHandler [WEAK]
+ EXPORT USBDC_IRQHandler [WEAK]
+ EXPORT USBHC_IRQHandler [WEAK]
+ EXPORT CHLCD_IRQHandler [WEAK]
+ EXPORT FLEXRAY_IRQHandler [WEAK]
+ EXPORT CAN_IRQHandler [WEAK]
+ EXPORT LIN_IRQHandler [WEAK]
+ EXPORT I2C_IRQHandler [WEAK]
+ EXPORT CPU_CLCD_IRQHandler [WEAK]
+ EXPORT SPI_IRQHandler [WEAK]
+
+WDT_IRQHandler
+RTC_IRQHandler
+TIM0_IRQHandler
+TIM2_IRQHandler
+MCIA_IRQHandler
+MCIB_IRQHandler
+UART0_IRQHandler
+UART1_IRQHandler
+UART2_IRQHandler
+UART3_IRQHandler
+UART4_IRQHandler
+AACI_IRQHandler
+CLCD_IRQHandler
+ENET_IRQHandler
+USBDC_IRQHandler
+USBHC_IRQHandler
+CHLCD_IRQHandler
+FLEXRAY_IRQHandler
+CAN_IRQHandler
+LIN_IRQHandler
+I2C_IRQHandler
+CPU_CLCD_IRQHandler
+SPI_IRQHandler
+ B .
+
+ ENDP
+
+
+ ALIGN
+
+
+; User Initial Stack & Heap
+
+ IF :DEF:__MICROLIB
+
+ EXPORT __initial_sp
+ EXPORT __heap_base
+ EXPORT __heap_limit
+
+ ELSE
+
+ IMPORT __use_two_region_memory
+ EXPORT __user_initial_stackheap
+
+__user_initial_stackheap PROC
+ LDR R0, = Heap_Mem
+ LDR R1, =(Stack_Mem + Stack_Size)
+ LDR R2, = (Heap_Mem + Heap_Size)
+ LDR R3, = Stack_Mem
+ BX LR
+ ENDP
+
+ ALIGN
+
+ ENDIF
+
+
+ END
diff --git a/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM0/system_ARMCM0.c b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM0/system_ARMCM0.c
new file mode 100644
index 0000000..66829ad
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM0/system_ARMCM0.c
@@ -0,0 +1,56 @@
+/**************************************************************************//**
+ * @file system_ARMCM0.c
+ * @brief CMSIS Device System Source File for
+ * ARMCM0 Device Series
+ * @version V5.00
+ * @date 08. April 2016
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ARMCM0.h"
+
+/*----------------------------------------------------------------------------
+ Define clocks
+ *----------------------------------------------------------------------------*/
+#define XTAL ( 5000000U) /* Oscillator frequency */
+
+#define SYSTEM_CLOCK (5 * XTAL)
+
+
+/*----------------------------------------------------------------------------
+ System Core Clock Variable
+ *----------------------------------------------------------------------------*/
+uint32_t SystemCoreClock = SYSTEM_CLOCK;
+
+
+/*----------------------------------------------------------------------------
+ System Core Clock update function
+ *----------------------------------------------------------------------------*/
+void SystemCoreClockUpdate (void)
+{
+ SystemCoreClock = SYSTEM_CLOCK;
+}
+
+/*----------------------------------------------------------------------------
+ System initialization function
+ *----------------------------------------------------------------------------*/
+void SystemInit (void)
+{
+ SystemCoreClock = SYSTEM_CLOCK;
+}
diff --git a/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM3/startup_ARMCM3.s b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM3/startup_ARMCM3.s
new file mode 100644
index 0000000..16e56b0
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM3/startup_ARMCM3.s
@@ -0,0 +1,262 @@
+;/**************************************************************************//**
+; * @file startup_ARMCM3.s
+; * @brief CMSIS Core Device Startup File for
+; * ARMCM3 Device Series
+; * @version V5.00
+; * @date 02. March 2016
+; ******************************************************************************/
+;/*
+; * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+; *
+; * SPDX-License-Identifier: Apache-2.0
+; *
+; * Licensed under the Apache License, Version 2.0 (the License); you may
+; * not use this file except in compliance with the License.
+; * You may obtain a copy of the License at
+; *
+; * www.apache.org/licenses/LICENSE-2.0
+; *
+; * Unless required by applicable law or agreed to in writing, software
+; * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+; * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the License for the specific language governing permissions and
+; * limitations under the License.
+; */
+
+;/*
+;//-------- <<< Use Configuration Wizard in Context Menu >>> ------------------
+;*/
+
+
+; Stack Configuration
+; Stack Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Stack_Size EQU 0x00000400
+
+ AREA STACK, NOINIT, READWRITE, ALIGN=3
+Stack_Mem SPACE Stack_Size
+__initial_sp
+
+
+; Heap Configuration
+; Heap Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Heap_Size EQU 0x00000C00
+
+ AREA HEAP, NOINIT, READWRITE, ALIGN=3
+__heap_base
+Heap_Mem SPACE Heap_Size
+__heap_limit
+
+
+ PRESERVE8
+ THUMB
+
+
+; Vector Table Mapped to Address 0 at Reset
+
+ AREA RESET, DATA, READONLY
+ EXPORT __Vectors
+ EXPORT __Vectors_End
+ EXPORT __Vectors_Size
+
+__Vectors DCD __initial_sp ; Top of Stack
+ DCD Reset_Handler ; Reset Handler
+ DCD NMI_Handler ; NMI Handler
+ DCD HardFault_Handler ; Hard Fault Handler
+ DCD MemManage_Handler ; MPU Fault Handler
+ DCD BusFault_Handler ; Bus Fault Handler
+ DCD UsageFault_Handler ; Usage Fault Handler
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD SVC_Handler ; SVCall Handler
+ DCD DebugMon_Handler ; Debug Monitor Handler
+ DCD 0 ; Reserved
+ DCD PendSV_Handler ; PendSV Handler
+ DCD SysTick_Handler ; SysTick Handler
+
+ ; External Interrupts
+ DCD WDT_IRQHandler ; 0: Watchdog Timer
+ DCD RTC_IRQHandler ; 1: Real Time Clock
+ DCD TIM0_IRQHandler ; 2: Timer0 / Timer1
+ DCD TIM2_IRQHandler ; 3: Timer2 / Timer3
+ DCD MCIA_IRQHandler ; 4: MCIa
+ DCD MCIB_IRQHandler ; 5: MCIb
+ DCD UART0_IRQHandler ; 6: UART0 - DUT FPGA
+ DCD UART1_IRQHandler ; 7: UART1 - DUT FPGA
+ DCD UART2_IRQHandler ; 8: UART2 - DUT FPGA
+ DCD UART4_IRQHandler ; 9: UART4 - not connected
+ DCD AACI_IRQHandler ; 10: AACI / AC97
+ DCD CLCD_IRQHandler ; 11: CLCD Combined Interrupt
+ DCD ENET_IRQHandler ; 12: Ethernet
+ DCD USBDC_IRQHandler ; 13: USB Device
+ DCD USBHC_IRQHandler ; 14: USB Host Controller
+ DCD CHLCD_IRQHandler ; 15: Character LCD
+ DCD FLEXRAY_IRQHandler ; 16: Flexray
+ DCD CAN_IRQHandler ; 17: CAN
+ DCD LIN_IRQHandler ; 18: LIN
+ DCD I2C_IRQHandler ; 19: I2C ADC/DAC
+ DCD 0 ; 20: Reserved
+ DCD 0 ; 21: Reserved
+ DCD 0 ; 22: Reserved
+ DCD 0 ; 23: Reserved
+ DCD 0 ; 24: Reserved
+ DCD 0 ; 25: Reserved
+ DCD 0 ; 26: Reserved
+ DCD 0 ; 27: Reserved
+ DCD CPU_CLCD_IRQHandler ; 28: Reserved - CPU FPGA CLCD
+ DCD 0 ; 29: Reserved - CPU FPGA
+ DCD UART3_IRQHandler ; 30: UART3 - CPU FPGA
+ DCD SPI_IRQHandler ; 31: SPI Touchscreen - CPU FPGA
+__Vectors_End
+
+__Vectors_Size EQU __Vectors_End - __Vectors
+
+ AREA |.text|, CODE, READONLY
+
+
+; Reset Handler
+
+Reset_Handler PROC
+ EXPORT Reset_Handler [WEAK]
+ IMPORT SystemInit
+ IMPORT __main
+ LDR R0, =SystemInit
+ BLX R0
+ LDR R0, =__main
+ BX R0
+ ENDP
+
+
+; Dummy Exception Handlers (infinite loops which can be modified)
+
+NMI_Handler PROC
+ EXPORT NMI_Handler [WEAK]
+ B .
+ ENDP
+HardFault_Handler\
+ PROC
+ EXPORT HardFault_Handler [WEAK]
+ B .
+ ENDP
+MemManage_Handler\
+ PROC
+ EXPORT MemManage_Handler [WEAK]
+ B .
+ ENDP
+BusFault_Handler\
+ PROC
+ EXPORT BusFault_Handler [WEAK]
+ B .
+ ENDP
+UsageFault_Handler\
+ PROC
+ EXPORT UsageFault_Handler [WEAK]
+ B .
+ ENDP
+SVC_Handler PROC
+ EXPORT SVC_Handler [WEAK]
+ B .
+ ENDP
+DebugMon_Handler\
+ PROC
+ EXPORT DebugMon_Handler [WEAK]
+ B .
+ ENDP
+PendSV_Handler PROC
+ EXPORT PendSV_Handler [WEAK]
+ B .
+ ENDP
+SysTick_Handler PROC
+ EXPORT SysTick_Handler [WEAK]
+ B .
+ ENDP
+
+Default_Handler PROC
+
+ EXPORT WDT_IRQHandler [WEAK]
+ EXPORT RTC_IRQHandler [WEAK]
+ EXPORT TIM0_IRQHandler [WEAK]
+ EXPORT TIM2_IRQHandler [WEAK]
+ EXPORT MCIA_IRQHandler [WEAK]
+ EXPORT MCIB_IRQHandler [WEAK]
+ EXPORT UART0_IRQHandler [WEAK]
+ EXPORT UART1_IRQHandler [WEAK]
+ EXPORT UART2_IRQHandler [WEAK]
+ EXPORT UART3_IRQHandler [WEAK]
+ EXPORT UART4_IRQHandler [WEAK]
+ EXPORT AACI_IRQHandler [WEAK]
+ EXPORT CLCD_IRQHandler [WEAK]
+ EXPORT ENET_IRQHandler [WEAK]
+ EXPORT USBDC_IRQHandler [WEAK]
+ EXPORT USBHC_IRQHandler [WEAK]
+ EXPORT CHLCD_IRQHandler [WEAK]
+ EXPORT FLEXRAY_IRQHandler [WEAK]
+ EXPORT CAN_IRQHandler [WEAK]
+ EXPORT LIN_IRQHandler [WEAK]
+ EXPORT I2C_IRQHandler [WEAK]
+ EXPORT CPU_CLCD_IRQHandler [WEAK]
+ EXPORT SPI_IRQHandler [WEAK]
+
+WDT_IRQHandler
+RTC_IRQHandler
+TIM0_IRQHandler
+TIM2_IRQHandler
+MCIA_IRQHandler
+MCIB_IRQHandler
+UART0_IRQHandler
+UART1_IRQHandler
+UART2_IRQHandler
+UART3_IRQHandler
+UART4_IRQHandler
+AACI_IRQHandler
+CLCD_IRQHandler
+ENET_IRQHandler
+USBDC_IRQHandler
+USBHC_IRQHandler
+CHLCD_IRQHandler
+FLEXRAY_IRQHandler
+CAN_IRQHandler
+LIN_IRQHandler
+I2C_IRQHandler
+CPU_CLCD_IRQHandler
+SPI_IRQHandler
+ B .
+
+ ENDP
+
+
+ ALIGN
+
+
+; User Initial Stack & Heap
+
+ IF :DEF:__MICROLIB
+
+ EXPORT __initial_sp
+ EXPORT __heap_base
+ EXPORT __heap_limit
+
+ ELSE
+
+ IMPORT __use_two_region_memory
+ EXPORT __user_initial_stackheap
+
+__user_initial_stackheap PROC
+ LDR R0, = Heap_Mem
+ LDR R1, =(Stack_Mem + Stack_Size)
+ LDR R2, = (Heap_Mem + Heap_Size)
+ LDR R3, = Stack_Mem
+ BX LR
+ ENDP
+
+ ALIGN
+
+ ENDIF
+
+
+ END
diff --git a/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM3/system_ARMCM3.c b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM3/system_ARMCM3.c
new file mode 100644
index 0000000..f08df7a
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM3/system_ARMCM3.c
@@ -0,0 +1,68 @@
+/**************************************************************************//**
+ * @file system_ARMCM3.c
+ * @brief CMSIS Device System Source File for
+ * ARMCM3 Device Series
+ * @version V5.00
+ * @date 08. April 2016
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ARMCM3.h"
+
+/*----------------------------------------------------------------------------
+ Define clocks
+ *----------------------------------------------------------------------------*/
+#define XTAL ( 5000000U) /* Oscillator frequency */
+
+#define SYSTEM_CLOCK (5 * XTAL)
+
+
+/*----------------------------------------------------------------------------
+ Externals
+ *----------------------------------------------------------------------------*/
+#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1)
+ extern uint32_t __Vectors;
+#endif
+
+/*----------------------------------------------------------------------------
+ System Core Clock Variable
+ *----------------------------------------------------------------------------*/
+uint32_t SystemCoreClock = SYSTEM_CLOCK;
+
+
+/*----------------------------------------------------------------------------
+ System Core Clock update function
+ *----------------------------------------------------------------------------*/
+void SystemCoreClockUpdate (void)
+{
+ SystemCoreClock = SYSTEM_CLOCK;
+}
+
+/*----------------------------------------------------------------------------
+ System initialization function
+ *----------------------------------------------------------------------------*/
+void SystemInit (void)
+{
+
+#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1)
+ SCB->VTOR = (uint32_t) &__Vectors;
+#endif
+
+ SystemCoreClock = SYSTEM_CLOCK;
+}
diff --git a/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM4_FP/startup_ARMCM4.s b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM4_FP/startup_ARMCM4.s
new file mode 100644
index 0000000..dae6439
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM4_FP/startup_ARMCM4.s
@@ -0,0 +1,262 @@
+;/**************************************************************************//**
+; * @file startup_ARMCM4.s
+; * @brief CMSIS Core Device Startup File for
+; * ARMCM4 Device Series
+; * @version V5.00
+; * @date 02. March 2016
+; ******************************************************************************/
+;/*
+; * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+; *
+; * SPDX-License-Identifier: Apache-2.0
+; *
+; * Licensed under the Apache License, Version 2.0 (the License); you may
+; * not use this file except in compliance with the License.
+; * You may obtain a copy of the License at
+; *
+; * www.apache.org/licenses/LICENSE-2.0
+; *
+; * Unless required by applicable law or agreed to in writing, software
+; * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+; * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the License for the specific language governing permissions and
+; * limitations under the License.
+; */
+
+;/*
+;//-------- <<< Use Configuration Wizard in Context Menu >>> ------------------
+;*/
+
+
+; Stack Configuration
+; Stack Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Stack_Size EQU 0x00000400
+
+ AREA STACK, NOINIT, READWRITE, ALIGN=3
+Stack_Mem SPACE Stack_Size
+__initial_sp
+
+
+; Heap Configuration
+; Heap Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Heap_Size EQU 0x00000C00
+
+ AREA HEAP, NOINIT, READWRITE, ALIGN=3
+__heap_base
+Heap_Mem SPACE Heap_Size
+__heap_limit
+
+
+ PRESERVE8
+ THUMB
+
+
+; Vector Table Mapped to Address 0 at Reset
+
+ AREA RESET, DATA, READONLY
+ EXPORT __Vectors
+ EXPORT __Vectors_End
+ EXPORT __Vectors_Size
+
+__Vectors DCD __initial_sp ; Top of Stack
+ DCD Reset_Handler ; Reset Handler
+ DCD NMI_Handler ; NMI Handler
+ DCD HardFault_Handler ; Hard Fault Handler
+ DCD MemManage_Handler ; MPU Fault Handler
+ DCD BusFault_Handler ; Bus Fault Handler
+ DCD UsageFault_Handler ; Usage Fault Handler
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD SVC_Handler ; SVCall Handler
+ DCD DebugMon_Handler ; Debug Monitor Handler
+ DCD 0 ; Reserved
+ DCD PendSV_Handler ; PendSV Handler
+ DCD SysTick_Handler ; SysTick Handler
+
+ ; External Interrupts
+ DCD WDT_IRQHandler ; 0: Watchdog Timer
+ DCD RTC_IRQHandler ; 1: Real Time Clock
+ DCD TIM0_IRQHandler ; 2: Timer0 / Timer1
+ DCD TIM2_IRQHandler ; 3: Timer2 / Timer3
+ DCD MCIA_IRQHandler ; 4: MCIa
+ DCD MCIB_IRQHandler ; 5: MCIb
+ DCD UART0_IRQHandler ; 6: UART0 - DUT FPGA
+ DCD UART1_IRQHandler ; 7: UART1 - DUT FPGA
+ DCD UART2_IRQHandler ; 8: UART2 - DUT FPGA
+ DCD UART4_IRQHandler ; 9: UART4 - not connected
+ DCD AACI_IRQHandler ; 10: AACI / AC97
+ DCD CLCD_IRQHandler ; 11: CLCD Combined Interrupt
+ DCD ENET_IRQHandler ; 12: Ethernet
+ DCD USBDC_IRQHandler ; 13: USB Device
+ DCD USBHC_IRQHandler ; 14: USB Host Controller
+ DCD CHLCD_IRQHandler ; 15: Character LCD
+ DCD FLEXRAY_IRQHandler ; 16: Flexray
+ DCD CAN_IRQHandler ; 17: CAN
+ DCD LIN_IRQHandler ; 18: LIN
+ DCD I2C_IRQHandler ; 19: I2C ADC/DAC
+ DCD 0 ; 20: Reserved
+ DCD 0 ; 21: Reserved
+ DCD 0 ; 22: Reserved
+ DCD 0 ; 23: Reserved
+ DCD 0 ; 24: Reserved
+ DCD 0 ; 25: Reserved
+ DCD 0 ; 26: Reserved
+ DCD 0 ; 27: Reserved
+ DCD CPU_CLCD_IRQHandler ; 28: Reserved - CPU FPGA CLCD
+ DCD 0 ; 29: Reserved - CPU FPGA
+ DCD UART3_IRQHandler ; 30: UART3 - CPU FPGA
+ DCD SPI_IRQHandler ; 31: SPI Touchscreen - CPU FPGA
+__Vectors_End
+
+__Vectors_Size EQU __Vectors_End - __Vectors
+
+ AREA |.text|, CODE, READONLY
+
+
+; Reset Handler
+
+Reset_Handler PROC
+ EXPORT Reset_Handler [WEAK]
+ IMPORT SystemInit
+ IMPORT __main
+ LDR R0, =SystemInit
+ BLX R0
+ LDR R0, =__main
+ BX R0
+ ENDP
+
+
+; Dummy Exception Handlers (infinite loops which can be modified)
+
+NMI_Handler PROC
+ EXPORT NMI_Handler [WEAK]
+ B .
+ ENDP
+HardFault_Handler\
+ PROC
+ EXPORT HardFault_Handler [WEAK]
+ B .
+ ENDP
+MemManage_Handler\
+ PROC
+ EXPORT MemManage_Handler [WEAK]
+ B .
+ ENDP
+BusFault_Handler\
+ PROC
+ EXPORT BusFault_Handler [WEAK]
+ B .
+ ENDP
+UsageFault_Handler\
+ PROC
+ EXPORT UsageFault_Handler [WEAK]
+ B .
+ ENDP
+SVC_Handler PROC
+ EXPORT SVC_Handler [WEAK]
+ B .
+ ENDP
+DebugMon_Handler\
+ PROC
+ EXPORT DebugMon_Handler [WEAK]
+ B .
+ ENDP
+PendSV_Handler PROC
+ EXPORT PendSV_Handler [WEAK]
+ B .
+ ENDP
+SysTick_Handler PROC
+ EXPORT SysTick_Handler [WEAK]
+ B .
+ ENDP
+
+Default_Handler PROC
+
+ EXPORT WDT_IRQHandler [WEAK]
+ EXPORT RTC_IRQHandler [WEAK]
+ EXPORT TIM0_IRQHandler [WEAK]
+ EXPORT TIM2_IRQHandler [WEAK]
+ EXPORT MCIA_IRQHandler [WEAK]
+ EXPORT MCIB_IRQHandler [WEAK]
+ EXPORT UART0_IRQHandler [WEAK]
+ EXPORT UART1_IRQHandler [WEAK]
+ EXPORT UART2_IRQHandler [WEAK]
+ EXPORT UART3_IRQHandler [WEAK]
+ EXPORT UART4_IRQHandler [WEAK]
+ EXPORT AACI_IRQHandler [WEAK]
+ EXPORT CLCD_IRQHandler [WEAK]
+ EXPORT ENET_IRQHandler [WEAK]
+ EXPORT USBDC_IRQHandler [WEAK]
+ EXPORT USBHC_IRQHandler [WEAK]
+ EXPORT CHLCD_IRQHandler [WEAK]
+ EXPORT FLEXRAY_IRQHandler [WEAK]
+ EXPORT CAN_IRQHandler [WEAK]
+ EXPORT LIN_IRQHandler [WEAK]
+ EXPORT I2C_IRQHandler [WEAK]
+ EXPORT CPU_CLCD_IRQHandler [WEAK]
+ EXPORT SPI_IRQHandler [WEAK]
+
+WDT_IRQHandler
+RTC_IRQHandler
+TIM0_IRQHandler
+TIM2_IRQHandler
+MCIA_IRQHandler
+MCIB_IRQHandler
+UART0_IRQHandler
+UART1_IRQHandler
+UART2_IRQHandler
+UART3_IRQHandler
+UART4_IRQHandler
+AACI_IRQHandler
+CLCD_IRQHandler
+ENET_IRQHandler
+USBDC_IRQHandler
+USBHC_IRQHandler
+CHLCD_IRQHandler
+FLEXRAY_IRQHandler
+CAN_IRQHandler
+LIN_IRQHandler
+I2C_IRQHandler
+CPU_CLCD_IRQHandler
+SPI_IRQHandler
+ B .
+
+ ENDP
+
+
+ ALIGN
+
+
+; User Initial Stack & Heap
+
+ IF :DEF:__MICROLIB
+
+ EXPORT __initial_sp
+ EXPORT __heap_base
+ EXPORT __heap_limit
+
+ ELSE
+
+ IMPORT __use_two_region_memory
+ EXPORT __user_initial_stackheap
+
+__user_initial_stackheap PROC
+ LDR R0, = Heap_Mem
+ LDR R1, =(Stack_Mem + Stack_Size)
+ LDR R2, = (Heap_Mem + Heap_Size)
+ LDR R3, = Stack_Mem
+ BX LR
+ ENDP
+
+ ALIGN
+
+ ENDIF
+
+
+ END
diff --git a/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM4_FP/system_ARMCM4.c b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM4_FP/system_ARMCM4.c
new file mode 100644
index 0000000..070fa1e
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM4_FP/system_ARMCM4.c
@@ -0,0 +1,83 @@
+/**************************************************************************//**
+ * @file system_ARMCM4.c
+ * @brief CMSIS Device System Source File for
+ * ARMCM4 Device Series
+ * @version V5.00
+ * @date 08. April 2016
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined (ARMCM4)
+ #include "ARMCM4.h"
+#elif defined (ARMCM4_FP)
+ #include "ARMCM4_FP.h"
+#else
+ #error device not specified!
+#endif
+
+/*----------------------------------------------------------------------------
+ Define clocks
+ *----------------------------------------------------------------------------*/
+#define XTAL ( 5000000U) /* Oscillator frequency */
+
+#define SYSTEM_CLOCK (5 * XTAL)
+
+
+/*----------------------------------------------------------------------------
+ Externals
+ *----------------------------------------------------------------------------*/
+#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1)
+ extern uint32_t __Vectors;
+#endif
+
+/*----------------------------------------------------------------------------
+ System Core Clock Variable
+ *----------------------------------------------------------------------------*/
+uint32_t SystemCoreClock = SYSTEM_CLOCK;
+
+
+/*----------------------------------------------------------------------------
+ System Core Clock update function
+ *----------------------------------------------------------------------------*/
+void SystemCoreClockUpdate (void)
+{
+ SystemCoreClock = SYSTEM_CLOCK;
+}
+
+/*----------------------------------------------------------------------------
+ System initialization function
+ *----------------------------------------------------------------------------*/
+void SystemInit (void)
+{
+
+#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1)
+ SCB->VTOR = (uint32_t) &__Vectors;
+#endif
+
+#if defined (__FPU_USED) && (__FPU_USED == 1)
+ SCB->CPACR |= ((3U << 10*2) | /* set CP10 Full Access */
+ (3U << 11*2) ); /* set CP11 Full Access */
+#endif
+
+#ifdef UNALIGNED_SUPPORT_DISABLE
+ SCB->CCR |= SCB_CCR_UNALIGN_TRP_Msk;
+#endif
+
+ SystemCoreClock = SYSTEM_CLOCK;
+}
diff --git a/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM7_SP/gcc_arm.ld b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM7_SP/gcc_arm.ld
new file mode 100644
index 0000000..b987fd1
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM7_SP/gcc_arm.ld
@@ -0,0 +1,196 @@
+/* Linker script to configure memory regions. */
+MEMORY
+{
+ FLASH (rx) : ORIGIN = 0x00000000, LENGTH = 512K
+ RAM (rwx) : ORIGIN = 0x20000000, LENGTH = 256K
+}
+
+/* Library configurations */
+GROUP(libgcc.a libc.a libm.a libnosys.a)
+
+/* Linker script to place sections and symbol values. Should be used together
+ * with other linker script that defines memory regions FLASH and RAM.
+ * It references following symbols, which must be defined in code:
+ * Reset_Handler : Entry of reset handler
+ *
+ * It defines following symbols, which code can use without definition:
+ * __exidx_start
+ * __exidx_end
+ * __copy_table_start__
+ * __copy_table_end__
+ * __zero_table_start__
+ * __zero_table_end__
+ * __etext
+ * __data_start__
+ * __preinit_array_start
+ * __preinit_array_end
+ * __init_array_start
+ * __init_array_end
+ * __fini_array_start
+ * __fini_array_end
+ * __data_end__
+ * __bss_start__
+ * __bss_end__
+ * __end__
+ * end
+ * __HeapBase
+ * __HeapLimit
+ * __StackLimit
+ * __StackTop
+ * __stack
+ * __Vectors_End
+ * __Vectors_Size
+ */
+ENTRY(Reset_Handler)
+
+SECTIONS
+{
+ .text :
+ {
+ KEEP(*(.vectors))
+ __Vectors_End = .;
+ __Vectors_Size = __Vectors_End - __Vectors;
+ __end__ = .;
+
+ *(.text*)
+
+ KEEP(*(.init))
+ KEEP(*(.fini))
+
+ /* .ctors */
+ *crtbegin.o(.ctors)
+ *crtbegin?.o(.ctors)
+ *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors)
+ *(SORT(.ctors.*))
+ *(.ctors)
+
+ /* .dtors */
+ *crtbegin.o(.dtors)
+ *crtbegin?.o(.dtors)
+ *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors)
+ *(SORT(.dtors.*))
+ *(.dtors)
+
+ *(.rodata*)
+
+ KEEP(*(.eh_frame*))
+ } > FLASH
+
+ .ARM.extab :
+ {
+ *(.ARM.extab* .gnu.linkonce.armextab.*)
+ } > FLASH
+
+ __exidx_start = .;
+ .ARM.exidx :
+ {
+ *(.ARM.exidx* .gnu.linkonce.armexidx.*)
+ } > FLASH
+ __exidx_end = .;
+
+ /* To copy multiple ROM to RAM sections,
+ * uncomment .copy.table section and,
+ * define __STARTUP_COPY_MULTIPLE in startup_ARMCMx.S */
+ /*
+ .copy.table :
+ {
+ . = ALIGN(4);
+ __copy_table_start__ = .;
+ LONG (__etext)
+ LONG (__data_start__)
+ LONG (__data_end__ - __data_start__)
+ LONG (__etext2)
+ LONG (__data2_start__)
+ LONG (__data2_end__ - __data2_start__)
+ __copy_table_end__ = .;
+ } > FLASH
+ */
+
+ /* To clear multiple BSS sections,
+ * uncomment .zero.table section and,
+ * define __STARTUP_CLEAR_BSS_MULTIPLE in startup_ARMCMx.S */
+ /*
+ .zero.table :
+ {
+ . = ALIGN(4);
+ __zero_table_start__ = .;
+ LONG (__bss_start__)
+ LONG (__bss_end__ - __bss_start__)
+ LONG (__bss2_start__)
+ LONG (__bss2_end__ - __bss2_start__)
+ __zero_table_end__ = .;
+ } > FLASH
+ */
+
+ __etext = .;
+
+ .data : AT (__etext)
+ {
+ __data_start__ = .;
+ *(vtable)
+ *(.data*)
+
+ . = ALIGN(4);
+ /* preinit data */
+ PROVIDE_HIDDEN (__preinit_array_start = .);
+ KEEP(*(.preinit_array))
+ PROVIDE_HIDDEN (__preinit_array_end = .);
+
+ . = ALIGN(4);
+ /* init data */
+ PROVIDE_HIDDEN (__init_array_start = .);
+ KEEP(*(SORT(.init_array.*)))
+ KEEP(*(.init_array))
+ PROVIDE_HIDDEN (__init_array_end = .);
+
+
+ . = ALIGN(4);
+ /* finit data */
+ PROVIDE_HIDDEN (__fini_array_start = .);
+ KEEP(*(SORT(.fini_array.*)))
+ KEEP(*(.fini_array))
+ PROVIDE_HIDDEN (__fini_array_end = .);
+
+ KEEP(*(.jcr*))
+ . = ALIGN(4);
+ /* All data end */
+ __data_end__ = .;
+
+ } > RAM
+
+ .bss :
+ {
+ . = ALIGN(4);
+ __bss_start__ = .;
+ *(.bss*)
+ *(COMMON)
+ . = ALIGN(4);
+ __bss_end__ = .;
+ } > RAM
+
+ .heap (COPY):
+ {
+ __HeapBase = .;
+ __end__ = .;
+ end = __end__;
+ KEEP(*(.heap*))
+ __HeapLimit = .;
+ } > RAM
+
+ /* .stack_dummy section doesn't contains any symbols. It is only
+ * used for linker to calculate size of stack sections, and assign
+ * values to stack symbols later */
+ .stack_dummy (COPY):
+ {
+ KEEP(*(.stack*))
+ } > RAM
+
+ /* Set stack top to end of RAM, and stack limit move down by
+ * size of stack_dummy section */
+ __StackTop = ORIGIN(RAM) + LENGTH(RAM);
+ __StackLimit = __StackTop - SIZEOF(.stack_dummy);
+ PROVIDE(__stack = __StackTop);
+
+ /* Check if data + heap + stack exceeds RAM limit */
+ ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack")
+}
diff --git a/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM7_SP/startup_ARMCM7.c b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM7_SP/startup_ARMCM7.c
new file mode 100644
index 0000000..26edb9f
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM7_SP/startup_ARMCM7.c
@@ -0,0 +1,295 @@
+/**************************************************************************//**
+ * @file startup_ARMCM7.s
+ * @brief CMSIS Core Device Startup File for
+ * ARMCM7 Device Series
+ * @version V5.00
+ * @date 26. April 2016
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+
+/*----------------------------------------------------------------------------
+ Linker generated Symbols
+ *----------------------------------------------------------------------------*/
+extern uint32_t __etext;
+extern uint32_t __data_start__;
+extern uint32_t __data_end__;
+extern uint32_t __copy_table_start__;
+extern uint32_t __copy_table_end__;
+extern uint32_t __zero_table_start__;
+extern uint32_t __zero_table_end__;
+extern uint32_t __bss_start__;
+extern uint32_t __bss_end__;
+extern uint32_t __StackTop;
+
+/*----------------------------------------------------------------------------
+ Exception / Interrupt Handler Function Prototype
+ *----------------------------------------------------------------------------*/
+typedef void( *pFunc )( void );
+
+
+/*----------------------------------------------------------------------------
+ External References
+ *----------------------------------------------------------------------------*/
+#ifndef __START
+extern void _start(void) __attribute__((noreturn)); /* PreeMain (C library entry point) */
+#else
+extern int __START(void) __attribute__((noreturn)); /* main entry point */
+#endif
+
+#ifndef __NO_SYSTEM_INIT
+extern void SystemInit (void); /* CMSIS System Initialization */
+#endif
+
+
+/*----------------------------------------------------------------------------
+ Internal References
+ *----------------------------------------------------------------------------*/
+void Default_Handler(void); /* Default empty handler */
+void Reset_Handler(void); /* Reset Handler */
+
+
+/*----------------------------------------------------------------------------
+ User Initial Stack & Heap
+ *----------------------------------------------------------------------------*/
+#ifndef __STACK_SIZE
+ #define __STACK_SIZE 0x00000400
+#endif
+static uint8_t stack[__STACK_SIZE] __attribute__ ((aligned(8), used, section(".stack")));
+
+#ifndef __HEAP_SIZE
+ #define __HEAP_SIZE 0x00000C00
+#endif
+#if __HEAP_SIZE > 0
+static uint8_t heap[__HEAP_SIZE] __attribute__ ((aligned(8), used, section(".heap")));
+#endif
+
+
+/*----------------------------------------------------------------------------
+ Exception / Interrupt Handler
+ *----------------------------------------------------------------------------*/
+/* Cortex-M7 Processor Exceptions */
+void NMI_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+void HardFault_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+void MemManage_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+void BusFault_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+void UsageFault_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+void SVC_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+void DebugMon_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+void PendSV_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+void SysTick_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+
+/* ARMCM7 Specific Interrupts */
+void WDT_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void RTC_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void TIM0_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void TIM2_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void MCIA_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void MCIB_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void UART0_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void UART1_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void UART2_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void UART4_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void AACI_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void CLCD_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void ENET_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void USBDC_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void USBHC_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void CHLCD_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void FLEXRAY_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void CAN_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void LIN_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void I2C_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void CPU_CLCD_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void UART3_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void SPI_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+
+
+/*----------------------------------------------------------------------------
+ Exception / Interrupt Vector table
+ *----------------------------------------------------------------------------*/
+const pFunc __Vectors[] __attribute__ ((section(".vectors"))) = {
+ /* Cortex-M7 Exceptions Handler */
+ (pFunc)((uint32_t)&__StackTop), /* Initial Stack Pointer */
+ Reset_Handler, /* Reset Handler */
+ NMI_Handler, /* NMI Handler */
+ HardFault_Handler, /* Hard Fault Handler */
+ MemManage_Handler, /* MPU Fault Handler */
+ BusFault_Handler, /* Bus Fault Handler */
+ UsageFault_Handler, /* Usage Fault Handler */
+ 0, /* Reserved */
+ 0, /* Reserved */
+ 0, /* Reserved */
+ 0, /* Reserved */
+ SVC_Handler, /* SVCall Handler */
+ DebugMon_Handler, /* Debug Monitor Handler */
+ 0, /* Reserved */
+ PendSV_Handler, /* PendSV Handler */
+ SysTick_Handler, /* SysTick Handler */
+
+ /* External interrupts */
+ WDT_IRQHandler, /* 0: Watchdog Timer */
+ RTC_IRQHandler, /* 1: Real Time Clock */
+ TIM0_IRQHandler, /* 2: Timer0 / Timer1 */
+ TIM2_IRQHandler, /* 3: Timer2 / Timer3 */
+ MCIA_IRQHandler, /* 4: MCIa */
+ MCIB_IRQHandler, /* 5: MCIb */
+ UART0_IRQHandler, /* 6: UART0 - DUT FPGA */
+ UART1_IRQHandler, /* 7: UART1 - DUT FPGA */
+ UART2_IRQHandler, /* 8: UART2 - DUT FPGA */
+ UART4_IRQHandler, /* 9: UART4 - not connected */
+ AACI_IRQHandler, /* 10: AACI / AC97 */
+ CLCD_IRQHandler, /* 11: CLCD Combined Interrupt */
+ ENET_IRQHandler, /* 12: Ethernet */
+ USBDC_IRQHandler, /* 13: USB Device */
+ USBHC_IRQHandler, /* 14: USB Host Controller */
+ CHLCD_IRQHandler, /* 15: Character LCD */
+ FLEXRAY_IRQHandler, /* 16: Flexray */
+ CAN_IRQHandler, /* 17: CAN */
+ LIN_IRQHandler, /* 18: LIN */
+ I2C_IRQHandler, /* 19: I2C ADC/DAC */
+ 0, /* 20: Reserved */
+ 0, /* 21: Reserved */
+ 0, /* 22: Reserved */
+ 0, /* 23: Reserved */
+ 0, /* 24: Reserved */
+ 0, /* 25: Reserved */
+ 0, /* 26: Reserved */
+ 0, /* 27: Reserved */
+ CPU_CLCD_IRQHandler, /* 28: Reserved - CPU FPGA CLCD */
+ 0, /* 29: Reserved - CPU FPGA */
+ UART3_IRQHandler, /* 30: UART3 - CPU FPGA */
+ SPI_IRQHandler /* 31: SPI Touchscreen - CPU FPGA */
+};
+
+
+/*----------------------------------------------------------------------------
+ Reset Handler called on controller reset
+ *----------------------------------------------------------------------------*/
+void Reset_Handler(void) {
+ uint32_t *pSrc, *pDest;
+ uint32_t *pTable __attribute__((unused));
+
+/* Firstly it copies data from read only memory to RAM. There are two schemes
+ * to copy. One can copy more than one sections. Another can only copy
+ * one section. The former scheme needs more instructions and read-only
+ * data to implement than the latter.
+ * Macro __STARTUP_COPY_MULTIPLE is used to choose between two schemes. */
+
+#ifdef __STARTUP_COPY_MULTIPLE
+/* Multiple sections scheme.
+ *
+ * Between symbol address __copy_table_start__ and __copy_table_end__,
+ * there are array of triplets, each of which specify:
+ * offset 0: LMA of start of a section to copy from
+ * offset 4: VMA of start of a section to copy to
+ * offset 8: size of the section to copy. Must be multiply of 4
+ *
+ * All addresses must be aligned to 4 bytes boundary.
+ */
+ pTable = &__copy_table_start__;
+
+ for (; pTable < &__copy_table_end__; pTable = pTable + 3) {
+ pSrc = (uint32_t*)*(pTable + 0);
+ pDest = (uint32_t*)*(pTable + 1);
+ for (; pDest < (uint32_t*)(*(pTable + 1) + *(pTable + 2)) ; ) {
+ *pDest++ = *pSrc++;
+ }
+ }
+#else
+/* Single section scheme.
+ *
+ * The ranges of copy from/to are specified by following symbols
+ * __etext: LMA of start of the section to copy from. Usually end of text
+ * __data_start__: VMA of start of the section to copy to
+ * __data_end__: VMA of end of the section to copy to
+ *
+ * All addresses must be aligned to 4 bytes boundary.
+ */
+ pSrc = &__etext;
+ pDest = &__data_start__;
+
+ for ( ; pDest < &__data_end__ ; ) {
+ *pDest++ = *pSrc++;
+ }
+#endif /*__STARTUP_COPY_MULTIPLE */
+
+/* This part of work usually is done in C library startup code. Otherwise,
+ * define this macro to enable it in this startup.
+ *
+ * There are two schemes too. One can clear multiple BSS sections. Another
+ * can only clear one section. The former is more size expensive than the
+ * latter.
+ *
+ * Define macro __STARTUP_CLEAR_BSS_MULTIPLE to choose the former.
+ * Otherwise efine macro __STARTUP_CLEAR_BSS to choose the later.
+ */
+#ifdef __STARTUP_CLEAR_BSS_MULTIPLE
+/* Multiple sections scheme.
+ *
+ * Between symbol address __copy_table_start__ and __copy_table_end__,
+ * there are array of tuples specifying:
+ * offset 0: Start of a BSS section
+ * offset 4: Size of this BSS section. Must be multiply of 4
+ */
+ pTable = &__zero_table_start__;
+
+ for (; pTable < &__zero_table_end__; pTable = pTable + 2) {
+ pDest = (uint32_t*)*(pTable + 0);
+ for (; pDest < (uint32_t*)(*(pTable + 0) + *(pTable + 1)) ; ) {
+ *pDest++ = 0;
+ }
+ }
+#elif defined (__STARTUP_CLEAR_BSS)
+/* Single BSS section scheme.
+ *
+ * The BSS section is specified by following symbols
+ * __bss_start__: start of the BSS section.
+ * __bss_end__: end of the BSS section.
+ *
+ * Both addresses must be aligned to 4 bytes boundary.
+ */
+ pDest = &__bss_start__;
+
+ for ( ; pDest < &__bss_end__ ; ) {
+ *pDest++ = 0UL;
+ }
+#endif /* __STARTUP_CLEAR_BSS_MULTIPLE || __STARTUP_CLEAR_BSS */
+
+#ifndef __NO_SYSTEM_INIT
+ SystemInit();
+#endif
+
+#ifndef __START
+#define __START _start
+#endif
+ __START();
+
+}
+
+
+/*----------------------------------------------------------------------------
+ Default Handler for Exceptions / Interrupts
+ *----------------------------------------------------------------------------*/
+void Default_Handler(void) {
+
+ while(1);
+}
diff --git a/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM7_SP/startup_ARMCM7.s b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM7_SP/startup_ARMCM7.s
new file mode 100644
index 0000000..b69f038
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM7_SP/startup_ARMCM7.s
@@ -0,0 +1,262 @@
+;/**************************************************************************//**
+; * @file startup_ARMCM7.s
+; * @brief CMSIS Core Device Startup File for
+; * ARMCM7 Device Series
+; * @version V5.00
+; * @date 02. March 2016
+; ******************************************************************************/
+;/*
+; * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+; *
+; * SPDX-License-Identifier: Apache-2.0
+; *
+; * Licensed under the Apache License, Version 2.0 (the License); you may
+; * not use this file except in compliance with the License.
+; * You may obtain a copy of the License at
+; *
+; * www.apache.org/licenses/LICENSE-2.0
+; *
+; * Unless required by applicable law or agreed to in writing, software
+; * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+; * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the License for the specific language governing permissions and
+; * limitations under the License.
+; */
+
+;/*
+;//-------- <<< Use Configuration Wizard in Context Menu >>> ------------------
+;*/
+
+
+; Stack Configuration
+; Stack Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Stack_Size EQU 0x00000400
+
+ AREA STACK, NOINIT, READWRITE, ALIGN=3
+Stack_Mem SPACE Stack_Size
+__initial_sp
+
+
+; Heap Configuration
+; Heap Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Heap_Size EQU 0x00000C00
+
+ AREA HEAP, NOINIT, READWRITE, ALIGN=3
+__heap_base
+Heap_Mem SPACE Heap_Size
+__heap_limit
+
+
+ PRESERVE8
+ THUMB
+
+
+; Vector Table Mapped to Address 0 at Reset
+
+ AREA RESET, DATA, READONLY
+ EXPORT __Vectors
+ EXPORT __Vectors_End
+ EXPORT __Vectors_Size
+
+__Vectors DCD __initial_sp ; Top of Stack
+ DCD Reset_Handler ; Reset Handler
+ DCD NMI_Handler ; NMI Handler
+ DCD HardFault_Handler ; Hard Fault Handler
+ DCD MemManage_Handler ; MPU Fault Handler
+ DCD BusFault_Handler ; Bus Fault Handler
+ DCD UsageFault_Handler ; Usage Fault Handler
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD SVC_Handler ; SVCall Handler
+ DCD DebugMon_Handler ; Debug Monitor Handler
+ DCD 0 ; Reserved
+ DCD PendSV_Handler ; PendSV Handler
+ DCD SysTick_Handler ; SysTick Handler
+
+ ; External Interrupts
+ DCD WDT_IRQHandler ; 0: Watchdog Timer
+ DCD RTC_IRQHandler ; 1: Real Time Clock
+ DCD TIM0_IRQHandler ; 2: Timer0 / Timer1
+ DCD TIM2_IRQHandler ; 3: Timer2 / Timer3
+ DCD MCIA_IRQHandler ; 4: MCIa
+ DCD MCIB_IRQHandler ; 5: MCIb
+ DCD UART0_IRQHandler ; 6: UART0 - DUT FPGA
+ DCD UART1_IRQHandler ; 7: UART1 - DUT FPGA
+ DCD UART2_IRQHandler ; 8: UART2 - DUT FPGA
+ DCD UART4_IRQHandler ; 9: UART4 - not connected
+ DCD AACI_IRQHandler ; 10: AACI / AC97
+ DCD CLCD_IRQHandler ; 11: CLCD Combined Interrupt
+ DCD ENET_IRQHandler ; 12: Ethernet
+ DCD USBDC_IRQHandler ; 13: USB Device
+ DCD USBHC_IRQHandler ; 14: USB Host Controller
+ DCD CHLCD_IRQHandler ; 15: Character LCD
+ DCD FLEXRAY_IRQHandler ; 16: Flexray
+ DCD CAN_IRQHandler ; 17: CAN
+ DCD LIN_IRQHandler ; 18: LIN
+ DCD I2C_IRQHandler ; 19: I2C ADC/DAC
+ DCD 0 ; 20: Reserved
+ DCD 0 ; 21: Reserved
+ DCD 0 ; 22: Reserved
+ DCD 0 ; 23: Reserved
+ DCD 0 ; 24: Reserved
+ DCD 0 ; 25: Reserved
+ DCD 0 ; 26: Reserved
+ DCD 0 ; 27: Reserved
+ DCD CPU_CLCD_IRQHandler ; 28: Reserved - CPU FPGA CLCD
+ DCD 0 ; 29: Reserved - CPU FPGA
+ DCD UART3_IRQHandler ; 30: UART3 - CPU FPGA
+ DCD SPI_IRQHandler ; 31: SPI Touchscreen - CPU FPGA
+__Vectors_End
+
+__Vectors_Size EQU __Vectors_End - __Vectors
+
+ AREA |.text|, CODE, READONLY
+
+
+; Reset Handler
+
+Reset_Handler PROC
+ EXPORT Reset_Handler [WEAK]
+ IMPORT SystemInit
+ IMPORT __main
+ LDR R0, =SystemInit
+ BLX R0
+ LDR R0, =__main
+ BX R0
+ ENDP
+
+
+; Dummy Exception Handlers (infinite loops which can be modified)
+
+NMI_Handler PROC
+ EXPORT NMI_Handler [WEAK]
+ B .
+ ENDP
+HardFault_Handler\
+ PROC
+ EXPORT HardFault_Handler [WEAK]
+ B .
+ ENDP
+MemManage_Handler\
+ PROC
+ EXPORT MemManage_Handler [WEAK]
+ B .
+ ENDP
+BusFault_Handler\
+ PROC
+ EXPORT BusFault_Handler [WEAK]
+ B .
+ ENDP
+UsageFault_Handler\
+ PROC
+ EXPORT UsageFault_Handler [WEAK]
+ B .
+ ENDP
+SVC_Handler PROC
+ EXPORT SVC_Handler [WEAK]
+ B .
+ ENDP
+DebugMon_Handler\
+ PROC
+ EXPORT DebugMon_Handler [WEAK]
+ B .
+ ENDP
+PendSV_Handler PROC
+ EXPORT PendSV_Handler [WEAK]
+ B .
+ ENDP
+SysTick_Handler PROC
+ EXPORT SysTick_Handler [WEAK]
+ B .
+ ENDP
+
+Default_Handler PROC
+
+ EXPORT WDT_IRQHandler [WEAK]
+ EXPORT RTC_IRQHandler [WEAK]
+ EXPORT TIM0_IRQHandler [WEAK]
+ EXPORT TIM2_IRQHandler [WEAK]
+ EXPORT MCIA_IRQHandler [WEAK]
+ EXPORT MCIB_IRQHandler [WEAK]
+ EXPORT UART0_IRQHandler [WEAK]
+ EXPORT UART1_IRQHandler [WEAK]
+ EXPORT UART2_IRQHandler [WEAK]
+ EXPORT UART3_IRQHandler [WEAK]
+ EXPORT UART4_IRQHandler [WEAK]
+ EXPORT AACI_IRQHandler [WEAK]
+ EXPORT CLCD_IRQHandler [WEAK]
+ EXPORT ENET_IRQHandler [WEAK]
+ EXPORT USBDC_IRQHandler [WEAK]
+ EXPORT USBHC_IRQHandler [WEAK]
+ EXPORT CHLCD_IRQHandler [WEAK]
+ EXPORT FLEXRAY_IRQHandler [WEAK]
+ EXPORT CAN_IRQHandler [WEAK]
+ EXPORT LIN_IRQHandler [WEAK]
+ EXPORT I2C_IRQHandler [WEAK]
+ EXPORT CPU_CLCD_IRQHandler [WEAK]
+ EXPORT SPI_IRQHandler [WEAK]
+
+WDT_IRQHandler
+RTC_IRQHandler
+TIM0_IRQHandler
+TIM2_IRQHandler
+MCIA_IRQHandler
+MCIB_IRQHandler
+UART0_IRQHandler
+UART1_IRQHandler
+UART2_IRQHandler
+UART3_IRQHandler
+UART4_IRQHandler
+AACI_IRQHandler
+CLCD_IRQHandler
+ENET_IRQHandler
+USBDC_IRQHandler
+USBHC_IRQHandler
+CHLCD_IRQHandler
+FLEXRAY_IRQHandler
+CAN_IRQHandler
+LIN_IRQHandler
+I2C_IRQHandler
+CPU_CLCD_IRQHandler
+SPI_IRQHandler
+ B .
+
+ ENDP
+
+
+ ALIGN
+
+
+; User Initial Stack & Heap
+
+ IF :DEF:__MICROLIB
+
+ EXPORT __initial_sp
+ EXPORT __heap_base
+ EXPORT __heap_limit
+
+ ELSE
+
+ IMPORT __use_two_region_memory
+ EXPORT __user_initial_stackheap
+
+__user_initial_stackheap PROC
+ LDR R0, = Heap_Mem
+ LDR R1, =(Stack_Mem + Stack_Size)
+ LDR R2, = (Heap_Mem + Heap_Size)
+ LDR R3, = Stack_Mem
+ BX LR
+ ENDP
+
+ ALIGN
+
+ ENDIF
+
+
+ END
diff --git a/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM7_SP/system_ARMCM7.c b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM7_SP/system_ARMCM7.c
new file mode 100644
index 0000000..65bfaca
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/cifar10/RTE/Device/ARMCM7_SP/system_ARMCM7.c
@@ -0,0 +1,85 @@
+/**************************************************************************//**
+ * @file system_ARMCM7.c
+ * @brief CMSIS Device System Source File for
+ * ARMCM7 Device Series
+ * @version V5.00
+ * @date 08. April 2016
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined (ARMCM7)
+ #include "ARMCM7.h"
+#elif defined (ARMCM7_SP)
+ #include "ARMCM7_SP.h"
+#elif defined (ARMCM7_DP)
+ #include "ARMCM7_DP.h"
+#else
+ #error device not specified!
+#endif
+
+/*----------------------------------------------------------------------------
+ Define clocks
+ *----------------------------------------------------------------------------*/
+#define XTAL ( 5000000U) /* Oscillator frequency */
+
+#define SYSTEM_CLOCK (5 * XTAL)
+
+
+/*----------------------------------------------------------------------------
+ Externals
+ *----------------------------------------------------------------------------*/
+#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1)
+ extern uint32_t __Vectors;
+#endif
+
+/*----------------------------------------------------------------------------
+ System Core Clock Variable
+ *----------------------------------------------------------------------------*/
+uint32_t SystemCoreClock = SYSTEM_CLOCK;
+
+
+/*----------------------------------------------------------------------------
+ System Core Clock update function
+ *----------------------------------------------------------------------------*/
+void SystemCoreClockUpdate (void)
+{
+ SystemCoreClock = SYSTEM_CLOCK;
+}
+
+/*----------------------------------------------------------------------------
+ System initialization function
+ *----------------------------------------------------------------------------*/
+void SystemInit (void)
+{
+
+#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1)
+ SCB->VTOR = (uint32_t) &__Vectors;
+#endif
+
+#if defined (__FPU_USED) && (__FPU_USED == 1)
+ SCB->CPACR |= ((3U << 10*2) | /* set CP10 Full Access */
+ (3U << 11*2) ); /* set CP11 Full Access */
+#endif
+
+#ifdef UNALIGNED_SUPPORT_DISABLE
+ SCB->CCR |= SCB_CCR_UNALIGN_TRP_Msk;
+#endif
+
+ SystemCoreClock = SYSTEM_CLOCK;
+}
diff --git a/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM0/startup_ARMCM0.s b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM0/startup_ARMCM0.s
new file mode 100644
index 0000000..9fc447d
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM0/startup_ARMCM0.s
@@ -0,0 +1,242 @@
+;/**************************************************************************//**
+; * @file startup_ARMCM0.s
+; * @brief CMSIS Core Device Startup File for
+; * ARMCM0 Device Series
+; * @version V5.00
+; * @date 02. March 2016
+; ******************************************************************************/
+;/*
+; * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+; *
+; * SPDX-License-Identifier: Apache-2.0
+; *
+; * Licensed under the Apache License, Version 2.0 (the License); you may
+; * not use this file except in compliance with the License.
+; * You may obtain a copy of the License at
+; *
+; * www.apache.org/licenses/LICENSE-2.0
+; *
+; * Unless required by applicable law or agreed to in writing, software
+; * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+; * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the License for the specific language governing permissions and
+; * limitations under the License.
+; */
+
+;/*
+;//-------- <<< Use Configuration Wizard in Context Menu >>> ------------------
+;*/
+
+
+; Stack Configuration
+; Stack Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Stack_Size EQU 0x00000400
+
+ AREA STACK, NOINIT, READWRITE, ALIGN=3
+Stack_Mem SPACE Stack_Size
+__initial_sp
+
+
+; Heap Configuration
+; Heap Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Heap_Size EQU 0x00000C00
+
+ AREA HEAP, NOINIT, READWRITE, ALIGN=3
+__heap_base
+Heap_Mem SPACE Heap_Size
+__heap_limit
+
+
+ PRESERVE8
+ THUMB
+
+
+; Vector Table Mapped to Address 0 at Reset
+
+ AREA RESET, DATA, READONLY
+ EXPORT __Vectors
+ EXPORT __Vectors_End
+ EXPORT __Vectors_Size
+
+__Vectors DCD __initial_sp ; Top of Stack
+ DCD Reset_Handler ; Reset Handler
+ DCD NMI_Handler ; NMI Handler
+ DCD HardFault_Handler ; Hard Fault Handler
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD SVC_Handler ; SVCall Handler
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD PendSV_Handler ; PendSV Handler
+ DCD SysTick_Handler ; SysTick Handler
+
+ ; External Interrupts
+ DCD WDT_IRQHandler ; 0: Watchdog Timer
+ DCD RTC_IRQHandler ; 1: Real Time Clock
+ DCD TIM0_IRQHandler ; 2: Timer0 / Timer1
+ DCD TIM2_IRQHandler ; 3: Timer2 / Timer3
+ DCD MCIA_IRQHandler ; 4: MCIa
+ DCD MCIB_IRQHandler ; 5: MCIb
+ DCD UART0_IRQHandler ; 6: UART0 - DUT FPGA
+ DCD UART1_IRQHandler ; 7: UART1 - DUT FPGA
+ DCD UART2_IRQHandler ; 8: UART2 - DUT FPGA
+ DCD UART4_IRQHandler ; 9: UART4 - not connected
+ DCD AACI_IRQHandler ; 10: AACI / AC97
+ DCD CLCD_IRQHandler ; 11: CLCD Combined Interrupt
+ DCD ENET_IRQHandler ; 12: Ethernet
+ DCD USBDC_IRQHandler ; 13: USB Device
+ DCD USBHC_IRQHandler ; 14: USB Host Controller
+ DCD CHLCD_IRQHandler ; 15: Character LCD
+ DCD FLEXRAY_IRQHandler ; 16: Flexray
+ DCD CAN_IRQHandler ; 17: CAN
+ DCD LIN_IRQHandler ; 18: LIN
+ DCD I2C_IRQHandler ; 19: I2C ADC/DAC
+ DCD 0 ; 20: Reserved
+ DCD 0 ; 21: Reserved
+ DCD 0 ; 22: Reserved
+ DCD 0 ; 23: Reserved
+ DCD 0 ; 24: Reserved
+ DCD 0 ; 25: Reserved
+ DCD 0 ; 26: Reserved
+ DCD 0 ; 27: Reserved
+ DCD CPU_CLCD_IRQHandler ; 28: Reserved - CPU FPGA CLCD
+ DCD 0 ; 29: Reserved - CPU FPGA
+ DCD UART3_IRQHandler ; 30: UART3 - CPU FPGA
+ DCD SPI_IRQHandler ; 31: SPI Touchscreen - CPU FPGA
+__Vectors_End
+
+__Vectors_Size EQU __Vectors_End - __Vectors
+
+ AREA |.text|, CODE, READONLY
+
+
+; Reset Handler
+
+Reset_Handler PROC
+ EXPORT Reset_Handler [WEAK]
+ IMPORT SystemInit
+ IMPORT __main
+ LDR R0, =SystemInit
+ BLX R0
+ LDR R0, =__main
+ BX R0
+ ENDP
+
+
+; Dummy Exception Handlers (infinite loops which can be modified)
+
+NMI_Handler PROC
+ EXPORT NMI_Handler [WEAK]
+ B .
+ ENDP
+HardFault_Handler\
+ PROC
+ EXPORT HardFault_Handler [WEAK]
+ B .
+ ENDP
+SVC_Handler PROC
+ EXPORT SVC_Handler [WEAK]
+ B .
+ ENDP
+PendSV_Handler PROC
+ EXPORT PendSV_Handler [WEAK]
+ B .
+ ENDP
+SysTick_Handler PROC
+ EXPORT SysTick_Handler [WEAK]
+ B .
+ ENDP
+
+Default_Handler PROC
+
+ EXPORT WDT_IRQHandler [WEAK]
+ EXPORT RTC_IRQHandler [WEAK]
+ EXPORT TIM0_IRQHandler [WEAK]
+ EXPORT TIM2_IRQHandler [WEAK]
+ EXPORT MCIA_IRQHandler [WEAK]
+ EXPORT MCIB_IRQHandler [WEAK]
+ EXPORT UART0_IRQHandler [WEAK]
+ EXPORT UART1_IRQHandler [WEAK]
+ EXPORT UART2_IRQHandler [WEAK]
+ EXPORT UART3_IRQHandler [WEAK]
+ EXPORT UART4_IRQHandler [WEAK]
+ EXPORT AACI_IRQHandler [WEAK]
+ EXPORT CLCD_IRQHandler [WEAK]
+ EXPORT ENET_IRQHandler [WEAK]
+ EXPORT USBDC_IRQHandler [WEAK]
+ EXPORT USBHC_IRQHandler [WEAK]
+ EXPORT CHLCD_IRQHandler [WEAK]
+ EXPORT FLEXRAY_IRQHandler [WEAK]
+ EXPORT CAN_IRQHandler [WEAK]
+ EXPORT LIN_IRQHandler [WEAK]
+ EXPORT I2C_IRQHandler [WEAK]
+ EXPORT CPU_CLCD_IRQHandler [WEAK]
+ EXPORT SPI_IRQHandler [WEAK]
+
+WDT_IRQHandler
+RTC_IRQHandler
+TIM0_IRQHandler
+TIM2_IRQHandler
+MCIA_IRQHandler
+MCIB_IRQHandler
+UART0_IRQHandler
+UART1_IRQHandler
+UART2_IRQHandler
+UART3_IRQHandler
+UART4_IRQHandler
+AACI_IRQHandler
+CLCD_IRQHandler
+ENET_IRQHandler
+USBDC_IRQHandler
+USBHC_IRQHandler
+CHLCD_IRQHandler
+FLEXRAY_IRQHandler
+CAN_IRQHandler
+LIN_IRQHandler
+I2C_IRQHandler
+CPU_CLCD_IRQHandler
+SPI_IRQHandler
+ B .
+
+ ENDP
+
+
+ ALIGN
+
+
+; User Initial Stack & Heap
+
+ IF :DEF:__MICROLIB
+
+ EXPORT __initial_sp
+ EXPORT __heap_base
+ EXPORT __heap_limit
+
+ ELSE
+
+ IMPORT __use_two_region_memory
+ EXPORT __user_initial_stackheap
+
+__user_initial_stackheap PROC
+ LDR R0, = Heap_Mem
+ LDR R1, =(Stack_Mem + Stack_Size)
+ LDR R2, = (Heap_Mem + Heap_Size)
+ LDR R3, = Stack_Mem
+ BX LR
+ ENDP
+
+ ALIGN
+
+ ENDIF
+
+
+ END
diff --git a/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM0/system_ARMCM0.c b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM0/system_ARMCM0.c
new file mode 100644
index 0000000..66829ad
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM0/system_ARMCM0.c
@@ -0,0 +1,56 @@
+/**************************************************************************//**
+ * @file system_ARMCM0.c
+ * @brief CMSIS Device System Source File for
+ * ARMCM0 Device Series
+ * @version V5.00
+ * @date 08. April 2016
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ARMCM0.h"
+
+/*----------------------------------------------------------------------------
+ Define clocks
+ *----------------------------------------------------------------------------*/
+#define XTAL ( 5000000U) /* Oscillator frequency */
+
+#define SYSTEM_CLOCK (5 * XTAL)
+
+
+/*----------------------------------------------------------------------------
+ System Core Clock Variable
+ *----------------------------------------------------------------------------*/
+uint32_t SystemCoreClock = SYSTEM_CLOCK;
+
+
+/*----------------------------------------------------------------------------
+ System Core Clock update function
+ *----------------------------------------------------------------------------*/
+void SystemCoreClockUpdate (void)
+{
+ SystemCoreClock = SYSTEM_CLOCK;
+}
+
+/*----------------------------------------------------------------------------
+ System initialization function
+ *----------------------------------------------------------------------------*/
+void SystemInit (void)
+{
+ SystemCoreClock = SYSTEM_CLOCK;
+}
diff --git a/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM3/startup_ARMCM3.s b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM3/startup_ARMCM3.s
new file mode 100644
index 0000000..16e56b0
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM3/startup_ARMCM3.s
@@ -0,0 +1,262 @@
+;/**************************************************************************//**
+; * @file startup_ARMCM3.s
+; * @brief CMSIS Core Device Startup File for
+; * ARMCM3 Device Series
+; * @version V5.00
+; * @date 02. March 2016
+; ******************************************************************************/
+;/*
+; * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+; *
+; * SPDX-License-Identifier: Apache-2.0
+; *
+; * Licensed under the Apache License, Version 2.0 (the License); you may
+; * not use this file except in compliance with the License.
+; * You may obtain a copy of the License at
+; *
+; * www.apache.org/licenses/LICENSE-2.0
+; *
+; * Unless required by applicable law or agreed to in writing, software
+; * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+; * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the License for the specific language governing permissions and
+; * limitations under the License.
+; */
+
+;/*
+;//-------- <<< Use Configuration Wizard in Context Menu >>> ------------------
+;*/
+
+
+; Stack Configuration
+; Stack Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Stack_Size EQU 0x00000400
+
+ AREA STACK, NOINIT, READWRITE, ALIGN=3
+Stack_Mem SPACE Stack_Size
+__initial_sp
+
+
+; Heap Configuration
+; Heap Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Heap_Size EQU 0x00000C00
+
+ AREA HEAP, NOINIT, READWRITE, ALIGN=3
+__heap_base
+Heap_Mem SPACE Heap_Size
+__heap_limit
+
+
+ PRESERVE8
+ THUMB
+
+
+; Vector Table Mapped to Address 0 at Reset
+
+ AREA RESET, DATA, READONLY
+ EXPORT __Vectors
+ EXPORT __Vectors_End
+ EXPORT __Vectors_Size
+
+__Vectors DCD __initial_sp ; Top of Stack
+ DCD Reset_Handler ; Reset Handler
+ DCD NMI_Handler ; NMI Handler
+ DCD HardFault_Handler ; Hard Fault Handler
+ DCD MemManage_Handler ; MPU Fault Handler
+ DCD BusFault_Handler ; Bus Fault Handler
+ DCD UsageFault_Handler ; Usage Fault Handler
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD SVC_Handler ; SVCall Handler
+ DCD DebugMon_Handler ; Debug Monitor Handler
+ DCD 0 ; Reserved
+ DCD PendSV_Handler ; PendSV Handler
+ DCD SysTick_Handler ; SysTick Handler
+
+ ; External Interrupts
+ DCD WDT_IRQHandler ; 0: Watchdog Timer
+ DCD RTC_IRQHandler ; 1: Real Time Clock
+ DCD TIM0_IRQHandler ; 2: Timer0 / Timer1
+ DCD TIM2_IRQHandler ; 3: Timer2 / Timer3
+ DCD MCIA_IRQHandler ; 4: MCIa
+ DCD MCIB_IRQHandler ; 5: MCIb
+ DCD UART0_IRQHandler ; 6: UART0 - DUT FPGA
+ DCD UART1_IRQHandler ; 7: UART1 - DUT FPGA
+ DCD UART2_IRQHandler ; 8: UART2 - DUT FPGA
+ DCD UART4_IRQHandler ; 9: UART4 - not connected
+ DCD AACI_IRQHandler ; 10: AACI / AC97
+ DCD CLCD_IRQHandler ; 11: CLCD Combined Interrupt
+ DCD ENET_IRQHandler ; 12: Ethernet
+ DCD USBDC_IRQHandler ; 13: USB Device
+ DCD USBHC_IRQHandler ; 14: USB Host Controller
+ DCD CHLCD_IRQHandler ; 15: Character LCD
+ DCD FLEXRAY_IRQHandler ; 16: Flexray
+ DCD CAN_IRQHandler ; 17: CAN
+ DCD LIN_IRQHandler ; 18: LIN
+ DCD I2C_IRQHandler ; 19: I2C ADC/DAC
+ DCD 0 ; 20: Reserved
+ DCD 0 ; 21: Reserved
+ DCD 0 ; 22: Reserved
+ DCD 0 ; 23: Reserved
+ DCD 0 ; 24: Reserved
+ DCD 0 ; 25: Reserved
+ DCD 0 ; 26: Reserved
+ DCD 0 ; 27: Reserved
+ DCD CPU_CLCD_IRQHandler ; 28: Reserved - CPU FPGA CLCD
+ DCD 0 ; 29: Reserved - CPU FPGA
+ DCD UART3_IRQHandler ; 30: UART3 - CPU FPGA
+ DCD SPI_IRQHandler ; 31: SPI Touchscreen - CPU FPGA
+__Vectors_End
+
+__Vectors_Size EQU __Vectors_End - __Vectors
+
+ AREA |.text|, CODE, READONLY
+
+
+; Reset Handler
+
+Reset_Handler PROC
+ EXPORT Reset_Handler [WEAK]
+ IMPORT SystemInit
+ IMPORT __main
+ LDR R0, =SystemInit
+ BLX R0
+ LDR R0, =__main
+ BX R0
+ ENDP
+
+
+; Dummy Exception Handlers (infinite loops which can be modified)
+
+NMI_Handler PROC
+ EXPORT NMI_Handler [WEAK]
+ B .
+ ENDP
+HardFault_Handler\
+ PROC
+ EXPORT HardFault_Handler [WEAK]
+ B .
+ ENDP
+MemManage_Handler\
+ PROC
+ EXPORT MemManage_Handler [WEAK]
+ B .
+ ENDP
+BusFault_Handler\
+ PROC
+ EXPORT BusFault_Handler [WEAK]
+ B .
+ ENDP
+UsageFault_Handler\
+ PROC
+ EXPORT UsageFault_Handler [WEAK]
+ B .
+ ENDP
+SVC_Handler PROC
+ EXPORT SVC_Handler [WEAK]
+ B .
+ ENDP
+DebugMon_Handler\
+ PROC
+ EXPORT DebugMon_Handler [WEAK]
+ B .
+ ENDP
+PendSV_Handler PROC
+ EXPORT PendSV_Handler [WEAK]
+ B .
+ ENDP
+SysTick_Handler PROC
+ EXPORT SysTick_Handler [WEAK]
+ B .
+ ENDP
+
+Default_Handler PROC
+
+ EXPORT WDT_IRQHandler [WEAK]
+ EXPORT RTC_IRQHandler [WEAK]
+ EXPORT TIM0_IRQHandler [WEAK]
+ EXPORT TIM2_IRQHandler [WEAK]
+ EXPORT MCIA_IRQHandler [WEAK]
+ EXPORT MCIB_IRQHandler [WEAK]
+ EXPORT UART0_IRQHandler [WEAK]
+ EXPORT UART1_IRQHandler [WEAK]
+ EXPORT UART2_IRQHandler [WEAK]
+ EXPORT UART3_IRQHandler [WEAK]
+ EXPORT UART4_IRQHandler [WEAK]
+ EXPORT AACI_IRQHandler [WEAK]
+ EXPORT CLCD_IRQHandler [WEAK]
+ EXPORT ENET_IRQHandler [WEAK]
+ EXPORT USBDC_IRQHandler [WEAK]
+ EXPORT USBHC_IRQHandler [WEAK]
+ EXPORT CHLCD_IRQHandler [WEAK]
+ EXPORT FLEXRAY_IRQHandler [WEAK]
+ EXPORT CAN_IRQHandler [WEAK]
+ EXPORT LIN_IRQHandler [WEAK]
+ EXPORT I2C_IRQHandler [WEAK]
+ EXPORT CPU_CLCD_IRQHandler [WEAK]
+ EXPORT SPI_IRQHandler [WEAK]
+
+WDT_IRQHandler
+RTC_IRQHandler
+TIM0_IRQHandler
+TIM2_IRQHandler
+MCIA_IRQHandler
+MCIB_IRQHandler
+UART0_IRQHandler
+UART1_IRQHandler
+UART2_IRQHandler
+UART3_IRQHandler
+UART4_IRQHandler
+AACI_IRQHandler
+CLCD_IRQHandler
+ENET_IRQHandler
+USBDC_IRQHandler
+USBHC_IRQHandler
+CHLCD_IRQHandler
+FLEXRAY_IRQHandler
+CAN_IRQHandler
+LIN_IRQHandler
+I2C_IRQHandler
+CPU_CLCD_IRQHandler
+SPI_IRQHandler
+ B .
+
+ ENDP
+
+
+ ALIGN
+
+
+; User Initial Stack & Heap
+
+ IF :DEF:__MICROLIB
+
+ EXPORT __initial_sp
+ EXPORT __heap_base
+ EXPORT __heap_limit
+
+ ELSE
+
+ IMPORT __use_two_region_memory
+ EXPORT __user_initial_stackheap
+
+__user_initial_stackheap PROC
+ LDR R0, = Heap_Mem
+ LDR R1, =(Stack_Mem + Stack_Size)
+ LDR R2, = (Heap_Mem + Heap_Size)
+ LDR R3, = Stack_Mem
+ BX LR
+ ENDP
+
+ ALIGN
+
+ ENDIF
+
+
+ END
diff --git a/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM3/system_ARMCM3.c b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM3/system_ARMCM3.c
new file mode 100644
index 0000000..f08df7a
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM3/system_ARMCM3.c
@@ -0,0 +1,68 @@
+/**************************************************************************//**
+ * @file system_ARMCM3.c
+ * @brief CMSIS Device System Source File for
+ * ARMCM3 Device Series
+ * @version V5.00
+ * @date 08. April 2016
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ARMCM3.h"
+
+/*----------------------------------------------------------------------------
+ Define clocks
+ *----------------------------------------------------------------------------*/
+#define XTAL ( 5000000U) /* Oscillator frequency */
+
+#define SYSTEM_CLOCK (5 * XTAL)
+
+
+/*----------------------------------------------------------------------------
+ Externals
+ *----------------------------------------------------------------------------*/
+#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1)
+ extern uint32_t __Vectors;
+#endif
+
+/*----------------------------------------------------------------------------
+ System Core Clock Variable
+ *----------------------------------------------------------------------------*/
+uint32_t SystemCoreClock = SYSTEM_CLOCK;
+
+
+/*----------------------------------------------------------------------------
+ System Core Clock update function
+ *----------------------------------------------------------------------------*/
+void SystemCoreClockUpdate (void)
+{
+ SystemCoreClock = SYSTEM_CLOCK;
+}
+
+/*----------------------------------------------------------------------------
+ System initialization function
+ *----------------------------------------------------------------------------*/
+void SystemInit (void)
+{
+
+#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1)
+ SCB->VTOR = (uint32_t) &__Vectors;
+#endif
+
+ SystemCoreClock = SYSTEM_CLOCK;
+}
diff --git a/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM4_FP/startup_ARMCM4.s b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM4_FP/startup_ARMCM4.s
new file mode 100644
index 0000000..dae6439
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM4_FP/startup_ARMCM4.s
@@ -0,0 +1,262 @@
+;/**************************************************************************//**
+; * @file startup_ARMCM4.s
+; * @brief CMSIS Core Device Startup File for
+; * ARMCM4 Device Series
+; * @version V5.00
+; * @date 02. March 2016
+; ******************************************************************************/
+;/*
+; * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+; *
+; * SPDX-License-Identifier: Apache-2.0
+; *
+; * Licensed under the Apache License, Version 2.0 (the License); you may
+; * not use this file except in compliance with the License.
+; * You may obtain a copy of the License at
+; *
+; * www.apache.org/licenses/LICENSE-2.0
+; *
+; * Unless required by applicable law or agreed to in writing, software
+; * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+; * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the License for the specific language governing permissions and
+; * limitations under the License.
+; */
+
+;/*
+;//-------- <<< Use Configuration Wizard in Context Menu >>> ------------------
+;*/
+
+
+; Stack Configuration
+; Stack Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Stack_Size EQU 0x00000400
+
+ AREA STACK, NOINIT, READWRITE, ALIGN=3
+Stack_Mem SPACE Stack_Size
+__initial_sp
+
+
+; Heap Configuration
+; Heap Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Heap_Size EQU 0x00000C00
+
+ AREA HEAP, NOINIT, READWRITE, ALIGN=3
+__heap_base
+Heap_Mem SPACE Heap_Size
+__heap_limit
+
+
+ PRESERVE8
+ THUMB
+
+
+; Vector Table Mapped to Address 0 at Reset
+
+ AREA RESET, DATA, READONLY
+ EXPORT __Vectors
+ EXPORT __Vectors_End
+ EXPORT __Vectors_Size
+
+__Vectors DCD __initial_sp ; Top of Stack
+ DCD Reset_Handler ; Reset Handler
+ DCD NMI_Handler ; NMI Handler
+ DCD HardFault_Handler ; Hard Fault Handler
+ DCD MemManage_Handler ; MPU Fault Handler
+ DCD BusFault_Handler ; Bus Fault Handler
+ DCD UsageFault_Handler ; Usage Fault Handler
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD SVC_Handler ; SVCall Handler
+ DCD DebugMon_Handler ; Debug Monitor Handler
+ DCD 0 ; Reserved
+ DCD PendSV_Handler ; PendSV Handler
+ DCD SysTick_Handler ; SysTick Handler
+
+ ; External Interrupts
+ DCD WDT_IRQHandler ; 0: Watchdog Timer
+ DCD RTC_IRQHandler ; 1: Real Time Clock
+ DCD TIM0_IRQHandler ; 2: Timer0 / Timer1
+ DCD TIM2_IRQHandler ; 3: Timer2 / Timer3
+ DCD MCIA_IRQHandler ; 4: MCIa
+ DCD MCIB_IRQHandler ; 5: MCIb
+ DCD UART0_IRQHandler ; 6: UART0 - DUT FPGA
+ DCD UART1_IRQHandler ; 7: UART1 - DUT FPGA
+ DCD UART2_IRQHandler ; 8: UART2 - DUT FPGA
+ DCD UART4_IRQHandler ; 9: UART4 - not connected
+ DCD AACI_IRQHandler ; 10: AACI / AC97
+ DCD CLCD_IRQHandler ; 11: CLCD Combined Interrupt
+ DCD ENET_IRQHandler ; 12: Ethernet
+ DCD USBDC_IRQHandler ; 13: USB Device
+ DCD USBHC_IRQHandler ; 14: USB Host Controller
+ DCD CHLCD_IRQHandler ; 15: Character LCD
+ DCD FLEXRAY_IRQHandler ; 16: Flexray
+ DCD CAN_IRQHandler ; 17: CAN
+ DCD LIN_IRQHandler ; 18: LIN
+ DCD I2C_IRQHandler ; 19: I2C ADC/DAC
+ DCD 0 ; 20: Reserved
+ DCD 0 ; 21: Reserved
+ DCD 0 ; 22: Reserved
+ DCD 0 ; 23: Reserved
+ DCD 0 ; 24: Reserved
+ DCD 0 ; 25: Reserved
+ DCD 0 ; 26: Reserved
+ DCD 0 ; 27: Reserved
+ DCD CPU_CLCD_IRQHandler ; 28: Reserved - CPU FPGA CLCD
+ DCD 0 ; 29: Reserved - CPU FPGA
+ DCD UART3_IRQHandler ; 30: UART3 - CPU FPGA
+ DCD SPI_IRQHandler ; 31: SPI Touchscreen - CPU FPGA
+__Vectors_End
+
+__Vectors_Size EQU __Vectors_End - __Vectors
+
+ AREA |.text|, CODE, READONLY
+
+
+; Reset Handler
+
+Reset_Handler PROC
+ EXPORT Reset_Handler [WEAK]
+ IMPORT SystemInit
+ IMPORT __main
+ LDR R0, =SystemInit
+ BLX R0
+ LDR R0, =__main
+ BX R0
+ ENDP
+
+
+; Dummy Exception Handlers (infinite loops which can be modified)
+
+NMI_Handler PROC
+ EXPORT NMI_Handler [WEAK]
+ B .
+ ENDP
+HardFault_Handler\
+ PROC
+ EXPORT HardFault_Handler [WEAK]
+ B .
+ ENDP
+MemManage_Handler\
+ PROC
+ EXPORT MemManage_Handler [WEAK]
+ B .
+ ENDP
+BusFault_Handler\
+ PROC
+ EXPORT BusFault_Handler [WEAK]
+ B .
+ ENDP
+UsageFault_Handler\
+ PROC
+ EXPORT UsageFault_Handler [WEAK]
+ B .
+ ENDP
+SVC_Handler PROC
+ EXPORT SVC_Handler [WEAK]
+ B .
+ ENDP
+DebugMon_Handler\
+ PROC
+ EXPORT DebugMon_Handler [WEAK]
+ B .
+ ENDP
+PendSV_Handler PROC
+ EXPORT PendSV_Handler [WEAK]
+ B .
+ ENDP
+SysTick_Handler PROC
+ EXPORT SysTick_Handler [WEAK]
+ B .
+ ENDP
+
+Default_Handler PROC
+
+ EXPORT WDT_IRQHandler [WEAK]
+ EXPORT RTC_IRQHandler [WEAK]
+ EXPORT TIM0_IRQHandler [WEAK]
+ EXPORT TIM2_IRQHandler [WEAK]
+ EXPORT MCIA_IRQHandler [WEAK]
+ EXPORT MCIB_IRQHandler [WEAK]
+ EXPORT UART0_IRQHandler [WEAK]
+ EXPORT UART1_IRQHandler [WEAK]
+ EXPORT UART2_IRQHandler [WEAK]
+ EXPORT UART3_IRQHandler [WEAK]
+ EXPORT UART4_IRQHandler [WEAK]
+ EXPORT AACI_IRQHandler [WEAK]
+ EXPORT CLCD_IRQHandler [WEAK]
+ EXPORT ENET_IRQHandler [WEAK]
+ EXPORT USBDC_IRQHandler [WEAK]
+ EXPORT USBHC_IRQHandler [WEAK]
+ EXPORT CHLCD_IRQHandler [WEAK]
+ EXPORT FLEXRAY_IRQHandler [WEAK]
+ EXPORT CAN_IRQHandler [WEAK]
+ EXPORT LIN_IRQHandler [WEAK]
+ EXPORT I2C_IRQHandler [WEAK]
+ EXPORT CPU_CLCD_IRQHandler [WEAK]
+ EXPORT SPI_IRQHandler [WEAK]
+
+WDT_IRQHandler
+RTC_IRQHandler
+TIM0_IRQHandler
+TIM2_IRQHandler
+MCIA_IRQHandler
+MCIB_IRQHandler
+UART0_IRQHandler
+UART1_IRQHandler
+UART2_IRQHandler
+UART3_IRQHandler
+UART4_IRQHandler
+AACI_IRQHandler
+CLCD_IRQHandler
+ENET_IRQHandler
+USBDC_IRQHandler
+USBHC_IRQHandler
+CHLCD_IRQHandler
+FLEXRAY_IRQHandler
+CAN_IRQHandler
+LIN_IRQHandler
+I2C_IRQHandler
+CPU_CLCD_IRQHandler
+SPI_IRQHandler
+ B .
+
+ ENDP
+
+
+ ALIGN
+
+
+; User Initial Stack & Heap
+
+ IF :DEF:__MICROLIB
+
+ EXPORT __initial_sp
+ EXPORT __heap_base
+ EXPORT __heap_limit
+
+ ELSE
+
+ IMPORT __use_two_region_memory
+ EXPORT __user_initial_stackheap
+
+__user_initial_stackheap PROC
+ LDR R0, = Heap_Mem
+ LDR R1, =(Stack_Mem + Stack_Size)
+ LDR R2, = (Heap_Mem + Heap_Size)
+ LDR R3, = Stack_Mem
+ BX LR
+ ENDP
+
+ ALIGN
+
+ ENDIF
+
+
+ END
diff --git a/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM4_FP/system_ARMCM4.c b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM4_FP/system_ARMCM4.c
new file mode 100644
index 0000000..070fa1e
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM4_FP/system_ARMCM4.c
@@ -0,0 +1,83 @@
+/**************************************************************************//**
+ * @file system_ARMCM4.c
+ * @brief CMSIS Device System Source File for
+ * ARMCM4 Device Series
+ * @version V5.00
+ * @date 08. April 2016
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined (ARMCM4)
+ #include "ARMCM4.h"
+#elif defined (ARMCM4_FP)
+ #include "ARMCM4_FP.h"
+#else
+ #error device not specified!
+#endif
+
+/*----------------------------------------------------------------------------
+ Define clocks
+ *----------------------------------------------------------------------------*/
+#define XTAL ( 5000000U) /* Oscillator frequency */
+
+#define SYSTEM_CLOCK (5 * XTAL)
+
+
+/*----------------------------------------------------------------------------
+ Externals
+ *----------------------------------------------------------------------------*/
+#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1)
+ extern uint32_t __Vectors;
+#endif
+
+/*----------------------------------------------------------------------------
+ System Core Clock Variable
+ *----------------------------------------------------------------------------*/
+uint32_t SystemCoreClock = SYSTEM_CLOCK;
+
+
+/*----------------------------------------------------------------------------
+ System Core Clock update function
+ *----------------------------------------------------------------------------*/
+void SystemCoreClockUpdate (void)
+{
+ SystemCoreClock = SYSTEM_CLOCK;
+}
+
+/*----------------------------------------------------------------------------
+ System initialization function
+ *----------------------------------------------------------------------------*/
+void SystemInit (void)
+{
+
+#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1)
+ SCB->VTOR = (uint32_t) &__Vectors;
+#endif
+
+#if defined (__FPU_USED) && (__FPU_USED == 1)
+ SCB->CPACR |= ((3U << 10*2) | /* set CP10 Full Access */
+ (3U << 11*2) ); /* set CP11 Full Access */
+#endif
+
+#ifdef UNALIGNED_SUPPORT_DISABLE
+ SCB->CCR |= SCB_CCR_UNALIGN_TRP_Msk;
+#endif
+
+ SystemCoreClock = SYSTEM_CLOCK;
+}
diff --git a/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM7_SP/gcc_arm.ld b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM7_SP/gcc_arm.ld
new file mode 100644
index 0000000..b987fd1
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM7_SP/gcc_arm.ld
@@ -0,0 +1,196 @@
+/* Linker script to configure memory regions. */
+MEMORY
+{
+ FLASH (rx) : ORIGIN = 0x00000000, LENGTH = 512K
+ RAM (rwx) : ORIGIN = 0x20000000, LENGTH = 256K
+}
+
+/* Library configurations */
+GROUP(libgcc.a libc.a libm.a libnosys.a)
+
+/* Linker script to place sections and symbol values. Should be used together
+ * with other linker script that defines memory regions FLASH and RAM.
+ * It references following symbols, which must be defined in code:
+ * Reset_Handler : Entry of reset handler
+ *
+ * It defines following symbols, which code can use without definition:
+ * __exidx_start
+ * __exidx_end
+ * __copy_table_start__
+ * __copy_table_end__
+ * __zero_table_start__
+ * __zero_table_end__
+ * __etext
+ * __data_start__
+ * __preinit_array_start
+ * __preinit_array_end
+ * __init_array_start
+ * __init_array_end
+ * __fini_array_start
+ * __fini_array_end
+ * __data_end__
+ * __bss_start__
+ * __bss_end__
+ * __end__
+ * end
+ * __HeapBase
+ * __HeapLimit
+ * __StackLimit
+ * __StackTop
+ * __stack
+ * __Vectors_End
+ * __Vectors_Size
+ */
+ENTRY(Reset_Handler)
+
+SECTIONS
+{
+ .text :
+ {
+ KEEP(*(.vectors))
+ __Vectors_End = .;
+ __Vectors_Size = __Vectors_End - __Vectors;
+ __end__ = .;
+
+ *(.text*)
+
+ KEEP(*(.init))
+ KEEP(*(.fini))
+
+ /* .ctors */
+ *crtbegin.o(.ctors)
+ *crtbegin?.o(.ctors)
+ *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors)
+ *(SORT(.ctors.*))
+ *(.ctors)
+
+ /* .dtors */
+ *crtbegin.o(.dtors)
+ *crtbegin?.o(.dtors)
+ *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors)
+ *(SORT(.dtors.*))
+ *(.dtors)
+
+ *(.rodata*)
+
+ KEEP(*(.eh_frame*))
+ } > FLASH
+
+ .ARM.extab :
+ {
+ *(.ARM.extab* .gnu.linkonce.armextab.*)
+ } > FLASH
+
+ __exidx_start = .;
+ .ARM.exidx :
+ {
+ *(.ARM.exidx* .gnu.linkonce.armexidx.*)
+ } > FLASH
+ __exidx_end = .;
+
+ /* To copy multiple ROM to RAM sections,
+ * uncomment .copy.table section and,
+ * define __STARTUP_COPY_MULTIPLE in startup_ARMCMx.S */
+ /*
+ .copy.table :
+ {
+ . = ALIGN(4);
+ __copy_table_start__ = .;
+ LONG (__etext)
+ LONG (__data_start__)
+ LONG (__data_end__ - __data_start__)
+ LONG (__etext2)
+ LONG (__data2_start__)
+ LONG (__data2_end__ - __data2_start__)
+ __copy_table_end__ = .;
+ } > FLASH
+ */
+
+ /* To clear multiple BSS sections,
+ * uncomment .zero.table section and,
+ * define __STARTUP_CLEAR_BSS_MULTIPLE in startup_ARMCMx.S */
+ /*
+ .zero.table :
+ {
+ . = ALIGN(4);
+ __zero_table_start__ = .;
+ LONG (__bss_start__)
+ LONG (__bss_end__ - __bss_start__)
+ LONG (__bss2_start__)
+ LONG (__bss2_end__ - __bss2_start__)
+ __zero_table_end__ = .;
+ } > FLASH
+ */
+
+ __etext = .;
+
+ .data : AT (__etext)
+ {
+ __data_start__ = .;
+ *(vtable)
+ *(.data*)
+
+ . = ALIGN(4);
+ /* preinit data */
+ PROVIDE_HIDDEN (__preinit_array_start = .);
+ KEEP(*(.preinit_array))
+ PROVIDE_HIDDEN (__preinit_array_end = .);
+
+ . = ALIGN(4);
+ /* init data */
+ PROVIDE_HIDDEN (__init_array_start = .);
+ KEEP(*(SORT(.init_array.*)))
+ KEEP(*(.init_array))
+ PROVIDE_HIDDEN (__init_array_end = .);
+
+
+ . = ALIGN(4);
+ /* finit data */
+ PROVIDE_HIDDEN (__fini_array_start = .);
+ KEEP(*(SORT(.fini_array.*)))
+ KEEP(*(.fini_array))
+ PROVIDE_HIDDEN (__fini_array_end = .);
+
+ KEEP(*(.jcr*))
+ . = ALIGN(4);
+ /* All data end */
+ __data_end__ = .;
+
+ } > RAM
+
+ .bss :
+ {
+ . = ALIGN(4);
+ __bss_start__ = .;
+ *(.bss*)
+ *(COMMON)
+ . = ALIGN(4);
+ __bss_end__ = .;
+ } > RAM
+
+ .heap (COPY):
+ {
+ __HeapBase = .;
+ __end__ = .;
+ end = __end__;
+ KEEP(*(.heap*))
+ __HeapLimit = .;
+ } > RAM
+
+ /* .stack_dummy section doesn't contains any symbols. It is only
+ * used for linker to calculate size of stack sections, and assign
+ * values to stack symbols later */
+ .stack_dummy (COPY):
+ {
+ KEEP(*(.stack*))
+ } > RAM
+
+ /* Set stack top to end of RAM, and stack limit move down by
+ * size of stack_dummy section */
+ __StackTop = ORIGIN(RAM) + LENGTH(RAM);
+ __StackLimit = __StackTop - SIZEOF(.stack_dummy);
+ PROVIDE(__stack = __StackTop);
+
+ /* Check if data + heap + stack exceeds RAM limit */
+ ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack")
+}
diff --git a/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM7_SP/startup_ARMCM7.c b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM7_SP/startup_ARMCM7.c
new file mode 100644
index 0000000..26edb9f
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM7_SP/startup_ARMCM7.c
@@ -0,0 +1,295 @@
+/**************************************************************************//**
+ * @file startup_ARMCM7.s
+ * @brief CMSIS Core Device Startup File for
+ * ARMCM7 Device Series
+ * @version V5.00
+ * @date 26. April 2016
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+
+/*----------------------------------------------------------------------------
+ Linker generated Symbols
+ *----------------------------------------------------------------------------*/
+extern uint32_t __etext;
+extern uint32_t __data_start__;
+extern uint32_t __data_end__;
+extern uint32_t __copy_table_start__;
+extern uint32_t __copy_table_end__;
+extern uint32_t __zero_table_start__;
+extern uint32_t __zero_table_end__;
+extern uint32_t __bss_start__;
+extern uint32_t __bss_end__;
+extern uint32_t __StackTop;
+
+/*----------------------------------------------------------------------------
+ Exception / Interrupt Handler Function Prototype
+ *----------------------------------------------------------------------------*/
+typedef void( *pFunc )( void );
+
+
+/*----------------------------------------------------------------------------
+ External References
+ *----------------------------------------------------------------------------*/
+#ifndef __START
+extern void _start(void) __attribute__((noreturn)); /* PreeMain (C library entry point) */
+#else
+extern int __START(void) __attribute__((noreturn)); /* main entry point */
+#endif
+
+#ifndef __NO_SYSTEM_INIT
+extern void SystemInit (void); /* CMSIS System Initialization */
+#endif
+
+
+/*----------------------------------------------------------------------------
+ Internal References
+ *----------------------------------------------------------------------------*/
+void Default_Handler(void); /* Default empty handler */
+void Reset_Handler(void); /* Reset Handler */
+
+
+/*----------------------------------------------------------------------------
+ User Initial Stack & Heap
+ *----------------------------------------------------------------------------*/
+#ifndef __STACK_SIZE
+ #define __STACK_SIZE 0x00000400
+#endif
+static uint8_t stack[__STACK_SIZE] __attribute__ ((aligned(8), used, section(".stack")));
+
+#ifndef __HEAP_SIZE
+ #define __HEAP_SIZE 0x00000C00
+#endif
+#if __HEAP_SIZE > 0
+static uint8_t heap[__HEAP_SIZE] __attribute__ ((aligned(8), used, section(".heap")));
+#endif
+
+
+/*----------------------------------------------------------------------------
+ Exception / Interrupt Handler
+ *----------------------------------------------------------------------------*/
+/* Cortex-M7 Processor Exceptions */
+void NMI_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+void HardFault_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+void MemManage_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+void BusFault_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+void UsageFault_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+void SVC_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+void DebugMon_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+void PendSV_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+void SysTick_Handler (void) __attribute__ ((weak, alias("Default_Handler")));
+
+/* ARMCM7 Specific Interrupts */
+void WDT_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void RTC_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void TIM0_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void TIM2_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void MCIA_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void MCIB_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void UART0_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void UART1_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void UART2_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void UART4_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void AACI_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void CLCD_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void ENET_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void USBDC_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void USBHC_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void CHLCD_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void FLEXRAY_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void CAN_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void LIN_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void I2C_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void CPU_CLCD_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void UART3_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+void SPI_IRQHandler (void) __attribute__ ((weak, alias("Default_Handler")));
+
+
+/*----------------------------------------------------------------------------
+ Exception / Interrupt Vector table
+ *----------------------------------------------------------------------------*/
+const pFunc __Vectors[] __attribute__ ((section(".vectors"))) = {
+ /* Cortex-M7 Exceptions Handler */
+ (pFunc)((uint32_t)&__StackTop), /* Initial Stack Pointer */
+ Reset_Handler, /* Reset Handler */
+ NMI_Handler, /* NMI Handler */
+ HardFault_Handler, /* Hard Fault Handler */
+ MemManage_Handler, /* MPU Fault Handler */
+ BusFault_Handler, /* Bus Fault Handler */
+ UsageFault_Handler, /* Usage Fault Handler */
+ 0, /* Reserved */
+ 0, /* Reserved */
+ 0, /* Reserved */
+ 0, /* Reserved */
+ SVC_Handler, /* SVCall Handler */
+ DebugMon_Handler, /* Debug Monitor Handler */
+ 0, /* Reserved */
+ PendSV_Handler, /* PendSV Handler */
+ SysTick_Handler, /* SysTick Handler */
+
+ /* External interrupts */
+ WDT_IRQHandler, /* 0: Watchdog Timer */
+ RTC_IRQHandler, /* 1: Real Time Clock */
+ TIM0_IRQHandler, /* 2: Timer0 / Timer1 */
+ TIM2_IRQHandler, /* 3: Timer2 / Timer3 */
+ MCIA_IRQHandler, /* 4: MCIa */
+ MCIB_IRQHandler, /* 5: MCIb */
+ UART0_IRQHandler, /* 6: UART0 - DUT FPGA */
+ UART1_IRQHandler, /* 7: UART1 - DUT FPGA */
+ UART2_IRQHandler, /* 8: UART2 - DUT FPGA */
+ UART4_IRQHandler, /* 9: UART4 - not connected */
+ AACI_IRQHandler, /* 10: AACI / AC97 */
+ CLCD_IRQHandler, /* 11: CLCD Combined Interrupt */
+ ENET_IRQHandler, /* 12: Ethernet */
+ USBDC_IRQHandler, /* 13: USB Device */
+ USBHC_IRQHandler, /* 14: USB Host Controller */
+ CHLCD_IRQHandler, /* 15: Character LCD */
+ FLEXRAY_IRQHandler, /* 16: Flexray */
+ CAN_IRQHandler, /* 17: CAN */
+ LIN_IRQHandler, /* 18: LIN */
+ I2C_IRQHandler, /* 19: I2C ADC/DAC */
+ 0, /* 20: Reserved */
+ 0, /* 21: Reserved */
+ 0, /* 22: Reserved */
+ 0, /* 23: Reserved */
+ 0, /* 24: Reserved */
+ 0, /* 25: Reserved */
+ 0, /* 26: Reserved */
+ 0, /* 27: Reserved */
+ CPU_CLCD_IRQHandler, /* 28: Reserved - CPU FPGA CLCD */
+ 0, /* 29: Reserved - CPU FPGA */
+ UART3_IRQHandler, /* 30: UART3 - CPU FPGA */
+ SPI_IRQHandler /* 31: SPI Touchscreen - CPU FPGA */
+};
+
+
+/*----------------------------------------------------------------------------
+ Reset Handler called on controller reset
+ *----------------------------------------------------------------------------*/
+void Reset_Handler(void) {
+ uint32_t *pSrc, *pDest;
+ uint32_t *pTable __attribute__((unused));
+
+/* Firstly it copies data from read only memory to RAM. There are two schemes
+ * to copy. One can copy more than one sections. Another can only copy
+ * one section. The former scheme needs more instructions and read-only
+ * data to implement than the latter.
+ * Macro __STARTUP_COPY_MULTIPLE is used to choose between two schemes. */
+
+#ifdef __STARTUP_COPY_MULTIPLE
+/* Multiple sections scheme.
+ *
+ * Between symbol address __copy_table_start__ and __copy_table_end__,
+ * there are array of triplets, each of which specify:
+ * offset 0: LMA of start of a section to copy from
+ * offset 4: VMA of start of a section to copy to
+ * offset 8: size of the section to copy. Must be multiply of 4
+ *
+ * All addresses must be aligned to 4 bytes boundary.
+ */
+ pTable = &__copy_table_start__;
+
+ for (; pTable < &__copy_table_end__; pTable = pTable + 3) {
+ pSrc = (uint32_t*)*(pTable + 0);
+ pDest = (uint32_t*)*(pTable + 1);
+ for (; pDest < (uint32_t*)(*(pTable + 1) + *(pTable + 2)) ; ) {
+ *pDest++ = *pSrc++;
+ }
+ }
+#else
+/* Single section scheme.
+ *
+ * The ranges of copy from/to are specified by following symbols
+ * __etext: LMA of start of the section to copy from. Usually end of text
+ * __data_start__: VMA of start of the section to copy to
+ * __data_end__: VMA of end of the section to copy to
+ *
+ * All addresses must be aligned to 4 bytes boundary.
+ */
+ pSrc = &__etext;
+ pDest = &__data_start__;
+
+ for ( ; pDest < &__data_end__ ; ) {
+ *pDest++ = *pSrc++;
+ }
+#endif /*__STARTUP_COPY_MULTIPLE */
+
+/* This part of work usually is done in C library startup code. Otherwise,
+ * define this macro to enable it in this startup.
+ *
+ * There are two schemes too. One can clear multiple BSS sections. Another
+ * can only clear one section. The former is more size expensive than the
+ * latter.
+ *
+ * Define macro __STARTUP_CLEAR_BSS_MULTIPLE to choose the former.
+ * Otherwise efine macro __STARTUP_CLEAR_BSS to choose the later.
+ */
+#ifdef __STARTUP_CLEAR_BSS_MULTIPLE
+/* Multiple sections scheme.
+ *
+ * Between symbol address __copy_table_start__ and __copy_table_end__,
+ * there are array of tuples specifying:
+ * offset 0: Start of a BSS section
+ * offset 4: Size of this BSS section. Must be multiply of 4
+ */
+ pTable = &__zero_table_start__;
+
+ for (; pTable < &__zero_table_end__; pTable = pTable + 2) {
+ pDest = (uint32_t*)*(pTable + 0);
+ for (; pDest < (uint32_t*)(*(pTable + 0) + *(pTable + 1)) ; ) {
+ *pDest++ = 0;
+ }
+ }
+#elif defined (__STARTUP_CLEAR_BSS)
+/* Single BSS section scheme.
+ *
+ * The BSS section is specified by following symbols
+ * __bss_start__: start of the BSS section.
+ * __bss_end__: end of the BSS section.
+ *
+ * Both addresses must be aligned to 4 bytes boundary.
+ */
+ pDest = &__bss_start__;
+
+ for ( ; pDest < &__bss_end__ ; ) {
+ *pDest++ = 0UL;
+ }
+#endif /* __STARTUP_CLEAR_BSS_MULTIPLE || __STARTUP_CLEAR_BSS */
+
+#ifndef __NO_SYSTEM_INIT
+ SystemInit();
+#endif
+
+#ifndef __START
+#define __START _start
+#endif
+ __START();
+
+}
+
+
+/*----------------------------------------------------------------------------
+ Default Handler for Exceptions / Interrupts
+ *----------------------------------------------------------------------------*/
+void Default_Handler(void) {
+
+ while(1);
+}
diff --git a/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM7_SP/startup_ARMCM7.s b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM7_SP/startup_ARMCM7.s
new file mode 100644
index 0000000..b69f038
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM7_SP/startup_ARMCM7.s
@@ -0,0 +1,262 @@
+;/**************************************************************************//**
+; * @file startup_ARMCM7.s
+; * @brief CMSIS Core Device Startup File for
+; * ARMCM7 Device Series
+; * @version V5.00
+; * @date 02. March 2016
+; ******************************************************************************/
+;/*
+; * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+; *
+; * SPDX-License-Identifier: Apache-2.0
+; *
+; * Licensed under the Apache License, Version 2.0 (the License); you may
+; * not use this file except in compliance with the License.
+; * You may obtain a copy of the License at
+; *
+; * www.apache.org/licenses/LICENSE-2.0
+; *
+; * Unless required by applicable law or agreed to in writing, software
+; * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+; * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the License for the specific language governing permissions and
+; * limitations under the License.
+; */
+
+;/*
+;//-------- <<< Use Configuration Wizard in Context Menu >>> ------------------
+;*/
+
+
+; Stack Configuration
+; Stack Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Stack_Size EQU 0x00000400
+
+ AREA STACK, NOINIT, READWRITE, ALIGN=3
+Stack_Mem SPACE Stack_Size
+__initial_sp
+
+
+; Heap Configuration
+; Heap Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Heap_Size EQU 0x00000C00
+
+ AREA HEAP, NOINIT, READWRITE, ALIGN=3
+__heap_base
+Heap_Mem SPACE Heap_Size
+__heap_limit
+
+
+ PRESERVE8
+ THUMB
+
+
+; Vector Table Mapped to Address 0 at Reset
+
+ AREA RESET, DATA, READONLY
+ EXPORT __Vectors
+ EXPORT __Vectors_End
+ EXPORT __Vectors_Size
+
+__Vectors DCD __initial_sp ; Top of Stack
+ DCD Reset_Handler ; Reset Handler
+ DCD NMI_Handler ; NMI Handler
+ DCD HardFault_Handler ; Hard Fault Handler
+ DCD MemManage_Handler ; MPU Fault Handler
+ DCD BusFault_Handler ; Bus Fault Handler
+ DCD UsageFault_Handler ; Usage Fault Handler
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD SVC_Handler ; SVCall Handler
+ DCD DebugMon_Handler ; Debug Monitor Handler
+ DCD 0 ; Reserved
+ DCD PendSV_Handler ; PendSV Handler
+ DCD SysTick_Handler ; SysTick Handler
+
+ ; External Interrupts
+ DCD WDT_IRQHandler ; 0: Watchdog Timer
+ DCD RTC_IRQHandler ; 1: Real Time Clock
+ DCD TIM0_IRQHandler ; 2: Timer0 / Timer1
+ DCD TIM2_IRQHandler ; 3: Timer2 / Timer3
+ DCD MCIA_IRQHandler ; 4: MCIa
+ DCD MCIB_IRQHandler ; 5: MCIb
+ DCD UART0_IRQHandler ; 6: UART0 - DUT FPGA
+ DCD UART1_IRQHandler ; 7: UART1 - DUT FPGA
+ DCD UART2_IRQHandler ; 8: UART2 - DUT FPGA
+ DCD UART4_IRQHandler ; 9: UART4 - not connected
+ DCD AACI_IRQHandler ; 10: AACI / AC97
+ DCD CLCD_IRQHandler ; 11: CLCD Combined Interrupt
+ DCD ENET_IRQHandler ; 12: Ethernet
+ DCD USBDC_IRQHandler ; 13: USB Device
+ DCD USBHC_IRQHandler ; 14: USB Host Controller
+ DCD CHLCD_IRQHandler ; 15: Character LCD
+ DCD FLEXRAY_IRQHandler ; 16: Flexray
+ DCD CAN_IRQHandler ; 17: CAN
+ DCD LIN_IRQHandler ; 18: LIN
+ DCD I2C_IRQHandler ; 19: I2C ADC/DAC
+ DCD 0 ; 20: Reserved
+ DCD 0 ; 21: Reserved
+ DCD 0 ; 22: Reserved
+ DCD 0 ; 23: Reserved
+ DCD 0 ; 24: Reserved
+ DCD 0 ; 25: Reserved
+ DCD 0 ; 26: Reserved
+ DCD 0 ; 27: Reserved
+ DCD CPU_CLCD_IRQHandler ; 28: Reserved - CPU FPGA CLCD
+ DCD 0 ; 29: Reserved - CPU FPGA
+ DCD UART3_IRQHandler ; 30: UART3 - CPU FPGA
+ DCD SPI_IRQHandler ; 31: SPI Touchscreen - CPU FPGA
+__Vectors_End
+
+__Vectors_Size EQU __Vectors_End - __Vectors
+
+ AREA |.text|, CODE, READONLY
+
+
+; Reset Handler
+
+Reset_Handler PROC
+ EXPORT Reset_Handler [WEAK]
+ IMPORT SystemInit
+ IMPORT __main
+ LDR R0, =SystemInit
+ BLX R0
+ LDR R0, =__main
+ BX R0
+ ENDP
+
+
+; Dummy Exception Handlers (infinite loops which can be modified)
+
+NMI_Handler PROC
+ EXPORT NMI_Handler [WEAK]
+ B .
+ ENDP
+HardFault_Handler\
+ PROC
+ EXPORT HardFault_Handler [WEAK]
+ B .
+ ENDP
+MemManage_Handler\
+ PROC
+ EXPORT MemManage_Handler [WEAK]
+ B .
+ ENDP
+BusFault_Handler\
+ PROC
+ EXPORT BusFault_Handler [WEAK]
+ B .
+ ENDP
+UsageFault_Handler\
+ PROC
+ EXPORT UsageFault_Handler [WEAK]
+ B .
+ ENDP
+SVC_Handler PROC
+ EXPORT SVC_Handler [WEAK]
+ B .
+ ENDP
+DebugMon_Handler\
+ PROC
+ EXPORT DebugMon_Handler [WEAK]
+ B .
+ ENDP
+PendSV_Handler PROC
+ EXPORT PendSV_Handler [WEAK]
+ B .
+ ENDP
+SysTick_Handler PROC
+ EXPORT SysTick_Handler [WEAK]
+ B .
+ ENDP
+
+Default_Handler PROC
+
+ EXPORT WDT_IRQHandler [WEAK]
+ EXPORT RTC_IRQHandler [WEAK]
+ EXPORT TIM0_IRQHandler [WEAK]
+ EXPORT TIM2_IRQHandler [WEAK]
+ EXPORT MCIA_IRQHandler [WEAK]
+ EXPORT MCIB_IRQHandler [WEAK]
+ EXPORT UART0_IRQHandler [WEAK]
+ EXPORT UART1_IRQHandler [WEAK]
+ EXPORT UART2_IRQHandler [WEAK]
+ EXPORT UART3_IRQHandler [WEAK]
+ EXPORT UART4_IRQHandler [WEAK]
+ EXPORT AACI_IRQHandler [WEAK]
+ EXPORT CLCD_IRQHandler [WEAK]
+ EXPORT ENET_IRQHandler [WEAK]
+ EXPORT USBDC_IRQHandler [WEAK]
+ EXPORT USBHC_IRQHandler [WEAK]
+ EXPORT CHLCD_IRQHandler [WEAK]
+ EXPORT FLEXRAY_IRQHandler [WEAK]
+ EXPORT CAN_IRQHandler [WEAK]
+ EXPORT LIN_IRQHandler [WEAK]
+ EXPORT I2C_IRQHandler [WEAK]
+ EXPORT CPU_CLCD_IRQHandler [WEAK]
+ EXPORT SPI_IRQHandler [WEAK]
+
+WDT_IRQHandler
+RTC_IRQHandler
+TIM0_IRQHandler
+TIM2_IRQHandler
+MCIA_IRQHandler
+MCIB_IRQHandler
+UART0_IRQHandler
+UART1_IRQHandler
+UART2_IRQHandler
+UART3_IRQHandler
+UART4_IRQHandler
+AACI_IRQHandler
+CLCD_IRQHandler
+ENET_IRQHandler
+USBDC_IRQHandler
+USBHC_IRQHandler
+CHLCD_IRQHandler
+FLEXRAY_IRQHandler
+CAN_IRQHandler
+LIN_IRQHandler
+I2C_IRQHandler
+CPU_CLCD_IRQHandler
+SPI_IRQHandler
+ B .
+
+ ENDP
+
+
+ ALIGN
+
+
+; User Initial Stack & Heap
+
+ IF :DEF:__MICROLIB
+
+ EXPORT __initial_sp
+ EXPORT __heap_base
+ EXPORT __heap_limit
+
+ ELSE
+
+ IMPORT __use_two_region_memory
+ EXPORT __user_initial_stackheap
+
+__user_initial_stackheap PROC
+ LDR R0, = Heap_Mem
+ LDR R1, =(Stack_Mem + Stack_Size)
+ LDR R2, = (Heap_Mem + Heap_Size)
+ LDR R3, = Stack_Mem
+ BX LR
+ ENDP
+
+ ALIGN
+
+ ENDIF
+
+
+ END
diff --git a/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM7_SP/system_ARMCM7.c b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM7_SP/system_ARMCM7.c
new file mode 100644
index 0000000..65bfaca
--- /dev/null
+++ b/NN/Examples/ARM/arm_nn_examples/gru/RTE/Device/ARMCM7_SP/system_ARMCM7.c
@@ -0,0 +1,85 @@
+/**************************************************************************//**
+ * @file system_ARMCM7.c
+ * @brief CMSIS Device System Source File for
+ * ARMCM7 Device Series
+ * @version V5.00
+ * @date 08. April 2016
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2016 ARM Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined (ARMCM7)
+ #include "ARMCM7.h"
+#elif defined (ARMCM7_SP)
+ #include "ARMCM7_SP.h"
+#elif defined (ARMCM7_DP)
+ #include "ARMCM7_DP.h"
+#else
+ #error device not specified!
+#endif
+
+/*----------------------------------------------------------------------------
+ Define clocks
+ *----------------------------------------------------------------------------*/
+#define XTAL ( 5000000U) /* Oscillator frequency */
+
+#define SYSTEM_CLOCK (5 * XTAL)
+
+
+/*----------------------------------------------------------------------------
+ Externals
+ *----------------------------------------------------------------------------*/
+#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1)
+ extern uint32_t __Vectors;
+#endif
+
+/*----------------------------------------------------------------------------
+ System Core Clock Variable
+ *----------------------------------------------------------------------------*/
+uint32_t SystemCoreClock = SYSTEM_CLOCK;
+
+
+/*----------------------------------------------------------------------------
+ System Core Clock update function
+ *----------------------------------------------------------------------------*/
+void SystemCoreClockUpdate (void)
+{
+ SystemCoreClock = SYSTEM_CLOCK;
+}
+
+/*----------------------------------------------------------------------------
+ System initialization function
+ *----------------------------------------------------------------------------*/
+void SystemInit (void)
+{
+
+#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1)
+ SCB->VTOR = (uint32_t) &__Vectors;
+#endif
+
+#if defined (__FPU_USED) && (__FPU_USED == 1)
+ SCB->CPACR |= ((3U << 10*2) | /* set CP10 Full Access */
+ (3U << 11*2) ); /* set CP11 Full Access */
+#endif
+
+#ifdef UNALIGNED_SUPPORT_DISABLE
+ SCB->CCR |= SCB_CCR_UNALIGN_TRP_Msk;
+#endif
+
+ SystemCoreClock = SYSTEM_CLOCK;
+}
diff --git a/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/NN-example-cifar10.ewp b/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/NN-example-cifar10.ewp
new file mode 100644
index 0000000..d3be3cf
--- /dev/null
+++ b/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/NN-example-cifar10.ewp
@@ -0,0 +1,2260 @@
+
+
+ 3
+
+ Debug
+
+ ARM
+
+ 1
+
+ General
+ 3
+
+ 30
+ 1
+ 1
+
+ ExePath
+ Debug\Exe
+
+
+ ObjPath
+ Debug\Obj
+
+
+ ListPath
+ Debug\List
+
+
+ GEndianMode
+ 0
+
+
+ Input description
+ Automatic choice of formatter, without multibyte support.
+
+
+ Output description
+ Automatic choice of formatter, without multibyte support.
+
+
+ GOutputBinary
+ 0
+
+
+ OGCoreOrChip
+ 2
+
+
+ GRuntimeLibSelect
+ 0
+ 1
+
+
+ GRuntimeLibSelectSlave
+ 0
+ 1
+
+
+ RTDescription
+ Use the normal configuration of the C/C++ runtime library. No locale interface, C locale, no file descriptor support, no multibytes in printf and scanf, and no hex floats in strtod.
+
+
+
+
+ BILINK
+ 0
+
+
+
+ Coder
+ 0
+
+
+
+
+ CMSIS-Pack
+ CMSISPack.ComponentGroup
+
+ $PROJ_DIR$\RTE\RTE_Components.h
+
+
+ Device Startup
+ CMSISPack.Component
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\Device\ARM\ARMCM0\Include\ARMCM0.h
+
+
+ $PROJ_DIR$\RTE\CMSIS\ARM\startup_ARMCM0.s
+
+
+ $PROJ_DIR$\RTE\CMSIS\ARM\system_ARMCM0.c
+
+
+
+ CMSIS NN Lib
+ CMSISPack.Component
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\ConvolutionFunctions\arm_convolve_1x1_HWC_q7_fast_nonsquare.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\ConvolutionFunctions\arm_convolve_HWC_q15_basic.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\ConvolutionFunctions\arm_convolve_HWC_q15_fast.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\ConvolutionFunctions\arm_convolve_HWC_q7_basic.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\ConvolutionFunctions\arm_convolve_HWC_q7_basic_nonsquare.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\ConvolutionFunctions\arm_convolve_HWC_q7_fast.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\ConvolutionFunctions\arm_convolve_HWC_q7_fast_nonsquare.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\ConvolutionFunctions\arm_convolve_HWC_q7_RGB.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\ConvolutionFunctions\arm_depthwise_separable_conv_HWC_q7.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\ConvolutionFunctions\arm_depthwise_separable_conv_HWC_q7_nonsquare.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\FullyConnectedFunctions\arm_fully_connected_mat_q7_vec_q15.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\FullyConnectedFunctions\arm_fully_connected_mat_q7_vec_q15_opt.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\FullyConnectedFunctions\arm_fully_connected_q15.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\FullyConnectedFunctions\arm_fully_connected_q15_opt.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\FullyConnectedFunctions\arm_fully_connected_q7.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\FullyConnectedFunctions\arm_fully_connected_q7_opt.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\ActivationFunctions\arm_nn_activations_q15.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\ActivationFunctions\arm_nn_activations_q7.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\ConvolutionFunctions\arm_nn_mat_mult_kernel_q7_q15.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\ConvolutionFunctions\arm_nn_mat_mult_kernel_q7_q15_reordered.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\NNSupportFunctions\arm_nn_mult_q15.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\NNSupportFunctions\arm_nn_mult_q7.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Include\arm_nnfunctions.h
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\NNSupportFunctions\arm_nntables.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\PoolingFunctions\arm_pool_q7_HWC.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\NNSupportFunctions\arm_q7_to_q15_no_shift.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\NNSupportFunctions\arm_q7_to_q15_reordered_no_shift.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\ActivationFunctions\arm_relu_q15.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\ActivationFunctions\arm_relu_q7.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\SoftmaxFunctions\arm_softmax_q15.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\NN\Source\SoftmaxFunctions\arm_softmax_q7.c
+
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\Documentation\NN\html\index.html
+
+
+
+ CMSIS CORE
+ CMSISPack.Component
+
+ ${CMSIS_PACK_PATH_ARM#CMSIS#5.5.0-dev0}$\CMSIS\Documentation\Core\html\index.html
+
+
+
+
+ Documentation
+
+ $PROJ_DIR$\..\readme_iar.txt
+
+
+
+ Source code
+
+ $PROJ_DIR$\arm_nnexamples_cifar10.cpp
+
+
+
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<configuration xmlns:xs="http://www.w3.org/2001/XMLSchema-instance">
+<packages/>
+<device Dclock="10000000" Dcore="Cortex-M0" DcoreVersion="r0p0" Dendian="Little-endian" Dfamily="ARM Cortex M0" Dfpu="NO_FPU" Dmpu="NO_MPU" Dname="ARMCM0" Dvendor="ARM:82" Pname="">
+<url>http://www.keil.com/dd2/arm/armcm0</url>
+<package name="CMSIS" url="http://www.keil.com/pack/" vendor="ARM" version="5.5.0-dev0"/>
+</device>
+<toolchain Tcompiler="IAR" Toutput="exe"/>
+<components>
+<component Cclass="CMSIS" Cgroup="CORE" Cvendor="ARM" Cversion="5.1.2">
+<package name="CMSIS" url="http://www.keil.com/pack/" vendor="ARM" version="5.5.0-dev0"/>
+<file category="doc" name="CMSIS/Documentation/Core/html/index.html"/>
+<file category="include" name="CMSIS/Core/Include/"/>
+</component>
+<component Cclass="CMSIS" Cgroup="NN Lib" Cvendor="ARM" Cversion="1.1.0">
+<package name="CMSIS" url="http://www.keil.com/pack/" vendor="ARM" version="5.5.0-dev0"/>
+<file category="doc" name="CMSIS/Documentation/NN/html/index.html"/>
+<file category="header" name="CMSIS/NN/Include/arm_nnfunctions.h"/>
+<file category="source" name="CMSIS/NN/Source/ActivationFunctions/arm_nn_activations_q7.c"/>
+<file category="source" name="CMSIS/NN/Source/ActivationFunctions/arm_nn_activations_q15.c"/>
+<file category="source" name="CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c"/>
+<file category="source" name="CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c"/>
+<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c"/>
+<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c"/>
+<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c"/>
+<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c"/>
+<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c"/>
+<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c"/>
+<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c"/>
+<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c"/>
+<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c"/>
+<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c"/>
+<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c"/>
+<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c"/>
+<file category="source" name="CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c"/>
+<file category="source" name="CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c"/>
+<file category="source" name="CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c"/>
+<file category="source" name="CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c"/>
+<file category="source" name="CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c"/>
+<file category="source" name="CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c"/>
+<file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c"/>
+<file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nntables.c"/>
+<file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c"/>
+<file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c"/>
+<file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c"/>
+<file category="source" name="CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c"/>
+<file category="source" name="CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_q15.c"/>
+<file category="source" name="CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_q7.c"/>
+</component>
+<component Cclass="Device" Cgroup="Startup" Cvendor="ARM" Cversion="1.0.1" deviceDependent="1">
+<package name="CMSIS" url="http://www.keil.com/pack/" vendor="ARM" version="5.5.0-dev0"/>
+<file category="header" deviceDependent="1" name="Device/ARM/ARMCM0/Include/ARMCM0.h"/>
+<file attr="config" category="sourceAsm" condition="IAR" deviceDependent="1" name="Device/ARM/ARMCM0/Source/IAR/startup_ARMCM0.s" version="1.0.0"/>
+<file attr="config" category="sourceC" deviceDependent="1" name="Device/ARM/ARMCM0/Source/system_ARMCM0.c" version="1.0.0"/>
+</component>
+</components>
+<apis/>
+</configuration>
+
+
+
diff --git a/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/arm_nnexamples_cifar10.cpp b/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/arm_nnexamples_cifar10.cpp
new file mode 100644
index 0000000..471899c
--- /dev/null
+++ b/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/arm_nnexamples_cifar10.cpp
@@ -0,0 +1,196 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010-2018 Arm Limited. All rights reserved.
+*
+*
+* Project: CMSIS NN Library
+* Title: arm_nnexamples_cifar10.cpp
+*
+* Description: Convolutional Neural Network Example
+*
+* Target Processor: Cortex-M4/Cortex-M7
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+* - Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+* - Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in
+* the documentation and/or other materials provided with the
+* distribution.
+* - Neither the name of Arm LIMITED nor the names of its contributors
+* may be used to endorse or promote products derived from this
+* software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+* POSSIBILITY OF SUCH DAMAGE.
+* -------------------------------------------------------------------- */
+
+/**
+ * @ingroup groupExamples
+ */
+
+/**
+ * @defgroup CNNExample Convolutional Neural Network Example
+ *
+ * \par Description:
+ * \par
+ * Demonstrates a convolutional neural network (CNN) example with the use of convolution,
+ * ReLU activation, pooling and fully-connected functions.
+ *
+ * \par Model definition:
+ * \par
+ * The CNN used in this example is based on CIFAR-10 example from Caffe [1].
+ * The neural network consists
+ * of 3 convolution layers interspersed by ReLU activation and max pooling layers, followed by a
+ * fully-connected layer at the end. The input to the network is a 32x32 pixel color image, which will
+ * be classified into one of the 10 output classes.
+ * This example model implementation needs 32.3 KB to store weights, 40 KB for activations and
+ * 3.1 KB for storing the \c im2col data.
+ *
+ * \image html CIFAR10_CNN.gif "Neural Network model definition"
+ *
+ * \par Variables Description:
+ * \par
+ * \li \c conv1_wt, \c conv2_wt, \c conv3_wt are convolution layer weight matrices
+ * \li \c conv1_bias, \c conv2_bias, \c conv3_bias are convolution layer bias arrays
+ * \li \c ip1_wt, ip1_bias point to fully-connected layer weights and biases
+ * \li \c input_data points to the input image data
+ * \li \c output_data points to the classification output
+ * \li \c col_buffer is a buffer to store the \c im2col output
+ * \li \c scratch_buffer is used to store the activation data (intermediate layer outputs)
+ *
+ * \par CMSIS DSP Software Library Functions Used:
+ * \par
+ * - arm_convolve_HWC_q7_RGB()
+ * - arm_convolve_HWC_q7_fast()
+ * - arm_relu_q7()
+ * - arm_maxpool_q7_HWC()
+ * - arm_avepool_q7_HWC()
+ * - arm_fully_connected_q7_opt()
+ * - arm_fully_connected_q7()
+ *
+ * Refer
+ * \link arm_nnexamples_cifar10.cpp \endlink
+ *
+ * \par [1] https://github.com/BVLC/caffe
+ */
+
+#include
+#include
+#include "arm_math.h"
+#include "arm_nnexamples_cifar10_parameter.h"
+#include "arm_nnexamples_cifar10_weights.h"
+
+#include "arm_nnfunctions.h"
+#include "arm_nnexamples_cifar10_inputs.h"
+
+#ifdef _RTE_
+#include "RTE_Components.h"
+#ifdef RTE_Compiler_EventRecorder
+#include "EventRecorder.h"
+#endif
+#endif
+
+// include the input and weights
+
+static q7_t conv1_wt[CONV1_IM_CH * CONV1_KER_DIM * CONV1_KER_DIM * CONV1_OUT_CH] = CONV1_WT;
+static q7_t conv1_bias[CONV1_OUT_CH] = CONV1_BIAS;
+
+static q7_t conv2_wt[CONV2_IM_CH * CONV2_KER_DIM * CONV2_KER_DIM * CONV2_OUT_CH] = CONV2_WT;
+static q7_t conv2_bias[CONV2_OUT_CH] = CONV2_BIAS;
+
+static q7_t conv3_wt[CONV3_IM_CH * CONV3_KER_DIM * CONV3_KER_DIM * CONV3_OUT_CH] = CONV3_WT;
+static q7_t conv3_bias[CONV3_OUT_CH] = CONV3_BIAS;
+
+static q7_t ip1_wt[IP1_DIM * IP1_OUT] = IP1_WT;
+static q7_t ip1_bias[IP1_OUT] = IP1_BIAS;
+
+/* Here the image_data should be the raw uint8 type RGB image in [RGB, RGB, RGB ... RGB] format */
+uint8_t image_data[CONV1_IM_CH * CONV1_IM_DIM * CONV1_IM_DIM] = IMG_DATA;
+q7_t output_data[IP1_OUT];
+
+//vector buffer: max(im2col buffer,average pool buffer, fully connected buffer)
+q7_t col_buffer[2 * 5 * 5 * 32 * 2];
+
+q7_t scratch_buffer[32 * 32 * 10 * 4];
+
+int main()
+{
+ #ifdef RTE_Compiler_EventRecorder
+ EventRecorderInitialize (EventRecordAll, 1); // initialize and start Event Recorder
+ #endif
+
+ printf("start execution\n");
+ /* start the execution */
+
+ q7_t *img_buffer1 = scratch_buffer;
+ q7_t *img_buffer2 = img_buffer1 + 32 * 32 * 32;
+
+ /* input pre-processing */
+ int mean_data[3] = INPUT_MEAN_SHIFT;
+ unsigned int scale_data[3] = INPUT_RIGHT_SHIFT;
+ for (int i=0;i<32*32*3; i+=3) {
+ img_buffer2[i] = (q7_t)__SSAT( ((((int)image_data[i] - mean_data[0])<<7) + (0x1<<(scale_data[0]-1)))
+ >> scale_data[0], 8);
+ img_buffer2[i+1] = (q7_t)__SSAT( ((((int)image_data[i+1] - mean_data[1])<<7) + (0x1<<(scale_data[1]-1)))
+ >> scale_data[1], 8);
+ img_buffer2[i+2] = (q7_t)__SSAT( ((((int)image_data[i+2] - mean_data[2])<<7) + (0x1<<(scale_data[2]-1)))
+ >> scale_data[2], 8);
+ }
+
+ // conv1 img_buffer2 -> img_buffer1
+ arm_convolve_HWC_q7_RGB(img_buffer2, CONV1_IM_DIM, CONV1_IM_CH, conv1_wt, CONV1_OUT_CH, CONV1_KER_DIM, CONV1_PADDING,
+ CONV1_STRIDE, conv1_bias, CONV1_BIAS_LSHIFT, CONV1_OUT_RSHIFT, img_buffer1, CONV1_OUT_DIM,
+ (q15_t *) col_buffer, NULL);
+
+ arm_relu_q7(img_buffer1, CONV1_OUT_DIM * CONV1_OUT_DIM * CONV1_OUT_CH);
+
+ // pool1 img_buffer1 -> img_buffer2
+ arm_maxpool_q7_HWC(img_buffer1, CONV1_OUT_DIM, CONV1_OUT_CH, POOL1_KER_DIM,
+ POOL1_PADDING, POOL1_STRIDE, POOL1_OUT_DIM, NULL, img_buffer2);
+
+ // conv2 img_buffer2 -> img_buffer1
+ arm_convolve_HWC_q7_fast(img_buffer2, CONV2_IM_DIM, CONV2_IM_CH, conv2_wt, CONV2_OUT_CH, CONV2_KER_DIM,
+ CONV2_PADDING, CONV2_STRIDE, conv2_bias, CONV2_BIAS_LSHIFT, CONV2_OUT_RSHIFT, img_buffer1,
+ CONV2_OUT_DIM, (q15_t *) col_buffer, NULL);
+
+ arm_relu_q7(img_buffer1, CONV2_OUT_DIM * CONV2_OUT_DIM * CONV2_OUT_CH);
+
+ // pool2 img_buffer1 -> img_buffer2
+ arm_maxpool_q7_HWC(img_buffer1, CONV2_OUT_DIM, CONV2_OUT_CH, POOL2_KER_DIM,
+ POOL2_PADDING, POOL2_STRIDE, POOL2_OUT_DIM, col_buffer, img_buffer2);
+
+// conv3 img_buffer2 -> img_buffer1
+ arm_convolve_HWC_q7_fast(img_buffer2, CONV3_IM_DIM, CONV3_IM_CH, conv3_wt, CONV3_OUT_CH, CONV3_KER_DIM,
+ CONV3_PADDING, CONV3_STRIDE, conv3_bias, CONV3_BIAS_LSHIFT, CONV3_OUT_RSHIFT, img_buffer1,
+ CONV3_OUT_DIM, (q15_t *) col_buffer, NULL);
+
+ arm_relu_q7(img_buffer1, CONV3_OUT_DIM * CONV3_OUT_DIM * CONV3_OUT_CH);
+
+ // pool3 img_buffer-> img_buffer2
+ arm_maxpool_q7_HWC(img_buffer1, CONV3_OUT_DIM, CONV3_OUT_CH, POOL3_KER_DIM,
+ POOL3_PADDING, POOL3_STRIDE, POOL3_OUT_DIM, col_buffer, img_buffer2);
+
+ arm_fully_connected_q7_opt(img_buffer2, ip1_wt, IP1_DIM, IP1_OUT, IP1_BIAS_LSHIFT, IP1_OUT_RSHIFT, ip1_bias,
+ output_data, (q15_t *) img_buffer1);
+
+ arm_softmax_q7(output_data, 10, output_data);
+
+ for (int i = 0; i < 10; i++)
+ {
+ printf("%d: %d\n", i, output_data[i]);
+ }
+
+ return 0;
+}
diff --git a/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/arm_nnexamples_cifar10_inputs.h b/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/arm_nnexamples_cifar10_inputs.h
new file mode 100644
index 0000000..c600c5a
--- /dev/null
+++ b/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/arm_nnexamples_cifar10_inputs.h
@@ -0,0 +1,6 @@
+/* Here are two different test images */
+
+//#define IMG_DATA {158,112,49,159,111,47,165,116,51,166,118,53,160,112,46,156,109,41,162,115,47,159,113,45,158,111,44,159,113,41,161,116,41,160,111,52,161,111,49,166,117,41,169,117,45,170,119,44,167,117,40,162,113,38,160,111,39,160,112,43,156,109,44,149,107,45,150,107,45,148,106,43,149,107,44,143,101,39,140,98,43,141,97,41,143,97,38,137,95,36,126,91,36,116,85,33,152,112,51,151,110,40,159,114,45,166,116,56,162,112,49,160,113,43,164,117,47,162,114,45,163,116,46,156,110,38,155,111,41,159,110,54,163,113,52,170,119,41,171,117,40,171,115,33,169,115,30,160,111,33,154,112,41,151,115,50,145,110,53,139,104,55,140,102,52,141,100,48,149,105,50,147,102,46,145,102,45,142,97,38,143,98,34,136,95,31,125,91,32,119,88,34,151,110,47,151,109,33,158,111,36,167,111,48,160,106,42,163,115,44,165,117,45,165,117,45,163,115,43,162,115,43,158,114,48,157,109,57,161,111,51,166,115,38,167,114,37,169,113,35,170,116,39,159,114,47,145,111,54,121,96,49,110,90,52,98,78,50,101,77,47,114,85,50,120,86,48,134,96,55,143,103,51,140,99,39,142,99,35,139,98,34,130,95,34,120,89,33,155,107,40,155,110,32,160,109,31,174,112,44,167,110,43,167,117,46,169,120,48,169,119,48,165,115,44,165,117,45,167,123,57,191,146,95,177,130,75,157,111,41,162,115,47,164,114,54,158,112,58,149,111,67,104,80,47,103,87,65,98,90,76,92,90,84,80,75,66,74,63,50,86,70,52,83,62,39,113,85,45,132,98,46,140,102,43,140,101,39,136,99,39,127,94,36,155,107,41,156,114,48,161,115,49,170,114,47,169,114,43,163,113,40,169,120,47,166,116,44,164,113,41,164,116,42,173,128,59,246,214,164,195,156,107,151,114,56,146,111,60,142,108,71,111,80,50,78,53,31,85,69,56,113,103,98,112,110,111,106,114,118,97,102,105,93,94,93,74,72,67,84,78,70,85,73,47,105,83,45,128,96,48,138,101,46,133,94,36,129,93,36,148,109,54,133,104,64,130,100,57,147,112,53,161,115,44,165,113,39,167,116,41,167,115,41,163,111,37,165,116,39,163,118,42,180,138,85,157,122,78,128,102,58,97,75,43,66,50,31,69,58,43,66,56,45,89,83,76,118,113,110,122,121,120,119,122,122,114,116,116,94,96,96,99,100,97,91,91,86,58,58,47,67,58,37,108,84,49,140,105,58,138,98,44,134,95,40,127,100,57,109,95,80,47,37,17,88,74,28,153,117,48,170,118,43,168,115,40,170,118,43,169,117,42,166,116,37,164,120,39,147,107,52,129,98,59,127,108,75,100,87,70,68,67,57,78,83,72,72,75,64,83,84,74,132,130,121,146,142,132,124,118,108,105,99,90,107,102,94,115,111,103,85,83,77,63,71,69,46,47,39,79,61,36,132,98,58,141,99,48,134,93,39,131,115,90,99,96,92,42,43,38,70,64,41,143,111,56,167,117,42,165,114,36,168,116,39,171,119,49,161,113,51,140,109,51,120,94,49,130,110,77,144,131,107,116,106,93,88,87,79,91,95,88,85,88,82,77,77,69,124,118,107,163,153,140,136,124,112,102,93,81,106,98,88,100,93,84,85,81,74,54,60,58,49,53,49,57,47,32,107,83,50,138,103,51,136,97,39,170,161,144,103,105,105,54,58,59,124,121,113,153,124,82,161,113,43,163,117,41,166,122,50,165,121,66,174,135,95,113,89,59,125,105,78,157,141,121,156,143,128,121,111,101,86,80,74,82,81,77,84,85,82,80,78,73,81,71,61,138,125,112,146,135,123,113,103,93,87,79,70,83,77,69,86,82,76,71,73,67,56,57,53,40,35,27,74,59,35,133,106,59,137,103,45,180,176,163,134,139,143,94,100,105,154,154,149,174,149,112,158,116,51,156,116,47,153,118,60,207,180,146,237,214,198,207,180,166,156,131,119,174,153,145,148,131,125,125,110,107,93,85,79,86,84,79,74,74,71,59,57,53,76,68,58,137,125,112,143,133,122,133,124,114,106,98,89,86,81,74,87,85,78,84,85,78,75,76,71,50,49,43,40,30,15,95,75,44,132,103,57,183,183,175,108,116,122,142,151,158,165,169,168,177,156,122,155,112,50,159,118,51,122,89,47,213,197,179,237,224,226,220,191,188,164,135,131,183,159,155,156,137,132,125,108,104,120,111,104,78,76,69,80,80,77,45,44,40,91,85,77,175,165,154,157,147,137,155,147,138,107,100,92,87,83,77,103,102,96,88,88,79,78,79,73,59,59,59,41,36,33,59,46,31,104,81,46,188,191,189,100,108,116,135,144,153,170,175,178,187,167,136,166,120,59,173,123,55,134,93,44,117,95,80,194,182,188,199,171,164,170,142,133,185,161,151,189,171,159,134,119,106,117,107,95,102,98,89,84,84,79,38,38,34,125,121,113,210,201,192,160,152,142,146,139,130,93,89,82,83,80,75,94,93,88,104,104,94,85,87,81,73,75,78,55,53,55,62,55,48,76,56,26,189,194,194,90,96,105,127,134,144,175,180,185,174,156,133,166,123,68,178,123,53,159,109,47,97,68,44,168,154,152,168,144,126,137,114,94,186,166,148,216,202,183,160,149,129,123,113,98,120,114,105,115,114,109,50,50,47,150,147,140,194,187,178,155,149,140,123,118,111,91,88,83,84,83,79,84,84,80,95,95,85,86,87,81,84,87,89,73,73,73,79,74,64,73,55,24,189,192,193,93,95,103,152,154,163,185,188,192,119,110,98,136,106,66,173,124,58,167,116,50,103,72,39,147,132,120,145,125,103,167,149,127,189,174,155,226,216,200,180,172,157,141,131,117,126,117,107,117,114,109,71,71,68,154,152,147,186,181,174,149,144,136,114,110,104,87,85,80,80,80,76,72,73,70,80,80,72,99,100,94,100,101,99,90,88,81,97,89,69,94,73,34,194,196,196,108,107,112,168,167,172,186,186,188,105,109,109,99,89,67,156,119,62,167,122,55,100,74,34,115,106,88,138,123,103,198,185,169,190,180,169,172,165,159,145,140,140,154,143,134,146,136,125,103,100,95,71,71,70,152,152,149,179,175,170,137,133,127,130,128,122,110,109,105,85,86,83,91,93,91,95,96,90,109,110,104,115,116,111,100,96,80,97,85,53,117,95,47,197,197,197,132,129,136,172,167,174,184,178,181,130,137,142,78,83,77,140,120,88,155,125,77,115,94,52,130,120,93,143,131,116,230,221,211,242,236,230,145,138,137,135,130,130,131,121,112,121,112,101,108,104,95,95,88,75,144,134,118,168,159,146,152,147,138,112,108,101,87,85,80,71,72,68,87,88,87,105,104,99,112,109,99,120,110,93,103,86,54,121,96,48,136,104,48,203,203,204,146,146,160,168,164,178,191,182,188,168,170,172,78,86,90,126,125,126,138,126,113,138,121,82,96,80,37,154,143,133,173,163,155,162,152,141,140,132,117,113,106,88,113,106,90,101,101,92,105,101,87,112,90,58,171,143,104,156,138,109,148,141,126,135,130,118,109,105,97,78,76,72,79,79,77,94,93,94,101,91,82,107,83,55,125,88,45,151,108,55,144,104,46,214,215,215,163,166,180,164,167,184,183,184,194,176,182,186,94,102,105,96,96,102,156,149,145,148,137,111,106,93,61,129,116,105,118,105,95,114,102,89,116,105,89,102,91,73,115,110,98,86,91,88,101,103,95,144,128,102,118,96,64,68,56,32,128,120,105,133,126,115,75,69,61,60,56,51,58,56,53,71,70,65,102,93,78,116,94,64,143,112,68,150,116,64,140,110,54,212,211,205,178,184,192,167,175,189,173,181,193,176,184,188,124,131,133,86,88,96,141,139,143,153,148,141,135,128,111,104,90,80,77,64,55,134,121,108,124,111,96,129,117,100,147,143,133,85,92,93,92,96,93,150,139,120,132,117,93,117,109,92,107,99,86,75,68,58,64,59,52,44,41,39,65,62,60,86,69,40,133,105,59,155,119,62,160,120,54,154,115,45,151,111,46,199,192,180,187,189,187,171,176,181,174,179,185,177,182,184,144,149,152,86,90,99,119,121,132,122,124,130,137,136,135,144,134,126,70,59,51,129,118,108,108,97,86,145,134,123,184,176,168,116,118,118,73,75,73,131,119,103,137,124,105,134,129,118,89,86,78,51,49,44,52,51,50,47,49,52,90,90,93,121,91,60,163,118,68,171,121,64,164,113,52,158,111,50,149,107,46,165,156,146,195,193,187,179,178,175,177,173,172,181,181,180,152,157,160,99,103,111,131,135,146,171,175,185,103,105,111,93,90,87,80,77,73,93,90,86,122,118,116,178,173,173,191,182,177,150,148,148,100,100,101,89,78,66,87,77,63,60,61,57,46,52,54,38,46,51,24,33,41,46,57,69,60,71,83,108,100,75,144,125,82,144,123,76,128,109,61,127,113,69,120,105,63,117,120,124,195,200,200,177,178,176,178,169,168,181,179,179,138,144,147,83,87,91,150,153,159,245,247,250,219,222,225,133,140,144,134,141,147,149,156,164,176,182,192,190,196,208,194,192,197,168,172,181,125,133,143,110,109,109,61,62,62,35,49,58,34,54,68,49,70,87,58,81,102,61,85,110,58,84,111,69,99,122,72,101,119,78,104,120,69,96,112,59,92,112,55,90,115,79,105,133,175,197,213,174,183,192,176,172,177,177,177,182,140,146,150,109,112,113,211,211,209,253,252,247,252,253,252,208,224,232,124,143,157,114,132,149,124,141,162,116,133,156,122,133,152,104,124,148,68,93,119,68,87,104,60,82,101,52,84,111,50,84,110,51,85,115,56,93,125,56,94,131,51,91,130,43,96,135,51,104,141,59,108,142,48,97,132,43,97,137,42,95,132,41,89,135,96,137,168,144,168,188,168,174,188,178,182,192,165,170,174,165,166,164,246,245,237,253,251,241,227,231,228,110,136,153,60,88,111,53,80,105,49,76,105,49,75,107,48,72,101,45,79,115,42,81,120,46,81,113,42,82,116,38,86,125,46,90,125,46,89,126,43,87,128,42,89,132,46,93,139,46,94,137,50,96,137,55,96,135,53,94,134,51,95,139,45,90,133,29,91,141,29,87,130,59,102,134,131,153,176,166,179,191,132,136,137,194,189,181,254,250,242,241,245,245,141,159,175,61,94,127,50,84,118,50,84,119,51,85,121,49,83,120,50,84,116,47,86,117,42,84,117,39,82,115,34,79,113,35,83,120,39,86,125,38,85,125,42,89,130,45,92,134,56,103,145,62,103,142,59,101,142,56,102,146,50,99,144,46,94,140,51,103,149,48,111,162,30,94,140,34,85,124,73,106,136,128,148,167,128,136,143,215,213,209,255,253,249,187,198,205,66,93,118,54,91,128,50,88,125,52,90,127,52,90,127,46,83,121,45,82,115,43,82,113,41,81,112,36,80,113,39,83,117,40,86,123,40,89,131,43,92,134,46,95,138,59,108,150,62,110,152,64,109,147,59,108,149,54,108,154,50,105,152,70,123,167,83,137,182,52,114,165,35,99,147,31,86,130,41,83,122,66,95,126,128,145,164,224,229,234,240,245,247,124,143,153,58,92,114,49,87,123,56,94,131,54,92,129,44,82,119,44,82,119,47,83,119,46,84,119,43,83,119,43,86,123,44,88,127,44,90,131,45,97,141,54,106,150,58,110,154,54,105,150,46,97,141,43,95,140,36,91,138,51,108,158,73,130,178,85,138,182,76,125,169,50,110,162,35,98,149,29,89,138,35,86,133,44,83,126,78,106,138,202,219,233,211,228,234,97,126,140,65,104,126,54,94,129,48,87,124,58,97,133,48,87,123,40,80,116,45,82,119,47,84,122,48,87,126,47,89,130,46,89,132,51,97,140,39,92,138,39,93,139,48,102,148,47,101,147,39,93,139,28,85,133,40,101,153,67,129,182,67,126,176,46,98,142,51,96,139,50,108,161,35,97,147,32,92,143,33,88,141,41,88,138,46,84,125,104,133,159,170,197,211,64,100,119,54,97,121,52,94,128,53,95,130,61,103,139,58,100,135,54,96,131,45,83,120,42,79,118,41,80,120,46,88,130,49,92,135,46,92,136,42,95,139,40,93,138,39,92,136,37,90,135,40,93,138,44,102,151,63,125,178,47,110,164,31,90,140,15,60,103,51,93,136,68,124,177,42,100,148,31,88,137,38,91,146,37,87,139,43,89,132,42,79,113,71,107,133,49,89,114,31,77,105,27,71,105,38,82,117,49,93,128,56,100,135,58,102,137,53,92,128,56,94,131,60,99,137,57,99,139,53,97,138,50,95,137,45,94,136,39,88,131,33,83,125,42,91,133,62,112,154,79,132,179,73,131,181,56,116,168,38,97,146,13,64,108,40,85,127,61,116,168,49,102,148,35,85,132,43,91,143,39,90,139,42,92,134,44,88,125,40,81,112,42,85,115,27,72,104,23,67,102,30,74,109,27,71,106,29,73,108,36,80,115,47,86,120,56,95,128,62,101,135,66,109,144,75,119,156,69,113,152,49,95,134,43,88,127,43,88,127,60,105,144,85,130,170,109,156,197,93,145,190,60,115,164,26,82,130,29,82,126,20,64,107,54,107,160,56,105,149,45,89,132,43,86,134,40,89,134,40,92,132,40,87,123,38,81,115,36,79,114,26,69,105,22,66,101,29,73,108,25,69,104,29,73,108,19,63,98,18,58,89,32,70,100,47,87,118,61,104,137,74,119,152,66,111,145,53,96,131,52,95,130,45,87,123,67,109,145,89,131,167,105,146,182,89,135,175,48,99,145,24,77,124,34,84,129,21,67,110}
+
+
+#define IMG_DATA {235,235,235,231,231,231,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,233,233,233,233,233,233,233,233,233,233,233,233,233,233,233,233,232,233,233,231,233,232,231,233,231,233,233,230,233,232,232,232,234,232,231,234,232,232,232,233,233,230,232,233,231,233,233,233,232,232,232,232,232,232,232,232,232,233,233,233,233,233,233,232,232,232,238,238,238,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,236,236,236,236,236,236,236,236,236,236,236,236,236,236,236,236,236,236,237,234,233,236,234,233,236,236,234,234,236,234,234,235,237,234,234,238,235,236,237,236,236,235,236,236,234,236,236,236,235,235,235,235,235,235,235,235,235,236,236,236,236,236,236,235,235,235,237,237,237,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,235,235,235,235,234,234,236,233,231,236,234,231,235,235,234,234,235,236,227,230,233,231,235,238,231,233,235,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,235,235,235,235,235,235,234,234,234,238,238,238,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,234,235,235,235,235,235,234,233,233,230,232,232,231,228,230,232,223,226,231,186,192,197,209,216,219,207,210,213,228,228,230,236,235,235,234,234,234,234,234,234,234,234,234,234,234,234,235,235,235,235,235,235,235,235,235,237,237,237,234,234,234,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,235,234,234,234,234,234,234,235,235,235,235,235,235,234,234,234,234,234,234,235,235,235,235,235,235,236,238,236,233,237,237,219,225,230,203,210,219,163,172,179,195,205,208,214,218,221,230,229,232,237,235,237,235,235,235,235,235,235,235,235,235,235,236,236,236,236,236,236,236,236,236,236,236,239,239,238,236,235,235,236,235,235,236,235,235,236,235,235,236,235,235,235,236,235,235,235,235,234,234,234,235,235,235,237,236,236,237,236,236,234,235,236,232,233,234,235,237,237,229,231,232,208,216,218,194,205,210,185,198,207,174,188,200,165,179,189,184,196,202,207,215,220,226,228,232,236,235,237,236,236,235,236,236,235,236,236,235,236,236,236,237,237,237,237,237,237,237,237,237,228,229,229,228,227,228,232,230,231,231,228,230,234,232,233,237,236,236,237,237,235,236,237,235,237,235,236,237,235,236,239,236,237,239,237,238,225,229,230,224,228,229,233,237,238,221,226,228,183,197,204,161,180,190,159,180,191,154,176,190,144,163,177,143,159,171,156,169,177,198,206,211,233,238,239,236,237,234,235,236,233,235,235,235,235,236,236,236,238,237,237,237,237,239,237,238,212,220,222,224,230,233,230,234,238,227,232,234,229,234,234,234,237,236,237,238,235,238,237,236,239,237,238,239,237,238,239,236,237,240,238,239,201,204,203,219,222,221,233,236,235,214,218,218,193,204,210,185,201,210,184,201,211,173,191,203,165,182,196,159,174,187,162,176,185,186,199,204,229,239,240,234,239,238,233,238,237,233,238,238,234,239,238,236,239,238,237,239,238,238,238,238,216,234,241,221,236,243,225,238,246,225,239,243,227,240,240,231,238,237,236,237,235,238,236,235,238,236,237,238,236,237,237,237,237,239,239,239,197,198,196,220,221,218,233,234,231,230,231,229,209,213,217,209,216,222,219,228,235,208,218,227,209,221,234,210,224,235,217,233,240,218,235,241,225,240,243,228,238,240,228,239,240,230,240,240,230,240,239,235,240,239,237,240,239,238,238,238,118,140,149,119,138,148,124,142,153,136,155,161,172,188,191,225,234,233,235,236,233,237,234,232,236,233,234,235,235,235,235,237,236,233,237,235,214,216,214,226,228,226,232,234,232,236,237,236,228,230,232,227,230,235,231,236,241,225,232,239,225,237,247,217,233,243,201,219,226,185,204,211,172,189,195,167,179,186,167,180,185,186,199,201,223,235,235,235,241,239,236,240,239,238,240,239,109,130,141,103,121,133,108,125,137,111,127,137,146,159,165,222,229,231,227,228,225,229,226,224,236,232,233,234,234,234,231,236,234,230,237,235,229,234,235,231,235,236,232,237,238,230,235,236,231,236,238,231,237,240,229,237,241,223,232,238,191,206,213,164,184,191,146,165,172,137,156,163,134,149,159,128,140,153,121,133,143,149,162,166,216,228,229,234,241,239,235,240,238,237,240,239,195,212,224,188,202,215,199,211,224,200,211,223,209,217,227,223,227,231,213,213,211,211,209,206,216,213,214,220,222,222,219,226,225,210,221,219,209,219,223,211,221,225,216,225,230,220,229,233,225,234,237,226,236,239,225,237,241,218,231,237,183,204,208,175,198,203,181,200,207,178,194,202,186,197,211,170,178,196,142,151,164,185,195,202,219,230,233,231,240,238,234,241,239,236,240,239,193,207,222,191,202,217,202,211,224,214,217,234,223,225,241,214,219,227,203,208,208,171,174,174,177,180,183,207,213,214,174,184,188,98,112,121,93,114,126,101,121,132,111,129,139,122,138,147,137,152,161,153,167,174,202,216,220,223,236,237,218,232,235,220,233,238,223,234,240,217,226,233,221,228,237,212,219,229,196,203,212,222,230,237,219,227,234,221,230,233,232,239,242,235,241,242,113,130,152,111,125,147,113,125,141,125,131,151,138,145,165,170,182,193,191,201,205,190,199,204,208,219,226,216,230,234,158,172,183,54,71,92,45,70,91,49,73,91,53,73,90,66,84,98,102,114,129,159,168,179,221,227,233,234,239,241,233,237,241,227,231,237,223,228,233,207,211,217,202,208,212,211,218,220,212,219,223,199,206,214,179,186,196,188,197,205,211,221,227,221,231,234,61,81,108,69,86,114,63,79,100,68,85,102,123,141,155,139,155,164,151,157,164,195,200,207,214,228,234,206,223,228,163,180,190,103,121,138,95,112,131,101,117,135,138,151,168,181,192,207,207,212,223,221,222,232,219,219,227,205,203,212,183,186,195,158,166,174,147,154,163,131,138,147,125,133,140,130,139,144,136,146,152,133,142,151,128,137,147,138,153,160,182,197,203,197,212,216,40,53,77,58,70,94,85,98,116,127,144,153,132,151,156,96,107,110,119,115,118,163,158,161,173,180,182,184,194,197,182,194,198,181,193,200,183,194,202,198,209,217,218,228,236,200,210,217,174,181,186,159,165,172,145,150,159,132,136,149,116,125,138,98,111,123,94,106,118,99,111,123,105,118,128,107,121,130,122,135,145,138,151,161,150,164,174,157,174,184,188,206,213,185,203,208,13,15,35,26,29,47,134,140,151,206,216,220,138,150,150,118,123,123,141,133,134,172,162,162,181,181,180,207,209,211,220,224,225,228,234,233,224,234,232,230,241,240,226,238,238,176,189,190,144,159,163,138,154,162,142,158,170,145,163,177,154,171,187,149,165,182,149,165,182,154,171,187,157,174,189,160,177,191,173,190,204,187,204,217,190,207,218,178,196,208,165,183,193,157,175,183,5,5,24,58,62,79,200,207,217,225,232,239,197,205,212,199,207,211,212,212,218,226,224,229,229,230,237,233,236,246,232,238,245,230,238,239,209,221,220,223,238,239,221,238,241,210,228,234,198,217,228,180,200,214,193,216,230,188,213,229,189,212,231,194,214,234,192,212,232,184,204,224,172,193,212,171,191,209,161,181,197,144,165,179,136,156,169,131,146,161,128,143,158,138,154,165,39,45,71,145,155,179,190,204,222,186,196,216,184,197,217,192,211,229,194,211,230,194,208,227,194,206,227,191,203,228,192,207,228,190,207,221,177,193,207,180,198,215,154,176,193,147,169,188,145,161,184,156,171,195,146,163,186,113,133,156,114,137,161,132,157,180,126,150,173,111,135,158,92,115,138,91,112,135,93,114,133,94,116,131,105,125,140,121,133,151,129,141,158,129,142,156,122,135,161,162,179,207,143,160,194,137,154,189,131,152,187,128,152,190,127,150,192,130,150,193,131,150,192,128,147,190,127,147,189,129,149,189,129,149,188,124,145,186,104,126,163,100,122,154,102,120,154,118,134,170,112,128,163,94,109,145,94,112,148,94,117,153,87,112,144,83,103,136,80,97,130,83,103,134,93,111,139,101,117,141,108,121,144,115,125,146,121,133,148,130,144,156,73,87,109,76,90,113,77,90,122,80,93,127,84,98,134,87,102,142,87,102,147,90,105,150,94,111,152,102,119,160,107,124,165,113,131,172,115,137,181,118,136,186,118,132,180,120,133,175,115,136,172,110,133,168,106,127,163,100,119,155,95,109,148,85,101,139,79,97,132,80,92,127,80,94,129,77,100,133,80,100,129,82,98,122,92,104,126,113,119,138,125,135,146,136,149,156,13,25,41,3,11,25,9,16,35,18,26,48,18,26,52,21,25,56,20,25,58,22,30,61,26,36,62,34,43,70,42,51,77,48,59,87,52,69,106,60,75,121,66,77,126,70,79,126,71,87,127,72,88,126,67,81,120,60,72,112,55,67,106,53,68,104,53,69,103,57,69,102,57,71,105,57,78,110,72,89,115,87,100,119,104,113,128,120,124,136,130,136,141,137,146,149,36,46,55,11,16,20,8,13,19,32,44,53,36,45,58,22,25,41,8,11,30,3,8,24,1,4,17,0,2,15,0,2,15,0,4,20,6,13,42,5,18,56,1,19,60,3,23,62,13,29,71,24,38,81,21,33,77,21,31,76,21,38,78,22,44,79,30,50,83,39,58,90,57,70,101,85,90,118,113,115,138,123,123,138,116,115,125,122,123,128,134,139,137,153,160,158,35,41,45,26,27,26,13,19,18,27,41,41,71,81,84,70,70,76,49,50,57,27,31,37,15,15,21,5,5,11,2,2,7,0,0,7,17,17,35,57,64,91,31,50,78,10,36,62,4,30,60,4,30,62,7,30,63,14,35,69,25,43,74,41,55,83,62,71,99,86,97,123,122,124,146,144,131,149,132,120,135,114,105,114,117,111,116,132,134,133,146,152,146,172,179,175,16,15,17,13,10,9,4,10,8,3,12,11,45,44,46,65,52,57,54,43,47,36,33,35,18,18,20,4,4,7,2,2,4,0,1,3,7,8,15,118,117,134,161,158,179,131,128,148,112,112,131,105,105,125,105,103,124,109,105,127,118,107,126,138,115,133,154,126,144,151,126,141,127,106,116,105,86,91,106,94,97,120,116,116,129,130,129,142,147,144,164,172,165,184,194,190,40,40,35,12,10,7,0,3,3,0,4,4,12,6,7,30,12,17,32,12,17,21,10,12,7,6,7,2,1,3,2,1,2,3,2,3,0,0,2,68,58,64,182,128,146,205,130,148,196,127,144,194,123,141,195,119,137,187,113,129,172,110,122,150,96,106,123,75,83,103,66,69,95,71,70,104,93,88,122,118,113,129,132,126,132,141,135,152,162,158,171,182,176,185,197,194,69,77,64,26,29,21,1,1,1,1,1,2,4,1,0,12,2,5,18,3,9,12,2,5,4,1,2,2,0,0,2,0,0,4,0,1,1,1,1,32,12,11,153,45,59,203,47,68,195,46,67,191,48,69,179,50,67,155,49,59,119,42,49,91,38,42,81,48,46,94,77,71,117,110,102,125,126,116,125,128,120,129,135,128,144,153,147,162,176,171,173,187,183,184,198,196,83,94,82,47,52,43,1,1,1,2,1,2,2,0,0,5,1,2,7,1,5,4,0,2,1,0,0,1,0,0,1,0,0,3,0,0,1,2,0,27,3,2,142,25,38,205,32,54,198,25,46,169,25,43,121,25,36,85,29,34,74,41,39,85,66,56,102,92,82,121,113,105,128,124,115,122,126,115,121,127,118,132,139,131,147,157,150,165,179,174,176,191,187,186,201,199,92,102,93,54,60,50,6,7,3,3,2,1,2,2,0,1,3,1,1,3,3,1,2,2,1,1,1,1,0,0,1,0,0,1,1,1,0,3,2,15,1,0,102,19,28,157,31,47,117,17,23,74,13,12,56,27,22,74,58,55,99,90,81,115,115,99,122,126,111,124,124,112,123,123,113,125,130,119,128,135,126,136,145,137,148,159,151,162,176,171,177,192,188,188,202,201,87,99,89,43,51,37,19,23,11,11,12,4,8,10,2,5,11,4,2,10,4,2,7,2,3,4,1,3,4,1,3,4,1,2,3,2,0,6,6,4,5,2,42,13,13,71,21,24,53,27,25,57,50,41,80,77,62,113,98,82,132,113,101,134,126,113,123,126,112,116,125,111,120,128,115,131,138,126,139,148,137,143,154,145,156,168,161,169,184,179,182,197,193,188,202,201,82,96,82,46,57,36,36,44,22,31,35,17,27,30,15,22,28,15,17,26,13,16,23,12,18,21,12,19,21,13,20,22,14,19,23,15,19,27,20,23,31,21,37,40,27,64,55,45,87,70,67,104,88,81,116,102,85,128,112,88,139,121,105,131,122,110,117,122,107,115,127,112,123,133,119,131,139,127,139,149,138,148,160,151,159,172,164,174,189,183,185,200,196,187,202,200,85,101,83,62,75,48,58,67,38,55,61,37,51,56,35,47,53,33,46,53,34,48,55,38,49,55,40,51,56,41,53,58,44,55,62,46,59,67,45,68,71,48,81,84,59,104,96,74,116,103,83,127,109,92,133,116,97,127,121,97,127,127,107,118,124,106,114,125,108,122,131,117,129,136,123,136,145,133,141,152,141,149,162,153,158,171,163,168,183,178,180,195,191,186,200,199}
diff --git a/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/arm_nnexamples_cifar10_parameter.h b/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/arm_nnexamples_cifar10_parameter.h
new file mode 100644
index 0000000..09d0ca3
--- /dev/null
+++ b/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/arm_nnexamples_cifar10_parameter.h
@@ -0,0 +1,43 @@
+#define CONV1_IM_DIM 32
+#define CONV1_IM_CH 3
+#define CONV1_KER_DIM 5
+#define CONV1_PADDING 2
+#define CONV1_STRIDE 1
+#define CONV1_OUT_CH 32
+#define CONV1_OUT_DIM 32
+
+#define POOL1_KER_DIM 3
+#define POOL1_STRIDE 2
+#define POOL1_PADDING 0
+#define POOL1_OUT_DIM 16
+
+#define CONV2_IM_DIM 16
+#define CONV2_IM_CH 32
+#define CONV2_KER_DIM 5
+#define CONV2_PADDING 2
+#define CONV2_STRIDE 1
+#define CONV2_OUT_CH 16
+#define CONV2_OUT_DIM 16
+
+#define POOL2_KER_DIM 3
+#define POOL2_STRIDE 2
+#define POOL2_PADDING 0
+#define POOL2_OUT_DIM 8
+
+#define CONV3_IM_DIM 8
+#define CONV3_IM_CH 16
+#define CONV3_KER_DIM 5
+#define CONV3_PADDING 2
+#define CONV3_STRIDE 1
+#define CONV3_OUT_CH 32
+#define CONV3_OUT_DIM 8
+
+#define POOL3_KER_DIM 3
+#define POOL3_STRIDE 2
+#define POOL3_PADDING 0
+#define POOL3_OUT_DIM 4
+
+#define IP1_DIM 4*4*32
+#define IP1_IM_DIM 4
+#define IP1_IM_CH 32
+#define IP1_OUT 10
diff --git a/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/arm_nnexamples_cifar10_weights.h b/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/arm_nnexamples_cifar10_weights.h
new file mode 100644
index 0000000..8d92d21
--- /dev/null
+++ b/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/arm_nnexamples_cifar10_weights.h
@@ -0,0 +1,26 @@
+#define CONV1_WT {-9,-1,2,6,-4,6,4,-11,8,-9,-11,10,-12,5,15,11,-1,-1,33,-25,-18,47,-35,-23,25,-27,-5,0,-4,9,2,-5,0,34,-25,-21,55,-40,-33,34,-32,-16,5,-7,2,-21,16,14,-4,5,3,14,-10,-7,3,-10,-2,-8,5,8,-29,23,12,-25,15,8,-15,4,7,-16,5,8,-13,17,12,2,-4,-1,-1,-10,9,6,-4,21,11,0,12,-10,-9,-4,13,10,12,19,11,27,0,-5,11,-21,-26,-28,-7,3,-15,16,14,26,-22,-25,-8,-29,-24,-23,0,6,-13,13,25,0,-21,-18,-3,-21,-19,-13,11,14,4,13,18,-4,6,16,-1,-10,-14,-7,12,9,5,13,8,-4,9,10,-9,-4,7,-3,7,7,-12,7,3,0,-1,-7,2,-8,-5,9,-3,6,-4,7,-10,-25,7,-7,-7,10,2,14,-4,2,17,-9,5,-3,7,-10,-25,4,-10,-8,0,0,16,-7,11,25,-14,13,4,7,-12,-23,5,-16,-17,8,0,11,2,13,21,-11,9,-3,2,-9,-9,-3,-14,-11,1,-1,6,-6,6,14,-1,16,11,7,3,-14,21,6,-6,17,13,4,1,7,-3,-6,2,2,27,13,-5,18,0,-7,-6,-4,2,-24,-11,0,-18,-8,6,24,8,-14,11,-6,-8,-8,0,12,-22,-2,13,-15,-3,7,29,6,-16,10,-6,-4,-12,-7,11,-26,-14,3,-10,-8,0,16,5,-8,-5,-5,3,-15,-2,23,-17,-5,17,1,1,12,10,3,-5,-7,-18,-7,-3,-4,2,3,6,4,4,2,-7,17,20,12,-23,-27,-15,-3,3,6,5,15,6,-2,2,-5,22,22,11,-26,-30,-19,-6,-5,3,10,12,6,2,1,-3,20,14,4,-15,-26,-18,0,-9,1,14,8,6,1,-7,-9,8,11,6,-10,-13,-11,-4,-4,8,2,5,17,-9,0,11,-9,-11,21,6,13,39,-7,1,30,2,12,36,-11,-1,28,-2,-25,-6,6,-16,-6,-9,-34,-24,3,-17,-15,2,-20,-11,0,-13,-11,25,13,6,0,-15,-20,16,7,-11,7,-3,-10,-5,-2,-6,14,22,12,-16,-5,-12,7,23,7,-10,5,-2,-10,-1,-9,7,21,8,-19,-2,-13,2,21,3,-15,9,-2,-4,-2,-2,-12,2,13,-19,1,6,0,16,-1,5,8,-28,11,-2,-4,-10,-1,21,-30,-4,22,-8,14,9,11,14,-21,20,-9,-23,0,-4,19,-28,-9,31,-30,-1,12,3,14,-11,30,3,-24,3,-8,6,-22,-9,31,-29,2,26,-5,11,8,24,2,-36,24,3,-9,-6,-17,4,-8,-8,12,1,-1,5,3,-12,15,-6,-2,-1,0,15,-11,-18,6,-18,-10,6,3,6,-3,3,-12,13,-14,-1,43,-23,-3,52,-16,-11,28,-8,4,-6,5,-11,16,-6,-20,32,-28,-24,38,-31,-22,20,-16,5,-13,7,-2,5,3,-6,17,-10,-8,22,-13,-5,13,0,11,-16,14,-3,-14,6,3,-4,4,4,-2,2,-3,-9,5,2,1,-2,-10,-6,-3,-17,-2,1,-19,2,1,-14,8,11,14,3,7,1,-13,-13,2,-1,-3,-9,0,-5,-18,4,0,18,3,9,15,-10,-8,10,-12,-18,5,-3,-16,-10,0,-14,5,4,14,14,-1,10,16,-6,-2,14,-1,-12,8,2,-18,-13,-1,17,-2,2,24,-4,-3,17,8,-1,12,9,-6,-3,-2,-9,3,-2,-13,-1,-5,-17,-5,-7,-16,-7,-8,-14,-9,3,-9,-1,0,-13,-7,4,-9,-7,3,-6,-8,-2,-4,-7,5,-7,-4,1,-8,-11,10,6,-3,14,12,-1,6,10,-2,0,-12,-16,-3,-9,-19,0,1,-12,9,12,-2,1,10,-1,10,4,-8,6,3,-11,11,14,1,12,19,8,3,16,9,-7,-1,4,-11,-1,3,1,12,8,7,15,13,-5,-2,4,-7,-5,1,-8,-3,2,1,10,9,-1,8,9,-11,-9,-2,0,-1,6,-14,-13,-6,1,3,10,3,4,11,-1,-5,5,7,4,10,-14,-17,-10,-12,-13,-6,-11,-14,-6,1,-6,1,2,-1,1,-5,-7,-3,-2,-4,1,-11,-13,-9,-12,-16,-12,-2,-1,-2,-9,-5,-4,-7,-3,3,-1,-3,8,9,-2,1,8,11,13,5,14,17,1,10,17,-1,3,18,0,-6,0,-1,0,3,4,13,16,-1,8,14,-4,2,14,-16,-20,-14,-3,-3,0,13,21,25,5,11,14,-5,-2,9,1,-5,2,-6,-9,-14,-7,-4,-8,-10,-9,-13,-10,-12,-12,7,-1,-2,-5,2,-14,-5,3,-6,1,9,3,-8,3,-1,-9,2,0,-12,-4,-21,6,17,8,17,28,22,-5,6,2,-15,-8,-7,8,21,-3,2,20,-5,-1,16,-8,7,17,3,0,6,1,11,26,0,-22,-4,-33,-36,-16,-40,-2,14,1,2,10,5,1,17,-10,-16,3,-22,-22,-1,-14,-9,10,6,-7,7,7,10,-8,5,-3,-4,-1,-7,-7,-7,1,0,-2,-7,11,9,5,-2,2,12,14,16,12,7,2,0,-9,-23,-14,-8,-23,-1,-11,-14,-7,-9,-7,14,11,7,44,35,19,34,38,12,1,3,9,-6,-4,8,-32,-28,-19,-41,-45,-37,-14,-17,-17,-7,4,-3,-5,8,8,3,16,21,8,14,24,-3,-5,6,-11,-15,-18,-8,-12,-12,-6,-14,-8,9,-6,1,16,3,11,3,3,-8,-5,0,-6,-13,-13,-13,-6,-15,-15,6,-5,-4,9,16,5,1,14,5,-4,7,-1,-12,-11,-18,0,-6,-12,5,14,0,-1,13,2,2,18,6,-9,-4,-16,0,-3,-17,4,12,0,-1,13,2,0,16,6,-6,1,-9,-5,-8,-19,18,-1,-4,16,-2,-6,18,-1,-4,9,-4,1,-12,-13,-4,9,-3,-9,6,-4,-14,15,5,-2,-5,-5,-1,-20,-9,-3,16,2,-5,12,3,-7,15,7,1,-12,-7,-2,-15,-2,2,18,7,-1,10,2,-5,-1,-8,-8,-18,-15,-7,-2,7,9,-15,-15,-17,-7,-6,-7,-12,-12,-6,-17,-12,-6,9,14,12,13,7,6,3,-3,2,-6,-7,1,-7,-4,3,-11,-12,-6,12,11,6,-4,-3,-2,-15,-12,-4,-7,-4,3,-9,-9,-4,15,17,12,-3,1,2,-18,-13,-6,4,5,10,6,1,5,9,14,8,-6,1,3,-21,-15,-7,-2,-1,5,4,-1,2,4,17,14,-4,4,9,-24,-20,-11,-7,-7,3,-2,-6,1,5,9,-10,6,9,-12,12,11,-7,4,1,-15,11,9,-6,25,15,5,21,13,5,23,14,7,13,2,-3,17,10,6,-1,-10,-2,-20,-21,-14,-11,-11,-2,-11,-12,-3,-16,-15,-3,-4,-14,6,-16,-19,-1,-3,-2,13,-1,-3,12,-6,-10,8,0,-5,10,-11,-10,4,-4,-1,8,-11,-5,3,-6,-1,8,-8,-15,0,-3,0,9,4,11,14,-3,3,4,-15,-9,-6,-12,-14,-1,-18,-8,-1,-26,-12,-8,-28,-12,-9,-30,-13,-8,10,-1,-6,15,17,5,11,22,8,15,24,8,17,25,12,10,-5,-9,4,4,-5,1,10,1,2,4,-8,6,7,-7,5,-10,-6,-1,-5,-3,3,8,10,1,4,-2,-1,0,-8,2,-8,3,0,-6,9,1,-2,-8,6,1,-3,1,-10,6,5,-6,7,-8,-12,0,11,17,-4,-7,-1,-12,1,-2,9,11,2,17,-35,-32,-21,38,52,20,-27,-17,-30,4,2,17,18,10,19,-44,-34,-34,43,59,28,-25,-17,-31,6,0,16,10,4,18,-31,-27,-22,27,31,16,-10,-9,-16,3,-6,6,-2,-11,15,10,0,14,-2,-8,-8,0,-7,-7,13,0,11,2,-7,3,10,4,3,0,-2,-14,-1,-1,-16,-1,3,-6,8,-4,3,12,4,2,4,1,-17,7,10,-8,0,10,-2,-3,-15,0,10,4,8,-10,-13,-24,-7,-2,-16,0,8,-3,-6,-6,7,7,11,13,2,9,-6,-4,11,-15,2,22,0,2,-4,-2,-10,-12,-5,24,15,25,-1,-4,-2,-11,-1,-3,7,11,5,-16,-17,-23,-29,-35,-30,53,45,54,-11,-9,-12,-3,5,4,44,39,36,-45,-51,-51,-19,-28,-16,25,24,28,-15,0,-12,7,1,2,44,42,40,-36,-41,-41,5,0,3,-1,18,5,-17,-15,-15,7,8,14,11,3,4,1,-11,-8,10,-1,-2,-5,-10,-3,6,6,10,5,9,9,-6,3,0,-14,-21,-16,-40,-39,-26,0,4,16,15,20,24,-14,-7,-5,3,-5,1,-13,-12,0,-16,-11,6,4,7,20,-11,-9,-1,11,-4,7,14,7,22,-14,-14,5,-7,-6,12,-4,-9,4,18,-6,5,14,-4,12,-2,-11,6,3,-5,11,4,-10,1,2,-1,-14,4,-6,-23,26,11,-15,12,-7,-33,6,-10,-15,13,6,-20,14,0,-25,2,-14,-34,1,-4,-11,3,9,23,1,6,-5,-6,-2,-8,-7,6,17,-5,18,42,-19,10,42,-3,-1,-2,-2,1,8,4,11,29,-18,-12,7,-16,-7,2,1,-1,9,-9,-9,15,-7,-5,18,-1,-4,6,1,2,-6,-10,7,-2,1,4,-2,-2,-4,6,5,-7,4,3,-7,-2,-11,17,0,-15,-18,-24,40,25,30,-9,-20,-18,1,4,1,-32,-12,-22,54,43,41,0,-19,-9,-29,-29,-25,5,16,20,20,6,8,29,5,16,-60,-74,-65,37,39,43,-5,-2,-2,21,-7,9,-21,-38,-19,6,22,14,2,25,17,-22,4,-11,3,6,-15,-5,2,-13,-2,12,-8,0,13,5,-4,1,5,-8,19,4,-5,28,29,-18,18,19,-20,1,-2,-9,-2,-10,-30,1,8,-57,-26,-3,-47,-28,-10,4,1,3,15,4,-1,-4,-8,4,12,-3,10,48,16,25,38,1,-1,15,-7,-4,23,-2,2,31,-7,-15,23,-17,-19,6,-20,-21,-2,-1,4,4,-4,3,9,0,8,10,-6,-3,17,-2,-11,14,-2,-20,-12,-8,18,-15,-9,20,-4,-1,23,7,6,20,-2,-3,-5,-12,-3,26,-27,-15,17,-22,-10,19,-10,-3,16,-10,-3,0,-7,-1,19,-16,-6,16,-12,-5,16,-7,-5,6,-6,-4,-9,-5,-4,8,-6,-2,9,1,-1,4,7,0,-6,9,5,-13,12,6,4,8,15,10,-22,-12,-15,-34,-20,-19,-14,-2,-3,9,0,7,13,18,21,3,21,22,-31,-11,-6,-24,-5,0,8,-10,-1,-12,-20,-13,8,13,13,16,28,24,5,21,22,3,-11,-5,1,-15,-11,18,6,0,3,1,-8,-1,2,6,-1,-5,-8,8,-5,-8,22,7,-6,6,-4,-13,-4,-12,-6,-12,-9,-10,-9,-9,-11,-6,-6,-9,-6,-7,-10,-6,-9,-10,-11,-8,-8,-9,-7,-7,-5,-3,-4,-5,-4,-4,-6,-5,-3,1,4,4,-1,3,3,1,4,4,-2,1,2,-2,1,5,-3,3,6,-5,1,4,-5,1,3,-4,0,3,-3,2,6,-9,-2,2,-6,1,3,-5,1,1,-5,0,2,-5,1,4,0,-4,-1,20,6,2,20,-2,-17,7,-1,-15,-10,4,-3,-12,-1,16,-1,-4,6,23,1,-7,22,0,-13,10,2,-2,-32,-10,14,-25,-13,11,16,3,6,25,0,-13,19,-1,-10,-27,-2,11,-41,-18,8,-1,0,11,17,-3,-12,26,0,-13,-6,10,10,-22,-1,16,-16,0,17,-3,-3,7,3,-11,-8,-19,14,-3,-12,13,2,-5,1,9,4,-1,-4,10,0,2,-20,18,2,-15,11,14,0,-6,11,20,-11,-17,17,-12,-13,-27,23,10,-18,15,25,4,-14,5,30,-16,-29,26,-13,-21,-33,21,14,-20,13,32,4,-19,2,32,-19,-29,22,-11,-18,-24,19,8,-11,12,22,5,-16,-3,25,-11,-16,7,-7,-11,-6,-3,-14,8,11,2,4,5,-2,9,-1,-1,17,-4,2,-10,-1,-9,-9,3,-4,-26,-13,-19,11,11,14,16,4,12,-11,2,-1,-19,-1,-5,-32,-12,-17,19,23,23,8,0,4,-16,-4,-5,-17,1,-3,-13,4,0,17,19,15,1,-8,-7,-16,-5,-5,-2,14,11,-3,10,6,10,7,3,-2,-11,-13}
+
+#define CONV1_BIAS {-49,-18,-7,-20,-12,-15,7,2,-10,-84,-72,-65,-53,-6,-87,-63,-64,-28,-28,-4,-3,-10,-52,-15,-5,-7,-31,-44,-102,-19,-5,-65}
+
+#define CONV2_WT {-3,-9,-16,-14,8,-17,-10,-9,-20,37,-11,-5,-21,9,-22,-10,-11,18,4,12,8,22,11,6,-14,-6,14,15,8,-15,-6,-10,-23,-32,-11,-11,34,29,-5,-14,-13,14,-31,-17,-12,8,-19,-9,11,12,-8,31,35,24,-35,7,16,-16,-6,20,3,0,9,-28,-36,-21,-12,-35,-6,-2,-10,23,10,-2,8,7,-16,26,-13,-15,-11,17,-3,16,15,-2,-23,-9,-23,-13,6,-21,7,-4,-40,-38,-24,-20,-3,-47,19,26,-3,4,21,-4,-4,9,1,24,14,-17,-26,-1,-12,38,19,13,-7,-18,49,1,1,-11,8,-7,-26,-8,-30,14,-28,-49,11,-15,-16,-8,39,1,6,-16,-23,35,4,-8,-7,-15,-15,33,14,8,-9,-2,42,27,4,15,0,-25,-13,-22,14,-10,-44,-35,-10,3,-16,14,-10,15,8,14,6,26,0,10,19,23,15,5,-8,12,-23,7,-18,14,8,19,-4,-9,4,-5,18,-31,-44,-35,9,7,-23,7,13,0,-15,-17,-2,25,-6,13,9,24,-12,26,16,35,-34,-14,19,10,-8,8,-8,16,14,-17,5,-21,4,-17,-20,6,7,43,26,-4,-2,-10,-1,30,-17,-9,-16,6,9,24,15,-21,-41,16,-4,34,11,-8,-6,-4,-39,-19,-3,-39,-21,-9,-5,-1,8,27,10,-21,-7,-3,-5,-10,-11,-11,-36,5,-14,12,6,25,-21,0,2,43,12,-13,3,-14,-36,-11,-10,-35,-37,-49,-7,-13,4,24,22,-16,-4,-16,-17,-18,-8,-13,-22,7,-30,34,6,23,-15,-22,26,3,-7,-5,-3,-20,-8,-22,21,-44,-34,3,-18,11,-27,15,15,4,25,12,-5,33,-4,-3,-5,26,-4,-8,-17,-1,-43,7,-17,13,2,-23,-23,-13,-21,-9,26,-46,-47,-5,-5,8,-34,2,28,-4,33,-23,-24,41,-15,-9,-15,5,-12,-17,6,-21,-20,-22,-6,8,-8,-22,-26,8,-1,-17,25,-28,6,-6,-35,15,-21,48,30,-5,24,-12,-48,70,-13,-12,-1,11,13,3,17,-4,-25,-9,-33,1,13,-18,-16,-15,4,-24,37,-26,11,-39,-17,8,-4,22,23,-7,21,-1,-31,44,-7,2,-37,-5,13,-16,1,2,-37,-20,-3,3,5,-14,-5,-32,-26,-2,10,16,20,-51,-9,12,4,13,9,-24,-19,13,-15,50,9,6,-16,-27,19,23,14,35,-12,-30,39,-18,2,2,-10,8,-17,29,9,-15,-12,14,-19,23,-7,-28,8,6,3,-15,-12,-26,20,-3,13,35,23,-25,-59,-32,10,36,-40,12,8,-8,15,-22,-20,1,-2,-54,-32,-14,-16,10,-30,-54,-3,0,-2,-10,-22,-10,5,-15,-5,26,23,-40,-60,-6,-12,9,-28,-21,4,-29,11,4,-28,8,4,-26,-17,-25,-44,30,-4,-24,18,4,13,-5,-23,15,22,-7,3,33,48,-28,-22,-20,-22,-3,-58,-2,32,5,22,2,-10,-20,-5,-23,15,-36,-28,42,-1,-41,12,1,18,6,-35,-4,14,1,-27,44,35,-36,-21,3,-21,-11,-50,-7,14,7,21,34,-24,1,-2,-12,3,-27,-8,54,1,-10,16,-24,-13,-27,-35,-10,7,-12,-5,21,-19,6,12,4,9,-42,-6,-16,7,-20,4,35,-9,63,-7,7,35,16,-10,49,13,-26,55,-28,17,-35,16,-40,17,4,19,32,-2,-20,-51,-21,5,38,-8,26,-2,-17,25,-6,0,21,-11,-20,8,-1,0,27,8,-38,14,-28,11,-44,3,-32,-19,-8,-25,15,-9,-16,-47,1,7,-10,-8,7,-4,1,20,-9,-34,16,-9,-26,22,-3,-15,49,11,5,21,-27,28,-31,15,54,10,-16,-21,-1,8,-2,-10,19,3,-1,-24,4,15,10,24,17,7,-13,-25,-27,26,-12,-3,9,-2,-5,34,-12,15,-23,-16,26,-2,-10,-1,-11,6,23,-21,15,6,20,-19,17,-2,-20,11,34,-8,16,-19,-3,38,-2,9,28,-9,11,21,-8,-22,-48,-15,-15,-13,-7,24,-7,-3,35,-9,18,-8,13,-1,27,-21,6,-10,36,41,32,11,12,-3,22,-10,13,-2,-16,7,-16,25,31,14,-11,6,7,6,7,-5,35,-22,3,37,18,54,-9,2,-17,-5,-6,2,-7,-1,2,21,8,5,14,8,-48,-18,-3,-3,7,-17,-11,7,23,-6,-21,-25,22,-26,21,-19,13,77,7,-29,-2,-12,1,23,-18,-4,-18,34,-1,2,19,53,6,3,9,5,4,-7,36,5,-6,-11,-18,-5,49,-15,9,15,-5,17,2,-14,12,8,42,59,-40,39,-9,7,-13,-3,-22,47,-9,-7,8,32,2,-27,0,-33,14,-10,-27,0,39,-15,23,-3,0,10,22,-7,-23,5,17,1,17,26,26,37,15,3,18,21,-20,3,-4,23,29,-7,-21,-32,34,-6,-2,42,22,-7,34,11,2,30,34,-4,-14,-7,10,14,4,6,-9,-9,37,-39,0,2,-14,21,-45,-5,28,-2,-6,8,0,16,37,-8,3,-25,30,57,15,-16,-28,-8,-17,4,-2,-45,1,-8,10,-18,1,19,-17,-11,-39,3,-27,-6,14,-19,-14,12,16,-25,-24,-41,-26,-11,57,20,-12,16,-3,-17,-1,-9,-3,-1,-5,-38,43,4,16,1,-14,38,-3,17,-5,-2,16,-16,17,26,-2,4,-17,-16,0,-10,47,25,9,22,13,-15,-16,14,13,-1,-17,-5,32,-13,5,5,-33,4,-34,9,12,10,18,-1,-27,-21,-1,1,7,11,-21,-13,4,6,12,46,24,-25,-4,-1,9,-25,18,-1,43,12,17,-15,0,-11,-22,-2,-3,-4,17,-30,3,-22,7,-12,22,34,-48,-22,-8,2,2,-6,18,-14,15,-5,16,-2,1,14,-22,-14,4,-42,-19,-30,11,16,10,-28,10,22,-7,5,-30,-14,46,29,-53,1,-37,29,2,-74,12,13,20,0,21,-40,-2,20,-10,-60,-16,-38,-4,-24,-13,0,-6,-12,-14,-1,-16,5,-20,-42,4,30,-110,-3,-6,5,-6,-85,-1,6,8,-12,-2,-49,1,-14,24,-17,11,-50,21,18,25,38,-22,-3,-16,-1,-15,24,-19,3,-16,39,-52,3,-24,3,27,-64,37,18,26,8,-4,-43,2,2,-14,-70,-15,-58,-6,-23,2,2,-8,-2,-4,15,-18,3,-27,-20,-21,3,-115,3,-33,31,14,-50,28,-13,20,8,-55,-51,34,19,-16,-42,17,-62,-9,-24,11,5,5,22,1,12,34,16,-19,-19,23,18,-103,-13,-34,29,12,-57,-2,3,17,0,-17,-25,8,9,-22,-36,-20,-34,15,-9,12,32,31,-18,17,-3,-23,-8,-8,-1,31,15,-37,-3,-29,24,1,-34,24,28,-7,11,21,-18,-21,18,-54,-68,-35,-63,-1,-19,-1,-16,8,-19,0,-19,-18,-2,-17,-39,16,3,-68,-21,-75,-17,-3,-80,-12,24,-46,4,-7,-45,-6,5,-26,4,-1,-44,39,20,18,4,-8,-2,12,-5,-16,12,-5,-17,15,6,-17,1,-75,-25,3,-51,12,46,24,19,-3,-31,1,23,-11,-58,-23,-88,8,-5,-24,-6,18,8,7,1,-27,21,-2,-32,11,-18,-88,2,-74,-6,-15,-52,-7,24,6,11,-36,-48,19,19,41,-13,3,-67,6,20,-13,31,33,5,-7,-4,4,20,11,-2,23,17,-43,-6,-49,-12,-15,24,19,34,42,8,-9,-27,-5,20,39,12,-17,-18,20,6,3,31,-11,-5,-26,26,16,-7,-5,-2,23,29,47,11,28,13,8,35,21,-7,21,3,50,1,-23,2,3,-10,-8,-36,-9,-25,-4,28,-23,-25,-17,-11,-1,2,-11,-2,-7,32,-5,1,-16,4,8,-12,-12,-5,3,1,-4,-19,-6,-28,5,17,-1,-2,19,9,32,17,-17,-18,13,-17,10,8,-5,1,-23,8,27,34,-42,-25,-8,-4,16,10,18,23,9,29,-5,3,6,-46,-26,-50,-37,-13,6,-27,6,1,-8,15,0,-7,5,-10,-18,4,-9,28,-14,-30,6,36,19,1,6,15,-35,-17,10,26,21,-1,15,6,-25,39,4,4,10,22,-26,23,-1,-1,13,16,9,-9,-2,21,7,2,22,56,25,28,36,19,4,26,9,-7,3,15,-6,-21,5,-14,28,-4,-44,8,-16,16,8,14,-18,25,-21,-5,13,-12,5,-2,4,-5,-13,27,-11,13,-13,-10,11,4,20,50,-10,11,12,1,42,-7,-5,-12,14,4,-4,3,-28,24,-14,14,0,-14,9,-1,-22,2,13,15,-17,21,3,-16,9,31,43,-36,12,8,-4,15,40,-6,29,12,14,-4,-5,-2,-37,38,-14,-5,-1,-23,-5,12,-21,-8,35,15,-4,21,-13,20,22,12,-22,-26,-13,-7,7,-26,17,-26,-11,3,-15,8,17,0,-15,-5,-16,4,-31,-3,-68,0,-22,-38,16,-3,2,21,-26,-45,10,15,-3,-25,-25,30,20,-20,15,22,-13,-5,10,-6,20,4,5,1,-12,-13,-12,-4,-2,19,-37,-17,0,17,11,15,-3,3,29,-7,-22,19,9,-11,5,19,24,-17,-51,23,-31,-19,22,4,-17,14,-30,-22,-20,-15,14,22,-23,5,-17,25,11,11,-2,-17,-6,3,4,32,14,3,13,19,12,-33,0,31,-13,0,3,-20,-30,0,16,11,-19,-33,-18,6,8,25,-2,13,-1,5,-18,-18,10,27,44,-11,22,2,-22,26,11,4,38,22,-9,11,-7,-31,-16,14,46,-2,-38,-24,-19,-3,30,24,59,-15,-20,-2,-29,43,2,0,-31,-21,-50,-27,-21,2,16,-5,-4,25,-12,-5,-15,-10,-10,-14,6,13,-55,-18,13,17,8,-43,31,-28,0,0,5,-37,-8,25,-25,-22,-61,-10,-14,25,18,38,-17,9,12,9,24,-21,-15,0,1,4,-15,-9,-18,34,-21,-8,14,-15,16,-18,25,-1,15,-15,-16,-14,5,18,-37,24,23,-17,-20,9,-22,-15,33,2,13,18,-7,-32,-2,-15,-14,3,-26,-3,1,10,-8,-24,12,37,-28,-12,52,-19,11,-6,-2,18,2,-6,14,0,-11,-4,-27,-10,18,7,33,10,-13,3,-22,7,-13,35,32,-5,-29,-18,-18,54,-16,-13,66,11,26,-9,4,14,-40,1,36,-16,14,-20,-58,-20,9,-17,26,2,-25,-10,-16,-16,14,24,74,-26,-11,-9,-27,99,13,-24,-29,-1,2,-4,-18,8,-11,21,-2,-3,6,-13,-6,18,-13,-14,-1,25,-49,0,12,-10,1,0,23,-19,-5,-17,-7,20,-13,-10,1,43,-12,-5,0,36,9,44,24,-1,29,10,30,-6,-3,-12,33,9,-14,9,30,17,-38,-12,23,-29,22,-33,5,12,-1,-27,17,-9,-9,5,-5,2,12,-5,14,9,-32,10,33,15,36,-2,23,-5,-19,-8,-18,-3,24,-14,7,25,3,-15,33,27,6,-32,13,13,7,-14,-2,-25,-40,4,7,-5,-17,-11,-30,27,17,21,18,-18,-12,4,-20,-4,10,-40,0,0,-53,-10,26,58,-13,-42,5,34,23,20,-3,-58,-98,-17,25,-17,1,-30,-74,-5,19,4,-19,-48,12,15,-22,-17,1,23,-43,-13,-32,-7,10,74,-7,-53,15,22,12,10,8,-87,-83,-1,9,-23,2,-40,-21,5,-12,19,-31,-12,-7,24,-20,-42,-23,-13,-65,-21,2,-10,-22,1,-34,-24,9,34,-10,-9,24,-70,-60,25,14,-7,7,-21,14,0,12,-25,7,-16,1,25,20,4,-18,-24,-51,-30,2,-14,6,6,-12,5,15,-9,-12,-26,31,-2,2,-7,-23,14,-30,25,-10,16,56,-6,2,9,10,-5,24,7,-16,20,41,18,-5,-8,9,1,10,-15,7,0,13,-32,-8,-23,-37,-7,-19,-7,-9,-3,-30,4,32,14,-12,-17,10,-2,2,11,-35,-4,7,2,-37,-5,4,30,-14,-52,25,41,1,11,-41,-48,-70,6,1,-31,-24,-35,-22,-22,0,8,-20,-24,22,25,23,-6,-15,33,-45,-11,-17,-1,7,27,10,-60,26,25,0,27,-34,-63,-63,4,-10,-17,-34,-35,6,-5,-14,20,7,-20,6,37,9,-25,-16,28,-32,-28,-15,-5,3,4,-48,-30,12,8,3,8,-24,-58,-65,1,-4,-9,-36,-8,3,2,67,-35,32,-13,10,36,38,6,-28,50,-25,-44,-9,-12,0,-11,-35,28,18,-7,-13,7,28,18,-13,6,6,7,0,-26,7,59,18,-2,22,6,-2,-80,33,6,-11,4,-5,-9,33,-2,-16,-20,-19,16,12,1,-36,-6,22,45,-3,-28,32,17,-7,-55,6,1,-5,12,-11,-15,21,-37,23,-14,-5,-4,2,-8,8,4,36,37,-53,-16,3,1,-11,-1,48,13,25,-22,23,0,-23,-49,17,-22,16,4,-8,1,4,-14,-23,4,16,-26,18,-21,12,1,7,-6,24,-6,30,-44,-25,-50,-11,-35,22,-37,-21,-6,8,-8,16,14,13,-9,-9,23,-50,-20,-55,7,1,-50,16,19,12,3,-16,-36,5,18,-4,7,15,1,17,1,17,-33,-16,-11,16,1,-16,20,3,-12,-23,11,-2,-17,33,13,3,-15,18,15,48,-6,-12,-27,10,-35,33,6,-7,5,10,14,-26,3,-15,23,-10,-27,-2,22,20,20,6,21,46,-44,8,-10,-14,38,-26,-34,22,-35,-14,-6,-33,-18,19,35,-1,42,14,46,-19,-1,22,10,-25,-37,16,-21,-2,23,-18,33,39,-9,58,-29,23,61,50,-24,11,-25,14,30,-47,-37,-6,-3,-19,18,9,2,10,-5,-20,5,-20,-18,17,-15,3,-3,0,27,23,25,13,1,34,8,40,-19,-15,-11,26,-15,-24,-15,-6,-54,-33,-41,-3,-36,-2,-25,-30,0,-3,-7,-1,-16,5,-7,17,-3,-49,-5,-65,14,-8,-49,14,25,-14,-4,-2,-52,-2,-4,-37,-22,14,-6,11,-21,14,-8,-21,6,3,-25,-1,-12,26,-5,12,-22,-9,5,-45,5,-7,-46,19,26,14,-17,-9,-48,3,-51,33,25,-8,52,-32,21,2,5,0,5,-4,-8,-7,-17,-14,-13,-14,29,28,-25,39,8,-16,30,11,-11,-23,6,8,18,5,-17,22,25,3,47,18,30,18,-8,-12,-6,-28,1,8,-22,-7,-22,6,33,9,22,48,-17,30,74,9,-20,-38,5,3,46,-2,-14,-24,-25,-7,8,-2,-13,31,-22,-33,-3,-12,-3,-32,-10,9,-17,6,38,8,28,-7,26,8,-10,-9,-28,-43,21,28,-4,-22,18,-26,-58,-29,-35,-7,-36,-4,-21,-17,-12,-2,12,25,-10,-1,-8,11,19,-46,17,-60,10,-29,-67,-33,16,-20,19,-14,-56,-6,28,-43,-18,11,-19,7,-20,-3,-16,-33,0,-9,22,36,-4,25,-2,11,27,-24,27,-43,-14,-27,-31,-3,11,-8,8,-16,-47,-2,-26,-17,-4,-22,-9,-17,16,-3,2,5,-15,-19,3,-24,-34,-21,-30,-28,25,-17,-35,-14,2,11,-3,22,7,-8,15,-38,-22,14,-7,-12,-8,-12,6,15,38,14,26,-5,-16,-15,14,-2,-39,-12,-25,-10,13,-48,12,-17,-1,3,22,-18,5,5,12,-34,0,-2,0,-21,-33,15,-6,-29,-6,14,2,-7,-13,-6,0,5,-17,-6,4,1,11,-11,20,-28,25,7,-36,3,5,-4,14,2,-27,-9,35,-16,-52,-24,-25,-16,-42,-20,-5,6,-10,-14,15,-2,-12,-3,0,-7,7,-31,-3,-47,13,-18,-51,-3,31,-15,13,-13,-49,1,38,-20,-20,-22,-22,14,-36,-2,-5,-6,5,-40,25,16,-9,1,6,1,18,-33,22,-29,-8,-19,-38,6,18,-4,11,-9,-57,13,-5,10,26,10,-5,-48,-13,-11,26,23,-34,-16,7,-2,-28,-22,17,13,-16,36,-10,13,-2,-22,48,29,10,-24,-25,-31,-26,-7,-11,6,11,35,28,-15,-1,19,48,17,-9,-21,31,3,-40,-19,11,11,-4,9,20,-21,-8,-5,23,16,6,4,-16,-22,-4,-2,-14,-30,-46,24,11,-23,-29,11,36,7,4,-9,-4,13,-29,-14,6,8,24,27,1,-9,17,10,-12,25,9,-3,-4,6,-26,16,15,-3,-41,-15,-11,-21,-32,-5,-2,19,1,-6,3,16,-16,-13,-4,9,6,-17,-3,-40,-4,10,-25,-27,37,-9,8,13,-53,3,18,-22,-9,-42,-26,-11,-28,21,-16,6,2,-26,14,5,-14,-18,-8,-6,7,-35,18,-21,-15,13,-30,-3,23,2,4,-3,-56,26,-29,-17,-2,13,10,-55,-25,-44,-63,21,9,5,24,-32,-46,-23,-24,-24,-7,-41,2,-26,8,-23,6,17,-8,17,7,15,14,30,-9,-27,-15,43,31,7,-7,-30,-1,3,1,-14,-6,-12,-28,-26,5,4,1,20,25,-41,5,-5,-21,2,-12,8,7,-21,38,10,-1,-17,-6,47,22,-2,16,-27,14,-36,10,-13,-6,10,-2,-27,24,42,-9,20,2,-23,14,7,-16,31,-11,0,15,4,46,26,19,-10,-15,21,-21,-7,17,-32,1,8,-1,-29,2,-14,-30,-10,28,25,-10,8,-24,-34,5,3,-15,11,-25,11,10,9,-13,-1,1,-9,5,10,13,26,58,-14,23,-13,-13,-13,-26,-26,-6,-30,24,-12,-1,23,-29,4,7,-12,23,14,-20,41,5,27,-12,15,-10,6,0,-2,16,-24,-21,3,-21,9,-14,-38,9,-11,-3,-3,-12,0,23,0,-4,4,20,15,18,15,-26,21,3,-10,-8,40,19,-23,-1,10,21,7,4,-18,6,-2,-27,-32,-19,-9,-16,8,2,-32,16,34,20,5,17,10,14,-18,-40,15,-5,-15,45,-11,25,3,26,27,41,21,6,-22,23,-32,0,31,-22,2,15,-16,1,-30,-6,23,13,-2,28,18,17,19,-15,8,4,-9,64,48,64,-10,-16,13,-5,-7,-12,-10,-6,5,-9,20,-32,-32,15,-25,26,-15,-18,14,-13,-26,24,-5,25,4,-14,18,3,9,26,17,38,19,-25,38,-17,-23,-2,0,-15,-18,-2,-20,-14,-6,-11,-27,20,-11,-6,11,-41,-21,-4,-8,25,-2,-17,-42,-13,-1,-15,7,5,19,8,13,39,40,-10,17,-33,2,-10,-10,0,1,22,-16,31,52,1,-7,-9,41,2,19,5,-23,-8,-19,17,5,15,36,55,9,2,-4,12,25,14,-3,-14,14,-16,-8,1,48,4,7,7,18,12,4,19,23,-51,17,18,15,-25,30,9,-10,44,-14,60,-2,5,3,22,43,16,9,-13,15,32,20,-24,11,3,-2,-27,8,6,-18,20,-4,-23,-1,27,47,3,20,6,-15,53,50,75,-9,15,-14,0,31,2,10,-13,23,15,18,-22,-19,4,9,8,18,-14,9,-1,-28,-3,-19,-13,20,23,32,2,-7,-12,20,10,32,-10,48,-8,-6,6,18,-34,17,22,16,23,-17,-20,12,7,18,-12,9,-45,-11,-30,2,13,2,22,10,0,-29,-31,28,-7,-13,-25,9,9,53,13,17,-38,17,-26,-17,-26,23,27,-9,14,12,-23,-28,-3,24,-9,-20,-11,0,2,17,-2,-13,7,31,25,-9,-42,-37,-27,7,14,17,-27,28,-27,-13,-2,7,-2,-20,3,-19,-3,-17,6,-62,-44,-8,-40,14,-12,-3,-9,-20,-1,-36,58,-3,-29,-17,-33,4,-25,23,-36,37,-1,18,10,18,-3,-7,-22,-11,-27,-50,15,-60,-10,25,-37,17,8,4,2,-29,-9,-10,57,13,-21,-12,-40,17,-13,11,-49,2,-9,-1,6,-6,-6,8,6,14,-33,-28,-14,-61,9,11,-18,18,36,-25,-1,-28,-32,-28,-10,42,-13,36,-8,26,-14,33,-62,-24,-12,-22,8,3,-22,13,11,19,-55,15,-30,-75,-11,7,43,-17,34,16,-10,-21,14,8,-8,10,-14,-12,-24,60,-1,5,11,-7,-4,17,16,46,30,0,-15,49,32,-9,-27,5,-21,17,-9,19,-23,28,-5,35,-5,-1,24,-27,-53,10,-21,41,13,8,-16,-3,-22,31,9,25,34,-10,-8,36,15,12,-9,-29,-20,9,-33,14,-31,-9,0,-19,-8,-9,30,15,-16,-10,-34,21,-51,19,-48,22,-1,23,20,5,24,-6,12,29,2,-24,4,-50,-26,0,-9,34,-15,1,-2,-12,-45,-15,26,20,24,-6,-44,34,-4,14,-71,28,17,10,17,6,2,6,24,12,5,-12,-25,-39,-4,21,-1,10,19,-11,-4,-7,-21,-6,-15,25,26,34,32,57,3,17,-54,6,11,13,4,-10,-12,24,19,-5,-5,39,-37,-59,4,35,-10,-21,34,-11,-7,-48,45,17,-17,27,10,-19,-17,-10,-19,-11,-1,-37,21,-24,-15,6,4,32,-14,-6,-14,4,-18,10,27,5,-18,14,22,6,-13,-12,-41,-1,-27,18,33,-25,10,10,-1,3,-16,-24,12,-3,-7,-7,-6,11,-27,-22,-8,-4,-3,-8,-5,13,-10,18,-9,-2,-20,11,-11,19,-6,20,54,-10,13,32,1,-21,-10,-8,17,-26,-4,-12,35,7,-14,-23,-10,11,-13,1,-8,-15,12,-3,-7,-3,-3,24,21,14,9,-21,12,2,37,-31,-3,3,4,2,-7,-20,23,-16,49,2,-9,-8,-16,3,-26,14,-14,-21,-18,-17,-17,26,8,14,0,-4,30,-8,22,13,-2,-24,-16,-6,3,10,-9,-23,-3,-23,27,3,-19,-6,-41,-9,-4,-13,-17,-25,-40,-45,-16,-1,17,6,-22,15,-25,37,12,-3,-5,25,-11,-1,23,0,42,-8,0,4,-1,43,-6,46,16,-21,-1,-7,17,8,21,39,-5,0,0,20,-42,10,-26,13,20,-19,-13,22,9,20,6,3,13,-1,-14,-20,-7,10,-13,19,9,-18,2,-5,-27,-3,-15,14,1,10,-20,37,-31,18,-3,6,58,7,-12,19,6,-2,-26,27,18,1,-7,-21,6,32,-5,-13,-8,4,-15,-1,-32,8,27,-41,11,-9,-18,32,-11,-37,6,-8,9,36,11,-30,4,-1,-4,-3,-21,-13,-6,-7,-20,-17,3,-13,2,-11,-18,-16,2,32,-4,-38,-2,-11,-1,-5,-8,-24,14,0,-9,21,-9,-13,-8,8,5,11,-50,-21,-19,-19,-25,-36,-11,-16,-1,-19,5,-40,-8,3,-9,-51,-8,-24,8,-26,-13,3,-13,36,-14,18,-5,36,-52,1,45,15,0,-30,32,-21,24,13,-2,10,12,-52,19,-40,6,-51,-46,44,-47,-19,33,23,-37,7,-11,34,30,11,-17,19,-35,11,6,31,-14,-23,17,-21,10,16,12,-16,-28,-24,17,-32,-11,-30,-12,22,-28,-16,17,28,-17,7,3,-11,108,32,1,18,3,-6,-43,30,-1,7,0,-10,-22,33,6,2,-24,-7,-10,-44,-18,37,21,-26,4,12,7,7,-31,-17,-3,17,34,27,-17,-21,3,-2,-35,1,4,13,11,7,-49,-11,5,13,12,-17,-30,-34,8,22,19,-31,6,12,-10,-29,14,3,-6,15,22,33,-15,-14,0,14,-15,-6,-12,15,-39,36,-52,-35,0,-13,47,-38,3,-26,-29,47,28,4,0,8,-24,-17,9,9,-3,17,-28,10,-29,-11,-49,13,24,22,-16,-10,33,-11,0,-8,-28,-17,16,-51,18,-45,-18,-7,-34,16,-30,-25,47,-26,-27,-2,-10,24,18,41,-12,-38,-27,10,-20,33,-1,-23,29,-22,-6,-1,5,-9,-24,-25,0,-44,-20,29,42,1,-18,-29,28,-12,-9,-11,-6,-17,49,45,2,-23,25,-18,-41,-2,14,0,0,-12,-31,21,11,-11,-42,-26,-36,-36,-3,46,15,-8,-8,-24,18,-26,-5,-20,-5,-27,23,27,-17,-21,16,-20,-25,-26,28,-3,13,-9,-37,-10,8,6,-11,-10,-41,5,15,36,38,-21,18,-3,-21,-22,19,48,4,0,-4,7,-9,17,25,4,-27,-18,28,12,-22,49,-30,-19,15,6,7,-25,13,0,0,1,18,6,35,-10,-40,8,26,42,-21,-8,-18,50,-23,1,-37,35,14,6,-29,13,22,-30,8,-13,-5,-8,2,-9,-9,-14,-23,36,-4,25,-36,-15,21,-34,-30,-11,-28,7,9,78,-15,-22,-7,3,-9,-1,-6,-8,16,-15,3,-7,51,4,0,-13,-34,-35,5,32,24,-7,-21,-16,6,-9,-5,-22,-4,4,6,42,-9,-30,29,-39,-9,-7,14,-23,-42,-6,-20,-15,25,-5,-19,-19,-33,-21,-32,8,0,8,-13,-10,5,-25,26,-51,19,8,-5,-12,-14,-25,26,-33,-12,-33,24,-35,-29,8,-22,-34,-9,0,-16,-22,-25,1,33,0,34,16,4,-10,-2,1,32,33,23,-16,14,32,0,25,42,18,0,-14,32,4,-10,33,-14,-15,12,-14,25,10,30,-6,35,11,36,27,17,4,-8,3,40,44,14,-3,-8,36,-16,-3,-2,-11,11,36,-5,2,7,-39,-27,-28,8,-26,-14,4,-8,-22,-26,-6,-21,15,-2,0,-5,49,-2,31,10,-50,-4,-2,24,9,2,6,-11,12,-12,-16,22,-35,-16,-26,22,-20,-20,25,13,-5,25,-20,-45,-25,13,17,-15,33,-11,20,7,-59,5,-17,17,6,-11,1,-8,2,-25,-10,26,-27,-7,-14,-5,-10,-15,-11,-2,-41,8,-13,-55,-38,28,44,-23,12,-28,-8,13,-14,-27,6,-19,3,-14,-9,-18,28,-26,-16,23,-24,-31,-9,-10,-6,-16,-26,4,-45,10,-12,-38,-31,19,4,-26,-1,-9,5,-8,16,-4,37,20,21,10,-7,-14,27,-27,-32,37,-30,0,23,-24,-16,2,15,9,16,1,-17,-16,-4,4,26,-29,13,23,32,16,-5,-10,27,-5,-17,-12,-2,22,16,-3,21,30,1,0,-18,4,-5,-23,-13,8,-22,-15,-35,8,-25,5,8,7,27,-28,9,22,-16,-34,25,3,11,7,0,30,25,7,1,27,2,-17,6,1,10,-12,-20,18,-40,12,-21,-37,-20,12,8,1,21,-20,-12,-1,-47,-59,-23,-6,32,-2,-11,15,4,-15,0,-16,-11,4,6,-9,11,-7,-5,4,-95,10,4,-77,-6,4,16,-8,-15,-50,-28,1,-6,-61,-17,-40,8,-13,-14,-18,12,-2,-4,-28,-13,-16,-6,-12,-6,4,-28,0,-91,4,18,-65,-13,6,-8,-5,-2,-43,-10,0,-2,-20,28,6,11,-11,3,-13,-10,-8,1,-5,-2,11,69,-13,-1,10,6,9,-23,10,8,-23,13,4,-1,-1,17,9,29,8,38,5,-20,2,-18,-32,-12,6,-7,6,3,-31,-27,1,-14,10,-46,3,-6,-6,34,6,-2,2,3,-16,6,7,13,11,30,-1,-5,7,2,32,20,-10,-15,14,-11,28,-30,0,23,-9,-2,11,1,11,-1,16,24,0,-22,14,-7,-18,-8,13,4,47,8,1,-11,-14,-29,-21,34,2,-28,7,-37,40,2,6,58,-9,8,-31,11,26,-37,-2,-24,-7,7,-31,31,-13,-10,7,-21,-19,-17,-14,2,-24,-44,-28,0,-17,-32,0,11,33,-2,-32,24,-13,-2,-16,9,5,-8,-7,-55,-12,18,-10,33,-13,-24,13,0,-21,-11,21,39,10,16,10,1,28,-11,-6,0,0,2,-16,-26,20,60,-18,13,23,8,-11,3,-2,-4,48,29,-9,-12,14,6,36,-17,-11,2,-3,-1,-11,-26,-7,18,-23,-7,9,-21,11,-28,36,16,-3,7,-13,-1,-19,-20,-9,36,34,-18,-21,-7,-5,-4,28,14,-17,24,-9,21,13,-6,19,34,23,19,8,-44,9,23,8,-4,6,33,9,52,8,13,14,35,26,-31,-25,0,19,-8,33,-19,-13,-7,-9,13,23,50,26,42,11,14,46,8,18,37,0,-6,9,65,42,10,31,-24,-10,49,-7,25,1,46,10,1,-11,-9,-7,13,-35,-30,2,39,9,27,25,18,21,5,-18,-10,35,-27,-15,60,39,17,6,11,-36,34,-43,24,4,1,18,2,-16,15,33,15,40,16,33,23,37,18,28,7,7,5,8,-21,37,20,2,22,19,62,0,40,-22,1,15,-3,-14,-14,11,3,36,-18,-8,7,-57,9,36,-11,-22,-17,-16,38,-19,-10,5,-32,16,-1,3,-13,-53,36,-19,-5,0,8,54,-13,-34,-38,8,-34,-7,40,-30,-36,-19,-23,7,13,10,-2,-27,25,4,5,-34,17,-4,-14,5,-13,-4,44,-9,-22,4,26,15,-20,-23,-30,10,-22,11,-11,-23,-10,-13,3,50,41,-1,8,-1,-4,28,19,0,21,-14,10,44,-6,12,33,26,-34,-13,36,36,9,-2,-3,-4,3,-27,-27,-19,9,-30,-41,-9,29,-5,-7,10,-5,6,15,-32,18,-14,-2,-3,16,-13,1,5,-29,9,28,-24,2,14,-26,1,16,-62,6,-15,-5,7,-56,24,40,25,16,2,-20,-25,4,-7,1,-8,4,-16,17,-20,44,1,64,-35,2,-3,5,0,-26,-9,-12,1,-2,18,-15,8,9,-24,15,11,10,-27,3,14,-9,8,-25,-16,4,6,-4,23,-19,-11,-6,11,-18,-6,16,36,-15,-10,-3,11,-3,5,1,-2,3,-9,32,18,21,-6,11,-6,0,17,-27,30,-21,5,-11,-7,2,-3,11,-22,-2,27,-13,28,37,-17,-20,-10,2,9,0,-17,17,-4,3,-25,1,-13,3,-8,4,20,-53,16,17,-7,1,-9,17,-33,7,23,-9,10,-18,11,5,-10,-2,2,0,15,13,11,5,0,-22,-23,-17,-9,-8,-1,-10,-11,-32,-8,25,-4,10,-23,-5,-9,-5,27,-6,28,7,-2,-40,-8,-10,9,-1,-4,6,44,20,-10,-20,-4,7,9,39,30,-17,-16,-15,-11,27,-17,0,27,-17,-11,-22,7,10,21,14,-1,10,10,7,34,-15,-11,-5,0,-10,2,-26,8,14,-16,3,1,25,11,-25,-9,16,5,-20,31,16,9,29,1,14,-5,-46,25,-33,-2,-18,-1,-6,1,-18,-13,32,3,-9,3,51,-18,4,-8,1,-5,-46,-13,-8,21,-9,-2,30,-1,-44,-22,-5,-40,-19,8,-16,-3,6,-10,29,-1,2,-31,15,-2,-25,-26,6,-31,-11,6,-14,-1,-37,-4,25,-10,-9,-27,11,-18,-21,15,6,45,4,-6,-38,-12,-16,7,-19,-15,31,-22,-12,13,-30,-10,-28,-6,-12,5,-36,-45,-12,6,28,-42,-8,-11,11,3,-24,5,19,99,29,-22,-13,-8,2,-6,-24,-3,15,17,-9,-2,-2,-12,15,16,19,-7,-15,-14,-18,11,21,-31,-10,12,-6,17,28,4,-14,34,4,-8,5,-3,-5,-12,-11,-13,-25,5,-9,16,-7,-2,17,2,21,14,27,-12,38,-40,17,-16,-1,12,9,-4,57,21,-6,-30,-5,27,-6,1,3,11,-24,-33,22,-33,-2,20,-10,-2,46,5,2,33,22,-1,-43,-21,-20,-11,7,-12,9,-1,-19,45,26,-1,5,11,-15,-6,-9,22,-10,-15,52,34,12,16,-7,-47,-34,-6,-10,4,-4,-22,-60,14,-5,-27,10,-31,6,-7,-28,32,26,127,55,-12,-9,-17,-13,30,-35,-13,70,47,9,-1,0,-22,-48,-9,18,-11,-14,-51,-14,19,9,-36,2,-2,8,6,-55,-1,-2,110,17,-28,-30,-14,-6,-8,-13,-1,16,11,25,23,9,-15,4,4,24,2,22,-22,5,10,29,-1,-7,-8,17,14,-16,2,-4,20,-7,9,-9,5,-5,-13,-11,-22,-11,-20,-6,6,-19,-19,-48,10,34,4,11,13,35,-34,16,4,-15,22,-23,-26,65,5,-6,24,-19,7,-26,17,-18,33,-8,-26,18,13,-16,-6,1,-29,-49,0,12,8,-22,-26,-5,2,-8,-28,4,-2,-2,-19,1,6,0,25,25,-8,-33,-3,-18,-2,-29,-35,72,53,47,-17,-8,-35,-68,-8,-15,-31,-21,-32,-38,3,-14,-12,4,-29,-28,4,-56,24,7,95,19,-36,-57,3,-27,8,-8,-20,68,-4,59,-25,-19,-22,-16,-26,-24,-20,13,-13,-14,1,-4,9,-33,-27,-26,8,-94,-29,-30,23,11,-7,-45,5,-27,-2,-13,6,26,-16,12,-9,-20,-6,37,-60,20,6,7,9,-39,-16,-4,2,9,-21,-5,-12,-52,-31,-37,-18,2,20,-31,0,-22,-15,-23,-50,33,25,41,0,-32,-33,-93,-11,1,-33,0,-34,-12,3,6,2,14,-21,-42,-41,9,-1,33,21,19,22,-24,11,-8,3,10,-33,-7,34,35,4,17,-13,-58,-16,-23,9,0,-30,4,17,15,5,-27,-20,-4,0,-7,-24,37,18,-12,7,-63,9,6,-1,-46,-41,-5,35,-3,14,2,-5,14,-11,-4,21,28,26,12,-2,4,17,-38,-23,-12,-11,19,-18,-1,-9,-11,-23,2,0,3,-35,1,-33,-2,-9,15,-46,15,2,48,12,6,5,19,32,19,-11,-19,25,4,-21,-14,9,17,-27,-5,-11,24,-4,4,-16,11,2,24,-4,24,-3,3,-9,3,25,61,12,33,42,-13,8,-14,-6,-7,10,11,9,-20,-9,-18,-4,-8,-8,54,23,12,-8,-16,-23,-6,-32,31,4,-30,-38,-11,-38,3,-21,-16,6,24,0,0,20,2,-17,-13,-52,0,2,2,-14,-19,-6,-29,0,-13,-5,-16,29,16,-11,-10,-9,-16,-40,-11,-16,17,0,-7,19,-15,1,17,14,-23,-35,-4,-56,-6,9,60,-18,-26,-32,-27,-14,10,-4,-6,2,1,-4,-31,19,-7,-2,-6,7,28,57,19,16,-7,22,31,11,-15,21,-21,3,9,28,76,-40,8,-44,-22,-37,21,0,26,-20,-22,1,-17,53,-15,6,-19,29,21,31,15,2,6,4,-21,8,-7,-25,-19,6,-3,21,16,-18,3,-13,-21,-27,-24,19,43,4,-13,3,-20,23,-5,21,-14,8,-2,16,-12,-19,-11,0,-24,18,-18,-23,-7,-5,-14,20,25,-20,1,21,1,1,-15,7,2,20,-15,-36,31,-13,-29,-14,21,-40,-10,-34,12,-16,-23,7,-26,7,-1,-24,0,-43,2,17,33,-7,-10,11,6,7,18,10,-26,-17,9,-31,16,-19,-47,-31,11,-26,-2,-8,13,-25,-17,-8,-8,19,-24,-35,-16,-8,-10,1,16,-27,-17,-4,14,1,24,0,-21,-46,-11,-30,-29,5,-31,-7,37,10,33,11,-3,-10,10,-27,34,4,-17,1,-23,-12,-22,-4,55,-14,-12,23,-14,-18,-16,-2,-2,-51,3,-29,-29,30,-14,-12,-16,0,29,-3,-2,-8,-16,-6,29,5,5,-17,-19,-19,-7,-8,31,-14,-13,40,-21,-19,-33,4,20,-18,11,-18,30,18,-18,12,-13,0,9,8,-14,-29,-25,-14,-15,35,-16,-31,-23,-18,-2,14,12,-9,-18,31,-18,-4,10,0,21,24,-16,-38,16,-45,-30,-49,-19,-16,-35,-36,16,-2,-3,-3,-28,-23,3,-27,-7,-9,-11,-15,14,6,54,-2,15,39,3,7,-30,-17,-15,-40,24,-23,-28,-44,-28,-18,-20,-34,17,-21,-3,-16,-28,-29,-4,-26,6,-6,-40,-18,12,-29,24,-5,15,47,33,0,-31,-53,-33,-53,31,-17,-28,-26,-9,9,0,-17,4,-21,18,-46,21,-22,-2,0,11,8,-29,-17,21,-30,37,-5,20,56,54,-2,-19,-14,-10,-45,5,-2,-29,-32,-19,16,23,-22,-35,-29,-1,-6,31,-5,-3,2,-6,-11,-24,-32,39,-11,25,29,43,46,4,-6,23,12,24,-34,39,15,9,15,-6,0,14,2,-13,-23,-10,6,31,5,1,-7,-5,-28,-6,-22,-23,4,-6,42,28,5,13,-12,-30,-4,-18,-29,53,-36,-2,-34,0,-24,-31,-36,27,-8,11,-17,-17,-18,23,-17,-36,-21,-23,-12,26,-5,-7,21,18,28,-11,7,-48,-23,-21,-28,51,-15,-12,-36,3,-24,-5,-28,27,5,-3,-6,-3,-20,9,-9,-6,-13,-32,-23,-7,-24,24,3,37,34,-17,4,-47,-24,-24,-34,28,-28,9,-38,17,-12,10,-26,9,6,-24,-27,16,-33,7,14,9,-2,12,-13,-15,-34,24,42,24,40,-18,9,-37,2,-21,-11,31,-14,9,-34,-8,-16,22,-36,7,3,-17,-4,12,-8,-10,-8,-7,-9,10,8,10,-5,14,21,40,28,-1,1,-28,18,-10,1,-5,14,40,-5,7,-14,27,-37,11,6,-2,-5,-14,-2,8,-3,-23,4,-12,-6,13,-41,13,28,0,-19,-6,-18,-33,-19,-24,-27,71,-11,23,-43,16,-9,-28,-54,33,-17,26,-11,-10,-21,0,1,-14,-20,8,-14,-20,-36,50,49,0,31,-1,3,-22,20,6,-21,26,-24,18,-49,11,-13,-4,-53,-6,-5,13,-21,19,-7,-15,-2,-25,-21,-19,-17,-21,-26,62,26,-10,40,-32,-6,-49,23,-22,-34,16,-15,25,-35,21,-7,11,-24,-3,-10,-4,-26,61,-16,-8,-19,-18,13,13,-13,-24,-35,59,21,40,53,-25,-5,-54,0,-4,-7,-14,-31,39,-18,-3,-13,18,3,23,12,-16,-9,31,12,22,-25,6,2,4,10,-11,-32,50,53,54,42,-49,-7,-39,26,-12,-2,6,-8,49,-14,-6,-8,20,-1,28,31,-9,11,2,5,55,-4,20,11,1,-1,-50,-24,71,3,36,4,-55,-6,-9,-7,-24,11,23,0,25,24,-16,-67,2,-27,-2,3,-58,21,-33,6,28,-47,-26,-2,-13,-21,-20,-22,-8,3,-19,-20,31,-32,-13,16,4,-9,10,28,-43,19,21,-47,-33,-40,-14,12,-27,10,2,-8,-6,-35,54,8,-21,-8,5,28,-6,61,44,-16,-46,18,9,27,10,-3,11,-28,-6,-32,-9,-66,-13,-25,-21,2,-28,-10,14,4,18,-11,32,14,-35,5,23,2,-5,-12,-13,-21,-1,21,-38,-26,-2,-10,20,-24,8,-1,-10,-63,3,19,-10,-19,-13,-17,42,10,10,-25,7,-37,0,40,57,-16,-18,-24,-5,-28,40,9,-26,-8,-10,-19,27,-3,12,13,-38,-42,20,9,24,-30,-13,15,-24,6,36,-20,-5,-24,-5,-25,-7,1,-31,-2,17,-25,20,4,10,-22,-23,26,-15,-23,15,7,5,-12,28,-29,-13,-33,10,-13,-17,30,16,21,-83,-4,11,-1,-12,-6,10,6,-34,12,50,-17,0,12,-17,-10,-11,-35,58,7,22,42,-42,-43,-3,9,21,15,27,-42,-2,5,25,-1,2,-5,-1,24,32,70,60,10,-26,28,6,37,-6,-10,-13,-2,22,-8,5,26,-38,-34,-12,-15,-38,66,70,-32,-19,2,83,5,-26,-21,2,16,15,6,30,2,9,26,60,-9,2,-10,7,41,-20,-21,12,13,9,52,12,-9,-21,-24,-9,-3,-5,23,11,-22,3,1,92,1,-8,-25,28,-11,31,5,49,-16,-4,-22,-1,36,-17,0,-13,13,37,30,29,-11,7,-29,-20,11,1,8,-41,-11,-15,-14,-16,-17,-7,-11,-25,0,-25,4,4,12,-5,30,13,-20,-21,21,-23,-21,34,-15,20,-64,2,19,-12,20,17,13,-77,-17,-27,-13,-5,22,-20,-32,-42,11,-18,-17,-2,3,6,-13,-21,-7,48,-8,6,23,-44,16,-15,-19,-5,1,10,-22,-14,-15,-1,-1,18,-18,15,1,11,0,11,-21,15,18,2,25,24,-46,40,-2,33,54,-2,25,-71,22,-38,-34,15,-18,60,-16,-9,-30,58,-16,8,-14,53,8,-20,56,88,-30,-3,11,94,7,-6,-34,24,22,-21,15,-19,10,-18,4,-23,-4,-9,-38,31,-15,-16,-1,-2,-2,11,-9,27,22,-27,23,82,-40,18,-8,73,-22,-34,-16,-1,44,-20,-6,-24,-4,39,7,14,-17,-6,-9,-11,41,-13,12,-68,-26,-1,-1,0,-21,-41,14,-14,-11,-39,-8,-3,-6,-6,14,-24,-5,-17,-11,-24,9,22,-10,16,-1,8,30,-15,7,22,21,-55,-12,-34,-9,15,-2,-27,-49,-24,24,-9,-25,14,-7,24,-8,2,15,30,11,-11,21,-29,12,-21,21,-11,22,13,2,-6,-5,-23,-1,-19,-13,37,-33,-30,-27,-21,7,18,7,32,-10,-21,-24,4,10,4,20,19,-2,-53,49,-30,-10,-14,14,-12,-28,17,-10,13,-32,-44,-1,26,-6,35,22,0,-3,-1,10,28,23,-37,-31,43,-3,14,20,16,17,-18,16,-6,0,-22,-36,5,9,22,18,-6,6,-15,-1,-27,6,6,24,-4,-13,-27,-13,17,1,-28,-28,19,-16,-9,-22,-16,10,13,-2,-8,-15,-29,30,-27,21,-9,16,-46,5,-20,0,-18,-13,8,-3,6,-3,12,-3,-1,-33,12,2,-9,27,7,-16,-19,-27,10,-17,14,14,-10,-28,-63,20,-24,10,-13,-20,1,-10,32,21,-2,-43,-22,15,15,-18,6,4,35,-7,-27,30,-3,4,-30,11,-11,3,-13,59,-26,-24,-6,6,-22,18,-1,-5,-9,-32,35,30,18,-35,-18,21,-10,7,21,8,8,-8,16,19,-26,-43,-9,-8,-19,29,7,13,-34,-22,15,6,16,-30,20,16,-30,-36,-17,41,16,-20,-14,6,-17,6,27,21,-23,-10,34,15,26,5,18,9,-18,11,17,-9,-40,9,-17,-3,20,6,2,34,-13,4,4,-5,22,5,13,10,-35,0,-4,13,18,-21,31,-36,14,-5,-6,6,12,5,-4,-19,-25,34,-36,17,-19,29,-42,-14,-16,-2,-1,-6,9,4,-8,-2,-13,10,-16,25,-4,-10,10,2,10,21,-13,-5,-1,15,28,23,-21,-6,-36,-13,11,19,-20,10,-13,-36,-12,26,19,14,22,-11,60,-3,26,-7,7,17,32,-12,-30,-26,32,-11,39,-3,42,1,-18,22,1,-6,-28,-6,-3,2,-25,-2,1,5,5,1,12,-1,21,14,-30,1,-15,34,2,-7,-6,-46,-24,16,3,-3,4,16,-38,37,1,35,2,-35,26,-7,-24,-38,19,25,-8,-13,5,7,34,14,-17,4,-31,49,-27,16,-6,3,0,21,24,9,21,5,-16,-1,-13,2,11,9,-14,-3,6,-37,19,10,2,1,-11,10,25,-8,8,11,-29,35,-9,4,-3,12,10,15,35,1,2,3,-3,11,20,-18,-18,19,-48,4,21,-22,11,4,-4,3,1,18,-32,-13,-4,6,-21,1,7,-42,-9,-18,9,-32,-16,-42,-26,20,-28,-18,-31,-18,9,14,-16,25,-19,-16,-42,-3,-28,36,11,28,14,-17,-5,0,4,23,20,-12,-1,-19,-10,-32,12,-13,-22,34,40,22,-5,-16,-12,-12,-8,2,-22,0,15,-11,10,44,-12,22,-20,-1,12,-9,-14,3,1,0,39,-46,-19,-7,8,-22,-39,32,26,29,-43,11,1,-33,6,-18,-9,-22,27,33,-11,-21,-24,34,-17,-8,57,5,0,9,-7,24,-13,-14,-4,4,21,-53,-14,17,-29,8,-39,0,-3,10,-37,5,5,-9,11,-13,-3,2,-24,34,-22,-23,-11,-17,-5,42,1,16,19,21,8,6,21,-39,-12,-17,-30,13,38,18,-8,0,-24,11,-21,-34,24,-22,23,-13,-1,27,28,-12,12,38,-14,-9,9,-48,6,-4,14,-4,-26,-20,-4,-16,5,-60,-16,-17,39,-2,14,-6,29,-7,-21,-18,13,33,-10,7,-11,-7,6,0,7,-16,40,14,7,40,17,4,8,-8,-21,-42,26,5,-2,1,3,-9,-1,-53,50,-2,38,23,8,85,-15,1,-16,2,-5,12,-49,14,-8,11,35,18,-13,42,37,-5,-23,1,21,1,16,45,-39,-32,2,-1,39,-14,60,81,-47,-17,-39,15,39,-2,38,10,-16,31,-31,22,-1,33,-21,19,55,-10,-6,13,-2,3,-19,20,2,-3,-21,38,42,19,-15,7,31,-21,-20,-1,-21,-9,38,-13,-9,42,-7,-25,16,-9,21,1,9,-35,-17,-4,-7,13,18,1,18,-6,-14,45,-9,-6,-33,-2,26,-12,63,27,6,4,-41,12,6,-27,20,9,-32,-7,6,-35,-56,-10,50,1,24,-34,22,5,32,-18,-24,-10,43,-9,-6,4,-16,-2,-15,-12,-59,-5,18,-23,-15,-26,-12,43,23,50,-1,11,-52,-13,7,-65,-5,-22,-60,-7,6,-10,-55,-46,20,-11,-6,16,-12,18,-32,-39,-85,-19,-18,21,-27,-7,-14,-5,2,26,-38,49,7,7,-16,-17,-44,-48,3,90,-21,-17,-24,-45,35,-14,20,12,-45,-7,-65,-11,-33,-16,11,44,-17,14,-29,7,9,13,-16,12,30,-15,-11,17,9,-7,40,19,-11,-3,-16,-22,50,28,-31,-20,-8,-30,-11,-16,6,5,20,-8,-10,19,10,-11,23,-25,13,-20,-4,-40,-7,-2,8,14,23,-21,33,21,37,16,-17,-4,-38,-2,-3,-30,38,11,14,0,-25,-13,-7,-28,-4,19,-11,45,5,-47,-43,-14,12,6,0,-45,21,30,25,10,-6,18,-1,-5,46,4,-17,3,29,-30,-1,9,3,-12,-24,-24,-12,86,45,27,-36,-5,-34,-16,-22,-50,-44,-46,-17,15,-15,12,-37,-6,-1,7,14,26,-14,21,-11,-43,-75,-41,-32,16,-1,-17,27,55,10,22,-26,13,3,-14,-36,15,-50,-29,-3,20,-20,29,-52,-7,9,15,-10,-50,-43,60,-8,-22,2,-33,24,68,-14,44,-5,16,-17,17,20,-2,39,-2,8,65,-7,5,31,-3,-12,-8,-13,21,-2,30,-30,-6,-14,-25,-9,-11,38,12,37,-8,-26,45,-37,-24,-12,-29,18,-7,-18,-34,29,23,20,5,44,-5,20,13,52,47,-34,-18,-24,0,13,-34,36,17,37,28,0,-32,0,-16,2,-14,8,-7,-8,-13,4,19,28,-22,-2,-16,-20,-20,-31,5,-25,-11,-29,7,11,-17,23,-21,-16,6,4,-4,-21,12,6,-22,10,-39,8,-5,-13,-24,6,8,8,-1,-12,25,10,-33,6,25,-49,-20,-13,39,11,-10,20,18,-10,10,-18,-1,3,17,12,-26,1,-13,31,12,-33,-7,-27,35,-7,1,-24,-13,2,-21,9,17,-37,-20,18,6,-5,-34,-14,10,-31,3,26,11,11,46,-4,-4,16,-13,28,23,-18,-3,-14,42,3,28,0,-2,10,-44,-1,-19,-19,-22,18,-6,31,-18,-30,6,-6,0,30,3,-23,-3,19,-7,11,-3,3,-22,-17,27,-17,7,-3,12,-11,-14,-11,18,16,34,-23,-19,-22,-16,13,0,-42,-14,-17,17,-3,-9,-25,1,-29,-10,-7,-14,3,-37,-22,-4,-10,0,5,-30,11,-11,-41,-11,-35,-6,-13,-17,-26,-4,-38,-31,10,-44,-33,-13,-8,-7,-22,6,8,-6,-17,15,10,6,11,14,7,24,7,-24,0,11,-27,15,-7,23,-19,23,3,24,5,-30,13,-13,-31,-15,9,-4,11,23,4,-6,-20,-14,35,-23,-9,-14,-5,84,-5,5,-32,-11,-17,13,-17,-37,-27,-16,8,-17,55,-32,26,-57,-48,-18,3,-8,21,52,8,16,-31,-20,15,-26,-6,-18,19,77,18,41,-36,-18,22,4,-4,-5,12,-20,-17,-1,61,-24,-4,-30,-40,-11,0,-12,21,58,15,12,21,-7,12,-25,46,10,28,36,30,68,-17,-13,57,8,35,23,46,2,-54,-7,5,-27,-44,-9,-14,-8,32,-15,14,8,-16,-8,7,-8,12,-15,-30,4,-25,-16,-12,-8,9,-4,-4,-11,-8,-4,-17,-14,-25,-12,16,-1,-11,-26,21,-6,-6,-10,-13,-16,15,-2,1,31,11,3,-27,5,-5,4,3,-8,28,-10,-17,-13,-35,14,-26,-36,-27,-5,-26,-16,10,-8,-25,10,21,-7,15,-12,-18,-8,4,-12,19,-7,-40,-3,-22,32,6,-2,29,0,-10,22,-14,-38,-38,-49,-5,-43,-3,5,13,-24,-26,19,-2,-9,0,33,11,-16,38,-58,-1,-30,-23,-3,7,84,-2,4,-21,-3,12,17,18,-9,-2,-24,-30,9,41,-23,-35,-11,-17,11,-30,15,34,33,12,-14,12,-28,6,22,43,-26,24,45,26,24,-47,-5,33,30,24,-22,-1,-5,-22,6,21,-21,-5,-33,-1,-30,-9,5,33,22,9,19,-1,-9,6,17,16,-16,-24,-11,-31,-11,11,-13,-10,-5,-18,-6,-4,-10,-39,-1,12,17,5,18,12,-6,-8,-13,-10,6,13,10,-28,30,37,-14,17,-15,-11,12,-15,-9,3,-16,-9,-22,-17,9,5,-23,-37,0,-9,-9,-14,-28,-3,13,20,-11,-17,-8,5,-7,32,7,21,8,20,-11,-20,-10,-16,-7,28,26,2,-8,30,-9,13,-22,-25,-31,-21,4,-27,21,10,21,-15,1,-1,-19,13,-27,49,-21,-21,-11,-2,11,-1,33,2,-9,-18,18,-9,-6,19,-18,5,-11,-29,3,-21,17,-29,42,8,19,-13,30,-3,-19,20,-28,14,-31,-14,13,-12,-4,17,31,17,5,-31,26,39,12,-5,-37,-5,6,16,-5,-17,-8,-13,34,-4,-7,-30,35,4,-21,2,15,5,16,2,16,17,-2,-12,-9,-31,1,11,-10,4,2,3,-9,10,7,-30,-4,-6,14,-13,-12,-5,-2,1,1,-5,20,19,27,-4,12,5,-1,2,10,-12,23,-18,1,-8,17,19,-28,7,32,13,4,6,8,-33,-27,-14,9,6,6,5,-8,-16,28,8,16,-6,11,13,6,-8,-2,-11,1,-14,1,27,28,10,-18,12,28,6,-6,9,-20,-2,-4,1,10,-1,9,-4,-1,-6,-18,19,-10,16,31,-24,2,-20,10,-2,-18,-10,-9,16,6,12,6,2,-8,-2,-11,11,16,13,22,8,-8,-31,18,-6,23,-17,-21,20,-20,-11,16,-21,2,-34,-3,25,-2,0,-24,-33,-13,-13,0,-25,1,4,-8,13,-4,11,-2,0,-4,-21,-3,19,41,-16,-35,-1,-24,20,-5,-14,-13,-18,-10,-33,-19,-14,-21,-14,-14,-17,-22,-3,24,-4,-5,-24,-3,-23,2,-7,11,19,3,-6,19,-9,8,-2,-18,-12,0,-1,2,2,3,-20,-17,14,2,2,6,-9,-20,19,2,-16,14,-19,8,9,30,10,-17,12,13,13,8,-1,-11,6,-20,-1,22,-25,15,-10,-2,-6,-15,19,1,31,-1,-13,-7,12,-23,-18,-15,-6,14,16,3,30,-31,-6,1,16,-10,5,-9,11,-20,-11,12,7,-5,-18,-15,3,-8,11,-13,30,-5,4,-5,2,3,-19,15,-23,18,5,-6,18,-31,-26,-13,19,-10,12,-15,6,-31,-6,13,5,-29,10,-23,-20,8,6,-18,-6,7,28,-4,-3,-20,-21,-21,-14,15,48,-4,-8,-33,-16,6,10,-12,18,19,-7,0,25,-3,9,-6,-23,3,-21,5,-1,2,4,16,-20,-5,16,23,9,10,-10,-13,-48,7,12,-18,-3,14,-12,3,5,20,2,-11,-6,16,27,-12,8,-3,-20,-9,1,10,9,17,18,0,25,-4,6,13,-50,9,-11,17,16,-57,-7,14,23,10,16,-10,1,-9,-30,32,7,5,12,-9,-20,-22,3,-8,39,-3,12,19,27,-11,-14,10,-15,19,-20,-14,8,-39,-13,0,-7,3,32,4,-34,-21,-3,23,5,12,8,-22,1,-10,-2,-23,31,-13,-19,11,7,18,-45,3,-15,7,-31,-35,-17,-43,-51,-18,22,-4,21,22,-7,-32,-28,8,3,-15,2,-17,-9,-8,-16,-24,-3,23,11,-3,4,7,-38,-3,-30,22,19,-9,-24,37,-48,-5,-1,-11,-5,16,5,29,-8,1,40,14,-35,-9,-25,2,12,-18,1,-7,-3,-23,-2,9,3,2,-65,-5,-16,16,2,-47,1,3,19,-11,-19,23,-24,7,30,8,-5,-34,-15,-15,-30,-31,-5,-10,-1,-28,48,-17,11,-15,5,14,-61,-14,-23,4,-9,22,27,-7,19,0,12,-6,-30,-7,16,12,0,-24,-4,1,-22,-17,-30,-15,16,-45,63,-5,36,0,-23,11,-32,-13,32,-6,-2,8,62,-11,-8,1,9,0,-18,-7,-37,-1,26,-3,-27,4,-7,9,-34,-2,11,-36,20,8,24,-5,-54,-4,-22,0,-4,17,-10,-24,32,-21,-4,-5,22,13,0,-16,-2,33,28,-10,-8,12,8,7,-23,5,-2,-9,3,16,30,17,-25,-11,-28,25,16,-13,-19,22,-44,-28,3,-19,14,33,24,19,40,-6,3,14,-16,3,-43,32,14,10,-15,-12,18,-8,3,16,-9,12,-32,-4,6,5,38,29,19,-10,14,0,-30,19,-12,-6,68,36,-31,-11,2,-12,-25,-13,20,8,-24,-51,-18,-22,8,-22,-30,69,-25,-5,-44,-1,98,76,58,-26,8,0,4,-1,-10,-16,59,14,-9,-40,5,-10,-27,12,4,-15,2,-32,-2,11,30,18,-32,24,-47,-38,6,-11,38,1,32,-32,24,3,-5,13,-16,10,-8,-19,13,-26,-37,2,-36,5,-31,4,2,-28,14,38,32,7,-44,-13,-31,-15,30,8,-6,-56,-15,-32,14,6,-22,43,-23,9,-29,36,17,-9,-46,5,-10,13,-27,18,-37,-28,-12,23,25,9,-21,-28,-14,-3,52,7,-2,-36,-29,-30,26,1,7,27,7,13,38,16,-9,2,17,29,-34,18,-6,22,-23,9,14,28,-15,24,9,-5,-12,-26,-18,7,16,19,0,12,3,5,5,5,-16,-7,23,36,-35,18,27,15,-22,-10,-10,-14,-31,-9,-22,-6,-17,-25,-6,35,-6,-21,-40,15,45,3,-7,3,-28,17,11,-8,-7,-7,-25,20,15,-12,23,18,-31,-31,-7,-29,1,-7,2,-8,-15,21,7,10,-28,-31,-12,23,9,-44,-30,-8,-17,16,-27,30,1,13,-18,39,3,-14,12,5,-41,-11,-10,-5,1,3,4,36,2,20,3,12,-25,-6,46,-9,-25,7,-30,2,13,12,-22,24,-9,21,-27,62,-9,7,-16,-1,-7,26,6,9,-21,1,8,52,1,14,-16,21,-22,-12,61,50,-11,-30,-34,-5,34,16,-18,-3,3,12,-40,-22,-12,-48,17,-40,-19,-1,31,-4,-8,-7,-17,-2,-24,-5,-29,-9,-42,5,29,-25,-32,13,0,4,20,-8,-3,-32,12,18,-34,3,-12,-20,3,15,-1,27,-9,17,-1,-9,21,5,-27,-29,-6,-41,-50,-28,25,3,-23,-55,-4,1,53,-2,24,-21,-21,4,-81,4,-54,-16,10,38,16,11,-11,28,-32,-10,15,-5,-32,0,-3,-46,-29,-40,53,2,-41,-68,-44,-9,47,4,53,-37,-33,19,-51,10,-31,-24,0,36,53,22,-1,7,7,-14,39,-12,-21,5,-13,-38,-14,-23,15,29,-37,-43,-18,-14,48,2,43,-18,-13,4,-2,9,3,-9,25,34,63,20,-3,4,-1,-10,6,-9,-14,1,-24,-15,0,-7,58,30,3,-28,-16,-11,33,8,30,18,20,19,18,13,34,6,-12,-46,-4,22,22,21,5,55,23,-3,4,16,8,13,0,32,4,-37,0,34,-9,8,9,-11,-4,-7,20,17,2,-20,-6,-24,-16,-8,-38,19,2,14,6,6,7,34,-9,-27,1,-13,10,-11,24,-6,-29,-12,-13,10,-5,-12,34,-3,-22,24,-30,15,-26,12,9,65,-34,13,-13,16,0,-10,6,18,-7,-4,-18,1,-7,-25,90,16,-50,-67,-61,6,13,3,16,-13,-19,31,-49,16,-21,-7,-9,62,18,20,-11,29,-8,-9,21,-19,-13,16,-18,1,-38,-18,56,-7,-58,-22,-63,-10,8,-2,45,-31,-51,13,-29,19,-14,-23,-9,24,34,39,-15,9,19,-61,41,-15,-32,-15,6,26,-25,-41,5,-8,2,-45,-43,-6,38,5,47,-29,-28,2,41,5,31,-6,0,-27,14,-12,14,-5,29,34,1,-13,4,-5,-3,-8,0,20,42,-3,34,34,12,1,13,-10,-17,5,33,7,46,-30,-8,-20,2,-8,2,-26,8,5,-2,5,25,26,-9,-29,-5,-2,-1,9,5,-33,-3,4,-13,17,-1,-4,4,11,-2,-2,-6,8,-18,-12,10,23,-29,38,-23,-4,-13,9,6,7,-11,-8,-18,-14,0,-6,47,-14,-40,16,-40,7,-30,-3,20,12,8,5,-36,14,-31,-22,-26,20,-23,46,-19,-18,-14,-21,4,24,-8,17,-49,-19,-46,-24,75,-15,-58,-37,-43,-13,-10,-15,38,8,-21,12,-15,31,-36,-19,-21,26,-8,29,-3,13,22,0,72,-3,-14,-26,6,4,-17,-43,70,25,-37,-25,-17,-3,15,-6,39,-4,-28,-15,-5,-21,16,-13,-29,-23,22,17,2,21,2,2,-10,15,9,-21,21,-9,-22,17,19,21,11,-11,-7,6,5,9,-12,-15,10,0,9,-45,-12,-17,-20,-26,16,12,13,19,-1,13,13,17,-3,-40,5,2,10,9,-9,-15,-7,14,-17,17,-27,4,18,14,13,-26,-2,12,-18,7,-5,2,2,5,0,-8,-6,-28,-12,47,-5,-20,-15,-13,-3,-2,-9,-30,-37,4,-30,14,-6,-19,17,-7,51,-24,-22,-38,-18,-17,-38,-15,-24,15,-4,-12,-2,-3,-2,24,10,7,-36,-28,-29,-4,29,-6,-59,-26,-31,-1,-25,-11,12,1,20,-20,-16,22,1,-1,5,3,-50,31,12,3,8,0,-22,11,33,-17,-26,16,-22,-32,29,2,-31,-14,-29,14,-18,-5,27,-12,12,-2,-2,-24,-15,15,-39,-28,46,12,10,-18,1,58,-30,21,16,-12,13,-19,-28,6,26,-5,-3,25,10,19,-9,4,-12,-13,9,-3,11,-19,-3,6,-42,-20,17,16,-10,-7,-13,19,-15,3,1,-20,-5,-25,27,29,-15,-17,4,-10,3,22,0,2,-3,24,-4,-21,-6,-16,0,22,-6,9,16,16,5,-31,-3,-12,2,3,-2,22,-13,-31,34,9,2,-13,7,33,10,10,-13,-13,-5,12,53,-36,-3,-20,17,-16,-40,-36,0,0,-8,-9,-4,15,-2,-14,40,33,-13,-27,33,17,36,-25,-4,52,-7,-8,9,-9,4,27,10,-32,8,11,-3,-4,-6,-14,-39,4,3,12,40,19,-22,-26,36,6,-25,0,17,13,50,-9,5,41,-8,1,-5,9,3,14,33,5,-9,-10,4,-15,47,-49,-21,-15,-26,-15,33,-7,-10,-6,-32,13,-11,5,-1,-2,-62,22,-20,-31,-13,7,-14,-5,-24,5,37,23,-12,-27,22,10,34,-60,-12,7,-27,-14,16,18,-25,16,-19,-5,1,-3,-10,-5,-15,-23,-16,-27,-31,0,-9,-10,-4,-10,16,21,-3,0,11,15,-1,13,-25,-15,-14,12,-20,13,-80,13,1,20,-2,-16,6,1,-8,-24,-25,-5,-16,4,23,-8,9,31,21,-10,27,25,0,37,9,23,-32,31,19,13,-19,-30,-50,-6,41,39,-33,-16,37,-10,3,-16,3,1,-55,-26,-9,-4,16,46,-4,-36,9,33,45,8,62,2,40,-3,41,21,6,-25,49,8,38,-47,24,13,26,19,27,3,-19,20,-26,19,-22,11,4,-5,-1,13,-42,-11,17,9,-11,-25,-25,0,-10,-10,28,-24,41,-14,-35,9,-10,26,-16,-11,-21,25,15,8,23,6,16,-7,-38,4,14,23,-32,-14,-8,-10,-13,-40,-17,4,-5,-39,20,-2,-5,17,-30,-28,2,1,-12,-14,-13,7,-11,-14,4,7,-2,-17,5,-23,8,15,-50,-9,10,10,-44,11,-36,0,-23,-7,-5,-28,-57,-1,-10,7,-10,-36,-3,-37,-54,9,-54,-23,-33,10,2,-20,5,14,-1,-22,-26,21,6,4,-34,21,-28,-17,6,-19,-24,-47,-81,-28,-6,37,-35,-46,47,-13,-61,-35,-16,-26,-68,-27,-27,7,24,37,-32,-20,44,24,43,26,7,-2,30,5,8,16,-9,-15,21,-33,5,-14,-16,31,31,6,-11,-7,-16,-5,-56,12,30,36,-20,-3,-4,29,-6,-19,24,1,3,-4,13,34,7,19,-12,-14,56,-17,0,31,37,37,-10,24,3,7,12,20,54,-14,16,-4,1,-9,-5,26,3,30,-5,-19,15,22,22,-10,7,-14,-7,3,-7,30,-6,10,43,36,-29,12,-6,-5,-10,27,35,13,-11,-4,21,-15,-2,9,-43,10,-12,18,-4,26,1,3,-27,16,-29,-13,-39,19,16,19,-9,-16,10,0,4,36,-79,1,13,12,-12,-7,28,30,11,-39,-35,33,16,35,-9,36,-15,1,4,-6,-41,-22,-55,-34,14,29,-50,-58,50,26,-23,-13,-28,-32,-31,-34,-50,5,18,26,7,-28,-10,60,43,55,-8,9,25,8,-29,9,-25,-12,-9,-7,-6,9,-44,31,17,40,-13,-24,-35,-5,-21,-14,5,6,-8,5,-15,9,3,-29,-12,0,-5,-3,14,-15,-5,5,-25,2,4,-11,-19,16,22,-1,3,16,-10,-13,17,-13,27,-8,20,17,-21,-18,-25,10,-4,26,-1,-7,-4,17,16,-16,-2,-20,-21,-2,-17,11,-5,26,-2,-25,-39,-11,-19,-2,-11,0,0,6,-16,23,22,-10,12,18,-31,6,7,17,-15,2,-3,7,9,10,-45,22,-33,11,1,12,-48,-51,-1,2,-9,6,-41,-30,-24,5,-30,13,17,3,25,-22,-45,46,27,-9,-40,19,-24,4,13,6,-14,-6,-47,-9,15,-5,-43,-63,34,16,-14,-15,-12,-39,-34,-39,-49,12,12,23,-12,-1,-18,25,39,18,-24,30,-3,41,-1,5,-3,-24,-7,5,3,2,-49,13,-19,31,-15,-1,-26,-21,-19,-16,-36,-4,42,15,-21,-9,-2,-11,3,-15,2,31,11,-11,-15,5,-2,-22,45,18,-28,6,7,1,22,15,-23,-27,36,-6,15,4,-19,14,15,5,-14,4,-4,6,1,-23,5,11,2,-25,-13,-8,25,0,21,7,-10,26,-28,1,-1,-11,-30,20,17,-4,31,17,-19,0,55,-11,22,21,-1,-19,12,21,-10,27,-1,-1,8,4,4,12,-11,-21,-7,16,-52,-14,-4,-2,14,-1,-13,-14,15,12,-35,-12,16,-16,28,-1,-6,27,3,2,-9,-4,20,-16,12,14,-1,25,-6,-23,9,-23,-46,-42,29,9,-7,15,2,-20,-17,-13,-27,-11,0,20,9,8,38,5,11,-12,-5,16,18,0,33,-3,-4,12,46,2,25,-3,-29,18,-31,-9,36,0,-16,-3,-17,-3,4,-3,20,18,-15,-6,21,14,26,14,-20,-18,13,-35,7,-14,-4,46,-61,7,-16,-3,-26,-28,-10,-6,-26,26,29,24,8,-10,-35,-20,9,30,7,-7,10,-19,35,7,-22,-20,-20,-53,25,-6,1,20,-61,-32,-3,-12,6,-23,-45,-9,-66,26,55,44,12,-13,-1,-16,-13,-20,48,-25,33,26,21,9,-29,-18,-16,-28,12,-7,-1,-2,-49,3,8,1,20,-4,-37,12,-9,41,64,2,-17,5,12,-7,-24,26,57,-29,1,47,2,22,-28,-20,4,-7,9,16,-7,18,-19,11,-9,23,-1,-19,-40,25,8,6,17,-15,-48,16,12,-1,-32,1,18,-8,-25,1,-19,-29,-58,6,11,16,4,24,1,20,-22,49,-8,2,-22,-16,-6,18,-18,17,-9,-30,-49,5,10,14,-39,-16,-13,-12,15,22,40,15,-10,10,19,-8,5,-10,-29,20,-63,23,-1,-27,-10,-11,6,-15,22,-8,-1,3,30,6,-10,-14,2,31,-1,-1,10,-50,23,-31,-5,-23,-4,-46,3,-3,-18,-13,-57,-5,-12,6,9,-10,-32,13,-43,15,17,-15,-20,-11,-41,-11,-34,-15,13,-13,-33,-16,17,-23,-22,-21,-16,-55,-10,-1,-26,4,-40,-9,4,9,16,-32,-35,-3,-31,18,-5,-1,-48,15,-15,-9,-22,-4,-5,-13,-9,27,-9,-20,-27,-35,-7,-37,-4,4,-32,1,-29,7,-5,13,19,-22,-29,17,19,-16,-33,-23,-47,16,7,-1,-18,-17,0,-3,1,20,-18,-20,-24,-6,4,13,12,13,-12,26,-42,26,-9,-7,-11,-2,-2,31,18,8,-29,-23,-28,15,25,2,-14,-19,-5,10,-62,88,23,23,-16,67,4,26,26,-13,-4,21,0,8,-7,-34,-15,14,39,-28,88,-9,38,3,-4,-1,42,-17,36,17,-22,11,-46,3,3,-4,-17,30,-5,17,-8,15,5,17,-10,-5,-13,6,16,19,-4,-1,22,-3,4,-23,-49,-4,23,3,17,-13,-15,-7,-13,-23,3,-40,-33,4,-24,-30,5,15,18,-12,-21,-2,-5,-2,-17,9,-36,-5,-61,11,10,-68,-30,20,20,9,20,-12,-16,-5,-1,0,5,-28,-29,-41,-28,-28,20,-11,20,4,-30,-2,-17,-8,-9,26,-15,10,-14,-27,36,6,-19,7,-13,12,-2,-28,-2,0,3,9,18,-4,-11,-14,-27,-3,20,-4,27,55,-52,33,-15,30,-30,37,-15,29,10,-1,41,40,14,-9,-18,4,1,-19,31,27,-3,42,-9,17,-29,70,-8,35,-25,4,-14,6,7,26,-19,-29,-15,-26,6,1,38,1,-21,1,8,18,14,2,35,7,-2,31,-39,11,-9,15,-12,55,-6,53,-2,4,-28,-14,6,22,-22,2,-3,-1,-20,4,47,23,-64,-34,4,13,44,10,31,-6,5,11,-6,-18,-30,-23,-5,20,-12,12,4,22,3,9,10,27,-8,-21,-26,22,13,-13,12,11,-19,-28,7,25,7,18,14,-13,-25,-9,19,-10,12,-27,-12,-32,-16,-23,16,-7,27,11,-7,-16,1,-16,-22,20,-6,5,-61,10,32,6,16,13,-24,6,-3,-18,10,-10,2,-12,26,11,-17,-25,-29,-26,3,-6,20,21,-3,10,25,32,-23,-2,-13,26,14,22,3,24,7,-12,-29,-3,-10,19,31,33,-13,-21,-20,-24,-33,20,-3,21,-4,-5,29,35,25,3,-3,-23,19,-16,-5,1,19,5,-4,31,31,29,14,23,3,-12,-15,14,5,32,-21,16,-5,50,10,30,-3,-33,-17,-11,10,7,-3,-18,1,4,10,-6,29,29,-4,5,16,11,35,17,20,-9,15,2,0,5,-27,2,-1,40,4,29,14,-6,-21,-7,12,15,3,-12,-8,6,17,-19,1,-3,-15,20,11,3,28,26,6,16,-34,1,1,-5,8,3,-14,-35,5,0,29,0,0,-7,-16,13,-2,-14,-14,9,16,-9,-25,-18,11,8,-14,-25,-5,6,22,-13,-14,-7,-25,-21,43,-5,-38,-20,-13,-4,13,10,37,6,11,-10,8,7,-9,-11,3,-2,-10,17,2,1,-14,-26,42,-11,-6,9,11}
+
+#define CONV2_BIAS {55,50,34,43,-37,35,-21,10,35,-53,-76,7,14,-1,92,20}
+
+#define CONV3_WT {15,10,3,1,-20,-11,5,-18,-9,-1,-4,-11,-5,-19,-26,-15,13,5,1,7,6,16,1,-20,3,35,0,-2,8,12,-41,-5,-20,4,6,3,-3,0,-28,-13,6,16,-4,-4,1,8,17,2,-4,-10,-9,-11,3,-2,-4,-9,-21,-10,-18,2,-6,-18,27,5,2,5,-5,-3,-4,-2,-16,17,1,-17,-13,-12,-1,-18,10,6,7,-4,-1,7,1,-4,7,-8,0,4,12,-4,-5,-13,8,-17,0,-22,-2,5,12,20,3,-6,4,10,16,4,-3,-7,0,-15,-17,-1,14,25,2,3,-15,-11,1,-4,-2,-14,-2,5,12,-2,5,-13,8,-1,-4,0,-6,12,-3,-22,-20,7,3,11,8,6,6,2,10,7,-5,-6,-6,18,-2,-23,2,1,-12,-9,-3,-2,0,-3,-9,4,-16,-3,-7,-23,-10,-8,-1,-26,-3,9,30,-16,-11,-13,-9,-11,-14,5,5,-9,-16,1,10,-5,-18,-14,31,-6,-1,1,5,-7,1,2,-17,3,8,1,2,-3,-4,3,-6,7,23,-10,-6,-3,-9,-9,-38,14,-3,9,-10,-3,17,13,-6,12,22,8,8,-6,9,9,-6,13,-1,-11,12,-2,-4,-3,-2,17,2,-8,-15,-2,-9,-10,-9,-19,2,-11,6,-5,-5,8,28,2,-23,12,-19,-12,-33,2,-8,-17,-3,-15,-1,23,-18,9,23,2,-9,18,-17,-29,17,9,21,6,13,3,13,32,-4,0,-7,10,-2,-2,-10,-5,-4,-5,8,-1,-1,-9,-20,11,1,4,-10,1,-1,-6,-24,-1,-6,-2,11,6,7,-9,2,8,-6,-8,-19,11,6,-10,5,-5,0,-14,18,-11,13,5,3,-7,0,6,-21,4,2,12,-14,3,-14,0,-10,-6,10,0,-21,17,3,19,-19,-2,13,21,-10,-11,-5,-18,-10,6,25,1,-19,0,2,-11,-36,-13,11,-24,4,-16,-3,-28,0,1,-9,-6,-3,-1,6,-11,-19,3,5,-6,-7,16,-22,-19,-11,-20,6,-2,-1,-12,24,3,-27,-2,-15,-18,-2,-3,5,-12,-8,-4,-23,1,5,19,-30,12,-5,-4,-1,10,-1,13,2,-4,-12,6,-3,2,13,5,-18,-3,2,1,4,20,13,8,22,-6,-3,8,-12,-10,-28,12,-12,-5,5,-12,-6,13,1,-17,-32,-11,20,-14,-9,-6,-11,-1,-8,-2,-13,-10,-1,31,-12,21,-32,-2,-14,-1,0,11,-1,-1,-13,16,-10,-8,-10,20,-2,1,3,-7,-17,-1,-12,-4,-14,-16,-20,-1,-2,-4,-3,11,16,6,-12,-11,3,0,2,-1,-2,-18,-21,-4,5,-9,12,16,7,21,9,13,-4,2,4,0,-21,-17,-3,2,-10,-24,-10,7,3,-15,3,-11,1,1,5,-8,0,-7,3,0,-3,2,0,31,-5,7,0,6,-13,-7,11,-15,-1,-13,2,13,-2,8,25,-6,-1,-16,-3,7,3,-21,10,0,-14,13,17,7,0,-7,14,10,-1,-6,-8,-14,-6,-16,4,-26,-11,-5,-1,-14,15,10,26,-1,-27,-22,6,23,-16,-14,-1,-12,-26,-21,-3,-5,1,-9,-13,-9,-9,-12,0,-6,-3,-6,-9,-2,-10,-3,-13,5,-4,4,-2,1,-3,1,-3,-13,6,-5,3,-18,-11,-6,-19,-2,-13,12,4,-10,23,-8,-23,15,10,-3,18,4,-5,2,5,10,1,-16,15,-14,2,33,-5,-4,5,-11,-19,-4,6,5,16,-14,20,12,6,-14,-24,-29,-11,1,16,-6,11,-8,-25,7,-9,-6,7,20,-19,-3,-7,-19,2,-8,15,-7,-5,2,-11,-11,-6,-4,4,-7,-1,-4,15,-2,-3,-2,7,-9,-16,-6,-4,-19,5,-6,6,12,14,-2,-3,-1,17,3,-11,-5,25,4,15,-7,8,7,1,-19,9,-32,19,17,6,4,8,2,-5,29,28,-9,14,14,-13,-3,5,-9,-20,-17,4,-12,0,-4,-2,14,9,5,9,-28,-15,8,-8,11,-16,-34,8,-3,-2,4,8,-20,-1,-11,-19,-14,-1,-14,-5,-3,25,-16,12,13,14,22,6,-4,1,-12,3,-15,3,-14,-12,-5,23,4,-13,3,-22,19,8,-3,-29,9,-12,-21,-12,-18,-15,-9,8,-6,1,-5,-21,7,-2,1,3,-9,-16,27,-11,-19,8,0,0,-1,-2,-9,-28,-25,-24,2,-7,-5,-20,1,-27,-15,2,8,-13,-1,-1,3,-21,-6,-10,9,4,15,-15,12,-12,-10,-6,-15,2,0,5,0,-2,-18,4,-8,-3,4,1,-9,-1,16,-8,20,11,7,12,-4,-34,11,-21,-6,-9,-12,-18,-39,7,14,-14,-1,-11,4,4,-5,-21,2,-7,9,-5,-9,20,31,17,-18,-14,-1,-6,1,-6,-14,-21,-11,-6,3,8,-4,5,26,2,-18,0,19,0,4,3,-8,2,-10,-6,13,7,-3,3,16,-7,-11,9,-11,19,-9,-6,-5,2,-1,-1,-8,-10,13,7,-9,-12,-2,0,24,10,-7,13,13,14,19,-6,0,-19,-14,10,-21,15,-1,-11,-3,21,0,1,23,-37,1,3,-9,0,13,10,13,31,-29,-3,-1,9,-2,-8,-11,-11,-6,-5,-17,15,-20,0,11,7,12,6,13,0,-12,-8,-13,-2,-3,-5,-1,-10,-6,4,6,0,8,-8,-7,10,-5,-6,-2,-23,-4,-4,-13,3,16,-1,16,5,-3,-12,9,-5,-2,11,-16,13,15,7,-12,0,-4,-18,-9,-7,4,-25,-6,-6,6,-19,-5,13,9,-1,-5,3,4,-7,16,5,-8,-6,3,5,1,-6,4,14,-4,14,2,2,-10,13,23,7,3,5,11,3,-13,-4,9,12,-6,-7,-2,-11,-3,0,-6,2,-2,23,8,-4,-13,-1,-7,4,-9,4,9,-8,-3,-8,5,-12,-14,-15,-13,-1,3,14,1,-9,4,-2,-8,-2,18,-5,-5,10,-4,-2,-8,12,30,-10,17,1,-10,-1,-15,-17,17,6,-3,-4,0,1,-13,0,2,-14,3,-4,-18,6,-12,9,12,-12,3,13,5,-6,-4,-7,2,-17,11,-7,3,-7,-2,12,0,-9,21,1,9,24,13,11,-2,-18,2,-3,-1,-14,-13,-14,2,-1,0,-2,19,-8,-3,-9,6,-20,-9,-5,-5,-4,-9,-17,-4,-4,3,-3,14,11,3,1,6,-20,12,-9,-16,-12,1,4,-6,-7,6,3,-5,0,-14,-2,20,-12,-2,-29,-15,-10,6,14,-17,1,-23,4,-2,-5,0,3,5,4,12,-20,1,6,-2,-1,1,-7,2,-2,-23,0,0,3,8,0,1,-20,17,-16,-7,-12,-2,5,1,-6,-15,6,-8,3,5,-14,-11,-2,3,-1,7,-2,13,-13,4,6,21,-18,-1,9,-7,-12,2,-12,-9,-6,1,10,5,-12,-5,5,-1,-18,-15,-13,19,1,10,-21,1,-8,8,-2,-5,-13,-8,9,0,-17,-11,-9,0,6,13,-4,10,0,-19,0,15,-15,-3,15,-7,-9,-8,-3,-5,2,5,-8,17,-5,1,-11,3,-10,10,8,4,11,-1,4,-12,-10,-11,9,-1,2,-6,-3,16,2,14,-15,2,0,0,8,-3,-20,-16,13,-1,-14,2,8,4,13,5,2,14,7,-26,11,2,-1,-13,-5,2,-9,1,-9,-3,-6,-2,19,-2,-26,0,8,-19,14,2,-8,2,-13,-16,-1,2,-12,-8,32,9,-23,-3,15,19,-22,2,25,-4,-1,-16,10,8,-18,4,23,8,4,8,-9,-12,-7,2,21,-4,3,-13,-13,-10,-12,-2,-13,-20,11,5,-8,8,-2,14,13,-5,8,-2,-10,-16,-4,-2,7,-7,12,-13,-5,10,11,18,10,6,12,-9,-12,17,15,3,13,7,-26,-13,16,-8,18,-9,1,8,-7,-3,4,6,11,-19,10,6,-4,-10,13,5,-16,-2,32,-2,-11,-2,-4,15,-30,-5,13,17,-14,9,-4,10,-30,-14,16,-17,0,-11,-8,-37,-5,9,-36,-6,-8,11,3,18,-11,-6,-1,-8,5,-10,0,-9,-7,4,-20,-7,-1,16,-5,-8,-3,-10,-7,-5,9,-2,-15,-11,2,8,-9,-18,16,7,-13,-1,0,10,-12,-12,13,21,-20,7,8,-17,-6,-9,15,9,11,-7,-17,-4,-11,-34,-5,0,-19,9,-15,-28,-10,0,-13,13,19,-7,-10,-10,-4,-8,4,-7,-15,18,18,10,-27,-9,-14,3,7,20,-5,1,-10,10,-3,-16,-1,2,14,-18,-2,-3,-2,5,8,12,-7,7,-9,14,1,-12,2,7,-12,4,-6,-7,-21,-3,15,0,-14,-9,-11,-11,3,-7,0,-4,-22,-15,2,-8,-25,11,15,6,8,2,4,-7,5,-2,-4,-13,-13,-17,-3,9,-9,-5,3,-5,-4,-1,-9,-7,3,-29,-1,0,-9,6,9,6,-13,-2,-5,10,0,4,-9,-3,12,-17,-3,-5,2,-3,13,2,-5,-6,-2,7,-8,9,5,5,-2,-15,-6,0,-9,-21,9,13,-13,-19,2,4,-2,-9,2,-12,4,-21,15,2,-12,0,-2,-17,-1,11,-7,2,27,-16,16,0,22,-18,-2,-8,-19,-16,-6,23,-12,-20,5,-24,-15,-3,-8,7,-1,-6,-1,-28,-6,0,11,-20,-1,-14,8,-8,-3,11,-4,-8,2,-15,-8,7,3,8,18,-20,-6,-17,3,-25,-6,-11,-15,-7,-12,20,3,0,-9,-31,20,-11,-16,-14,5,-5,-19,-10,-20,-11,-10,17,4,-18,-4,-5,-12,-6,-13,-11,-4,-8,1,-28,7,7,-7,16,1,-7,2,3,1,19,25,-10,12,-13,16,2,11,7,1,-2,5,8,-9,-7,0,3,27,-2,-11,-3,2,8,23,4,-7,-7,-3,-5,6,1,3,-1,12,8,3,-15,-2,0,-7,1,8,9,18,26,10,13,1,-7,17,2,-13,-2,17,-19,-7,-7,0,-4,6,-3,-5,-9,-12,-13,7,-11,-2,-11,5,-1,15,-12,-24,7,-13,0,-1,24,4,-21,-4,27,16,-9,-2,5,-7,-11,-19,-6,-7,33,2,6,9,-15,15,11,-16,-3,-26,13,-7,5,-15,-12,13,-9,-7,-1,3,-10,8,18,-2,-14,0,0,-5,-2,15,6,-4,15,-18,23,-2,-9,20,8,-21,-16,0,-8,11,-28,11,-8,2,-2,-11,6,-18,-11,-1,4,-16,-7,2,11,-15,-26,11,3,-20,-1,-14,14,-6,-8,-4,-16,3,6,13,-4,9,9,3,-14,-13,-11,13,5,0,-4,9,4,-13,10,-1,-19,-1,-5,-6,-12,-14,-23,-1,-19,-2,2,19,38,-8,-18,11,-2,-7,-12,14,-7,-18,-1,6,3,4,-12,-5,-20,3,8,4,4,-22,-5,-20,-2,10,-1,-3,-7,3,-2,9,-23,3,13,20,-3,19,-18,-29,-12,-4,-6,7,-2,-18,-8,2,-13,4,3,10,0,-1,15,5,-8,-5,2,14,18,4,-1,5,4,13,2,-1,-11,-2,20,16,12,3,-5,-16,1,10,3,9,13,3,-14,18,8,7,-1,8,-3,-22,7,-28,-7,8,19,-19,-12,2,3,-9,-1,13,2,9,-9,-4,-18,-19,0,-6,-1,0,-14,5,16,-14,-23,-14,-29,-12,-2,17,-11,-1,28,21,6,1,-2,5,1,-10,6,17,8,-2,-16,-11,0,-6,-1,-2,-18,6,1,0,-5,-3,5,-3,1,9,-16,-12,-13,-13,-7,-11,-11,4,26,5,-5,-4,1,-6,2,13,2,-21,-9,-12,-7,-24,-26,5,28,-8,-15,10,-23,-7,8,20,35,-4,-12,-19,-16,-21,-16,-4,-7,-16,10,9,9,-24,0,-4,8,-1,-5,1,15,8,-26,-23,0,2,1,-17,3,7,16,-14,1,-9,-7,-10,-22,8,-4,-6,-18,-14,-7,-19,3,-10,4,13,-4,-2,-2,-10,-24,-1,-18,3,-16,-7,2,-2,7,-12,1,-3,-32,-6,2,1,-31,-6,-24,-2,15,-13,-6,11,-6,9,2,26,14,1,9,-17,-22,-11,-16,5,-6,4,3,15,32,0,17,2,0,3,32,5,-8,-16,13,-1,-13,7,0,-8,-12,-18,-2,-22,-11,8,-11,-5,-17,-2,7,-9,-22,-4,-10,-6,-8,-4,1,8,4,25,-6,13,-3,-8,1,-10,-25,-5,-5,-10,-13,-2,19,-11,-7,10,9,-13,-19,-4,4,1,-6,5,1,-8,-7,8,4,-11,2,3,14,-3,-15,-18,14,-6,-4,24,-11,-4,1,-13,-8,-7,13,2,2,30,1,-18,32,-18,-15,24,6,5,-2,-24,2,-21,-1,8,-23,-3,21,-5,-5,-8,-9,7,-9,-13,-12,-17,-13,-2,5,21,6,5,10,-12,-16,-1,29,21,3,-7,-31,4,-25,-6,18,-8,10,-7,-2,-14,-19,3,26,33,10,0,-10,5,-11,-20,-4,-9,-14,10,-16,-12,6,21,3,10,-14,-20,4,-29,12,-8,9,1,29,-13,-4,9,-12,-20,-10,-1,-6,18,7,-16,-1,-25,14,-10,1,-22,6,8,-10,-27,-13,-8,-11,2,-1,1,13,-15,-1,15,-5,8,3,-8,3,-3,7,6,2,-3,-18,17,-15,-9,-21,0,0,18,-16,-22,-9,-9,-9,9,-16,-12,-17,25,-11,-15,-9,11,-21,21,-14,-9,7,-12,-24,-7,-5,-2,0,11,27,-12,0,25,17,2,8,23,-6,13,0,-10,8,-12,3,-25,10,12,2,-29,5,-6,-18,16,6,-9,2,-1,17,-9,-1,0,14,-2,-1,-2,-10,0,-16,-19,-15,6,18,-12,19,5,-10,43,-1,4,6,-8,-6,18,-11,6,-12,6,5,17,-34,-17,-3,-6,-17,-4,4,-1,3,12,5,41,10,-16,-16,-15,8,-6,-8,-27,6,-14,-14,-11,-13,-10,22,11,2,4,-5,-7,-2,-11,-10,-19,0,14,-8,-10,-1,-1,-7,10,6,-20,-11,-8,13,-2,-17,2,-11,-8,4,1,5,-10,-18,3,-9,-1,-10,-13,-4,15,2,11,8,-11,4,0,-17,19,0,3,-18,16,-10,1,-14,-5,-3,-10,7,-11,0,-9,5,-4,-1,16,4,-2,-24,6,9,-18,-22,-5,-9,1,-9,-3,-14,-7,-3,-16,18,37,-7,1,0,-3,-4,-1,2,12,-7,-3,-11,4,1,11,-9,-1,16,5,24,10,6,-5,1,-1,6,-10,-11,-7,-1,6,3,12,-1,2,5,-17,-1,6,13,8,-4,-11,-24,0,-2,0,1,16,10,17,6,-6,-4,8,-1,16,-8,-10,-6,-5,-9,14,1,-22,-13,6,1,-11,-18,1,4,10,-1,0,-1,-25,-4,-6,-8,-2,5,-2,20,9,1,-18,22,-9,-8,11,-25,1,5,5,-12,7,5,-15,15,-4,15,1,3,0,-17,1,-21,-11,-4,4,6,-4,4,-2,-12,-18,-3,-9,3,-10,-23,-8,-14,-1,-14,14,-7,4,-7,-4,4,-19,-14,-13,-9,-2,0,1,-2,-11,-5,8,1,15,-6,-2,-1,-13,-12,-9,18,-2,-8,-1,0,8,12,-4,-8,-23,-8,0,6,-4,-8,-4,32,-28,-3,1,-9,6,-24,-8,-11,12,22,7,1,6,-4,1,11,3,-6,-6,-22,14,1,14,-3,1,6,-11,-1,-8,-9,-1,0,-2,-14,-13,-7,9,-4,9,-15,12,13,-22,6,-1,-21,-2,-25,9,-7,-13,-11,16,9,4,20,12,-8,-25,12,0,-6,-12,14,6,1,9,-4,1,4,-3,-2,-21,-7,13,-1,-13,3,5,-1,-11,2,-16,-15,-6,-35,3,-11,0,11,47,-1,-13,3,7,-15,-2,-1,-9,-25,6,-12,-10,-15,-6,6,19,-1,-4,-15,7,-13,0,-4,2,1,4,15,-24,-23,18,22,-3,0,8,-1,5,-19,22,1,15,8,12,10,-1,-5,-5,14,-31,18,-8,10,-19,-17,1,2,-3,23,-6,-8,7,-3,-3,-2,1,-14,-16,12,-3,-14,0,0,-7,27,-6,-15,-8,13,-1,-23,47,-19,-15,13,3,-5,4,-4,-2,3,9,-34,-2,1,7,-22,20,-14,0,-2,26,-5,-17,-4,-5,-5,-7,10,-16,-28,13,-7,8,-13,-12,11,0,1,-14,-13,4,20,-9,2,3,-14,-5,-4,-10,1,-5,32,-26,12,-13,-9,-23,6,-1,-10,5,-2,-4,3,-25,-5,-2,-6,-5,17,23,0,-6,22,-9,29,-8,7,-14,-5,-9,1,-8,-13,28,1,6,-5,6,-1,-6,-20,15,-4,-2,-5,-25,-6,-19,-14,31,-10,-17,4,-6,-7,12,-2,-8,-7,-5,-7,-12,-13,-9,5,1,4,-4,-13,17,6,2,-1,3,3,-23,12,-8,19,-5,-7,-9,17,-8,-4,-2,3,-1,-13,-30,25,-17,-25,9,1,-1,2,-31,6,-5,-21,-15,2,1,-11,4,20,-10,-17,5,-3,-2,6,3,12,0,-11,13,12,0,-9,-1,-13,11,0,-2,-17,5,0,-15,8,5,-6,-5,-3,4,7,13,15,5,-10,0,-14,-10,-7,-9,-9,7,-4,-15,-11,12,-7,-2,13,-28,-7,-14,-2,-8,16,-9,-3,-8,16,-4,14,1,-12,1,-10,1,-7,-16,-7,0,7,-25,13,5,-17,-10,-2,-11,27,3,11,-2,-2,2,10,4,-11,-9,-6,3,-18,22,15,-4,-2,-9,-7,-2,1,-5,-15,-3,11,-20,2,6,-2,4,-6,4,1,-2,13,-9,2,-5,-13,6,1,0,-3,16,12,-13,0,10,-18,6,-12,-14,0,6,18,30,-3,-19,-16,0,21,-15,-1,-4,-4,8,-10,18,5,-2,-20,-6,-8,8,-8,21,0,-16,-19,6,17,-4,-22,24,-13,4,-8,4,-27,14,8,5,-11,0,-10,4,1,21,16,1,0,6,-23,-5,-10,-25,-17,16,-12,15,-11,9,-2,29,13,3,21,2,-34,-1,-11,1,-14,13,-1,-19,-7,-3,-9,-14,-14,-5,17,22,-24,17,12,-12,-23,0,-3,-8,-3,-3,5,-9,-24,-16,7,-19,5,-7,5,-2,-11,-8,1,-10,17,18,4,6,-6,7,9,-12,20,-10,-18,6,-2,4,0,-15,-2,0,15,6,-8,-20,6,-4,-3,-7,-5,-10,0,-4,-19,-10,-22,4,-2,20,7,-4,-15,-2,-7,-1,6,8,1,-15,-8,-17,-9,-9,-1,4,-3,7,-9,8,-7,-11,2,-9,-4,-5,-17,4,15,-2,24,21,10,-1,-2,-13,9,7,-4,-24,-25,-17,6,12,21,6,13,-18,8,1,-6,-9,8,-26,-14,-9,-28,6,22,19,4,-6,13,-17,5,-11,12,-7,-21,-2,-21,-5,-7,21,-3,-17,-11,-13,1,-15,2,-6,-22,-1,-8,-7,-1,8,-6,14,-3,-17,-2,-10,18,9,-6,-15,-15,3,-5,-17,1,3,-10,10,-7,-4,27,-1,-1,22,16,4,4,9,-13,-3,3,7,-6,-13,-43,0,3,21,-10,-3,-2,13,2,-11,-12,2,-11,-19,-12,-5,-10,-18,15,11,-6,-10,-7,-4,-17,8,14,-4,-4,-16,-10,-26,-19,-11,-9,-8,15,11,-11,-5,-19,-4,9,4,8,10,8,14,-10,4,16,2,17,13,-13,-2,2,18,12,1,6,3,10,18,4,-14,8,-2,13,5,-3,-9,-20,37,-16,-19,18,2,-15,-21,5,-1,-9,3,-12,-11,-2,16,-2,1,-12,-6,-3,-3,-27,-10,-16,16,22,-13,22,16,-17,-8,-23,-1,7,-15,9,-6,5,-22,-6,15,3,-7,-1,26,9,-3,-6,-5,8,4,8,-11,-5,2,-14,7,7,15,-6,19,14,9,-5,8,9,0,2,11,-4,15,-8,-4,4,-4,-4,-8,-5,-5,-4,-20,-9,-5,-40,-40,-20,-3,60,-16,-24,-3,-25,-23,-20,-4,4,-10,-5,-1,2,-17,-18,-19,35,25,-32,4,20,-6,-25,-25,-10,2,-2,2,2,1,8,-25,-1,12,-6,-15,11,-7,-18,-5,11,10,-10,19,-8,7,-13,-6,-13,-1,-8,1,0,-23,1,5,5,3,8,6,-6,1,-7,15,-7,-8,1,-1,-2,-9,-2,-6,0,-25,-4,-7,-13,-3,-4,-13,68,-31,-6,-7,-27,-10,7,-18,12,-1,12,-11,16,-12,20,-7,-4,-9,7,14,-14,-10,3,-10,-9,4,15,-22,3,-2,-1,-25,-26,0,8,-12,-19,-15,1,7,12,3,-21,9,-3,3,6,-16,-11,-1,-8,0,-4,-2,1,-2,9,1,-2,-8,6,-12,-12,-4,-7,0,6,-10,7,-6,5,2,6,15,-6,-2,-7,-8,-7,-26,5,-23,2,11,-25,-15,5,-26,8,-8,-10,5,3,4,22,-7,-30,0,16,10,-16,5,-7,-13,-8,-8,-2,-7,17,9,14,2,-28,6,-2,0,16,8,9,5,-10,3,-7,1,9,-2,14,28,-11,-2,-6,7,19,6,-7,8,-2,14,9,-14,5,-6,-8,22,-12,5,1,-4,6,14,4,3,1,18,10,-6,-8,-24,-5,-2,-4,-5,-15,-4,-5,-1,-7,-14,-11,15,7,2,-2,-7,2,-3,13,3,-12,-11,-1,10,-9,-28,2,12,-18,-3,-12,0,-4,-17,-6,-5,14,2,-16,-1,-19,-7,2,-16,-8,6,-4,14,-8,-7,-15,-6,5,36,-8,1,-20,11,5,1,-2,9,-1,-5,6,-21,-1,-7,9,16,-5,13,1,1,-14,-17,9,23,-10,14,-6,-4,-13,-9,-16,-21,-13,15,-21,-3,-19,3,25,8,-14,9,1,-9,-12,3,8,-3,-8,23,-5,-13,-16,-11,-6,-14,-15,-10,12,-13,-3,-10,19,-9,-18,-11,-9,-8,-7,-9,-12,-3,-10,-2,-22,-6,4,3,-10,3,-4,-1,-23,6,-2,-7,14,-6,14,10,-17,-13,-13,-13,-7,-11,-4,-6,-5,8,-12,26,-22,0,-9,9,-10,-27,-13,-4,3,0,11,-11,-1,25,0,24,-2,-5,-19,27,13,-15,-11,1,38,5,-3,8,-8,5,-14,2,-19,-6,0,-8,39,-10,-13,11,2,19,-11,26,-18,-12,0,4,-20,-4,-10,-16,-10,-17,9,-6,-3,-7,-1,-4,-19,0,-1,15,4,9,8,5,-2,-2,-12,-3,-9,-14,-4,-4,0,-9,-3,4,-7,-20,-5,12,5,-13,10,-3,8,2,24,-8,-10,-6,6,-20,11,-15,-9,21,1,-27,27,-1,-12,-17,10,-1,-10,-4,16,-7,9,2,22,-3,1,18,10,24,4,-1,-8,1,15,-7,-9,-13,-4,1,10,-8,-8,-3,12,3,-9,1,-3,-11,13,0,15,2,3,-6,0,2,-1,11,22,13,-13,-25,21,-1,-7,13,3,-9,7,-15,5,-29,-4,-37,-7,-18,-2,11,-2,17,15,-13,10,-18,4,-18,-8,-6,11,-1,-2,-17,2,10,-6,-21,2,-11,1,17,-4,-12,-12,-4,5,16,-9,18,4,1,8,6,-7,-6,-7,6,7,4,-1,-1,2,1,6,20,10,-26,2,-5,-3,29,-4,12,7,11,5,17,7,0,-9,-13,2,0,-11,-10,-16,-15,-1,9,-5,13,3,16,1,13,-1,-1,8,10,4,-12,-13,6,5,9,-11,-14,8,-3,12,9,2,-12,8,3,-2,-9,-12,-11,-2,-9,18,-22,11,-10,5,1,4,-10,-20,8,-1,-13,1,-10,-4,-7,-6,-12,4,-5,-12,12,-13,-25,21,-10,-15,9,-5,20,-2,14,-19,-9,7,-12,-2,-10,-5,-39,13,2,-2,-16,7,-15,-11,-6,-4,3,0,-15,-7,-7,-10,-11,-18,-5,-9,-21,-12,2,5,-9,6,11,-8,-9,5,-11,3,-4,-10,0,6,-21,5,8,19,4,-18,27,-25,-6,15,-4,-17,-23,-14,-7,-4,-10,13,-15,4,6,-5,1,-18,-8,-7,2,-10,-1,6,10,-1,-6,-22,3,2,5,-16,-11,-4,0,1,-10,-3,23,-14,-6,7,-30,3,20,-15,3,16,-22,5,-16,-7,-8,7,9,3,-20,-8,-14,4,4,-4,-15,22,-1,-11,-5,6,-5,4,9,12,12,7,-13,22,0,5,-3,17,11,6,-14,-5,0,-3,-12,-13,1,3,24,18,8,-19,13,-8,2,1,-20,-11,-1,26,26,-16,6,1,5,-8,-7,-2,1,17,-17,-33,8,16,-30,16,89,-24,13,8,-16,-3,8,16,14,15,-24,-14,3,-16,-9,4,-6,9,-1,9,1,-13,3,-4,-5,20,-9,5,16,-9,-10,0,-1,16,5,5,-5,-19,-8,-4,-8,11,7,14,39,-9,14,-13,-14,-5,12,-1,13,-5,2,-16,6,-10,-14,23,-17,1,3,-2,-18,-13,-8,-20,18,-2,-19,16,-1,5,-12,0,0,24,-19,-26,11,0,-12,8,-3,-13,-9,0,8,-13,3,10,15,-3,12,2,-3,-2,19,3,-9,15,2,4,-4,-3,2,4,-11,4,-2,1,-5,-5,7,-6,-1,4,-1,-2,-8,-10,-14,-6,1,-8,11,-10,-9,-17,1,-3,-16,-8,0,15,4,-13,4,6,-23,8,3,9,-19,-6,-6,1,-9,1,-11,11,4,-7,-3,-3,-6,6,-14,18,-6,-1,0,-3,-4,4,4,-6,-12,8,6,-17,0,-8,-5,-3,-2,-15,-6,-8,8,4,-4,-20,6,-5,5,0,0,-15,-6,-1,-10,-7,-12,-10,15,-7,-23,-36,-5,-9,19,19,-5,-14,-13,-11,21,-15,-21,-21,-16,6,5,0,-10,2,11,16,-4,-16,-11,-1,6,-5,-10,-2,-15,-11,18,2,5,1,-3,5,-1,-6,2,9,0,-17,7,-16,6,-16,-15,6,20,2,-2,-17,-2,12,-9,1,11,-28,-4,4,-11,7,-8,-4,-2,1,-2,-7,1,11,4,0,-4,1,-14,-3,13,4,-9,-23,17,-10,-5,-23,1,4,-8,3,2,0,-2,9,16,1,20,-12,8,-11,-2,1,-3,6,-8,-8,12,-13,4,-5,5,4,3,-10,7,-4,-2,10,12,10,-4,0,34,-7,17,7,15,0,-9,16,0,-8,-12,-2,-6,-7,-7,23,-8,-25,-19,15,-33,-5,3,6,4,-9,1,-11,-5,-6,-2,11,-8,-4,-31,11,0,-16,-1,30,-7,-10,-9,-22,-15,-9,1,-2,-5,30,-23,13,1,2,15,5,-1,4,-17,-11,2,7,-9,10,-2,2,-5,-9,-11,-17,-2,-1,-2,14,-2,-7,-6,-4,-12,3,16,-1,-1,2,-1,-17,9,-24,4,-12,-22,1,-5,-28,-29,18,-12,9,-11,7,-25,3,-1,-28,-3,-13,-7,-12,1,-22,-11,-7,-15,1,-24,13,-9,-19,13,20,-18,-13,1,-22,-9,-22,-4,-32,-22,45,-9,14,-6,-3,-5,31,-4,-13,-12,-24,-1,-6,9,0,-11,-9,-1,7,-2,-3,-5,15,-1,-8,-13,-16,-3,0,-3,-6,-5,-6,-8,-8,-13,-9,31,-8,11,-8,-16,5,-11,-4,7,28,23,13,16,26,-22,11,6,-13,-1,-5,6,-8,-2,-15,-7,-8,16,10,-4,4,-24,3,15,-23,7,-9,22,-5,7,0,18,-24,-12,12,-4,7,-4,4,1,-23,3,-12,-5,-6,0,-1,31,-16,10,0,7,11,19,-18,-4,0,10,-4,-19,-2,-18,6,7,16,3,-9,-5,10,3,0,-10,18,2,8,-10,-13,-15,-6,-2,1,8,6,0,-2,-3,-10,23,6,-5,-2,-1,2,23,0,2,-15,-9,-3,-5,10,-20,4,-4,-21,5,-17,34,17,-10,-2,-5,2,-6,6,-3,2,-9,-9,1,-14,-7,-21,-7,-9,-6,4,-15,-6,10,-10,10,7,11,-26,-12,-10,9,-24,4,-20,10,7,7,13,24,8,-3,17,-14,11,-2,1,-19,9,-16,-8,-6,-9,-10,-8,-4,-1,-8,-2,8,-3,12,26,2,7,-5,-11,-3,-12,-6,-2,-14,9,-4,-1,4,-1,1,-2,0,-3,11,-9,-28,6,-10,-1,-20,3,-6,0,10,-9,-23,-20,-11,2,-11,-12,-5,-11,-26,3,17,-30,-2,-3,9,13,-23,-8,4,-9,-8,-11,-1,2,-12,0,16,-12,0,1,-12,-15,-13,-13,-28,14,9,-19,-16,0,-5,-12,2,-7,-1,-7,5,13,27,7,9,20,-27,-22,-7,-7,-23,-17,1,-2,9,5,-6,9,30,29,-10,19,-20,11,-19,-9,-4,-20,-17,27,-5,20,4,4,-5,13,-17,19,-30,-3,-23,-2,13,1,-6,3,-15,-29,5,-13,-18,30,21,34,-15,0,-12,40,12,1,-13,-11,-4,-20,-8,0,-8,-11,2,-8,-4,-10,10,-2,-6,-14,3,-9,0,-5,-4,0,-20,-29,-12,6,0,-1,10,-4,-10,-16,22,-14,3,3,-4,19,11,0,-13,34,-14,1,-4,-7,-15,-18,-8,4,-7,12,18,19,2,5,-6,11,0,-5,-13,-14,-6,-1,-4,-6,-7,-9,3,1,-4,23,-6,21,6,-1,2,10,-1,7,-9,3,6,-3,-10,13,29,-1,10,-12,8,13,-15,2,1,15,1,-3,-8,-12,-20,-15,-19,-11,8,-9,-9,19,-6,3,-4,-21,-13,8,-12,-5,-9,-6,-7,-7,15,17,-13,10,-4,-8,-22,-8,10,8,-7,13,20,16,11,2,1,-10,5,-5,-13,-14,-5,-9,-3,-5,-15,8,7,5,12,11,6,-5,-3,0,10,4,-14,6,3,3,-7,8,-7,-9,-19,-13,-13,-4,-4,-7,-4,1,-11,-8,3,22,11,-4,-3,26,-29,-18,3,-7,2,14,5,2,4,9,6,-6,3,3,4,-14,27,21,-13,4,2,6,10,-3,22,9,-5,-16,18,-1,-18,-17,22,4,-14,5,20,10,-10,-1,27,-10,-18,7,5,-7,8,2,-3,6,-7,-5,-3,10,3,-10,-13,12,5,-3,-1,3,7,6,-7,-11,-9,0,4,-8,-16,-2,5,-15,-30,-2,0,-14,2,2,-16,-23,5,3,-3,-8,-27,-13,-11,-11,-8,-6,-14,-31,6,-12,11,0,24,13,-2,-2,-7,-23,-23,9,3,-16,-20,-11,-31,-9,22,0,8,-5,6,-19,-14,-2,7,-4,-13,-8,-15,-7,-12,-17,28,-8,-8,-4,-8,-10,-15,-11,-3,4,20,-18,-4,-11,1,-9,-11,-31,0,-11,11,28,8,-5,-5,1,15,7,4,-2,21,19,-18,-22,8,9,-2,14,11,-1,5,-7,7,1,-2,-10,24,-12,-3,-19,11,21,-1,15,27,-7,-20,1,-15,-6,-13,-10,-2,-13,18,1,-5,-7,2,-5,0,-17,8,9,-8,12,1,-8,-5,-13,8,-2,2,3,-6,2,4,-12,-12,-1,7,7,-2,5,9,3,14,-9,-12,4,-12,-16,1,10,-3,10,-21,-9,-8,5,6,2,1,-10,2,2,-2,12,-20,2,6,-7,0,14,-7,-4,13,-13,2,-10,5,14,-1,-6,8,-12,-22,-11,-12,-15,-5,-17,-14,-8,18,9,-21,1,-9,-3,-13,-17,-3,-12,4,-17,5,5,19,-12,2,1,-1,-2,-7,3,-16,4,3,-8,-4,21,1,6,16,-28,7,0,-18,17,-17,-2,-2,0,-10,5,17,-16,-9,16,-9,-1,-3,-5,-4,5,3,19,5,16,11,-10,17,-8,8,-5,5,-4,-1,-14,12,-20,27,2,-6,0,-20,0,15,5,-4,-28,-18,1,10,2,-5,-12,9,-13,-3,-12,-15,9,25,-16,4,-3,-4,-12,-3,-23,-2,-12,3,-2,16,8,-7,0,-5,8,14,-7,13,-21,-2,-22,5,-5,0,-11,-15,-7,14,-30,25,6,1,25,4,-15,3,-7,-9,-5,15,-12,10,-1,6,-9,9,-2,17,7,4,-24,2,11,-11,-23,4,4,-8,-14,-13,35,-7,-15,-12,-6,3,-23,7,13,-4,-3,15,-1,-9,-7,-3,19,-12,-13,4,-9,-1,-18,5,-20,6,-1,4,3,-4,0,0,-27,-16,6,-3,-4,-7,0,2,-19,16,-1,-2,5,-5,-7,13,-15,7,10,-28,16,1,17,9,-4,-6,4,-7,-6,11,-18,-18,-13,13,10,-16,23,-11,4,2,11,-24,-8,5,-16,15,3,-22,2,-9,-10,-18,-16,-24,0,0,17,7,-12,8,-6,8,14,-7,15,14,-18,12,-19,-5,-4,4,-9,15,-15,4,14,-7,17,1,-4,11,15,8,-11,9,-12,-8,13,1,-13,9,8,3,3,-3,-25,-21,5,3,13,16,4,3,-16,28,-11,-11,5,2,-37,-13,-2,-1,42,-16,28,6,-11,-7,-1,-5,-2,-10,-15,21,2,6,-4,-4,-7,-25,-20,-41,9,-18,-9,22,7,-10,0,-2,15,-18,-8,39,-8,19,-25,17,-4,-5,-6,5,6,-12,9,-2,-28,-11,-17,11,8,-2,-13,0,-15,0,-13,7,-6,10,4,9,7,0,-7,6,-20,1,-18,2,9,2,-20,-3,-12,-16,14,-12,-16,-6,21,-7,-11,26,-16,16,-1,-10,0,-24,-7,-11,-17,-27,-6,3,-21,-15,-21,-4,-22,-34,-1,-27,-17,-13,2,0,3,-8,-11,8,7,11,8,9,-5,3,-26,13,-3,9,28,-13,8,15,-15,0,-14,-19,0,5,11,15,-3,8,-13,-9,-5,-7,-11,-11,6,-7,3,2,-4,5,-18,13,7,17,1,-16,-4,-4,5,-18,-27,16,19,9,9,9,7,2,0,9,4,-29,-15,4,-11,-14,8,1,9,-24,-2,-6,-33,-34,11,-8,6,-1,3,22,-11,-13,28,0,11,-19,16,4,1,5,-25,-8,14,14,2,-9,6,6,-2,4,-9,2,-6,5,27,6,-3,4,1,-8,14,-8,-3,-17,-10,-13,-4,-6,-14,-11,8,-20,-4,-5,29,-13,33,-3,-26,-3,-8,-14,-10,-24,12,-19,-8,1,12,1,26,16,2,-21,19,4,0,6,-7,-5,-2,2,3,5,7,-10,-23,8,-23,-3,-12,-3,5,-6,3,6,-4,16,5,1,-20,-13,-15,-27,14,-3,-1,-6,-4,1,25,21,-5,-2,-6,-3,0,19,8,0,17,4,-26,-13,-9,-11,8,-2,-7,-12,-2,-15,9,6,15,-4,23,5,-10,18,-6,2,-10,-7,-1,3,-9,-5,2,9,4,-3,-24,-9,-17,14,13,23,-15,5,3,-8,-5,-2,-1,-10,-25,-17,-18,14,-21,-17,7,-3,-6,9,-6,-9,2,-5,-4,-13,-13,-11,-3,-6,-9,-3,1,18,2,0,4,-3,-15,-12,3,-6,1,6,6,4,-19,16,-2,-4,26,7,-20,26,-20,-13,36,-4,-9,8,5,-9,13,9,38,-5,-1,17,-21,9,-6,0,-4,7,-21,9,-5,-9,-7,-6,31,-6,2,16,-2,-1,-15,-29,4,18,8,-8,2,10,0,-15,5,-18,-12,-7,-24,-11,-24,0,-13,2,10,8,7,-2,-6,2,11,13,-1,0,-11,-11,-8,-8,-17,-12,-17,40,-2,-16,-20,-3,16,-21,-16,-12,-3,11,-11,-20,-8,1,-5,-8,-25,-27,11,8,-20,-4,-17,8,19,-14,-1,9,-5,7,-13,-5,7,-8,2,-9,-10,-19,18,-1,31,-16,-5,-8,11,16,6,-22,-2,15,-14,-10,5,-7,-7,-13,-2,-5,-3,12,-11,15,17,-19,9,9,12,-27,7,-3,-15,-20,-18,-12,-7,-12,-18,-29,-21,-4,-21,-9,-13,-3,26,-12,-6,0,3,34,4,-4,-4,-12,-9,-8,2,-34,1,-14,-26,13,1,15,18,9,17,25,-23,-8,-17,-5,3,3,9,-11,-18,1,6,2,2,-1,2,3,-13,0,7,9,14,8,-4,8,5,-10,-5,-4,-1,9,9,19,5,18,9,14,17,0,3,19,-10,-7,-16,-22,-15,-2,-8,8,10,-16,-10,2,-16,-23,-10,-28,6,0,-18,-15,5,3,4,12,1,4,2,4,-13,-14,0,-7,-1,3,-15,-13,2,-11,-9,-7,1,5,-3,0,-12,15,-2,-10,14,-3,-21,-2,17,-3,-13,1,-22,2,4,14,1,-1,25,-10,-20,1,35,3,-11,2,-2,2,-4,-17,-18,-6,32,-11,-9,-13,-13,-2,24,2,-7,-1,5,15,-1,-10,-15,-18,-17,-16,19,-16,7,19,-11,-13,7,9,1,7,-2,1,6,17,-14,-9,15,-13,5,20,2,-6,-1,-7,-9,-28,-13,0,-2,10,-17,1,4,-12,15,16,-16,13,-6,-9,4,-4,4,6,-7,-3,-3,0,3,-19,-1,19,-7,16,4,0,-4,15,1,-16,3,9,13,-3,-11,-19,0,1,-15,18,-15,-9,0,3,-22,8,13,-24,-4,-10,-8,10,25,-2,9,-11,3,22,9,-9,-1,16,9,7,-7,1,-7,11,-17,-14,-14,-12,-15,-2,-1,-19,0,-17,7,-9,-23,29,4,-9,0,4,-15,-29,0,10,-17,-2,9,-12,3,-12,-9,7,34,-1,24,30,-20,-1,17,2,-18,9,-11,-14,2,11,9,8,12,6,-3,0,-28,-5,-16,-13,-3,1,-19,-5,15,-8,-4,-20,-8,7,19,11,-5,11,5,9,-7,-16,-21,11,-20,-11,-9,-29,-2,12,-8,-17,6,0,-1,-9,3,-23,-8,-19,-17,-16,4,10,7,22,-13,-26,-3,-13,-16,-11,-6,-19,16,-4,2,5,3,10,8,3,0,-2,-20,-14,4,3,-21,-6,11,5,-13,8,3,-9,-9,17,-3,-1,-5,-22,-3,-10,-3,1,-1,-8,-3,0,3,7,2,-1,20,-6,-14,11,-14,3,6,-8,-5,-7,-10,4,-14,-10,-4,-4,-18,16,-7,3,7,-8,-2,-2,-23,7,-14,-12,11,1,-6,-6,1,11,13,5,-4,6,-4,-8,-8,12,-9,1,4,-20,-21,2,15,-6,23,9,2,7,-5,-6,14,17,8,1,-10,-9,-22,3,13,-3,9,-3,-7,10,-4,-1,8,-7,-32,29,-13,-20,2,0,-4,-2,4,-15,16,9,19,6,-9,-17,4,36,-11,4,17,29,-10,-24,-4,24,6,19,4,-19,11,-4,0,-20,-17,-10,-14,3,-17,-5,-1,6,-5,22,-11,8,-5,-4,-18,-13,-12,-4,18,-13,-14,-9,-6,15,1,7,-8,25,-10,-10,1,5,7,-3,-2,-24,-2,-6,8,7,16,0,-11,5,12,-18,-33,-34,1,-13,-11,3,19,6,-4,-13,11,-5,-6,12,-5,-13,-5,-9,13,4,8,24,-8,-17,-15,1,-13,-17,-6,-19,9,-12,-3,-13,-1,4,-24,6,4,-5,1,2,-2,-34,4,-18,-3,-14,-7,-14,7,-19,-6,-3,-6,-2,12,8,-15,-14,3,-1,1,-2,-3,12,12,-6,-25,-20,-4,-4,10,-7,-21,-20,-2,-14,21,14,-3,-29,5,-11,-18,-18,5,4,-4,13,-19,-20,-18,48,-5,3,9,-11,23,0,-23,-14,-22,3,-30,4,-24,-21,-16,1,1,11,2,-4,10,5,-21,-25,18,14,-27,-3,-11,-4,-20,10,-10,19,4,-5,17,4,-2,22,11,0,-5,-4,-9,-8,-4,22,1,29,0,23,18,-2,-7,0,-11,-8,-12,1,7,-6,8,3,-15,19,-7,-11,-19,-11,-6,-23,-14,7,-15,-5,-23,-7,-22,-2,-4,12,12,-9,-14,-5,-12,-5,-9,15,-18,-8,-16,-9,-20,-15,-18,-7,5,-11,-3,-7,9,-1,8,-2,-18,-4,-16,4,11,-8,-11,-10,9,-12,-12,14,-5,5,-1,-1,8,-6,4,-13,-13,9,-2,6,15,-7,-5,3,4,-1,-6,-7,0,5,1,6,3,16,-8,27,-12,8,1,-6,0,-10,-8,10,1,2,-11,-2,2,-9,-3,6,1,15,2,-16,-15,-1,-7,7,-14,8,-6,-11,-15,-8,-12,-6,-5,4,3,-13,7,-8,21,7,-4,4,6,15,16,-14,3,1,16,-10,-10,2,-7,7,10,7,14,-3,13,4,8,-7,18,9,13,12,-2,-5,-7,-5,-2,-9,-12,8,11,-14,0,-15,6,4,-2,11,-2,-5,-3,-4,-27,-4,14,-8,5,-20,9,6,-7,-1,-1,22,-1,-11,13,-10,-23,-1,1,-11,7,-21,10,8,-11,-8,-12,24,2,-18,-10,-12,0,-9,-13,1,-21,-16,-14,4,-4,-17,-10,-7,-9,-7,-6,7,6,-3,-8,0,-5,-7,0,9,12,1,-4,8,-7,-2,-5,21,0,3,-6,15,-6,-14,-6,5,-3,17,-10,13,1,-5,11,14,5,-13,17,1,11,8,4,9,-1,-10,-7,17,-3,-19,11,6,4,-7,31,-7,-13,4,1,5,12,-10,-27,2,18,-25,0,-13,-12,-6,10,-10,-8,10,-8,17,-5,-21,-11,-13,0,-1,5,-5,20,-1,12,-8,-13,0,1,0,14,-14,9,0,1,-18,-2,6,-3,-13,7,18,-8,7,28,3,18,15,-1,-12,9,-10,8,3,12,-10,-7,-3,-12,2,-12,13,-12,5,-6,8,-10,-12,4,-18,-2,-7,3,3,-19,7,-17,-12,-23,10,-20,-19,18,23,8,-31,-15,19,-12,-27,2,-6,-22,-23,-16,-6,13,-12,-7,34,11,-25,15,15,-2,-6,5,-11,-11,-13,-9,-19,13,-1,12,12,6,-27,-3,-12,-15,-7,24,6,8,0,-11,-21,8,-6,-13,-9,-7,-16,-13,12,-14,3,-17,3,9,-13,9,-2,2,-9,-17,-6,0,-5,-16,9,-3,14,-4,9,3,-20,-7,17,-6,-22,13,-2,-6,-9,-8,21,-10,-10,-3,16,-10,-13,2,2,8,-11,-7,0,-11,-14,-2,13,-3,-7,2,14,-22,-15,4,1,5,3,6,2,-7,-10,-4,-5,-16,-16,11,-6,-5,-10,-24,1,-14,8,9,-23,9,-10,-12,-7,0,3,-8,2,13,10,-12,4,-19,-1,18,-8,9,-5,-5,-1,2,5,25,9,17,8,-11,15,-4,0,3,0,13,-1,-1,-12,-16,-11,-10,17,-2,5,11,0,12,-11,-10,-4,12,0,11,5,-1,-11,9,9,-15,1,-11,1,5,0,-15,0,14,2,4,-1,-7,-7,29,8,-15,10,0,-14,3,8,-7,12,-3,-24,3,10,0,-21,-21,-13,-17,-33,-30,22,18,-6,-7,13,-29,-33,-11,3,-6,-17,-13,-12,-18,-29,-23,33,13,-4,-16,31,17,-12,8,31,15,-9,-3,12,-17,-21,-19,4,-10,6,3,-3,20,16,10,-4,5,5,14,4,-9,6,-14,-10,-5,4,5,-29,-7,-3,-2,-17,-20,1,-13,9,-9,24,0,-7,15,1,-5,23,-1,-12,17,9,-19,-7,-1,-4,-18,-27,-7,2,9,-5,-8,2,-1,-5,18,1,-23,-1,-9,-5,-10,5,-7,5,-15,3,-20,-15,5,-14,26,-7,-9,-8,-11,-12,-15,-1,-2,9,-8,12,-11,-17,6,-12,-8,-12,5,12,7,-16,-6,-6,7,-6,6,12,22,-3,0,-17,-10,8,-9,18,2,6,7,4,-8,-15,16,-5,-2,15,14,-9,3,13,-5,2,15,-7,-10,5,-4,-8,-6,-2,-5,-7,2,19,12,5,-7,-6,3,10,-11,3,15,-16,7,-2,-1,-14,-1,-4,17,10,4,-20,-2,2,-13,-15,-11,-33,-9,-12,-13,-7,7,13,-14,-13,16,3,-9,-3,12,-3,-6,-10,6,-7,9,13,11,2,2,4,-11,-8,-16,-10,3,-8,18,-18,0,-4,14,0,-5,6,1,2,-16,-16,9,-11,-10,8,1,-3,-1,18,5,-23,8,8,2,1,-19,-4,1,9,-6,16,-4,6,-4,5,-7,0,-6,2,14,10,-2,-8,12,1,-30,14,12,-7,-11,-7,-8,-7,8,4,-23,-15,1,-10,1,-8,3,8,2,-13,8,17,7,-7,1,4,-9,1,-16,-23,-10,0,5,17,-2,-7,13,9,7,-2,-7,-3,15,4,-23,-16,-7,-6,-16,-15,14,0,-6,0,5,-11,12,-9,-2,-4,-5,-11,-3,-7,-12,0,-12,2,-12,4,15,-7,25,0,11,7,8,12,23,15,-18,2,-4,5,-16,9,-3,-13,0,-7,-14,-19,16,19,7,5,7,-3,-2,-5,4,10,-3,-3,-15,9,-8,-7,5,-4,-2,-3,8,5,-4,0,-9,-4,18,11,-2,6,5,-11,1,-8,33,2,-12,1,-5,4,-7,24,19,-14,15,-17,-1,-7,4,-15,-8,-6,-4,-16,12,1,17,5,-2,-19,0,-4,-15,-13,-1,4,-9,-8,-7,-14,26,9,-2,6,-5,-16,-1,4,-23,-15,-22,-12,-13,-2,-8,4,-13,15,4,2,-11,-5,11,16,-10,-8,-13,-12,4,-7,-21,1,-8,10,17,-2,-9,24,11,13,-4,-8,1,4,-2,11,6,-4,6,-11,-7,-10,15,6,-11,-17,-6,1,11,-12,-3,-5,-15,12,12,-18,-8,-10,-3,-25,-16,-4,-19,2,-13,3,-8,-19,-5,12,9,-13,-8,-4,-1,-8,-2,1,-14,-2,-7,2,-14,-7,-19,16,-9,0,5,-13,-4,6,7,5,-6,3,-1,0,1,-14,-7,-1,-6,-3,20,3,6,10,-7,3,-7,11,8,3,-14,4,8,-4,6,-8,-4,-14,18,5,-2,-13,-3,-5,-13,-3,-10,-2,-5,12,-9,2,2,-10,10,-27,-2,12,-20,-6,-9,3,-9,-16,3,15,4,12,-3,-12,-5,-19,-20,-4,-2,-9,-19,-1,-27,-14,-10,2,-6,27,11,-20,3,-24,-1,2,-5,-1,-16,8,-15,-13,4,-5,2,-7,14,8,8,-25,-10,-8,2,-13,-14,3,5,9,-7,25,2,-19,-3,-9,-3,-27,-12,-18,-6,-16,-18,-1,-7,-3,-7,8,6,-12,7,-15,-1,-29,-7,-10,-7,-2,-4,6,-23,-10,9,27,13,-12,-8,-6,7,-27,-37,-8,15,-1,-10,15,-19,-6,-5,5,10,-14,-1,-4,13,-10,-30,-19,39,-12,-16,8,-19,11,-5,9,5,12,8,9,0,-15,-13,15,3,-2,-7,10,15,-4,-7,2,-3,14,-1,-9,-5,4,-6,-18,-14,3,6,-7,-5,-8,2,-10,1,19,-5,-31,-17,5,-3,3,4,19,10,-8,18,-1,23,10,8,20,-11,-12,-13,18,-11,-10,10,11,17,1,5,2,-9,7,-12,7,-19,2,2,26,-13,4,35,-1,15,-2,-20,4,-25,-4,-4,8,-18,-5,-29,-16,16,-23,8,1,16,-15,-2,24,-1,-1,-2,8,-24,-3,-11,-8,-11,-12,-5,3,0,-19,-11,8,1,-22,-21,4,-18,2,-7,-20,-24,-32,21,18,3,8,2,-17,3,5,-25,-2,-11,-13,8,-19,-5,-28,4,24,-3,-1,-19,-7,-10,-14,-9,9,-10,-6,-28,-16,-10,-8,1,3,1,3,-2,5,10,2,-8,-16,-16,-6,-10,-2,17,-5,14,-19,10,17,-6,-6,-14,8,16,15,-9,-16,5,-1,-4,-9,-3,8,0,5,-2,-3,-10,-10,1,10,-14,-8,-18,-1,-12,-15,-2,20,9,-6,0,-2,-1,3,-18,-12,-11,-18,-1,-27,9,1,-17,3,-6,13,4,-10,-12,-10,-10,0,6,2,-18,-6,1,0,-13,-7,7,4,5,-2,2,15,-8,27,5,-12,-14,3,5,4,-14,-14,10,-20,4,8,2,31,-10,8,-5,-3,-1,-12,-6,-17,-1,-9,8,14,-1,3,-4,4,-1,16,-5,-5,-4,10,-15,-14,10,11,15,12,1,-3,1,-21,9,15,0,-22,14,-5,2,-1,23,-17,8,18,-7,-12,7,-1,-8,6,6,12,14,-9,5,-5,13,12,6,-8,12,3,-8,27,1,-10,15,-5,2,-4,-11,-20,-21,1,-19,7,-3,-6,8,5,14,10,-9,-15,-6,-6,-3,-22,-18,14,-6,-16,8,4,29,-12,-14,-12,-20,0,3,5,-28,-12,18,-9,-14,1,-8,-15,-11,-34,-11,-23,-9,-12,-13,13,-17,-9,26,-6,-13,1,-9,12,-3,-16,-21,-16,10,-11,5,14,-13,-15,9,-2,-18,-21,-10,4,-21,-6,6,8,13,20,20,2,8,14,-8,0,13,9,0,7,1,-21,10,3,10,-17,2,-10,9,-12,-10,32,18,-10,11,-2,9,-7,-11,10,-3,5,-4,-14,16,-14,3,-12,17,1,-3,6,-3,-8,-29,5,0,3,-7,-4,11,-13,-18,-12,19,6,-4,7,-17,-15,-15,5,26,13,-2,-2,-1,0,16,5,5,-8,4,4,-6,-1,11,-3,-11,-2,8,-10,-9,-1,1,-11,-2,-6,-7,3,-14,-7,3,19,4,14,7,-18,-11,-5,-12,10,-8,-8,1,16,0,23,-26,12,13,-23,21,-10,-13,-13,2,3,6,3,1,-16,-20,20,-12,-5,4,4,-1,-6,10,-11,16,5,-2,-10,-1,15,-23,-7,4,-13,-2,18,-22,-19,-5,-9,6,-10,4,-20,-1,38,-5,-5,7,-24,-4,15,-20,-13,19,1,21,2,4,8,6,1,-10,-13,18,4,18,15,-1,-12,-7,-5,-12,7,-7,-1,1,10,-10,-8,5,3,14,-9,-7,-3,-11,1,-7,-11,3,-19,8,-1,-6,0,-10,7,-4,-4,-13,-3,3,-3,3,3,-7,-18,1,-8,-15,-1,-4,-3,-12,4,-5,-10,0,-3,-8,-5,0,4,11,-12,20,-5,9,-18,-31,6,-15,12,6,-21,6,-11,-9,33,9,5,18,6,-15,23,9,-28,-1,23,12,-6,-10,10,-15,19,22,-16,-2,-21,2,5,29,-32,11,0,9,-2,-2,4,-14,-1,7,-9,6,-24,15,16,-1,-24,0,-2,-5,-3,-2,34,8,3,8,2,-6,-2,-1,12,-33,8,16,11,-6,-2,-8,2,-13,13,2,2,8,-5,-3,-15,-34,-8,1,-6,-17,-5,-29,-13,-9,48,8,-12,-4,4,-7,3,-10,-13,18,-14,-9,-5,-13,-11,-2,-6,-2,-1,-13,-18,-7,-14,2,-30,9,-18,6,7,7,-6,-4,-2,-1,-1,0,8,-2,-17,-10,-7,-2,2,11,1,4,12,10,2,-6,7,-12,-5,-2,3,-20,4,8,-1,10,6,0,-12,-10,10,-17,12,4,13,-25,-13,-11,-1,17,-17,-10,13,-11,-3,0,-22,-6,-14,-13,-17,0,-6,2,14,20,4,-19,-8,-16,-2,-8,-28,-20,5,-10,-7,12,27,9,-8,-4,0,0,15,7,1,-13,-20,-21,1,-7,-4,-8,18,15,-10,-11,17,10,8,-4,6,10,-6,-3,-2,-19,2,-17,6,5,8,2,7,13,4,-14,-1,10,1,-6,0,-10,26,-18,1,6,-18,-8,-10,4,-18,-5,9,0,-8,-5,4,1,-3,17,-10,15,-15,-8,-1,-1,9,1,-16,-4,-18,10,-4,-32,-3,-4,-1,5,9,14,-5,-3,-10,1,-3,-10,-14,3,-1,2,-5,-11,-7,-5,-7,-4,-1,-3,-10,15,7,8,7,7,1,-12,19,-7,-5,10,13,9,13,-21,-6,-6,-15,-1,1,20,-21,-9,-18,1,9,-5,1,4,0,15,-5,-12,7,4,5,0,3,9,-15,-10,2,3,6,8,14,-15,3,-6,-17,-11,-15,24,-5,-11,-15,-14,-1,1,1,-1,-2,-23,-18,-4,-5,-5,-23,17,1,10,-20,-23,-7,-7,-18,-23,-1,0,-3,10,-3,10,-6,28,2,-8,7,-6,6,0,33,0,1,-12,-4,-5,-8,8,-6,18,-3,9,-10,14,-23,6,-2,11,6,7,-2,6,8,-2,-17,-11,-11,12,27,-13,-9,8,12,6,-4,-12,4,2,-14,-4,-5,0,-18,3,17,-15,-1,10,24,18,9,-1,17,6,-1,-6,-13,8,-26,14,27,-11,-2,-1,-3,-21,-2,-1,7,6,-20,-5,-20,-14,-14,-7,11,-9,-4,-30,20,-5,-7,-18,14,1,-3,21,-19,3,16,7,8,-25,-15,-22,4,-8,10,-17,-15,7,0,-8,8,-3,-12,-7,0,-5,-7,-11,17,3,14,-7,-13,-4,23,-12,-2,-13,-11,16,-8,-12,-2,12,10,12,-7,-18,-1,-13,20,-23,-5,-35,-9,12,-3,-23,-1,-6,0,7,1,13,-11,3,11,1,13,-12,-24,9,-12,2,11,-27,3,-7,0,3,-10,9,12,4,-13,7,-2,-12,15,1,5,-18,3,13,-7,-2,-17,-7,-9,19,-2,-1,-13,11,-18,-16,12,-12,5,11,-9,2,-4,-12,14,15,-9,-4,-10,13,-8,-14,12,-7,28,1,-14,-11,-7,0,9,3,-22,-9,1,12,-8,-26,-6,-22,8,19,-10,-4,12,9,-1,-13,1,6,-1,-5,-2,-8,13,-16,1,3,2,16,-2,2,-9,17,4,-7,-8,5,11,-8,-3,-12,-13,-4,29,4,1,-32,0,-19,3,-7,-15,2,-1,-12,-1,-8,2,-19,-6,22,15,-10,1,13,-13,-12,-2,16,3,-16,-4,-4,13,-14,-3,-2,9,4,-5,-6,-13,-6,-5,-8,-6,-22,0,1,-5,-14,-9,-3,16,3,5,-10,2,-4,0,-6,-1,-17,2,9,11,-15,-15,-8,-5,-16,-2,0,-1,23,15,-23,7,8,-1,-28,20,-11,-2,-25,15,-17,-6,6,19,-6,-13,-8,5,-19,-1,-28,0,-11,-6,2,15,-2,-15,10,28,10,0,5,4,-6,1,4,7,11,34,-27,-12,2,-11,-15,-2,3,2,-15,-9,-27,11,19,-2,1,6,-3,-26,-12,1,5,-18,-4,-7,1,-15,-14,9,6,-10,4,-7,-13,-12,-26,-6,14,2,6,-3,-10,-21,-19,-6,9,-6,-1,-26,-17,-17,12,-14,9,0,-5,-7,-15,8,7,-9,8,-7,30,-9,5,-15,-2,6,15,-3,12,2,7,4,6,10,11,7,29,-1,-6,-17,0,-11,3,-13,7,-9,1,-25,-10,21,23,2,-4,12,2,-11,-4,-1,-1,-15,-12,1,3,-11,-11,13,-1,-6,17,2,-4,-7,-8,-22,0,-12,-7,-9,-23,1,-14,6,32,12,11,-31,2,-24,13,-17,-25,-18,-4,-7,-21,3,1,-22,31,-12,-5,-21,22,-11,-10,18,4,7,3,7,6,9,-10,-2,-19,-22,-11,-10,13,2,-5,8,1,6,6,-8,3,-10,-3,-9,-9,-8,-11,-11,-2,1,4,-7,-8,-3,11,-1,15,-6,-4,11,0,0,6,0,3,1,-1,-12,-15,-7,-10,-4,7,3,-9,11,9,-9,9,4,7,-6,12,-1,-5,-16,-7,-1,-12,2,24,-4,11,11,1,-6,-2,-3,0,15,17,-19,21,-2,11,11,-9,16,-10,-10,2,-4,4,0,3,4,-6,-2,-6,5,4,9,-13,8,2,-7,9,-18,9,-7,-1,-2,1,-2,9,1,15,9,-4,11,6,6,-3,-10,5,6,-10,-21,-24,-6,-3,1,15,0,-6,2,-2,6,-5,-14,6,-9,-7,-17,-14,-2,-5,3,-21,3,-2,-10,-1,-15,-14,-5,0,-4,-13,-8,-5,-13,6,-7,-9,8,-15,14,-20,1,4,-20,-9,-18,21,17,1,6,-2,-8,-2,9,-22,8,-8,-14,-22,4,-15,-17,41,6,-24,4,-16,-6,-14,-3,-17,-21,-11,-9,27,17,0,-21,27,-12,-39,-4,-19,-6,-27,10,2,-7,-11,5,6,2,-1,-11,-9,6,-8,3,-5,-11,-11,14,0,-14,7,-14,-11,8,-8,3,-12,1,-6,4,8,-13,-13,11,-14,9,-1,-14,4,-10,-2,-19,-5,8,10,-8,15,-11,2,6,-17,13,-7,7,-14,-12,-11,-15,6,8,-2,-3,-9,9,8,7,-13,10,-5,25,4,1,-8,34,0,-9,-6,4,-23,-2,-27,-22,-4,-3,-22,-6,17,1,-6,-23,-13,4,-13,3,-17,2,-17,-5,11,-8,0,-9,-4,-5,-14,-2,-20,8,-21,8,6,1,-9,3,-18,-12,1,4,-9,-11,2,-23,-20,24,6,2,8,-4,1,-11,5,-7,-18,-8,-1,-4,10,-7,-25,-6,15,5,16,5,21,3,-10,1,-21,33,-3,5,10,25,-6,-10,9,22,-4,-5,-1,-13,10,20,6,-16,10,17,-2,2,-2,12,-15,-15,4,0,-9,-1,3,19,20,-8,-5,15,-6,17,-16,8,-2,-14,11,4,-6,3,-5,-1,13,-4,14,9,-10,3,-8,-5,-4,-7,2,13,-15,6,-3,-10,5,2,0,9,0,-13,4,5,-4,-20,8,10,9,-12,-15,-6,-12,-15,-13,-10,6,-1,4,-1,-1,-13,1,-1,-4,-14,5,2,4,23,2,-8,16,4,9,-9,-11,-9,2,8,-7,-6,-2,12,15,26,-2,-7,-1,-1,-10,10,-9,17,5,6,-10,-6,-6,15,20,30,20,-20,-3,8,-4,-11,-11,-5,-8,11,-4,-8,3,-5,1,11,-9,-5,-8,-7,12,-3,-5,-3,9,-11,-18,-3,-10,0,-13,15,-7,-16,-18,-20,-6,-2,13,-4,-3,-14,-20,15,-19,1,-20,-1,-13,-5,6,-11,-19,-23,15,18,-11,0,33,-3,-7,6,7,8,-5,7,-19,0,-12,-18,-2,-5,-8,0,9,2,-8,-7,5,-19,-15,-6,7,-3,19,-1,-14,-18,1,-7,-3,9,-2,-15,-10,-27,6,-5,19,22,23,8,-12,-9,-8,-4,-16,-10,-6,21,-6,-14,4,-8,4,28,1,2,-13,-11,-21,-16,-16,-22,-21,-6,-18,-19,-4,-1,-12,-5,-14,-11,18,7,-27,-16,0,-9,-2,-5,20,-10,-13,-7,2,-2,-17,8,-7,-12,-17,-6,-3,-5,-5,-10,7,-12,-14,-21,22,-29,28,28,0,0,-5,-6,-15,1,19,-4,-5,-21,7,-13,6,0,8,12,10,-4,-14,-8,-14,-16,-10,23,0,-11,1,-31,-9,-8,-1,-3,-4,-25,-12,7,-12,-15,-16,-12,19,-10,-15,-24,20,-2,-8,-21,-15,-4,6,-22,-18,-2,-10,9,16,16,-20,5,40,6,3,17,19,-12,5,-1,-7,-30,-35,5,-10,-7,-2,-1,1,-1,-28,14,19,-14,3,28,2,-14,-23,0,14,-9,-14,18,-8,-8,-18,-16,0,-5,12,8,-8,8,-8,0,33,3,-5,-14,-8,-32,-14,0,-13,-14,3,1,4,-2,-7,16,11,3,-2,-6,5,7,27,-8,-16,-17,-4,-3,12,-5,12,4,-20,5,8,6,0,-18,6,24,15,7,-2,-11,-13,1,2,13,-27,3,40,1,-14,-10,-19,-3,19,30,1,4,17,1,-12,3,2,-1,14,-6,4,-11,-9,0,-17,10,-8,19,10,3,2,-5,-7,4,1,11,0,-9,18,15,-6,15,-6,14,8,13,2,13,11,5,2,4,-5,-2,4,7,-2,-5,10,6,12,4,-8,-16,-10,-6,-3,15,-6,0,-16,-5,3,5,16,2,-10,21,-4,-2,14,-4,10,8,-8,0,-5,-19,9,11,-4,-4,3,16,-8,-6,18,-1,6,-5,0,11,-5,-4,-9,-5,2,10,-8,5,-18,-14,13,-5,24,0,2,5,-9,-1,-23,-7,-3,-12,-10,-10,4,8,12,12,7,-14,11,-20,-10,1,8,-11,1,19,4,-17,1,2,-11,-3,-11,-19,-12,-34,2,-1,-15,-9,-12,8,-2,0,-14,6,2,-6,-22,-24,15,-6,1,-2,-26,-2,-12,-8,20,-9,12,-4,-16,1,-13,-8,4,-5,4,17,-14,1,-7,-20,14,16,7,5,-17,8,14,2,-12,0,-2,-13,-14,0,-3,-4,1,18,12,-5,4,8,4,-4,22,0,-3,3,1,-8,3,-7,0,-5,8,-2,-12,-2,-14,-14,-6,-7,18,-1,-9,-1,-25,-13,-13,14,5,-6,-19,11,-4,-10,25,12,14,-7,2,18,-11,8,0,4,6,1,-11,8,0,1,0,18,-8,18,-11,-3,-2,-4,-9,12,-3,15,2,-2,5,1,-19,4,-15,-11,-14,5,-2,-6,-7,9,9,2,0,-5,1,-8,0,10,5,-5,19,-8,-7,10,-11,-8,6,15,5,-8,-19,8,-5,12,0,2,-4,-6,-6,6,-1,-8,-5,1,-8,-1,-4,4,36,-18,0,6,7,-9,-4,20,2,1,-13,14,4,0,1,-8,24,-4,5,5,-10,-23,4,17,-11,-1,0,1,-6,0,9,-6,-11,-15,-8,-9,2,13,-4,-8,-10,19,9,-13,10,1,-18,-12,5,10,-10,4,3,-6,-20,-8,-5,-2,-5,-5,12,8,5,-1,-5,5,-16,-1,-9,5,-13,-15,-8,-5,-16,-8,-19,-7,-12,-11,-5,20,3,2,14,-9,-2,-13,-10,-22,-9,-3,1,-8,3,14,-20,5,-9,-1,-27,-13,-10,-1,-21,12,-6,-9,1,-5,1,2,2,4,-15,1,-12,-9,-14,-2,-16,20,-21,-1,-6,21,-2,-11,20,7,-7,10,-9,7,4,17,10,1,5,9,4,3,15,-18,-13,-5,6,-1,-3,-1,-8,3,-13,-1,1,8,-13,3,-9,-13,-12,4,-22,5,28,17,22,0,-28,-22,-25,19,5,4,4,8,-20,9,-25,-12,-5,10,-1,-18,-38,-11,-13,17,-3,0,-5,28,-4,23,-9,13,8,11,-4,1,-32,-14,-21,8,-12,6,-8,40,-13,-8,-15,-17,-2,-3,-8,-3,-15,4,-10,14,-13,7,-17,-19,-19,-14,-16,-5,21,26,8,-11,1,-1,-9,-6,-20,-7,-22,-16,10,11,13,-15,-8,29,1,-22,2,-2,-21,-3,-18,0,1,-4,4,1,7,-2,-9,-28,5,-12,-28,-10,11,0,5,-16,1,-1,-7,-10,1,10,2,15,2,4,0,-12,-4,15,-17,4,-21,7,-10,11,12,-19,0,-8,-21,18,-15,6,-15,2,2,-2,6,8,-2,-5,19,-12,-8,-2,-9,-4,-8,-11,-10,-4,7,3,20,11,11,10,21,-6,-6,-10,-4,-5,-9,-3,-23,-10,2,4,-6,-7,5,1,-5,5,-9,-14,-2,-5,-21,3,5,-27,40,-10,17,5,-12,-23,-7,4,-24,-9,3,15,5,9,12,19,24,-6,16,6,13,5,0,-4,-6,-34,-19,6,-3,-7,-19,-11,15,10,13,1,17,-15,10,1,0,-6,-9,1,-10,-1,10,-19,-5,34,20,-20,51,-4,9,8,-4,-19,-32,-9,-10,-13,10,-6,-16,15,-12,-17,24,21,-13,9,-7,-20,-22,8,-2,-29,-9,6,9,-10,-1,-14,2,6,-27,-9,-20,-3,31,2,33,6,-7,-17,-1,3,3,11,8,16,-15,1,-16,8,-10,-3,1,1,3,-3,1,-6,1,5,-6,8,-16,-4,2,17,-7,16,-11,-2,-11,-6,-13,0,-14,-12,-16,-12,9,-7,-14,-10,-5,13,-11,10,10,-11,-11,7,-12,-11,-18,-21,-1,-7,3,-4,-35,1,-10,9,14,12,-2,-15,-9,-9,-2,-20,-7,-8,11,14,-17,9,10,4,-17,5,-13,10,-12,-2,-13,-8,0,-1,-28,8,-6,-13,-8,-1,3,3,9,13,-5,-6,-9,11,-2,12,-9,0,25,3,-12,14,-5,8,-14,-6,-7,-17,-11,-24,0,9,-5,-10,5,-5,-27,32,1,-2,-7,2,9,-12,-1,-11,6,1,40,-8,-8,-7,-16,-6,-13,-5,-1,-25,5,1,14,-21,3,-17,52,-17,-5,13,-19,-10,-17,-10,-5,-19,-8,-3,15,-3,-18,-9,-6,-24,17,8,-2,-6,-38,-9,-8,-5,11,-18,14,-7,-33,-25,20,8,5,23,5,0,-3,8,-10,18,4,4,-1,4,5,-12,7,2,2,15,4,3,-8,12,-5,-7,-11,-4,11,-8,-7,14,4,-4,-6,-7,11,-19,-21,13,-1,3,16,10,-3,-13,-7,4,-9,-5,13,9,-16,12,-7,-7,-13,-22,-9,-9,4,-27,-16,-11,-4,-2,-31,-29,-6,-8,-15,-10,-10,-2,5,-7,0,-20,-24,-13,1,5,2,-23,-9,-1,9,8,-3,14,0,-3,2,-7,23,-11,1,-2,17,-5,-2,16,10,-9,-12,13,-11,-17,-12,-16,5,9,4,-14,-17,-4,5,-7,-2,12,-3,13,3,0,-9,-2,-5,22,-3,-21,-7,5,6,-1,-16,5,-19,-5,-7,-2,20,-12,2,1,-7,6,1,-22,7,17,4,-18,0,5,10,-16,2,-15,1,-14,1,30,2,0,5,12,-2,-6,-6,15,9,-7,1,-3,15,-8,1,4,10,10,-6,2,0,-10,-8,7,1,-32,-13,-13,4,-16,-8,-4,1,-12,-12,-15,-13,4,-6,1,-1,-21,-9,-14,-15,-2,11,-20,5,-3,-3,-13,-1,-7,-8,-13,-6,8,4,1,7,3,-15,16,16,6,0,16,-9,-17,5,7,5,8,-5,-3,-6,-13,-11,13,7,-2,12,3,1,11,-1,9,-13,9,2,8,20,0,3,-2,3,13,-2,10,-15,3,-12,2,4,-25,-14,-1,14,-7,-5,9,-6,-9,-10,1,23,21,-9,-4,-13,-11,-3,-2,-4,-12,4,-5,-4,-2,-1,-13,11,-18,-19,4,-23,-7,-4,21,8,-13,-4,9,13,1,-4,-3,0,-16,6,-4,-19,7,7,0,0,-12,-10,-6,-7,-7,-2,-13,3,-1,3,-2,-15,5,10,0,10,8,9,-6,5,5,1,13,-11,8,-1,14,-3,-6,-1,-3,-4,0,-1,0,-4,-4,-3,6,11,-9,18,14,-24,-16,13,2,-12,-1,14,-12,8,9,-2,-4,12,-7,7,32,-3,-24,-18,-5,-9,-29,6,4,0,-14,-5,-22,24,21,32,6,-13,-4,-11,3,-18,8,1,-2,2,-5,23,-4,-7,27,-15,-17,-23,2,-22,-5,-13,30,12,-5,4,0,6,11,-7,-14,-13,2,-11,-19,-2,-9,5,19,-5,-7,-8,-2,-13,-11,-6,-8,4,-5,10,-2,15,-12,10,-9,-11,-4,-9,-3,4,-23,-9,4,3,-11,-21,-7,16,4,-4,11,-8,-13,8,-2,4,-14,28,-17,-14,3,-13,-10,0,32,10,1,2,-13,-6,-11,11,17,13,-12,-8,-9,23,25,23,-9,20,-10,17,4,-2,0,-22,-5,12,-9,5,-8,4,10,15,-12,13,-15,-5,5,21,-5,-10,-2,18,6,2,0,14,-3,10,-15,-9,-1,-11,-6,0,13,8,1,-15,23,-11,-2,-17,-16,18,27,5,0,-16,13,2,-5,-14,-18,-25,11,-23,6,13,-19,-16,7,-25,-15,19,-5,13,-8,15,21,-1,-23,-7,7,5,-7,-11,1,-11,-22,6,-3,-1,10,-7,-8,-12,-5,2,8,-7,-9,0,2,10,-11,-7,26,7,17,-1,-26,1,2,7,4,15,7,0,-6,9,9,-14,-3,-11,-11,-19,-8,24,16,-4,0,-6,11,-8,-17,-13,-14,-4,-3,-23,-9,-33,0,-12,1,-3,0,7,-3,2,-25,-12,6,17,2,-6,-5,1,23,-20,0,-10,-11,5,-24,-8,3,-29,4,-20,1,-2,-1,2,8,-12,3,-1,5,-14,-11,-14,8,-12,16,-5,-15,14,-10,20,2,-12,7,-12,-14,-12,-14,7,-8,6,6,-1,-4,8,-6,-7,-10,-3,10,-15,10,-4,26,-24,-4,-10,-22,-18,9,-10,3,-24,2,-13,5,-1,-4,-5,8,11,-3,-2,-8,14,-5,0,-6,-5,9,-23,1,-15,4,-16,-12,-7,2,10,-16,7,-9,-5,-13,-10,6,0,9,2,9,-4,4,-20,16,4,-11,19,-12,2,-1,-2,25,2,9,11,10,22,-20,-16,2,-3,-9,-6,-5,4,16,-7,4,8}
+
+#define CONV3_BIAS {18,36,-46,-45,64,8,13,-19,28,1,14,-57,23,20,-2,32,48,-11,85,73,-7,52,125,33,125,13,92,-72,89,-1,11,70}
+
+#define IP1_WT {38,-13,5,-20,15,-4,-3,13,36,-19,10,14,-18,-17,-11,15,25,-18,-16,-9,-9,-8,-4,21,-11,10,4,6,7,9,-14,-9,17,-6,1,8,18,5,14,12,0,2,9,-8,-3,15,-7,2,-18,-6,-26,20,-6,9,10,-2,12,11,-10,-7,18,-12,-12,41,18,6,11,-3,-1,2,11,24,1,22,9,6,4,33,-18,-12,-26,-22,7,3,-2,-8,-3,-8,19,-16,28,-18,1,2,16,-26,-30,-18,-12,-30,9,9,-7,13,7,12,-11,16,-2,-1,14,-10,15,11,4,-15,-5,-12,7,22,-15,-12,-7,1,22,1,-12,29,3,1,-3,-12,-5,-7,-1,22,20,-6,-20,25,3,5,1,2,-17,-2,-2,4,4,0,15,-19,6,-7,-20,11,-18,-26,14,1,0,-9,4,-8,11,-16,-1,15,-16,-7,17,-13,-10,1,2,8,14,-2,-16,32,-10,-10,12,-16,9,-11,-10,-15,3,9,-30,58,4,-20,6,-10,-6,11,0,-1,13,14,-14,0,-3,-12,-4,22,2,2,14,1,10,1,-12,-8,17,-12,-1,-10,-9,-9,-2,11,-28,-25,-6,-5,4,10,-3,11,6,16,13,-3,-6,5,35,-12,-1,5,26,-3,-29,2,-4,-6,-15,-6,-11,8,-8,-12,1,-13,-2,7,12,-16,7,-5,11,20,-6,-2,42,21,36,-4,-12,1,-4,25,6,0,-12,5,28,-5,1,5,-15,1,-2,1,-5,-4,-10,41,-14,9,-11,-23,0,33,0,-9,13,-9,-3,6,1,4,1,3,-9,4,2,-16,-2,21,-32,-9,6,-12,-2,3,-7,29,-8,-27,10,-3,-3,10,-20,-3,12,0,-4,-4,-11,7,-6,8,2,-11,6,7,-6,-13,-2,-4,17,-11,5,0,11,1,4,-6,-17,-9,-25,7,4,4,21,17,14,-5,4,-12,-17,5,46,-7,17,-10,22,0,-9,-13,-13,12,-4,1,-4,-17,-2,-7,-14,-14,-14,7,9,-11,-2,3,6,15,4,6,3,8,20,10,16,-4,-20,-10,25,7,-4,5,3,1,-2,-2,8,2,9,-2,-6,2,-5,-11,4,1,24,-10,-11,11,-17,-4,-5,-14,11,2,2,0,10,13,-31,9,-14,13,19,21,-29,-10,4,9,-33,3,9,0,-13,-16,-1,-3,-4,20,3,-2,12,-6,-38,-16,-17,-8,10,12,18,-10,-8,16,25,8,-4,-1,4,-12,-6,-8,-2,11,-10,2,-16,-11,40,-17,11,5,4,-21,-1,25,33,-5,12,-7,1,6,2,-4,5,-17,17,12,-18,3,6,-8,-9,33,-5,10,-22,-3,16,-3,-22,29,45,-5,-5,-19,11,13,-10,-12,-15,8,1,16,14,-3,-4,7,-9,-5,-6,1,2,5,-15,-32,-11,-9,8,-8,9,3,-18,-4,23,12,4,-14,36,3,-4,41,14,-3,-2,2,-3,12,4,-4,8,-15,-2,7,8,8,3,2,4,11,-1,10,-26,12,-7,11,-16,-1,28,-25,19,8,-5,-8,-3,-8,-4,16,-8,-14,-9,3,16,8,8,8,4,-12,3,-6,-12,3,22,-7,-21,-11,-14,-4,-1,7,-10,-3,6,13,-14,61,-10,-14,-15,-3,13,5,-30,-20,-15,-25,5,10,-9,12,8,-1,-7,5,-17,-1,-13,0,11,-6,60,5,0,-20,6,21,-22,6,-12,-9,1,-3,1,-19,-21,-7,-4,7,-15,-6,-9,11,-5,-5,12,-12,6,8,15,2,-6,-23,-23,0,-4,6,-8,-9,-21,31,-5,3,-5,1,5,-1,-7,-6,30,-14,-3,11,11,5,-4,0,15,-7,-8,-3,-4,2,-15,-4,-20,-7,13,-10,-15,-19,-25,-16,76,-15,17,26,2,7,-16,-17,-14,-1,-9,21,-4,-5,2,42,-6,2,-8,-6,6,6,-5,3,-10,-2,6,14,-12,0,4,22,26,-5,-12,-1,-4,28,19,2,11,-22,27,-20,-6,15,-9,1,-10,-9,4,-6,-5,4,0,2,5,-11,-9,-11,-6,16,22,-4,-3,2,1,-7,11,6,6,8,-13,-9,-16,7,1,7,-16,-15,-2,-3,11,3,-12,6,-17,-15,18,-5,11,8,-10,-8,22,5,-6,-4,-17,-10,-2,12,-16,2,-7,-5,12,4,-18,1,9,-14,-5,1,-4,13,-6,-2,11,-8,4,1,-8,-8,0,1,6,-4,23,4,-3,-7,1,20,-11,-8,1,-14,3,12,10,-6,-5,44,11,5,-2,-3,24,3,6,2,1,0,-13,7,2,13,-9,7,9,2,-14,2,27,-14,1,-10,3,1,23,-10,6,4,-5,12,11,-7,-3,-7,1,-11,-9,12,-21,7,-10,-9,-14,17,-2,-2,-4,2,24,-8,-12,-4,4,14,5,21,12,-7,0,-3,3,-12,0,2,5,-5,11,-4,6,-2,33,-10,-2,15,-6,-10,12,-10,5,6,7,-2,11,3,1,16,2,-11,-6,11,-10,-5,-3,6,-7,-27,5,1,-1,-2,-2,3,-18,0,4,-10,4,1,-3,-10,5,-7,13,-16,-15,22,2,8,-2,16,23,-11,19,-3,-12,-11,9,-11,-24,8,-17,-5,-3,-22,6,2,8,12,-17,10,4,-1,-6,0,-17,5,5,-1,-22,1,1,3,31,-7,6,-3,5,6,8,-4,-1,-11,11,-12,-10,31,-8,-2,-10,-9,28,2,-10,-8,-19,6,7,4,-16,17,-5,9,-17,-11,-3,13,15,-14,-5,10,-19,0,-2,7,-6,1,2,13,3,4,-14,-1,-14,-7,-11,46,18,2,6,-10,-18,55,14,-4,-1,3,7,-12,23,19,8,0,-12,9,-4,-11,-10,-21,7,9,4,-16,-2,-6,18,-14,-15,-9,-11,-10,-8,3,4,-21,3,-9,9,13,0,19,8,-12,-8,-30,1,6,6,9,-23,-1,3,-25,39,-27,11,-19,8,-12,7,17,0,10,-7,-5,-2,19,-3,-12,-20,9,4,18,4,4,10,-6,10,-20,1,6,-3,6,4,5,-7,-3,-6,-8,5,16,14,12,-20,2,-6,9,-14,-2,-26,2,12,12,-15,-16,14,6,3,-8,-5,-1,-4,0,-12,1,6,6,1,8,-4,-9,-13,-5,-16,42,-11,7,-8,-21,-2,-5,-13,-15,-11,-7,5,-11,15,32,-3,-10,6,-10,11,-8,10,19,-17,-28,-1,-14,-8,-12,-5,2,-12,3,31,5,-19,33,3,10,-9,-1,4,-15,1,12,0,-8,-20,2,2,-13,1,-5,11,0,-17,1,-8,5,4,3,15,27,4,-2,5,-36,2,-2,-16,0,12,-5,-3,6,31,-12,-14,0,-8,-1,-14,8,29,-8,-2,5,-3,1,-12,-4,10,12,-4,-8,19,-1,-15,-1,9,14,2,-16,-2,15,3,10,-11,-5,-2,2,9,2,-12,4,11,0,3,2,-10,-11,15,1,0,-23,3,-3,-23,0,-6,-12,-1,11,-12,-13,2,-4,-5,-3,17,0,0,0,9,-3,4,5,-9,-11,4,-5,7,-9,-1,-9,11,-7,-16,-3,-9,-7,-11,8,-2,-6,3,11,3,-4,-7,-14,1,12,-12,-1,-9,15,19,8,5,2,26,7,-19,-7,-17,-4,-7,-1,33,-15,1,-8,1,-4,-6,-5,-3,-14,10,-1,-2,12,6,8,-7,26,-6,-12,6,0,31,-9,19,-11,-2,14,-6,16,-1,1,5,6,-4,-13,4,11,-6,2,7,-6,-15,2,-3,7,-3,-13,2,-5,-6,6,4,-4,11,-2,32,-3,12,0,20,-1,-8,-1,-10,5,18,-37,-13,-5,16,-3,11,6,10,-3,-2,9,1,-17,1,-13,4,4,5,-6,2,1,11,-18,-7,-11,11,-2,-9,-3,7,0,-16,6,3,3,9,-9,1,26,-9,3,-6,-12,3,1,17,1,-5,4,2,3,-22,6,-18,-18,-17,-15,27,6,9,5,9,12,-9,-2,-13,-3,-12,10,0,-3,-8,-3,8,7,16,1,-9,38,14,-9,-8,0,-8,6,-11,-19,-3,-9,-18,13,5,-5,13,-4,-2,-8,-15,5,42,-14,-4,0,0,-3,12,-2,-5,5,18,-39,13,-6,12,7,13,6,8,0,7,2,-49,23,-15,-11,23,3,1,11,-1,-3,7,-1,-26,24,7,-2,-11,3,15,2,14,3,-5,21,-19,-17,4,-4,-16,-18,9,-11,25,1,5,-7,7,-2,-15,0,-5,6,10,-2,5,7,-12,-8,-4,10,16,0,-32,7,-7,-26,-24,20,-5,4,-6,2,-5,-4,-3,6,13,-17,10,14,-3,-11,-9,15,4,1,-11,-8,-10,0,20,18,19,-5,22,-11,23,-2,-8,5,9,11,2,-11,31,-14,0,-14,1,11,-6,28,30,-7,-7,9,14,-6,8,1,-5,22,17,-19,-18,-13,5,18,3,11,-1,3,3,-7,4,-11,-3,-9,-30,20,-18,14,8,11,-1,23,-13,5,-11,0,-8,-4,-5,-9,10,2,2,0,17,14,-21,25,-18,-2,-8,0,1,0,-1,-29,17,-4,-18,18,-10,7,9,3,-3,0,-5,23,18,8,9,9,17,-14,4,10,-10,3,-13,-4,-3,6,-9,21,-25,2,6,-2,-26,23,-9,1,20,8,9,-12,-2,3,-6,-2,-10,-6,5,-3,23,-7,14,-7,-14,3,-7,-4,-16,-6,13,-3,-5,-5,30,-6,7,7,6,5,-23,12,8,-18,-14,-6,23,-14,-5,4,-1,19,18,1,-6,6,-14,-2,9,21,5,5,-1,2,2,-8,4,-5,2,-8,-14,-1,-3,15,13,6,-10,0,3,1,5,-12,12,-6,10,-7,16,3,22,10,-6,25,-19,-2,-11,-15,8,-14,6,-9,10,-3,8,5,-2,-13,-10,-4,-5,0,-1,-5,-1,-16,-3,3,6,4,22,9,17,-2,-6,-16,25,1,4,4,23,-10,0,-2,-26,10,2,25,22,-1,0,1,-16,-5,-7,-10,-22,-14,10,-7,15,31,15,-17,-7,36,12,-22,-3,-7,10,8,-2,2,7,-4,8,2,-11,6,17,-15,2,-22,-33,-19,19,-23,-8,1,9,-10,10,2,4,5,-6,17,6,0,11,-8,-10,-18,-6,-2,21,3,-11,-1,4,6,2,17,15,-13,-1,7,27,-11,-1,-2,2,-9,2,5,3,2,-3,-3,3,18,12,-1,11,-7,24,-7,-5,-9,16,3,6,-5,8,7,1,-12,-2,15,9,-18,1,-1,-12,0,-5,5,9,-17,31,-4,-8,-10,-12,9,-2,9,4,-18,12,-11,-6,5,0,3,16,2,1,10,-16,-4,9,-6,-3,-2,9,-10,-16,12,5,9,-6,31,11,-2,-16,-12,10,3,1,-9,-8,10,4,-7,-6,-15,6,-10,4,17,-24,17,2,-33,9,-23,-1,-1,32,-5,16,-8,3,12,21,-1,-24,-1,-24,27,-3,-8,18,-1,-10,-7,45,-39,-18,2,-4,-6,5,5,-19,34,15,36,11,8,-7,-14,-11,0,0,-11,2,6,-18,-25,-17,1,-3,-15,-31,37,-9,-14,10,-24,1,-16,1,13,-21,-11,-5,15,5,6,27,4,27,24,-5,-19,37,-19,-18,-10,-17,-17,-16,-1,18,14,11,12,6,35,-17,35,-14,-12,-5,7,1,-8,-9,10,-8,-14,1,23,1,-24,-7,-4,-20,49,18,15,8,-12,-30,10,0,-7,-8,10,-3,-2,6,-25,-22,-10,-10,0,1,9,-18,-7,6,-11,11,-6,7,-12,1,-16,-15,-10,22,-5,86,-12,16,-41,-11,-1,-1,12,26,11,-7,12,-20,42,6,8,16,6,-27,4,2,-24,-20,7,43,-15,-5,-2,-8,3,-15,12,-16,-4,5,-5,1,2,9,3,-17,19,4,-11,-1,8,0,-13,6,-10,15,12,-28,-14,55,-4,13,16,-42,-5,46,-10,-8,-3,-14,-13,-15,0,26,-7,-8,25,34,-12,-34,22,-10,-9,-17,-5,11,-11,-14,-6,-6,-9,0,14,7,-14,-19,7,-8,37,-13,-7,-7,17,-16,19,23,-14,-13,-7,-18,9,18,-20,-25,1,-22,-26,14,-28,16,-17,-6,4,6,-26,-6,-5,-6,-17,-5,-5,19,13,22,-16,-14,4,-8,-15,3,2,8,7,-3,-6,-2,17,9,5,1,-12,-14,0,-14,-9,11,15,39,-5,-25,2,-18,-5,4,-1,23,18,-9,-7,2,22,-10,10,-13,37,14,-11,-9,-6,-5,14,-6,-9,-8,0,-8,6,28,-7,7,25,-13,-6,25,-3,7,-11,-14,-8,-7,-14,15,10,6,11,33,-2,-23,-16,10,14,-3,8,-10,0,-14,17,7,2,24,-6,-9,23,-1,-19,-7,14,-14,-4,-2,6,2,5,6,-1,-11,9,5,-7,9,-7,-6,14,-14,-6,3,9,0,-1,-4,2,14,-22,-11,18,8,-36,2,-1,9,-14,-4,2,-8,4,-1,5,-8,29,-16,10,-1,1,-8,35,1,10,4,-17,-13,15,-4,0,0,22,-31,4,18,-7,0,-2,-11,-15,29,10,3,7,3,5,-16,26,-6,33,-8,8,2,-7,9,12,11,10,11,10,2,-4,-17,9,-16,-29,-7,-1,-19,29,-11,0,3,-3,1,26,23,-3,7,-4,19,-5,-29,20,21,7,-4,3,-23,-5,-11,3,1,-27,-5,20,-12,22,-5,7,14,-6,-15,-16,5,-19,-7,11,-2,-16,-7,-12,-12,-21,-9,3,-9,1,-14,11,-11,-9,6,-27,-13,-7,-6,4,49,-7,4,9,-13,-12,56,4,11,14,-15,12,19,-4,-25,11,-17,-22,15,4,-6,-8,2,-5,0,-11,1,-23,-6,3,21,-12,-1,-7,-22,-23,-1,19,1,-7,-10,-20,5,-8,5,-21,-1,-2,13,-6,-11,5,-9,-9,-13,18,-3,2,5,-12,-6,30,-13,-4,-18,20,0,8,-8,8,4,0,-7,6,5,6,29,9,42,-2,-12,7,14,-10,-13,1,-1,-17,8,-19,4,-17,-13,-20,8,25,-1,-17,19,10,5,10,-6,3,-12,14,30,-6,-11,-8,-4,-17,-12,-9,-12,41,-16,-21,-2,-15,22,2,9,2,-5,5,2,22,-8,-13,-19,56,-16,6,33,-4,35,-3,10,-15,13,-11,-5,11,0,1,-5,3,-6,21,-12,-6,-16,7,15,-6,-5,-15,1,-8,1,-3,-17,-29,-2,1,-12,9,-8,-8,-8,30,10,18,17,-34,10,5,2,17,-14,-6,-3,0,-22,6,20,-32,-5,-21,-16,8,-8,1,-2,16,-3,-6,-20,-13,5,3,10,13,-9,-13,11,0,4,5,-10,-9,5,-19,10,-4,-11,-3,-12,-4,-11,-7,-21,-30,-10,6,14,8,-2,-1,16,1,19,-15,-14,4,-1,-4,4,-7,12,-13,-12,29,-1,3,0,2,4,-11,-3,-3,-22,4,-6,3,6,9,-25,16,-4,-14,36,3,5,2,-15,-4,1,-11,13,-4,-3,-4,0,-4,-3,44,14,-2,-7,-7,13,-1,-6,-9,2,0,28,-13,-8,-6,1,-5,-6,-9,6,6,10,-3,6,-2,12,-15,5,0,0,17,-2,-18,4,6,2,3,-7,-2,-10,-19,-4,-7,-3,-4,3,9,-17,10,-4,-10,-12,-2,-10,7,-22,-7,-7,1,7,-2,6,-8,-8,7,18,9,-5,3,-6,-4,-10,4,-14,-11,-7,9,13,1,-5,-15,27,7,-4,-13,0,-9,11,8,11,0,-1,0,-4,9,5,9,-15,5,-10,27,-8,6,11,15,-11,13,-25,-15,-16,-9,9,-4,14,-7,-6,-2,-2,3,2,-21,-8,-1,-22,-5,-27,-6,4,9,7,13,0,-7,-2,-8,-12,5,6,-5,-8,-7,-2,-5,8,7,-16,1,16,1,4,0,-6,11,7,-14,-6,15,-4,17,-5,-1,5,24,5,5,-14,11,-8,-6,-18,-3,-14,-13,3,-4,-10,8,-7,19,-2,4,6,36,22,8,18,-20,-8,10,-7,1,-12,8,-5,4,-8,7,-2,-3,-7,-6,-16,0,-9,11,10,3,-1,-9,-6,0,-3,-8,-11,2,-13,19,-1,-22,4,0,-6,0,-10,-12,12,7,-12,-5,-17,3,-12,-4,12,-5,27,19,-11,10,-6,13,7,1,-1,27,-8,12,7,6,-2,-6,6,-7,-16,24,-10,-2,-12,-11,13,-10,-5,-19,-27,-14,-11,1,-1,-15,-2,18,-19,-10,3,3,-3,19,9,-11,-8,-5,24,-3,2,20,-18,-24,-5,11,4,14,14,2,-9,8,0,-9,-12,-2,12,-2,17,-19,-17,4,16,17,-10,5,-5,14,15,5,20,-10,3,-8,24,-16,-4,-1,4,-20,-10,-20,-5,-2,5,2,5,6,-22,-18,3,-8,-7,5,5,-3,12,0,18,-8,12,-2,13,-4,2,3,-4,11,-15,-13,-14,11,0,-4,-10,-10,-22,17,25,20,-4,-10,-5,34,-2,29,3,4,-12,-8,-11,9,-10,13,-16,-5,3,-2,6,8,-4,-5,-10,1,-2,-6,15,3,14,-7,-5,6,-5,-2,13,7,-9,-13,-10,14,-7,-4,6,-2,8,-15,29,15,-14,15,-8,48,-2,-13,-3,-8,-6,-20,12,-3,6,-9,-16,-4,-8,-8,31,-6,11,-3,0,-10,-10,-6,17,6,-12,1,18,-10,1,9,2,3,2,-27,17,-5,-8,-6,-7,-2,6,-15,-12,-12,13,-12,-10,1,-6,10,13,7,0,-13,8,-18,0,25,7,-7,4,-8,-5,1,9,-3,-2,-12,-3,-4,-22,-7,4,6,18,1,1,-11,-7,7,5,-3,15,-6,5,10,-22,11,-9,-16,10,-6,6,2,21,-2,-1,2,12,-10,-2,-4,5,3,20,3,10,-9,-7,-4,-10,12,-14,-6,2,1,-8,7,5,0,-9,-4,46,-14,4,12,-13,3,2,0,8,4,31,-13,6,-7,1,-14,-2,8,4,-12,9,-13,-7,5,4,1,10,-7,-14,-6,-7,10,15,1,-4,-4,11,-2,-3,-13,15,-4,-11,-8,4,2,7,10,-1,-7,37,-15,-3,-7,-16,2,13,11,-3,-23,3,-19,-5,-2,-21,9,-8,1,0,8,10,-4,0,-4,-17,11,6,9,-22,12,-11,11,-1,14,10,22,2,1,6,-11,-7,-4,-4,5,-5,-13,9,-9,-5,-7,-9,-17,1,9,-2,3,0,7,-1,16,-5,14,-4,3,-6,-12,8,16,1,7,-14,1,0,-4,9,-14,1,1,7,13,-3,11,7,9,5,-12,5,-8,0,-2,18,-15,-9,-5,-13,12,-7,-5,2,13,-11,3,2,19,21,-16,-8,12,13,7,4,2,-18,-4,-5,-6,-16,-12,-16,0,4,9,4,-10,9,0,-7,10,-8,6,-5,6,-20,-14,-6,-5,3,-23,-10,-4,18,3,-10,6,-7,15,-8,7,0,-22,3,-21,-2,5,-11,33,2,-6,6,-13,-7,14,1,10,0,13,-20,-16,-15,21,-16,16,5,40,5,1,15,-10,8,17,2,-21,-24,3,-9,5,3,-4,-6,-7,-13,-4,0,2,-23,28,-4,0,6,0,3,11,1,-13,-6,-8,16,-3,36,2,8,6,-12,20,6,8,9,-7,-5,6,3,-1,16,-14,-10,-1,40,-16,-5,12,26,-1,5,-3,23,21,-14,-5,0,2,-3,15,6,-13,5,-14,-4,-1,-16,-13,-16,20,-1,23,0,-38,-2,-10,17,16,-9,-24,-9,7,17,9,-8,-10,11,0,-26,6,-5,-5,-8,-17,1,1,10,0,-11,-16,-18,-10,5,11,14,7,-22,5,-1,-1,-9,1,-5,18,-17,17,-15,-6,-7,31,-4,-4,-1,4,6,-2,-4,-11,1,-15,-3,-5,11,1,1,-16,-6,-1,7,4,-5,18,0,-9,2,17,-14,-15,-8,-25,-7,-9,13,14,5,12,-4,31,-6,2,-1,0,7,3,-18,18,-5,-7,14,-13,-11,-6,-10,-6,2,-16,-14,26,9,-3,28,-9,9,7,-2,-2,-15,-1,3,9,-6,-24,-2,2,-7,-1,3,3,21,-3,3,-7,-10,18,-1,-7,14,1,22,-10,-18,-5,26,7,-6,-6,-8,-6,-2,7,2,-18,10,9,9,3,-2,5,-11,16,27,-5,-5,-16,-12,-14,-3,7,2,-16,-17,-1,-10,7,0,-8,22,-9,7,-5,-1,5,4,13,-6,8,7,-10,-2,-5,-8,4,-1,-2,7,-16,0,-2,-12,3,-10,-3,-2,1,-8,-1,-13,5,5,9,16,-4,14,6,12,4,7,29,14,3,-4,-6,-3,22,-23,-5,15,-28,5,2,8,-12,5,-18,8,5,-3,-18,-8,4,-17,18,5,-6,-8,-12,0,-5,-2,-10,1,6,-17,6,10,-8,-6,17,2,-5,5,11,1,-5,-4,-26,-3,14,-10,-4,3,-11,-11,-16,-3,-1,-5,2,9,2,24,17,8,4,-4,38,16,14,6,-6,-11,-2,14,-10,11,-11,12,4,-8,-8,-26,-13,13,-8,-14,21,1,-8,10,11,22,-17,7,-4,-7,-10,0,-11,18,9,5,0,10,-17,0,-2,2,15,1,-2,-7,-2,2,17,-6,-14,-13,7,11,3,1,14,0,10,8,-11,20,-19,0,-4,16,8,-13,3,4,-1,4,0,29,-12,-24,26,4,11,-14,-12,-10,15,15,-5,1,6,-3,-8,3,-13,-15,7,-6,1,-3,-11,2,-18,14,-1,-17,-2,12,12,-6,12,-23,-13,9,-18,4,0,5,-3,-14,20,7,-6,-6,-11,10,-7,-6,6,-24,4,-18,26,-23,-2,18,-14,-13,-3,-10,28,-4,30,-22,9,-13,-17,11,-19,-9,-12,14,2,-6,25,27,0,1,-7,-17,-6,-16,4,1,13,-12,-6,-6,-11,2,-12,-22,17,-5,-15,-7,-4,5,15,35,0,-3,10,-14,13,-16,4,7,-5,12,-1,11,-2,-23,-9,-3,10,-8,-3,-5,9,12,-3,-12,-4,6,3,-26,-7,5,-24,-11,-3,-9,23,-9,-32,-16,3,30,0,16,7,5,-1,0,-15,-7,40,-20,-6,-9,-2,-13,-7,-2,-2,-4,0,-22,-4,12,-14,5,-1,-3,-6,-8,2,41,32,-10,-6,3,-10,-15,-16,10,-12,-7,-12,-5,-13,51,4,-9,5,-16,15,2,1,-10,40,-14,9,36,1,-16,-7,-3,6,5,25,23,-15,-6,-14,-11,-4,-2,-9,3,-2,17,-5,-4,0,10,17,18,-1,1,-17,3,8,-6,6,15,-1,2,-14,5,-10,7,-5,11,-1,0,-10,3,13,1,-10,16,7,-8,-6,-6,-13,-3,9,20,4,2,-13,-9,-22,-8,-2,-16,12,5,-16,7,-15,9,6,1,5,2,8,9,-7,15,-2,7,35,-11,0,-2,1,1,-9,17,-7,7,-7,-20,-8,11,-2,6,9,0,10,-4,-11,29,7,0,-2,-10,-15,-3,-1,-4,4,-15,-27,1,12,22,-10,-3,20,-17,19,7,-3,5,37,-4,9,18,-3,-7,-12,-3,15,2,0,16,-11,-12,7,8,10,-2,-15,20,-14,2,13,-1,6,4,13,17,15,6,8,-6,9,-7,6,16,4,-2,-3,-7,14,-14,2,-3,-10,-7,-14,4,8,-3,10,0,12,-4,4,5,-5,-10,17,42,-9,2,5,2,5,3,-3,-14,12,2,6,4,-41,-6,2,-3,9,3,0,-7,-8,7,1,-3,11,-3,0,-11,4,2,-3,23,-11,-3,-8,10,-21,26,12,-9,16,-8,16,2,-8,13,-13,-8,-16,-18,18,-19,8,-3,2,-6,-21,8,1,-13,-17,-14,-7,6,18,-21,-36,16,8,20,-13,-8,-16,8,-9,2,-3,2,-5,-5,-7,-3,-12,-4,0,3,-7,-5,-9,12,17,-16,-6,-18,-19,7,25,-12,-17,9,1,-14,-3,-14,6,4,-17,-5,-1,-13,-7,-4,15,-6,5,1,-5,1,-4,-2,1,3,9,-28,-9,-28,-10,28,-3,-12,-6,24,6,-5,-7,-6,1,5,1,38,-34,-3,-14,-15,14,-4,37,2,-8,-21,-21,-6,-13,1,6,-46,-12,-8,-5,5,10,3,-15,28,-10,4,-4,-12,2,1,5,-13,10,19,-12,2,7,-8,-3,0,-13,-33,-5,10,0,13,-2,-2,1,-21,-6,6,-19,-3,14,-21,9,7,6,23,-1,-23,-13,8,-2,16,1,0,-4,11,7,5,-6,-11,-10,0,1,20,6,12,-2,-1,-5,20,-14,5,44,4,1,-7,-7,10,1,-12,22,6,1,10,-6,-9,4,3,3,0,5,-1,0,0,13,-10,-1,-14,13,0,8,11,-14,-15,19,-11,-4,-3,-15,2,4,-9,37,11,-28,-4,-6,-4,-2,-17,-3,-2,-5,-28,19,15,-8,-30,-1,-13,4,6,-3,8,-9,18,6,-2,-24,-14,-13,-16,3,12,34,8,9,0,-3,-9,18,-13,0,-8,-11,19,-1,15,-6,37,7,-12,-1,11,-2,-12,3,-3,-29,2,-10,0,-7,36,-18,-10,-4,-9,0,3,-1,12,-8,-6,-14,13,18,24,12,-1,-4,40,3,-10,10,4,-3,21,14,13,-2,11,12,-5,-11,-6,11,-2,18,7,4,-3,-3,-2,10,6,6,6,-11,9,-14,-14,6,17,11,0,0,1,-1,8,-3,1,-3,5,0,14,-3,4,12,3,-4,-1,-19,-9,-6,3,-1,-1,-7,8,7,10,-14,10,-19,-7,3,10,-8,-2,-1,2,-7,-8,-7,2,26,2,17,-3,5,0,6,4,32,-3,-4,-3,-13,-11,-7,-10,12,4,11,33,0,-12,-10,11,5,16,-7,9,-4,-4,-16,-8,-8,14,2,9,-6,-5,-11,-6,14,-13,-6,-5,-8,-7,-3,-11,28,9,8,6,5,10,-11,-8,3,-23,-16,1,12,-14,0,-2,18,3,-10,5,25,9,-20,0,15,-3,1,4,1,-10,-7,-7,16,23,-7,21,0,-14,2,3,6,-14,-3,-15,-5,-15,-4,-8,14,5,4,-19,4,-6,-18,-8,15,-2,8,-4,3,-6,-5,-11,14,16,-10,-12,-3,-10,6,-1,-6,-9,12,-4,-9,-8,-1,-4,30,-4,24,-11,16,-2,7,6,27,5,-10,-2,0,17,-8,-14,-7,-10,3,27,29,-19,-11,-5,2,-10,-1,-14,2,-2,0,-9,1,-2,23,-23,15,-8,-12,21,4,14,-6,-1,-11,-1,10,-5,11,16,-3,9,35,-7,2,1,6,-4,-11,-7,-14,-3,-8,18,9,12,7,-12,34,-1,-27,9,11,7,-9,-8,9,6,-9,-4,7,-4,-10,10,10,-11,6,-7,4,-11,-6,0,-4,-1,10,-2,15,-13,13,-31,24,1,-13,-19,8,-2,-5,7,-4,10,-4,-10,-10,-10,-8,-1,-10,-8,-3,1,-9,-3,1,1,-11,-2,6,-10,34,-5,12,-15,34,-1,-20}
+
+#define IP1_BIAS {30,-121,-51,77,40,20,46,-35,28,-33}
+
+#define CONV1_BIAS_LSHIFT 6
+#define CONV1_OUT_RSHIFT 9
+#define CONV2_BIAS_LSHIFT 4
+#define CONV2_OUT_RSHIFT 9
+#define CONV3_BIAS_LSHIFT 1
+#define CONV3_OUT_RSHIFT 7
+#define IP1_BIAS_LSHIFT 1
+#define IP1_OUT_RSHIFT 8
+#define INPUT_MEAN_SHIFT {125,123,114}
+#define INPUT_RIGHT_SHIFT {8,8,8}
diff --git a/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/readme_iar.txt b/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/readme_iar.txt
new file mode 100644
index 0000000..8ca1d5c
--- /dev/null
+++ b/NN/Examples/IAR/iar_nn_examples/NN-example-cifar10/readme_iar.txt
@@ -0,0 +1,7 @@
+CMSIS NN Lib example arm_nnexample_cifar10 for
+ Cortex-M0, Cortex-M3, Cortex-M4 and Cortex-M7.
+
+The example is configured for IAR Embedded Workbench for ARM Simulator.
+
+When changing target, remember to change the ARM_MATH_CMx and __FPU_PRESENT
+Preprocessor defines for C/C++ Compiler
diff --git a/NN/Examples/IAR/iar_nn_examples/NN-example-gru/NN-example-gru.ewp b/NN/Examples/IAR/iar_nn_examples/NN-example-gru/NN-example-gru.ewp
new file mode 100644
index 0000000..4d30325
--- /dev/null
+++ b/NN/Examples/IAR/iar_nn_examples/NN-example-gru/NN-example-gru.ewp
@@ -0,0 +1,2276 @@
+
+
+ 3
+
+ Debug
+
+ ARM
+
+ 1
+
+ General
+ 3
+
+ 30
+ 1
+ 1
+
+ ExePath
+ Debug\Exe
+
+
+ ObjPath
+ Debug\Obj
+
+
+ ListPath
+ Debug\List
+
+
+ GEndianMode
+ 0
+
+
+ Input description
+ Automatic choice of formatter, without multibyte support.
+
+
+ Output description
+ Automatic choice of formatter, without multibyte support.
+
+
+ GOutputBinary
+ 0
+
+
+ OGCoreOrChip
+ 2
+
+
+ GRuntimeLibSelect
+ 0
+ 1
+
+
+ GRuntimeLibSelectSlave
+ 0
+ 1
+
+
+ RTDescription
+ Use the normal configuration of the C/C++ runtime library. No locale interface, C locale, no file descriptor support, no multibytes in printf and scanf, and no hex floats in strtod.
+
Generated on Wed Aug 1 2018 17:12:09 for CMSIS-Core (Cortex-M) by Arm Ltd. All rights reserved.
+
Generated on Wed Jul 10 2019 15:20:26 for CMSIS-Core (Cortex-M) Version 5.3.0 by Arm Ltd. All rights reserved.
-
Generated on Wed Aug 1 2018 17:12:09 for CMSIS-Core (Cortex-M) by Arm Ltd. All rights reserved.
+
Generated on Wed Jul 10 2019 15:20:26 for CMSIS-Core (Cortex-M) Version 5.3.0 by Arm Ltd. All rights reserved.
-
CMSIS-Core (Cortex-M) uses the common coding rules for CMSIS components that are documented under Introduction.
-
CMSIS-Core (Cortex-M) violates the following MISRA-C:2004 rules:
-
-
Required Rule 8.5, object/function definition in header file.
- Violated since function definitions in header files are used for function inlining'.
-
Advisory Rule 12.4, Side effects on right hand side of logical operator.
- Violated because volatile is used for core register definitions.
-
Advisory Rule 14.7, Return statement before end of function.
- Violated to simplify code logic.
-
Required Rule 18.4, declaration of union type or object of union type: '{...}'.
- Violated since unions are used for effective representation of core registers.
-
Advisory Rule 19.4, Disallowed definition for macro.
- Violated since macros are used for assembler keywords.
-
Advisory Rule 19.7, Function-like macro defined.
- Violated since function-like macros are used to generate more efficient code.
-
Advisory Rule 19.16, all preprocessing directives must be valid.
- Violated to set default settings for macros.
-
-
CMSIS-Core (Cortex-M) violates the following MISRA-C:2012 rules:
-
-
Directive 4.9, function-like macro defined.
- Violated since function-like macros are used to generate more efficient code.
-
Rule 1.3, multiple use of '#/##' operators in macro definition.
- Violated since function-like macros are used to generate more efficient code.
-
Rule 11.4, conversion between a pointer and integer type.
- Violated because of core register access.
-
Rule 11.6, cast from unsigned long to pointer.
- Violated because of core register access.
-
Rule 13.5, side effects on right hand side of logical operator.
- Violated because of shift operand is used in macros and functions.
-
Rule 14.4, conditional expression should have essentially Boolean type.
- Violated since macros with several instructions are used.
-
Rule 15.5, return statement before end of function.
- Violated to simplify code logic.
-
Rule 20.10, '#/##' operators used.
- Violated since function-like macros are used to generate more efficient code.
-
Rule 21.1, reserved to the compiler.
- Violated since macros with leading underscores are used.
-
-
<device>.h files generated by SVDConv.exe violate the following MISRA-C:2004 rules:
-
-
Advisory Rule 20.2, Re-use of C90 identifier pattern.
- Violated since CMSIS macros begin with '__'. Since CMSIS is developed and verified with various compilers this approach is acceptable and avoids conflicts with user symbols.
-
Advisory Rule 19.1, Declaration before #include.
- Violated since Interrupt Number Definition Type (IRQn_Type) must be defined before including the core header file.
-
+
CMSIS-Core (Cortex-M) uses the common coding rules for CMSIS components that are documented under Introduction .
-
Generated on Wed Aug 1 2018 17:12:08 for CMSIS-Core (Cortex-M) by Arm Ltd. All rights reserved.
+
Generated on Wed Jul 10 2019 15:20:25 for CMSIS-Core (Cortex-M) Version 5.3.0 by Arm Ltd. All rights reserved.
-
Generated on Wed Aug 1 2018 17:12:08 for CMSIS-Core (Cortex-M) by Arm Ltd. All rights reserved.
+
Generated on Wed Jul 10 2019 15:20:25 for CMSIS-Core (Cortex-M) Version 5.3.0 by Arm Ltd. All rights reserved.
Generated on Wed Aug 1 2018 17:12:08 for CMSIS-Core (Cortex-M) by Arm Ltd. All rights reserved.
+
Generated on Wed Jul 10 2019 15:20:25 for CMSIS-Core (Cortex-M) Version 5.3.0 by Arm Ltd. All rights reserved.
-
Generated on Wed Aug 1 2018 17:12:09 for CMSIS-Core (Cortex-M) by Arm Ltd. All rights reserved.
+
Generated on Wed Jul 10 2019 15:20:26 for CMSIS-Core (Cortex-M) Version 5.3.0 by Arm Ltd. All rights reserved.
-
Generated on Wed Aug 1 2018 17:12:09 for CMSIS-Core (Cortex-M) by Arm Ltd. All rights reserved.
+
Generated on Wed Jul 10 2019 15:20:26 for CMSIS-Core (Cortex-M) Version 5.3.0 by Arm Ltd. All rights reserved.
-
Generated on Wed Aug 1 2018 17:12:09 for CMSIS-Core (Cortex-M) by Arm Ltd. All rights reserved.
+
Generated on Wed Jul 10 2019 15:20:26 for CMSIS-Core (Cortex-M) Version 5.3.0 by Arm Ltd. All rights reserved.
@@ -183,7 +263,7 @@ $(document).ready(function(){initNavTree('globals_a.html','');});
-
Generated on Wed Aug 1 2018 17:12:09 for CMSIS-Core (Cortex-M) by Arm Ltd. All rights reserved.
+
Generated on Wed Jul 10 2019 15:20:26 for CMSIS-Core (Cortex-M) Version 5.3.0 by Arm Ltd. All rights reserved.
-
Generated on Wed Aug 1 2018 17:12:09 for CMSIS-Core (Cortex-M) by Arm Ltd. All rights reserved.
+
Generated on Wed Jul 10 2019 15:20:26 for CMSIS-Core (Cortex-M) Version 5.3.0 by Arm Ltd. All rights reserved.
-
Generated on Wed Aug 1 2018 17:12:09 for CMSIS-Core (Cortex-M) by Arm Ltd. All rights reserved.
+
Generated on Wed Jul 10 2019 15:20:26 for CMSIS-Core (Cortex-M) Version 5.3.0 by Arm Ltd. All rights reserved.