Official ARM version: v5.6.0

This commit is contained in:
rihab kouki 2020-07-28 11:24:49 +01:00
parent 9f95ff5b6b
commit 96d6da4e25
2939 changed files with 339304 additions and 113320 deletions

View file

@ -0,0 +1,63 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: BasicMathFunctions.c
* Description: Combination of all basic math function source files.
*
* $Date: 18. March 2019
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_abs_f32.c"
#include "arm_abs_q15.c"
#include "arm_abs_q31.c"
#include "arm_abs_q7.c"
#include "arm_add_f32.c"
#include "arm_add_q15.c"
#include "arm_add_q31.c"
#include "arm_add_q7.c"
#include "arm_dot_prod_f32.c"
#include "arm_dot_prod_q15.c"
#include "arm_dot_prod_q31.c"
#include "arm_dot_prod_q7.c"
#include "arm_mult_f32.c"
#include "arm_mult_q15.c"
#include "arm_mult_q31.c"
#include "arm_mult_q7.c"
#include "arm_negate_f32.c"
#include "arm_negate_q15.c"
#include "arm_negate_q31.c"
#include "arm_negate_q7.c"
#include "arm_offset_f32.c"
#include "arm_offset_q15.c"
#include "arm_offset_q31.c"
#include "arm_offset_q7.c"
#include "arm_scale_f32.c"
#include "arm_scale_q15.c"
#include "arm_scale_q31.c"
#include "arm_scale_q7.c"
#include "arm_shift_q15.c"
#include "arm_shift_q31.c"
#include "arm_shift_q7.c"
#include "arm_sub_f32.c"
#include "arm_sub_q15.c"
#include "arm_sub_q31.c"
#include "arm_sub_q7.c"

View file

@ -0,0 +1,16 @@
cmake_minimum_required (VERSION 3.6)
project(CMSISDSPBasicMath)
file(GLOB SRC "./*_*.c")
add_library(CMSISDSPBasicMath STATIC ${SRC})
configdsp(CMSISDSPBasicMath ..)
### Includes
target_include_directories(CMSISDSPBasicMath PUBLIC "${DSP}/../../Include")

View file

@ -3,13 +3,13 @@
* Title: arm_abs_f32.c
* Description: Floating-point vector absolute value
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -30,124 +30,117 @@
#include <math.h>
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @defgroup BasicAbs Vector Absolute Value
*
* Computes the absolute value of a vector on an element-by-element basis.
*
* <pre>
* pDst[n] = abs(pSrc[n]), 0 <= n < blockSize.
* </pre>
*
* The functions support in-place computation allowing the source and
* destination pointers to reference the same memory buffer.
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
@defgroup BasicAbs Vector Absolute Value
Computes the absolute value of a vector on an element-by-element basis.
<pre>
pDst[n] = abs(pSrc[n]), 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup BasicAbs
* @{
@addtogroup BasicAbs
@{
*/
/**
* @brief Floating-point vector absolute value.
* @param[in] *pSrc points to the input buffer
* @param[out] *pDst points to the output buffer
* @param[in] blockSize number of samples in each vector
* @return none.
@brief Floating-point vector absolute value.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_abs_f32(
float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
#if defined(ARM_MATH_NEON)
float32x4_t vec1;
float32x4_t res;
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t in1, in2, in3, in4; /* temporary variables */
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/*loop Unrolling */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute values and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc);
res = vabsq_f32(vec1);
vst1q_f32(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute and then store the results in the destination buffer. */
/* read sample from source */
in1 = *pSrc;
in2 = *(pSrc + 1);
in3 = *(pSrc + 2);
/* find absolute value */
in1 = fabsf(in1);
/* Calculate absolute and store result in destination buffer. */
*pDst++ = fabsf(*pSrc++);
/* read sample from source */
in4 = *(pSrc + 3);
*pDst++ = fabsf(*pSrc++);
/* find absolute value */
in2 = fabsf(in2);
*pDst++ = fabsf(*pSrc++);
/* read sample from source */
*pDst = in1;
*pDst++ = fabsf(*pSrc++);
/* find absolute value */
in3 = fabsf(in3);
/* find absolute value */
in4 = fabsf(in4);
/* store result to destination */
*(pDst + 1) = in2;
/* store result to destination */
*(pDst + 2) = in3;
/* store result to destination */
*(pDst + 3) = in4;
/* Update source pointer to process next sampels */
pSrc += 4U;
/* Update destination pointer to process next sampels */
pDst += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute and then store the results in the destination buffer. */
/* Calculate absolute and store result in destination buffer. */
*pDst++ = fabsf(*pSrc++);
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of BasicAbs group
@} end of BasicAbs group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_abs_q15.c
* Description: Q15 vector absolute value
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,139 +29,104 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicAbs
* @{
@addtogroup BasicAbs
@{
*/
/**
* @brief Q15 vector absolute value.
* @param[in] *pSrc points to the input buffer
* @param[out] *pDst points to the output buffer
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
@brief Q15 vector absolute value.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
*/
void arm_abs_q15(
q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
q15_t in; /* Temporary input variable */
#if defined (ARM_MATH_DSP)
__SIMD32_TYPE *simd;
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q15_t in1; /* Input value1 */
q15_t in2; /* Input value2 */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
simd = __SIMD32_CONST(pDst);
while (blkCnt > 0U)
{
/* C = |A| */
/* Read two inputs */
in1 = *pSrc++;
in2 = *pSrc++;
/* Store the Absolute result in the destination buffer by packing the two values, in a single cycle */
#ifndef ARM_MATH_BIG_ENDIAN
*simd++ =
__PKHBT(((in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1)),
((in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2)), 16);
/* Calculate absolute of input (if -1 then saturated to 0x7fff) and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
#endif
*simd++ =
__PKHBT(((in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2)),
((in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1)), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
in1 = *pSrc++;
in2 = *pSrc++;
#ifndef ARM_MATH_BIG_ENDIAN
*simd++ =
__PKHBT(((in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1)),
((in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2)), 16);
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
#endif
*simd++ =
__PKHBT(((in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2)),
((in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1)), 16);
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
#endif
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
pDst = (q15_t *)simd;
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Read the input */
in1 = *pSrc++;
/* Calculate absolute value of input and then store the result in the destination buffer. */
*pDst++ = (in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
q15_t in; /* Temporary input variable */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = |A| */
/* Read the input */
/* Calculate absolute of input (if -1 then saturated to 0x7fff) and store result in destination buffer. */
in = *pSrc++;
/* Calculate absolute value of input and then store the result in the destination buffer. */
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
#endif
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BasicAbs group
@} end of BasicAbs group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_abs_q31.c
* Description: Q31 vector absolute value
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,90 +29,104 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicAbs
* @{
@addtogroup BasicAbs
@{
*/
/**
* @brief Q31 vector absolute value.
* @param[in] *pSrc points to the input buffer
* @param[out] *pDst points to the output buffer
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
@brief Q31 vector absolute value.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
*/
void arm_abs_q31(
q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
const q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
q31_t in; /* Input value */
uint32_t blkCnt; /* Loop counter */
q31_t in; /* Temporary variable */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t in1, in2, in3, in4;
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute of input (if -1 then saturated to 0x7fffffff) and then store the results in the destination buffer. */
in1 = *pSrc++;
in2 = *pSrc++;
in3 = *pSrc++;
in4 = *pSrc++;
*pDst++ = (in1 > 0) ? in1 : (q31_t)__QSUB(0, in1);
*pDst++ = (in2 > 0) ? in2 : (q31_t)__QSUB(0, in2);
*pDst++ = (in3 > 0) ? in3 : (q31_t)__QSUB(0, in3);
*pDst++ = (in4 > 0) ? in4 : (q31_t)__QSUB(0, in4);
/* Calculate absolute of input (if -1 then saturated to 0x7fffffff) and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
#endif
/* Decrement the loop counter */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute value of the input (if -1 then saturated to 0x7fffffff) and then store the results in the destination buffer. */
in = *pSrc++;
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
/* Decrement the loop counter */
/* Calculate absolute of input (if -1 then saturated to 0x7fffffff) and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of BasicAbs group
@} end of BasicAbs group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_abs_q7.c
* Description: Q7 vector absolute value
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,117 +29,106 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicAbs
* @{
@addtogroup BasicAbs
@{
*/
/**
* @brief Q7 vector absolute value.
* @param[in] *pSrc points to the input buffer
* @param[out] *pDst points to the output buffer
* @param[in] blockSize number of samples in each vector
* @return none.
*
* \par Conditions for optimum performance
* Input and output buffers should be aligned by 32-bit
*
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
@brief Q7 vector absolute value.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Conditions for optimum performance
Input and output buffers should be aligned by 32-bit
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
*/
void arm_abs_q7(
q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize)
const q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
q7_t in; /* Input value1 */
uint32_t blkCnt; /* Loop counter */
q7_t in; /* Temporary input variable */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t in1, in2, in3, in4; /* temporary input variables */
q31_t out1, out2, out3, out4; /* temporary output variables */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = |A| */
/* Read inputs */
in1 = (q31_t) * pSrc;
in2 = (q31_t) * (pSrc + 1);
in3 = (q31_t) * (pSrc + 2);
/* find absolute value */
out1 = (in1 > 0) ? in1 : (q31_t)__QSUB8(0, in1);
/* Calculate absolute of input (if -1 then saturated to 0x7f) and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q7_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif
/* read input */
in4 = (q31_t) * (pSrc + 3);
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q7_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif
/* find absolute value */
out2 = (in2 > 0) ? in2 : (q31_t)__QSUB8(0, in2);
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q7_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif
/* store result to destination */
*pDst = (q7_t) out1;
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q7_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif
/* find absolute value */
out3 = (in3 > 0) ? in3 : (q31_t)__QSUB8(0, in3);
/* find absolute value */
out4 = (in4 > 0) ? in4 : (q31_t)__QSUB8(0, in4);
/* store result to destination */
*(pDst + 1) = (q7_t) out2;
/* store result to destination */
*(pDst + 2) = (q7_t) out3;
/* store result to destination */
*(pDst + 3) = (q7_t) out4;
/* update pointers to process next samples */
pSrc += 4U;
pDst += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #define ARM_MATH_CM0_FAMILY */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = |A| */
/* Read the input */
/* Calculate absolute of input (if -1 then saturated to 0x7f) and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q7_t) __QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif
/* Store the Absolute result in the destination buffer */
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? 0x7f : -in);
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of BasicAbs group
@} end of BasicAbs group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_add_f32.c
* Description: Floating-point vector addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,110 +29,117 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @defgroup BasicAdd Vector Addition
*
* Element-by-element addition of two vectors.
*
* <pre>
* pDst[n] = pSrcA[n] + pSrcB[n], 0 <= n < blockSize.
* </pre>
*
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
@defgroup BasicAdd Vector Addition
Element-by-element addition of two vectors.
<pre>
pDst[n] = pSrcA[n] + pSrcB[n], 0 <= n < blockSize.
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup BasicAdd
* @{
@addtogroup BasicAdd
@{
*/
/**
* @brief Floating-point vector addition.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
@brief Floating-point vector addition.
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_add_f32(
float32_t * pSrcA,
float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
#if defined(ARM_MATH_NEON)
float32x4_t vec1;
float32x4_t vec2;
float32x4_t res;
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t inA1, inA2, inA3, inA4; /* temporary input variabels */
float32_t inB1, inB2, inB3, inB4; /* temporary input variables */
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/*loop Unrolling */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
res = vaddq_f32(vec1, vec2);
vst1q_f32(pDst, res);
/* Increment pointers */
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
/* read four inputs from sourceA and four inputs from sourceB */
inA1 = *pSrcA;
inB1 = *pSrcB;
inA2 = *(pSrcA + 1);
inB2 = *(pSrcB + 1);
inA3 = *(pSrcA + 2);
inB3 = *(pSrcB + 2);
inA4 = *(pSrcA + 3);
inB4 = *(pSrcB + 3);
/* Add and store result in destination buffer. */
*pDst++ = (*pSrcA++) + (*pSrcB++);
*pDst++ = (*pSrcA++) + (*pSrcB++);
*pDst++ = (*pSrcA++) + (*pSrcB++);
*pDst++ = (*pSrcA++) + (*pSrcB++);
/* C = A + B */
/* add and store result to destination */
*pDst = inA1 + inB1;
*(pDst + 1) = inA2 + inB2;
*(pDst + 2) = inA3 + inB3;
*(pDst + 3) = inA4 + inB4;
/* update pointers to process next samples */
pSrcA += 4U;
pSrcB += 4U;
pDst += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
/* Add and store result in destination buffer. */
*pDst++ = (*pSrcA++) + (*pSrcB++);
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of BasicAdd group
@} end of BasicAdd group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_add_q15.c
* Description: Q15 vector addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,100 +29,98 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicAdd
* @{
@addtogroup BasicAdd
@{
*/
/**
* @brief Q15 vector addition.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
@brief Q15 vector addition.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
void arm_add_q15(
q15_t * pSrcA,
q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t inA1, inA2;
q31_t inB1, inB2;
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t inA1, inA2, inB1, inB2;
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
inA1 = *__SIMD32(pSrcA)++;
inA2 = *__SIMD32(pSrcA)++;
inB1 = *__SIMD32(pSrcB)++;
inB2 = *__SIMD32(pSrcB)++;
*__SIMD32(pDst)++ = __QADD16(inA1, inB1);
*__SIMD32(pDst)++ = __QADD16(inA2, inB2);
#if defined (ARM_MATH_DSP)
/* read 2 times 2 samples at a time from sourceA */
inA1 = read_q15x2_ia ((q15_t **) &pSrcA);
inA2 = read_q15x2_ia ((q15_t **) &pSrcA);
/* read 2 times 2 samples at a time from sourceB */
inB1 = read_q15x2_ia ((q15_t **) &pSrcB);
inB2 = read_q15x2_ia ((q15_t **) &pSrcB);
/* Decrement the loop counter */
/* Add and store 2 times 2 samples at a time */
write_q15x2_ia (&pDst, __QADD16(inA1, inB1));
write_q15x2_ia (&pDst, __QADD16(inA2, inB2));
#else
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
*pDst++ = (q15_t) __QADD16(*pSrcA++, *pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
*pDst++ = (q15_t) __SSAT(((q31_t) * pSrcA++ + *pSrcB++), 16);
/* Decrement the loop counter */
/* Add and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pDst++ = (q15_t) __QADD16(*pSrcA++, *pSrcB++);
#else
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BasicAdd group
@} end of BasicAdd group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_add_q31.c
* Description: Q31 vector addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,108 +29,80 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicAdd
* @{
@addtogroup BasicAdd
@{
*/
/**
* @brief Q31 vector addition.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
@brief Q31 vector addition.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
void arm_add_q31(
q31_t * pSrcA,
q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t inA1, inA2, inA3, inA4;
q31_t inB1, inB2, inB3, inB4;
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
inA1 = *pSrcA++;
inA2 = *pSrcA++;
inB1 = *pSrcB++;
inB2 = *pSrcB++;
inA3 = *pSrcA++;
inA4 = *pSrcA++;
inB3 = *pSrcB++;
inB4 = *pSrcB++;
*pDst++ = __QADD(inA1, inB1);
*pDst++ = __QADD(inA2, inB2);
*pDst++ = __QADD(inA3, inB3);
*pDst++ = __QADD(inA4, inB4);
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
/* Add and store result in destination buffer. */
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
/* Decrement the loop counter */
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
*pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrcA++ + *pSrcB++);
/* Decrement the loop counter */
/* Add and store result in destination buffer. */
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BasicAdd group
@} end of BasicAdd group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_add_q7.c
* Description: Q7 vector addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,94 +29,81 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicAdd
* @{
@addtogroup BasicAdd
@{
*/
/**
* @brief Q7 vector addition.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
@brief Q7 vector addition.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
*/
void arm_add_q7(
q7_t * pSrcA,
q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
const q7_t * pSrcA,
const q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
*__SIMD32(pDst)++ = __QADD8(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
/* Decrement the loop counter */
#if defined (ARM_MATH_DSP)
/* Add and store result in destination buffer (4 samples at a time). */
write_q7x4_ia (&pDst, __QADD8 (read_q7x4_ia ((q7_t **) &pSrcA), read_q7x4_ia ((q7_t **) &pSrcB)));
#else
*pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
*pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
*pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
*pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
*pDst++ = (q7_t) __SSAT(*pSrcA++ + *pSrcB++, 8);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
*pDst++ = (q7_t) __SSAT((q15_t) * pSrcA++ + *pSrcB++, 8);
/* Decrement the loop counter */
/* Add and store result in destination buffer. */
*pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ + *pSrcB++, 8);
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BasicAdd group
@} end of BasicAdd group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_dot_prod_f32.c
* Description: Floating-point dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,95 +29,135 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @defgroup dot_prod Vector Dot Product
*
* Computes the dot product of two vectors.
* The vectors are multiplied element-by-element and then summed.
*
* <pre>
* sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
* </pre>
*
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
@defgroup BasicDotProd Vector Dot Product
Computes the dot product of two vectors.
The vectors are multiplied element-by-element and then summed.
<pre>
sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup dot_prod
* @{
@addtogroup BasicDotProd
@{
*/
/**
* @brief Dot product of floating-point vectors.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[out] *result output result returned here
* @return none.
@brief Dot product of floating-point vectors.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[in] blockSize number of samples in each vector.
@param[out] result output result returned here.
@return none
*/
void arm_dot_prod_f32(
float32_t * pSrcA,
float32_t * pSrcB,
uint32_t blockSize,
float32_t * result)
const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t blockSize,
float32_t * result)
{
float32_t sum = 0.0f; /* Temporary result storage */
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
float32_t sum = 0.0f; /* Temporary return variable */
#if defined(ARM_MATH_NEON)
float32x4_t vec1;
float32x4_t vec2;
float32x4_t res;
float32x4_t accum = vdupq_n_f32(0);
#if defined (ARM_MATH_DSP)
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* Run the below code for Cortex-M4 and Cortex-M3 */
/*loop Unrolling */
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
while (blkCnt > 0U)
{
/* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */
/* Calculate dot product and then store the result in a temporary buffer. */
accum = vmlaq_f32(accum, vec1, vec2);
/* Increment pointers */
pSrcA += 4;
pSrcB += 4;
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
/* Decrement the loop counter */
blkCnt--;
}
#if __aarch64__
sum = vpadds_f32(vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)));
#else
sum = (vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)))[0] + (vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)))[1];
#endif
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and then store the result in a temporary buffer */
sum += (*pSrcA++) * (*pSrcB++);
sum += (*pSrcA++) * (*pSrcB++);
sum += (*pSrcA++) * (*pSrcB++);
/* Calculate dot product and store result in a temporary buffer. */
sum += (*pSrcA++) * (*pSrcB++);
/* Decrement the loop counter */
sum += (*pSrcA++) * (*pSrcB++);
sum += (*pSrcA++) * (*pSrcB++);
sum += (*pSrcA++) * (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and then store the result in a temporary buffer. */
/* Calculate dot product and store result in a temporary buffer. */
sum += (*pSrcA++) * (*pSrcB++);
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* Store the result back in the destination buffer */
/* Store result in destination buffer */
*result = sum;
}
/**
* @} end of dot_prod group
@} end of BasicDotProd group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_dot_prod_q15.c
* Description: Q15 dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,100 +29,92 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup dot_prod
* @{
@addtogroup BasicDotProd
@{
*/
/**
* @brief Dot product of Q15 vectors.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[out] *result output result returned here
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The intermediate multiplications are in 1.15 x 1.15 = 2.30 format and these
* results are added to a 64-bit accumulator in 34.30 format.
* Nonsaturating additions are used and given that there are 33 guard bits in the accumulator
* there is no risk of overflow.
* The return result is in 34.30 format.
@brief Dot product of Q15 vectors.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] blockSize number of samples in each vector
@param[out] result output result returned here
@return none
@par Scaling and Overflow Behavior
The intermediate multiplications are in 1.15 x 1.15 = 2.30 format and these
results are added to a 64-bit accumulator in 34.30 format.
Nonsaturating additions are used and given that there are 33 guard bits in the accumulator
there is no risk of overflow.
The return result is in 34.30 format.
*/
void arm_dot_prod_q15(
q15_t * pSrcA,
q15_t * pSrcB,
uint32_t blockSize,
q63_t * result)
const q15_t * pSrcA,
const q15_t * pSrcB,
uint32_t blockSize,
q63_t * result)
{
q63_t sum = 0; /* Temporary result storage */
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Temporary return variable */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and then store the result in a temporary buffer. */
sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);
sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);
/* Decrement the loop counter */
#if defined (ARM_MATH_DSP)
/* Calculate dot product and store result in a temporary buffer. */
sum = __SMLALD(read_q15x2_ia ((q15_t **) &pSrcA), read_q15x2_ia ((q15_t **) &pSrcB), sum);
sum = __SMLALD(read_q15x2_ia ((q15_t **) &pSrcA), read_q15x2_ia ((q15_t **) &pSrcB), sum);
#else
sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and then store the results in a temporary buffer. */
sum = __SMLALD(*pSrcA++, *pSrcB++, sum);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and then store the results in a temporary buffer. */
sum += (q63_t) ((q31_t) * pSrcA++ * *pSrcB++);
/* Decrement the loop counter */
/* Calculate dot product and store result in a temporary buffer. */
//#if defined (ARM_MATH_DSP)
// sum = __SMLALD(*pSrcA++, *pSrcB++, sum);
//#else
sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
//#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Store the result in the destination buffer in 34.30 format */
/* Store result in destination buffer in 34.30 format */
*result = sum;
}
/**
* @} end of dot_prod group
@} end of BasicDotProd group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_dot_prod_q31.c
* Description: Q31 dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,103 +29,87 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup dot_prod
* @{
@addtogroup BasicDotProd
@{
*/
/**
* @brief Dot product of Q31 vectors.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[out] *result output result returned here
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
* are truncated to 2.48 format by discarding the lower 14 bits.
* The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
* There are 15 guard bits in the accumulator and there is no risk of overflow as long as
* the length of the vectors is less than 2^16 elements.
* The return result is in 16.48 format.
@brief Dot product of Q31 vectors.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[in] blockSize number of samples in each vector.
@param[out] result output result returned here.
@return none
@par Scaling and Overflow Behavior
The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
are truncated to 2.48 format by discarding the lower 14 bits.
The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
There are 15 guard bits in the accumulator and there is no risk of overflow as long as
the length of the vectors is less than 2^16 elements.
The return result is in 16.48 format.
*/
void arm_dot_prod_q31(
q31_t * pSrcA,
q31_t * pSrcB,
uint32_t blockSize,
q63_t * result)
const q31_t * pSrcA,
const q31_t * pSrcB,
uint32_t blockSize,
q63_t * result)
{
q63_t sum = 0; /* Temporary result storage */
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Temporary return variable */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t inA1, inA2, inA3, inA4;
q31_t inB1, inB2, inB3, inB4;
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and then store the result in a temporary buffer. */
inA1 = *pSrcA++;
inA2 = *pSrcA++;
inA3 = *pSrcA++;
inA4 = *pSrcA++;
inB1 = *pSrcB++;
inB2 = *pSrcB++;
inB3 = *pSrcB++;
inB4 = *pSrcB++;
sum += ((q63_t) inA1 * inB1) >> 14U;
sum += ((q63_t) inA2 * inB2) >> 14U;
sum += ((q63_t) inA3 * inB3) >> 14U;
sum += ((q63_t) inA4 * inB4) >> 14U;
/* Calculate dot product and store result in a temporary buffer. */
sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
/* Decrement the loop counter */
sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and then store the result in a temporary buffer. */
sum += ((q63_t) * pSrcA++ * *pSrcB++) >> 14U;
/* Decrement the loop counter */
/* Calculate dot product and store result in a temporary buffer. */
sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
/* Decrement loop counter */
blkCnt--;
}
/* Store the result in the destination buffer in 16.48 format */
/* Store result in destination buffer in 16.48 format */
*result = sum;
}
/**
* @} end of dot_prod group
@} end of BasicDotProd group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_dot_prod_q7.c
* Description: Q7 dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,61 +29,58 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup dot_prod
* @{
@addtogroup BasicDotProd
@{
*/
/**
* @brief Dot product of Q7 vectors.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[out] *result output result returned here
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The intermediate multiplications are in 1.7 x 1.7 = 2.14 format and these
* results are added to an accumulator in 18.14 format.
* Nonsaturating additions are used and there is no danger of wrap around as long as
* the vectors are less than 2^18 elements long.
* The return result is in 18.14 format.
@brief Dot product of Q7 vectors.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] blockSize number of samples in each vector
@param[out] result output result returned here
@return none
@par Scaling and Overflow Behavior
The intermediate multiplications are in 1.7 x 1.7 = 2.14 format and these
results are added to an accumulator in 18.14 format.
Nonsaturating additions are used and there is no danger of wrap around as long as
the vectors are less than 2^18 elements long.
The return result is in 18.14 format.
*/
void arm_dot_prod_q7(
q7_t * pSrcA,
q7_t * pSrcB,
uint32_t blockSize,
q31_t * result)
const q7_t * pSrcA,
const q7_t * pSrcB,
uint32_t blockSize,
q31_t * result)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
q31_t sum = 0; /* Temporary return variable */
q31_t sum = 0; /* Temporary variables to store output */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t input1, input2; /* Temporary variables */
q31_t inA1, inA2, inB1, inB2; /* Temporary variables */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t input1, input2; /* Temporary variables to store input */
q31_t inA1, inA2, inB1, inB2; /* Temporary variables to store input */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
#if defined (ARM_MATH_DSP)
/* read 4 samples at a time from sourceA */
input1 = *__SIMD32(pSrcA)++;
input1 = read_q7x4_ia ((q7_t **) &pSrcA);
/* read 4 samples at a time from sourceB */
input2 = *__SIMD32(pSrcB)++;
input2 = read_q7x4_ia ((q7_t **) &pSrcB);
/* extract two q7_t samples to q15_t samples */
inA1 = __SXTB16(__ROR(input1, 8));
@ -97,51 +94,46 @@ void arm_dot_prod_q7(
/* multiply and accumulate two samples at a time */
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
#else
sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
#endif
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Dot product and then store the results in a temporary buffer. */
sum = __SMLAD(*pSrcA++, *pSrcB++, sum);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Dot product and then store the results in a temporary buffer. */
sum += (q31_t) ((q15_t) * pSrcA++ * *pSrcB++);
/* Decrement the loop counter */
/* Calculate dot product and store result in a temporary buffer. */
//#if defined (ARM_MATH_DSP)
// sum = __SMLAD(*pSrcA++, *pSrcB++, sum);
//#else
sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
//#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Store the result in the destination buffer in 18.14 format */
/* Store result in destination buffer in 18.14 format */
*result = sum;
}
/**
* @} end of dot_prod group
@} end of BasicDotProd group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mult_f32.c
* Description: Floating-point vector multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,134 +29,120 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @defgroup BasicMult Vector Multiplication
*
* Element-by-element multiplication of two vectors.
*
* <pre>
* pDst[n] = pSrcA[n] * pSrcB[n], 0 <= n < blockSize.
* </pre>
*
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
@defgroup BasicMult Vector Multiplication
Element-by-element multiplication of two vectors.
<pre>
pDst[n] = pSrcA[n] * pSrcB[n], 0 <= n < blockSize.
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup BasicMult
* @{
@addtogroup BasicMult
@{
*/
/**
* @brief Floating-point vector multiplication.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
@brief Floating-point vector multiplication.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
*/
void arm_mult_f32(
float32_t * pSrcA,
float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
#if defined (ARM_MATH_DSP)
uint32_t blkCnt; /* Loop counter */
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t inA1, inA2, inA3, inA4; /* temporary input variables */
float32_t inB1, inB2, inB3, inB4; /* temporary input variables */
float32_t out1, out2, out3, out4; /* temporary output variables */
#if defined(ARM_MATH_NEON)
float32x4_t vec1;
float32x4_t vec2;
float32x4_t res;
/* loop Unrolling */
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
res = vmulq_f32(vec1, vec2);
vst1q_f32(pDst, res);
/* Increment pointers */
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and store the results in output buffer */
/* read sample from sourceA */
inA1 = *pSrcA;
/* read sample from sourceB */
inB1 = *pSrcB;
/* read sample from sourceA */
inA2 = *(pSrcA + 1);
/* read sample from sourceB */
inB2 = *(pSrcB + 1);
/* out = sourceA * sourceB */
out1 = inA1 * inB1;
/* Multiply inputs and store result in destination buffer. */
*pDst++ = (*pSrcA++) * (*pSrcB++);
/* read sample from sourceA */
inA3 = *(pSrcA + 2);
/* read sample from sourceB */
inB3 = *(pSrcB + 2);
*pDst++ = (*pSrcA++) * (*pSrcB++);
/* out = sourceA * sourceB */
out2 = inA2 * inB2;
*pDst++ = (*pSrcA++) * (*pSrcB++);
/* read sample from sourceA */
inA4 = *(pSrcA + 3);
*pDst++ = (*pSrcA++) * (*pSrcB++);
/* store result to destination buffer */
*pDst = out1;
/* read sample from sourceB */
inB4 = *(pSrcB + 3);
/* out = sourceA * sourceB */
out3 = inA3 * inB3;
/* store result to destination buffer */
*(pDst + 1) = out2;
/* out = sourceA * sourceB */
out4 = inA4 * inB4;
/* store result to destination buffer */
*(pDst + 2) = out3;
/* store result to destination buffer */
*(pDst + 3) = out4;
/* update pointers to process next samples */
pSrcA += 4U;
pSrcB += 4U;
pDst += 4U;
/* Decrement the blockSize loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and store the results in output buffer */
/* Multiply input and store result in destination buffer. */
*pDst++ = (*pSrcA++) * (*pSrcB++);
/* Decrement the blockSize loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of BasicMult group
@} end of BasicMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mult_q15.c
* Description: Q15 vector multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,65 +29,65 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicMult
* @{
@addtogroup BasicMult
@{
*/
/**
* @brief Q15 vector multiplication
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
@brief Q15 vector multiplication
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
void arm_mult_q15(
q15_t * pSrcA,
q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t inA1, inA2, inB1, inB2; /* Temporary input variables */
q15_t out1, out2, out3, out4; /* Temporary output variables */
q31_t mul1, mul2, mul3, mul4; /* Temporary variables */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t inA1, inA2, inB1, inB2; /* temporary input variables */
q15_t out1, out2, out3, out4; /* temporary output variables */
q31_t mul1, mul2, mul3, mul4; /* temporary variables */
/* loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* read two samples at a time from sourceA */
inA1 = *__SIMD32(pSrcA)++;
/* read two samples at a time from sourceB */
inB1 = *__SIMD32(pSrcB)++;
/* read two samples at a time from sourceA */
inA2 = *__SIMD32(pSrcA)++;
/* read two samples at a time from sourceB */
inB2 = *__SIMD32(pSrcB)++;
/* C = A * B */
#if defined (ARM_MATH_DSP)
/* read 2 samples at a time from sourceA */
inA1 = read_q15x2_ia ((q15_t **) &pSrcA);
/* read 2 samples at a time from sourceB */
inB1 = read_q15x2_ia ((q15_t **) &pSrcB);
/* read 2 samples at a time from sourceA */
inA2 = read_q15x2_ia ((q15_t **) &pSrcA);
/* read 2 samples at a time from sourceB */
inB2 = read_q15x2_ia ((q15_t **) &pSrcB);
/* multiply mul = sourceA * sourceB */
mul1 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
mul2 = (q31_t) ((q15_t) inA1 * (q15_t) inB1);
mul2 = (q31_t) ((q15_t) (inA1 ) * (q15_t) (inB1 ));
mul3 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB2 >> 16));
mul4 = (q31_t) ((q15_t) inA2 * (q15_t) inB2);
mul4 = (q31_t) ((q15_t) (inA2 ) * (q15_t) (inB2 ));
/* saturate result to 16 bit */
out1 = (q15_t) __SSAT(mul1 >> 15, 16);
@ -95,48 +95,49 @@ void arm_mult_q15(
out3 = (q15_t) __SSAT(mul3 >> 15, 16);
out4 = (q15_t) __SSAT(mul4 >> 15, 16);
/* store the result */
/* store result to destination */
#ifndef ARM_MATH_BIG_ENDIAN
*__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
*__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
write_q15x2_ia (&pDst, __PKHBT(out2, out1, 16));
write_q15x2_ia (&pDst, __PKHBT(out4, out3, 16));
#else
*__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
*__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
write_q15x2_ia (&pDst, __PKHBT(out1, out2, 16));
write_q15x2_ia (&pDst, __PKHBT(out3, out4, 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Decrement the blockSize loop counter */
#else
*pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
*pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
*pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
*pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and store the result in the destination buffer */
/* Multiply inputs and store result in destination buffer. */
*pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
/* Decrement the blockSize loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of BasicMult group
@} end of BasicMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mult_q31.c
* Description: Q31 vector multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,120 +29,91 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicMult
* @{
@addtogroup BasicMult
@{
*/
/**
* @brief Q31 vector multiplication.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
@brief Q31 vector multiplication.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
*/
void arm_mult_q31(
q31_t * pSrcA,
q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
uint32_t blkCnt; /* Loop counter */
q31_t out; /* Temporary output variable */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t inA1, inA2, inA3, inA4; /* temporary input variables */
q31_t inB1, inB2, inB3, inB4; /* temporary input variables */
q31_t out1, out2, out3, out4; /* temporary output variables */
/* loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and then store the results in the destination buffer. */
inA1 = *pSrcA++;
inA2 = *pSrcA++;
inA3 = *pSrcA++;
inA4 = *pSrcA++;
inB1 = *pSrcB++;
inB2 = *pSrcB++;
inB3 = *pSrcB++;
inB4 = *pSrcB++;
out1 = ((q63_t) inA1 * inB1) >> 32;
out2 = ((q63_t) inA2 * inB2) >> 32;
out3 = ((q63_t) inA3 * inB3) >> 32;
out4 = ((q63_t) inA4 * inB4) >> 32;
/* Multiply inputs and store result in destination buffer. */
out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
out = __SSAT(out, 31);
*pDst++ = out << 1U;
out1 = __SSAT(out1, 31);
out2 = __SSAT(out2, 31);
out3 = __SSAT(out3, 31);
out4 = __SSAT(out4, 31);
out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
out = __SSAT(out, 31);
*pDst++ = out << 1U;
*pDst++ = out1 << 1U;
*pDst++ = out2 << 1U;
*pDst++ = out3 << 1U;
*pDst++ = out4 << 1U;
out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
out = __SSAT(out, 31);
*pDst++ = out << 1U;
/* Decrement the blockSize loop counter */
out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
out = __SSAT(out, 31);
*pDst++ = out << 1U;
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and then store the results in the destination buffer. */
inA1 = *pSrcA++;
inB1 = *pSrcB++;
out1 = ((q63_t) inA1 * inB1) >> 32;
out1 = __SSAT(out1, 31);
*pDst++ = out1 << 1U;
/* Decrement the blockSize loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and then store the results in the destination buffer. */
*pDst++ =
(q31_t) clip_q63_to_q31(((q63_t) (*pSrcA++) * (*pSrcB++)) >> 31);
/* Decrement the blockSize loop counter */
/* Multiply inputs and store result in destination buffer. */
out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
out = __SSAT(out, 31);
*pDst++ = out << 1U;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BasicMult group
@} end of BasicMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mult_q7.c
* Description: Q7 vector multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,87 +29,91 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicMult
* @{
@addtogroup BasicMult
@{
*/
/**
* @brief Q7 vector multiplication
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
@brief Q7 vector multiplication
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
*/
void arm_mult_q7(
q7_t * pSrcA,
q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
const q7_t * pSrcA,
const q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q7_t out1, out2, out3, out4; /* Temporary output variables */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
q7_t out1, out2, out3, out4; /* Temporary variables to store the product */
/* loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and store the results in temporary variables */
#if defined (ARM_MATH_DSP)
/* Multiply inputs and store results in temporary variables */
out1 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
out2 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
out3 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
out4 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
/* Store the results of 4 inputs in the destination buffer in single cycle by packing */
*__SIMD32(pDst)++ = __PACKq7(out1, out2, out3, out4);
/* Pack and store result in destination buffer (in single write) */
write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4));
#else
*pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
*pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
*pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
*pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
#endif
/* Decrement the blockSize loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and store the result in the destination buffer */
/* Multiply input and store result in destination buffer. */
*pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
/* Decrement the blockSize loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of BasicMult group
@} end of BasicMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_negate_f32.c
* Description: Negates floating-point vectors
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,106 +29,117 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @defgroup negate Vector Negate
*
* Negates the elements of a vector.
*
* <pre>
* pDst[n] = -pSrc[n], 0 <= n < blockSize.
* </pre>
*
* The functions support in-place computation allowing the source and
* destination pointers to reference the same memory buffer.
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
@defgroup BasicNegate Vector Negate
Negates the elements of a vector.
<pre>
pDst[n] = -pSrc[n], 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup negate
* @{
@addtogroup BasicNegate
@{
*/
/**
* @brief Negates the elements of a floating-point vector.
* @param[in] *pSrc points to the input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
@brief Negates the elements of a floating-point vector.
@param[in] pSrc points to input vector.
@param[out] pDst points to output vector.
@param[in] blockSize number of samples in each vector.
@return none
*/
void arm_negate_f32(
float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON_EXPERIMENTAL)
float32x4_t vec1;
float32x4_t res;
#if defined (ARM_MATH_DSP)
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t in1, in2, in3, in4; /* temporary variables */
while (blkCnt > 0U)
{
/* C = -A */
/*loop Unrolling */
blkCnt = blockSize >> 2U;
/* Negate and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc);
res = vnegq_f32(vec1);
vst1q_f32(pDst, res);
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* read inputs from source */
in1 = *pSrc;
in2 = *(pSrc + 1);
in3 = *(pSrc + 2);
in4 = *(pSrc + 3);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* negate the input */
in1 = -in1;
in2 = -in2;
in3 = -in3;
in4 = -in4;
/* store the result to destination */
*pDst = in1;
*(pDst + 1) = in2;
*(pDst + 2) = in3;
*(pDst + 3) = in4;
/* update pointers to process next samples */
pSrc += 4U;
pDst += 4U;
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and then store the results in the destination buffer. */
/* Negate and store result in destination buffer. */
*pDst++ = -*pSrc++;
/* Decrement the loop counter */
*pDst++ = -*pSrc++;
*pDst++ = -*pSrc++;
*pDst++ = -*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and store result in destination buffer. */
*pDst++ = -*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of negate group
@} end of BasicNegate group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_negate_q15.c
* Description: Negates Q15 vectors
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,103 +29,98 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup negate
* @{
@addtogroup BasicNegate
@{
*/
/**
* @brief Negates the elements of a Q15 vector.
* @param[in] *pSrc points to the input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* \par Conditions for optimum performance
* Input and output buffers should be aligned by 32-bit
*
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
@brief Negates the elements of a Q15 vector.
@param[in] pSrc points to the input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
@par Conditions for optimum performance
Input and output buffers should be aligned by 32-bit
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
*/
void arm_negate_q15(
q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
q15_t in;
uint32_t blkCnt; /* Loop counter */
q15_t in; /* Temporary input variable */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t in1; /* Temporary input variables */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t in1, in2; /* Temporary variables */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = -A */
/* Read two inputs at a time */
in1 = _SIMD32_OFFSET(pSrc);
in2 = _SIMD32_OFFSET(pSrc + 2);
/* negate two samples at a time */
in1 = __QSUB16(0, in1);
#if defined (ARM_MATH_DSP)
/* Negate and store result in destination buffer (2 samples at a time). */
in1 = read_q15x2_ia ((q15_t **) &pSrc);
write_q15x2_ia (&pDst, __QSUB16(0, in1));
/* negate two samples at a time */
in2 = __QSUB16(0, in2);
in1 = read_q15x2_ia ((q15_t **) &pSrc);
write_q15x2_ia (&pDst, __QSUB16(0, in1));
#else
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
/* store the result to destination 2 samples at a time */
_SIMD32_OFFSET(pDst) = in1;
/* store the result to destination 2 samples at a time */
_SIMD32_OFFSET(pDst + 2) = in2;
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
/* update pointers to process next samples */
pSrc += 4U;
pDst += 4U;
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
#endif
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and then store the result in the destination buffer. */
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? 0x7fff : -in;
/* Decrement the loop counter */
/* Negate and store result in destination buffer. */
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of negate group
@} end of BasicNegate group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_negate_q31.c
* Description: Negates Q31 vectors
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,89 +29,104 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup negate
* @{
@addtogroup BasicNegate
@{
*/
/**
* @brief Negates the elements of a Q31 vector.
* @param[in] *pSrc points to the input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
@brief Negates the elements of a Q31 vector.
@param[in] pSrc points to the input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
*/
void arm_negate_q31(
q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
const q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
{
q31_t in; /* Temporary variable */
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
q31_t in; /* Temporary input variable */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t in1, in2, in3, in4;
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and then store the results in the destination buffer. */
in1 = *pSrc++;
in2 = *pSrc++;
in3 = *pSrc++;
in4 = *pSrc++;
*pDst++ = __QSUB(0, in1);
*pDst++ = __QSUB(0, in2);
*pDst++ = __QSUB(0, in3);
*pDst++ = __QSUB(0, in4);
/* Negate and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* Decrement the loop counter */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and then store the result in the destination buffer. */
in = *pSrc++;
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
/* Decrement the loop counter */
/* Negate and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of negate group
@} end of BasicNegate group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_negate_q7.c
* Description: Negates Q7 vectors
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,85 +29,98 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup negate
* @{
@addtogroup BasicNegate
@{
*/
/**
* @brief Negates the elements of a Q7 vector.
* @param[in] *pSrc points to the input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
@brief Negates the elements of a Q7 vector.
@param[in] pSrc points to the input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q7 value -1 (0x80) is saturated to the maximum allowable positive value 0x7F.
*/
void arm_negate_q7(
q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize)
const q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
q7_t in;
uint32_t blkCnt; /* Loop counter */
q7_t in; /* Temporary input variable */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t in1; /* Temporary input variable */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t input; /* Input values1-4 */
q31_t zero = 0x00000000;
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = -A */
/* Read four inputs */
input = *__SIMD32(pSrc)++;
/* Store the Negated results in the destination buffer in a single cycle by packing the results */
*__SIMD32(pDst)++ = __QSUB8(zero, input);
#if defined (ARM_MATH_DSP)
/* Negate and store result in destination buffer (4 samples at a time). */
in1 = read_q7x4_ia ((q7_t **) &pSrc);
write_q7x4_ia (&pDst, __QSUB8(0, in1));
#else
in = *pSrc++;
*pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
/* Decrement the loop counter */
in = *pSrc++;
*pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
in = *pSrc++;
*pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
in = *pSrc++;
*pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and then store the results in the destination buffer. */ \
in = *pSrc++;
*pDst++ = (in == (q7_t) 0x80) ? 0x7f : -in;
/* Decrement the loop counter */
/* Negate and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (q7_t) __QSUB(0, in);
#else
*pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of negate group
@} end of BasicNegate group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_offset_f32.c
* Description: Floating-point vector offset
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,126 +29,119 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @defgroup offset Vector Offset
*
* Adds a constant offset to each element of a vector.
*
* <pre>
* pDst[n] = pSrc[n] + offset, 0 <= n < blockSize.
* </pre>
*
* The functions support in-place computation allowing the source and
* destination pointers to reference the same memory buffer.
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
@defgroup BasicOffset Vector Offset
Adds a constant offset to each element of a vector.
<pre>
pDst[n] = pSrc[n] + offset, 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup offset
* @{
@addtogroup BasicOffset
@{
*/
/**
* @brief Adds a constant offset to a floating-point vector.
* @param[in] *pSrc points to the input vector
* @param[in] offset is the offset to be added
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
@brief Adds a constant offset to a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_offset_f32(
float32_t * pSrc,
float32_t offset,
float32_t * pDst,
uint32_t blockSize)
const float32_t * pSrc,
float32_t offset,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
#if defined(ARM_MATH_NEON_EXPERIMENTAL)
float32x4_t vec1;
float32x4_t res;
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t in1, in2, in3, in4;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/*loop Unrolling */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc);
res = vaddq_f32(vec1,vdupq_n_f32(offset));
vst1q_f32(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
/* read samples from source */
in1 = *pSrc;
in2 = *(pSrc + 1);
/* add offset to input */
in1 = in1 + offset;
/* Add offset and store result in destination buffer. */
*pDst++ = (*pSrc++) + offset;
/* read samples from source */
in3 = *(pSrc + 2);
*pDst++ = (*pSrc++) + offset;
/* add offset to input */
in2 = in2 + offset;
*pDst++ = (*pSrc++) + offset;
/* read samples from source */
in4 = *(pSrc + 3);
*pDst++ = (*pSrc++) + offset;
/* add offset to input */
in3 = in3 + offset;
/* store result to destination */
*pDst = in1;
/* add offset to input */
in4 = in4 + offset;
/* store result to destination */
*(pDst + 1) = in2;
/* store result to destination */
*(pDst + 2) = in3;
/* store result to destination */
*(pDst + 3) = in4;
/* update pointers to process next samples */
pSrc += 4U;
pDst += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the result in the destination buffer. */
/* Add offset and store result in destination buffer. */
*pDst++ = (*pSrc++) + offset;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of offset group
@} end of BasicOffset group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_offset_q15.c
* Description: Q15 vector offset
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,96 +29,93 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup offset
* @{
@addtogroup BasicOffset
@{
*/
/**
* @brief Adds a constant offset to a Q15 vector.
* @param[in] *pSrc points to the input vector
* @param[in] offset is the offset to be added
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
@brief Adds a constant offset to a Q15 vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
void arm_offset_q15(
q15_t * pSrc,
q15_t offset,
q15_t * pDst,
uint32_t blockSize)
const q15_t * pSrc,
q15_t offset,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t offset_packed; /* Offset packed to 32 bit */
/*loop Unrolling */
blkCnt = blockSize >> 2U;
/* Offset is packed to 32 bit in order to use SIMD32 for addition */
offset_packed = __PKHBT(offset, offset, 16);
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer, 2 samples at a time. */
*__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offset_packed);
*__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offset_packed);
/* Decrement the loop counter */
#if defined (ARM_MATH_DSP)
/* Add offset and store result in destination buffer (2 samples at a time). */
write_q15x2_ia (&pDst, __QADD16(read_q15x2_ia ((q15_t **) &pSrc), offset_packed));
write_q15x2_ia (&pDst, __QADD16(read_q15x2_ia ((q15_t **) &pSrc), offset_packed));
#else
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
*pDst++ = (q15_t) __QADD16(*pSrc++, offset);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
*pDst++ = (q15_t) __SSAT(((q31_t) * pSrc++ + offset), 16);
/* Decrement the loop counter */
/* Add offset and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pDst++ = (q15_t) __QADD16(*pSrc++, offset);
#else
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of offset group
@} end of BasicOffset group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_offset_q31.c
* Description: Q31 vector offset
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,100 +29,100 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup offset
* @{
@addtogroup BasicOffset
@{
*/
/**
* @brief Adds a constant offset to a Q31 vector.
* @param[in] *pSrc points to the input vector
* @param[in] offset is the offset to be added
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
@brief Adds a constant offset to a Q31 vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
void arm_offset_q31(
q31_t * pSrc,
q31_t offset,
q31_t * pDst,
uint32_t blockSize)
const q31_t * pSrc,
q31_t offset,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t in1, in2, in3, in4;
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
in1 = *pSrc++;
in2 = *pSrc++;
in3 = *pSrc++;
in4 = *pSrc++;
*pDst++ = __QADD(in1, offset);
*pDst++ = __QADD(in2, offset);
*pDst++ = __QADD(in3, offset);
*pDst++ = __QADD(in4, offset);
/* Add offset and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pDst++ = __QADD(*pSrc++, offset);
#else
*pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
#endif
/* Decrement the loop counter */
#if defined (ARM_MATH_DSP)
*pDst++ = __QADD(*pSrc++, offset);
#else
*pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
#endif
#if defined (ARM_MATH_DSP)
*pDst++ = __QADD(*pSrc++, offset);
#else
*pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
#endif
#if defined (ARM_MATH_DSP)
*pDst++ = __QADD(*pSrc++, offset);
#else
*pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the result in the destination buffer. */
*pDst++ = __QADD(*pSrc++, offset);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the result in the destination buffer. */
*pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
/* Decrement the loop counter */
/* Add offset and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pDst++ = __QADD(*pSrc++, offset);
#else
*pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of offset group
@} end of BasicOffset group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_offset_q7.c
* Description: Q7 vector offset
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,95 +29,88 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup offset
* @{
@addtogroup BasicOffset
@{
*/
/**
* @brief Adds a constant offset to a Q7 vector.
* @param[in] *pSrc points to the input vector
* @param[in] offset is the offset to be added
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
@brief Adds a constant offset to a Q7 vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
*/
void arm_offset_q7(
q7_t * pSrc,
q7_t offset,
q7_t * pDst,
uint32_t blockSize)
const q7_t * pSrc,
q7_t offset,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t offset_packed; /* Offset packed to 32 bit */
/*loop Unrolling */
blkCnt = blockSize >> 2U;
/* Offset is packed to 32 bit in order to use SIMD32 for addition */
offset_packed = __PACKq7(offset, offset, offset, offset);
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination bufferfor 4 samples at a time. */
*__SIMD32(pDst)++ = __QADD8(*__SIMD32(pSrc)++, offset_packed);
/* Decrement the loop counter */
#if defined (ARM_MATH_DSP)
/* Add offset and store result in destination buffer (4 samples at a time). */
write_q7x4_ia (&pDst, __QADD8(read_q7x4_ia ((q7_t **) &pSrc), offset_packed));
#else
*pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
*pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
*pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
*pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the result in the destination buffer. */
*pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the result in the destination buffer. */
*pDst++ = (q7_t) __SSAT((q15_t) * pSrc++ + offset, 8);
/* Decrement the loop counter */
/* Add offset and store result in destination buffer. */
*pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of offset group
@} end of BasicOffset group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_scale_f32.c
* Description: Multiplies a floating-point vector by a scalar
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,129 +29,131 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @defgroup scale Vector Scale
*
* Multiply a vector by a scalar value. For floating-point data, the algorithm used is:
*
* <pre>
* pDst[n] = pSrc[n] * scale, 0 <= n < blockSize.
* </pre>
*
* In the fixed-point Q7, Q15, and Q31 functions, <code>scale</code> is represented by
* a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
* The shift allows the gain of the scaling operation to exceed 1.0.
* The algorithm used with fixed-point data is:
*
* <pre>
* pDst[n] = (pSrc[n] * scaleFract) << shift, 0 <= n < blockSize.
* </pre>
*
* The overall scale factor applied to the fixed-point data is
* <pre>
* scale = scaleFract * 2^shift.
* </pre>
*
* The functions support in-place computation allowing the source and destination
* pointers to reference the same memory buffer.
@defgroup BasicScale Vector Scale
Multiply a vector by a scalar value. For floating-point data, the algorithm used is:
<pre>
pDst[n] = pSrc[n] * scale, 0 <= n < blockSize.
</pre>
In the fixed-point Q7, Q15, and Q31 functions, <code>scale</code> is represented by
a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
The shift allows the gain of the scaling operation to exceed 1.0.
The algorithm used with fixed-point data is:
<pre>
pDst[n] = (pSrc[n] * scaleFract) << shift, 0 <= n < blockSize.
</pre>
The overall scale factor applied to the fixed-point data is
<pre>
scale = scaleFract * 2^shift.
</pre>
The functions support in-place computation allowing the source and destination
pointers to reference the same memory buffer.
*/
/**
* @addtogroup scale
* @{
@addtogroup BasicScale
@{
*/
/**
* @brief Multiplies a floating-point vector by a scalar.
* @param[in] *pSrc points to the input vector
* @param[in] scale scale factor to be applied
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
@brief Multiplies a floating-point vector by a scalar.
@param[in] pSrc points to the input vector
@param[in] scale scale factor to be applied
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_scale_f32(
float32_t * pSrc,
float32_t scale,
float32_t * pDst,
uint32_t blockSize)
const float32_t *pSrc,
float32_t scale,
float32_t *pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
#if defined (ARM_MATH_DSP)
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON_EXPERIMENTAL)
float32x4_t vec1;
float32x4_t res;
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t in1, in2, in3, in4; /* temporary variabels */
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/*loop Unrolling */
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale the input and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc);
res = vmulq_f32(vec1, vdupq_n_f32(scale));
vst1q_f32(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale the input and then store the results in the destination buffer. */
/* read input samples from source */
in1 = *pSrc;
in2 = *(pSrc + 1);
/* multiply with scaling factor */
in1 = in1 * scale;
/* Scale input and store result in destination buffer. */
*pDst++ = (*pSrc++) * scale;
/* read input sample from source */
in3 = *(pSrc + 2);
*pDst++ = (*pSrc++) * scale;
/* multiply with scaling factor */
in2 = in2 * scale;
*pDst++ = (*pSrc++) * scale;
/* read input sample from source */
in4 = *(pSrc + 3);
*pDst++ = (*pSrc++) * scale;
/* multiply with scaling factor */
in3 = in3 * scale;
in4 = in4 * scale;
/* store the result to destination */
*pDst = in1;
*(pDst + 1) = in2;
*(pDst + 2) = in3;
*(pDst + 3) = in4;
/* update pointers to process next samples */
pSrc += 4U;
pDst += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale the input and then store the result in the destination buffer. */
/* Scale input and store result in destination buffer. */
*pDst++ = (*pSrc++) * scale;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of scale group
@} end of BasicScale group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_scale_q15.c
* Description: Multiplies a Q15 vector by a scalar
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,66 +29,66 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup scale
* @{
@addtogroup BasicScale
@{
*/
/**
* @brief Multiplies a Q15 vector by a scalar.
* @param[in] *pSrc points to the input vector
* @param[in] scaleFract fractional portion of the scale value
* @param[in] shift number of bits to shift the result by
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
* These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
*/
@brief Multiplies a Q15 vector by a scalar.
@param[in] pSrc points to the input vector
@param[in] scaleFract fractional portion of the scale value
@param[in] shift number of bits to shift the result by
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
*/
void arm_scale_q15(
q15_t * pSrc,
q15_t scaleFract,
int8_t shift,
q15_t * pDst,
uint32_t blockSize)
const q15_t *pSrc,
q15_t scaleFract,
int8_t shift,
q15_t *pDst,
uint32_t blockSize)
{
int8_t kShift = 15 - shift; /* shift to apply after scaling */
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
int8_t kShift = 15 - shift; /* Shift to apply after scaling */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t inA1, inA2;
q31_t out1, out2, out3, out4; /* Temporary output variables */
q15_t in1, in2, in3, in4; /* Temporary input variables */
#endif
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
q15_t in1, in2, in3, in4;
q31_t inA1, inA2; /* Temporary variables */
q31_t out1, out2, out3, out4;
#if defined (ARM_MATH_LOOPUNROLL)
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* Reading 2 inputs from memory */
inA1 = *__SIMD32(pSrc)++;
inA2 = *__SIMD32(pSrc)++;
/* C = A * scale */
/* Scale the inputs and then store the 2 results in the destination buffer
#if defined (ARM_MATH_DSP)
/* read 2 times 2 samples at a time from source */
inA1 = read_q15x2_ia ((q15_t **) &pSrc);
inA2 = read_q15x2_ia ((q15_t **) &pSrc);
/* Scale inputs and store result in temporary variables
* in single cycle by packing the outputs */
out1 = (q31_t) ((q15_t) (inA1 >> 16) * scaleFract);
out2 = (q31_t) ((q15_t) inA1 * scaleFract);
out2 = (q31_t) ((q15_t) (inA1 ) * scaleFract);
out3 = (q31_t) ((q15_t) (inA2 >> 16) * scaleFract);
out4 = (q31_t) ((q15_t) inA2 * scaleFract);
out4 = (q31_t) ((q15_t) (inA2 ) * scaleFract);
/* apply shifting */
out1 = out1 >> kShift;
@ -102,49 +102,43 @@ void arm_scale_q15(
in3 = (q15_t) (__SSAT(out3, 16));
in4 = (q15_t) (__SSAT(out4, 16));
/* store the result to destination */
*__SIMD32(pDst)++ = __PKHBT(in2, in1, 16);
*__SIMD32(pDst)++ = __PKHBT(in4, in3, 16);
/* store result to destination */
write_q15x2_ia (&pDst, __PKHBT(in2, in1, 16));
write_q15x2_ia (&pDst, __PKHBT(in4, in3, 16));
#else
*pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
*pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
*pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
*pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
#endif
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale the input and then store the result in the destination buffer. */
*pDst++ = (q15_t) (__SSAT(((*pSrc++) * scaleFract) >> kShift, 16));
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale the input and then store the result in the destination buffer. */
*pDst++ = (q15_t) (__SSAT(((q31_t) * pSrc++ * scaleFract) >> kShift, 16));
/* Decrement the loop counter */
/* Scale input and store result in destination buffer. */
*pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of scale group
@} end of BasicScale group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_scale_q31.c
* Description: Multiplies a Q31 vector by a scalar
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,199 +29,163 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup scale
* @{
@addtogroup BasicScale
@{
*/
/**
* @brief Multiplies a Q31 vector by a scalar.
* @param[in] *pSrc points to the input vector
* @param[in] scaleFract fractional portion of the scale value
* @param[in] shift number of bits to shift the result by
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
* These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
@brief Multiplies a Q31 vector by a scalar.
@param[in] pSrc points to the input vector
@param[in] scaleFract fractional portion of the scale value
@param[in] shift number of bits to shift the result by
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
*/
void arm_scale_q31(
q31_t * pSrc,
q31_t scaleFract,
int8_t shift,
q31_t * pDst,
uint32_t blockSize)
const q31_t *pSrc,
q31_t scaleFract,
int8_t shift,
q31_t *pDst,
uint32_t blockSize)
{
int8_t kShift = shift + 1; /* Shift to apply after scaling */
int8_t sign = (kShift & 0x80);
uint32_t blkCnt; /* loop counter */
q31_t in, out;
uint32_t blkCnt; /* Loop counter */
q31_t in, out; /* Temporary variables */
int8_t kShift = shift + 1; /* Shift to apply after scaling */
int8_t sign = (kShift & 0x80);
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t in1, in2, in3, in4; /* temporary input variables */
q31_t out1, out2, out3, out4; /* temporary output variabels */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
if (sign == 0U)
{
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* read four inputs from source */
in1 = *pSrc;
in2 = *(pSrc + 1);
in3 = *(pSrc + 2);
in4 = *(pSrc + 3);
/* C = A * scale */
/* multiply input with scaler value */
in1 = ((q63_t) in1 * scaleFract) >> 32;
in2 = ((q63_t) in2 * scaleFract) >> 32;
in3 = ((q63_t) in3 * scaleFract) >> 32;
in4 = ((q63_t) in4 * scaleFract) >> 32;
/* Scale input and store result in destination buffer. */
in = *pSrc++; /* read input from source */
in = ((q63_t) in * scaleFract) >> 32; /* multiply input with scaler value */
out = in << kShift; /* apply shifting */
if (in != (out >> kShift)) /* saturate the result */
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out; /* Store result destination */
/* apply shifting */
out1 = in1 << kShift;
out2 = in2 << kShift;
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
/* saturate the results. */
if (in1 != (out1 >> kShift))
out1 = 0x7FFFFFFF ^ (in1 >> 31);
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
if (in2 != (out2 >> kShift))
out2 = 0x7FFFFFFF ^ (in2 >> 31);
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
out3 = in3 << kShift;
out4 = in4 << kShift;
*pDst = out1;
*(pDst + 1) = out2;
if (in3 != (out3 >> kShift))
out3 = 0x7FFFFFFF ^ (in3 >> 31);
if (in4 != (out4 >> kShift))
out4 = 0x7FFFFFFF ^ (in4 >> 31);
/* Store result destination */
*(pDst + 2) = out3;
*(pDst + 3) = out4;
/* Update pointers to process next sampels */
pSrc += 4U;
pDst += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
else
{
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* read four inputs from source */
in1 = *pSrc;
in2 = *(pSrc + 1);
in3 = *(pSrc + 2);
in4 = *(pSrc + 3);
/* C = A * scale */
/* multiply input with scaler value */
in1 = ((q63_t) in1 * scaleFract) >> 32;
in2 = ((q63_t) in2 * scaleFract) >> 32;
in3 = ((q63_t) in3 * scaleFract) >> 32;
in4 = ((q63_t) in4 * scaleFract) >> 32;
/* Scale input and store result in destination buffer. */
in = *pSrc++; /* read four inputs from source */
in = ((q63_t) in * scaleFract) >> 32; /* multiply input with scaler value */
out = in >> -kShift; /* apply shifting */
*pDst++ = out; /* Store result destination */
/* apply shifting */
out1 = in1 >> -kShift;
out2 = in2 >> -kShift;
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
out = in >> -kShift;
*pDst++ = out;
out3 = in3 >> -kShift;
out4 = in4 >> -kShift;
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
out = in >> -kShift;
*pDst++ = out;
/* Store result destination */
*pDst = out1;
*(pDst + 1) = out2;
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
out = in >> -kShift;
*pDst++ = out;
*(pDst + 2) = out3;
*(pDst + 3) = out4;
/* Update pointers to process next sampels */
pSrc += 4U;
pDst += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
if (sign == 0)
if (sign == 0U)
{
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale the input and then store the result in the destination buffer. */
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
while (blkCnt > 0U)
{
/* C = A * scale */
out = in << kShift;
/* Scale input and store result in destination buffer. */
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
/* Decrement the loop counter */
blkCnt--;
}
/* Decrement loop counter */
blkCnt--;
}
}
else
{
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale the input and then store the result in the destination buffer. */
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
while (blkCnt > 0U)
{
/* C = A * scale */
out = in >> -kShift;
*pDst++ = out;
/* Decrement the loop counter */
blkCnt--;
}
/* Scale input and store result in destination buffer. */
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
out = in >> -kShift;
*pDst++ = out;
/* Decrement loop counter */
blkCnt--;
}
}
}
/**
* @} end of scale group
@} end of BasicScale group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_scale_q7.c
* Description: Multiplies a Q7 vector by a scalar
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,109 +29,101 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup scale
* @{
@addtogroup BasicScale
@{
*/
/**
* @brief Multiplies a Q7 vector by a scalar.
* @param[in] *pSrc points to the input vector
* @param[in] scaleFract fractional portion of the scale value
* @param[in] shift number of bits to shift the result by
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.7 format.
* These are multiplied to yield a 2.14 intermediate result and this is shifted with saturation to 1.7 format.
@brief Multiplies a Q7 vector by a scalar.
@param[in] pSrc points to the input vector
@param[in] scaleFract fractional portion of the scale value
@param[in] shift number of bits to shift the result by
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.7 format.
These are multiplied to yield a 2.14 intermediate result and this is shifted with saturation to 1.7 format.
*/
void arm_scale_q7(
q7_t * pSrc,
q7_t scaleFract,
int8_t shift,
q7_t * pDst,
uint32_t blockSize)
const q7_t * pSrc,
q7_t scaleFract,
int8_t shift,
q7_t * pDst,
uint32_t blockSize)
{
int8_t kShift = 7 - shift; /* shift to apply after scaling */
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
int8_t kShift = 7 - shift; /* Shift to apply after scaling */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q7_t in1, in2, in3, in4; /* Temporary input variables */
q7_t out1, out2, out3, out4; /* Temporary output variables */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
q7_t in1, in2, in3, in4, out1, out2, out3, out4; /* Temporary variables to store input & output */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A * scale */
#if defined (ARM_MATH_DSP)
/* Reading 4 inputs from memory */
in1 = *pSrc++;
in2 = *pSrc++;
in3 = *pSrc++;
in4 = *pSrc++;
/* C = A * scale */
/* Scale the inputs and then store the results in the temporary variables. */
/* Scale inputs and store result in the temporary variable. */
out1 = (q7_t) (__SSAT(((in1) * scaleFract) >> kShift, 8));
out2 = (q7_t) (__SSAT(((in2) * scaleFract) >> kShift, 8));
out3 = (q7_t) (__SSAT(((in3) * scaleFract) >> kShift, 8));
out4 = (q7_t) (__SSAT(((in4) * scaleFract) >> kShift, 8));
/* Packing the individual outputs into 32bit and storing in
* destination buffer in single write */
*__SIMD32(pDst)++ = __PACKq7(out1, out2, out3, out4);
/* Pack and store result in destination buffer (in single write) */
write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4));
#else
*pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
*pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
*pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
*pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
#endif
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale the input and then store the result in the destination buffer. */
*pDst++ = (q7_t) (__SSAT(((*pSrc++) * scaleFract) >> kShift, 8));
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale the input and then store the result in the destination buffer. */
*pDst++ = (q7_t) (__SSAT((((q15_t) * pSrc++ * scaleFract) >> kShift), 8));
/* Decrement the loop counter */
/* Scale input and store result in destination buffer. */
*pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of scale group
@} end of BasicScale group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_shift_q15.c
* Description: Shifts the elements of a Q15 vector by a specified number of bits
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,208 +29,173 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup shift
* @{
@addtogroup BasicShift
@{
*/
/**
* @brief Shifts the elements of a Q15 vector a specified number of bits.
* @param[in] *pSrc points to the input vector
* @param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right.
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
@brief Shifts the elements of a Q15 vector a specified number of bits
@param[in] pSrc points to the input vector
@param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right.
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
void arm_shift_q15(
q15_t * pSrc,
int8_t shiftBits,
q15_t * pDst,
uint32_t blockSize)
const q15_t * pSrc,
int8_t shiftBits,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint8_t sign; /* Sign of shiftBits */
uint32_t blkCnt; /* Loop counter */
uint8_t sign = (shiftBits & 0x80); /* Sign of shiftBits */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q15_t in1, in2; /* Temporary input variables */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
q15_t in1, in2; /* Temporary variables */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* Getting the sign of shiftBits */
sign = (shiftBits & 0x80);
/* If the shift value is positive then do right shift else left shift */
if (sign == 0U)
{
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* Read 2 inputs */
/* C = A << shiftBits */
#if defined (ARM_MATH_DSP)
/* read 2 samples from source */
in1 = *pSrc++;
in2 = *pSrc++;
/* C = A << shiftBits */
/* Shift the inputs and then store the results in the destination buffer. */
#ifndef ARM_MATH_BIG_ENDIAN
*__SIMD32(pDst)++ = __PKHBT(__SSAT((in1 << shiftBits), 16),
__SSAT((in2 << shiftBits), 16), 16);
#ifndef ARM_MATH_BIG_ENDIAN
write_q15x2_ia (&pDst, __PKHBT(__SSAT((in1 << shiftBits), 16),
__SSAT((in2 << shiftBits), 16), 16));
#else
write_q15x2_ia (&pDst, __PKHBT(__SSAT((in2 << shiftBits), 16),
__SSAT((in1 << shiftBits), 16), 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
*__SIMD32(pDst)++ = __PKHBT(__SSAT((in2 << shiftBits), 16),
__SSAT((in1 << shiftBits), 16), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* read 2 samples from source */
in1 = *pSrc++;
in2 = *pSrc++;
#ifndef ARM_MATH_BIG_ENDIAN
*__SIMD32(pDst)++ = __PKHBT(__SSAT((in1 << shiftBits), 16),
__SSAT((in2 << shiftBits), 16), 16);
#ifndef ARM_MATH_BIG_ENDIAN
write_q15x2_ia (&pDst, __PKHBT(__SSAT((in1 << shiftBits), 16),
__SSAT((in2 << shiftBits), 16), 16));
#else
write_q15x2_ia (&pDst, __PKHBT(__SSAT((in2 << shiftBits), 16),
__SSAT((in1 << shiftBits), 16), 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
#else
*pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
*pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
*pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
*pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
#endif
*__SIMD32(pDst)++ = __PKHBT(__SSAT((in2 << shiftBits), 16),
__SSAT((in1 << shiftBits), 16), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A << shiftBits */
/* Shift and then store the results in the destination buffer. */
*pDst++ = __SSAT((*pSrc++ << shiftBits), 16);
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
else
{
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* Read 2 inputs */
in1 = *pSrc++;
in2 = *pSrc++;
/* C = A >> shiftBits */
/* Shift the inputs and then store the results in the destination buffer. */
#ifndef ARM_MATH_BIG_ENDIAN
*__SIMD32(pDst)++ = __PKHBT((in1 >> -shiftBits),
(in2 >> -shiftBits), 16);
#else
*__SIMD32(pDst)++ = __PKHBT((in2 >> -shiftBits),
(in1 >> -shiftBits), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
in1 = *pSrc++;
in2 = *pSrc++;
#ifndef ARM_MATH_BIG_ENDIAN
*__SIMD32(pDst)++ = __PKHBT((in1 >> -shiftBits),
(in2 >> -shiftBits), 16);
#else
*__SIMD32(pDst)++ = __PKHBT((in2 >> -shiftBits),
(in1 >> -shiftBits), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A >> shiftBits */
#if defined (ARM_MATH_DSP)
/* read 2 samples from source */
in1 = *pSrc++;
in2 = *pSrc++;
/* Shift the inputs and then store the results in the destination buffer. */
#ifndef ARM_MATH_BIG_ENDIAN
write_q15x2_ia (&pDst, __PKHBT((in1 >> -shiftBits),
(in2 >> -shiftBits), 16));
#else
write_q15x2_ia (&pDst, __PKHBT((in2 >> -shiftBits),
(in1 >> -shiftBits), 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* read 2 samples from source */
in1 = *pSrc++;
in2 = *pSrc++;
#ifndef ARM_MATH_BIG_ENDIAN
write_q15x2_ia (&pDst, __PKHBT((in1 >> -shiftBits),
(in2 >> -shiftBits), 16));
#else
write_q15x2_ia (&pDst, __PKHBT((in2 >> -shiftBits),
(in1 >> -shiftBits), 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
#else
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
#endif
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
/* Getting the sign of shiftBits */
sign = (shiftBits & 0x80);
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
/* If the shift value is positive then do right shift else left shift */
if (sign == 0U)
{
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A << shiftBits */
/* Shift and then store the results in the destination buffer. */
*pDst++ = __SSAT(((q31_t) * pSrc++ << shiftBits), 16);
/* Decrement the loop counter */
/* Shift input and store result in destination buffer. */
*pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
/* Decrement loop counter */
blkCnt--;
}
}
else
{
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A >> shiftBits */
/* Shift the inputs and then store the results in the destination buffer. */
/* Shift input and store result in destination buffer. */
*pDst++ = (*pSrc++ >> -shiftBits);
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of shift group
@} end of BasicShift group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_shift_q31.c
* Description: Shifts the elements of a Q31 vector by a specified number of bits
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,163 +29,153 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @defgroup shift Vector Shift
*
* Shifts the elements of a fixed-point vector by a specified number of bits.
* There are separate functions for Q7, Q15, and Q31 data types.
* The underlying algorithm used is:
*
* <pre>
* pDst[n] = pSrc[n] << shift, 0 <= n < blockSize.
* </pre>
*
* If <code>shift</code> is positive then the elements of the vector are shifted to the left.
* If <code>shift</code> is negative then the elements of the vector are shifted to the right.
*
* The functions support in-place computation allowing the source and destination
* pointers to reference the same memory buffer.
@defgroup BasicShift Vector Shift
Shifts the elements of a fixed-point vector by a specified number of bits.
There are separate functions for Q7, Q15, and Q31 data types.
The underlying algorithm used is:
<pre>
pDst[n] = pSrc[n] << shift, 0 <= n < blockSize.
</pre>
If <code>shift</code> is positive then the elements of the vector are shifted to the left.
If <code>shift</code> is negative then the elements of the vector are shifted to the right.
The functions support in-place computation allowing the source and destination
pointers to reference the same memory buffer.
*/
/**
* @addtogroup shift
* @{
@addtogroup BasicShift
@{
*/
/**
* @brief Shifts the elements of a Q31 vector a specified number of bits.
* @param[in] *pSrc points to the input vector
* @param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right.
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
@brief Shifts the elements of a Q31 vector a specified number of bits.
@param[in] pSrc points to the input vector
@param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right.
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in the vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
void arm_shift_q31(
q31_t * pSrc,
int8_t shiftBits,
q31_t * pDst,
uint32_t blockSize)
const q31_t * pSrc,
int8_t shiftBits,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint8_t sign = (shiftBits & 0x80); /* Sign of shiftBits */
uint32_t blkCnt; /* Loop counter */
uint8_t sign = (shiftBits & 0x80); /* Sign of shiftBits */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
q31_t in1, in2, in3, in4; /* Temporary input variables */
q31_t out1, out2, out3, out4; /* Temporary output variables */
q31_t in, out; /* Temporary variables */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* If the shift value is positive then do right shift else left shift */
if (sign == 0U)
{
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A << shiftBits */
/* Shift the input and then store the results in the destination buffer. */
in1 = *pSrc;
in2 = *(pSrc + 1);
out1 = in1 << shiftBits;
in3 = *(pSrc + 2);
out2 = in2 << shiftBits;
in4 = *(pSrc + 3);
if (in1 != (out1 >> shiftBits))
out1 = 0x7FFFFFFF ^ (in1 >> 31);
/* C = A << shiftBits */
if (in2 != (out2 >> shiftBits))
out2 = 0x7FFFFFFF ^ (in2 >> 31);
/* Shift input and store result in destination buffer. */
in = *pSrc++;
out = in << shiftBits;
if (in != (out >> shiftBits))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
*pDst = out1;
out3 = in3 << shiftBits;
*(pDst + 1) = out2;
out4 = in4 << shiftBits;
in = *pSrc++;
out = in << shiftBits;
if (in != (out >> shiftBits))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
if (in3 != (out3 >> shiftBits))
out3 = 0x7FFFFFFF ^ (in3 >> 31);
in = *pSrc++;
out = in << shiftBits;
if (in != (out >> shiftBits))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
if (in4 != (out4 >> shiftBits))
out4 = 0x7FFFFFFF ^ (in4 >> 31);
in = *pSrc++;
out = in << shiftBits;
if (in != (out >> shiftBits))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
*(pDst + 2) = out3;
*(pDst + 3) = out4;
/* Update destination pointer to process next sampels */
pSrc += 4U;
pDst += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
else
{
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A >> shiftBits */
/* Shift the input and then store the results in the destination buffer. */
in1 = *pSrc;
in2 = *(pSrc + 1);
in3 = *(pSrc + 2);
in4 = *(pSrc + 3);
/* C = A >> shiftBits */
*pDst = (in1 >> -shiftBits);
*(pDst + 1) = (in2 >> -shiftBits);
*(pDst + 2) = (in3 >> -shiftBits);
*(pDst + 3) = (in4 >> -shiftBits);
pSrc += 4U;
pDst += 4U;
/* Shift input and store results in destination buffer. */
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
/* Decrement loop counter */
blkCnt--;
}
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
/* If the shift value is positive then do right shift else left shift */
if (sign == 0U)
{
/* C = A (>> or <<) shiftBits */
/* Shift the input and then store the result in the destination buffer. */
*pDst++ = (sign == 0U) ? clip_q63_to_q31((q63_t) * pSrc++ << shiftBits) :
(*pSrc++ >> -shiftBits);
while (blkCnt > 0U)
{
/* C = A << shiftBits */
/* Decrement the loop counter */
blkCnt--;
/* Shift input and store result in destination buffer. */
*pDst++ = clip_q63_to_q31((q63_t) *pSrc++ << shiftBits);
/* Decrement loop counter */
blkCnt--;
}
}
else
{
while (blkCnt > 0U)
{
/* C = A >> shiftBits */
/* Shift input and store result in destination buffer. */
*pDst++ = (*pSrc++ >> -shiftBits);
/* Decrement loop counter */
blkCnt--;
}
}
}
/**
* @} end of shift group
@} end of BasicShift group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_shift_q7.c
* Description: Processing function for the Q7 Shifting
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,180 +29,147 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup shift
* @{
@addtogroup BasicShift
@{
*/
/**
* @brief Shifts the elements of a Q7 vector a specified number of bits.
* @param[in] *pSrc points to the input vector
* @param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right.
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* \par Conditions for optimum performance
* Input and output buffers should be aligned by 32-bit
*
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q7 range [0x8 0x7F] will be saturated.
@brief Shifts the elements of a Q7 vector a specified number of bits
@param[in] pSrc points to the input vector
@param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right.
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par onditions for optimum performance
Input and output buffers should be aligned by 32-bit
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
*/
void arm_shift_q7(
q7_t * pSrc,
int8_t shiftBits,
q7_t * pDst,
uint32_t blockSize)
const q7_t * pSrc,
int8_t shiftBits,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint8_t sign; /* Sign of shiftBits */
uint32_t blkCnt; /* Loop counter */
uint8_t sign = (shiftBits & 0x80); /* Sign of shiftBits */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q7_t in1, in2, in3, in4; /* Temporary input variables */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
q7_t in1; /* Input value1 */
q7_t in2; /* Input value2 */
q7_t in3; /* Input value3 */
q7_t in4; /* Input value4 */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* Getting the sign of shiftBits */
sign = (shiftBits & 0x80);
/* If the shift value is positive then do right shift else left shift */
if (sign == 0U)
{
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A << shiftBits */
#if defined (ARM_MATH_DSP)
/* Read 4 inputs */
in1 = *pSrc;
in2 = *(pSrc + 1);
in3 = *(pSrc + 2);
in4 = *(pSrc + 3);
in1 = *pSrc++;
in2 = *pSrc++;
in3 = *pSrc++;
in4 = *pSrc++;
/* Store the Shifted result in the destination buffer in single cycle by packing the outputs */
*__SIMD32(pDst)++ = __PACKq7(__SSAT((in1 << shiftBits), 8),
__SSAT((in2 << shiftBits), 8),
__SSAT((in3 << shiftBits), 8),
__SSAT((in4 << shiftBits), 8));
/* Update source pointer to process next sampels */
pSrc += 4U;
/* Pack and store result in destination buffer (in single write) */
write_q7x4_ia (&pDst, __PACKq7(__SSAT((in1 << shiftBits), 8),
__SSAT((in2 << shiftBits), 8),
__SSAT((in3 << shiftBits), 8),
__SSAT((in4 << shiftBits), 8) ));
#else
*pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
*pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
*pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
*pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
#endif
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A << shiftBits */
/* Shift the input and then store the result in the destination buffer. */
*pDst++ = (q7_t) __SSAT((*pSrc++ << shiftBits), 8);
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
else
{
shiftBits = -shiftBits;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A >> shiftBits */
#if defined (ARM_MATH_DSP)
/* Read 4 inputs */
in1 = *pSrc;
in2 = *(pSrc + 1);
in3 = *(pSrc + 2);
in4 = *(pSrc + 3);
/* Store the Shifted result in the destination buffer in single cycle by packing the outputs */
*__SIMD32(pDst)++ = __PACKq7((in1 >> shiftBits), (in2 >> shiftBits),
(in3 >> shiftBits), (in4 >> shiftBits));
pSrc += 4U;
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A >> shiftBits */
/* Shift the input and then store the result in the destination buffer. */
in1 = *pSrc++;
*pDst++ = (in1 >> shiftBits);
in2 = *pSrc++;
in3 = *pSrc++;
in4 = *pSrc++;
/* Decrement the loop counter */
/* Pack and store result in destination buffer (in single write) */
write_q7x4_ia (&pDst, __PACKq7((in1 >> -shiftBits),
(in2 >> -shiftBits),
(in3 >> -shiftBits),
(in4 >> -shiftBits) ));
#else
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
/* Getting the sign of shiftBits */
sign = (shiftBits & 0x80);
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
/* If the shift value is positive then do right shift else left shift */
if (sign == 0U)
{
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A << shiftBits */
/* Shift the input and then store the result in the destination buffer. */
*pDst++ = (q7_t) __SSAT(((q15_t) * pSrc++ << shiftBits), 8);
/* Decrement the loop counter */
/* Shift input and store result in destination buffer. */
*pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
/* Decrement loop counter */
blkCnt--;
}
}
else
{
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A >> shiftBits */
/* Shift the input and then store the result in the destination buffer. */
/* Shift input and store result in destination buffer. */
*pDst++ = (*pSrc++ >> -shiftBits);
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of shift group
@} end of BasicShift group
*/

View file

@ -1,15 +1,15 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_sub_f32.c
* Description: Floating-point vector subtraction.
* Description: Floating-point vector subtraction
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,110 +29,120 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @defgroup BasicSub Vector Subtraction
*
* Element-by-element subtraction of two vectors.
*
* <pre>
* pDst[n] = pSrcA[n] - pSrcB[n], 0 <= n < blockSize.
* </pre>
*
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
@defgroup BasicSub Vector Subtraction
Element-by-element subtraction of two vectors.
<pre>
pDst[n] = pSrcA[n] - pSrcB[n], 0 <= n < blockSize.
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup BasicSub
* @{
@addtogroup BasicSub
@{
*/
/**
* @brief Floating-point vector subtraction.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
@brief Floating-point vector subtraction.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_sub_f32(
float32_t * pSrcA,
float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
#if defined(ARM_MATH_NEON)
float32x4_t vec1;
float32x4_t vec2;
float32x4_t res;
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t inA1, inA2, inA3, inA4; /* temporary variables */
float32_t inB1, inB2, inB3, inB4; /* temporary variables */
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/*loop Unrolling */
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
res = vsubq_f32(vec1, vec2);
vst1q_f32(pDst, res);
/* Increment pointers */
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and then store the results in the destination buffer. */
/* Read 4 input samples from sourceA and sourceB */
inA1 = *pSrcA;
inB1 = *pSrcB;
inA2 = *(pSrcA + 1);
inB2 = *(pSrcB + 1);
inA3 = *(pSrcA + 2);
inB3 = *(pSrcB + 2);
inA4 = *(pSrcA + 3);
inB4 = *(pSrcB + 3);
/* dst = srcA - srcB */
/* subtract and store the result */
*pDst = inA1 - inB1;
*(pDst + 1) = inA2 - inB2;
*(pDst + 2) = inA3 - inB3;
*(pDst + 3) = inA4 - inB4;
/* Subtract and store result in destination buffer. */
*pDst++ = (*pSrcA++) - (*pSrcB++);
*pDst++ = (*pSrcA++) - (*pSrcB++);
/* Update pointers to process next sampels */
pSrcA += 4U;
pSrcB += 4U;
pDst += 4U;
*pDst++ = (*pSrcA++) - (*pSrcB++);
/* Decrement the loop counter */
*pDst++ = (*pSrcA++) - (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and then store the results in the destination buffer. */
/* Subtract and store result in destination buffer. */
*pDst++ = (*pSrcA++) - (*pSrcB++);
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of BasicSub group
@} end of BasicSub group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_sub_q15.c
* Description: Q15 vector subtraction
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,100 +29,98 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicSub
* @{
@addtogroup BasicSub
@{
*/
/**
* @brief Q15 vector subtraction.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
@brief Q15 vector subtraction.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
void arm_sub_q15(
q15_t * pSrcA,
q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t inA1, inA2;
q31_t inB1, inB2;
#endif
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and then store the results in the destination buffer two samples at a time. */
inA1 = *__SIMD32(pSrcA)++;
inA2 = *__SIMD32(pSrcA)++;
inB1 = *__SIMD32(pSrcB)++;
inB2 = *__SIMD32(pSrcB)++;
*__SIMD32(pDst)++ = __QSUB16(inA1, inB1);
*__SIMD32(pDst)++ = __QSUB16(inA2, inB2);
#if defined (ARM_MATH_DSP)
/* read 2 times 2 samples at a time from sourceA */
inA1 = read_q15x2_ia ((q15_t **) &pSrcA);
inA2 = read_q15x2_ia ((q15_t **) &pSrcA);
/* read 2 times 2 samples at a time from sourceB */
inB1 = read_q15x2_ia ((q15_t **) &pSrcB);
inB2 = read_q15x2_ia ((q15_t **) &pSrcB);
/* Decrement the loop counter */
/* Subtract and store 2 times 2 samples at a time */
write_q15x2_ia (&pDst, __QSUB16(inA1, inB1));
write_q15x2_ia (&pDst, __QSUB16(inA2, inB2));
#else
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and then store the result in the destination buffer. */
*pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and then store the result in the destination buffer. */
*pDst++ = (q15_t) __SSAT(((q31_t) * pSrcA++ - *pSrcB++), 16);
/* Decrement the loop counter */
/* Subtract and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
#else
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BasicSub group
@} end of BasicSub group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_sub_q31.c
* Description: Q31 vector subtraction
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,106 +29,80 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicSub
* @{
@addtogroup BasicSub
@{
*/
/**
* @brief Q31 vector subtraction.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
@brief Q31 vector subtraction.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
void arm_sub_q31(
q31_t * pSrcA,
q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t inA1, inA2, inA3, inA4;
q31_t inB1, inB2, inB3, inB4;
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and then store the results in the destination buffer. */
inA1 = *pSrcA++;
inA2 = *pSrcA++;
inB1 = *pSrcB++;
inB2 = *pSrcB++;
inA3 = *pSrcA++;
inA4 = *pSrcA++;
inB3 = *pSrcB++;
inB4 = *pSrcB++;
*pDst++ = __QSUB(inA1, inB1);
*pDst++ = __QSUB(inA2, inB2);
*pDst++ = __QSUB(inA3, inB3);
*pDst++ = __QSUB(inA4, inB4);
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and then store the result in the destination buffer. */
/* Subtract and store result in destination buffer. */
*pDst++ = __QSUB(*pSrcA++, *pSrcB++);
/* Decrement the loop counter */
*pDst++ = __QSUB(*pSrcA++, *pSrcB++);
*pDst++ = __QSUB(*pSrcA++, *pSrcB++);
*pDst++ = __QSUB(*pSrcA++, *pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
#else
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
/* Run the below code for Cortex-M0 */
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and then store the result in the destination buffer. */
*pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrcA++ - *pSrcB++);
/* Decrement the loop counter */
/* Subtract and store result in destination buffer. */
*pDst++ = __QSUB(*pSrcA++, *pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BasicSub group
@} end of BasicSub group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_sub_q7.c
* Description: Q7 vector subtraction
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,91 +29,81 @@
#include "arm_math.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicSub
* @{
@addtogroup BasicSub
@{
*/
/**
* @brief Q7 vector subtraction.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
@brief Q7 vector subtraction.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
*/
void arm_sub_q7(
q7_t * pSrcA,
q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
const q7_t * pSrcA,
const q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and then store the results in the destination buffer 4 samples at a time. */
*__SIMD32(pDst)++ = __QSUB8(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
/* Decrement the loop counter */
#if defined (ARM_MATH_DSP)
/* Subtract and store result in destination buffer (4 samples at a time). */
write_q7x4_ia (&pDst, __QSUB8(read_q7x4_ia ((q7_t **) &pSrcA), read_q7x4_ia ((q7_t **) &pSrcB)));
#else
*pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
*pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
*pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
*pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and then store the result in the destination buffer. */
*pDst++ = __SSAT(*pSrcA++ - *pSrcB++, 8);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and then store the result in the destination buffer. */
*pDst++ = (q7_t) __SSAT((q15_t) * pSrcA++ - *pSrcB++, 8);
/* Decrement the loop counter */
/* Subtract and store result in destination buffer. */
*pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BasicSub group
@} end of BasicSub group
*/

223
DSP/Source/CMakeLists.txt Normal file
View file

@ -0,0 +1,223 @@
cmake_minimum_required (VERSION 3.6)
project(CMSISDSP)
# Needed to find the config modules
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/..)
# Select which parts of the CMSIS-DSP must be compiled.
# There are some dependencies between the parts but they are not tracked
# by this cmake. So, enabling some functions may require to enable some
# other ones.
option(BASICMATH "Basic Math Functions" ON)
option(COMPLEXMATH "Complex Math Functions" ON)
option(CONTROLLER "Controller Functions" ON)
option(FASTMATH "Fast Math Functions" ON)
option(FILTERING "Filtering Functions" ON)
option(MATRIX "Matrix Functions" ON)
option(STATISTICS "Statistics Functions" ON)
option(SUPPORT "Support Functions" ON)
option(TRANSFORM "Transform Functions" ON)
# When OFF it is the default behavior : all tables are included.
option(CONFIGTABLE "Configuration of table allowed" OFF)
# When CONFIGTABLE is ON, select if all interpolation tables must be included
option(ALLFAST "All interpolation tables included" OFF)
# When CONFIGTABLE is ON, select if all FFT tables must be included
option(ALLFFT "All fft tables included" OFF)
# Features which require inclusion of a data table.
# Since some tables may be big, the corresponding feature can be
# disabled.
# Those options are taken into account only when CONFIGTABLE is ON
option(ARM_COS_F32 "cos f32" OFF)
option(ARM_COS_Q31 "cos q31" OFF)
option(ARM_COS_Q15 "cos q15" OFF)
option(ARM_SIN_F32 "sin f32" OFF)
option(ARM_SIN_Q31 "sin q31" OFF)
option(ARM_SIN_Q15 "sin q15" OFF)
option(ARM_SIN_COS_F32 "sin cos f32" OFF)
option(ARM_SIN_COS_Q31 "sin cos q31" OFF)
option(ARM_LMS_NORM_Q31 "lms norm q31" OFF)
option(ARM_LMS_NORM_Q15 "lms norm q15" OFF)
option(CFFT_F32_16 "cfft f32 16" OFF)
option(CFFT_F32_32 "cfft f32 32" OFF)
option(CFFT_F32_64 "cfft f32 64" OFF)
option(CFFT_F32_128 "cfft f32 128" OFF)
option(CFFT_F32_256 "cfft f32 256" OFF)
option(CFFT_F32_512 "cfft f32 512" OFF)
option(CFFT_F32_1024 "cfft f32 1024" OFF)
option(CFFT_F32_2048 "cfft f32 2048" OFF)
option(CFFT_F32_4096 "cfft f32 4096" OFF)
option(CFFT_Q31_16 "cfft q31 16" OFF)
option(CFFT_Q31_32 "cfft q31 32" OFF)
option(CFFT_Q31_64 "cfft q31 64" OFF)
option(CFFT_Q31_128 "cfft q31 128" OFF)
option(CFFT_Q31_256 "cfft q31 256" OFF)
option(CFFT_Q31_512 "cfft q31 512" OFF)
option(CFFT_Q31_1024 "cfft q31 1024" OFF)
option(CFFT_Q31_2048 "cfft q31 2048" OFF)
option(CFFT_Q31_4096 "cfft q31 4096" OFF)
option(CFFT_Q15_16 "cfft q15 16" OFF)
option(CFFT_Q15_32 "cfft q15 32" OFF)
option(CFFT_Q15_64 "cfft q15 64" OFF)
option(CFFT_Q15_128 "cfft q15 128" OFF)
option(CFFT_Q15_256 "cfft q15 256" OFF)
option(CFFT_Q15_512 "cfft q15 512" OFF)
option(CFFT_Q15_1024 "cfft q15 1024" OFF)
option(CFFT_Q15_2048 "cfft q15 2048" OFF)
option(CFFT_Q15_4096 "cfft q15 4096" OFF)
option(RFFT_FAST_F32_32 "rfft fast f32 32" OFF)
option(RFFT_FAST_F32_64 "rfft fast f32 64" OFF)
option(RFFT_FAST_F32_128 "rfft fast f32 128" OFF)
option(RFFT_FAST_F32_256 "rfft fast f32 256" OFF)
option(RFFT_FAST_F32_512 "rfft fast f32 512" OFF)
option(RFFT_FAST_F32_1024 "rfft fast f32 1024" OFF)
option(RFFT_FAST_F32_2048 "rfft fast f32 2048" OFF)
option(RFFT_FAST_F32_4096 "rfft fast f32 4096" OFF)
option(RFFT_F32_128 "rfft f32 128" OFF)
option(RFFT_F32_512 "rfft f32 512" OFF)
option(RFFT_F32_2048 "rfft f32 2048" OFF)
option(RFFT_F32_8192 "rfft f32 8192" OFF)
option(RFFT_Q31_32 "rfft q31 32" OFF)
option(RFFT_Q31_64 "rfft q31 64" OFF)
option(RFFT_Q31_128 "rfft q31 128" OFF)
option(RFFT_Q31_256 "rfft q31 256" OFF)
option(RFFT_Q31_512 "rfft q31 512" OFF)
option(RFFT_Q31_1024 "rfft q31 1024" OFF)
option(RFFT_Q31_2048 "rfft q31 2048" OFF)
option(RFFT_Q31_4096 "rfft q31 4096" OFF)
option(RFFT_Q31_8192 "rfft q31 8192" OFF)
option(RFFT_Q15_32 "rfft q15 32" OFF)
option(RFFT_Q15_64 "rfft q15 64" OFF)
option(RFFT_Q15_128 "rfft q15 128" OFF)
option(RFFT_Q15_256 "rfft q15 256" OFF)
option(RFFT_Q15_512 "rfft q15 512" OFF)
option(RFFT_Q15_1024 "rfft q15 1024" OFF)
option(RFFT_Q15_2048 "rfft q15 2048" OFF)
option(RFFT_Q15_4096 "rfft q15 4096" OFF)
option(RFFT_Q15_8192 "rfft q15 8192" OFF)
option(DCT4_F32_128 "dct4 f32 128" OFF)
option(DCT4_F32_512 "dct4 f32 512" OFF)
option(DCT4_F32_2048 "dct4 f32 2048" OFF)
option(DCT4_F32_8192 "dct4 f32 8192" OFF)
option(DCT4_Q31_128 "dct4 q31 128" OFF)
option(DCT4_Q31_512 "dct4 q31 512" OFF)
option(DCT4_Q31_2048 "dct4 q31 2048" OFF)
option(DCT4_Q31_8192 "dct4 q31 8192" OFF)
option(DCT4_Q15_128 "dct4 q15 128" OFF)
option(DCT4_Q15_512 "dct4 q15 512" OFF)
option(DCT4_Q15_2048 "dct4 q15 2048" OFF)
option(DCT4_Q15_8192 "dct4 q15 8192" OFF)
###########################
#
# CMSIS DSP
#
###########################
# DSP Sources
SET(DSP ".")
add_library(CMSISDSP INTERFACE)
include(config)
if (BASICMATH)
add_subdirectory(BasicMathFunctions)
target_link_libraries(CMSISDSP INTERFACE CMSISDSPBasicMath)
endif()
if (COMPLEXMATH)
add_subdirectory(ComplexMathFunctions)
target_link_libraries(CMSISDSP INTERFACE CMSISDSPComplexMath)
endif()
if (CONTROLLER)
add_subdirectory(ControllerFunctions)
# Fast tables inclusion is allowed
if (CONFIGTABLE)
target_compile_definitions(CMSISDSPController PUBLIC ARM_FAST_ALLOW_TABLES)
endif()
target_link_libraries(CMSISDSP INTERFACE CMSISDSPController)
endif()
if (FASTMATH)
add_subdirectory(FastMathFunctions)
# Fast tables inclusion is allowed
if (CONFIGTABLE)
target_compile_definitions(CMSISDSPFastMath PUBLIC ARM_FAST_ALLOW_TABLES)
endif()
target_link_libraries(CMSISDSP INTERFACE CMSISDSPFastMath)
endif()
if (FILTERING)
add_subdirectory(FilteringFunctions)
# Fast tables inclusion is allowed
if (CONFIGTABLE)
target_compile_definitions(CMSISDSPFiltering PUBLIC ARM_FAST_ALLOW_TABLES)
endif()
target_link_libraries(CMSISDSP INTERFACE CMSISDSPFiltering)
endif()
if (MATRIX)
add_subdirectory(MatrixFunctions)
target_link_libraries(CMSISDSP INTERFACE CMSISDSPMatrix)
endif()
if (STATISTICS)
add_subdirectory(StatisticsFunctions)
target_link_libraries(CMSISDSP INTERFACE CMSISDSPStatistics)
endif()
if (SUPPORT)
add_subdirectory(SupportFunctions)
target_link_libraries(CMSISDSP INTERFACE CMSISDSPSupport)
endif()
if (TRANSFORM)
add_subdirectory(TransformFunctions)
# FFT tables inclusion is allowed
if (CONFIGTABLE)
target_compile_definitions(CMSISDSPTransform PUBLIC ARM_FFT_ALLOW_TABLES)
endif()
target_link_libraries(CMSISDSP INTERFACE CMSISDSPTransform)
endif()
if (FILTERING OR CONTROLLER OR FASTMATH OR TRANSFORM)
add_subdirectory(CommonTables)
if (TRANSFORM)
# FFT tables inclusion is allowed
if (CONFIGTABLE)
target_compile_definitions(CMSISDSPCommon PUBLIC ARM_FFT_ALLOW_TABLES)
endif()
endif()
if (FILTERING OR CONTROLLER OR FASTMATH)
# Select which tables to include
if (CONFIGTABLE)
target_compile_definitions(CMSISDSPCommon PUBLIC ARM_FAST_ALLOW_TABLES)
endif()
endif()
target_link_libraries(CMSISDSP INTERFACE CMSISDSPCommon)
endif()
### Includes
target_include_directories(CMSISDSP INTERFACE "${DSP}/../Include")

View file

@ -0,0 +1,31 @@
cmake_minimum_required (VERSION 3.6)
project(CMSISDSPCommon)
add_library(CMSISDSPCommon STATIC arm_common_tables.c)
if (CONFIGTABLE AND ALLFFT)
target_compile_definitions(CMSISDSPCommon PUBLIC ARM_ALL_FFT_TABLES)
endif()
if (CONFIGTABLE AND ALLFAST)
target_compile_definitions(CMSISDSPCommon PUBLIC ARM_ALL_FAST_TABLES)
endif()
include(fft)
fft(CMSISDSPCommon)
include(interpol)
interpol(CMSISDSPCommon)
target_sources(CMSISDSPCommon PRIVATE arm_const_structs.c)
configdsp(CMSISDSPCommon ..)
### Includes
target_include_directories(CMSISDSPCommon PUBLIC "${DSP}/../../Include")

View file

@ -0,0 +1,31 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: CommonTables.c
* Description: Combination of all common table source files.
*
* $Date: 18. March 2019
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_common_tables.c"
#include "arm_const_structs.c"

File diff suppressed because it is too large Load diff

View file

@ -29,351 +29,458 @@
#include "arm_const_structs.h"
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
/* Floating-point structs */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len16 = {
16, twiddleCoef_16, armBitRevIndexTable16, ARMBITREVINDEXTABLE_16_TABLE_LENGTH
16, twiddleCoef_16, armBitRevIndexTable16, ARMBITREVINDEXTABLE_16_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len32 = {
32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH
32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len64 = {
64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH
64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len128 = {
128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH
128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len256 = {
256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH
256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len512 = {
512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH
512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len1024 = {
1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH
1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len2048 = {
2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH
2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len4096 = {
4096, twiddleCoef_4096, armBitRevIndexTable4096, ARMBITREVINDEXTABLE_4096_TABLE_LENGTH
4096, twiddleCoef_4096, armBitRevIndexTable4096, ARMBITREVINDEXTABLE_4096_TABLE_LENGTH
};
#endif
/* Fixed-point structs */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len16 = {
16, twiddleCoef_16_q31, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
16, twiddleCoef_16_q31, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len32 = {
32, twiddleCoef_32_q31, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
32, twiddleCoef_32_q31, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len64 = {
64, twiddleCoef_64_q31, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
64, twiddleCoef_64_q31, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len128 = {
128, twiddleCoef_128_q31, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
128, twiddleCoef_128_q31, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len256 = {
256, twiddleCoef_256_q31, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
256, twiddleCoef_256_q31, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len512 = {
512, twiddleCoef_512_q31, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
512, twiddleCoef_512_q31, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len1024 = {
1024, twiddleCoef_1024_q31, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
1024, twiddleCoef_1024_q31, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len2048 = {
2048, twiddleCoef_2048_q31, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
2048, twiddleCoef_2048_q31, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len4096 = {
4096, twiddleCoef_4096_q31, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
4096, twiddleCoef_4096_q31, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len16 = {
16, twiddleCoef_16_q15, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
16, twiddleCoef_16_q15, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len32 = {
32, twiddleCoef_32_q15, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
32, twiddleCoef_32_q15, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len64 = {
64, twiddleCoef_64_q15, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
64, twiddleCoef_64_q15, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len128 = {
128, twiddleCoef_128_q15, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
128, twiddleCoef_128_q15, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len256 = {
256, twiddleCoef_256_q15, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
256, twiddleCoef_256_q15, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len512 = {
512, twiddleCoef_512_q15, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
512, twiddleCoef_512_q15, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len1024 = {
1024, twiddleCoef_1024_q15, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
1024, twiddleCoef_1024_q15, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len2048 = {
2048, twiddleCoef_2048_q15, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
2048, twiddleCoef_2048_q15, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len4096 = {
4096, twiddleCoef_4096_q15, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
4096, twiddleCoef_4096_q15, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
};
#endif
/* Structure for real-value inputs */
/* Floating-point structs */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len32 = {
{ 16, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_16_TABLE_LENGTH },
32U,
(float32_t *)twiddleCoef_rfft_32
{ 16, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_16_TABLE_LENGTH },
32U,
(float32_t *)twiddleCoef_rfft_32
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len64 = {
{ 32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH },
64U,
(float32_t *)twiddleCoef_rfft_64
{ 32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH },
64U,
(float32_t *)twiddleCoef_rfft_64
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len128 = {
{ 64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH },
128U,
(float32_t *)twiddleCoef_rfft_128
{ 64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH },
128U,
(float32_t *)twiddleCoef_rfft_128
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len256 = {
{ 128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH },
256U,
(float32_t *)twiddleCoef_rfft_256
{ 128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH },
256U,
(float32_t *)twiddleCoef_rfft_256
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len512 = {
{ 256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH },
512U,
(float32_t *)twiddleCoef_rfft_512
{ 256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH },
512U,
(float32_t *)twiddleCoef_rfft_512
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len1024 = {
{ 512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH },
1024U,
(float32_t *)twiddleCoef_rfft_1024
{ 512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH },
1024U,
(float32_t *)twiddleCoef_rfft_1024
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len2048 = {
{ 1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH },
2048U,
(float32_t *)twiddleCoef_rfft_2048
{ 1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH },
2048U,
(float32_t *)twiddleCoef_rfft_2048
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len4096 = {
{ 2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH },
4096U,
(float32_t *)twiddleCoef_rfft_4096
{ 2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH },
4096U,
(float32_t *)twiddleCoef_rfft_4096
};
#endif
/* Fixed-point structs */
/* q31_t */
extern const q31_t realCoefAQ31[8192];
extern const q31_t realCoefBQ31[8192];
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len32 = {
32U,
0,
1,
256U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len16
32U,
0,
1,
256U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len16
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len64 = {
64U,
0,
1,
128U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len32
64U,
0,
1,
128U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len32
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len128 = {
128U,
0,
1,
64U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len64
128U,
0,
1,
64U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len64
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len256 = {
256U,
0,
1,
32U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len128
256U,
0,
1,
32U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len128
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len512 = {
512U,
0,
1,
16U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len256
512U,
0,
1,
16U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len256
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len1024 = {
1024U,
0,
1,
8U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len512
1024U,
0,
1,
8U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len512
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len2048 = {
2048U,
0,
1,
4U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len1024
2048U,
0,
1,
4U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len1024
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len4096 = {
4096U,
0,
1,
2U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len2048
4096U,
0,
1,
2U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len2048
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len8192 = {
8192U,
0,
1,
1U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len4096
8192U,
0,
1,
1U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len4096
};
#endif
/* q15_t */
extern const q15_t realCoefAQ15[8192];
extern const q15_t realCoefBQ15[8192];
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len32 = {
32U,
0,
1,
256U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len16
32U,
0,
1,
256U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len16
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len64 = {
64U,
0,
1,
128U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len32
64U,
0,
1,
128U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len32
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len128 = {
128U,
0,
1,
64U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len64
128U,
0,
1,
64U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len64
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len256 = {
256U,
0,
1,
32U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len128
256U,
0,
1,
32U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len128
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len512 = {
512U,
0,
1,
16U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len256
512U,
0,
1,
16U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len256
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len1024 = {
1024U,
0,
1,
8U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len512
1024U,
0,
1,
8U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len512
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len2048 = {
2048U,
0,
1,
4U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len1024
2048U,
0,
1,
4U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len1024
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len4096 = {
4096U,
0,
1,
2U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len2048
4096U,
0,
1,
2U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len2048
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len8192 = {
8192U,
0,
1,
1U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len4096
8192U,
0,
1,
1U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len4096
};
#endif
#endif

View file

@ -0,0 +1,16 @@
cmake_minimum_required (VERSION 3.6)
project(CMSISDSPComplexMath)
file(GLOB SRC "./*_*.c")
add_library(CMSISDSPComplexMath STATIC ${SRC})
configdsp(CMSISDSPComplexMath ..)
### Includes
target_include_directories(CMSISDSPComplexMath PUBLIC "${DSP}/../../Include")

View file

@ -0,0 +1,46 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: CompexMathFunctions.c
* Description: Combination of all comlex math function source files.
*
* $Date: 18. March 2019
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_cmplx_conj_f32.c"
#include "arm_cmplx_conj_q15.c"
#include "arm_cmplx_conj_q31.c"
#include "arm_cmplx_dot_prod_f32.c"
#include "arm_cmplx_dot_prod_q15.c"
#include "arm_cmplx_dot_prod_q31.c"
#include "arm_cmplx_mag_f32.c"
#include "arm_cmplx_mag_q15.c"
#include "arm_cmplx_mag_q31.c"
#include "arm_cmplx_mag_squared_f32.c"
#include "arm_cmplx_mag_squared_q15.c"
#include "arm_cmplx_mag_squared_q31.c"
#include "arm_cmplx_mult_cmplx_f32.c"
#include "arm_cmplx_mult_cmplx_q15.c"
#include "arm_cmplx_mult_cmplx_q31.c"
#include "arm_cmplx_mult_real_f32.c"
#include "arm_cmplx_mult_real_q15.c"
#include "arm_cmplx_mult_real_q31.c"

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_conj_f32.c
* Description: Floating-point complex conjugate
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,143 +29,133 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @defgroup cmplx_conj Complex Conjugate
*
* Conjugates the elements of a complex data vector.
*
* The <code>pSrc</code> points to the source data and
* <code>pDst</code> points to the where the result should be written.
* <code>numSamples</code> specifies the number of complex samples
* and the data in each array is stored in an interleaved fashion
* (real, imag, real, imag, ...).
* Each array has a total of <code>2*numSamples</code> values.
* The underlying algorithm is used:
*
* <pre>
* for(n=0; n<numSamples; n++) {
* pDst[(2*n)+0)] = pSrc[(2*n)+0]; // real part
* pDst[(2*n)+1)] = -pSrc[(2*n)+1]; // imag part
* }
* </pre>
*
* There are separate functions for floating-point, Q15, and Q31 data types.
@defgroup cmplx_conj Complex Conjugate
Conjugates the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to the destination data where the result should be written.
<code>numSamples</code> specifies the number of complex samples
and the data in each array is stored in an interleaved fashion
(real, imag, real, imag, ...).
Each array has a total of <code>2*numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[(2*n) ] = pSrc[(2*n) ]; // real part
pDst[(2*n)+1] = -pSrc[(2*n)+1]; // imag part
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
* @addtogroup cmplx_conj
* @{
@addtogroup cmplx_conj
@{
*/
/**
* @brief Floating-point complex conjugate.
* @param *pSrc points to the input vector
* @param *pDst points to the output vector
* @param numSamples number of complex samples in each vector
* @return none.
@brief Floating-point complex conjugate.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] numSamples number of samples in each vector
@return none
*/
void arm_cmplx_conj_f32(
float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
#if defined(ARM_MATH_NEON)
float32x4_t zero;
float32x4x2_t vec;
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t inR1, inR2, inR3, inR4;
float32_t inI1, inI2, inI3, inI4;
zero = vdupq_n_f32(0.0);
/*loop Unrolling */
/* Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0]+jC[1] = A[0]+(-1)*jA[1] */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
vec = vld2q_f32(pSrc);
vec.val[1] = vsubq_f32(zero,vec.val[1]);
vst2q_f32(pDst,vec);
/* Increment pointers */
pSrc += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = numSamples & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[0]+jC[1] = A[0]+ j (-1) A[1] */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
/* read real input samples */
inR1 = pSrc[0];
/* store real samples to destination */
pDst[0] = inR1;
inR2 = pSrc[2];
pDst[2] = inR2;
inR3 = pSrc[4];
pDst[4] = inR3;
inR4 = pSrc[6];
pDst[6] = inR4;
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* read imaginary input samples */
inI1 = pSrc[1];
inI2 = pSrc[3];
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
/* conjugate input */
inI1 = -inI1;
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
/* read imaginary input samples */
inI3 = pSrc[5];
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
/* conjugate input */
inI2 = -inI2;
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
/* read imaginary input samples */
inI4 = pSrc[7];
/* conjugate input */
inI3 = -inI3;
/* store imaginary samples to destination */
pDst[1] = inI1;
pDst[3] = inI2;
/* conjugate input */
inI4 = -inI4;
/* store imaginary samples to destination */
pDst[5] = inI3;
/* increment source pointer by 8 to process next sampels */
pSrc += 8U;
/* store imaginary sample to destination */
pDst[7] = inI4;
/* increment destination pointer by 8 to store next samples */
pDst += 8U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined (ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* realOut + j (imagOut) = realIn + j (-1) imagIn */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
*pDst++ = *pSrc++;
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of cmplx_conj group
@} end of cmplx_conj group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_conj_q15.c
* Description: Q15 complex conjugate
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,68 +29,66 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_conj
* @{
@addtogroup cmplx_conj
@{
*/
/**
* @brief Q15 complex conjugate.
* @param *pSrc points to the input vector
* @param *pDst points to the output vector
* @param numSamples number of complex samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
@brief Q15 complex conjugate.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
*/
void arm_cmplx_conj_q15(
q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
const q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
q31_t in1; /* Temporary input variable */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t in2, in3, in4; /* Temporary input variables */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counter */
q31_t in1, in2, in3, in4;
q31_t zero = 0;
/*loop Unrolling */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[0]+jC[1] = A[0]+ j (-1) A[1] */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
in1 = *__SIMD32(pSrc)++;
in2 = *__SIMD32(pSrc)++;
in3 = *__SIMD32(pSrc)++;
in4 = *__SIMD32(pSrc)++;
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
in1 = read_q15x2_ia ((q15_t **) &pSrc);
in2 = read_q15x2_ia ((q15_t **) &pSrc);
in3 = read_q15x2_ia ((q15_t **) &pSrc);
in4 = read_q15x2_ia ((q15_t **) &pSrc);
#ifndef ARM_MATH_BIG_ENDIAN
in1 = __QASX(zero, in1);
in2 = __QASX(zero, in2);
in3 = __QASX(zero, in3);
in4 = __QASX(zero, in4);
in1 = __QASX(0, in1);
in2 = __QASX(0, in2);
in3 = __QASX(0, in3);
in4 = __QASX(0, in4);
#else
in1 = __QSAX(zero, in1);
in2 = __QSAX(zero, in2);
in3 = __QSAX(zero, in3);
in4 = __QSAX(zero, in4);
in1 = __QSAX(0, in1);
in2 = __QSAX(0, in2);
in3 = __QSAX(0, in3);
in4 = __QSAX(0, in4);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
in1 = ((uint32_t) in1 >> 16) | ((uint32_t) in1 << 16);
@ -98,52 +96,62 @@ void arm_cmplx_conj_q15(
in3 = ((uint32_t) in3 >> 16) | ((uint32_t) in3 << 16);
in4 = ((uint32_t) in4 >> 16) | ((uint32_t) in4 << 16);
*__SIMD32(pDst)++ = in1;
*__SIMD32(pDst)++ = in2;
*__SIMD32(pDst)++ = in3;
*__SIMD32(pDst)++ = in4;
write_q15x2_ia (&pDst, in1);
write_q15x2_ia (&pDst, in2);
write_q15x2_ia (&pDst, in3);
write_q15x2_ia (&pDst, in4);
#else
*pDst++ = *pSrc++;
in1 = *pSrc++;
*pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
/* Decrement the loop counter */
*pDst++ = *pSrc++;
in1 = *pSrc++;
*pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
*pDst++ = *pSrc++;
in1 = *pSrc++;
*pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
*pDst++ = *pSrc++;
in1 = *pSrc++;
*pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[0]+jC[1] = A[0]+ j (-1) A[1] */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
*pDst++ = *pSrc++;
*pDst++ = __SSAT(-*pSrc++, 16);
/* Decrement the loop counter */
blkCnt--;
}
#else
q15_t in;
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
/* Run the below code for Cortex-M0 */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (numSamples > 0U)
while (blkCnt > 0U)
{
/* realOut + j (imagOut) = realIn+ j (-1) imagIn */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
*pDst++ = *pSrc++;
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? 0x7fff : -in;
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Decrement the loop counter */
numSamples--;
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
in1 = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __SSAT(-in1, 16);
#else
*pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of cmplx_conj group
@} end of cmplx_conj group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_conj_q31.c
* Description: Q31 complex conjugate
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,141 +29,109 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_conj
* @{
@addtogroup cmplx_conj
@{
*/
/**
* @brief Q31 complex conjugate.
* @param *pSrc points to the input vector
* @param *pDst points to the output vector
* @param numSamples number of complex samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
@brief Q31 complex conjugate.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
*/
void arm_cmplx_conj_q31(
q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
const q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* loop counter */
q31_t in; /* Input value */
uint32_t blkCnt; /* Loop counter */
q31_t in; /* Temporary input variable */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t inR1, inR2, inR3, inR4; /* Temporary real variables */
q31_t inI1, inI2, inI3, inI4; /* Temporary imaginary variables */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[0]+jC[1] = A[0]+ j (-1) A[1] */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
/* Saturated to 0x7fffffff if the input is -1(0x80000000) */
/* read real input sample */
inR1 = pSrc[0];
/* store real input sample */
pDst[0] = inR1;
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* read imaginary input sample */
inI1 = pSrc[1];
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* read real input sample */
inR2 = pSrc[2];
/* store real input sample */
pDst[2] = inR2;
*pDst++ = *pSrc++;
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* read imaginary input sample */
inI2 = pSrc[3];
*pDst++ = *pSrc++;
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* negate imaginary input sample */
inI1 = __QSUB(0, inI1);
*pDst++ = *pSrc++;
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* read real input sample */
inR3 = pSrc[4];
/* store real input sample */
pDst[4] = inR3;
/* read imaginary input sample */
inI3 = pSrc[5];
/* negate imaginary input sample */
inI2 = __QSUB(0, inI2);
/* read real input sample */
inR4 = pSrc[6];
/* store real input sample */
pDst[6] = inR4;
/* negate imaginary input sample */
inI3 = __QSUB(0, inI3);
/* store imaginary input sample */
inI4 = pSrc[7];
/* store imaginary input samples */
pDst[1] = inI1;
/* negate imaginary input sample */
inI4 = __QSUB(0, inI4);
/* store imaginary input samples */
pDst[3] = inI2;
/* increment source pointer by 8 to proecess next samples */
pSrc += 8U;
/* store imaginary input samples */
pDst[5] = inI3;
pDst[7] = inI4;
/* increment destination pointer by 8 to process next samples */
pDst += 8U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[0]+jC[1] = A[0]+ j (-1) A[1] */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
/* Saturated to 0x7fffffff if the input is -1(0x80000000) */
*pDst++ = *pSrc++;
in = *pSrc++;
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Decrement the loop counter */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of cmplx_conj group
@} end of cmplx_conj group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_dot_prod_f32.c
* Description: Floating-point complex dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,163 +29,205 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @defgroup cmplx_dot_prod Complex Dot Product
*
* Computes the dot product of two complex vectors.
* The vectors are multiplied element-by-element and then summed.
*
* The <code>pSrcA</code> points to the first complex input vector and
* <code>pSrcB</code> points to the second complex input vector.
* <code>numSamples</code> specifies the number of complex samples
* and the data in each array is stored in an interleaved fashion
* (real, imag, real, imag, ...).
* Each array has a total of <code>2*numSamples</code> values.
*
* The underlying algorithm is used:
* <pre>
* realResult=0;
* imagResult=0;
* for(n=0; n<numSamples; n++) {
* realResult += pSrcA[(2*n)+0]*pSrcB[(2*n)+0] - pSrcA[(2*n)+1]*pSrcB[(2*n)+1];
* imagResult += pSrcA[(2*n)+0]*pSrcB[(2*n)+1] + pSrcA[(2*n)+1]*pSrcB[(2*n)+0];
* }
* </pre>
*
* There are separate functions for floating-point, Q15, and Q31 data types.
@defgroup cmplx_dot_prod Complex Dot Product
Computes the dot product of two complex vectors.
The vectors are multiplied element-by-element and then summed.
The <code>pSrcA</code> points to the first complex input vector and
<code>pSrcB</code> points to the second complex input vector.
<code>numSamples</code> specifies the number of complex samples
and the data in each array is stored in an interleaved fashion
(real, imag, real, imag, ...).
Each array has a total of <code>2*numSamples</code> values.
The underlying algorithm is used:
<pre>
realResult = 0;
imagResult = 0;
for (n = 0; n < numSamples; n++) {
realResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
imagResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
* @addtogroup cmplx_dot_prod
* @{
@addtogroup cmplx_dot_prod
@{
*/
/**
* @brief Floating-point complex dot product
* @param *pSrcA points to the first input vector
* @param *pSrcB points to the second input vector
* @param numSamples number of complex samples in each vector
* @param *realResult real part of the result returned here
* @param *imagResult imaginary part of the result returned here
* @return none.
@brief Floating-point complex dot product.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] numSamples number of samples in each vector
@param[out] realResult real part of the result returned here
@param[out] imagResult imaginary part of the result returned here
@return none
*/
void arm_cmplx_dot_prod_f32(
float32_t * pSrcA,
float32_t * pSrcB,
uint32_t numSamples,
float32_t * realResult,
float32_t * imagResult)
const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t numSamples,
float32_t * realResult,
float32_t * imagResult)
{
float32_t real_sum = 0.0f, imag_sum = 0.0f; /* Temporary result storage */
float32_t a0,b0,c0,d0;
uint32_t blkCnt; /* Loop counter */
float32_t real_sum = 0.0f, imag_sum = 0.0f; /* Temporary result variables */
float32_t a0,b0,c0,d0;
#if defined (ARM_MATH_DSP)
#if defined(ARM_MATH_NEON)
float32x4x2_t vec1,vec2,vec3,vec4;
float32x4_t accR,accI;
float32x2_t accum = vdup_n_f32(0);
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counter */
accR = vdupq_n_f32(0.0);
accI = vdupq_n_f32(0.0);
/*loop Unrolling */
/* Loop unrolling: Compute 8 outputs at a time */
blkCnt = numSamples >> 3U;
while (blkCnt > 0U)
{
/* C = (A[0]+jA[1])*(B[0]+jB[1]) + ... */
/* Calculate dot product and then store the result in a temporary buffer. */
vec1 = vld2q_f32(pSrcA);
vec2 = vld2q_f32(pSrcB);
/* Increment pointers */
pSrcA += 8;
pSrcB += 8;
/* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
accR = vmlaq_f32(accR,vec1.val[0],vec2.val[0]);
accR = vmlsq_f32(accR,vec1.val[1],vec2.val[1]);
/* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
accI = vmlaq_f32(accI,vec1.val[1],vec2.val[0]);
accI = vmlaq_f32(accI,vec1.val[0],vec2.val[1]);
vec3 = vld2q_f32(pSrcA);
vec4 = vld2q_f32(pSrcB);
/* Increment pointers */
pSrcA += 8;
pSrcB += 8;
/* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
accR = vmlaq_f32(accR,vec3.val[0],vec4.val[0]);
accR = vmlsq_f32(accR,vec3.val[1],vec4.val[1]);
/* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
accI = vmlaq_f32(accI,vec3.val[1],vec4.val[0]);
accI = vmlaq_f32(accI,vec3.val[0],vec4.val[1]);
/* Decrement the loop counter */
blkCnt--;
}
accum = vpadd_f32(vget_low_f32(accR), vget_high_f32(accR));
real_sum += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(accI), vget_high_f32(accI));
imag_sum += accum[0] + accum[1];
/* Tail */
blkCnt = numSamples & 0x7;
#else
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
/* Decrement the loop counter */
blkCnt--;
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples & 0x3U;
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
/* Decrement the loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
/* Decrement the loop counter */
numSamples--;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Store the real and imaginary results in the destination buffers */
/* Store real and imaginary result in destination buffer. */
*realResult = real_sum;
*imagResult = imag_sum;
}
/**
* @} end of cmplx_dot_prod group
@} end of cmplx_dot_prod group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_dot_prod_q15.c
* Description: Processing function for the Q15 Complex Dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,143 +29,120 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_dot_prod
* @{
@addtogroup cmplx_dot_prod
@{
*/
/**
* @brief Q15 complex dot product
* @param *pSrcA points to the first input vector
* @param *pSrcB points to the second input vector
* @param numSamples number of complex samples in each vector
* @param *realResult real part of the result returned here
* @param *imagResult imaginary part of the result returned here
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function is implemented using an internal 64-bit accumulator.
* The intermediate 1.15 by 1.15 multiplications are performed with full precision and yield a 2.30 result.
* These are accumulated in a 64-bit accumulator with 34.30 precision.
* As a final step, the accumulators are converted to 8.24 format.
* The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
@brief Q15 complex dot product.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] numSamples number of samples in each vector
@param[out] realResult real part of the result returned here
@param[out] imagResult imaginary part of the result returned her
@return none
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The intermediate 1.15 by 1.15 multiplications are performed with full precision and yield a 2.30 result.
These are accumulated in a 64-bit accumulator with 34.30 precision.
As a final step, the accumulators are converted to 8.24 format.
The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
*/
void arm_cmplx_dot_prod_q15(
q15_t * pSrcA,
q15_t * pSrcB,
uint32_t numSamples,
q31_t * realResult,
q31_t * imagResult)
const q15_t * pSrcA,
const q15_t * pSrcB,
uint32_t numSamples,
q31_t * realResult,
q31_t * imagResult)
{
q63_t real_sum = 0, imag_sum = 0; /* Temporary result storage */
q15_t a0,b0,c0,d0;
uint32_t blkCnt; /* Loop counter */
q63_t real_sum = 0, imag_sum = 0; /* Temporary result variables */
q15_t a0,b0,c0,d0;
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counter */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
/* Decrement the loop counter */
blkCnt--;
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
/* Decrement the loop counter */
numSamples--;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Store the real and imaginary results in 8.24 format */
/* Store real and imaginary result in 8.24 format */
/* Convert real data in 34.30 to 8.24 by 6 right shifts */
*realResult = (q31_t) (real_sum >> 6);
/* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
@ -173,5 +150,5 @@ void arm_cmplx_dot_prod_q15(
}
/**
* @} end of cmplx_dot_prod group
@} end of cmplx_dot_prod group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_dot_prod_q31.c
* Description: Q31 complex dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,147 +29,125 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_dot_prod
* @{
@addtogroup cmplx_dot_prod
@{
*/
/**
* @brief Q31 complex dot product
* @param *pSrcA points to the first input vector
* @param *pSrcB points to the second input vector
* @param numSamples number of complex samples in each vector
* @param *realResult real part of the result returned here
* @param *imagResult imaginary part of the result returned here
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function is implemented using an internal 64-bit accumulator.
* The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
* The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
* Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768.
* The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format.
* Input down scaling is not required.
@brief Q31 complex dot product.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] numSamples number of samples in each vector
@param[out] realResult real part of the result returned here
@param[out] imagResult imaginary part of the result returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768.
The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format.
Input down scaling is not required.
*/
void arm_cmplx_dot_prod_q31(
q31_t * pSrcA,
q31_t * pSrcB,
uint32_t numSamples,
q63_t * realResult,
q63_t * imagResult)
const q31_t * pSrcA,
const q31_t * pSrcB,
uint32_t numSamples,
q63_t * realResult,
q63_t * imagResult)
{
q63_t real_sum = 0, imag_sum = 0; /* Temporary result storage */
q31_t a0,b0,c0,d0;
uint32_t blkCnt; /* Loop counter */
q63_t real_sum = 0, imag_sum = 0; /* Temporary result variables */
q31_t a0,b0,c0,d0;
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counter */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
/* Decrement the loop counter */
blkCnt--;
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
/* Decrement the loop counter */
numSamples--;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Store the real and imaginary results in 16.48 format */
/* Store real and imaginary result in 16.48 format */
*realResult = real_sum;
*imagResult = imag_sum;
}
/**
* @} end of cmplx_dot_prod group
@} end of cmplx_dot_prod group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_f32.c
* Description: Floating-point complex magnitude
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,125 +29,160 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @defgroup cmplx_mag Complex Magnitude
*
* Computes the magnitude of the elements of a complex data vector.
*
* The <code>pSrc</code> points to the source data and
* <code>pDst</code> points to the where the result should be written.
* <code>numSamples</code> specifies the number of complex samples
* in the input array and the data is stored in an interleaved fashion
* (real, imag, real, imag, ...).
* The input array has a total of <code>2*numSamples</code> values;
* the output array has a total of <code>numSamples</code> values.
* The underlying algorithm is used:
*
* <pre>
* for(n=0; n<numSamples; n++) {
* pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
* }
* </pre>
*
* There are separate functions for floating-point, Q15, and Q31 data types.
@defgroup cmplx_mag Complex Magnitude
Computes the magnitude of the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to the where the result should be written.
<code>numSamples</code> specifies the number of complex samples
in the input array and the data is stored in an interleaved fashion
(real, imag, real, imag, ...).
The input array has a total of <code>2*numSamples</code> values;
the output array has a total of <code>numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
* @addtogroup cmplx_mag
* @{
*/
/**
* @brief Floating-point complex magnitude.
* @param[in] *pSrc points to complex input buffer
* @param[out] *pDst points to real output buffer
* @param[in] numSamples number of complex samples in the input vector
* @return none.
*
@addtogroup cmplx_mag
@{
*/
/**
@brief Floating-point complex magnitude.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
*/
void arm_cmplx_mag_f32(
float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
{
float32_t realIn, imagIn; /* Temporary variables to hold input values */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counter */
float32_t real, imag; /* Temporary variables to hold input values */
/*loop Unrolling */
blkCnt = numSamples >> 2U;
#if defined(ARM_MATH_NEON)
float32x4x2_t vecA;
float32x4_t vRealA;
float32x4_t vImagA;
float32x4_t vMagSqA;
float32x4x2_t vecB;
float32x4_t vRealB;
float32x4_t vImagB;
float32x4_t vMagSqB;
/* Loop unrolling: Compute 8 outputs at a time */
blkCnt = numSamples >> 3;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* out = sqrt((real * real) + (imag * imag)) */
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
realIn = *pSrc++;
imagIn = *pSrc++;
/* store the result in the destination buffer. */
arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
vecA = vld2q_f32(pSrc);
pSrc += 8;
realIn = *pSrc++;
imagIn = *pSrc++;
arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
vecB = vld2q_f32(pSrc);
pSrc += 8;
realIn = *pSrc++;
imagIn = *pSrc++;
arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
vRealA = vmulq_f32(vecA.val[0], vecA.val[0]);
vImagA = vmulq_f32(vecA.val[1], vecA.val[1]);
vMagSqA = vaddq_f32(vRealA, vImagA);
realIn = *pSrc++;
imagIn = *pSrc++;
arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
vRealB = vmulq_f32(vecB.val[0], vecB.val[0]);
vImagB = vmulq_f32(vecB.val[1], vecB.val[1]);
vMagSqB = vaddq_f32(vRealB, vImagB);
/* Store the result in the destination buffer. */
vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(vMagSqA));
pDst += 4;
vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(vMagSqB));
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
realIn = *pSrc++;
imagIn = *pSrc++;
/* store the result in the destination buffer. */
arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
/* Decrement the loop counter */
blkCnt--;
}
blkCnt = numSamples & 7;
#else
/* Run the below code for Cortex-M0 */
#if defined (ARM_MATH_LOOPUNROLL)
while (numSamples > 0U)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* out = sqrt((real * real) + (imag * imag)) */
realIn = *pSrc++;
imagIn = *pSrc++;
/* store the result in the destination buffer. */
arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
/* Decrement the loop counter */
numSamples--;
real = *pSrc++;
imag = *pSrc++;
/* store result in destination buffer. */
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
real = *pSrc++;
imag = *pSrc++;
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
real = *pSrc++;
imag = *pSrc++;
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
real = *pSrc++;
imag = *pSrc++;
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
/* store result in destination buffer. */
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of cmplx_mag group
@} end of cmplx_mag group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_q15.c
* Description: Q15 complex magnitude
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,113 +29,134 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_mag
* @{
@addtogroup cmplx_mag
@{
*/
/**
* @brief Q15 complex magnitude
* @param *pSrc points to the complex input vector
* @param *pDst points to the real output vector
* @param numSamples number of complex samples in the input vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function implements 1.15 by 1.15 multiplications and finally output is converted into 2.14 format.
@brief Q15 complex magnitude.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function implements 1.15 by 1.15 multiplications and finally output is converted into 2.14 format.
*/
void arm_cmplx_mag_q15(
q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
const q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
{
q31_t acc0, acc1; /* Accumulators */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
q31_t in;
q31_t acc0; /* Accumulators */
#else
q15_t real, imag; /* Temporary input variables */
q31_t acc0, acc1; /* Accumulators */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counter */
q31_t in1, in2, in3, in4;
q31_t acc2, acc3;
#if defined (ARM_MATH_LOOPUNROLL)
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
in1 = *__SIMD32(pSrc)++;
in2 = *__SIMD32(pSrc)++;
in3 = *__SIMD32(pSrc)++;
in4 = *__SIMD32(pSrc)++;
acc0 = __SMUAD(in1, in1);
acc1 = __SMUAD(in2, in2);
acc2 = __SMUAD(in3, in3);
acc3 = __SMUAD(in4, in4);
/* store the result in 2.14 format in the destination buffer. */
arm_sqrt_q15((q15_t) ((acc0) >> 17), pDst++);
arm_sqrt_q15((q15_t) ((acc1) >> 17), pDst++);
arm_sqrt_q15((q15_t) ((acc2) >> 17), pDst++);
arm_sqrt_q15((q15_t) ((acc3) >> 17), pDst++);
/* Decrement the loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
in1 = *__SIMD32(pSrc)++;
acc0 = __SMUAD(in1, in1);
/* store the result in 2.14 format in the destination buffer. */
#if defined (ARM_MATH_DSP)
in = read_q15x2_ia ((q15_t **) &pSrc);
acc0 = __SMUAD(in, in);
/* store result in 2.14 format in destination buffer. */
arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
/* Decrement the loop counter */
in = read_q15x2_ia ((q15_t **) &pSrc);
acc0 = __SMUAD(in, in);
arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
in = read_q15x2_ia ((q15_t **) &pSrc);
acc0 = __SMUAD(in, in);
arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
in = read_q15x2_ia ((q15_t **) &pSrc);
acc0 = __SMUAD(in, in);
arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
#else
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
/* store result in 2.14 format in destination buffer. */
arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
q15_t real, imag; /* Temporary variables to hold input values */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* out = sqrt(real * real + imag * imag) */
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
#if defined (ARM_MATH_DSP)
in = read_q15x2_ia ((q15_t **) &pSrc);
acc0 = __SMUAD(in, in);
/* store result in 2.14 format in destination buffer. */
arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
#else
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
acc0 = (real * real);
acc1 = (imag * imag);
/* store the result in 2.14 format in the destination buffer. */
/* store result in 2.14 format in destination buffer. */
arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
#endif
/* Decrement the loop counter */
numSamples--;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of cmplx_mag group
@} end of cmplx_mag group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_q31.c
* Description: Q31 complex magnitude
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,145 +29,102 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_mag
* @{
@addtogroup cmplx_mag
@{
*/
/**
* @brief Q31 complex magnitude
* @param *pSrc points to the complex input vector
* @param *pDst points to the real output vector
* @param numSamples number of complex samples in the input vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function implements 1.31 by 1.31 multiplications and finally output is converted into 2.30 format.
* Input down scaling is not required.
@brief Q31 complex magnitude.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function implements 1.31 by 1.31 multiplications and finally output is converted into 2.30 format.
Input down scaling is not required.
*/
void arm_cmplx_mag_q31(
q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
const q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
{
q31_t real, imag; /* Temporary variables to hold input values */
q31_t acc0, acc1; /* Accumulators */
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
q31_t real, imag; /* Temporary input variables */
q31_t acc0, acc1; /* Accumulators */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t real1, real2, imag1, imag2; /* Temporary variables to hold input values */
q31_t out1, out2, out3, out4; /* Accumulators */
q63_t mul1, mul2, mul3, mul4; /* Temporary variables */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* read complex input from source buffer */
real1 = pSrc[0];
imag1 = pSrc[1];
real2 = pSrc[2];
imag2 = pSrc[3];
/* calculate power of input values */
mul1 = (q63_t) real1 *real1;
mul2 = (q63_t) imag1 *imag1;
mul3 = (q63_t) real2 *real2;
mul4 = (q63_t) imag2 *imag2;
/* get the result to 3.29 format */
out1 = (q31_t) (mul1 >> 33);
out2 = (q31_t) (mul2 >> 33);
out3 = (q31_t) (mul3 >> 33);
out4 = (q31_t) (mul4 >> 33);
/* add real and imaginary accumulators */
out1 = out1 + out2;
out3 = out3 + out4;
/* read complex input from source buffer */
real1 = pSrc[4];
imag1 = pSrc[5];
real2 = pSrc[6];
imag2 = pSrc[7];
/* calculate square root */
arm_sqrt_q31(out1, &pDst[0]);
/* calculate power of input values */
mul1 = (q63_t) real1 *real1;
/* calculate square root */
arm_sqrt_q31(out3, &pDst[1]);
/* calculate power of input values */
mul2 = (q63_t) imag1 *imag1;
mul3 = (q63_t) real2 *real2;
mul4 = (q63_t) imag2 *imag2;
/* get the result to 3.29 format */
out1 = (q31_t) (mul1 >> 33);
out2 = (q31_t) (mul2 >> 33);
out3 = (q31_t) (mul3 >> 33);
out4 = (q31_t) (mul4 >> 33);
/* add real and imaginary accumulators */
out1 = out1 + out2;
out3 = out3 + out4;
/* calculate square root */
arm_sqrt_q31(out1, &pDst[2]);
/* increment destination by 8 to process next samples */
pSrc += 8U;
/* calculate square root */
arm_sqrt_q31(out3, &pDst[3]);
/* increment destination by 4 to process next samples */
pDst += 4U;
/* Decrement the loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
/* store the result in 2.30 format in the destination buffer. */
/* store result in 2.30 format in destination buffer. */
arm_sqrt_q31(acc0 + acc1, pDst++);
/* Decrement the loop counter */
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
arm_sqrt_q31(acc0 + acc1, pDst++);
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
arm_sqrt_q31(acc0 + acc1, pDst++);
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
arm_sqrt_q31(acc0 + acc1, pDst++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
/* store result in 2.30 format in destination buffer. */
arm_sqrt_q31(acc0 + acc1, pDst++);
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of cmplx_mag group
@} end of cmplx_mag group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_squared_f32.c
* Description: Floating-point complex magnitude squared
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,176 +29,156 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @defgroup cmplx_mag_squared Complex Magnitude Squared
*
* Computes the magnitude squared of the elements of a complex data vector.
*
* The <code>pSrc</code> points to the source data and
* <code>pDst</code> points to the where the result should be written.
* <code>numSamples</code> specifies the number of complex samples
* in the input array and the data is stored in an interleaved fashion
* (real, imag, real, imag, ...).
* The input array has a total of <code>2*numSamples</code> values;
* the output array has a total of <code>numSamples</code> values.
*
* The underlying algorithm is used:
*
* <pre>
* for(n=0; n<numSamples; n++) {
* pDst[n] = pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2;
* }
* </pre>
*
* There are separate functions for floating-point, Q15, and Q31 data types.
@defgroup cmplx_mag_squared Complex Magnitude Squared
Computes the magnitude squared of the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to the where the result should be written.
<code>numSamples</code> specifies the number of complex samples
in the input array and the data is stored in an interleaved fashion
(real, imag, real, imag, ...).
The input array has a total of <code>2*numSamples</code> values;
the output array has a total of <code>numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[n] = pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2;
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
* @addtogroup cmplx_mag_squared
* @{
@addtogroup cmplx_mag_squared
@{
*/
/**
* @brief Floating-point complex magnitude squared
* @param[in] *pSrc points to the complex input vector
* @param[out] *pDst points to the real output vector
* @param[in] numSamples number of complex samples in the input vector
* @return none.
@brief Floating-point complex magnitude squared.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
*/
void arm_cmplx_mag_squared_f32(
float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
{
float32_t real, imag; /* Temporary variables to store real and imaginary values */
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
float32_t real, imag; /* Temporary input variables */
#if defined (ARM_MATH_DSP)
float32_t real1, real2, real3, real4; /* Temporary variables to hold real values */
float32_t imag1, imag2, imag3, imag4; /* Temporary variables to hold imaginary values */
float32_t mul1, mul2, mul3, mul4; /* Temporary variables */
float32_t mul5, mul6, mul7, mul8; /* Temporary variables */
float32_t out1, out2, out3, out4; /* Temporary variables to hold output values */
#if defined(ARM_MATH_NEON)
float32x4x2_t vecA;
float32x4_t vRealA;
float32x4_t vImagA;
float32x4_t vMagSqA;
/*loop Unrolling */
blkCnt = numSamples >> 2U;
float32x4x2_t vecB;
float32x4_t vRealB;
float32x4_t vImagB;
float32x4_t vMagSqB;
/* Loop unrolling: Compute 8 outputs at a time */
blkCnt = numSamples >> 3;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
/* read real input sample from source buffer */
real1 = pSrc[0];
/* read imaginary input sample from source buffer */
imag1 = pSrc[1];
/* out = sqrt((real * real) + (imag * imag)) */
/* calculate power of real value */
mul1 = real1 * real1;
vecA = vld2q_f32(pSrc);
pSrc += 8;
/* read real input sample from source buffer */
real2 = pSrc[2];
vRealA = vmulq_f32(vecA.val[0], vecA.val[0]);
vImagA = vmulq_f32(vecA.val[1], vecA.val[1]);
vMagSqA = vaddq_f32(vRealA, vImagA);
/* calculate power of imaginary value */
mul2 = imag1 * imag1;
vecB = vld2q_f32(pSrc);
pSrc += 8;
/* read imaginary input sample from source buffer */
imag2 = pSrc[3];
vRealB = vmulq_f32(vecB.val[0], vecB.val[0]);
vImagB = vmulq_f32(vecB.val[1], vecB.val[1]);
vMagSqB = vaddq_f32(vRealB, vImagB);
/* calculate power of real value */
mul3 = real2 * real2;
/* Store the result in the destination buffer. */
vst1q_f32(pDst, vMagSqA);
pDst += 4;
/* read real input sample from source buffer */
real3 = pSrc[4];
/* calculate power of imaginary value */
mul4 = imag2 * imag2;
/* read imaginary input sample from source buffer */
imag3 = pSrc[5];
/* calculate power of real value */
mul5 = real3 * real3;
/* calculate power of imaginary value */
mul6 = imag3 * imag3;
/* read real input sample from source buffer */
real4 = pSrc[6];
/* accumulate real and imaginary powers */
out1 = mul1 + mul2;
/* read imaginary input sample from source buffer */
imag4 = pSrc[7];
/* accumulate real and imaginary powers */
out2 = mul3 + mul4;
/* calculate power of real value */
mul7 = real4 * real4;
/* calculate power of imaginary value */
mul8 = imag4 * imag4;
/* store output to destination */
pDst[0] = out1;
/* accumulate real and imaginary powers */
out3 = mul5 + mul6;
/* store output to destination */
pDst[1] = out2;
/* accumulate real and imaginary powers */
out4 = mul7 + mul8;
/* store output to destination */
pDst[2] = out3;
/* increment destination pointer by 8 to process next samples */
pSrc += 8U;
/* store output to destination */
pDst[3] = out4;
/* increment destination pointer by 4 to process next samples */
pDst += 4U;
vst1q_f32(pDst, vMagSqB);
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples & 7;
#else
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
*pDst++ = (real * real) + (imag * imag);
real = *pSrc++;
imag = *pSrc++;
*pDst++ = (real * real) + (imag * imag);
real = *pSrc++;
imag = *pSrc++;
*pDst++ = (real * real) + (imag * imag);
real = *pSrc++;
imag = *pSrc++;
*pDst++ = (real * real) + (imag * imag);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
/* out = (real * real) + (imag * imag) */
/* store the result in the destination buffer. */
/* store result in destination buffer. */
*pDst++ = (real * real) + (imag * imag);
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of cmplx_mag_squared group
@} end of cmplx_mag_squared group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_squared_q15.c
* Description: Q15 complex magnitude squared
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,108 +29,133 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_mag_squared
* @{
@addtogroup cmplx_mag_squared
@{
*/
/**
* @brief Q15 complex magnitude squared
* @param *pSrc points to the complex input vector
* @param *pDst points to the real output vector
* @param numSamples number of complex samples in the input vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
@brief Q15 complex magnitude squared.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
*/
void arm_cmplx_mag_squared_q15(
q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
const q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
{
q31_t acc0, acc1; /* Accumulators */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
q31_t in;
q31_t acc0; /* Accumulators */
#else
q15_t real, imag; /* Temporary input variables */
q31_t acc0, acc1; /* Accumulators */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counter */
q31_t in1, in2, in3, in4;
q31_t acc2, acc3;
#if defined (ARM_MATH_LOOPUNROLL)
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
in1 = *__SIMD32(pSrc)++;
in2 = *__SIMD32(pSrc)++;
in3 = *__SIMD32(pSrc)++;
in4 = *__SIMD32(pSrc)++;
acc0 = __SMUAD(in1, in1);
acc1 = __SMUAD(in2, in2);
acc2 = __SMUAD(in3, in3);
acc3 = __SMUAD(in4, in4);
/* store the result in 3.13 format in the destination buffer. */
#if defined (ARM_MATH_DSP)
in = read_q15x2_ia ((q15_t **) &pSrc);
acc0 = __SMUAD(in, in);
/* store result in 3.13 format in destination buffer. */
*pDst++ = (q15_t) (acc0 >> 17);
*pDst++ = (q15_t) (acc1 >> 17);
*pDst++ = (q15_t) (acc2 >> 17);
*pDst++ = (q15_t) (acc3 >> 17);
/* Decrement the loop counter */
in = read_q15x2_ia ((q15_t **) &pSrc);
acc0 = __SMUAD(in, in);
*pDst++ = (q15_t) (acc0 >> 17);
in = read_q15x2_ia ((q15_t **) &pSrc);
acc0 = __SMUAD(in, in);
*pDst++ = (q15_t) (acc0 >> 17);
in = read_q15x2_ia ((q15_t **) &pSrc);
acc0 = __SMUAD(in, in);
*pDst++ = (q15_t) (acc0 >> 17);
#else
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
/* store result in 3.13 format in destination buffer. */
*pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
*pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
*pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
*pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
in1 = *__SIMD32(pSrc)++;
acc0 = __SMUAD(in1, in1);
/* store the result in 3.13 format in the destination buffer. */
*pDst++ = (q15_t) (acc0 >> 17);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
q15_t real, imag; /* Temporary variables to store real and imaginary values */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* out = ((real * real) + (imag * imag)) */
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
#if defined (ARM_MATH_DSP)
in = read_q15x2_ia ((q15_t **) &pSrc);
acc0 = __SMUAD(in, in);
/* store result in 3.13 format in destination buffer. */
*pDst++ = (q15_t) (acc0 >> 17);
#else
real = *pSrc++;
imag = *pSrc++;
acc0 = (real * real);
acc1 = (imag * imag);
/* store the result in 3.13 format in the destination buffer. */
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
/* store result in 3.13 format in destination buffer. */
*pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
#endif
/* Decrement the loop counter */
numSamples--;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of cmplx_mag_squared group
@} end of cmplx_mag_squared group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_squared_q31.c
* Description: Q31 complex magnitude squared
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,55 +29,43 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_mag_squared
* @{
@addtogroup cmplx_mag_squared
@{
*/
/**
* @brief Q31 complex magnitude squared
* @param *pSrc points to the complex input vector
* @param *pDst points to the real output vector
* @param numSamples number of complex samples in the input vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
* Input down scaling is not required.
@brief Q31 complex magnitude squared.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
Input down scaling is not required.
*/
void arm_cmplx_mag_squared_q31(
q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
const q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
{
q31_t real, imag; /* Temporary variables to store real and imaginary values */
q31_t acc0, acc1; /* Accumulators */
uint32_t blkCnt; /* Loop counter */
q31_t real, imag; /* Temporary input variables */
q31_t acc0, acc1; /* Accumulators */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counter */
/* loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
/* store the result in 3.29 format in the destination buffer. */
*pDst++ = acc0 + acc1;
real = *pSrc++;
imag = *pSrc++;
@ -90,60 +78,52 @@ void arm_cmplx_mag_squared_q31(
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
/* store the result in 3.29 format in the destination buffer. */
*pDst++ = acc0 + acc1;
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
/* store the result in 3.29 format in the destination buffer. */
*pDst++ = acc0 + acc1;
/* Decrement the loop counter */
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
*pDst++ = acc0 + acc1;
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
/* store the result in 3.29 format in the destination buffer. */
*pDst++ = acc0 + acc1;
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* out = ((real * real) + (imag * imag)) */
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
/* store the result in 3.29 format in the destination buffer. */
/* store result in 3.29 format in destination buffer. */
*pDst++ = acc0 + acc1;
/* Decrement the loop counter */
numSamples--;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of cmplx_mag_squared group
@} end of cmplx_mag_squared group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_cmplx_f32.c
* Description: Floating-point complex-by-complex multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,168 +29,166 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @defgroup CmplxByCmplxMult Complex-by-Complex Multiplication
*
* Multiplies a complex vector by another complex vector and generates a complex result.
* The data in the complex arrays is stored in an interleaved fashion
* (real, imag, real, imag, ...).
* The parameter <code>numSamples</code> represents the number of complex
* samples processed. The complex arrays have a total of <code>2*numSamples</code>
* real values.
*
* The underlying algorithm is used:
*
* <pre>
* for(n=0; n<numSamples; n++) {
* pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
* pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
* }
* </pre>
*
* There are separate functions for floating-point, Q15, and Q31 data types.
@defgroup CmplxByCmplxMult Complex-by-Complex Multiplication
Multiplies a complex vector by another complex vector and generates a complex result.
The data in the complex arrays is stored in an interleaved fashion
(real, imag, real, imag, ...).
The parameter <code>numSamples</code> represents the number of complex
samples processed. The complex arrays have a total of <code>2*numSamples</code>
real values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
* @addtogroup CmplxByCmplxMult
* @{
@addtogroup CmplxByCmplxMult
@{
*/
/**
* @brief Floating-point complex-by-complex multiplication
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] numSamples number of complex samples in each vector
* @return none.
@brief Floating-point complex-by-complex multiplication.
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
*/
void arm_cmplx_mult_cmplx_f32(
float32_t * pSrcA,
float32_t * pSrcB,
float32_t * pDst,
uint32_t numSamples)
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t numSamples)
{
float32_t a1, b1, c1, d1; /* Temporary variables to store real and imaginary values */
uint32_t blkCnt; /* loop counters */
uint32_t blkCnt; /* Loop counter */
float32_t a, b, c, d; /* Temporary variables to store real and imaginary values */
#if defined (ARM_MATH_DSP)
#if defined(ARM_MATH_NEON)
float32x4x2_t va, vb;
float32x4_t real, imag;
float32x4x2_t outCplx;
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t a2, b2, c2, d2; /* Temporary variables to store real and imaginary values */
float32_t acc1, acc2, acc3, acc4;
/* Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
va = vld2q_f32(pSrcA); // load & separate real/imag pSrcA (de-interleave 2)
vb = vld2q_f32(pSrcB); // load & separate real/imag pSrcB
/* loop Unrolling */
/* Increment pointers */
pSrcA += 8;
pSrcB += 8;
/* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
outCplx.val[0] = vmulq_f32(va.val[0], vb.val[0]);
outCplx.val[0] = vmlsq_f32(outCplx.val[0], va.val[1], vb.val[1]);
/* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
outCplx.val[1] = vmulq_f32(va.val[0], vb.val[1]);
outCplx.val[1] = vmlaq_f32(outCplx.val[1], va.val[1], vb.val[0]);
vst2q_f32(pDst, outCplx);
/* Increment pointer */
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = numSamples & 3;
#else
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
a1 = *pSrcA; /* A[2 * i] */
c1 = *pSrcB; /* B[2 * i] */
/* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
b1 = *(pSrcA + 1); /* A[2 * i + 1] */
acc1 = a1 * c1; /* acc1 = A[2 * i] * B[2 * i] */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* store result in destination buffer. */
*pDst++ = (a * c) - (b * d);
*pDst++ = (a * d) + (b * c);
a2 = *(pSrcA + 2); /* A[2 * i + 2] */
acc2 = (b1 * c1); /* acc2 = A[2 * i + 1] * B[2 * i] */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (a * c) - (b * d);
*pDst++ = (a * d) + (b * c);
d1 = *(pSrcB + 1); /* B[2 * i + 1] */
c2 = *(pSrcB + 2); /* B[2 * i + 2] */
acc1 -= b1 * d1; /* acc1 = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1] */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (a * c) - (b * d);
*pDst++ = (a * d) + (b * c);
d2 = *(pSrcB + 3); /* B[2 * i + 3] */
acc3 = a2 * c2; /* acc3 = A[2 * i + 2] * B[2 * i + 2] */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (a * c) - (b * d);
*pDst++ = (a * d) + (b * c);
b2 = *(pSrcA + 3); /* A[2 * i + 3] */
acc2 += (a1 * d1); /* acc2 = A[2 * i + 1] * B[2 * i] + A[2 * i] * B[2 * i + 1] */
a1 = *(pSrcA + 4); /* A[2 * i + 4] */
acc4 = (a2 * d2); /* acc4 = A[2 * i + 2] * B[2 * i + 3] */
c1 = *(pSrcB + 4); /* B[2 * i + 4] */
acc3 -= (b2 * d2); /* acc3 = A[2 * i + 2] * B[2 * i + 2] - A[2 * i + 3] * B[2 * i + 3] */
*pDst = acc1; /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1] */
b1 = *(pSrcA + 5); /* A[2 * i + 5] */
acc4 += b2 * c2; /* acc4 = A[2 * i + 2] * B[2 * i + 3] + A[2 * i + 3] * B[2 * i + 2] */
*(pDst + 1) = acc2; /* C[2 * i + 1] = A[2 * i + 1] * B[2 * i] + A[2 * i] * B[2 * i + 1] */
acc1 = (a1 * c1);
d1 = *(pSrcB + 5);
acc2 = (b1 * c1);
*(pDst + 2) = acc3;
*(pDst + 3) = acc4;
a2 = *(pSrcA + 6);
acc1 -= (b1 * d1);
c2 = *(pSrcB + 6);
acc2 += (a1 * d1);
b2 = *(pSrcA + 7);
acc3 = (a2 * c2);
d2 = *(pSrcB + 7);
acc4 = (b2 * c2);
*(pDst + 4) = acc1;
pSrcA += 8U;
acc3 -= (b2 * d2);
acc4 += (a2 * d2);
*(pDst + 5) = acc2;
pSrcB += 8U;
*(pDst + 6) = acc3;
*(pDst + 7) = acc4;
pDst += 8U;
/* Decrement the numSamples loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
a1 = *pSrcA++;
b1 = *pSrcA++;
c1 = *pSrcB++;
d1 = *pSrcB++;
/* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
/* store the result in the destination buffer. */
*pDst++ = (a1 * c1) - (b1 * d1);
*pDst++ = (a1 * d1) + (b1 * c1);
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* Decrement the numSamples loop counter */
/* store result in destination buffer. */
*pDst++ = (a * c) - (b * d);
*pDst++ = (a * d) + (b * c);
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of CmplxByCmplxMult group
@} end of CmplxByCmplxMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_cmplx_q15.c
* Description: Q15 complex-by-complex multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,153 +29,108 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup CmplxByCmplxMult
* @{
@addtogroup CmplxByCmplxMult
@{
*/
/**
* @brief Q15 complex-by-complex multiplication
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] numSamples number of complex samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
@brief Q15 complex-by-complex multiplication.
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
*/
void arm_cmplx_mult_cmplx_q15(
q15_t * pSrcA,
q15_t * pSrcB,
q15_t * pDst,
uint32_t numSamples)
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t numSamples)
{
q15_t a, b, c, d; /* Temporary variables to store real and imaginary values */
uint32_t blkCnt; /* Loop counter */
q15_t a, b, c, d; /* Temporary variables */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counters */
/* loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
/* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
/* store result in 3.13 format in destination buffer. */
*pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
*pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
*pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
*pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
*pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
*pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
/* Decrement the blockSize loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
/* Decrement the blockSize loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
/* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
/* store result in 3.13 format in destination buffer. */
*pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
*pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
/* Decrement the blockSize loop counter */
numSamples--;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of CmplxByCmplxMult group
@} end of CmplxByCmplxMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_cmplx_q31.c
* Description: Q31 complex-by-complex multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,286 +29,109 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup CmplxByCmplxMult
* @{
@addtogroup CmplxByCmplxMult
@{
*/
/**
* @brief Q31 complex-by-complex multiplication
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] numSamples number of complex samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
* Input down scaling is not required.
@brief Q31 complex-by-complex multiplication.
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
Input down scaling is not required.
*/
void arm_cmplx_mult_cmplx_q31(
q31_t * pSrcA,
q31_t * pSrcB,
q31_t * pDst,
uint32_t numSamples)
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t numSamples)
{
q31_t a, b, c, d; /* Temporary variables to store real and imaginary values */
uint32_t blkCnt; /* loop counters */
q31_t mul1, mul2, mul3, mul4;
q31_t out1, out2;
uint32_t blkCnt; /* Loop counter */
q31_t a, b, c, d; /* Temporary variables */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
mul1 = (q31_t) (((q63_t) a * c) >> 32);
mul2 = (q31_t) (((q63_t) b * d) >> 32);
mul3 = (q31_t) (((q63_t) a * d) >> 32);
mul4 = (q31_t) (((q63_t) b * c) >> 32);
mul1 = (mul1 >> 1);
mul2 = (mul2 >> 1);
mul3 = (mul3 >> 1);
mul4 = (mul4 >> 1);
out1 = mul1 - mul2;
out2 = mul3 + mul4;
/* store the real result in 3.29 format in the destination buffer. */
*pDst++ = out1;
/* store the imag result in 3.29 format in the destination buffer. */
*pDst++ = out2;
/* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
mul1 = (q31_t) (((q63_t) a * c) >> 32);
mul2 = (q31_t) (((q63_t) b * d) >> 32);
mul3 = (q31_t) (((q63_t) a * d) >> 32);
mul4 = (q31_t) (((q63_t) b * c) >> 32);
mul1 = (mul1 >> 1);
mul2 = (mul2 >> 1);
mul3 = (mul3 >> 1);
mul4 = (mul4 >> 1);
out1 = mul1 - mul2;
out2 = mul3 + mul4;
/* store the real result in 3.29 format in the destination buffer. */
*pDst++ = out1;
/* store the imag result in 3.29 format in the destination buffer. */
*pDst++ = out2;
/* store result in 3.29 format in destination buffer. */
*pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
*pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
mul1 = (q31_t) (((q63_t) a * c) >> 32);
mul2 = (q31_t) (((q63_t) b * d) >> 32);
mul3 = (q31_t) (((q63_t) a * d) >> 32);
mul4 = (q31_t) (((q63_t) b * c) >> 32);
mul1 = (mul1 >> 1);
mul2 = (mul2 >> 1);
mul3 = (mul3 >> 1);
mul4 = (mul4 >> 1);
out1 = mul1 - mul2;
out2 = mul3 + mul4;
/* store the real result in 3.29 format in the destination buffer. */
*pDst++ = out1;
/* store the imag result in 3.29 format in the destination buffer. */
*pDst++ = out2;
*pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
*pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
*pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
mul1 = (q31_t) (((q63_t) a * c) >> 32);
mul2 = (q31_t) (((q63_t) b * d) >> 32);
mul3 = (q31_t) (((q63_t) a * d) >> 32);
mul4 = (q31_t) (((q63_t) b * c) >> 32);
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
*pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
mul1 = (mul1 >> 1);
mul2 = (mul2 >> 1);
mul3 = (mul3 >> 1);
mul4 = (mul4 >> 1);
out1 = mul1 - mul2;
out2 = mul3 + mul4;
/* store the real result in 3.29 format in the destination buffer. */
*pDst++ = out1;
/* store the imag result in 3.29 format in the destination buffer. */
*pDst++ = out2;
/* Decrement the blockSize loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
mul1 = (q31_t) (((q63_t) a * c) >> 32);
mul2 = (q31_t) (((q63_t) b * d) >> 32);
mul3 = (q31_t) (((q63_t) a * d) >> 32);
mul4 = (q31_t) (((q63_t) b * c) >> 32);
mul1 = (mul1 >> 1);
mul2 = (mul2 >> 1);
mul3 = (mul3 >> 1);
mul4 = (mul4 >> 1);
out1 = mul1 - mul2;
out2 = mul3 + mul4;
/* store the real result in 3.29 format in the destination buffer. */
*pDst++ = out1;
/* store the imag result in 3.29 format in the destination buffer. */
*pDst++ = out2;
/* Decrement the blockSize loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
/* loop Unrolling */
blkCnt = numSamples >> 1U;
/* First part of the processing with loop unrolling. Compute 2 outputs at a time.
** a second loop below computes the remaining 1 sample. */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
mul1 = (q31_t) (((q63_t) a * c) >> 32);
mul2 = (q31_t) (((q63_t) b * d) >> 32);
mul3 = (q31_t) (((q63_t) a * d) >> 32);
mul4 = (q31_t) (((q63_t) b * c) >> 32);
mul1 = (mul1 >> 1);
mul2 = (mul2 >> 1);
mul3 = (mul3 >> 1);
mul4 = (mul4 >> 1);
out1 = mul1 - mul2;
out2 = mul3 + mul4;
/* store the real result in 3.29 format in the destination buffer. */
*pDst++ = out1;
/* store the imag result in 3.29 format in the destination buffer. */
*pDst++ = out2;
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
mul1 = (q31_t) (((q63_t) a * c) >> 32);
mul2 = (q31_t) (((q63_t) b * d) >> 32);
mul3 = (q31_t) (((q63_t) a * d) >> 32);
mul4 = (q31_t) (((q63_t) b * c) >> 32);
mul1 = (mul1 >> 1);
mul2 = (mul2 >> 1);
mul3 = (mul3 >> 1);
mul4 = (mul4 >> 1);
out1 = mul1 - mul2;
out2 = mul3 + mul4;
/* store the real result in 3.29 format in the destination buffer. */
*pDst++ = out1;
/* store the imag result in 3.29 format in the destination buffer. */
*pDst++ = out2;
/* Decrement the blockSize loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 2, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x2U;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
/* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
mul1 = (q31_t) (((q63_t) a * c) >> 32);
mul2 = (q31_t) (((q63_t) b * d) >> 32);
mul3 = (q31_t) (((q63_t) a * d) >> 32);
mul4 = (q31_t) (((q63_t) b * c) >> 32);
/* store result in 3.29 format in destination buffer. */
*pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
*pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
mul1 = (mul1 >> 1);
mul2 = (mul2 >> 1);
mul3 = (mul3 >> 1);
mul4 = (mul4 >> 1);
out1 = mul1 - mul2;
out2 = mul3 + mul4;
/* store the real result in 3.29 format in the destination buffer. */
*pDst++ = out1;
/* store the imag result in 3.29 format in the destination buffer. */
*pDst++ = out2;
/* Decrement the blockSize loop counter */
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of CmplxByCmplxMult group
@} end of CmplxByCmplxMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_real_f32.c
* Description: Floating-point complex by real multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,185 +29,141 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @defgroup CmplxByRealMult Complex-by-Real Multiplication
*
* Multiplies a complex vector by a real vector and generates a complex result.
* The data in the complex arrays is stored in an interleaved fashion
* (real, imag, real, imag, ...).
* The parameter <code>numSamples</code> represents the number of complex
* samples processed. The complex arrays have a total of <code>2*numSamples</code>
* real values while the real array has a total of <code>numSamples</code>
* real values.
*
* The underlying algorithm is used:
*
* <pre>
* for(n=0; n<numSamples; n++) {
* pCmplxDst[(2*n)+0] = pSrcCmplx[(2*n)+0] * pSrcReal[n];
* pCmplxDst[(2*n)+1] = pSrcCmplx[(2*n)+1] * pSrcReal[n];
* }
* </pre>
*
* There are separate functions for floating-point, Q15, and Q31 data types.
@defgroup CmplxByRealMult Complex-by-Real Multiplication
Multiplies a complex vector by a real vector and generates a complex result.
The data in the complex arrays is stored in an interleaved fashion
(real, imag, real, imag, ...).
The parameter <code>numSamples</code> represents the number of complex
samples processed. The complex arrays have a total of <code>2*numSamples</code>
real values while the real array has a total of <code>numSamples</code>
real values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pCmplxDst[(2*n)+0] = pSrcCmplx[(2*n)+0] * pSrcReal[n];
pCmplxDst[(2*n)+1] = pSrcCmplx[(2*n)+1] * pSrcReal[n];
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
* @addtogroup CmplxByRealMult
* @{
@addtogroup CmplxByRealMult
@{
*/
/**
* @brief Floating-point complex-by-real multiplication
* @param[in] *pSrcCmplx points to the complex input vector
* @param[in] *pSrcReal points to the real input vector
* @param[out] *pCmplxDst points to the complex output vector
* @param[in] numSamples number of samples in each vector
* @return none.
@brief Floating-point complex-by-real multiplication.
@param[in] pSrcCmplx points to complex input vector
@param[in] pSrcReal points to real input vector
@param[out] pCmplxDst points to complex output vector
@param[in] numSamples number of samples in each vector
@return none
*/
void arm_cmplx_mult_real_f32(
float32_t * pSrcCmplx,
float32_t * pSrcReal,
float32_t * pCmplxDst,
uint32_t numSamples)
const float32_t * pSrcCmplx,
const float32_t * pSrcReal,
float32_t * pCmplxDst,
uint32_t numSamples)
{
float32_t in; /* Temporary variable to store input value */
uint32_t blkCnt; /* loop counters */
uint32_t blkCnt; /* Loop counter */
float32_t in; /* Temporary variable */
#if defined (ARM_MATH_DSP)
#if defined(ARM_MATH_NEON)
float32x4_t r;
float32x4x2_t ab,outCplx;
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t inA1, inA2, inA3, inA4; /* Temporary variables to hold input data */
float32_t inA5, inA6, inA7, inA8; /* Temporary variables to hold input data */
float32_t inB1, inB2, inB3, inB4; /* Temporary variables to hold input data */
float32_t out1, out2, out3, out4; /* Temporary variables to hold output data */
float32_t out5, out6, out7, out8; /* Temporary variables to hold output data */
/* Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* loop Unrolling */
while (blkCnt > 0U)
{
ab = vld2q_f32(pSrcCmplx); // load & separate real/imag pSrcA (de-interleave 2)
r = vld1q_f32(pSrcReal); // load & separate real/imag pSrcB
/* Increment pointers */
pSrcCmplx += 8;
pSrcReal += 4;
outCplx.val[0] = vmulq_f32(ab.val[0], r);
outCplx.val[1] = vmulq_f32(ab.val[1], r);
vst2q_f32(pCmplxDst, outCplx);
pCmplxDst += 8;
blkCnt--;
}
/* Tail */
blkCnt = numSamples & 3;
#else
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
/* read input from complex input buffer */
inA1 = pSrcCmplx[0];
inA2 = pSrcCmplx[1];
/* read input from real input buffer */
inB1 = pSrcReal[0];
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
/* read input from complex input buffer */
inA3 = pSrcCmplx[2];
in = *pSrcReal++;
/* store result in destination buffer. */
*pCmplxDst++ = *pSrcCmplx++ * in;
*pCmplxDst++ = *pSrcCmplx++ * in;
/* multiply complex buffer real input with real buffer input */
out1 = inA1 * inB1;
in = *pSrcReal++;
*pCmplxDst++ = *pSrcCmplx++ * in;
*pCmplxDst++ = *pSrcCmplx++ * in;
/* read input from complex input buffer */
inA4 = pSrcCmplx[3];
in = *pSrcReal++;
*pCmplxDst++ = *pSrcCmplx++ * in;
*pCmplxDst++ = *pSrcCmplx++ * in;
/* multiply complex buffer imaginary input with real buffer input */
out2 = inA2 * inB1;
in = *pSrcReal++;
*pCmplxDst++ = *pSrcCmplx++* in;
*pCmplxDst++ = *pSrcCmplx++ * in;
/* read input from real input buffer */
inB2 = pSrcReal[1];
/* read input from complex input buffer */
inA5 = pSrcCmplx[4];
/* multiply complex buffer real input with real buffer input */
out3 = inA3 * inB2;
/* read input from complex input buffer */
inA6 = pSrcCmplx[5];
/* read input from real input buffer */
inB3 = pSrcReal[2];
/* multiply complex buffer imaginary input with real buffer input */
out4 = inA4 * inB2;
/* read input from complex input buffer */
inA7 = pSrcCmplx[6];
/* multiply complex buffer real input with real buffer input */
out5 = inA5 * inB3;
/* read input from complex input buffer */
inA8 = pSrcCmplx[7];
/* multiply complex buffer imaginary input with real buffer input */
out6 = inA6 * inB3;
/* read input from real input buffer */
inB4 = pSrcReal[3];
/* store result to destination bufer */
pCmplxDst[0] = out1;
/* multiply complex buffer real input with real buffer input */
out7 = inA7 * inB4;
/* store result to destination bufer */
pCmplxDst[1] = out2;
/* multiply complex buffer imaginary input with real buffer input */
out8 = inA8 * inB4;
/* store result to destination bufer */
pCmplxDst[2] = out3;
pCmplxDst[3] = out4;
pCmplxDst[4] = out5;
/* incremnet complex input buffer by 8 to process next samples */
pSrcCmplx += 8U;
/* store result to destination bufer */
pCmplxDst[5] = out6;
/* increment real input buffer by 4 to process next samples */
pSrcReal += 4U;
/* store result to destination bufer */
pCmplxDst[6] = out7;
pCmplxDst[7] = out8;
/* increment destination buffer by 8 to process next sampels */
pCmplxDst += 8U;
/* Decrement the numSamples loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
in = *pSrcReal++;
/* store the result in the destination buffer. */
*pCmplxDst++ = (*pSrcCmplx++) * (in);
*pCmplxDst++ = (*pSrcCmplx++) * (in);
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
/* Decrement the numSamples loop counter */
in = *pSrcReal++;
/* store result in destination buffer. */
*pCmplxDst++ = *pSrcCmplx++ * in;
*pCmplxDst++ = *pSrcCmplx++ * in;
/* Decrement loop counter */
blkCnt--;
}
}
/**
* @} end of CmplxByRealMult group
@} end of CmplxByRealMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_real_q15.c
* Description: Q15 complex by real multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,77 +29,71 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup CmplxByRealMult
* @{
@addtogroup CmplxByRealMult
@{
*/
/**
* @brief Q15 complex-by-real multiplication
* @param[in] *pSrcCmplx points to the complex input vector
* @param[in] *pSrcReal points to the real input vector
* @param[out] *pCmplxDst points to the complex output vector
* @param[in] numSamples number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
@brief Q15 complex-by-real multiplication.
@param[in] pSrcCmplx points to complex input vector
@param[in] pSrcReal points to real input vector
@param[out] pCmplxDst points to complex output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
void arm_cmplx_mult_real_q15(
q15_t * pSrcCmplx,
q15_t * pSrcReal,
q15_t * pCmplxDst,
uint32_t numSamples)
const q15_t * pSrcCmplx,
const q15_t * pSrcReal,
q15_t * pCmplxDst,
uint32_t numSamples)
{
q15_t in; /* Temporary variable to store input value */
uint32_t blkCnt; /* Loop counter */
q15_t in; /* Temporary variable */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t inA1, inA2; /* Temporary variables to hold input data */
q31_t inB1; /* Temporary variables to hold input data */
q15_t out1, out2, out3, out4; /* Temporary variables to hold output data */
q31_t mul1, mul2, mul3, mul4; /* Temporary variables to hold intermediate data */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counters */
q31_t inA1, inA2; /* Temporary variables to hold input data */
q31_t inB1; /* Temporary variables to hold input data */
q15_t out1, out2, out3, out4; /* Temporary variables to hold output data */
q31_t mul1, mul2, mul3, mul4; /* Temporary variables to hold intermediate data */
/* loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
/* read complex number both real and imaginary from complex input buffer */
inA1 = *__SIMD32(pSrcCmplx)++;
/* read two real values at a time from real input buffer */
inB1 = *__SIMD32(pSrcReal)++;
/* read complex number both real and imaginary from complex input buffer */
inA2 = *__SIMD32(pSrcCmplx)++;
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
#if defined (ARM_MATH_DSP)
/* read 2 complex numbers both real and imaginary from complex input buffer */
inA1 = read_q15x2_ia ((q15_t **) &pSrcCmplx);
inA2 = read_q15x2_ia ((q15_t **) &pSrcCmplx);
/* read 2 real values at a time from real input buffer */
inB1 = read_q15x2_ia ((q15_t **) &pSrcReal);
/* multiply complex number with real numbers */
#ifndef ARM_MATH_BIG_ENDIAN
mul1 = (q31_t) ((q15_t) (inA1) * (q15_t) (inB1));
mul1 = (q31_t) ((q15_t) (inA1) * (q15_t) (inB1));
mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1));
mul3 = (q31_t) ((q15_t) (inA2) * (q15_t) (inB1 >> 16));
mul3 = (q31_t) ((q15_t) (inA2) * (q15_t) (inB1 >> 16));
mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB1 >> 16));
#else
mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
mul1 = (q31_t) ((q15_t) inA1 * (q15_t) (inB1 >> 16));
mul1 = (q31_t) ((q15_t) inA1 * (q15_t) (inB1 >> 16));
mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) inB1);
mul3 = (q31_t) ((q15_t) inA2 * (q15_t) inB1);
mul3 = (q31_t) ((q15_t) inA2 * (q15_t) inB1);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* saturate the result */
@ -109,27 +103,23 @@ void arm_cmplx_mult_real_q15(
out4 = (q15_t) __SSAT(mul4 >> 15U, 16);
/* pack real and imaginary outputs and store them to destination */
*__SIMD32(pCmplxDst)++ = __PKHBT(out1, out2, 16);
*__SIMD32(pCmplxDst)++ = __PKHBT(out3, out4, 16);
write_q15x2_ia (&pCmplxDst, __PKHBT(out1, out2, 16));
write_q15x2_ia (&pCmplxDst, __PKHBT(out3, out4, 16));
inA1 = *__SIMD32(pSrcCmplx)++;
inB1 = *__SIMD32(pSrcReal)++;
inA2 = *__SIMD32(pSrcCmplx)++;
inA1 = read_q15x2_ia ((q15_t **) &pSrcCmplx);
inA2 = read_q15x2_ia ((q15_t **) &pSrcCmplx);
inB1 = read_q15x2_ia ((q15_t **) &pSrcReal);
#ifndef ARM_MATH_BIG_ENDIAN
mul1 = (q31_t) ((q15_t) (inA1) * (q15_t) (inB1));
mul1 = (q31_t) ((q15_t) (inA1) * (q15_t) (inB1));
mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1));
mul3 = (q31_t) ((q15_t) (inA2) * (q15_t) (inB1 >> 16));
mul3 = (q31_t) ((q15_t) (inA2) * (q15_t) (inB1 >> 16));
mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB1 >> 16));
#else
mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
mul1 = (q31_t) ((q15_t) inA1 * (q15_t) (inB1 >> 16));
mul1 = (q31_t) ((q15_t) inA1 * (q15_t) (inB1 >> 16));
mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) inB1);
mul3 = (q31_t) ((q15_t) inA2 * (q15_t) inB1);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
out1 = (q15_t) __SSAT(mul1 >> 15U, 16);
@ -137,55 +127,56 @@ void arm_cmplx_mult_real_q15(
out3 = (q15_t) __SSAT(mul3 >> 15U, 16);
out4 = (q15_t) __SSAT(mul4 >> 15U, 16);
*__SIMD32(pCmplxDst)++ = __PKHBT(out1, out2, 16);
*__SIMD32(pCmplxDst)++ = __PKHBT(out3, out4, 16);
/* Decrement the numSamples loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
write_q15x2_ia (&pCmplxDst, __PKHBT(out1, out2, 16));
write_q15x2_ia (&pCmplxDst, __PKHBT(out3, out4, 16));
#else
in = *pSrcReal++;
/* store the result in the destination buffer. */
*pCmplxDst++ =
(q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
*pCmplxDst++ =
(q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
/* Decrement the numSamples loop counter */
in = *pSrcReal++;
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
in = *pSrcReal++;
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
in = *pSrcReal++;
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* realOut = realA * realB. */
/* imagOut = imagA * realB. */
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
in = *pSrcReal++;
/* store the result in the destination buffer. */
*pCmplxDst++ =
(q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
*pCmplxDst++ =
(q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
/* Decrement the numSamples loop counter */
numSamples--;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of CmplxByRealMult group
@} end of CmplxByRealMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_real_q31.c
* Description: Q31 complex by real multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,183 +29,120 @@
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup CmplxByRealMult
* @{
@addtogroup CmplxByRealMult
@{
*/
/**
* @brief Q31 complex-by-real multiplication
* @param[in] *pSrcCmplx points to the complex input vector
* @param[in] *pSrcReal points to the real input vector
* @param[out] *pCmplxDst points to the complex output vector
* @param[in] numSamples number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
@brief Q31 complex-by-real multiplication.
@param[in] pSrcCmplx points to complex input vector
@param[in] pSrcReal points to real input vector
@param[out] pCmplxDst points to complex output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
*/
void arm_cmplx_mult_real_q31(
q31_t * pSrcCmplx,
q31_t * pSrcReal,
q31_t * pCmplxDst,
uint32_t numSamples)
const q31_t * pSrcCmplx,
const q31_t * pSrcReal,
q31_t * pCmplxDst,
uint32_t numSamples)
{
q31_t inA1; /* Temporary variable to store input value */
uint32_t blkCnt; /* Loop counter */
q31_t in; /* Temporary variable */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counters */
q31_t inA2, inA3, inA4; /* Temporary variables to hold input data */
q31_t inB1, inB2; /* Temporary variabels to hold input data */
q31_t out1, out2, out3, out4; /* Temporary variables to hold output data */
/* loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
/* read real input from complex input buffer */
inA1 = *pSrcCmplx++;
inA2 = *pSrcCmplx++;
/* read input from real input bufer */
inB1 = *pSrcReal++;
inB2 = *pSrcReal++;
/* read imaginary input from complex input buffer */
inA3 = *pSrcCmplx++;
inA4 = *pSrcCmplx++;
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
/* multiply complex input with real input */
out1 = ((q63_t) inA1 * inB1) >> 32;
out2 = ((q63_t) inA2 * inB1) >> 32;
out3 = ((q63_t) inA3 * inB2) >> 32;
out4 = ((q63_t) inA4 * inB2) >> 32;
in = *pSrcReal++;
#if defined (ARM_MATH_DSP)
/* store saturated result in 1.31 format to destination buffer */
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
#else
/* store result in destination buffer. */
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
#endif
/* sature the result */
out1 = __SSAT(out1, 31);
out2 = __SSAT(out2, 31);
out3 = __SSAT(out3, 31);
out4 = __SSAT(out4, 31);
in = *pSrcReal++;
#if defined (ARM_MATH_DSP)
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
#else
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
#endif
/* get result in 1.31 format */
out1 = out1 << 1;
out2 = out2 << 1;
out3 = out3 << 1;
out4 = out4 << 1;
in = *pSrcReal++;
#if defined (ARM_MATH_DSP)
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
#else
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
#endif
/* store the result to destination buffer */
*pCmplxDst++ = out1;
*pCmplxDst++ = out2;
*pCmplxDst++ = out3;
*pCmplxDst++ = out4;
in = *pSrcReal++;
#if defined (ARM_MATH_DSP)
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
#else
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
#endif
/* read real input from complex input buffer */
inA1 = *pSrcCmplx++;
inA2 = *pSrcCmplx++;
/* read input from real input bufer */
inB1 = *pSrcReal++;
inB2 = *pSrcReal++;
/* read imaginary input from complex input buffer */
inA3 = *pSrcCmplx++;
inA4 = *pSrcCmplx++;
/* multiply complex input with real input */
out1 = ((q63_t) inA1 * inB1) >> 32;
out2 = ((q63_t) inA2 * inB1) >> 32;
out3 = ((q63_t) inA3 * inB2) >> 32;
out4 = ((q63_t) inA4 * inB2) >> 32;
/* sature the result */
out1 = __SSAT(out1, 31);
out2 = __SSAT(out2, 31);
out3 = __SSAT(out3, 31);
out4 = __SSAT(out4, 31);
/* get result in 1.31 format */
out1 = out1 << 1;
out2 = out2 << 1;
out3 = out3 << 1;
out4 = out4 << 1;
/* store the result to destination buffer */
*pCmplxDst++ = out1;
*pCmplxDst++ = out2;
*pCmplxDst++ = out3;
*pCmplxDst++ = out4;
/* Decrement the numSamples loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
/* read real input from complex input buffer */
inA1 = *pSrcCmplx++;
inA2 = *pSrcCmplx++;
/* read input from real input bufer */
inB1 = *pSrcReal++;
/* multiply complex input with real input */
out1 = ((q63_t) inA1 * inB1) >> 32;
out2 = ((q63_t) inA2 * inB1) >> 32;
/* sature the result */
out1 = __SSAT(out1, 31);
out2 = __SSAT(out2, 31);
/* get result in 1.31 format */
out1 = out1 << 1;
out2 = out2 << 1;
/* store the result to destination buffer */
*pCmplxDst++ = out1;
*pCmplxDst++ = out2;
/* Decrement the numSamples loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* realOut = realA * realB. */
/* imagReal = imagA * realB. */
inA1 = *pSrcReal++;
/* store the result in the destination buffer. */
*pCmplxDst++ =
(q31_t) clip_q63_to_q31(((q63_t) * pSrcCmplx++ * inA1) >> 31);
*pCmplxDst++ =
(q31_t) clip_q63_to_q31(((q63_t) * pSrcCmplx++ * inA1) >> 31);
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
/* Decrement the numSamples loop counter */
numSamples--;
in = *pSrcReal++;
#if defined (ARM_MATH_DSP)
/* store saturated result in 1.31 format to destination buffer */
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
#else
/* store result in destination buffer. */
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of CmplxByRealMult group
@} end of CmplxByRealMult group
*/

View file

@ -0,0 +1,37 @@
cmake_minimum_required (VERSION 3.6)
project(CMSISDSPController)
add_library(CMSISDSPController STATIC)
configdsp(CMSISDSPController ..)
include(interpol)
interpol(CMSISDSPController)
if (CONFIGTABLE AND ALLFAST)
target_compile_definitions(CMSISDSPController PUBLIC ARM_ALL_FAST_TABLES)
endif()
target_sources(CMSISDSPController PRIVATE arm_pid_init_f32.c)
target_sources(CMSISDSPController PRIVATE arm_pid_init_q15.c)
target_sources(CMSISDSPController PRIVATE arm_pid_init_q31.c)
target_sources(CMSISDSPController PRIVATE arm_pid_reset_f32.c)
target_sources(CMSISDSPController PRIVATE arm_pid_reset_q15.c)
target_sources(CMSISDSPController PRIVATE arm_pid_reset_q31.c)
if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_COS_F32)
target_sources(CMSISDSPController PRIVATE arm_sin_cos_f32.c)
endif()
if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_COS_Q31)
target_sources(CMSISDSPController PRIVATE arm_sin_cos_q31.c)
endif()
### Includes
target_include_directories(CMSISDSPController PUBLIC "${DSP}/../../Include")

View file

@ -0,0 +1,37 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: ControllerFunctions.c
* Description: Combination of all controller function source files.
*
* $Date: 18. March 2019
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_pid_init_f32.c"
#include "arm_pid_init_q15.c"
#include "arm_pid_init_q31.c"
#include "arm_pid_reset_f32.c"
#include "arm_pid_reset_q15.c"
#include "arm_pid_reset_q31.c"
#include "arm_sin_cos_f32.c"
#include "arm_sin_cos_q31.c"

View file

@ -3,13 +3,13 @@
* Title: arm_pid_init_f32.c
* Description: Floating-point PID Control initialization function
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -28,29 +28,30 @@
#include "arm_math.h"
/**
* @addtogroup PID
* @{
/**
@addtogroup PID
@{
*/
/**
* @brief Initialization function for the floating-point PID Control.
* @param[in,out] *S points to an instance of the PID structure.
* @param[in] resetStateFlag flag to reset the state. 0 = no change in state & 1 = reset the state.
* @return none.
* \par Description:
* \par
* The <code>resetStateFlag</code> specifies whether to set state to zero or not. \n
* The function computes the structure fields: <code>A0</code>, <code>A1</code> <code>A2</code>
* using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
* also sets the state variables to all zeros.
@brief Initialization function for the floating-point PID Control.
@param[in,out] S points to an instance of the PID structure
@param[in] resetStateFlag
- value = 0: no change in state
- value = 1: reset state
@return none
@par Details
The <code>resetStateFlag</code> specifies whether to set state to zero or not. \n
The function computes the structure fields: <code>A0</code>, <code>A1</code> <code>A2</code>
using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
also sets the state variables to all zeros.
*/
void arm_pid_init_f32(
arm_pid_instance_f32 * S,
int32_t resetStateFlag)
{
/* Derived coefficient A0 */
S->A0 = S->Kp + S->Ki + S->Kd;
@ -63,12 +64,12 @@ void arm_pid_init_f32(
/* Check whether state needs reset or not */
if (resetStateFlag)
{
/* Clear the state buffer. The size will be always 3 samples */
/* Reset state to zero, The size will be always 3 samples */
memset(S->state, 0, 3U * sizeof(float32_t));
}
}
/**
* @} end of PID group
@} end of PID group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_pid_init_q15.c
* Description: Q15 PID Control initialization function
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -28,22 +28,24 @@
#include "arm_math.h"
/**
* @addtogroup PID
* @{
/**
@addtogroup PID
@{
*/
/**
* @details
* @param[in,out] *S points to an instance of the Q15 PID structure.
* @param[in] resetStateFlag flag to reset the state. 0 = no change in state 1 = reset the state.
* @return none.
* \par Description:
* \par
* The <code>resetStateFlag</code> specifies whether to set state to zero or not. \n
* The function computes the structure fields: <code>A0</code>, <code>A1</code> <code>A2</code>
* using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
* also sets the state variables to all zeros.
@brief Initialization function for the Q15 PID Control.
@param[in,out] S points to an instance of the Q15 PID structure
@param[in] resetStateFlag
- value = 0: no change in state
- value = 1: reset state
@return none
@par Details
The <code>resetStateFlag</code> specifies whether to set state to zero or not. \n
The function computes the structure fields: <code>A0</code>, <code>A1</code> <code>A2</code>
using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
also sets the state variables to all zeros.
*/
void arm_pid_init_q15(
@ -53,35 +55,20 @@ void arm_pid_init_q15(
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Derived coefficient A0 */
S->A0 = __QADD16(__QADD16(S->Kp, S->Ki), S->Kd);
/* Derived coefficients and pack into A1 */
#ifndef ARM_MATH_BIG_ENDIAN
S->A1 = __PKHBT(-__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), S->Kd, 16);
#else
S->A1 = __PKHBT(S->Kd, -__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Check whether state needs reset or not */
if (resetStateFlag)
{
/* Clear the state buffer. The size will be always 3 samples */
memset(S->state, 0, 3U * sizeof(q15_t));
}
#endif
#else
/* Run the below code for Cortex-M0 */
q31_t temp; /*to store the sum */
q31_t temp; /* to store the sum */
/* Derived coefficient A0 */
temp = S->Kp + S->Ki + S->Kd;
@ -92,19 +79,17 @@ void arm_pid_init_q15(
S->A1 = (q15_t) __SSAT(temp, 16);
S->A2 = S->Kd;
#endif /* #if defined (ARM_MATH_DSP) */
/* Check whether state needs reset or not */
if (resetStateFlag)
{
/* Clear the state buffer. The size will be always 3 samples */
/* Reset state to zero, The size will be always 3 samples */
memset(S->state, 0, 3U * sizeof(q15_t));
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of PID group
@} end of PID group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_pid_init_q31.c
* Description: Q31 PID Control initialization function
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -28,22 +28,24 @@
#include "arm_math.h"
/**
* @addtogroup PID
* @{
/**
@addtogroup PID
@{
*/
/**
* @brief Initialization function for the Q31 PID Control.
* @param[in,out] *S points to an instance of the Q31 PID structure.
* @param[in] resetStateFlag flag to reset the state. 0 = no change in state 1 = reset the state.
* @return none.
* \par Description:
* \par
* The <code>resetStateFlag</code> specifies whether to set state to zero or not. \n
* The function computes the structure fields: <code>A0</code>, <code>A1</code> <code>A2</code>
* using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
* also sets the state variables to all zeros.
@brief Initialization function for the Q31 PID Control.
@param[in,out] S points to an instance of the Q31 PID structure
@param[in] resetStateFlag
- value = 0: no change in state
- value = 1: reset state
@return none
@par Details
The <code>resetStateFlag</code> specifies whether to set state to zero or not. \n
The function computes the structure fields: <code>A0</code>, <code>A1</code> <code>A2</code>
using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
also sets the state variables to all zeros.
*/
void arm_pid_init_q31(
@ -53,20 +55,15 @@ void arm_pid_init_q31(
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Derived coefficient A0 */
S->A0 = __QADD(__QADD(S->Kp, S->Ki), S->Kd);
/* Derived coefficient A1 */
S->A1 = -__QADD(__QADD(S->Kd, S->Kd), S->Kp);
#else
/* Run the below code for Cortex-M0 */
q31_t temp;
q31_t temp; /* to store the sum */
/* Derived coefficient A0 */
temp = clip_q63_to_q31((q63_t) S->Kp + S->Ki);
@ -84,12 +81,12 @@ void arm_pid_init_q31(
/* Check whether state needs reset or not */
if (resetStateFlag)
{
/* Clear the state buffer. The size will be always 3 samples */
/* Reset state to zero, The size will be always 3 samples */
memset(S->state, 0, 3U * sizeof(q31_t));
}
}
/**
* @} end of PID group
@} end of PID group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_pid_reset_f32.c
* Description: Floating-point PID Control reset function
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -28,26 +28,27 @@
#include "arm_math.h"
/**
* @addtogroup PID
* @{
/**
@addtogroup PID
@{
*/
/**
* @brief Reset function for the floating-point PID Control.
* @param[in] *S Instance pointer of PID control data structure.
* @return none.
* \par Description:
* The function resets the state buffer to zeros.
*/
@brief Reset function for the floating-point PID Control.
@param[in,out] S points to an instance of the floating-point PID structure
@return none
@par Details
The function resets the state buffer to zeros.
*/
void arm_pid_reset_f32(
arm_pid_instance_f32 * S)
{
/* Clear the state buffer. The size will be always 3 samples */
/* Reset state to zero, The size will be always 3 samples */
memset(S->state, 0, 3U * sizeof(float32_t));
}
/**
* @} end of PID group
@} end of PID group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_pid_reset_q15.c
* Description: Q15 PID Control reset function
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -28,18 +28,20 @@
#include "arm_math.h"
/**
* @addtogroup PID
* @{
/**
@addtogroup PID
@{
*/
/**
* @brief Reset function for the Q15 PID Control.
* @param[in] *S Instance pointer of PID control data structure.
* @return none.
* \par Description:
* The function resets the state buffer to zeros.
*/
@brief Reset function for the Q15 PID Control.
@param[in,out] S points to an instance of the Q15 PID structure
@return none
@par Details
The function resets the state buffer to zeros.
*/
void arm_pid_reset_q15(
arm_pid_instance_q15 * S)
{
@ -48,5 +50,5 @@ void arm_pid_reset_q15(
}
/**
* @} end of PID group
@} end of PID group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_pid_reset_q31.c
* Description: Q31 PID Control reset function
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -28,26 +28,27 @@
#include "arm_math.h"
/**
* @addtogroup PID
* @{
/**
@addtogroup PID
@{
*/
/**
* @brief Reset function for the Q31 PID Control.
* @param[in] *S Instance pointer of PID control data structure.
* @return none.
* \par Description:
* The function resets the state buffer to zeros.
*/
@brief Reset function for the Q31 PID Control.
@param[in,out] S points to an instance of the Q31 PID structure
@return none
@par Details
The function resets the state buffer to zeros.
*/
void arm_pid_reset_q31(
arm_pid_instance_q31 * S)
{
/* Clear the state buffer. The size will be always 3 samples */
/* Reset state to zero, The size will be always 3 samples */
memset(S->state, 0, 3U * sizeof(q31_t));
}
/**
* @} end of PID group
@} end of PID group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_sin_cos_f32.c
* Description: Sine and Cosine calculation for floating-point values
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -30,115 +30,117 @@
#include "arm_common_tables.h"
/**
* @ingroup groupController
@ingroup groupController
*/
/**
* @defgroup SinCos Sine Cosine
*
* Computes the trigonometric sine and cosine values using a combination of table lookup
* and linear interpolation.
* There are separate functions for Q31 and floating-point data types.
* The input to the floating-point version is in degrees while the
* fixed-point Q31 have a scaled input with the range
* [-1 0.9999] mapping to [-180 +180] degrees.
*
* The floating point function also allows values that are out of the usual range. When this happens, the function will
* take extra time to adjust the input value to the range of [-180 180].
*
* The result is accurate to 5 digits after the decimal point.
*
* The implementation is based on table lookup using 360 values together with linear interpolation.
* The steps used are:
* -# Calculation of the nearest integer table index.
* -# Compute the fractional portion (fract) of the input.
* -# Fetch the value corresponding to \c index from sine table to \c y0 and also value from \c index+1 to \c y1.
* -# Sine value is computed as <code> *psinVal = y0 + (fract * (y1 - y0))</code>.
* -# Fetch the value corresponding to \c index from cosine table to \c y0 and also value from \c index+1 to \c y1.
* -# Cosine value is computed as <code> *pcosVal = y0 + (fract * (y1 - y0))</code>.
*/
@defgroup SinCos Sine Cosine
/**
* @addtogroup SinCos
* @{
Computes the trigonometric sine and cosine values using a combination of table lookup
and linear interpolation.
There are separate functions for Q31 and floating-point data types.
The input to the floating-point version is in degrees while the
fixed-point Q31 have a scaled input with the range
[-1 0.9999] mapping to [-180 +180] degrees.
The floating point function also allows values that are out of the usual range. When this happens, the function will
take extra time to adjust the input value to the range of [-180 180].
The result is accurate to 5 digits after the decimal point.
The implementation is based on table lookup using 360 values together with linear interpolation.
The steps used are:
-# Calculation of the nearest integer table index.
-# Compute the fractional portion (fract) of the input.
-# Fetch the value corresponding to \c index from sine table to \c y0 and also value from \c index+1 to \c y1.
-# Sine value is computed as <code> *psinVal = y0 + (fract * (y1 - y0))</code>.
-# Fetch the value corresponding to \c index from cosine table to \c y0 and also value from \c index+1 to \c y1.
-# Cosine value is computed as <code> *pcosVal = y0 + (fract * (y1 - y0))</code>.
*/
/**
* @brief Floating-point sin_cos function.
* @param[in] theta input value in degrees
* @param[out] *pSinVal points to the processed sine output.
* @param[out] *pCosVal points to the processed cos output.
* @return none.
@addtogroup SinCos
@{
*/
/**
@brief Floating-point sin_cos function.
@param[in] theta input value in degrees
@param[out] pSinVal points to processed sine output
@param[out] pCosVal points to processed cosine output
@return none
*/
void arm_sin_cos_f32(
float32_t theta,
float32_t * pSinVal,
float32_t * pCosVal)
float32_t theta,
float32_t * pSinVal,
float32_t * pCosVal)
{
float32_t fract, in; /* Temporary variables for input, output */
uint16_t indexS, indexC; /* Index variable */
float32_t f1, f2, d1, d2; /* Two nearest output values */
float32_t findex, Dn, Df, temp;
float32_t fract, in; /* Temporary input, output variables */
uint16_t indexS, indexC; /* Index variable */
float32_t f1, f2, d1, d2; /* Two nearest output values */
float32_t Dn, Df;
float32_t temp, findex;
/* input x is in degrees */
/* Scale the input, divide input by 360, for cosine add 0.25 (pi/2) to read sine table */
in = theta * 0.00277777777778f;
/* input x is in degrees */
/* Scale input, divide input by 360, for cosine add 0.25 (pi/2) to read sine table */
in = theta * 0.00277777777778f;
if (in < 0.0f)
{
in = -in;
}
if (in < 0.0f)
{
in = -in;
}
in = in - (int32_t)in;
in = in - (int32_t)in;
/* Calculation of index of the table */
findex = (float32_t) FAST_MATH_TABLE_SIZE * in;
indexS = ((uint16_t)findex) & 0x1ff;
indexC = (indexS + (FAST_MATH_TABLE_SIZE / 4)) & 0x1ff;
/* Calculate the nearest index */
findex = (float32_t)FAST_MATH_TABLE_SIZE * in;
indexS = ((uint16_t)findex) & 0x1ff;
indexC = (indexS + (FAST_MATH_TABLE_SIZE / 4)) & 0x1ff;
/* fractional value calculation */
fract = findex - (float32_t) indexS;
/* Calculation of fractional value */
fract = findex - (float32_t) indexS;
/* Read two nearest values of input value from the cos & sin tables */
f1 = sinTable_f32[indexC+0];
f2 = sinTable_f32[indexC+1];
d1 = -sinTable_f32[indexS+0];
d2 = -sinTable_f32[indexS+1];
/* Read two nearest values of input value from the cos & sin tables */
f1 = sinTable_f32[indexC ];
f2 = sinTable_f32[indexC+1];
d1 = -sinTable_f32[indexS ];
d2 = -sinTable_f32[indexS+1];
temp = (1.0f - fract) * f1 + fract * f2;
temp = (1.0f - fract) * f1 + fract * f2;
Dn = 0.0122718463030f; // delta between the two points (fixed), in this case 2*pi/FAST_MATH_TABLE_SIZE
Df = f2 - f1; // delta between the values of the functions
Dn = 0.0122718463030f; /* delta between the two points (fixed), in this case 2*pi/FAST_MATH_TABLE_SIZE */
Df = f2 - f1; /* delta between the values of the functions */
temp = Dn *(d1 + d2) - 2 * Df;
temp = fract * temp + (3 * Df - (d2 + 2 * d1) * Dn);
temp = fract * temp + d1 * Dn;
temp = Dn * (d1 + d2) - 2 * Df;
temp = fract * temp + (3 * Df - (d2 + 2 * d1) * Dn);
temp = fract * temp + d1 * Dn;
/* Calculation of cosine value */
*pCosVal = fract * temp + f1;
/* Calculation of cosine value */
*pCosVal = fract * temp + f1;
/* Read two nearest values of input value from the cos & sin tables */
f1 = sinTable_f32[indexS+0];
f2 = sinTable_f32[indexS+1];
d1 = sinTable_f32[indexC+0];
d2 = sinTable_f32[indexC+1];
/* Read two nearest values of input value from the cos & sin tables */
f1 = sinTable_f32[indexS ];
f2 = sinTable_f32[indexS+1];
d1 = sinTable_f32[indexC ];
d2 = sinTable_f32[indexC+1];
temp = (1.0f - fract) * f1 + fract * f2;
temp = (1.0f - fract) * f1 + fract * f2;
Df = f2 - f1; // delta between the values of the functions
temp = Dn*(d1 + d2) - 2*Df;
temp = fract*temp + (3*Df - (d2 + 2*d1)*Dn);
temp = fract*temp + d1*Dn;
Df = f2 - f1; // delta between the values of the functions
temp = Dn * (d1 + d2) - 2 * Df;
temp = fract * temp + (3 * Df - (d2 + 2 * d1) * Dn);
temp = fract * temp + d1 * Dn;
/* Calculation of sine value */
*pSinVal = fract*temp + f1;
/* Calculation of sine value */
*pSinVal = fract * temp + f1;
if (theta < 0.0f)
{
*pSinVal = -*pSinVal;
}
if (theta < 0.0f)
{
*pSinVal = -*pSinVal;
}
}
/**
* @} end of SinCos group
@} end of SinCos group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_sin_cos_q31.c
* Description: Cosine & Sine calculation for Q31 values
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -30,23 +30,22 @@
#include "arm_common_tables.h"
/**
* @ingroup groupController
*/
/**
* @addtogroup SinCos
* @{
@ingroup groupController
*/
/**
* @brief Q31 sin_cos function.
* @param[in] theta scaled input value in degrees
* @param[out] *pSinVal points to the processed sine output.
* @param[out] *pCosVal points to the processed cosine output.
* @return none.
*
* The Q31 input value is in the range [-1 0.999999] and is mapped to a degree value in the range [-180 179].
*
@addtogroup SinCos
@{
*/
/**
@brief Q31 sin_cos function.
@param[in] theta scaled input value in degrees
@param[out] pSinVal points to processed sine output
@param[out] pCosVal points to processed cosine output
@return none
The Q31 input value is in the range [-1 0.999999] and is mapped to a degree value in the range [-180 179].
*/
void arm_sin_cos_q31(
@ -54,9 +53,9 @@ void arm_sin_cos_q31(
q31_t * pSinVal,
q31_t * pCosVal)
{
q31_t fract; /* Temporary variables for input, output */
uint16_t indexS, indexC; /* Index variable */
q31_t f1, f2, d1, d2; /* Two nearest output values */
q31_t fract; /* Temporary input, output variables */
uint16_t indexS, indexC; /* Index variable */
q31_t f1, f2, d1, d2; /* Two nearest output values */
q31_t Dn, Df;
q63_t temp;
@ -68,43 +67,44 @@ void arm_sin_cos_q31(
fract = (theta - (indexS << CONTROLLER_Q31_SHIFT)) << 8;
/* Read two nearest values of input value from the cos & sin tables */
f1 = sinTable_q31[indexC+0];
f2 = sinTable_q31[indexC+1];
d1 = -sinTable_q31[indexS+0];
f1 = sinTable_q31[indexC ];
f2 = sinTable_q31[indexC+1];
d1 = -sinTable_q31[indexS ];
d2 = -sinTable_q31[indexS+1];
Dn = 0x1921FB5; // delta between the two points (fixed), in this case 2*pi/FAST_MATH_TABLE_SIZE
Df = f2 - f1; // delta between the values of the functions
temp = Dn*((q63_t)d1 + d2);
Dn = 0x1921FB5; /* delta between the two points (fixed), in this case 2*pi/FAST_MATH_TABLE_SIZE */
Df = f2 - f1; /* delta between the values of the functions */
temp = Dn * ((q63_t)d1 + d2);
temp = temp - ((q63_t)Df << 32);
temp = (q63_t)fract*(temp >> 31);
temp = temp + ((3*(q63_t)Df << 31) - (d2 + ((q63_t)d1 << 1))*Dn);
temp = (q63_t)fract*(temp >> 31);
temp = temp + (q63_t)d1*Dn;
temp = (q63_t)fract*(temp >> 31);
temp = (q63_t)fract * (temp >> 31);
temp = temp + ((3 * (q63_t)Df << 31) - (d2 + ((q63_t)d1 << 1)) * Dn);
temp = (q63_t)fract * (temp >> 31);
temp = temp + (q63_t)d1 * Dn;
temp = (q63_t)fract * (temp >> 31);
/* Calculation of cosine value */
*pCosVal = clip_q63_to_q31((temp >> 31) + (q63_t)f1);
/* Read two nearest values of input value from the cos & sin tables */
f1 = sinTable_q31[indexS+0];
f1 = sinTable_q31[indexS ];
f2 = sinTable_q31[indexS+1];
d1 = sinTable_q31[indexC+0];
d1 = sinTable_q31[indexC ];
d2 = sinTable_q31[indexC+1];
Df = f2 - f1; // delta between the values of the functions
temp = Dn*((q63_t)d1 + d2);
temp = Dn * ((q63_t)d1 + d2);
temp = temp - ((q63_t)Df << 32);
temp = (q63_t)fract*(temp >> 31);
temp = temp + ((3*(q63_t)Df << 31) - (d2 + ((q63_t)d1 << 1))*Dn);
temp = (q63_t)fract*(temp >> 31);
temp = temp + (q63_t)d1*Dn;
temp = (q63_t)fract*(temp >> 31);
temp = (q63_t)fract * (temp >> 31);
temp = temp + ((3 * (q63_t)Df << 31) - (d2 + ((q63_t)d1 << 1)) * Dn);
temp = (q63_t)fract * (temp >> 31);
temp = temp + (q63_t)d1 * Dn;
temp = (q63_t)fract * (temp >> 31);
/* Calculation of sine value */
*pSinVal = clip_q63_to_q31((temp >> 31) + (q63_t)f1);
}
/**
* @} end of SinCos group
@} end of SinCos group
*/

View file

@ -0,0 +1,51 @@
cmake_minimum_required (VERSION 3.6)
project(CMSISDSPFastMath)
file(GLOB SRC "./*_*.c")
add_library(CMSISDSPFastMath STATIC)
include(interpol)
interpol(CMSISDSPFastMath)
if (CONFIGTABLE AND ALLFAST)
target_compile_definitions(CMSISDSPFastMath PUBLIC ARM_ALL_FAST_TABLES)
endif()
if (NOT CONFIGTABLE OR ALLFAST OR ARM_COS_F32)
target_sources(CMSISDSPFastMath PRIVATE arm_cos_f32.c)
endif()
if (NOT CONFIGTABLE OR ALLFAST OR ARM_COS_Q15)
target_sources(CMSISDSPFastMath PRIVATE arm_cos_q15.c)
endif()
if (NOT CONFIGTABLE OR ALLFAST OR ARM_COS_Q31)
target_sources(CMSISDSPFastMath PRIVATE arm_cos_q31.c)
endif()
if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_F32)
target_sources(CMSISDSPFastMath PRIVATE arm_sin_f32.c)
endif()
if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_Q15)
target_sources(CMSISDSPFastMath PRIVATE arm_sin_q15.c)
endif()
if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_Q31)
target_sources(CMSISDSPFastMath PRIVATE arm_sin_q31.c)
endif()
target_sources(CMSISDSPFastMath PRIVATE arm_sqrt_q15.c)
target_sources(CMSISDSPFastMath PRIVATE arm_sqrt_q31.c)
configdsp(CMSISDSPFastMath ..)
### Includes
target_include_directories(CMSISDSPFastMath PUBLIC "${DSP}/../../Include")

View file

@ -0,0 +1,37 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: FastMathFunctions.c
* Description: Combination of all fast math function source files.
*
* $Date: 18. March 2019
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_cos_f32.c"
#include "arm_cos_q15.c"
#include "arm_cos_q31.c"
#include "arm_sin_f32.c"
#include "arm_sin_q15.c"
#include "arm_sin_q31.c"
#include "arm_sqrt_q15.c"
#include "arm_sqrt_q31.c"

View file

@ -3,13 +3,13 @@
* Title: arm_cos_f32.c
* Description: Fast cosine calculation for floating-point values
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -28,56 +28,57 @@
#include "arm_math.h"
#include "arm_common_tables.h"
/**
* @ingroup groupFastMath
@ingroup groupFastMath
*/
/**
* @defgroup cos Cosine
*
* Computes the trigonometric cosine function using a combination of table lookup
* and linear interpolation. There are separate functions for
* Q15, Q31, and floating-point data types.
* The input to the floating-point version is in radians and in the range [0 2*pi) while the
* fixed-point Q15 and Q31 have a scaled input with the range
* [0 +0.9999] mapping to [0 2*pi). The fixed-point range is chosen so that a
* value of 2*pi wraps around to 0.
*
* The implementation is based on table lookup using 256 values together with linear interpolation.
* The steps used are:
* -# Calculation of the nearest integer table index
* -# Compute the fractional portion (fract) of the table index.
* -# The final result equals <code>(1.0f-fract)*a + fract*b;</code>
*
* where
* <pre>
* b=Table[index+0];
* c=Table[index+1];
* </pre>
*/
@defgroup cos Cosine
/**
* @addtogroup cos
* @{
Computes the trigonometric cosine function using a combination of table lookup
and linear interpolation. There are separate functions for
Q15, Q31, and floating-point data types.
The input to the floating-point version is in radians while the
fixed-point Q15 and Q31 have a scaled input with the range
[0 +0.9999] mapping to [0 2*pi). The fixed-point range is chosen so that a
value of 2*pi wraps around to 0.
The implementation is based on table lookup using 256 values together with linear interpolation.
The steps used are:
-# Calculation of the nearest integer table index
-# Compute the fractional portion (fract) of the table index.
-# The final result equals <code>(1.0f-fract)*a + fract*b;</code>
where
<pre>
b = Table[index];
c = Table[index+1];
</pre>
*/
/**
* @brief Fast approximation to the trigonometric cosine function for floating-point data.
* @param[in] x input value in radians.
* @return cos(x).
@addtogroup cos
@{
*/
/**
@brief Fast approximation to the trigonometric cosine function for floating-point data.
@param[in] x input value in radians
@return cos(x)
*/
float32_t arm_cos_f32(
float32_t x)
{
float32_t cosVal, fract, in; /* Temporary variables for input, output */
float32_t cosVal, fract, in; /* Temporary input, output variables */
uint16_t index; /* Index variable */
float32_t a, b; /* Two nearest output values */
int32_t n;
float32_t findex;
/* input x is in radians */
/* Scale the input to [0 1] range from [0 2*PI] , divide input by 2*pi, add 0.25 (pi/2) to read sine table */
/* Scale input to [0 1] range from [0 2*PI] , divide input by 2*pi, add 0.25 (pi/2) to read sine table */
in = x * 0.159154943092f + 0.25f;
/* Calculation of floor value of input */
@ -93,8 +94,14 @@ float32_t arm_cos_f32(
in = in - (float32_t) n;
/* Calculation of index of the table */
findex = (float32_t) FAST_MATH_TABLE_SIZE * in;
index = ((uint16_t)findex) & 0x1ff;
findex = (float32_t)FAST_MATH_TABLE_SIZE * in;
index = (uint16_t)findex;
/* when "in" is exactly 1, we need to rotate the index down to 0 */
if (index >= FAST_MATH_TABLE_SIZE) {
index = 0;
findex -= (float32_t)FAST_MATH_TABLE_SIZE;
}
/* fractional value calculation */
fract = findex - (float32_t) index;
@ -104,12 +111,12 @@ float32_t arm_cos_f32(
b = sinTable_f32[index+1];
/* Linear interpolation process */
cosVal = (1.0f-fract)*a + fract*b;
cosVal = (1.0f - fract) * a + fract * b;
/* Return the output value */
/* Return output value */
return (cosVal);
}
/**
* @} end of cos group
@} end of cos group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cos_q15.c
* Description: Fast cosine calculation for Q15 values
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -30,36 +30,35 @@
#include "arm_common_tables.h"
/**
* @ingroup groupFastMath
*/
/**
* @addtogroup cos
* @{
@ingroup groupFastMath
*/
/**
* @brief Fast approximation to the trigonometric cosine function for Q15 data.
* @param[in] x Scaled input value in radians.
* @return cos(x).
*
* The Q15 input value is in the range [0 +0.9999] and is mapped to a radian
* value in the range [0 2*pi).
@addtogroup cos
@{
*/
/**
@brief Fast approximation to the trigonometric cosine function for Q15 data.
@param[in] x Scaled input value in radians
@return cos(x)
The Q15 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*PI).
*/
q15_t arm_cos_q15(
q15_t x)
{
q15_t cosVal; /* Temporary variables for input, output */
int32_t index; /* Index variables */
q15_t a, b; /* Four nearest output values */
q15_t cosVal; /* Temporary input, output variables */
int32_t index; /* Index variable */
q15_t a, b; /* Two nearest output values */
q15_t fract; /* Temporary values for fractional values */
/* add 0.25 (pi/2) to read sine table */
x = (uint16_t)x + 0x2000;
if (x < 0)
{ /* convert negative numbers to corresponding positive ones */
x = (uint16_t)x + 0x8000;
{ /* convert negative numbers to corresponding positive ones */
x = (uint16_t)x + 0x8000;
}
/* Calculate the nearest index */
@ -73,12 +72,13 @@ q15_t arm_cos_q15(
b = sinTable_q15[index+1];
/* Linear interpolation process */
cosVal = (q31_t)(0x8000-fract)*a >> 16;
cosVal = (q15_t)((((q31_t)cosVal << 16) + ((q31_t)fract*b)) >> 16);
cosVal = (q31_t) (0x8000 - fract) * a >> 16;
cosVal = (q15_t) ((((q31_t) cosVal << 16) + ((q31_t) fract * b)) >> 16);
return cosVal << 1;
/* Return output value */
return (cosVal << 1);
}
/**
* @} end of cos group
@} end of cos group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cos_q31.c
* Description: Fast cosine calculation for Q31 values
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -30,36 +30,35 @@
#include "arm_common_tables.h"
/**
* @ingroup groupFastMath
*/
/**
* @addtogroup cos
* @{
@ingroup groupFastMath
*/
/**
* @brief Fast approximation to the trigonometric cosine function for Q31 data.
* @param[in] x Scaled input value in radians.
* @return cos(x).
*
* The Q31 input value is in the range [0 +0.9999] and is mapped to a radian
* value in the range [0 2*pi).
@addtogroup cos
@{
*/
/**
@brief Fast approximation to the trigonometric cosine function for Q31 data.
@param[in] x Scaled input value in radians
@return cos(x)
The Q31 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*PI).
*/
q31_t arm_cos_q31(
q31_t x)
{
q31_t cosVal; /* Temporary variables for input, output */
int32_t index; /* Index variables */
q31_t a, b; /* Four nearest output values */
q31_t cosVal; /* Temporary input, output variables */
int32_t index; /* Index variable */
q31_t a, b; /* Two nearest output values */
q31_t fract; /* Temporary values for fractional values */
/* add 0.25 (pi/2) to read sine table */
x = (uint32_t)x + 0x20000000;
if (x < 0)
{ /* convert negative numbers to corresponding positive ones */
x = (uint32_t)x + 0x80000000;
{ /* convert negative numbers to corresponding positive ones */
x = (uint32_t)x + 0x80000000;
}
/* Calculate the nearest index */
@ -73,12 +72,13 @@ q31_t arm_cos_q31(
b = sinTable_q31[index+1];
/* Linear interpolation process */
cosVal = (q63_t)(0x80000000-fract)*a >> 32;
cosVal = (q31_t)((((q63_t)cosVal << 32) + ((q63_t)fract*b)) >> 32);
cosVal = (q63_t) (0x80000000 - fract) * a >> 32;
cosVal = (q31_t) ((((q63_t) cosVal << 32) + ((q63_t) fract * b)) >> 32);
return cosVal << 1;
/* Return output value */
return (cosVal << 1);
}
/**
* @} end of cos group
@} end of cos group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_sin_f32.c
* Description: Fast sine calculation for floating-point values
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -28,70 +28,64 @@
#include "arm_math.h"
#include "arm_common_tables.h"
#include <math.h>
/**
* @ingroup groupFastMath
@ingroup groupFastMath
*/
/**
* @defgroup sin Sine
*
* Computes the trigonometric sine function using a combination of table lookup
* and linear interpolation. There are separate functions for
* Q15, Q31, and floating-point data types.
* The input to the floating-point version is in radians and in the range [0 2*pi) while the
* fixed-point Q15 and Q31 have a scaled input with the range
* [0 +0.9999] mapping to [0 2*pi). The fixed-point range is chosen so that a
* value of 2*pi wraps around to 0.
*
* The implementation is based on table lookup using 256 values together with linear interpolation.
* The steps used are:
* -# Calculation of the nearest integer table index
* -# Compute the fractional portion (fract) of the table index.
* -# The final result equals <code>(1.0f-fract)*a + fract*b;</code>
*
* where
* <pre>
* b=Table[index+0];
* c=Table[index+1];
* </pre>
@defgroup sin Sine
Computes the trigonometric sine function using a combination of table lookup
and linear interpolation. There are separate functions for
Q15, Q31, and floating-point data types.
The input to the floating-point version is in radians while the
fixed-point Q15 and Q31 have a scaled input with the range
[0 +0.9999] mapping to [0 2*pi). The fixed-point range is chosen so that a
value of 2*pi wraps around to 0.
The implementation is based on table lookup using 256 values together with linear interpolation.
The steps used are:
-# Calculation of the nearest integer table index
-# Compute the fractional portion (fract) of the table index.
-# The final result equals <code>(1.0f-fract)*a + fract*b;</code>
where
<pre>
b = Table[index];
c = Table[index+1];
</pre>
*/
/**
* @addtogroup sin
* @{
@addtogroup sin
@{
*/
/**
* @brief Fast approximation to the trigonometric sine function for floating-point data.
* @param[in] x input value in radians.
* @return sin(x).
@brief Fast approximation to the trigonometric sine function for floating-point data.
@param[in] x input value in radians.
@return sin(x)
*/
float32_t arm_sin_f32(
float32_t x)
{
float32_t sinVal, fract, in; /* Temporary variables for input, output */
uint16_t index; /* Index variable */
float32_t a, b; /* Two nearest output values */
float32_t sinVal, fract, in; /* Temporary input, output variables */
uint16_t index; /* Index variable */
float32_t a, b; /* Two nearest output values */
int32_t n;
float32_t findex;
/* Special case for small negative inputs */
if ((x < 0.0f) && (x >= -1.9e-7f)) {
return x;
}
/* input x is in radians */
/* Scale the input to [0 1] range from [0 2*PI] , divide input by 2*pi */
/* Scale input to [0 1] range from [0 2*PI] , divide input by 2*pi */
in = x * 0.159154943092f;
/* Calculation of floor value of input */
n = (int32_t) in;
/* Make negative values towards -infinity */
if (x < 0.0f)
if (in < 0.0f)
{
n--;
}
@ -100,9 +94,14 @@ float32_t arm_sin_f32(
in = in - (float32_t) n;
/* Calculation of index of the table */
findex = (float32_t) FAST_MATH_TABLE_SIZE * in;
findex = (float32_t)FAST_MATH_TABLE_SIZE * in;
index = (uint16_t)findex;
index = ((uint16_t)findex) & 0x1ff;
/* when "in" is exactly 1, we need to rotate the index down to 0 */
if (index >= FAST_MATH_TABLE_SIZE) {
index = 0;
findex -= (float32_t)FAST_MATH_TABLE_SIZE;
}
/* fractional value calculation */
fract = findex - (float32_t) index;
@ -112,12 +111,12 @@ float32_t arm_sin_f32(
b = sinTable_f32[index+1];
/* Linear interpolation process */
sinVal = (1.0f-fract)*a + fract*b;
sinVal = (1.0f - fract) * a + fract * b;
/* Return the output value */
/* Return output value */
return (sinVal);
}
/**
* @} end of sin group
@} end of sin group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_sin_q15.c
* Description: Fast sine calculation for Q15 values
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -30,28 +30,28 @@
#include "arm_common_tables.h"
/**
* @ingroup groupFastMath
*/
/**
* @addtogroup sin
* @{
@ingroup groupFastMath
*/
/**
* @brief Fast approximation to the trigonometric sine function for Q15 data.
* @param[in] x Scaled input value in radians.
* @return sin(x).
*
* The Q15 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*pi).
@addtogroup sin
@{
*/
/**
@brief Fast approximation to the trigonometric sine function for Q15 data.
@param[in] x Scaled input value in radians
@return sin(x)
The Q15 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*PI).
*/
q15_t arm_sin_q15(
q15_t x)
{
q15_t sinVal; /* Temporary variables for input, output */
int32_t index; /* Index variables */
q15_t a, b; /* Four nearest output values */
q15_t sinVal; /* Temporary input, output variables */
int32_t index; /* Index variable */
q15_t a, b; /* Two nearest output values */
q15_t fract; /* Temporary values for fractional values */
/* Calculate the nearest index */
@ -65,12 +65,13 @@ q15_t arm_sin_q15(
b = sinTable_q15[index+1];
/* Linear interpolation process */
sinVal = (q31_t)(0x8000-fract)*a >> 16;
sinVal = (q15_t)((((q31_t)sinVal << 16) + ((q31_t)fract*b)) >> 16);
sinVal = (q31_t) (0x8000 - fract) * a >> 16;
sinVal = (q15_t) ((((q31_t) sinVal << 16) + ((q31_t) fract * b)) >> 16);
return sinVal << 1;
/* Return output value */
return (sinVal << 1);
}
/**
* @} end of sin group
@} end of sin group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_sin_q31.c
* Description: Fast sine calculation for Q31 values
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -30,27 +30,28 @@
#include "arm_common_tables.h"
/**
* @ingroup groupFastMath
*/
/**
* @addtogroup sin
* @{
@ingroup groupFastMath
*/
/**
* @brief Fast approximation to the trigonometric sine function for Q31 data.
* @param[in] x Scaled input value in radians.
* @return sin(x).
*
* The Q31 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*pi). */
@addtogroup sin
@{
*/
/**
@brief Fast approximation to the trigonometric sine function for Q31 data.
@param[in] x Scaled input value in radians
@return sin(x)
The Q31 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*PI).
*/
q31_t arm_sin_q31(
q31_t x)
{
q31_t sinVal; /* Temporary variables for input, output */
int32_t index; /* Index variables */
q31_t a, b; /* Four nearest output values */
int32_t index; /* Index variable */
q31_t a, b; /* Two nearest output values */
q31_t fract; /* Temporary values for fractional values */
/* Calculate the nearest index */
@ -64,12 +65,13 @@ q31_t arm_sin_q31(
b = sinTable_q31[index+1];
/* Linear interpolation process */
sinVal = (q63_t)(0x80000000-fract)*a >> 32;
sinVal = (q31_t)((((q63_t)sinVal << 32) + ((q63_t)fract*b)) >> 32);
sinVal = (q63_t) (0x80000000 - fract) * a >> 32;
sinVal = (q31_t) ((((q63_t) sinVal << 32) + ((q63_t) fract * b)) >> 32);
return sinVal << 1;
/* Return output value */
return (sinVal << 1);
}
/**
* @} end of sin group
@} end of sin group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_sqrt_q15.c
* Description: Q15 square root function
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,31 +29,30 @@
#include "arm_math.h"
#include "arm_common_tables.h"
/**
* @ingroup groupFastMath
@ingroup groupFastMath
*/
/**
* @addtogroup SQRT
* @{
@addtogroup SQRT
@{
*/
/**
* @brief Q15 square root function.
* @param[in] in input value. The range of the input value is [0 +1) or 0x0000 to 0x7FFF.
* @param[out] *pOut square root of input value.
* @return The function returns ARM_MATH_SUCCESS if the input value is positive
* and ARM_MATH_ARGUMENT_ERROR if the input is negative. For
* negative inputs, the function returns *pOut = 0.
*/
/**
@brief Q15 square root function.
@param[in] in input value. The range of the input value is [0 +1) or 0x0000 to 0x7FFF
@param[out] pOut points to square root of input value
@return execution status
- \ref ARM_MATH_SUCCESS : input value is positive
- \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
*/
arm_status arm_sqrt_q15(
q15_t in,
q15_t * pOut)
{
q15_t number, temp1, var1, signBits1, half;
q31_t bits_val1;
q15_t number, temp1, var1, signBits1, half;
float32_t temp_float1;
union
{
@ -85,7 +84,7 @@ arm_status arm_sqrt_q15(
/* Convert to float */
temp_float1 = number * 3.051757812500000e-005f;
/*Store as integer */
/* Store as integer */
tempconv.floatval = temp_float1;
bits_val1 = tempconv.fracval;
/* Subtract the shifted value from the magic number to give intial guess */
@ -135,10 +134,11 @@ arm_status arm_sqrt_q15(
else
{
*pOut = 0;
return (ARM_MATH_ARGUMENT_ERROR);
}
}
/**
* @} end of SQRT group
@} end of SQRT group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_sqrt_q31.c
* Description: Q31 square root function
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -30,33 +30,34 @@
#include "arm_common_tables.h"
/**
* @ingroup groupFastMath
@ingroup groupFastMath
*/
/**
* @addtogroup SQRT
* @{
@addtogroup SQRT
@{
*/
/**
* @brief Q31 square root function.
* @param[in] in input value. The range of the input value is [0 +1) or 0x00000000 to 0x7FFFFFFF.
* @param[out] *pOut square root of input value.
* @return The function returns ARM_MATH_SUCCESS if the input value is positive
* and ARM_MATH_ARGUMENT_ERROR if the input is negative. For
* negative inputs, the function returns *pOut = 0.
@brief Q31 square root function.
@param[in] in input value. The range of the input value is [0 +1) or 0x00000000 to 0x7FFFFFFF
@param[out] pOut points to square root of input value
@return execution status
- \ref ARM_MATH_SUCCESS : input value is positive
- \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
*/
arm_status arm_sqrt_q31(
q31_t in,
q31_t * pOut)
{
q31_t number, temp1, bits_val1, var1, signBits1, half;
q31_t bits_val1;
q31_t number, temp1, var1, signBits1, half;
float32_t temp_float1;
union
{
q31_t fracval;
float32_t floatval;
q31_t fracval;
float32_t floatval;
} tempconv;
number = in;
@ -81,9 +82,9 @@ arm_status arm_sqrt_q31(
/* Store the number for later use */
temp1 = number;
/*Convert to float */
/* Convert to float */
temp_float1 = number * 4.6566128731e-010f;
/*Store as integer */
/* Store as integer */
tempconv.floatval = temp_float1;
bits_val1 = tempconv.fracval;
/* Subtract the shifted value from the magic number to give intial guess */
@ -133,10 +134,11 @@ arm_status arm_sqrt_q31(
else
{
*pOut = 0;
return (ARM_MATH_ARGUMENT_ERROR);
}
}
/**
* @} end of SQRT group
@} end of SQRT group
*/

View file

@ -0,0 +1,128 @@
cmake_minimum_required (VERSION 3.6)
project(CMSISDSPFiltering)
add_library(CMSISDSPFiltering STATIC)
include(interpol)
interpol(CMSISDSPFiltering)
configdsp(CMSISDSPFiltering ..)
if (CONFIGTABLE AND ALLFAST)
target_compile_definitions(CMSISDSPFiltering PUBLIC ARM_ALL_FAST_TABLES)
endif()
if (NOT CONFIGTABLE OR ALLFAST OR ARM_LMS_NORM_Q31)
target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_init_q31.c)
endif()
if (NOT CONFIGTABLE OR ALLFAST OR ARM_LMS_NORM_Q15)
target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_init_q15.c)
endif()
target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_32x64_init_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_32x64_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_fast_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_fast_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_init_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_init_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_init_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df2T_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df2T_f64.c)
target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df2T_init_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df2T_init_f64.c)
target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_stereo_df2T_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_stereo_df2T_init_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_fast_opt_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_fast_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_fast_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_opt_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_opt_q7.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_fast_opt_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_fast_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_fast_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_opt_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_opt_q7.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_q7.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_conv_q7.c)
target_sources(CMSISDSPFiltering PRIVATE arm_correlate_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_correlate_fast_opt_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_correlate_fast_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_correlate_fast_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_correlate_opt_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_correlate_opt_q7.c)
target_sources(CMSISDSPFiltering PRIVATE arm_correlate_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_correlate_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_correlate_q7.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_fast_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_fast_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_init_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_init_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_init_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_fast_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_fast_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_init_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_init_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_init_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_init_q7.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_init_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_init_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_init_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_init_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_init_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_init_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_q7.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_init_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_init_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_init_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_init_q7.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_q7.c)
target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_init_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_init_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_init_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_lms_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_lms_init_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_lms_init_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_lms_init_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_init_f32.c)
target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_q31.c)
target_sources(CMSISDSPFiltering PRIVATE arm_lms_q15.c)
target_sources(CMSISDSPFiltering PRIVATE arm_lms_q31.c)
### Includes
target_include_directories(CMSISDSPFiltering PUBLIC "${DSP}/../../Include")

View file

@ -0,0 +1,127 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: FilteringFunctions.c
* Description: Combination of all filtering function source files.
*
* $Date: 18. March 2019
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_biquad_cascade_df1_32x64_init_q31.c"
#include "arm_biquad_cascade_df1_32x64_q31.c"
#include "arm_biquad_cascade_df1_f32.c"
#include "arm_biquad_cascade_df1_fast_q15.c"
#include "arm_biquad_cascade_df1_fast_q31.c"
#include "arm_biquad_cascade_df1_init_f32.c"
#include "arm_biquad_cascade_df1_init_q15.c"
#include "arm_biquad_cascade_df1_init_q31.c"
#include "arm_biquad_cascade_df1_q15.c"
#include "arm_biquad_cascade_df1_q31.c"
#include "arm_biquad_cascade_df2T_f32.c"
#include "arm_biquad_cascade_df2T_f64.c"
#include "arm_biquad_cascade_df2T_init_f32.c"
#include "arm_biquad_cascade_df2T_init_f64.c"
#include "arm_biquad_cascade_stereo_df2T_f32.c"
#include "arm_biquad_cascade_stereo_df2T_init_f32.c"
#include "arm_conv_f32.c"
#include "arm_conv_fast_opt_q15.c"
#include "arm_conv_fast_q15.c"
#include "arm_conv_fast_q31.c"
#include "arm_conv_opt_q15.c"
#include "arm_conv_opt_q7.c"
#include "arm_conv_partial_f32.c"
#include "arm_conv_partial_fast_opt_q15.c"
#include "arm_conv_partial_fast_q15.c"
#include "arm_conv_partial_fast_q31.c"
#include "arm_conv_partial_opt_q15.c"
#include "arm_conv_partial_opt_q7.c"
#include "arm_conv_partial_q15.c"
#include "arm_conv_partial_q31.c"
#include "arm_conv_partial_q7.c"
#include "arm_conv_q15.c"
#include "arm_conv_q31.c"
#include "arm_conv_q7.c"
#include "arm_correlate_f32.c"
#include "arm_correlate_fast_opt_q15.c"
#include "arm_correlate_fast_q15.c"
#include "arm_correlate_fast_q31.c"
#include "arm_correlate_opt_q15.c"
#include "arm_correlate_opt_q7.c"
#include "arm_correlate_q15.c"
#include "arm_correlate_q31.c"
#include "arm_correlate_q7.c"
#include "arm_fir_decimate_f32.c"
#include "arm_fir_decimate_fast_q15.c"
#include "arm_fir_decimate_fast_q31.c"
#include "arm_fir_decimate_init_f32.c"
#include "arm_fir_decimate_init_q15.c"
#include "arm_fir_decimate_init_q31.c"
#include "arm_fir_decimate_q15.c"
#include "arm_fir_decimate_q31.c"
#include "arm_fir_f32.c"
#include "arm_fir_fast_q15.c"
#include "arm_fir_fast_q31.c"
#include "arm_fir_init_f32.c"
#include "arm_fir_init_q15.c"
#include "arm_fir_init_q31.c"
#include "arm_fir_init_q7.c"
#include "arm_fir_interpolate_f32.c"
#include "arm_fir_interpolate_init_f32.c"
#include "arm_fir_interpolate_init_q15.c"
#include "arm_fir_interpolate_init_q31.c"
#include "arm_fir_interpolate_q15.c"
#include "arm_fir_interpolate_q31.c"
#include "arm_fir_lattice_f32.c"
#include "arm_fir_lattice_init_f32.c"
#include "arm_fir_lattice_init_q15.c"
#include "arm_fir_lattice_init_q31.c"
#include "arm_fir_lattice_q15.c"
#include "arm_fir_lattice_q31.c"
#include "arm_fir_q15.c"
#include "arm_fir_q31.c"
#include "arm_fir_q7.c"
#include "arm_fir_sparse_f32.c"
#include "arm_fir_sparse_init_f32.c"
#include "arm_fir_sparse_init_q15.c"
#include "arm_fir_sparse_init_q31.c"
#include "arm_fir_sparse_init_q7.c"
#include "arm_fir_sparse_q15.c"
#include "arm_fir_sparse_q31.c"
#include "arm_fir_sparse_q7.c"
#include "arm_iir_lattice_f32.c"
#include "arm_iir_lattice_init_f32.c"
#include "arm_iir_lattice_init_q15.c"
#include "arm_iir_lattice_init_q31.c"
#include "arm_iir_lattice_q15.c"
#include "arm_iir_lattice_q31.c"
#include "arm_lms_f32.c"
#include "arm_lms_init_f32.c"
#include "arm_lms_init_q15.c"
#include "arm_lms_init_q31.c"
#include "arm_lms_norm_f32.c"
#include "arm_lms_norm_init_f32.c"
#include "arm_lms_norm_init_q15.c"
#include "arm_lms_norm_init_q31.c"
#include "arm_lms_norm_q15.c"
#include "arm_lms_norm_q31.c"
#include "arm_lms_q15.c"
#include "arm_lms_q31.c"

View file

@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_32x64_init_q31.c
* Description: High precision Q31 Biquad cascade filter initialization function
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,53 +29,49 @@
#include "arm_math.h"
/**
* @ingroup groupFilters
@ingroup groupFilters
*/
/**
* @addtogroup BiquadCascadeDF1_32x64
* @{
@addtogroup BiquadCascadeDF1_32x64
@{
*/
/**
* @details
*
* @param[in,out] *S points to an instance of the high precision Q31 Biquad cascade filter structure.
* @param[in] numStages number of 2nd order stages in the filter.
* @param[in] *pCoeffs points to the filter coefficients.
* @param[in] *pState points to the state buffer.
* @param[in] postShift Shift to be applied after the accumulator. Varies according to the coefficients format.
* @return none
*
* <b>Coefficient and State Ordering:</b>
*
* \par
* The coefficients are stored in the array <code>pCoeffs</code> in the following order:
* <pre>
* {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
* </pre>
* where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
* <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
* and so on. The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
*
* \par
* The <code>pState</code> points to state variables array and size of each state variable is 1.63 format.
* Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
* The state variables are arranged in the state array as:
* <pre>
* {x[n-1], x[n-2], y[n-1], y[n-2]}
* </pre>
* The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
* The state array has a total length of <code>4*numStages</code> values.
* The state variables are updated after each block of data is processed; the coefficients are untouched.
@brief Initialization function for the Q31 Biquad cascade 32x64 filter.
@param[in,out] S points to an instance of the high precision Q31 Biquad cascade filter structure
@param[in] numStages number of 2nd order stages in the filter
@param[in] pCoeffs points to the filter coefficients
@param[in] pState points to the state buffer
@param[in] postShift Shift to be applied after the accumulator. Varies according to the coefficients format
@return none
@par Coefficient and State Ordering
The coefficients are stored in the array <code>pCoeffs</code> in the following order:
<pre>
{b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
</pre>
where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
<code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
and so on. The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
@par
The <code>pState</code> points to state variables array and size of each state variable is 1.63 format.
Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
The state variables are arranged in the state array as:
<pre>
{x[n-1], x[n-2], y[n-1], y[n-2]}
</pre>
The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
The state array has a total length of <code>4*numStages</code> values.
The state variables are updated after each block of data is processed; the coefficients are untouched.
*/
void arm_biquad_cas_df1_32x64_init_q31(
arm_biquad_cas_df1_32x64_ins_q31 * S,
uint8_t numStages,
q31_t * pCoeffs,
q63_t * pState,
uint8_t postShift)
arm_biquad_cas_df1_32x64_ins_q31 * S,
uint8_t numStages,
const q31_t * pCoeffs,
q63_t * pState,
uint8_t postShift)
{
/* Assign filter stages */
S->numStages = numStages;
@ -94,5 +90,5 @@ void arm_biquad_cas_df1_32x64_init_q31(
}
/**
* @} end of BiquadCascadeDF1_32x64 group
@} end of BiquadCascadeDF1_32x64 group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_32x64_q31.c
* Description: High precision Q31 Biquad cascade filter processing function
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,174 +29,169 @@
#include "arm_math.h"
/**
* @ingroup groupFilters
@ingroup groupFilters
*/
/**
* @defgroup BiquadCascadeDF1_32x64 High Precision Q31 Biquad Cascade Filter
*
* This function implements a high precision Biquad cascade filter which operates on
* Q31 data values. The filter coefficients are in 1.31 format and the state variables
* are in 1.63 format. The double precision state variables reduce quantization noise
* in the filter and provide a cleaner output.
* These filters are particularly useful when implementing filters in which the
* singularities are close to the unit circle. This is common for low pass or high
* pass filters with very low cutoff frequencies.
*
* The function operates on blocks of input and output data
* and each call to the function processes <code>blockSize</code> samples through
* the filter. <code>pSrc</code> and <code>pDst</code> points to input and output arrays
* containing <code>blockSize</code> Q31 values.
*
* \par Algorithm
* Each Biquad stage implements a second order filter using the difference equation:
* <pre>
* y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* </pre>
* A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
* \image html Biquad.gif "Single Biquad filter stage"
* Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
* Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
* Pay careful attention to the sign of the feedback coefficients.
* Some design tools use the difference equation
* <pre>
* y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]
* </pre>
* In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
*
* \par
* Higher order filters are realized as a cascade of second order sections.
* <code>numStages</code> refers to the number of second order stages used.
* For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
* \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
* A 9th order filter would be realized with <code>numStages=5</code> second order stages with the coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
*
* \par
* The <code>pState</code> points to state variables array .
* Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code> and each state variable in 1.63 format to improve precision.
* The state variables are arranged in the array as:
* <pre>
* {x[n-1], x[n-2], y[n-1], y[n-2]}
* </pre>
*
* \par
* The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
* The state array has a total length of <code>4*numStages</code> values of data in 1.63 format.
* The state variables are updated after each block of data is processed; the coefficients are untouched.
*
* \par Instance Structure
* The coefficients and state variables for a filter are stored together in an instance data structure.
* A separate instance structure must be defined for each filter.
* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
*
* \par Init Function
* There is also an associated initialization function which performs the following operations:
* - Sets the values of the internal structure fields.
* - Zeros out the values in the state buffer.
* To do this manually without calling the init function, assign the follow subfields of the instance structure:
* numStages, pCoeffs, postShift, pState. Also set all of the values in pState to zero.
*
* \par
* Use of the initialization function is optional.
* However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
* To place an instance structure into a const data section, the instance structure must be manually initialized.
* Set the values in the state buffer to zeros before static initialization.
* For example, to statically initialize the filter instance structure use
* <pre>
* arm_biquad_cas_df1_32x64_ins_q31 S1 = {numStages, pState, pCoeffs, postShift};
* </pre>
* where <code>numStages</code> is the number of Biquad stages in the filter; <code>pState</code> is the address of the state buffer;
* <code>pCoeffs</code> is the address of the coefficient buffer; <code>postShift</code> shift to be applied which is described in detail below.
* \par Fixed-Point Behavior
* Care must be taken while using Biquad Cascade 32x64 filter function.
* Following issues must be considered:
* - Scaling of coefficients
* - Filter gain
* - Overflow and saturation
*
* \par
* Filter coefficients are represented as fractional values and
* restricted to lie in the range <code>[-1 +1)</code>.
* The processing function has an additional scaling parameter <code>postShift</code>
* which allows the filter coefficients to exceed the range <code>[+1 -1)</code>.
* At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
* \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
* This essentially scales the filter coefficients by <code>2^postShift</code>.
* For example, to realize the coefficients
* <pre>
* {1.5, -0.8, 1.2, 1.6, -0.9}
* </pre>
* set the Coefficient array to:
* <pre>
* {0.75, -0.4, 0.6, 0.8, -0.45}
* </pre>
* and set <code>postShift=1</code>
*
* \par
* The second thing to keep in mind is the gain through the filter.
* The frequency response of a Biquad filter is a function of its coefficients.
* It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies.
* This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter.
* To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed.
*
* \par
* The third item to consider is the overflow and saturation behavior of the fixed-point Q31 version.
* This is described in the function specific documentation below.
@defgroup BiquadCascadeDF1_32x64 High Precision Q31 Biquad Cascade Filter
This function implements a high precision Biquad cascade filter which operates on
Q31 data values. The filter coefficients are in 1.31 format and the state variables
are in 1.63 format. The double precision state variables reduce quantization noise
in the filter and provide a cleaner output.
These filters are particularly useful when implementing filters in which the
singularities are close to the unit circle. This is common for low pass or high
pass filters with very low cutoff frequencies.
The function operates on blocks of input and output data
and each call to the function processes <code>blockSize</code> samples through
the filter. <code>pSrc</code> and <code>pDst</code> points to input and output arrays
containing <code>blockSize</code> Q31 values.
@par Algorithm
Each Biquad stage implements a second order filter using the difference equation:
<pre>
y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
</pre>
A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
\image html Biquad.gif "Single Biquad filter stage"
Coefficients <code>b0, b1 and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
Pay careful attention to the sign of the feedback coefficients.
Some design tools use the difference equation
<pre>
y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]
</pre>
In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
@par
Higher order filters are realized as a cascade of second order sections.
<code>numStages</code> refers to the number of second order stages used.
For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
\image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
A 9th order filter would be realized with <code>numStages=5</code> second order stages
with the coefficients for one of the stages configured as a first order filter
(<code>b2=0</code> and <code>a2=0</code>).
@par
The <code>pState</code> points to state variables array.
Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code> and each state variable in 1.63 format to improve precision.
The state variables are arranged in the array as:
<pre>
{x[n-1], x[n-2], y[n-1], y[n-2]}
</pre>
@par
The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
The state array has a total length of <code>4*numStages</code> values of data in 1.63 format.
The state variables are updated after each block of data is processed, the coefficients are untouched.
@par Instance Structure
The coefficients and state variables for a filter are stored together in an instance data structure.
A separate instance structure must be defined for each filter.
Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
@par Init Function
There is also an associated initialization function which performs the following operations:
- Sets the values of the internal structure fields.
- Zeros out the values in the state buffer.
To do this manually without calling the init function, assign the follow subfields of the instance structure:
numStages, pCoeffs, postShift, pState. Also set all of the values in pState to zero.
@par
Use of the initialization function is optional.
However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
To place an instance structure into a const data section, the instance structure must be manually initialized.
Set the values in the state buffer to zeros before static initialization.
For example, to statically initialize the filter instance structure use
<pre>
arm_biquad_cas_df1_32x64_ins_q31 S1 = {numStages, pState, pCoeffs, postShift};
</pre>
where <code>numStages</code> is the number of Biquad stages in the filter;
<code>pState</code> is the address of the state buffer;
<code>pCoeffs</code> is the address of the coefficient buffer;
<code>postShift</code> shift to be applied which is described in detail below.
@par Fixed-Point Behavior
Care must be taken while using Biquad Cascade 32x64 filter function.
Following issues must be considered:
- Scaling of coefficients
- Filter gain
- Overflow and saturation
@par
Filter coefficients are represented as fractional values and
restricted to lie in the range <code>[-1 +1)</code>.
The processing function has an additional scaling parameter <code>postShift</code>
which allows the filter coefficients to exceed the range <code>[+1 -1)</code>.
At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
\image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
This essentially scales the filter coefficients by <code>2^postShift</code>.
For example, to realize the coefficients
<pre>
{1.5, -0.8, 1.2, 1.6, -0.9}
</pre>
set the Coefficient array to:
<pre>
{0.75, -0.4, 0.6, 0.8, -0.45}
</pre>
and set <code>postShift=1</code>
@par
The second thing to keep in mind is the gain through the filter.
The frequency response of a Biquad filter is a function of its coefficients.
It is possible for the gain through the filter to exceed 1.0 meaning that the
filter increases the amplitude of certain frequencies.
This means that an input signal with amplitude < 1.0 may result in an output > 1.0
and these are saturated or overflowed based on the implementation of the filter.
To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0
or the input signal must be scaled down so that the combination of input and filter are never overflowed.
@par
The third item to consider is the overflow and saturation behavior of the fixed-point Q31 version.
This is described in the function specific documentation below.
*/
/**
* @addtogroup BiquadCascadeDF1_32x64
* @{
@addtogroup BiquadCascadeDF1_32x64
@{
*/
/**
* @details
@brief Processing function for the Q31 Biquad cascade 32x64 filter.
@param[in] S points to an instance of the high precision Q31 Biquad cascade filter
@param[in] pSrc points to the block of input data
@param[out] pDst points to the block of output data
@param[in] blockSize number of samples to process
@return none
* @param[in] *S points to an instance of the high precision Q31 Biquad cascade filter.
* @param[in] *pSrc points to the block of input data.
* @param[out] *pDst points to the block of output data.
* @param[in] blockSize number of samples to process.
* @return none.
*
* \par
* The function is implemented using an internal 64-bit accumulator.
* The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
* Thus, if the accumulator result overflows it wraps around rather than clip.
* In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
* After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to
* 1.31 format by discarding the low 32 bits.
*
* \par
* Two related functions are provided in the CMSIS DSP library.
* <code>arm_biquad_cascade_df1_q31()</code> implements a Biquad cascade with 32-bit coefficients and state variables with a Q63 accumulator.
* <code>arm_biquad_cascade_df1_fast_q31()</code> implements a Biquad cascade with 32-bit coefficients and state variables with a Q31 accumulator.
@par Details
The function is implemented using an internal 64-bit accumulator.
The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
Thus, if the accumulator result overflows it wraps around rather than clip.
In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to
1.31 format by discarding the low 32 bits.
@par
Two related functions are provided in the CMSIS DSP library.
- \ref arm_biquad_cascade_df1_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q63 accumulator.
- \ref arm_biquad_cascade_df1_fast_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q31 accumulator.
*/
void arm_biquad_cas_df1_32x64_q31(
const arm_biquad_cas_df1_32x64_ins_q31 * S,
q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
{
q31_t *pIn = pSrc; /* input pointer initialization */
q31_t *pOut = pDst; /* output pointer initialization */
q63_t *pState = S->pState; /* state pointer initialization */
q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
q63_t acc; /* accumulator */
q31_t Xn1, Xn2; /* Input Filter state variables */
q63_t Yn1, Yn2; /* Output Filter state variables */
q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
q31_t Xn; /* temporary input */
int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
uint32_t sample, stage = S->numStages; /* loop counters */
q31_t acc_l, acc_h; /* temporary output */
uint32_t uShift = ((uint32_t) S->postShift + 1U);
uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t *pIn = pSrc; /* input pointer initialization */
q31_t *pOut = pDst; /* output pointer initialization */
q63_t *pState = S->pState; /* state pointer initialization */
const q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
q63_t acc; /* accumulator */
q31_t Xn1, Xn2; /* Input Filter state variables */
q63_t Yn1, Yn2; /* Output Filter state variables */
q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
q31_t Xn; /* temporary input */
int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
uint32_t sample, stage = S->numStages; /* loop counters */
q31_t acc_l, acc_h; /* temporary output */
uint32_t uShift = ((uint32_t) S->postShift + 1U);
uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
do
{
@ -213,16 +208,16 @@ void arm_biquad_cas_df1_32x64_q31(
Yn1 = pState[2];
Yn2 = pState[3];
#if defined (ARM_MATH_LOOPUNROLL)
/* Apply loop unrolling and compute 4 output values simultaneously. */
/* The variable acc hold output value that is being computed and
* stored in the destination buffer
/* Variable acc hold output value that is being computed and stored in destination buffer
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
*/
/* Loop unrolling: Compute 4 outputs at a time */
sample = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (sample > 0U)
{
/* Read the input */
@ -231,13 +226,13 @@ void arm_biquad_cas_df1_32x64_q31(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
acc = (q63_t) Xn *b0;
acc = (q63_t) Xn * b0;
/* acc += b1 * x[n-1] */
acc += (q63_t) Xn1 *b1;
acc += (q63_t) Xn1 * b1;
/* acc += b[2] * x[n-2] */
acc += (q63_t) Xn2 *b2;
acc += (q63_t) Xn2 * b2;
/* acc += a1 * y[n-1] */
acc += mult32x64(Yn1, a1);
@ -266,13 +261,13 @@ void arm_biquad_cas_df1_32x64_q31(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc += b1 * x[n-1] */
acc = (q63_t) Xn *b1;
acc = (q63_t) Xn * b1;
/* acc = b0 * x[n] */
acc += (q63_t) Xn2 *b0;
acc += (q63_t) Xn2 * b0;
/* acc += b[2] * x[n-2] */
acc += (q63_t) Xn1 *b2;
acc += (q63_t) Xn1 * b2;
/* acc += a1 * y[n-1] */
acc += mult32x64(Yn2, a1);
@ -302,13 +297,13 @@ void arm_biquad_cas_df1_32x64_q31(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
acc = (q63_t) Xn1 *b0;
acc = (q63_t) Xn1 * b0;
/* acc += b1 * x[n-1] */
acc += (q63_t) Xn2 *b1;
acc += (q63_t) Xn2 * b1;
/* acc += b[2] * x[n-2] */
acc += (q63_t) Xn *b2;
acc += (q63_t) Xn * b2;
/* acc += a1 * y[n-1] */
acc += mult32x64(Yn1, a1);
@ -336,13 +331,13 @@ void arm_biquad_cas_df1_32x64_q31(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
acc = (q63_t) Xn *b0;
acc = (q63_t) Xn * b0;
/* acc += b1 * x[n-1] */
acc += (q63_t) Xn1 *b1;
acc += (q63_t) Xn1 * b1;
/* acc += b[2] * x[n-2] */
acc += (q63_t) Xn2 *b2;
acc += (q63_t) Xn2 * b2;
/* acc += a1 * y[n-1] */
acc += mult32x64(Yn2, a1);
@ -366,139 +361,55 @@ void arm_biquad_cas_df1_32x64_q31(
*(pOut + 3U) = acc_h;
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
/* update output pointer */
pOut += 4U;
/* decrement the loop counter */
/* decrement loop counter */
sample--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
sample = (blockSize & 0x3U);
while (sample > 0U)
{
/* Read the input */
Xn = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
acc = (q63_t) Xn *b0;
/* acc += b1 * x[n-1] */
acc += (q63_t) Xn1 *b1;
/* acc += b[2] * x[n-2] */
acc += (q63_t) Xn2 *b2;
/* acc += a1 * y[n-1] */
acc += mult32x64(Yn1, a1);
/* acc += a2 * y[n-2] */
acc += mult32x64(Yn2, a2);
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
Yn2 = Yn1;
/* The result is converted to 1.63, Yn1 variable is reused */
Yn1 = acc << shift;
/* Calc lower part of acc */
acc_l = acc & 0xffffffff;
/* Calc upper part of acc */
acc_h = (acc >> 32) & 0xffffffff;
/* Apply shift for lower part of acc and upper part of acc */
acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
/* Store the output in the destination buffer in 1.31 format. */
*pOut++ = acc_h;
/* Yn1 = acc << shift; */
/* Store the output in the destination buffer in 1.31 format. */
/* *pOut++ = (q31_t) (acc >> (32 - shift)); */
/* decrement the loop counter */
sample--;
}
/* The first stage output is given as input to the second stage. */
pIn = pDst;
/* Reset to destination buffer working pointer */
pOut = pDst;
/* Store the updated state variables back into the pState array */
/* Store the updated state variables back into the pState array */
*pState++ = (q63_t) Xn1;
*pState++ = (q63_t) Xn2;
*pState++ = Yn1;
*pState++ = Yn2;
} while (--stage);
/* Loop unrolling: Compute remaining outputs */
sample = blockSize & 0x3U;
#else
/* Run the below code for Cortex-M0 */
do
{
/* Reading the coefficients */
b0 = *pCoeffs++;
b1 = *pCoeffs++;
b2 = *pCoeffs++;
a1 = *pCoeffs++;
a2 = *pCoeffs++;
/* Reading the state values */
Xn1 = pState[0];
Xn2 = pState[1];
Yn1 = pState[2];
Yn2 = pState[3];
/* The variable acc hold output value that is being computed and
* stored in the destination buffer
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
*/
/* Initialize blkCnt with number of samples */
sample = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (sample > 0U)
{
/* Read the input */
Xn = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
acc = (q63_t) Xn *b0;
acc = (q63_t) Xn * b0;
/* acc += b1 * x[n-1] */
acc += (q63_t) Xn1 *b1;
acc += (q63_t) Xn1 * b1;
/* acc += b[2] * x[n-2] */
acc += (q63_t) Xn2 *b2;
acc += (q63_t) Xn2 * b2;
/* acc += a1 * y[n-1] */
acc += mult32x64(Yn1, a1);
/* acc += a2 * y[n-2] */
acc += mult32x64(Yn2, a2);
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
Yn2 = Yn1;
@ -517,17 +428,16 @@ void arm_biquad_cas_df1_32x64_q31(
/* Store the output in the destination buffer in 1.31 format. */
*pOut++ = acc_h;
/* Yn1 = acc << shift; */
/* Store the output in the destination buffer in 1.31 format. */
/* *pOut++ = (q31_t) (acc >> (32 - shift)); */
/* *pOut++ = (q31_t) (acc >> (32 - shift)); */
/* decrement the loop counter */
/* decrement loop counter */
sample--;
}
/* The first stage output is given as input to the second stage. */
/* The first stage output is given as input to the second stage. */
pIn = pDst;
/* Reset to destination buffer working pointer */
@ -541,9 +451,8 @@ void arm_biquad_cas_df1_32x64_q31(
} while (--stage);
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BiquadCascadeDF1_32x64 group
*/
/**
@} end of BiquadCascadeDF1_32x64 group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_f32.c
* Description: Processing function for the floating-point Biquad cascade DirectFormI(DF1) filter
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,159 +29,308 @@
#include "arm_math.h"
/**
* @ingroup groupFilters
@ingroup groupFilters
*/
/**
* @defgroup BiquadCascadeDF1 Biquad Cascade IIR Filters Using Direct Form I Structure
*
* This set of functions implements arbitrary order recursive (IIR) filters.
* The filters are implemented as a cascade of second order Biquad sections.
* The functions support Q15, Q31 and floating-point data types.
* Fast version of Q15 and Q31 also supported on CortexM4 and Cortex-M3.
*
* The functions operate on blocks of input and output data and each call to the function
* processes <code>blockSize</code> samples through the filter.
* <code>pSrc</code> points to the array of input data and
* <code>pDst</code> points to the array of output data.
* Both arrays contain <code>blockSize</code> values.
*
* \par Algorithm
* Each Biquad stage implements a second order filter using the difference equation:
* <pre>
* y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* </pre>
* A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
* \image html Biquad.gif "Single Biquad filter stage"
* Coefficients <code>b0, b1 and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
* Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
* Pay careful attention to the sign of the feedback coefficients.
* Some design tools use the difference equation
* <pre>
* y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]
* </pre>
* In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
*
* \par
* Higher order filters are realized as a cascade of second order sections.
* <code>numStages</code> refers to the number of second order stages used.
* For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
* \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
* A 9th order filter would be realized with <code>numStages=5</code> second order stages with the coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
*
* \par
* The <code>pState</code> points to state variables array.
* Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
* The state variables are arranged in the <code>pState</code> array as:
* <pre>
* {x[n-1], x[n-2], y[n-1], y[n-2]}
* </pre>
*
* \par
* The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
* The state array has a total length of <code>4*numStages</code> values.
* The state variables are updated after each block of data is processed, the coefficients are untouched.
*
* \par Instance Structure
* The coefficients and state variables for a filter are stored together in an instance data structure.
* A separate instance structure must be defined for each filter.
* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
* There are separate instance structure declarations for each of the 3 supported data types.
*
* \par Init Functions
* There is also an associated initialization function for each data type.
* The initialization function performs following operations:
* - Sets the values of the internal structure fields.
* - Zeros out the values in the state buffer.
* To do this manually without calling the init function, assign the follow subfields of the instance structure:
* numStages, pCoeffs, pState. Also set all of the values in pState to zero.
*
* \par
* Use of the initialization function is optional.
* However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
* To place an instance structure into a const data section, the instance structure must be manually initialized.
* Set the values in the state buffer to zeros before static initialization.
* The code below statically initializes each of the 3 different data type filter instance structures
* <pre>
* arm_biquad_casd_df1_inst_f32 S1 = {numStages, pState, pCoeffs};
* arm_biquad_casd_df1_inst_q15 S2 = {numStages, pState, pCoeffs, postShift};
* arm_biquad_casd_df1_inst_q31 S3 = {numStages, pState, pCoeffs, postShift};
* </pre>
* where <code>numStages</code> is the number of Biquad stages in the filter; <code>pState</code> is the address of the state buffer;
* <code>pCoeffs</code> is the address of the coefficient buffer; <code>postShift</code> shift to be applied.
*
* \par Fixed-Point Behavior
* Care must be taken when using the Q15 and Q31 versions of the Biquad Cascade filter functions.
* Following issues must be considered:
* - Scaling of coefficients
* - Filter gain
* - Overflow and saturation
*
* \par
* <b>Scaling of coefficients: </b>
* Filter coefficients are represented as fractional values and
* coefficients are restricted to lie in the range <code>[-1 +1)</code>.
* The fixed-point functions have an additional scaling parameter <code>postShift</code>
* which allow the filter coefficients to exceed the range <code>[+1 -1)</code>.
* At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
* \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
* This essentially scales the filter coefficients by <code>2^postShift</code>.
* For example, to realize the coefficients
* <pre>
* {1.5, -0.8, 1.2, 1.6, -0.9}
* </pre>
* set the pCoeffs array to:
* <pre>
* {0.75, -0.4, 0.6, 0.8, -0.45}
* </pre>
* and set <code>postShift=1</code>
*
* \par
* <b>Filter gain: </b>
* The frequency response of a Biquad filter is a function of its coefficients.
* It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies.
* This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter.
* To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed.
*
* \par
* <b>Overflow and saturation: </b>
* For Q15 and Q31 versions, it is described separately as part of the function specific documentation below.
@defgroup BiquadCascadeDF1 Biquad Cascade IIR Filters Using Direct Form I Structure
This set of functions implements arbitrary order recursive (IIR) filters.
The filters are implemented as a cascade of second order Biquad sections.
The functions support Q15, Q31 and floating-point data types.
Fast version of Q15 and Q31 also available.
The functions operate on blocks of input and output data and each call to the function
processes <code>blockSize</code> samples through the filter.
<code>pSrc</code> points to the array of input data and
<code>pDst</code> points to the array of output data.
Both arrays contain <code>blockSize</code> values.
@par Algorithm
Each Biquad stage implements a second order filter using the difference equation:
<pre>
y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
</pre>
A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
\image html Biquad.gif "Single Biquad filter stage"
Coefficients <code>b0, b1 and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
Pay careful attention to the sign of the feedback coefficients.
Some design tools use the difference equation
<pre>
y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]
</pre>
In this case the feedback coefficients <code>a1</code> and <code>a2</code>
must be negated when used with the CMSIS DSP Library.
@par
Higher order filters are realized as a cascade of second order sections.
<code>numStages</code> refers to the number of second order stages used.
For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
\image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
A 9th order filter would be realized with <code>numStages=5</code> second order stages with the coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
@par
The <code>pState</code> points to state variables array.
Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
The state variables are arranged in the <code>pState</code> array as:
<pre>
{x[n-1], x[n-2], y[n-1], y[n-2]}
</pre>
@par
The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
The state array has a total length of <code>4*numStages</code> values.
The state variables are updated after each block of data is processed, the coefficients are untouched.
@par Instance Structure
The coefficients and state variables for a filter are stored together in an instance data structure.
A separate instance structure must be defined for each filter.
Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
There are separate instance structure declarations for each of the 3 supported data types.
@par Init Function
There is also an associated initialization function for each data type.
The initialization function performs following operations:
- Sets the values of the internal structure fields.
- Zeros out the values in the state buffer.
To do this manually without calling the init function, assign the follow subfields of the instance structure:
numStages, pCoeffs, pState. Also set all of the values in pState to zero.
@par
Use of the initialization function is optional.
However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
To place an instance structure into a const data section, the instance structure must be manually initialized.
Set the values in the state buffer to zeros before static initialization.
The code below statically initializes each of the 3 different data type filter instance structures
<pre>
arm_biquad_casd_df1_inst_f32 S1 = {numStages, pState, pCoeffs};
arm_biquad_casd_df1_inst_q15 S2 = {numStages, pState, pCoeffs, postShift};
arm_biquad_casd_df1_inst_q31 S3 = {numStages, pState, pCoeffs, postShift};
</pre>
where <code>numStages</code> is the number of Biquad stages in the filter;
<code>pState</code> is the address of the state buffer;
<code>pCoeffs</code> is the address of the coefficient buffer;
<code>postShift</code> shift to be applied.
@par Fixed-Point Behavior
Care must be taken when using the Q15 and Q31 versions of the Biquad Cascade filter functions.
Following issues must be considered:
- Scaling of coefficients
- Filter gain
- Overflow and saturation
@par Scaling of coefficients
Filter coefficients are represented as fractional values and
coefficients are restricted to lie in the range <code>[-1 +1)</code>.
The fixed-point functions have an additional scaling parameter <code>postShift</code>
which allow the filter coefficients to exceed the range <code>[+1 -1)</code>.
At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
\image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
This essentially scales the filter coefficients by <code>2^postShift</code>.
For example, to realize the coefficients
<pre>
{1.5, -0.8, 1.2, 1.6, -0.9}
</pre>
set the pCoeffs array to:
<pre>
{0.75, -0.4, 0.6, 0.8, -0.45}
</pre>
and set <code>postShift=1</code>
@par Filter gain
The frequency response of a Biquad filter is a function of its coefficients.
It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies.
This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter.
To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed.
@par Overflow and saturation
For Q15 and Q31 versions, it is described separately as part of the function specific documentation below.
*/
/**
* @addtogroup BiquadCascadeDF1
* @{
@addtogroup BiquadCascadeDF1
@{
*/
/**
* @param[in] *S points to an instance of the floating-point Biquad cascade structure.
* @param[in] *pSrc points to the block of input data.
* @param[out] *pDst points to the block of output data.
* @param[in] blockSize number of samples to process per call.
* @return none.
*
@brief Processing function for the floating-point Biquad cascade filter.
@param[in] S points to an instance of the floating-point Biquad cascade structure
@param[in] pSrc points to the block of input data
@param[out] pDst points to the block of output data
@param[in] blockSize number of samples to process
@return none
*/
#if defined(ARM_MATH_NEON)
void arm_biquad_cascade_df1_f32(
const arm_biquad_casd_df1_inst_f32 * S,
float32_t * pSrc,
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
float32_t *pIn = pSrc; /* source pointer */
const float32_t *pIn = pSrc; /* source pointer */
float32_t *pOut = pDst; /* destination pointer */
float32_t *pState = S->pState; /* pState pointer */
float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */
const float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */
float32_t acc; /* Simulates the accumulator */
float32_t b0, b1, b2, a1, a2; /* Filter coefficients */
float32_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
float32_t Xn; /* temporary input */
uint32_t sample, stage = S->numStages; /* loop counters */
float32x4_t Xn;
float32x2_t Yn;
float32x2_t a;
float32x4_t b;
float32x4_t x,tmp;
float32x2_t t;
float32x2x2_t y;
#if defined (ARM_MATH_DSP)
float32_t Xns;
/* Run the below code for Cortex-M4 and Cortex-M3 */
while (stage > 0U)
{
/* Reading the coefficients */
Xn = vld1q_f32(pState);
Yn = vld1_f32(pState + 2);
b = vld1q_f32(pCoeffs);
b = vrev64q_f32(b);
b = vcombine_f32(vget_high_f32(b), vget_low_f32(b));
a = vld1_f32(pCoeffs + 3);
a = vrev64_f32(a);
b[0] = 0.0;
pCoeffs += 5;
/* Reading the pState values */
/* Apply loop unrolling and compute 4 output values simultaneously. */
/* The variable acc hold output values that are being computed:
*
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
*/
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
sample = blockSize >> 2U;
while (sample > 0U)
{
/* Read the first 4 inputs */
x = vld1q_f32(pIn);
pIn += 4;
tmp = vextq_f32(Xn, x, 1);
t = vmul_f32(vget_high_f32(b), vget_high_f32(tmp));
t = vmla_f32(t, vget_low_f32(b), vget_low_f32(tmp));
t = vmla_f32(t, a, Yn);
t = vpadd_f32(t, t);
Yn = vext_f32(Yn, t, 1);
tmp = vextq_f32(Xn, x, 2);
t = vmul_f32(vget_high_f32(b), vget_high_f32(tmp));
t = vmla_f32(t, vget_low_f32(b), vget_low_f32(tmp));
t = vmla_f32(t, a, Yn);
t = vpadd_f32(t, t);
Yn = vext_f32(Yn, t, 1);
y.val[0] = Yn;
tmp = vextq_f32(Xn, x, 3);
t = vmul_f32(vget_high_f32(b), vget_high_f32(tmp));
t = vmla_f32(t, vget_low_f32(b), vget_low_f32(tmp));
t = vmla_f32(t, a, Yn);
t = vpadd_f32(t, t);
Yn = vext_f32(Yn, t, 1);
Xn = x;
t = vmul_f32(vget_high_f32(b), vget_high_f32(Xn));
t = vmla_f32(t, vget_low_f32(b), vget_low_f32(Xn));
t = vmla_f32(t, a, Yn);
t = vpadd_f32(t, t);
Yn = vext_f32(Yn, t, 1);
y.val[1] = Yn;
tmp = vcombine_f32(y.val[0], y.val[1]);
/* Store the 4 outputs and increment the pointer */
vst1q_f32(pOut, tmp);
pOut += 4;
/* Decrement the loop counter */
sample--;
}
/* If the block size is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
sample = blockSize & 0x3U;
while (sample > 0U)
{
/* Read the input */
Xns = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
acc = (b[1] * Xn[2]) + (b[2] * Xn[3]) + (b[3] * Xns) + (a[0] * Yn[0]) + (a[1] * Yn[1]);
/* Store the result in the accumulator in the destination buffer. */
*pOut++ = acc;
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
Xn[2] = Xn[3];
Xn[3] = Xns;
Yn[0] = Yn[1];
Yn[1] = acc;
/* Decrement the loop counter */
sample--;
}
vst1q_f32(pState,vcombine_f32(vrev64_f32(vget_high_f32(Xn)),vrev64_f32(Yn)));
pState += 4;
/* Store the updated state variables back into the pState array */
/* The first stage goes from the input buffer to the output buffer. */
/* Subsequent numStages occur in-place in the output buffer */
pIn = pDst;
/* Reset the output pointer */
pOut = pDst;
/* Decrement the loop counter */
stage--;
}
}
#else
void arm_biquad_cascade_df1_f32(
const arm_biquad_casd_df1_inst_f32 * S,
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
const float32_t *pIn = pSrc; /* Source pointer */
float32_t *pOut = pDst; /* Destination pointer */
float32_t *pState = S->pState; /* pState pointer */
const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float32_t acc; /* Accumulator */
float32_t b0, b1, b2, a1, a2; /* Filter coefficients */
float32_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
float32_t Xn; /* Temporary input */
uint32_t sample, stage = S->numStages; /* Loop counters */
do
{
@ -198,19 +347,20 @@ void arm_biquad_cascade_df1_f32(
Yn1 = pState[2];
Yn2 = pState[3];
#if defined (ARM_MATH_LOOPUNROLL)
/* Apply loop unrolling and compute 4 output values simultaneously. */
/* The variable acc hold output values that are being computed:
/* Variable acc hold output values that are being computed:
*
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
*/
/* Loop unrolling: Compute 4 outputs at a time */
sample = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (sample > 0U)
{
/* Read the first input */
@ -219,15 +369,15 @@ void arm_biquad_cascade_df1_f32(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
Yn2 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
/* Store the result in the accumulator in the destination buffer. */
/* Store output in destination buffer. */
*pOut++ = Yn2;
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* Read the second input */
Xn2 = *pIn++;
@ -235,15 +385,15 @@ void arm_biquad_cascade_df1_f32(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
Yn1 = (b0 * Xn2) + (b1 * Xn) + (b2 * Xn1) + (a1 * Yn2) + (a2 * Yn1);
/* Store the result in the accumulator in the destination buffer. */
/* Store output in destination buffer. */
*pOut++ = Yn1;
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* Read the third input */
Xn1 = *pIn++;
@ -251,15 +401,15 @@ void arm_biquad_cascade_df1_f32(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
Yn2 = (b0 * Xn1) + (b1 * Xn2) + (b2 * Xn) + (a1 * Yn1) + (a2 * Yn2);
/* Store the result in the accumulator in the destination buffer. */
/* Store output in destination buffer. */
*pOut++ = Yn2;
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* Read the forth input */
Xn = *pIn++;
@ -267,97 +417,32 @@ void arm_biquad_cascade_df1_f32(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
Yn1 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn2) + (a2 * Yn1);
/* Store the result in the accumulator in the destination buffer. */
/* Store output in destination buffer. */
*pOut++ = Yn1;
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
/* decrement the loop counter */
/* decrement loop counter */
sample--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
sample = blockSize & 0x3U;
while (sample > 0U)
{
/* Read the input */
Xn = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
/* Store the result in the accumulator in the destination buffer. */
*pOut++ = acc;
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
Yn2 = Yn1;
Yn1 = acc;
/* decrement the loop counter */
sample--;
}
/* Store the updated state variables back into the pState array */
*pState++ = Xn1;
*pState++ = Xn2;
*pState++ = Yn1;
*pState++ = Yn2;
/* The first stage goes from the input buffer to the output buffer. */
/* Subsequent numStages occur in-place in the output buffer */
pIn = pDst;
/* Reset the output pointer */
pOut = pDst;
/* decrement the loop counter */
stage--;
} while (stage > 0U);
#else
/* Run the below code for Cortex-M0 */
do
{
/* Reading the coefficients */
b0 = *pCoeffs++;
b1 = *pCoeffs++;
b2 = *pCoeffs++;
a1 = *pCoeffs++;
a2 = *pCoeffs++;
/* Reading the pState values */
Xn1 = pState[0];
Xn2 = pState[1];
Yn1 = pState[2];
Yn2 = pState[3];
/* The variables acc holds the output value that is computed:
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
*/
/* Initialize blkCnt with number of samples */
sample = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (sample > 0U)
{
/* Read the input */
@ -366,47 +451,45 @@ void arm_biquad_cascade_df1_f32(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
/* Store the result in the accumulator in the destination buffer. */
/* Store output in destination buffer. */
*pOut++ = acc;
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
Yn2 = Yn1;
Yn1 = acc;
/* decrement the loop counter */
/* decrement loop counter */
sample--;
}
/* Store the updated state variables back into the pState array */
/* Store the updated state variables back into the pState array */
*pState++ = Xn1;
*pState++ = Xn2;
*pState++ = Yn1;
*pState++ = Yn2;
/* The first stage goes from the input buffer to the output buffer. */
/* Subsequent numStages occur in-place in the output buffer */
/* The first stage goes from the input buffer to the output buffer. */
/* Subsequent numStages occur in-place in the output buffer */
pIn = pDst;
/* Reset the output pointer */
/* Reset output pointer */
pOut = pDst;
/* decrement the loop counter */
/* decrement loop counter */
stage--;
} while (stage > 0U);
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BiquadCascadeDF1 group
*/
#endif /* #if defined(ARM_MATH_NEON) */
/**
@} end of BiquadCascadeDF1 group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_fast_q15.c
* Description: Fast processing function for the Q15 Biquad cascade filter
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,90 +29,87 @@
#include "arm_math.h"
/**
* @ingroup groupFilters
@ingroup groupFilters
*/
/**
* @addtogroup BiquadCascadeDF1
* @{
@addtogroup BiquadCascadeDF1
@{
*/
/**
* @details
* @param[in] *S points to an instance of the Q15 Biquad cascade structure.
* @param[in] *pSrc points to the block of input data.
* @param[out] *pDst points to the block of output data.
* @param[in] blockSize number of samples to process per call.
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* This fast version uses a 32-bit accumulator with 2.30 format.
* The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
* Thus, if the accumulator result overflows it wraps around and distorts the result.
* In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25).
* The 2.30 accumulator is then shifted by <code>postShift</code> bits and the result truncated to 1.15 format by discarding the low 16 bits.
*
* \par
* Refer to the function <code>arm_biquad_cascade_df1_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure.
* Use the function <code>arm_biquad_cascade_df1_init_q15()</code> to initialize the filter structure.
*
@brief Processing function for the Q15 Biquad cascade filter (fast variant).
@param[in] S points to an instance of the Q15 Biquad cascade structure
@param[in] pSrc points to the block of input data
@param[out] pDst points to the block of output data
@param[in] blockSize number of samples to process per call
@return none
@par Scaling and Overflow Behavior
This fast version uses a 32-bit accumulator with 2.30 format.
The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
Thus, if the accumulator result overflows it wraps around and distorts the result.
In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25).
The 2.30 accumulator is then shifted by <code>postShift</code> bits and the result truncated to 1.15 format by discarding the low 16 bits.
@remark
Refer to \ref arm_biquad_cascade_df1_q15() for a slower implementation of this function
which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure.
Use the function \ref arm_biquad_cascade_df1_init_q15() to initialize the filter structure.
*/
void arm_biquad_cascade_df1_fast_q15(
const arm_biquad_casd_df1_inst_q15 * S,
q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
{
q15_t *pIn = pSrc; /* Source pointer */
q15_t *pOut = pDst; /* Destination pointer */
q31_t in; /* Temporary variable to hold input value */
q31_t out; /* Temporary variable to hold output value */
q31_t b0; /* Temporary variable to hold bo value */
q31_t b1, a1; /* Filter coefficients */
q31_t state_in, state_out; /* Filter state variables */
q31_t acc; /* Accumulator */
int32_t shift = (int32_t) (15 - S->postShift); /* Post shift */
q15_t *pState = S->pState; /* State pointer */
q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
uint32_t sample, stage = S->numStages; /* Stage loop counter */
const q15_t *pIn = pSrc; /* Source pointer */
q15_t *pOut = pDst; /* Destination pointer */
q15_t *pState = S->pState; /* State pointer */
const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
q31_t acc; /* Accumulator */
q31_t in; /* Temporary variable to hold input value */
q31_t out; /* Temporary variable to hold output value */
q31_t b0; /* Temporary variable to hold bo value */
q31_t b1, a1; /* Filter coefficients */
q31_t state_in, state_out; /* Filter state variables */
int32_t shift = (int32_t) (15 - S->postShift); /* Post shift */
uint32_t sample, stage = S->numStages; /* Loop counters */
do
{
/* Read the b0 and 0 coefficients using SIMD */
b0 = *__SIMD32(pCoeffs)++;
b0 = read_q15x2_ia ((q15_t **) &pCoeffs);
/* Read the b1 and b2 coefficients using SIMD */
b1 = *__SIMD32(pCoeffs)++;
b1 = read_q15x2_ia ((q15_t **) &pCoeffs);
/* Read the a1 and a2 coefficients using SIMD */
a1 = *__SIMD32(pCoeffs)++;
a1 = read_q15x2_ia ((q15_t **) &pCoeffs);
/* Read the input state values from the state buffer: x[n-1], x[n-2] */
state_in = *__SIMD32(pState)++;
state_in = read_q15x2_ia (&pState);
/* Read the output state values from the state buffer: y[n-1], y[n-2] */
state_out = *__SIMD32(pState)--;
state_out = read_q15x2_da (&pState);
#if defined (ARM_MATH_LOOPUNROLL)
/* Apply loop unrolling and compute 2 output values simultaneously. */
/* The variable acc hold output values that are being computed:
/* Variable acc hold output values that are being computed:
*
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
*/
/* Loop unrolling: Compute 2 outputs at a time */
sample = blockSize >> 1U;
/* First part of the processing with loop unrolling. Compute 2 outputs at a time.
** a second loop below computes the remaining 1 sample. */
while (sample > 0U)
{
/* Read the input */
in = *__SIMD32(pIn)++;
in = read_q15x2_ia ((q15_t **) &pIn);
/* out = b0 * x[n] + 0 * 0 */
out = __SMUAD(b0, in);
@ -126,24 +123,20 @@ void arm_biquad_cascade_df1_fast_q15(
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
/* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
#ifndef ARM_MATH_BIG_ENDIAN
state_in = __PKHBT(in, state_in, 16);
state_in = __PKHBT(in, state_in, 16);
state_out = __PKHBT(out, state_out, 16);
#else
state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
state_out = __PKHBT(state_out >> 16, (out), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* out = b0 * x[n] + 0 * 0 */
out = __SMUADX(b0, in);
@ -155,64 +148,53 @@ void arm_biquad_cascade_df1_fast_q15(
/* The result is converted from 3.29 to 1.31 and then saturation is applied */
out = __SSAT((acc >> shift), 16);
/* Store the output in the destination buffer. */
#ifndef ARM_MATH_BIG_ENDIAN
*__SIMD32(pOut)++ = __PKHBT(state_out, out, 16);
write_q15x2_ia (&pOut, __PKHBT(state_out, out, 16));
#else
*__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16);
write_q15x2_ia (&pOut, __PKHBT(out, state_out >> 16, 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
/* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
#ifndef ARM_MATH_BIG_ENDIAN
state_in = __PKHBT(in >> 16, state_in, 16);
state_in = __PKHBT(in >> 16, state_in, 16);
state_out = __PKHBT(out, state_out, 16);
#else
state_in = __PKHBT(state_in >> 16, in, 16);
state_out = __PKHBT(state_out >> 16, out, 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Decrement loop counter */
sample--;
}
/* Loop unrolling: Compute remaining outputs */
sample = (blockSize & 0x1U);
#else
state_in = __PKHBT(state_in >> 16, in, 16);
state_out = __PKHBT(state_out >> 16, out, 16);
/* Initialize blkCnt with number of samples */
sample = blockSize;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
/* Decrement the loop counter */
sample--;
}
/* If the blockSize is not a multiple of 2, compute any remaining output samples here.
** No loop unrolling is used. */
if ((blockSize & 0x1U) != 0U)
while (sample > 0U)
{
/* Read the input */
in = *pIn++;
/* out = b0 * x[n] + 0 * 0 */
#ifndef ARM_MATH_BIG_ENDIAN
out = __SMUAD(b0, in);
#else
out = __SMUADX(b0, in);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* acc = b1 * x[n-1], acc += b2 * x[n-2] + out */
@ -228,46 +210,41 @@ void arm_biquad_cascade_df1_fast_q15(
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
/* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
#ifndef ARM_MATH_BIG_ENDIAN
state_in = __PKHBT(in, state_in, 16);
state_out = __PKHBT(out, state_out, 16);
#else
state_in = __PKHBT(state_in >> 16, in, 16);
state_out = __PKHBT(state_out >> 16, out, 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* decrement loop counter */
sample--;
}
/* The first stage goes from the input buffer to the output buffer. */
/* Subsequent (numStages - 1) occur in-place in the output buffer */
/* The first stage goes from the input buffer to the output buffer. */
/* Subsequent (numStages - 1) occur in-place in the output buffer */
pIn = pDst;
/* Reset the output pointer */
pOut = pDst;
/* Store the updated state variables back into the state array */
*__SIMD32(pState)++ = state_in;
*__SIMD32(pState)++ = state_out;
/* Store the updated state variables back into the state array */
write_q15x2_ia(&pState, state_in);
write_q15x2_ia(&pState, state_out);
/* Decrement the loop counter */
/* Decrement loop counter */
stage--;
} while (stage > 0U);
}
/**
* @} end of BiquadCascadeDF1 group
@} end of BiquadCascadeDF1 group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_fast_q31.c
* Description: Processing function for the Q31 Fast Biquad cascade DirectFormI(DF1) filter
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,55 +29,52 @@
#include "arm_math.h"
/**
* @ingroup groupFilters
@ingroup groupFilters
*/
/**
* @addtogroup BiquadCascadeDF1
* @{
@addtogroup BiquadCascadeDF1
@{
*/
/**
* @details
*
* @param[in] *S points to an instance of the Q31 Biquad cascade structure.
* @param[in] *pSrc points to the block of input data.
* @param[out] *pDst points to the block of output data.
* @param[in] blockSize number of samples to process per call.
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* This function is optimized for speed at the expense of fixed-point precision and overflow protection.
* The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
* These intermediate results are added to a 2.30 accumulator.
* Finally, the accumulator is saturated and converted to a 1.31 result.
* The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
* In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25). Use the intialization function
* arm_biquad_cascade_df1_init_q31() to initialize filter structure.
*
* \par
* Refer to the function <code>arm_biquad_cascade_df1_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision. Both the slow and the fast versions use the same instance structure.
* Use the function <code>arm_biquad_cascade_df1_init_q31()</code> to initialize the filter structure.
@brief Processing function for the Q31 Biquad cascade filter (fast variant).
@param[in] S points to an instance of the Q31 Biquad cascade structure
@param[in] pSrc points to the block of input data
@param[out] pDst points to the block of output data
@param[in] blockSize number of samples to process per call
@return none
@par Scaling and Overflow Behavior
This function is optimized for speed at the expense of fixed-point precision and overflow protection.
The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
These intermediate results are added to a 2.30 accumulator.
Finally, the accumulator is saturated and converted to a 1.31 result.
The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25). Use the intialization function
arm_biquad_cascade_df1_init_q31() to initialize filter structure.
@remark
Refer to \ref arm_biquad_cascade_df1_q31() for a slower implementation of this function
which uses 64-bit accumulation to provide higher precision. Both the slow and the fast versions use the same instance structure.
Use the function \ref arm_biquad_cascade_df1_init_q31() to initialize the filter structure.
*/
void arm_biquad_cascade_df1_fast_q31(
const arm_biquad_casd_df1_inst_q31 * S,
q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
const q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
{
q31_t acc = 0; /* accumulator */
q31_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */
q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
q31_t *pIn = pSrc; /* input pointer initialization */
q31_t *pOut = pDst; /* output pointer initialization */
q31_t *pState = S->pState; /* pState pointer initialization */
q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
q31_t Xn; /* temporary input */
int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
uint32_t sample, stage = S->numStages; /* loop counters */
const q31_t *pIn = pSrc; /* Source pointer */
q31_t *pOut = pDst; /* Destination pointer */
q31_t *pState = S->pState; /* pState pointer */
const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
q31_t acc = 0; /* Accumulator */
q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
q31_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
q31_t Xn; /* Temporary input */
int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
uint32_t sample, stage = S->numStages; /* Loop counters */
do
{
@ -88,22 +85,23 @@ void arm_biquad_cascade_df1_fast_q31(
a1 = *pCoeffs++;
a2 = *pCoeffs++;
/* Reading the state values */
/* Reading the pState values */
Xn1 = pState[0];
Xn2 = pState[1];
Yn1 = pState[2];
Yn2 = pState[3];
#if defined (ARM_MATH_LOOPUNROLL)
/* Apply loop unrolling and compute 4 output values simultaneously. */
/* The variables acc ... acc3 hold output values that are being computed:
/* Variables acc ... acc3 hold output values that are being computed:
*
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
*/
/* Loop unrolling: Compute 4 outputs at a time */
sample = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (sample > 0U)
{
/* Read the input */
@ -111,19 +109,19 @@ void arm_biquad_cascade_df1_fast_q31(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
/*acc = (q31_t) (((q63_t) b1 * Xn1) >> 32);*/
/* acc = (q31_t) (((q63_t) b1 * Xn1) >> 32);*/
mult_32x32_keep32_R(acc, b1, Xn1);
/* acc += b1 * x[n-1] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b0 * (Xn))) >> 32);*/
/* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b0 * (Xn))) >> 32);*/
multAcc_32x32_keep32_R(acc, b0, Xn);
/* acc += b[2] * x[n-2] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
/* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
multAcc_32x32_keep32_R(acc, b2, Xn2);
/* acc += a1 * y[n-1] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
/* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
multAcc_32x32_keep32_R(acc, a1, Yn1);
/* acc += a2 * y[n-2] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
/* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
multAcc_32x32_keep32_R(acc, a2, Yn2);
/* The result is converted to 1.31 , Yn2 variable is reused */
@ -137,19 +135,19 @@ void arm_biquad_cascade_df1_fast_q31(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
/*acc = (q31_t) (((q63_t) b0 * (Xn2)) >> 32);*/
/* acc = (q31_t) (((q63_t) b0 * (Xn2)) >> 32);*/
mult_32x32_keep32_R(acc, b0, Xn2);
/* acc += b1 * x[n-1] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn))) >> 32);*/
/* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn))) >> 32);*/
multAcc_32x32_keep32_R(acc, b1, Xn);
/* acc += b[2] * x[n-2] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn1))) >> 32);*/
/* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn1))) >> 32);*/
multAcc_32x32_keep32_R(acc, b2, Xn1);
/* acc += a1 * y[n-1] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32);*/
/* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32);*/
multAcc_32x32_keep32_R(acc, a1, Yn2);
/* acc += a2 * y[n-2] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32);*/
/* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32);*/
multAcc_32x32_keep32_R(acc, a2, Yn1);
/* The result is converted to 1.31, Yn1 variable is reused */
@ -163,19 +161,19 @@ void arm_biquad_cascade_df1_fast_q31(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
/*acc = (q31_t) (((q63_t) b0 * (Xn1)) >> 32);*/
/* acc = (q31_t) (((q63_t) b0 * (Xn1)) >> 32);*/
mult_32x32_keep32_R(acc, b0, Xn1);
/* acc += b1 * x[n-1] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn2))) >> 32);*/
/* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn2))) >> 32);*/
multAcc_32x32_keep32_R(acc, b1, Xn2);
/* acc += b[2] * x[n-2] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn))) >> 32);*/
/* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn))) >> 32);*/
multAcc_32x32_keep32_R(acc, b2, Xn);
/* acc += a1 * y[n-1] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
/* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
multAcc_32x32_keep32_R(acc, a1, Yn1);
/* acc += a2 * y[n-2] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
/* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
multAcc_32x32_keep32_R(acc, a2, Yn2);
/* The result is converted to 1.31, Yn2 variable is reused */
@ -190,7 +188,7 @@ void arm_biquad_cascade_df1_fast_q31(
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
/*acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
/* acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
mult_32x32_keep32_R(acc, b0, Xn);
/* acc += b1 * x[n-1] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/
@ -207,47 +205,53 @@ void arm_biquad_cascade_df1_fast_q31(
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn2 = Xn1 */
Xn2 = Xn1;
/* The result is converted to 1.31, Yn1 variable is reused */
Yn1 = acc << shift;
/* Xn1 = Xn */
/* Xn1 = Xn */
Xn1 = Xn;
/* Store the output in the destination buffer. */
*(pOut + 3U) = Yn1;
pOut += 4U;
/* decrement the loop counter */
/* decrement loop counter */
sample--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
sample = (blockSize & 0x3U);
while (sample > 0U)
{
#else
/* Initialize blkCnt with number of samples */
sample = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (sample > 0U)
{
/* Read the input */
Xn = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
/*acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
/* acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
mult_32x32_keep32_R(acc, b0, Xn);
/* acc += b1 * x[n-1] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/
/* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/
multAcc_32x32_keep32_R(acc, b1, Xn1);
/* acc += b[2] * x[n-2] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
/* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
multAcc_32x32_keep32_R(acc, b2, Xn2);
/* acc += a1 * y[n-1] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
/* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
multAcc_32x32_keep32_R(acc, a1, Yn1);
/* acc += a2 * y[n-2] */
/*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
/* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
multAcc_32x32_keep32_R(acc, a2, Yn2);
/* The result is converted to 1.31 */
@ -255,10 +259,10 @@ void arm_biquad_cascade_df1_fast_q31(
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
Yn2 = Yn1;
@ -267,18 +271,18 @@ void arm_biquad_cascade_df1_fast_q31(
/* Store the output in the destination buffer. */
*pOut++ = acc;
/* decrement the loop counter */
/* decrement loop counter */
sample--;
}
}
/* The first stage goes from the input buffer to the output buffer. */
/* Subsequent stages occur in-place in the output buffer */
/* The first stage goes from the input buffer to the output buffer. */
/* Subsequent stages occur in-place in the output buffer */
pIn = pDst;
/* Reset to destination pointer */
pOut = pDst;
/* Store the updated state variables back into the pState array */
/* Store the updated state variables back into the pState array */
*pState++ = Xn1;
*pState++ = Xn2;
*pState++ = Yn1;
@ -288,5 +292,5 @@ void arm_biquad_cascade_df1_fast_q31(
}
/**
* @} end of BiquadCascadeDF1 group
*/
@} end of BiquadCascadeDF1 group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_init_f32.c
* Description: Floating-point Biquad cascade DirectFormI(DF1) filter initialization function
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,55 +29,49 @@
#include "arm_math.h"
/**
* @ingroup groupFilters
@ingroup groupFilters
*/
/**
* @addtogroup BiquadCascadeDF1
* @{
@addtogroup BiquadCascadeDF1
@{
*/
/**
* @details
* @brief Initialization function for the floating-point Biquad cascade filter.
* @param[in,out] *S points to an instance of the floating-point Biquad cascade structure.
* @param[in] numStages number of 2nd order stages in the filter.
* @param[in] *pCoeffs points to the filter coefficients array.
* @param[in] *pState points to the state array.
* @return none
*
*
* <b>Coefficient and State Ordering:</b>
*
* \par
* The coefficients are stored in the array <code>pCoeffs</code> in the following order:
* <pre>
* {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
* </pre>
*
* \par
* where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
* <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
* and so on. The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
*
* \par
* The <code>pState</code> is a pointer to state array.
* Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
* The state variables are arranged in the <code>pState</code> array as:
* <pre>
* {x[n-1], x[n-2], y[n-1], y[n-2]}
* </pre>
* The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
* The state array has a total length of <code>4*numStages</code> values.
* The state variables are updated after each block of data is processed; the coefficients are untouched.
*
@brief Initialization function for the floating-point Biquad cascade filter.
@param[in,out] S points to an instance of the floating-point Biquad cascade structure.
@param[in] numStages number of 2nd order stages in the filter.
@param[in] pCoeffs points to the filter coefficients.
@param[in] pState points to the state buffer.
@return none
@par Coefficient and State Ordering
The coefficients are stored in the array <code>pCoeffs</code> in the following order:
<pre>
{b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
</pre>
@par
where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
<code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
and so on. The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
@par
The <code>pState</code> is a pointer to state array.
Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
The state variables are arranged in the <code>pState</code> array as:
<pre>
{x[n-1], x[n-2], y[n-1], y[n-2]}
</pre>
The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
The state array has a total length of <code>4*numStages</code> values.
The state variables are updated after each block of data is processed; the coefficients are untouched.
*/
void arm_biquad_cascade_df1_init_f32(
arm_biquad_casd_df1_inst_f32 * S,
uint8_t numStages,
float32_t * pCoeffs,
float32_t * pState)
arm_biquad_casd_df1_inst_f32 * S,
uint8_t numStages,
const float32_t * pCoeffs,
float32_t * pState)
{
/* Assign filter stages */
S->numStages = numStages;
@ -93,5 +87,5 @@ void arm_biquad_cascade_df1_init_f32(
}
/**
* @} end of BiquadCascadeDF1 group
@} end of BiquadCascadeDF1 group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_init_q15.c
* Description: Q15 Biquad cascade DirectFormI(DF1) filter initialization function
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,54 +29,51 @@
#include "arm_math.h"
/**
* @ingroup groupFilters
@ingroup groupFilters
*/
/**
* @addtogroup BiquadCascadeDF1
* @{
@addtogroup BiquadCascadeDF1
@{
*/
/**
* @details
*
* @param[in,out] *S points to an instance of the Q15 Biquad cascade structure.
* @param[in] numStages number of 2nd order stages in the filter.
* @param[in] *pCoeffs points to the filter coefficients.
* @param[in] *pState points to the state buffer.
* @param[in] postShift Shift to be applied to the accumulator result. Varies according to the coefficients format
* @return none
*
* <b>Coefficient and State Ordering:</b>
*
* \par
* The coefficients are stored in the array <code>pCoeffs</code> in the following order:
* <pre>
* {b10, 0, b11, b12, a11, a12, b20, 0, b21, b22, a21, a22, ...}
* </pre>
* where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
* <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
* and so on. The <code>pCoeffs</code> array contains a total of <code>6*numStages</code> values.
* The zero coefficient between <code>b1</code> and <code>b2</code> facilities use of 16-bit SIMD instructions on the Cortex-M4.
*
* \par
* The state variables are stored in the array <code>pState</code>.
* Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
* The state variables are arranged in the <code>pState</code> array as:
* <pre>
* {x[n-1], x[n-2], y[n-1], y[n-2]}
* </pre>
* The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
* The state array has a total length of <code>4*numStages</code> values.
* The state variables are updated after each block of data is processed; the coefficients are untouched.
@brief Initialization function for the Q15 Biquad cascade filter.
@param[in,out] S points to an instance of the Q15 Biquad cascade structure.
@param[in] numStages number of 2nd order stages in the filter.
@param[in] pCoeffs points to the filter coefficients.
@param[in] pState points to the state buffer.
@param[in] postShift Shift to be applied to the accumulator result. Varies according to the coefficients format
@return none
@par Coefficient and State Ordering
The coefficients are stored in the array <code>pCoeffs</code> in the following order:
<pre>
{b10, 0, b11, b12, a11, a12, b20, 0, b21, b22, a21, a22, ...}
</pre>
@par
where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
<code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
and so on. The <code>pCoeffs</code> array contains a total of <code>6*numStages</code> values.
The zero coefficient between <code>b1</code> and <code>b2</code> facilities use of 16-bit SIMD instructions on the Cortex-M4.
@par
The state variables are stored in the array <code>pState</code>.
Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
The state variables are arranged in the <code>pState</code> array as:
<pre>
{x[n-1], x[n-2], y[n-1], y[n-2]}
</pre>
The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
The state array has a total length of <code>4*numStages</code> values.
The state variables are updated after each block of data is processed; the coefficients are untouched.
*/
void arm_biquad_cascade_df1_init_q15(
arm_biquad_casd_df1_inst_q15 * S,
uint8_t numStages,
q15_t * pCoeffs,
q15_t * pState,
int8_t postShift)
arm_biquad_casd_df1_inst_q15 * S,
uint8_t numStages,
const q15_t * pCoeffs,
q15_t * pState,
int8_t postShift)
{
/* Assign filter stages */
S->numStages = numStages;
@ -95,5 +92,5 @@ void arm_biquad_cascade_df1_init_q15(
}
/**
* @} end of BiquadCascadeDF1 group
@} end of BiquadCascadeDF1 group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_init_q31.c
* Description: Q31 Biquad cascade DirectFormI(DF1) filter initialization function
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,53 +29,50 @@
#include "arm_math.h"
/**
* @ingroup groupFilters
@ingroup groupFilters
*/
/**
* @addtogroup BiquadCascadeDF1
* @{
@addtogroup BiquadCascadeDF1
@{
*/
/**
* @details
*
* @param[in,out] *S points to an instance of the Q31 Biquad cascade structure.
* @param[in] numStages number of 2nd order stages in the filter.
* @param[in] *pCoeffs points to the filter coefficients buffer.
* @param[in] *pState points to the state buffer.
* @param[in] postShift Shift to be applied after the accumulator. Varies according to the coefficients format
* @return none
*
* <b>Coefficient and State Ordering:</b>
*
* \par
* The coefficients are stored in the array <code>pCoeffs</code> in the following order:
* <pre>
* {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
* </pre>
* where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
* <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
* and so on. The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
*
* \par
* The <code>pState</code> points to state variables array.
* Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
* The state variables are arranged in the <code>pState</code> array as:
* <pre>
* {x[n-1], x[n-2], y[n-1], y[n-2]}
* </pre>
* The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
* The state array has a total length of <code>4*numStages</code> values.
* The state variables are updated after each block of data is processed; the coefficients are untouched.
@brief Initialization function for the Q31 Biquad cascade filter.
@param[in,out] S points to an instance of the Q31 Biquad cascade structure.
@param[in] numStages number of 2nd order stages in the filter.
@param[in] pCoeffs points to the filter coefficients.
@param[in] pState points to the state buffer.
@param[in] postShift Shift to be applied after the accumulator. Varies according to the coefficients format
@return none
@par Coefficient and State Ordering
The coefficients are stored in the array <code>pCoeffs</code> in the following order:
<pre>
{b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
</pre>
@par
where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
<code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
and so on. The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
@par
The <code>pState</code> points to state variables array.
Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
The state variables are arranged in the <code>pState</code> array as:
<pre>
{x[n-1], x[n-2], y[n-1], y[n-2]}
</pre>
The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
The state array has a total length of <code>4*numStages</code> values.
The state variables are updated after each block of data is processed; the coefficients are untouched.
*/
void arm_biquad_cascade_df1_init_q31(
arm_biquad_casd_df1_inst_q31 * S,
uint8_t numStages,
q31_t * pCoeffs,
q31_t * pState,
int8_t postShift)
arm_biquad_casd_df1_inst_q31 * S,
uint8_t numStages,
const q31_t * pCoeffs,
q31_t * pState,
int8_t postShift)
{
/* Assign filter stages */
S->numStages = numStages;
@ -94,5 +91,5 @@ void arm_biquad_cascade_df1_init_q31(
}
/**
* @} end of BiquadCascadeDF1 group
@} end of BiquadCascadeDF1 group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_q15.c
* Description: Processing function for the Q15 Biquad cascade DirectFormI(DF1) filter
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,79 +29,74 @@
#include "arm_math.h"
/**
* @ingroup groupFilters
@ingroup groupFilters
*/
/**
* @addtogroup BiquadCascadeDF1
* @{
@addtogroup BiquadCascadeDF1
@{
*/
/**
* @brief Processing function for the Q15 Biquad cascade filter.
* @param[in] *S points to an instance of the Q15 Biquad cascade structure.
* @param[in] *pSrc points to the block of input data.
* @param[out] *pDst points to the location where the output result is written.
* @param[in] blockSize number of samples to process per call.
* @return none.
*
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function is implemented using a 64-bit internal accumulator.
* Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
* The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
* There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
* The accumulator is then shifted by <code>postShift</code> bits to truncate the result to 1.15 format by discarding the low 16 bits.
* Finally, the result is saturated to 1.15 format.
*
* \par
* Refer to the function <code>arm_biquad_cascade_df1_fast_q15()</code> for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4.
@brief Processing function for the Q15 Biquad cascade filter.
@param[in] S points to an instance of the Q15 Biquad cascade structure
@param[in] pSrc points to the block of input data
@param[out] pDst points to the location where the output result is written
@param[in] blockSize number of samples to process
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 64-bit internal accumulator.
Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
The accumulator is then shifted by <code>postShift</code> bits to truncate the result to 1.15 format by discarding the low 16 bits.
Finally, the result is saturated to 1.15 format.
@remark
Refer to \ref arm_biquad_cascade_df1_fast_q15() for a faster but less precise implementation of this filter.
*/
void arm_biquad_cascade_df1_q15(
const arm_biquad_casd_df1_inst_q15 * S,
q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
{
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q15_t *pIn = pSrc; /* Source pointer */
q15_t *pOut = pDst; /* Destination pointer */
q31_t in; /* Temporary variable to hold input value */
q31_t out; /* Temporary variable to hold output value */
q31_t b0; /* Temporary variable to hold bo value */
q31_t b1, a1; /* Filter coefficients */
q31_t state_in, state_out; /* Filter state variables */
q31_t acc_l, acc_h;
q63_t acc; /* Accumulator */
int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */
q15_t *pState = S->pState; /* State pointer */
q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
uint32_t sample, stage = (uint32_t) S->numStages; /* Stage loop counter */
int32_t uShift = (32 - lShift);
const q15_t *pIn = pSrc; /* Source pointer */
q15_t *pOut = pDst; /* Destination pointer */
q31_t in; /* Temporary variable to hold input value */
q31_t out; /* Temporary variable to hold output value */
q31_t b0; /* Temporary variable to hold bo value */
q31_t b1, a1; /* Filter coefficients */
q31_t state_in, state_out; /* Filter state variables */
q31_t acc_l, acc_h;
q63_t acc; /* Accumulator */
q15_t *pState = S->pState; /* State pointer */
const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */
uint32_t sample, stage = (uint32_t) S->numStages; /* Stage loop counter */
int32_t uShift = (32 - lShift);
do
{
/* Read the b0 and 0 coefficients using SIMD */
b0 = *__SIMD32(pCoeffs)++;
b0 = read_q15x2_ia ((q15_t **) &pCoeffs);
/* Read the b1 and b2 coefficients using SIMD */
b1 = *__SIMD32(pCoeffs)++;
b1 = read_q15x2_ia ((q15_t **) &pCoeffs);
/* Read the a1 and a2 coefficients using SIMD */
a1 = *__SIMD32(pCoeffs)++;
a1 = read_q15x2_ia ((q15_t **) &pCoeffs);
/* Read the input state values from the state buffer: x[n-1], x[n-2] */
state_in = *__SIMD32(pState)++;
state_in = read_q15x2_ia (&pState);
/* Read the output state values from the state buffer: y[n-1], y[n-2] */
state_out = *__SIMD32(pState)--;
state_out = read_q15x2_da (&pState);
/* Apply loop unrolling and compute 2 output values simultaneously. */
/* The variable acc hold output values that are being computed:
@ -117,7 +112,7 @@ void arm_biquad_cascade_df1_q15(
{
/* Read the input */
in = *__SIMD32(pIn)++;
in = read_q15x2_ia ((q15_t **) &pIn);
/* out = b0 * x[n] + 0 * 0 */
out = __SMUAD(b0, in);
@ -141,23 +136,19 @@ void arm_biquad_cascade_df1_q15(
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
/* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
#ifndef ARM_MATH_BIG_ENDIAN
state_in = __PKHBT(in, state_in, 16);
state_in = __PKHBT(in, state_in, 16);
state_out = __PKHBT(out, state_out, 16);
#else
state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
state_out = __PKHBT(state_out >> 16, (out), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* out = b0 * x[n] + 0 * 0 */
@ -180,41 +171,30 @@ void arm_biquad_cascade_df1_q15(
out = __SSAT(out, 16);
/* Store the output in the destination buffer. */
#ifndef ARM_MATH_BIG_ENDIAN
*__SIMD32(pOut)++ = __PKHBT(state_out, out, 16);
write_q15x2_ia (&pOut, __PKHBT(state_out, out, 16));
#else
*__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16);
write_q15x2_ia (&pOut, __PKHBT(out, state_out >> 16, 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
/* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
#ifndef ARM_MATH_BIG_ENDIAN
state_in = __PKHBT(in >> 16, state_in, 16);
state_in = __PKHBT(in >> 16, state_in, 16);
state_out = __PKHBT(out, state_out, 16);
#else
state_in = __PKHBT(state_in >> 16, in, 16);
state_in = __PKHBT(state_in >> 16, in, 16);
state_out = __PKHBT(state_out >> 16, out, 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Decrement the loop counter */
/* Decrement loop counter */
sample--;
}
/* If the blockSize is not a multiple of 2, compute any remaining output samples here.
@ -226,15 +206,10 @@ void arm_biquad_cascade_df1_q15(
in = *pIn++;
/* out = b0 * x[n] + 0 * 0 */
#ifndef ARM_MATH_BIG_ENDIAN
out = __SMUAD(b0, in);
#else
out = __SMUADX(b0, in);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* acc = b1 * x[n-1] + b2 * x[n-2] + out */
@ -259,58 +234,49 @@ void arm_biquad_cascade_df1_q15(
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
/* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
#ifndef ARM_MATH_BIG_ENDIAN
state_in = __PKHBT(in, state_in, 16);
state_out = __PKHBT(out, state_out, 16);
#else
state_in = __PKHBT(state_in >> 16, in, 16);
state_out = __PKHBT(state_out >> 16, out, 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
}
/* The first stage goes from the input wire to the output wire. */
/* Subsequent numStages occur in-place in the output wire */
/* The first stage goes from the input wire to the output wire. */
/* Subsequent numStages occur in-place in the output wire */
pIn = pDst;
/* Reset the output pointer */
pOut = pDst;
/* Store the updated state variables back into the state array */
*__SIMD32(pState)++ = state_in;
*__SIMD32(pState)++ = state_out;
/* Store the updated state variables back into the state array */
write_q15x2_ia (&pState, state_in);
write_q15x2_ia (&pState, state_out);
/* Decrement the loop counter */
/* Decrement loop counter */
stage--;
} while (stage > 0U);
#else
/* Run the below code for Cortex-M0 */
q15_t *pIn = pSrc; /* Source pointer */
q15_t *pOut = pDst; /* Destination pointer */
q15_t b0, b1, b2, a1, a2; /* Filter coefficients */
q15_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */
q15_t Xn; /* temporary input */
q63_t acc; /* Accumulator */
int32_t shift = (15 - (int32_t) S->postShift); /* Post shift */
q15_t *pState = S->pState; /* State pointer */
q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
uint32_t sample, stage = (uint32_t) S->numStages; /* Stage loop counter */
const q15_t *pIn = pSrc; /* Source pointer */
q15_t *pOut = pDst; /* Destination pointer */
q15_t b0, b1, b2, a1, a2; /* Filter coefficients */
q15_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */
q15_t Xn; /* temporary input */
q63_t acc; /* Accumulator */
int32_t shift = (15 - (int32_t) S->postShift); /* Post shift */
q15_t *pState = S->pState; /* State pointer */
const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
uint32_t sample, stage = (uint32_t) S->numStages; /* Stage loop counter */
do
{
@ -328,7 +294,7 @@ void arm_biquad_cascade_df1_q15(
Yn1 = pState[2];
Yn2 = pState[3];
/* The variables acc holds the output value that is computed:
/* The variables acc holds the output value that is computed:
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
*/
@ -357,10 +323,10 @@ void arm_biquad_cascade_df1_q15(
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
Yn2 = Yn1;
@ -392,7 +358,6 @@ void arm_biquad_cascade_df1_q15(
}
/**
* @} end of BiquadCascadeDF1 group
@} end of BiquadCascadeDF1 group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df1_q31.c
* Description: Processing function for the Q31 Biquad cascade filter
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,59 +29,54 @@
#include "arm_math.h"
/**
* @ingroup groupFilters
@ingroup groupFilters
*/
/**
* @addtogroup BiquadCascadeDF1
* @{
@addtogroup BiquadCascadeDF1
@{
*/
/**
* @brief Processing function for the Q31 Biquad cascade filter.
* @param[in] *S points to an instance of the Q31 Biquad cascade structure.
* @param[in] *pSrc points to the block of input data.
* @param[out] *pDst points to the block of output data.
* @param[in] blockSize number of samples to process per call.
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function is implemented using an internal 64-bit accumulator.
* The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
* Thus, if the accumulator result overflows it wraps around rather than clip.
* In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
* After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to
* 1.31 format by discarding the low 32 bits.
*
* \par
* Refer to the function <code>arm_biquad_cascade_df1_fast_q31()</code> for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4.
@brief Processing function for the Q31 Biquad cascade filter.
@param[in] S points to an instance of the Q31 Biquad cascade structure
@param[in] pSrc points to the block of input data
@param[out] pDst points to the block of output data
@param[in] blockSize number of samples to process
@return none
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
Thus, if the accumulator result overflows it wraps around rather than clip.
In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to
1.31 format by discarding the low 32 bits.
@remark
Refer to \ref arm_biquad_cascade_df1_fast_q31() for a faster but less precise implementation of this filter.
*/
void arm_biquad_cascade_df1_q31(
const arm_biquad_casd_df1_inst_q31 * S,
q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
const q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
{
q63_t acc; /* accumulator */
uint32_t uShift = ((uint32_t) S->postShift + 1U);
uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
q31_t *pIn = pSrc; /* input pointer initialization */
q31_t *pOut = pDst; /* output pointer initialization */
q31_t *pState = S->pState; /* pState pointer initialization */
q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
q31_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */
q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
q31_t Xn; /* temporary input */
uint32_t sample, stage = S->numStages; /* loop counters */
const q31_t *pIn = pSrc; /* Source pointer */
q31_t *pOut = pDst; /* Destination pointer */
q31_t *pState = S->pState; /* pState pointer */
const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
q63_t acc; /* Accumulator */
q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
q31_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
q31_t Xn; /* Temporary input */
uint32_t uShift = ((uint32_t) S->postShift + 1U);
uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
uint32_t sample, stage = S->numStages; /* Loop counters */
#if defined (ARM_MATH_DSP)
q31_t acc_l, acc_h; /* temporary output variables */
/* Run the below code for Cortex-M4 and Cortex-M3 */
#if defined (ARM_MATH_LOOPUNROLL)
q31_t acc_l, acc_h; /* temporary output variables */
#endif
do
{
@ -92,301 +87,161 @@ void arm_biquad_cascade_df1_q31(
a1 = *pCoeffs++;
a2 = *pCoeffs++;
/* Reading the state values */
/* Reading the pState values */
Xn1 = pState[0];
Xn2 = pState[1];
Yn1 = pState[2];
Yn2 = pState[3];
#if defined (ARM_MATH_LOOPUNROLL)
/* Apply loop unrolling and compute 4 output values simultaneously. */
/* The variable acc hold output values that are being computed:
/* Variable acc hold output values that are being computed:
*
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
*/
/* Loop unrolling: Compute 4 outputs at a time */
sample = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (sample > 0U)
{
/* Read the input */
/* Read the first input */
Xn = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
acc = (q63_t) b0 *Xn;
/* acc += b1 * x[n-1] */
acc += (q63_t) b1 *Xn1;
/* acc += b[2] * x[n-2] */
acc += (q63_t) b2 *Xn2;
/* acc += a1 * y[n-1] */
acc += (q63_t) a1 *Yn1;
/* acc += a2 * y[n-2] */
acc += (q63_t) a2 *Yn2;
acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);
/* The result is converted to 1.31 , Yn2 variable is reused */
/* Calc lower part of acc */
acc_l = acc & 0xffffffff;
/* Calc upper part of acc */
acc_h = (acc >> 32) & 0xffffffff;
acc_l = (acc ) & 0xffffffff; /* Calc lower part of acc */
acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
/* Apply shift for lower part of acc and upper part of acc */
Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift;
/* Store the output in the destination buffer. */
/* Store output in destination buffer. */
*pOut++ = Yn2;
/* Read the second input */
Xn2 = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
acc = (q63_t) b0 *Xn2;
/* acc += b1 * x[n-1] */
acc += (q63_t) b1 *Xn;
/* acc += b[2] * x[n-2] */
acc += (q63_t) b2 *Xn1;
/* acc += a1 * y[n-1] */
acc += (q63_t) a1 *Yn2;
/* acc += a2 * y[n-2] */
acc += (q63_t) a2 *Yn1;
acc = ((q63_t) b0 * Xn2) + ((q63_t) b1 * Xn) + ((q63_t) b2 * Xn1) + ((q63_t) a1 * Yn2) + ((q63_t) a2 * Yn1);
/* The result is converted to 1.31, Yn1 variable is reused */
/* Calc lower part of acc */
acc_l = acc & 0xffffffff;
/* Calc upper part of acc */
acc_h = (acc >> 32) & 0xffffffff;
acc_l = (acc ) & 0xffffffff; /* Calc lower part of acc */
acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
/* Apply shift for lower part of acc and upper part of acc */
Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift;
/* Store the output in the destination buffer. */
/* Store output in destination buffer. */
*pOut++ = Yn1;
/* Read the third input */
/* Read the third input */
Xn1 = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
acc = (q63_t) b0 *Xn1;
/* acc += b1 * x[n-1] */
acc += (q63_t) b1 *Xn2;
/* acc += b[2] * x[n-2] */
acc += (q63_t) b2 *Xn;
/* acc += a1 * y[n-1] */
acc += (q63_t) a1 *Yn1;
/* acc += a2 * y[n-2] */
acc += (q63_t) a2 *Yn2;
acc = ((q63_t) b0 * Xn1) + ((q63_t) b1 * Xn2) + ((q63_t) b2 * Xn) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);
/* The result is converted to 1.31, Yn2 variable is reused */
/* Calc lower part of acc */
acc_l = acc & 0xffffffff;
/* Calc upper part of acc */
acc_h = (acc >> 32) & 0xffffffff;
acc_l = (acc ) & 0xffffffff; /* Calc lower part of acc */
acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
/* Apply shift for lower part of acc and upper part of acc */
Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift;
/* Store the output in the destination buffer. */
/* Store output in destination buffer. */
*pOut++ = Yn2;
/* Read the forth input */
Xn = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
acc = (q63_t) b0 *Xn;
/* acc += b1 * x[n-1] */
acc += (q63_t) b1 *Xn1;
/* acc += b[2] * x[n-2] */
acc += (q63_t) b2 *Xn2;
/* acc += a1 * y[n-1] */
acc += (q63_t) a1 *Yn2;
/* acc += a2 * y[n-2] */
acc += (q63_t) a2 *Yn1;
acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn2) + ((q63_t) a2 * Yn1);
/* The result is converted to 1.31, Yn1 variable is reused */
/* Calc lower part of acc */
acc_l = acc & 0xffffffff;
/* Calc upper part of acc */
acc_h = (acc >> 32) & 0xffffffff;
acc_l = (acc ) & 0xffffffff; /* Calc lower part of acc */
acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
/* Apply shift for lower part of acc and upper part of acc */
Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift;
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
/* Store the output in the destination buffer. */
/* Store output in destination buffer. */
*pOut++ = Yn1;
/* decrement the loop counter */
sample--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
sample = (blockSize & 0x3U);
while (sample > 0U)
{
/* Read the input */
Xn = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
acc = (q63_t) b0 *Xn;
/* acc += b1 * x[n-1] */
acc += (q63_t) b1 *Xn1;
/* acc += b[2] * x[n-2] */
acc += (q63_t) b2 *Xn2;
/* acc += a1 * y[n-1] */
acc += (q63_t) a1 *Yn1;
/* acc += a2 * y[n-2] */
acc += (q63_t) a2 *Yn2;
/* The result is converted to 1.31 */
acc = acc >> lShift;
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
Yn2 = Yn1;
Yn1 = (q31_t) acc;
/* Store the output in the destination buffer. */
*pOut++ = (q31_t) acc;
/* decrement the loop counter */
/* decrement loop counter */
sample--;
}
/* The first stage goes from the input buffer to the output buffer. */
/* Subsequent stages occur in-place in the output buffer */
pIn = pDst;
/* Reset to destination pointer */
pOut = pDst;
/* Store the updated state variables back into the pState array */
*pState++ = Xn1;
*pState++ = Xn2;
*pState++ = Yn1;
*pState++ = Yn2;
} while (--stage);
/* Loop unrolling: Compute remaining outputs */
sample = blockSize & 0x3U;
#else
/* Run the below code for Cortex-M0 */
do
{
/* Reading the coefficients */
b0 = *pCoeffs++;
b1 = *pCoeffs++;
b2 = *pCoeffs++;
a1 = *pCoeffs++;
a2 = *pCoeffs++;
/* Reading the state values */
Xn1 = pState[0];
Xn2 = pState[1];
Yn1 = pState[2];
Yn2 = pState[3];
/* The variables acc holds the output value that is computed:
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
*/
/* Initialize blkCnt with number of samples */
sample = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (sample > 0U)
{
/* Read the input */
Xn = *pIn++;
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
/* acc = b0 * x[n] */
acc = (q63_t) b0 *Xn;
/* acc += b1 * x[n-1] */
acc += (q63_t) b1 *Xn1;
/* acc += b[2] * x[n-2] */
acc += (q63_t) b2 *Xn2;
/* acc += a1 * y[n-1] */
acc += (q63_t) a1 *Yn1;
/* acc += a2 * y[n-2] */
acc += (q63_t) a2 *Yn2;
acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);
/* The result is converted to 1.31 */
acc = acc >> lShift;
/* Store output in destination buffer. */
*pOut++ = (q31_t) acc;
/* Every time after the output is computed state should be updated. */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
/* The states should be updated as: */
/* Xn2 = Xn1 */
/* Xn1 = Xn */
/* Yn2 = Yn1 */
/* Yn1 = acc */
Xn2 = Xn1;
Xn1 = Xn;
Yn2 = Yn1;
Yn1 = (q31_t) acc;
/* Store the output in the destination buffer. */
*pOut++ = (q31_t) acc;
/* decrement the loop counter */
/* decrement loop counter */
sample--;
}
/* The first stage goes from the input buffer to the output buffer. */
/* Subsequent stages occur in-place in the output buffer */
pIn = pDst;
/* Reset to destination pointer */
pOut = pDst;
/* Store the updated state variables back into the pState array */
/* Store the updated state variables back into the pState array */
*pState++ = Xn1;
*pState++ = Xn2;
*pState++ = Yn1;
*pState++ = Yn2;
} while (--stage);
/* The first stage goes from the input buffer to the output buffer. */
/* Subsequent numStages occur in-place in the output buffer */
pIn = pDst;
/* Reset output pointer */
pOut = pDst;
/* decrement loop counter */
stage--;
} while (stage > 0U);
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BiquadCascadeDF1 group
*/
@} end of BiquadCascadeDF1 group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df2T_f32.c
* Description: Processing function for floating-point transposed direct form II Biquad cascade filter
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,381 +29,141 @@
#include "arm_math.h"
/**
* @ingroup groupFilters
@ingroup groupFilters
*/
/**
* @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
*
* This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
* The filters are implemented as a cascade of second order Biquad sections.
* These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
* Only floating-point data is supported.
*
* This function operate on blocks of input and output data and each call to the function
* processes <code>blockSize</code> samples through the filter.
* <code>pSrc</code> points to the array of input data and
* <code>pDst</code> points to the array of output data.
* Both arrays contain <code>blockSize</code> values.
*
* \par Algorithm
* Each Biquad stage implements a second order filter using the difference equation:
* <pre>
* y[n] = b0 * x[n] + d1
* d1 = b1 * x[n] + a1 * y[n] + d2
* d2 = b2 * x[n] + a2 * y[n]
* </pre>
* where d1 and d2 represent the two state values.
*
* \par
* A Biquad filter using a transposed Direct Form II structure is shown below.
* \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"
* Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
* Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
* Pay careful attention to the sign of the feedback coefficients.
* Some design tools flip the sign of the feedback coefficients:
* <pre>
* y[n] = b0 * x[n] + d1;
* d1 = b1 * x[n] - a1 * y[n] + d2;
* d2 = b2 * x[n] - a2 * y[n];
* </pre>
* In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
*
* \par
* Higher order filters are realized as a cascade of second order sections.
* <code>numStages</code> refers to the number of second order stages used.
* For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
* A 9th order filter would be realized with <code>numStages=5</code> second order stages with the
* coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
*
* \par
* <code>pState</code> points to the state variable array.
* Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.
* The state variables are arranged in the <code>pState</code> array as:
* <pre>
* {d11, d12, d21, d22, ...}
* </pre>
* where <code>d1x</code> refers to the state variables for the first Biquad and
* <code>d2x</code> refers to the state variables for the second Biquad.
* The state array has a total length of <code>2*numStages</code> values.
* The state variables are updated after each block of data is processed; the coefficients are untouched.
*
* \par
* The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
* The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
* That is why the Direct Form I structure supports Q15 and Q31 data types.
* The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables <code>d1</code> and <code>d2</code>.
* Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
* The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
*
* \par Instance Structure
* The coefficients and state variables for a filter are stored together in an instance data structure.
* A separate instance structure must be defined for each filter.
* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
*
* \par Init Functions
* There is also an associated initialization function.
* The initialization function performs following operations:
* - Sets the values of the internal structure fields.
* - Zeros out the values in the state buffer.
* To do this manually without calling the init function, assign the follow subfields of the instance structure:
* numStages, pCoeffs, pState. Also set all of the values in pState to zero.
*
* \par
* Use of the initialization function is optional.
* However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
* To place an instance structure into a const data section, the instance structure must be manually initialized.
* Set the values in the state buffer to zeros before static initialization.
* For example, to statically initialize the instance structure use
* <pre>
* arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};
* </pre>
* where <code>numStages</code> is the number of Biquad stages in the filter; <code>pState</code> is the address of the state buffer.
* <code>pCoeffs</code> is the address of the coefficient buffer;
*
*/
@addtogroup BiquadCascadeDF2T
@{
*/
/**
* @addtogroup BiquadCascadeDF2T
* @{
*/
@brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
@param[in] S points to an instance of the filter data structure
@param[in] pSrc points to the block of input data
@param[out] pDst points to the block of output data
@param[in] blockSize number of samples to process
@return none
*/
/**
* @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
* @param[in] *S points to an instance of the filter data structure.
* @param[in] *pSrc points to the block of input data.
* @param[out] *pDst points to the block of output data
* @param[in] blockSize number of samples to process.
* @return none.
*/
#if defined(ARM_MATH_NEON)
LOW_OPTIMIZATION_ENTER
void arm_biquad_cascade_df2T_f32(
const arm_biquad_cascade_df2T_instance_f32 * S,
float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
const arm_biquad_cascade_df2T_instance_f32 * S,
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
float32_t *pIn = pSrc; /* source pointer */
const float32_t *pIn = pSrc; /* source pointer */
float32_t *pOut = pDst; /* destination pointer */
float32_t *pState = S->pState; /* State pointer */
float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */
const float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */
float32_t acc1; /* accumulator */
float32_t b0, b1, b2, a1, a2; /* Filter coefficients */
float32_t Xn1; /* temporary input */
float32_t d1, d2; /* state variables */
uint32_t sample, stage = S->numStages; /* loop counters */
uint32_t sample, stageCnt,stage = S->numStages; /* loop counters */
#if defined(ARM_MATH_CM7)
float32_t Xn2, Xn3, Xn4, Xn5, Xn6, Xn7, Xn8; /* Input State variables */
float32_t Xn9, Xn10, Xn11, Xn12, Xn13, Xn14, Xn15, Xn16;
float32_t acc2, acc3, acc4, acc5, acc6, acc7; /* Simulates the accumulator */
float32_t acc8, acc9, acc10, acc11, acc12, acc13, acc14, acc15, acc16;
float32_t Xn2, Xn3, Xn4; /* Input State variables */
float32_t acc2, acc3, acc4; /* accumulator */
do
float32_t p0, p1, p2, p3, p4, A1;
float32x4_t XnV, YnV;
float32x4x2_t dV;
float32x4_t zeroV = vdupq_n_f32(0.0);
float32x4_t t1,t2,t3,t4,b1V,b2V,a1V,a2V,s;
/* Loop unrolling. Compute 4 outputs at a time */
stageCnt = stage >> 2;
while (stageCnt > 0U)
{
/* Reading the coefficients */
b0 = pCoeffs[0];
b1 = pCoeffs[1];
b2 = pCoeffs[2];
a1 = pCoeffs[3];
/* Apply loop unrolling and compute 16 output values simultaneously. */
sample = blockSize >> 4U;
a2 = pCoeffs[4];
t1 = vld1q_f32(pCoeffs);
pCoeffs += 4;
/*Reading the state values */
d1 = pState[0];
d2 = pState[1];
t2 = vld1q_f32(pCoeffs);
pCoeffs += 4;
pCoeffs += 5U;
t3 = vld1q_f32(pCoeffs);
pCoeffs += 4;
t4 = vld1q_f32(pCoeffs);
pCoeffs += 4;
/* First part of the processing with loop unrolling. Compute 16 outputs at a time.
** a second loop below computes the remaining 1 to 15 samples. */
b1V = vld1q_f32(pCoeffs);
pCoeffs += 4;
b2V = vld1q_f32(pCoeffs);
pCoeffs += 4;
a1V = vld1q_f32(pCoeffs);
pCoeffs += 4;
a2V = vld1q_f32(pCoeffs);
pCoeffs += 4;
/* Reading the state values */
dV = vld2q_f32(pState);
sample = blockSize;
while (sample > 0U) {
/* y[n] = b0 * x[n] + d1 */
/* d1 = b1 * x[n] + a1 * y[n] + d2 */
/* d2 = b2 * x[n] + a2 * y[n] */
/* Read the first 2 inputs. 2 cycles */
Xn1 = pIn[0 ];
Xn2 = pIn[1 ];
XnV = vdupq_n_f32(*pIn++);
/* Sample 1. 5 cycles */
Xn3 = pIn[2 ];
acc1 = b0 * Xn1 + d1;
s = dV.val[0];
YnV = s;
Xn4 = pIn[3 ];
d1 = b1 * Xn1 + d2;
s = vextq_f32(zeroV,dV.val[0],3);
YnV = vmlaq_f32(YnV, t1, s);
Xn5 = pIn[4 ];
d2 = b2 * Xn1;
s = vextq_f32(zeroV,dV.val[0],2);
YnV = vmlaq_f32(YnV, t2, s);
Xn6 = pIn[5 ];
d1 += a1 * acc1;
s = vextq_f32(zeroV,dV.val[0],1);
YnV = vmlaq_f32(YnV, t3, s);
Xn7 = pIn[6 ];
d2 += a2 * acc1;
YnV = vmlaq_f32(YnV, t4, XnV);
/* Sample 2. 5 cycles */
Xn8 = pIn[7 ];
acc2 = b0 * Xn2 + d1;
s = vextq_f32(XnV,YnV,3);
Xn9 = pIn[8 ];
d1 = b1 * Xn2 + d2;
dV.val[0] = vmlaq_f32(dV.val[1], s, b1V);
dV.val[0] = vmlaq_f32(dV.val[0], YnV, a1V);
Xn10 = pIn[9 ];
d2 = b2 * Xn2;
dV.val[1] = vmulq_f32(s, b2V);
dV.val[1] = vmlaq_f32(dV.val[1], YnV, a2V);
Xn11 = pIn[10];
d1 += a1 * acc2;
Xn12 = pIn[11];
d2 += a2 * acc2;
/* Sample 3. 5 cycles */
Xn13 = pIn[12];
acc3 = b0 * Xn3 + d1;
Xn14 = pIn[13];
d1 = b1 * Xn3 + d2;
Xn15 = pIn[14];
d2 = b2 * Xn3;
Xn16 = pIn[15];
d1 += a1 * acc3;
pIn += 16;
d2 += a2 * acc3;
/* Sample 4. 5 cycles */
acc4 = b0 * Xn4 + d1;
d1 = b1 * Xn4 + d2;
d2 = b2 * Xn4;
d1 += a1 * acc4;
d2 += a2 * acc4;
/* Sample 5. 5 cycles */
acc5 = b0 * Xn5 + d1;
d1 = b1 * Xn5 + d2;
d2 = b2 * Xn5;
d1 += a1 * acc5;
d2 += a2 * acc5;
/* Sample 6. 5 cycles */
acc6 = b0 * Xn6 + d1;
d1 = b1 * Xn6 + d2;
d2 = b2 * Xn6;
d1 += a1 * acc6;
d2 += a2 * acc6;
/* Sample 7. 5 cycles */
acc7 = b0 * Xn7 + d1;
d1 = b1 * Xn7 + d2;
d2 = b2 * Xn7;
d1 += a1 * acc7;
d2 += a2 * acc7;
/* Sample 8. 5 cycles */
acc8 = b0 * Xn8 + d1;
d1 = b1 * Xn8 + d2;
d2 = b2 * Xn8;
d1 += a1 * acc8;
d2 += a2 * acc8;
/* Sample 9. 5 cycles */
acc9 = b0 * Xn9 + d1;
d1 = b1 * Xn9 + d2;
d2 = b2 * Xn9;
d1 += a1 * acc9;
d2 += a2 * acc9;
/* Sample 10. 5 cycles */
acc10 = b0 * Xn10 + d1;
d1 = b1 * Xn10 + d2;
d2 = b2 * Xn10;
d1 += a1 * acc10;
d2 += a2 * acc10;
/* Sample 11. 5 cycles */
acc11 = b0 * Xn11 + d1;
d1 = b1 * Xn11 + d2;
d2 = b2 * Xn11;
d1 += a1 * acc11;
d2 += a2 * acc11;
/* Sample 12. 5 cycles */
acc12 = b0 * Xn12 + d1;
d1 = b1 * Xn12 + d2;
d2 = b2 * Xn12;
d1 += a1 * acc12;
d2 += a2 * acc12;
/* Sample 13. 5 cycles */
acc13 = b0 * Xn13 + d1;
d1 = b1 * Xn13 + d2;
d2 = b2 * Xn13;
pOut[0 ] = acc1 ;
d1 += a1 * acc13;
pOut[1 ] = acc2 ;
d2 += a2 * acc13;
/* Sample 14. 5 cycles */
pOut[2 ] = acc3 ;
acc14 = b0 * Xn14 + d1;
pOut[3 ] = acc4 ;
d1 = b1 * Xn14 + d2;
pOut[4 ] = acc5 ;
d2 = b2 * Xn14;
pOut[5 ] = acc6 ;
d1 += a1 * acc14;
pOut[6 ] = acc7 ;
d2 += a2 * acc14;
/* Sample 15. 5 cycles */
pOut[7 ] = acc8 ;
pOut[8 ] = acc9 ;
acc15 = b0 * Xn15 + d1;
pOut[9 ] = acc10;
d1 = b1 * Xn15 + d2;
pOut[10] = acc11;
d2 = b2 * Xn15;
pOut[11] = acc12;
d1 += a1 * acc15;
pOut[12] = acc13;
d2 += a2 * acc15;
/* Sample 16. 5 cycles */
pOut[13] = acc14;
acc16 = b0 * Xn16 + d1;
pOut[14] = acc15;
d1 = b1 * Xn16 + d2;
pOut[15] = acc16;
d2 = b2 * Xn16;
*pOut++ = YnV[3];
sample--;
d1 += a1 * acc16;
pOut += 16;
d2 += a2 * acc16;
}
sample = blockSize & 0xFu;
while (sample > 0U) {
Xn1 = *pIn;
acc1 = b0 * Xn1 + d1;
pIn++;
d1 = b1 * Xn1 + d2;
*pOut = acc1;
d2 = b2 * Xn1;
pOut++;
d1 += a1 * acc1;
sample--;
d2 += a2 * acc1;
}
/* Store the updated state variables back into the state array */
pState[0] = d1;
vst2q_f32(pState,dV);
pState += 8;
/* The current stage input is given as the output to the next stage */
pIn = pDst;
pState[1] = d2;
/* decrement the loop counter */
stage--;
pState += 2U;
/*Reset the output working pointer */
pOut = pDst;
} while (stage > 0U);
/* decrement the loop counter */
stageCnt--;
#elif defined(ARM_MATH_CM0_FAMILY)
}
/* Run the below code for Cortex-M0 */
do
/* Tail */
stageCnt = stage & 3;
while (stageCnt > 0U)
{
/* Reading the coefficients */
b0 = *pCoeffs++;
@ -416,7 +176,6 @@ uint32_t blockSize)
d1 = pState[0];
d2 = pState[1];
sample = blockSize;
while (sample > 0U)
@ -452,139 +211,313 @@ uint32_t blockSize)
pOut = pDst;
/* decrement the loop counter */
stage--;
stageCnt--;
}
}
#else
LOW_OPTIMIZATION_ENTER
void arm_biquad_cascade_df2T_f32(
const arm_biquad_cascade_df2T_instance_f32 * S,
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
const float32_t *pIn = pSrc; /* Source pointer */
float32_t *pOut = pDst; /* Destination pointer */
float32_t *pState = S->pState; /* State pointer */
const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float32_t acc1; /* Accumulator */
float32_t b0, b1, b2, a1, a2; /* Filter coefficients */
float32_t Xn1; /* Temporary input */
float32_t d1, d2; /* State variables */
uint32_t sample, stage = S->numStages; /* Loop counters */
} while (stage > 0U);
do
{
/* Reading the coefficients */
b0 = pCoeffs[0];
b1 = pCoeffs[1];
b2 = pCoeffs[2];
a1 = pCoeffs[3];
a2 = pCoeffs[4];
/* Reading the state values */
d1 = pState[0];
d2 = pState[1];
pCoeffs += 5U;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 16 outputs at a time */
sample = blockSize >> 4U;
while (sample > 0U) {
/* y[n] = b0 * x[n] + d1 */
/* d1 = b1 * x[n] + a1 * y[n] + d2 */
/* d2 = b2 * x[n] + a2 * y[n] */
/* 1 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 2 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 3 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 4 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 5 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 6 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 7 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 8 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 9 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 10 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 11 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 12 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 13 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 14 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 15 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 16 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* decrement loop counter */
sample--;
}
/* Loop unrolling: Compute remaining outputs */
sample = blockSize & 0xFU;
#else
float32_t Xn2, Xn3, Xn4; /* Input State variables */
float32_t acc2, acc3, acc4; /* accumulator */
/* Initialize blkCnt with number of samples */
sample = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
float32_t p0, p1, p2, p3, p4, A1;
/* Run the below code for Cortex-M4 and Cortex-M3 */
do
{
/* Reading the coefficients */
b0 = *pCoeffs++;
b1 = *pCoeffs++;
b2 = *pCoeffs++;
a1 = *pCoeffs++;
a2 = *pCoeffs++;
/*Reading the state values */
d1 = pState[0];
d2 = pState[1];
/* Apply loop unrolling and compute 4 output values simultaneously. */
sample = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (sample > 0U) {
Xn1 = *pIn++;
/* y[n] = b0 * x[n] + d1 */
/* d1 = b1 * x[n] + a1 * y[n] + d2 */
/* d2 = b2 * x[n] + a2 * y[n] */
acc1 = b0 * Xn1 + d1;
/* Read the four inputs */
Xn1 = pIn[0];
Xn2 = pIn[1];
Xn3 = pIn[2];
Xn4 = pIn[3];
pIn += 4;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
p0 = b0 * Xn1;
p1 = b1 * Xn1;
acc1 = p0 + d1;
p0 = b0 * Xn2;
p3 = a1 * acc1;
p2 = b2 * Xn1;
A1 = p1 + p3;
p4 = a2 * acc1;
d1 = A1 + d2;
d2 = p2 + p4;
d2 = b2 * Xn1;
d2 += a2 * acc1;
p1 = b1 * Xn2;
acc2 = p0 + d1;
p0 = b0 * Xn3;
p3 = a1 * acc2;
p2 = b2 * Xn2;
A1 = p1 + p3;
p4 = a2 * acc2;
d1 = A1 + d2;
d2 = p2 + p4;
*pOut++ = acc1;
p1 = b1 * Xn3;
acc3 = p0 + d1;
p0 = b0 * Xn4;
p3 = a1 * acc3;
p2 = b2 * Xn3;
A1 = p1 + p3;
p4 = a2 * acc3;
d1 = A1 + d2;
d2 = p2 + p4;
acc4 = p0 + d1;
p1 = b1 * Xn4;
p3 = a1 * acc4;
p2 = b2 * Xn4;
A1 = p1 + p3;
p4 = a2 * acc4;
d1 = A1 + d2;
d2 = p2 + p4;
pOut[0] = acc1;
pOut[1] = acc2;
pOut[2] = acc3;
pOut[3] = acc4;
pOut += 4;
sample--;
}
sample = blockSize & 0x3U;
while (sample > 0U) {
Xn1 = *pIn++;
p0 = b0 * Xn1;
p1 = b1 * Xn1;
acc1 = p0 + d1;
p3 = a1 * acc1;
p2 = b2 * Xn1;
A1 = p1 + p3;
p4 = a2 * acc1;
d1 = A1 + d2;
d2 = p2 + p4;
*pOut++ = acc1;
sample--;
/* decrement loop counter */
sample--;
}
/* Store the updated state variables back into the state array */
*pState++ = d1;
*pState++ = d2;
pState[0] = d1;
pState[1] = d2;
pState += 2U;
/* The current stage input is given as the output to the next stage */
pIn = pDst;
/*Reset the output working pointer */
/* Reset the output working pointer */
pOut = pDst;
/* decrement the loop counter */
/* decrement loop counter */
stage--;
} while (stage > 0U);
#endif
}
LOW_OPTIMIZATION_EXIT
#endif /* #if defined(ARM_MATH_NEON) */
/**
* @} end of BiquadCascadeDF2T group
*/
@} end of BiquadCascadeDF2T group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df2T_f64.c
* Description: Processing function for floating-point transposed direct form II Biquad cascade filter
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,139 +29,128 @@
#include "arm_math.h"
/**
* @ingroup groupFilters
@ingroup groupFilters
*/
/**
* @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
*
* This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
* The filters are implemented as a cascade of second order Biquad sections.
* These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
* Only floating-point data is supported.
*
* This function operate on blocks of input and output data and each call to the function
* processes <code>blockSize</code> samples through the filter.
* <code>pSrc</code> points to the array of input data and
* <code>pDst</code> points to the array of output data.
* Both arrays contain <code>blockSize</code> values.
*
* \par Algorithm
* Each Biquad stage implements a second order filter using the difference equation:
* <pre>
* y[n] = b0 * x[n] + d1
* d1 = b1 * x[n] + a1 * y[n] + d2
* d2 = b2 * x[n] + a2 * y[n]
* </pre>
* where d1 and d2 represent the two state values.
*
* \par
* A Biquad filter using a transposed Direct Form II structure is shown below.
* \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"
* Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
* Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
* Pay careful attention to the sign of the feedback coefficients.
* Some design tools flip the sign of the feedback coefficients:
* <pre>
* y[n] = b0 * x[n] + d1;
* d1 = b1 * x[n] - a1 * y[n] + d2;
* d2 = b2 * x[n] - a2 * y[n];
* </pre>
* In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
*
* \par
* Higher order filters are realized as a cascade of second order sections.
* <code>numStages</code> refers to the number of second order stages used.
* For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
* A 9th order filter would be realized with <code>numStages=5</code> second order stages with the
* coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
*
* \par
* <code>pState</code> points to the state variable array.
* Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.
* The state variables are arranged in the <code>pState</code> array as:
* <pre>
* {d11, d12, d21, d22, ...}
* </pre>
* where <code>d1x</code> refers to the state variables for the first Biquad and
* <code>d2x</code> refers to the state variables for the second Biquad.
* The state array has a total length of <code>2*numStages</code> values.
* The state variables are updated after each block of data is processed; the coefficients are untouched.
*
* \par
* The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
* The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
* That is why the Direct Form I structure supports Q15 and Q31 data types.
* The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables <code>d1</code> and <code>d2</code>.
* Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
* The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
*
* \par Instance Structure
* The coefficients and state variables for a filter are stored together in an instance data structure.
* A separate instance structure must be defined for each filter.
* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
*
* \par Init Functions
* There is also an associated initialization function.
* The initialization function performs following operations:
* - Sets the values of the internal structure fields.
* - Zeros out the values in the state buffer.
* To do this manually without calling the init function, assign the follow subfields of the instance structure:
* numStages, pCoeffs, pState. Also set all of the values in pState to zero.
*
* \par
* Use of the initialization function is optional.
* However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
* To place an instance structure into a const data section, the instance structure must be manually initialized.
* Set the values in the state buffer to zeros before static initialization.
* For example, to statically initialize the instance structure use
* <pre>
* arm_biquad_cascade_df2T_instance_f64 S1 = {numStages, pState, pCoeffs};
* </pre>
* where <code>numStages</code> is the number of Biquad stages in the filter; <code>pState</code> is the address of the state buffer.
* <code>pCoeffs</code> is the address of the coefficient buffer;
*
@defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
The filters are implemented as a cascade of second order Biquad sections.
These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
Only floating-point data is supported.
This function operate on blocks of input and output data and each call to the function
processes <code>blockSize</code> samples through the filter.
<code>pSrc</code> points to the array of input data and
<code>pDst</code> points to the array of output data.
Both arrays contain <code>blockSize</code> values.
@par Algorithm
Each Biquad stage implements a second order filter using the difference equation:
<pre>
y[n] = b0 * x[n] + d1
d1 = b1 * x[n] + a1 * y[n] + d2
d2 = b2 * x[n] + a2 * y[n]
</pre>
where d1 and d2 represent the two state values.
@par
A Biquad filter using a transposed Direct Form II structure is shown below.
\image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"
Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
Pay careful attention to the sign of the feedback coefficients.
Some design tools flip the sign of the feedback coefficients:
<pre>
y[n] = b0 * x[n] + d1;
d1 = b1 * x[n] - a1 * y[n] + d2;
d2 = b2 * x[n] - a2 * y[n];
</pre>
In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
@par
Higher order filters are realized as a cascade of second order sections.
<code>numStages</code> refers to the number of second order stages used.
For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
A 9th order filter would be realized with <code>numStages=5</code> second order stages with the
coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
@par
<code>pState</code> points to the state variable array.
Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.
The state variables are arranged in the <code>pState</code> array as:
<pre>
{d11, d12, d21, d22, ...}
</pre>
where <code>d1x</code> refers to the state variables for the first Biquad and
<code>d2x</code> refers to the state variables for the second Biquad.
The state array has a total length of <code>2*numStages</code> values.
The state variables are updated after each block of data is processed; the coefficients are untouched.
@par
The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
That is why the Direct Form I structure supports Q15 and Q31 data types.
The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables <code>d1</code> and <code>d2</code>.
Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
@par Instance Structure
The coefficients and state variables for a filter are stored together in an instance data structure.
A separate instance structure must be defined for each filter.
Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
@par Init Functions
There is also an associated initialization function.
The initialization function performs following operations:
- Sets the values of the internal structure fields.
- Zeros out the values in the state buffer.
To do this manually without calling the init function, assign the follow subfields of the instance structure:
numStages, pCoeffs, pState. Also set all of the values in pState to zero.
@par
Use of the initialization function is optional.
However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
To place an instance structure into a const data section, the instance structure must be manually initialized.
Set the values in the state buffer to zeros before static initialization.
For example, to statically initialize the instance structure use
<pre>
arm_biquad_cascade_df2T_instance_f64 S1 = {numStages, pState, pCoeffs};
arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};
</pre>
where <code>numStages</code> is the number of Biquad stages in the filter;
<code>pState</code> is the address of the state buffer.
<code>pCoeffs</code> is the address of the coefficient buffer;
*/
/**
* @addtogroup BiquadCascadeDF2T
* @{
*/
@addtogroup BiquadCascadeDF2T
@{
*/
/**
* @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
* @param[in] *S points to an instance of the filter data structure.
* @param[in] *pSrc points to the block of input data.
* @param[out] *pDst points to the block of output data
* @param[in] blockSize number of samples to process.
* @return none.
*/
@brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
@param[in] S points to an instance of the filter data structure
@param[in] pSrc points to the block of input data
@param[out] pDst points to the block of output data
@param[in] blockSize number of samples to process
@return none
*/
LOW_OPTIMIZATION_ENTER
void arm_biquad_cascade_df2T_f64(
const arm_biquad_cascade_df2T_instance_f64 * S,
float64_t * pSrc,
float64_t * pDst,
uint32_t blockSize)
const arm_biquad_cascade_df2T_instance_f64 * S,
float64_t * pSrc,
float64_t * pDst,
uint32_t blockSize)
{
float64_t *pIn = pSrc; /* source pointer */
float64_t *pOut = pDst; /* destination pointer */
float64_t *pState = S->pState; /* State pointer */
float64_t *pCoeffs = S->pCoeffs; /* coefficient pointer */
float64_t acc1; /* accumulator */
float64_t b0, b1, b2, a1, a2; /* Filter coefficients */
float64_t Xn1; /* temporary input */
float64_t d1, d2; /* state variables */
uint32_t sample, stage = S->numStages; /* loop counters */
float64_t *pIn = pSrc; /* Source pointer */
float64_t *pOut = pDst; /* Destination pointer */
float64_t *pState = S->pState; /* State pointer */
float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float64_t acc1; /* Accumulator */
float64_t b0, b1, b2, a1, a2; /* Filter coefficients */
float64_t Xn1; /* Temporary input */
float64_t d1, d2; /* State variables */
uint32_t sample, stage = S->numStages; /* Loop counters */
#if defined(ARM_MATH_CM7)
float64_t Xn2, Xn3, Xn4, Xn5, Xn6, Xn7, Xn8; /* Input State variables */
float64_t Xn9, Xn10, Xn11, Xn12, Xn13, Xn14, Xn15, Xn16;
float64_t acc2, acc3, acc4, acc5, acc6, acc7; /* Simulates the accumulator */
float64_t acc8, acc9, acc10, acc11, acc12, acc13, acc14, acc15, acc16;
do
{
@ -170,421 +159,285 @@ uint32_t blockSize)
b1 = pCoeffs[1];
b2 = pCoeffs[2];
a1 = pCoeffs[3];
/* Apply loop unrolling and compute 16 output values simultaneously. */
sample = blockSize >> 4U;
a2 = pCoeffs[4];
/*Reading the state values */
/* Reading the state values */
d1 = pState[0];
d2 = pState[1];
pCoeffs += 5U;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 16 outputs at a time */
sample = blockSize >> 4U;
/* First part of the processing with loop unrolling. Compute 16 outputs at a time.
** a second loop below computes the remaining 1 to 15 samples. */
while (sample > 0U) {
/* y[n] = b0 * x[n] + d1 */
/* d1 = b1 * x[n] + a1 * y[n] + d2 */
/* d2 = b2 * x[n] + a2 * y[n] */
/* Read the first 2 inputs. 2 cycles */
Xn1 = pIn[0 ];
Xn2 = pIn[1 ];
/* 1 */
Xn1 = *pIn++;
/* Sample 1. 5 cycles */
Xn3 = pIn[2 ];
acc1 = b0 * Xn1 + d1;
Xn4 = pIn[3 ];
d1 = b1 * Xn1 + d2;
Xn5 = pIn[4 ];
d2 = b2 * Xn1;
Xn6 = pIn[5 ];
d1 += a1 * acc1;
Xn7 = pIn[6 ];
d2 = b2 * Xn1;
d2 += a2 * acc1;
/* Sample 2. 5 cycles */
Xn8 = pIn[7 ];
acc2 = b0 * Xn2 + d1;
*pOut++ = acc1;
Xn9 = pIn[8 ];
d1 = b1 * Xn2 + d2;
Xn10 = pIn[9 ];
d2 = b2 * Xn2;
/* 2 */
Xn1 = *pIn++;
Xn11 = pIn[10];
d1 += a1 * acc2;
acc1 = b0 * Xn1 + d1;
Xn12 = pIn[11];
d2 += a2 * acc2;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
/* Sample 3. 5 cycles */
Xn13 = pIn[12];
acc3 = b0 * Xn3 + d1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
Xn14 = pIn[13];
d1 = b1 * Xn3 + d2;
*pOut++ = acc1;
Xn15 = pIn[14];
d2 = b2 * Xn3;
/* 3 */
Xn1 = *pIn++;
Xn16 = pIn[15];
d1 += a1 * acc3;
acc1 = b0 * Xn1 + d1;
pIn += 16;
d2 += a2 * acc3;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
/* Sample 4. 5 cycles */
acc4 = b0 * Xn4 + d1;
d1 = b1 * Xn4 + d2;
d2 = b2 * Xn4;
d1 += a1 * acc4;
d2 += a2 * acc4;
d2 = b2 * Xn1;
d2 += a2 * acc1;
/* Sample 5. 5 cycles */
acc5 = b0 * Xn5 + d1;
d1 = b1 * Xn5 + d2;
d2 = b2 * Xn5;
d1 += a1 * acc5;
d2 += a2 * acc5;
*pOut++ = acc1;
/* Sample 6. 5 cycles */
acc6 = b0 * Xn6 + d1;
d1 = b1 * Xn6 + d2;
d2 = b2 * Xn6;
d1 += a1 * acc6;
d2 += a2 * acc6;
/* 4 */
Xn1 = *pIn++;
/* Sample 7. 5 cycles */
acc7 = b0 * Xn7 + d1;
d1 = b1 * Xn7 + d2;
d2 = b2 * Xn7;
d1 += a1 * acc7;
d2 += a2 * acc7;
acc1 = b0 * Xn1 + d1;
/* Sample 8. 5 cycles */
acc8 = b0 * Xn8 + d1;
d1 = b1 * Xn8 + d2;
d2 = b2 * Xn8;
d1 += a1 * acc8;
d2 += a2 * acc8;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
/* Sample 9. 5 cycles */
acc9 = b0 * Xn9 + d1;
d1 = b1 * Xn9 + d2;
d2 = b2 * Xn9;
d1 += a1 * acc9;
d2 += a2 * acc9;
d2 = b2 * Xn1;
d2 += a2 * acc1;
/* Sample 10. 5 cycles */
acc10 = b0 * Xn10 + d1;
d1 = b1 * Xn10 + d2;
d2 = b2 * Xn10;
d1 += a1 * acc10;
d2 += a2 * acc10;
*pOut++ = acc1;
/* Sample 11. 5 cycles */
acc11 = b0 * Xn11 + d1;
d1 = b1 * Xn11 + d2;
d2 = b2 * Xn11;
d1 += a1 * acc11;
d2 += a2 * acc11;
/* 5 */
Xn1 = *pIn++;
/* Sample 12. 5 cycles */
acc12 = b0 * Xn12 + d1;
d1 = b1 * Xn12 + d2;
d2 = b2 * Xn12;
d1 += a1 * acc12;
d2 += a2 * acc12;
acc1 = b0 * Xn1 + d1;
/* Sample 13. 5 cycles */
acc13 = b0 * Xn13 + d1;
d1 = b1 * Xn13 + d2;
d2 = b2 * Xn13;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
pOut[0 ] = acc1 ;
d1 += a1 * acc13;
d2 = b2 * Xn1;
d2 += a2 * acc1;
pOut[1 ] = acc2 ;
d2 += a2 * acc13;
*pOut++ = acc1;
/* Sample 14. 5 cycles */
pOut[2 ] = acc3 ;
acc14 = b0 * Xn14 + d1;
/* 6 */
Xn1 = *pIn++;
pOut[3 ] = acc4 ;
d1 = b1 * Xn14 + d2;
acc1 = b0 * Xn1 + d1;
pOut[4 ] = acc5 ;
d2 = b2 * Xn14;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
pOut[5 ] = acc6 ;
d1 += a1 * acc14;
d2 = b2 * Xn1;
d2 += a2 * acc1;
pOut[6 ] = acc7 ;
d2 += a2 * acc14;
*pOut++ = acc1;
/* Sample 15. 5 cycles */
pOut[7 ] = acc8 ;
pOut[8 ] = acc9 ;
acc15 = b0 * Xn15 + d1;
/* 7 */
Xn1 = *pIn++;
pOut[9 ] = acc10;
d1 = b1 * Xn15 + d2;
acc1 = b0 * Xn1 + d1;
pOut[10] = acc11;
d2 = b2 * Xn15;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
pOut[11] = acc12;
d1 += a1 * acc15;
d2 = b2 * Xn1;
d2 += a2 * acc1;
pOut[12] = acc13;
d2 += a2 * acc15;
*pOut++ = acc1;
/* Sample 16. 5 cycles */
pOut[13] = acc14;
acc16 = b0 * Xn16 + d1;
/* 8 */
Xn1 = *pIn++;
pOut[14] = acc15;
d1 = b1 * Xn16 + d2;
acc1 = b0 * Xn1 + d1;
pOut[15] = acc16;
d2 = b2 * Xn16;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 9 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 10 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 11 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 12 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 13 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 14 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 15 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* 16 */
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
d1 = b1 * Xn1 + d2;
d1 += a1 * acc1;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* decrement loop counter */
sample--;
d1 += a1 * acc16;
pOut += 16;
d2 += a2 * acc16;
}
sample = blockSize & 0xFu;
/* Loop unrolling: Compute remaining outputs */
sample = blockSize & 0xFU;
#else
/* Initialize blkCnt with number of samples */
sample = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (sample > 0U) {
Xn1 = *pIn;
Xn1 = *pIn++;
acc1 = b0 * Xn1 + d1;
pIn++;
d1 = b1 * Xn1 + d2;
*pOut = acc1;
d2 = b2 * Xn1;
pOut++;
d1 += a1 * acc1;
sample--;
d2 = b2 * Xn1;
d2 += a2 * acc1;
*pOut++ = acc1;
/* decrement loop counter */
sample--;
}
/* Store the updated state variables back into the state array */
pState[0] = d1;
/* The current stage input is given as the output to the next stage */
pIn = pDst;
pState[1] = d2;
/* decrement the loop counter */
stage--;
pState += 2U;
/*Reset the output working pointer */
pOut = pDst;
} while (stage > 0U);
#elif defined(ARM_MATH_CM0_FAMILY)
/* Run the below code for Cortex-M0 */
do
{
/* Reading the coefficients */
b0 = *pCoeffs++;
b1 = *pCoeffs++;
b2 = *pCoeffs++;
a1 = *pCoeffs++;
a2 = *pCoeffs++;
/*Reading the state values */
d1 = pState[0];
d2 = pState[1];
sample = blockSize;
while (sample > 0U)
{
/* Read the input */
Xn1 = *pIn++;
/* y[n] = b0 * x[n] + d1 */
acc1 = (b0 * Xn1) + d1;
/* Store the result in the accumulator in the destination buffer. */
*pOut++ = acc1;
/* Every time after the output is computed state should be updated. */
/* d1 = b1 * x[n] + a1 * y[n] + d2 */
d1 = ((b1 * Xn1) + (a1 * acc1)) + d2;
/* d2 = b2 * x[n] + a2 * y[n] */
d2 = (b2 * Xn1) + (a2 * acc1);
/* decrement the loop counter */
sample--;
}
/* Store the updated state variables back into the state array */
*pState++ = d1;
*pState++ = d2;
/* The current stage input is given as the output to the next stage */
pIn = pDst;
/*Reset the output working pointer */
/* Reset the output working pointer */
pOut = pDst;
/* decrement the loop counter */
/* decrement loop counter */
stage--;
} while (stage > 0U);
#else
float64_t Xn2, Xn3, Xn4; /* Input State variables */
float64_t acc2, acc3, acc4; /* accumulator */
float64_t p0, p1, p2, p3, p4, A1;
/* Run the below code for Cortex-M4 and Cortex-M3 */
do
{
/* Reading the coefficients */
b0 = *pCoeffs++;
b1 = *pCoeffs++;
b2 = *pCoeffs++;
a1 = *pCoeffs++;
a2 = *pCoeffs++;
/*Reading the state values */
d1 = pState[0];
d2 = pState[1];
/* Apply loop unrolling and compute 4 output values simultaneously. */
sample = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (sample > 0U) {
/* y[n] = b0 * x[n] + d1 */
/* d1 = b1 * x[n] + a1 * y[n] + d2 */
/* d2 = b2 * x[n] + a2 * y[n] */
/* Read the four inputs */
Xn1 = pIn[0];
Xn2 = pIn[1];
Xn3 = pIn[2];
Xn4 = pIn[3];
pIn += 4;
p0 = b0 * Xn1;
p1 = b1 * Xn1;
acc1 = p0 + d1;
p0 = b0 * Xn2;
p3 = a1 * acc1;
p2 = b2 * Xn1;
A1 = p1 + p3;
p4 = a2 * acc1;
d1 = A1 + d2;
d2 = p2 + p4;
p1 = b1 * Xn2;
acc2 = p0 + d1;
p0 = b0 * Xn3;
p3 = a1 * acc2;
p2 = b2 * Xn2;
A1 = p1 + p3;
p4 = a2 * acc2;
d1 = A1 + d2;
d2 = p2 + p4;
p1 = b1 * Xn3;
acc3 = p0 + d1;
p0 = b0 * Xn4;
p3 = a1 * acc3;
p2 = b2 * Xn3;
A1 = p1 + p3;
p4 = a2 * acc3;
d1 = A1 + d2;
d2 = p2 + p4;
acc4 = p0 + d1;
p1 = b1 * Xn4;
p3 = a1 * acc4;
p2 = b2 * Xn4;
A1 = p1 + p3;
p4 = a2 * acc4;
d1 = A1 + d2;
d2 = p2 + p4;
pOut[0] = acc1;
pOut[1] = acc2;
pOut[2] = acc3;
pOut[3] = acc4;
pOut += 4;
sample--;
}
sample = blockSize & 0x3U;
while (sample > 0U) {
Xn1 = *pIn++;
p0 = b0 * Xn1;
p1 = b1 * Xn1;
acc1 = p0 + d1;
p3 = a1 * acc1;
p2 = b2 * Xn1;
A1 = p1 + p3;
p4 = a2 * acc1;
d1 = A1 + d2;
d2 = p2 + p4;
*pOut++ = acc1;
sample--;
}
/* Store the updated state variables back into the state array */
*pState++ = d1;
*pState++ = d2;
/* The current stage input is given as the output to the next stage */
pIn = pDst;
/*Reset the output working pointer */
pOut = pDst;
/* decrement the loop counter */
stage--;
} while (stage > 0U);
#endif
}
LOW_OPTIMIZATION_EXIT
/**
* @} end of BiquadCascadeDF2T group
*/
@} end of BiquadCascadeDF2T group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df2T_init_f32.c
* Description: Initialization function for floating-point transposed direct form II Biquad cascade filter
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,47 +29,169 @@
#include "arm_math.h"
/**
* @ingroup groupFilters
@ingroup groupFilters
*/
/**
* @addtogroup BiquadCascadeDF2T
* @{
@addtogroup BiquadCascadeDF2T
@{
*/
/**
* @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter.
* @param[in,out] *S points to an instance of the filter data structure.
* @param[in] numStages number of 2nd order stages in the filter.
* @param[in] *pCoeffs points to the filter coefficients.
* @param[in] *pState points to the state buffer.
* @return none
*
* <b>Coefficient and State Ordering:</b>
* \par
* The coefficients are stored in the array <code>pCoeffs</code> in the following order:
* <pre>
* {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
* </pre>
*
* \par
* where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
* <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
* and so on. The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
*
* \par
* The <code>pState</code> is a pointer to state array.
* Each Biquad stage has 2 state variables <code>d1,</code> and <code>d2</code>.
* The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
* The state array has a total length of <code>2*numStages</code> values.
* The state variables are updated after each block of data is processed; the coefficients are untouched.
@brief Initialization function for the floating-point transposed direct form II Biquad cascade filter.
@param[in,out] S points to an instance of the filter data structure.
@param[in] numStages number of 2nd order stages in the filter.
@param[in] pCoeffs points to the filter coefficients.
@param[in] pState points to the state buffer.
@return none
@par Coefficient and State Ordering
The coefficients are stored in the array <code>pCoeffs</code> in the following order
in the not Neon version.
<pre>
{b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
</pre>
@par
where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
<code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
and so on. The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
For Neon version, this array is bigger. If numstages = 4x + y, then the array has size:
32*x + 5*y
and it must be initialized using the function
arm_biquad_cascade_df2T_compute_coefs_f32 which is taking the
standard array coefficient as parameters.
But, an array of 8*numstages is a good approximation.
Then, the initialization can be done with:
<pre>
arm_biquad_cascade_df2T_init_f32(&SNeon, nbCascade, neonCoefs, stateNeon);
arm_biquad_cascade_df2T_compute_coefs_f32(&SNeon,nbCascade,coefs);
</pre>
@par In this example, neonCoefs is a bigger array of size 8 * numStages.
coefs is the standard array:
<pre>
{b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
</pre>
@par
The <code>pState</code> is a pointer to state array.
Each Biquad stage has 2 state variables <code>d1,</code> and <code>d2</code>.
The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
The state array has a total length of <code>2*numStages</code> values.
The state variables are updated after each block of data is processed; the coefficients are untouched.
*/
void arm_biquad_cascade_df2T_init_f32(
#if defined(ARM_MATH_NEON)
/*
Must be called after initializing the biquad instance.
pCoeffs has size 5 * nbCascade
Whereas the pCoeffs for the init has size (4*4 + 4*4)* nbCascade
So this pCoeffs is the one which would be used for the not Neon version.
The pCoeffs passed in init is bigger than the one for the not Neon version.
*/
void arm_biquad_cascade_df2T_compute_coefs_f32(
arm_biquad_cascade_df2T_instance_f32 * S,
uint8_t numStages,
float32_t * pCoeffs,
float32_t * pState)
float32_t * pCoeffs)
{
uint8_t cnt;
float32_t *pDstCoeffs;
float32_t b0[4],b1[4],b2[4],a1[4],a2[4];
pDstCoeffs = S->pCoeffs;
cnt = numStages >> 2;
while(cnt > 0)
{
for(int i=0;i<4;i++)
{
b0[i] = pCoeffs[0];
b1[i] = pCoeffs[1];
b2[i] = pCoeffs[2];
a1[i] = pCoeffs[3];
a2[i] = pCoeffs[4];
pCoeffs += 5;
}
/* Vec 1 */
*pDstCoeffs++ = 0;
*pDstCoeffs++ = b0[1];
*pDstCoeffs++ = b0[2];
*pDstCoeffs++ = b0[3];
/* Vec 2 */
*pDstCoeffs++ = 0;
*pDstCoeffs++ = 0;
*pDstCoeffs++ = b0[1] * b0[2];
*pDstCoeffs++ = b0[2] * b0[3];
/* Vec 3 */
*pDstCoeffs++ = 0;
*pDstCoeffs++ = 0;
*pDstCoeffs++ = 0;
*pDstCoeffs++ = b0[1] * b0[2] * b0[3];
/* Vec 4 */
*pDstCoeffs++ = b0[0];
*pDstCoeffs++ = b0[0] * b0[1];
*pDstCoeffs++ = b0[0] * b0[1] * b0[2];
*pDstCoeffs++ = b0[0] * b0[1] * b0[2] * b0[3];
/* Vec 5 */
*pDstCoeffs++ = b1[0];
*pDstCoeffs++ = b1[1];
*pDstCoeffs++ = b1[2];
*pDstCoeffs++ = b1[3];
/* Vec 6 */
*pDstCoeffs++ = b2[0];
*pDstCoeffs++ = b2[1];
*pDstCoeffs++ = b2[2];
*pDstCoeffs++ = b2[3];
/* Vec 7 */
*pDstCoeffs++ = a1[0];
*pDstCoeffs++ = a1[1];
*pDstCoeffs++ = a1[2];
*pDstCoeffs++ = a1[3];
/* Vec 8 */
*pDstCoeffs++ = a2[0];
*pDstCoeffs++ = a2[1];
*pDstCoeffs++ = a2[2];
*pDstCoeffs++ = a2[3];
cnt--;
}
cnt = numStages & 0x3;
while(cnt > 0)
{
*pDstCoeffs++ = *pCoeffs++;
*pDstCoeffs++ = *pCoeffs++;
*pDstCoeffs++ = *pCoeffs++;
*pDstCoeffs++ = *pCoeffs++;
*pDstCoeffs++ = *pCoeffs++;
cnt--;
}
}
#endif
void arm_biquad_cascade_df2T_init_f32(
arm_biquad_cascade_df2T_instance_f32 * S,
uint8_t numStages,
const float32_t * pCoeffs,
float32_t * pState)
{
/* Assign filter stages */
S->numStages = numStages;
@ -85,5 +207,5 @@ void arm_biquad_cascade_df2T_init_f32(
}
/**
* @} end of BiquadCascadeDF2T group
@} end of BiquadCascadeDF2T group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_df2T_init_f64.c
* Description: Initialization function for floating-point transposed direct form II Biquad cascade filter
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,47 +29,44 @@
#include "arm_math.h"
/**
* @ingroup groupFilters
@ingroup groupFilters
*/
/**
* @addtogroup BiquadCascadeDF2T
* @{
@addtogroup BiquadCascadeDF2T
@{
*/
/**
* @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter.
* @param[in,out] *S points to an instance of the filter data structure.
* @param[in] numStages number of 2nd order stages in the filter.
* @param[in] *pCoeffs points to the filter coefficients.
* @param[in] *pState points to the state buffer.
* @return none
*
* <b>Coefficient and State Ordering:</b>
* \par
* The coefficients are stored in the array <code>pCoeffs</code> in the following order:
* <pre>
* {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
* </pre>
*
* \par
* where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
* <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
* and so on. The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
*
* \par
* The <code>pState</code> is a pointer to state array.
* Each Biquad stage has 2 state variables <code>d1,</code> and <code>d2</code>.
* The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
* The state array has a total length of <code>2*numStages</code> values.
* The state variables are updated after each block of data is processed; the coefficients are untouched.
@brief Initialization function for the floating-point transposed direct form II Biquad cascade filter.
@param[in,out] S points to an instance of the filter data structure
@param[in] numStages number of 2nd order stages in the filter
@param[in] pCoeffs points to the filter coefficients
@param[in] pState points to the state buffer
@return none
@par Coefficient and State Ordering
The coefficients are stored in the array <code>pCoeffs</code> in the following order:
<pre>
{b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
</pre>
@par
where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
<code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
and so on. The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
@par
The <code>pState</code> is a pointer to state array.
Each Biquad stage has 2 state variables <code>d1,</code> and <code>d2</code>.
The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
The state array has a total length of <code>2*numStages</code> values.
The state variables are updated after each block of data is processed; the coefficients are untouched.
*/
void arm_biquad_cascade_df2T_init_f64(
arm_biquad_cascade_df2T_instance_f64 * S,
uint8_t numStages,
float64_t * pCoeffs,
float64_t * pState)
arm_biquad_cascade_df2T_instance_f64 * S,
uint8_t numStages,
float64_t * pCoeffs,
float64_t * pState)
{
/* Assign filter stages */
S->numStages = numStages;
@ -85,5 +82,5 @@ void arm_biquad_cascade_df2T_init_f64(
}
/**
* @} end of BiquadCascadeDF2T group
@} end of BiquadCascadeDF2T group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_stereo_df2T_f32.c
* Description: Processing function for floating-point transposed direct form II Biquad cascade filter. 2 channels
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,139 +29,39 @@
#include "arm_math.h"
/**
* @ingroup groupFilters
@ingroup groupFilters
*/
/**
* @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
*
* This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
* The filters are implemented as a cascade of second order Biquad sections.
* These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
* Only floating-point data is supported.
*
* This function operate on blocks of input and output data and each call to the function
* processes <code>blockSize</code> samples through the filter.
* <code>pSrc</code> points to the array of input data and
* <code>pDst</code> points to the array of output data.
* Both arrays contain <code>blockSize</code> values.
*
* \par Algorithm
* Each Biquad stage implements a second order filter using the difference equation:
* <pre>
* y[n] = b0 * x[n] + d1
* d1 = b1 * x[n] + a1 * y[n] + d2
* d2 = b2 * x[n] + a2 * y[n]
* </pre>
* where d1 and d2 represent the two state values.
*
* \par
* A Biquad filter using a transposed Direct Form II structure is shown below.
* \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"
* Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
* Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
* Pay careful attention to the sign of the feedback coefficients.
* Some design tools flip the sign of the feedback coefficients:
* <pre>
* y[n] = b0 * x[n] + d1;
* d1 = b1 * x[n] - a1 * y[n] + d2;
* d2 = b2 * x[n] - a2 * y[n];
* </pre>
* In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
*
* \par
* Higher order filters are realized as a cascade of second order sections.
* <code>numStages</code> refers to the number of second order stages used.
* For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
* A 9th order filter would be realized with <code>numStages=5</code> second order stages with the
* coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
*
* \par
* <code>pState</code> points to the state variable array.
* Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.
* The state variables are arranged in the <code>pState</code> array as:
* <pre>
* {d11, d12, d21, d22, ...}
* </pre>
* where <code>d1x</code> refers to the state variables for the first Biquad and
* <code>d2x</code> refers to the state variables for the second Biquad.
* The state array has a total length of <code>2*numStages</code> values.
* The state variables are updated after each block of data is processed; the coefficients are untouched.
*
* \par
* The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
* The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
* That is why the Direct Form I structure supports Q15 and Q31 data types.
* The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables <code>d1</code> and <code>d2</code>.
* Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
* The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
*
* \par Instance Structure
* The coefficients and state variables for a filter are stored together in an instance data structure.
* A separate instance structure must be defined for each filter.
* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
*
* \par Init Functions
* There is also an associated initialization function.
* The initialization function performs following operations:
* - Sets the values of the internal structure fields.
* - Zeros out the values in the state buffer.
* To do this manually without calling the init function, assign the follow subfields of the instance structure:
* numStages, pCoeffs, pState. Also set all of the values in pState to zero.
*
* \par
* Use of the initialization function is optional.
* However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
* To place an instance structure into a const data section, the instance structure must be manually initialized.
* Set the values in the state buffer to zeros before static initialization.
* For example, to statically initialize the instance structure use
* <pre>
* arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};
* </pre>
* where <code>numStages</code> is the number of Biquad stages in the filter; <code>pState</code> is the address of the state buffer.
* <code>pCoeffs</code> is the address of the coefficient buffer;
*
*/
@addtogroup BiquadCascadeDF2T
@{
*/
/**
* @addtogroup BiquadCascadeDF2T
* @{
*/
/**
* @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
* @param[in] *S points to an instance of the filter data structure.
* @param[in] *pSrc points to the block of input data.
* @param[out] *pDst points to the block of output data
* @param[in] blockSize number of samples to process.
* @return none.
*/
@brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
@param[in] S points to an instance of the filter data structure
@param[in] pSrc points to the block of input data
@param[out] pDst points to the block of output data
@param[in] blockSize number of samples to process
@return none
*/
LOW_OPTIMIZATION_ENTER
void arm_biquad_cascade_stereo_df2T_f32(
const arm_biquad_cascade_stereo_df2T_instance_f32 * S,
float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
const arm_biquad_cascade_stereo_df2T_instance_f32 * S,
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
float32_t *pIn = pSrc; /* source pointer */
float32_t *pOut = pDst; /* destination pointer */
float32_t *pState = S->pState; /* State pointer */
float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */
float32_t acc1a, acc1b; /* accumulator */
float32_t b0, b1, b2, a1, a2; /* Filter coefficients */
float32_t Xn1a, Xn1b; /* temporary input */
float32_t d1a, d2a, d1b, d2b; /* state variables */
uint32_t sample, stage = S->numStages; /* loop counters */
#if defined(ARM_MATH_CM7)
float32_t Xn2a, Xn3a, Xn4a, Xn5a, Xn6a, Xn7a, Xn8a; /* Input State variables */
float32_t Xn2b, Xn3b, Xn4b, Xn5b, Xn6b, Xn7b, Xn8b; /* Input State variables */
float32_t acc2a, acc3a, acc4a, acc5a, acc6a, acc7a, acc8a; /* Simulates the accumulator */
float32_t acc2b, acc3b, acc4b, acc5b, acc6b, acc7b, acc8b; /* Simulates the accumulator */
const float32_t *pIn = pSrc; /* Source pointer */
float32_t *pOut = pDst; /* Destination pointer */
float32_t *pState = S->pState; /* State pointer */
const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float32_t acc1a, acc1b; /* Accumulator */
float32_t b0, b1, b2, a1, a2; /* Filter coefficients */
float32_t Xn1a, Xn1b; /* Temporary input */
float32_t d1a, d2a, d1b, d2b; /* State variables */
uint32_t sample, stage = S->numStages; /* Loop counters */
do
{
@ -170,11 +70,9 @@ uint32_t blockSize)
b1 = pCoeffs[1];
b2 = pCoeffs[2];
a1 = pCoeffs[3];
/* Apply loop unrolling and compute 8 output values simultaneously. */
sample = blockSize >> 3U;
a2 = pCoeffs[4];
/*Reading the state values */
/* Reading the state values */
d1a = pState[0];
d2a = pState[1];
d1b = pState[2];
@ -182,215 +80,182 @@ uint32_t blockSize)
pCoeffs += 5U;
/* First part of the processing with loop unrolling. Compute 8 outputs at a time.
** a second loop below computes the remaining 1 to 7 samples. */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 8 outputs at a time */
sample = blockSize >> 3U;
while (sample > 0U) {
/* y[n] = b0 * x[n] + d1 */
/* d1 = b1 * x[n] + a1 * y[n] + d2 */
/* d2 = b2 * x[n] + a2 * y[n] */
/* y[n] = b0 * x[n] + d1 */
/* d1 = b1 * x[n] + a1 * y[n] + d2 */
/* d2 = b2 * x[n] + a2 * y[n] */
/* 1 */
Xn1a = *pIn++; /* Channel a */
Xn1b = *pIn++; /* Channel b */
/* Read the first 2 inputs. 2 cycles */
Xn1a = pIn[0 ];
Xn1b = pIn[1 ];
acc1a = (b0 * Xn1a) + d1a;
acc1b = (b0 * Xn1b) + d1b;
/* Sample 1. 5 cycles */
Xn2a = pIn[2 ];
acc1a = b0 * Xn1a + d1a;
*pOut++ = acc1a;
*pOut++ = acc1b;
Xn2b = pIn[3 ];
d1a = b1 * Xn1a + d2a;
d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
Xn3a = pIn[4 ];
d2a = b2 * Xn1a;
d2a = (b2 * Xn1a) + (a2 * acc1a);
d2b = (b2 * Xn1b) + (a2 * acc1b);
Xn3b = pIn[5 ];
d1a += a1 * acc1a;
/* 2 */
Xn1a = *pIn++; /* Channel a */
Xn1b = *pIn++; /* Channel b */
Xn4a = pIn[6 ];
d2a += a2 * acc1a;
acc1a = (b0 * Xn1a) + d1a;
acc1b = (b0 * Xn1b) + d1b;
/* Sample 2. 5 cycles */
Xn4b = pIn[7 ];
acc1b = b0 * Xn1b + d1b;
*pOut++ = acc1a;
*pOut++ = acc1b;
Xn5a = pIn[8 ];
d1b = b1 * Xn1b + d2b;
d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
Xn5b = pIn[9 ];
d2b = b2 * Xn1b;
d2a = (b2 * Xn1a) + (a2 * acc1a);
d2b = (b2 * Xn1b) + (a2 * acc1b);
Xn6a = pIn[10];
d1b += a1 * acc1b;
/* 3 */
Xn1a = *pIn++; /* Channel a */
Xn1b = *pIn++; /* Channel b */
Xn6b = pIn[11];
d2b += a2 * acc1b;
acc1a = (b0 * Xn1a) + d1a;
acc1b = (b0 * Xn1b) + d1b;
/* Sample 3. 5 cycles */
Xn7a = pIn[12];
acc2a = b0 * Xn2a + d1a;
*pOut++ = acc1a;
*pOut++ = acc1b;
Xn7b = pIn[13];
d1a = b1 * Xn2a + d2a;
d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
Xn8a = pIn[14];
d2a = b2 * Xn2a;
d2a = (b2 * Xn1a) + (a2 * acc1a);
d2b = (b2 * Xn1b) + (a2 * acc1b);
Xn8b = pIn[15];
d1a += a1 * acc2a;
/* 4 */
Xn1a = *pIn++; /* Channel a */
Xn1b = *pIn++; /* Channel b */
pIn += 16;
d2a += a2 * acc2a;
acc1a = (b0 * Xn1a) + d1a;
acc1b = (b0 * Xn1b) + d1b;
/* Sample 4. 5 cycles */
acc2b = b0 * Xn2b + d1b;
d1b = b1 * Xn2b + d2b;
d2b = b2 * Xn2b;
d1b += a1 * acc2b;
d2b += a2 * acc2b;
*pOut++ = acc1a;
*pOut++ = acc1b;
/* Sample 5. 5 cycles */
acc3a = b0 * Xn3a + d1a;
d1a = b1 * Xn3a + d2a;
d2a = b2 * Xn3a;
d1a += a1 * acc3a;
d2a += a2 * acc3a;
d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
/* Sample 6. 5 cycles */
acc3b = b0 * Xn3b + d1b;
d1b = b1 * Xn3b + d2b;
d2b = b2 * Xn3b;
d1b += a1 * acc3b;
d2b += a2 * acc3b;
d2a = (b2 * Xn1a) + (a2 * acc1a);
d2b = (b2 * Xn1b) + (a2 * acc1b);
/* Sample 7. 5 cycles */
acc4a = b0 * Xn4a + d1a;
d1a = b1 * Xn4a + d2a;
d2a = b2 * Xn4a;
d1a += a1 * acc4a;
d2a += a2 * acc4a;
/* 5 */
Xn1a = *pIn++; /* Channel a */
Xn1b = *pIn++; /* Channel b */
/* Sample 8. 5 cycles */
acc4b = b0 * Xn4b + d1b;
d1b = b1 * Xn4b + d2b;
d2b = b2 * Xn4b;
d1b += a1 * acc4b;
d2b += a2 * acc4b;
acc1a = (b0 * Xn1a) + d1a;
acc1b = (b0 * Xn1b) + d1b;
/* Sample 9. 5 cycles */
acc5a = b0 * Xn5a + d1a;
d1a = b1 * Xn5a + d2a;
d2a = b2 * Xn5a;
d1a += a1 * acc5a;
d2a += a2 * acc5a;
*pOut++ = acc1a;
*pOut++ = acc1b;
/* Sample 10. 5 cycles */
acc5b = b0 * Xn5b + d1b;
d1b = b1 * Xn5b + d2b;
d2b = b2 * Xn5b;
d1b += a1 * acc5b;
d2b += a2 * acc5b;
d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
/* Sample 11. 5 cycles */
acc6a = b0 * Xn6a + d1a;
d1a = b1 * Xn6a + d2a;
d2a = b2 * Xn6a;
d1a += a1 * acc6a;
d2a += a2 * acc6a;
d2a = (b2 * Xn1a) + (a2 * acc1a);
d2b = (b2 * Xn1b) + (a2 * acc1b);
/* Sample 12. 5 cycles */
acc6b = b0 * Xn6b + d1b;
d1b = b1 * Xn6b + d2b;
d2b = b2 * Xn6b;
d1b += a1 * acc6b;
d2b += a2 * acc6b;
/* 6 */
Xn1a = *pIn++; /* Channel a */
Xn1b = *pIn++; /* Channel b */
/* Sample 13. 5 cycles */
acc7a = b0 * Xn7a + d1a;
d1a = b1 * Xn7a + d2a;
acc1a = (b0 * Xn1a) + d1a;
acc1b = (b0 * Xn1b) + d1b;
pOut[0 ] = acc1a ;
d2a = b2 * Xn7a;
*pOut++ = acc1a;
*pOut++ = acc1b;
pOut[1 ] = acc1b ;
d1a += a1 * acc7a;
d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
pOut[2 ] = acc2a ;
d2a += a2 * acc7a;
d2a = (b2 * Xn1a) + (a2 * acc1a);
d2b = (b2 * Xn1b) + (a2 * acc1b);
/* Sample 14. 5 cycles */
pOut[3 ] = acc2b ;
acc7b = b0 * Xn7b + d1b;
/* 7 */
Xn1a = *pIn++; /* Channel a */
Xn1b = *pIn++; /* Channel b */
pOut[4 ] = acc3a ;
d1b = b1 * Xn7b + d2b;
acc1a = (b0 * Xn1a) + d1a;
acc1b = (b0 * Xn1b) + d1b;
pOut[5 ] = acc3b ;
d2b = b2 * Xn7b;
*pOut++ = acc1a;
*pOut++ = acc1b;
pOut[6 ] = acc4a ;
d1b += a1 * acc7b;
d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
pOut[7 ] = acc4b ;
d2b += a2 * acc7b;
d2a = (b2 * Xn1a) + (a2 * acc1a);
d2b = (b2 * Xn1b) + (a2 * acc1b);
/* Sample 15. 5 cycles */
pOut[8 ] = acc5a ;
acc8a = b0 * Xn8a + d1a;
/* 8 */
Xn1a = *pIn++; /* Channel a */
Xn1b = *pIn++; /* Channel b */
pOut[9 ] = acc5b;
d1a = b1 * Xn8a + d2a;
acc1a = (b0 * Xn1a) + d1a;
acc1b = (b0 * Xn1b) + d1b;
pOut[10] = acc6a;
d2a = b2 * Xn8a;
*pOut++ = acc1a;
*pOut++ = acc1b;
pOut[11] = acc6b;
d1a += a1 * acc8a;
d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
pOut[12] = acc7a;
d2a += a2 * acc8a;
d2a = (b2 * Xn1a) + (a2 * acc1a);
d2b = (b2 * Xn1b) + (a2 * acc1b);
/* Sample 16. 5 cycles */
pOut[13] = acc7b;
acc8b = b0 * Xn8b + d1b;
pOut[14] = acc8a;
d1b = b1 * Xn8b + d2b;
pOut[15] = acc8b;
d2b = b2 * Xn8b;
sample--;
d1b += a1 * acc8b;
pOut += 16;
d2b += a2 * acc8b;
/* decrement loop counter */
sample--;
}
/* Loop unrolling: Compute remaining outputs */
sample = blockSize & 0x7U;
#else
/* Initialize blkCnt with number of samples */
sample = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (sample > 0U) {
/* Read the input */
Xn1a = *pIn++; //Channel a
Xn1b = *pIn++; //Channel b
/* Read the input */
Xn1a = *pIn++; /* Channel a */
Xn1b = *pIn++; /* Channel b */
/* y[n] = b0 * x[n] + d1 */
acc1a = (b0 * Xn1a) + d1a;
acc1b = (b0 * Xn1b) + d1b;
/* y[n] = b0 * x[n] + d1 */
acc1a = (b0 * Xn1a) + d1a;
acc1b = (b0 * Xn1b) + d1b;
/* Store the result in the accumulator in the destination buffer. */
*pOut++ = acc1a;
*pOut++ = acc1b;
/* Store the result in the accumulator in the destination buffer. */
*pOut++ = acc1a;
*pOut++ = acc1b;
/* Every time after the output is computed state should be updated. */
/* d1 = b1 * x[n] + a1 * y[n] + d2 */
d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
/* Every time after the output is computed state should be updated. */
/* d1 = b1 * x[n] + a1 * y[n] + d2 */
d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
/* d2 = b2 * x[n] + a2 * y[n] */
d2a = (b2 * Xn1a) + (a2 * acc1a);
d2b = (b2 * Xn1b) + (a2 * acc1b);
/* d2 = b2 * x[n] + a2 * y[n] */
d2a = (b2 * Xn1a) + (a2 * acc1a);
d2b = (b2 * Xn1b) + (a2 * acc1b);
sample--;
/* decrement loop counter */
sample--;
}
/* Store the updated state variables back into the state array */
@ -400,271 +265,21 @@ uint32_t blockSize)
pState[2] = d1b;
pState[3] = d2b;
/* The current stage input is given as the output to the next stage */
pIn = pDst;
/* decrement the loop counter */
stage--;
pState += 4U;
/*Reset the output working pointer */
pOut = pDst;
} while (stage > 0U);
#elif defined(ARM_MATH_CM0_FAMILY)
/* Run the below code for Cortex-M0 */
do
{
/* Reading the coefficients */
b0 = *pCoeffs++;
b1 = *pCoeffs++;
b2 = *pCoeffs++;
a1 = *pCoeffs++;
a2 = *pCoeffs++;
/*Reading the state values */
d1a = pState[0];
d2a = pState[1];
d1b = pState[2];
d2b = pState[3];
sample = blockSize;
while (sample > 0U)
{
/* Read the input */
Xn1a = *pIn++; //Channel a
Xn1b = *pIn++; //Channel b
/* y[n] = b0 * x[n] + d1 */
acc1a = (b0 * Xn1a) + d1a;
acc1b = (b0 * Xn1b) + d1b;
/* Store the result in the accumulator in the destination buffer. */
*pOut++ = acc1a;
*pOut++ = acc1b;
/* Every time after the output is computed state should be updated. */
/* d1 = b1 * x[n] + a1 * y[n] + d2 */
d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
/* d2 = b2 * x[n] + a2 * y[n] */
d2a = (b2 * Xn1a) + (a2 * acc1a);
d2b = (b2 * Xn1b) + (a2 * acc1b);
/* decrement the loop counter */
sample--;
}
/* Store the updated state variables back into the state array */
*pState++ = d1a;
*pState++ = d2a;
*pState++ = d1b;
*pState++ = d2b;
/* The current stage input is given as the output to the next stage */
pIn = pDst;
/*Reset the output working pointer */
/* Reset the output working pointer */
pOut = pDst;
/* decrement the loop counter */
/* Decrement the loop counter */
stage--;
} while (stage > 0U);
#else
float32_t Xn2a, Xn3a, Xn4a; /* Input State variables */
float32_t Xn2b, Xn3b, Xn4b; /* Input State variables */
float32_t acc2a, acc3a, acc4a; /* accumulator */
float32_t acc2b, acc3b, acc4b; /* accumulator */
float32_t p0a, p1a, p2a, p3a, p4a, A1a;
float32_t p0b, p1b, p2b, p3b, p4b, A1b;
/* Run the below code for Cortex-M4 and Cortex-M3 */
do
{
/* Reading the coefficients */
b0 = *pCoeffs++;
b1 = *pCoeffs++;
b2 = *pCoeffs++;
a1 = *pCoeffs++;
a2 = *pCoeffs++;
/*Reading the state values */
d1a = pState[0];
d2a = pState[1];
d1b = pState[2];
d2b = pState[3];
/* Apply loop unrolling and compute 4 output values simultaneously. */
sample = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (sample > 0U) {
/* y[n] = b0 * x[n] + d1 */
/* d1 = b1 * x[n] + a1 * y[n] + d2 */
/* d2 = b2 * x[n] + a2 * y[n] */
/* Read the four inputs */
Xn1a = pIn[0];
Xn1b = pIn[1];
Xn2a = pIn[2];
Xn2b = pIn[3];
Xn3a = pIn[4];
Xn3b = pIn[5];
Xn4a = pIn[6];
Xn4b = pIn[7];
pIn += 8;
p0a = b0 * Xn1a;
p0b = b0 * Xn1b;
p1a = b1 * Xn1a;
p1b = b1 * Xn1b;
acc1a = p0a + d1a;
acc1b = p0b + d1b;
p0a = b0 * Xn2a;
p0b = b0 * Xn2b;
p3a = a1 * acc1a;
p3b = a1 * acc1b;
p2a = b2 * Xn1a;
p2b = b2 * Xn1b;
A1a = p1a + p3a;
A1b = p1b + p3b;
p4a = a2 * acc1a;
p4b = a2 * acc1b;
d1a = A1a + d2a;
d1b = A1b + d2b;
d2a = p2a + p4a;
d2b = p2b + p4b;
p1a = b1 * Xn2a;
p1b = b1 * Xn2b;
acc2a = p0a + d1a;
acc2b = p0b + d1b;
p0a = b0 * Xn3a;
p0b = b0 * Xn3b;
p3a = a1 * acc2a;
p3b = a1 * acc2b;
p2a = b2 * Xn2a;
p2b = b2 * Xn2b;
A1a = p1a + p3a;
A1b = p1b + p3b;
p4a = a2 * acc2a;
p4b = a2 * acc2b;
d1a = A1a + d2a;
d1b = A1b + d2b;
d2a = p2a + p4a;
d2b = p2b + p4b;
p1a = b1 * Xn3a;
p1b = b1 * Xn3b;
acc3a = p0a + d1a;
acc3b = p0b + d1b;
p0a = b0 * Xn4a;
p0b = b0 * Xn4b;
p3a = a1 * acc3a;
p3b = a1 * acc3b;
p2a = b2 * Xn3a;
p2b = b2 * Xn3b;
A1a = p1a + p3a;
A1b = p1b + p3b;
p4a = a2 * acc3a;
p4b = a2 * acc3b;
d1a = A1a + d2a;
d1b = A1b + d2b;
d2a = p2a + p4a;
d2b = p2b + p4b;
acc4a = p0a + d1a;
acc4b = p0b + d1b;
p1a = b1 * Xn4a;
p1b = b1 * Xn4b;
p3a = a1 * acc4a;
p3b = a1 * acc4b;
p2a = b2 * Xn4a;
p2b = b2 * Xn4b;
A1a = p1a + p3a;
A1b = p1b + p3b;
p4a = a2 * acc4a;
p4b = a2 * acc4b;
d1a = A1a + d2a;
d1b = A1b + d2b;
d2a = p2a + p4a;
d2b = p2b + p4b;
pOut[0] = acc1a;
pOut[1] = acc1b;
pOut[2] = acc2a;
pOut[3] = acc2b;
pOut[4] = acc3a;
pOut[5] = acc3b;
pOut[6] = acc4a;
pOut[7] = acc4b;
pOut += 8;
sample--;
}
sample = blockSize & 0x3U;
while (sample > 0U) {
Xn1a = *pIn++;
Xn1b = *pIn++;
p0a = b0 * Xn1a;
p0b = b0 * Xn1b;
p1a = b1 * Xn1a;
p1b = b1 * Xn1b;
acc1a = p0a + d1a;
acc1b = p0b + d1b;
p3a = a1 * acc1a;
p3b = a1 * acc1b;
p2a = b2 * Xn1a;
p2b = b2 * Xn1b;
A1a = p1a + p3a;
A1b = p1b + p3b;
p4a = a2 * acc1a;
p4b = a2 * acc1b;
d1a = A1a + d2a;
d1b = A1b + d2b;
d2a = p2a + p4a;
d2b = p2b + p4b;
*pOut++ = acc1a;
*pOut++ = acc1b;
sample--;
}
/* Store the updated state variables back into the state array */
*pState++ = d1a;
*pState++ = d2a;
*pState++ = d1b;
*pState++ = d2b;
/* The current stage input is given as the output to the next stage */
pIn = pDst;
/*Reset the output working pointer */
pOut = pDst;
/* decrement the loop counter */
stage--;
} while (stage > 0U);
#endif
}
LOW_OPTIMIZATION_EXIT
/**
* @} end of BiquadCascadeDF2T group
*/
@} end of BiquadCascadeDF2T group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_biquad_cascade_stereo_df2T_init_f32.c
* Description: Initialization function for floating-point transposed direct form II Biquad cascade filter
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,47 +29,44 @@
#include "arm_math.h"
/**
* @ingroup groupFilters
@ingroup groupFilters
*/
/**
* @addtogroup BiquadCascadeDF2T
* @{
@addtogroup BiquadCascadeDF2T
@{
*/
/**
* @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter.
* @param[in,out] *S points to an instance of the filter data structure.
* @param[in] numStages number of 2nd order stages in the filter.
* @param[in] *pCoeffs points to the filter coefficients.
* @param[in] *pState points to the state buffer.
* @return none
*
* <b>Coefficient and State Ordering:</b>
* \par
* The coefficients are stored in the array <code>pCoeffs</code> in the following order:
* <pre>
* {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
* </pre>
*
* \par
* where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
* <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
* and so on. The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
*
* \par
* The <code>pState</code> is a pointer to state array.
* Each Biquad stage has 2 state variables <code>d1,</code> and <code>d2</code> for each channel.
* The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
* The state array has a total length of <code>2*numStages</code> values.
* The state variables are updated after each block of data is processed; the coefficients are untouched.
@brief Initialization function for the floating-point transposed direct form II Biquad cascade filter.
@param[in,out] S points to an instance of the filter data structure.
@param[in] numStages number of 2nd order stages in the filter.
@param[in] pCoeffs points to the filter coefficients.
@param[in] pState points to the state buffer.
@return none
@par Coefficient and State Ordering
The coefficients are stored in the array <code>pCoeffs</code> in the following order:
<pre>
{b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
</pre>
@par
where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
<code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
and so on. The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
@par
The <code>pState</code> is a pointer to state array.
Each Biquad stage has 2 state variables <code>d1,</code> and <code>d2</code> for each channel.
The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
The state array has a total length of <code>2*numStages</code> values.
The state variables are updated after each block of data is processed; the coefficients are untouched.
*/
void arm_biquad_cascade_stereo_df2T_init_f32(
arm_biquad_cascade_stereo_df2T_instance_f32 * S,
uint8_t numStages,
float32_t * pCoeffs,
float32_t * pState)
arm_biquad_cascade_stereo_df2T_instance_f32 * S,
uint8_t numStages,
const float32_t * pCoeffs,
float32_t * pState)
{
/* Assign filter stages */
S->numStages = numStages;
@ -85,5 +82,5 @@ void arm_biquad_cascade_stereo_df2T_init_f32(
}
/**
* @} end of BiquadCascadeDF2T group
@} end of BiquadCascadeDF2T group
*/

Some files were not shown because too many files have changed in this diff Show more