Official ARM version: v5.6.0

This commit is contained in:
rihab kouki 2020-07-28 11:24:49 +01:00
parent 9f95ff5b6b
commit 96d6da4e25
2939 changed files with 339304 additions and 113320 deletions

View file

@ -0,0 +1,16 @@
# Build script for the CMSISDSPStatistics static library.
cmake_minimum_required (VERSION 3.6)
project(CMSISDSPStatistics)
# Collect only sources whose file name contains an underscore
# (arm_max_f32.c, arm_mean_q15.c, ...). The pattern does not match
# StatisticsFunctions.c, which #includes the individual .c files and would
# presumably duplicate their definitions if it were compiled alongside them.
file(GLOB SRC "./*_*.c")
add_library(CMSISDSPStatistics STATIC ${SRC})
# configdsp is not defined in this file; presumably a project-wide helper that
# applies the common DSP build settings to the target (second argument looks
# like a path to the parent directory) -- confirm against its definition.
configdsp(CMSISDSPStatistics ..)
### Includes
# DSP is not set in this file; it is expected to be provided by the including
# project. The include path is PUBLIC, so it propagates to consumers of the
# library.
target_include_directories(CMSISDSPStatistics PUBLIC "${DSP}/../../Include")

View file

@ -0,0 +1,53 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: StatisticsFunctions.c
* Description: Combination of all statistics function source files.
*
* $Date: 18. March 2019
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_max_f32.c"
#include "arm_max_q15.c"
#include "arm_max_q31.c"
#include "arm_max_q7.c"
#include "arm_mean_f32.c"
#include "arm_mean_q15.c"
#include "arm_mean_q31.c"
#include "arm_mean_q7.c"
#include "arm_min_f32.c"
#include "arm_min_q15.c"
#include "arm_min_q31.c"
#include "arm_min_q7.c"
#include "arm_power_f32.c"
#include "arm_power_q15.c"
#include "arm_power_q31.c"
#include "arm_power_q7.c"
#include "arm_rms_f32.c"
#include "arm_rms_q15.c"
#include "arm_rms_q31.c"
#include "arm_std_f32.c"
#include "arm_std_q15.c"
#include "arm_std_q31.c"
#include "arm_var_f32.c"
#include "arm_var_q15.c"
#include "arm_var_q31.c"

View file

@ -3,13 +3,13 @@
* Title: arm_max_f32.c
* Description: Maximum value of a floating-point vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -27,136 +27,237 @@
*/
#include "arm_math.h"
#if defined(ARM_MATH_NEON)
#include <limits.h>
#endif
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @defgroup Max Maximum
*
* Computes the maximum value of an array of data.
* The function returns both the maximum value and its position within the array.
* There are separate functions for floating-point, Q31, Q15, and Q7 data types.
@defgroup Max Maximum
Computes the maximum value of an array of data.
The function returns both the maximum value and its position within the array.
There are separate functions for floating-point, Q31, Q15, and Q7 data types.
*/
/**
* @addtogroup Max
* @{
@addtogroup Max
@{
*/
/**
* @brief Maximum value of a floating-point vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult maximum value returned here
* @param[out] *pIndex index of maximum value returned here
* @return none.
@brief Maximum value of a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult maximum value returned here
@param[out] pIndex index of maximum value returned here
@return none
*/
#if defined(ARM_MATH_NEON)
void arm_max_f32(
float32_t * pSrc,
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex)
{
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex, count; /* loop counter */
float32x4_t outV, srcV;
float32x2_t outV2;
uint32x4_t idxV;
uint32x4_t maxIdx={ULONG_MAX,ULONG_MAX,ULONG_MAX,ULONG_MAX};
uint32x4_t index={4,5,6,7};
uint32x4_t delta={4,4,4,4};
uint32x4_t countV={0,1,2,3};
uint32x2_t countV2;
/* Initialise the count value. */
count = 0U;
/* Initialise the index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparison */
if (blockSize <= 3)
{
out = *pSrc++;
blkCnt = blockSize - 1;
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
/* compare for the maximum value */
if (out < maxVal1)
{
/* Update the maximum value and it's index */
out = maxVal1;
outIndex = blockSize - blkCnt;
}
/* Decrement the loop counter */
blkCnt--;
}
}
else
{
outV = vld1q_f32(pSrc);
pSrc += 4;
/* Compute 4 outputs at a time */
blkCnt = (blockSize - 4 ) >> 2U;
while (blkCnt > 0U)
{
srcV = vld1q_f32(pSrc);
pSrc += 4;
idxV = vcgtq_f32(srcV, outV);
outV = vbslq_f32(idxV, srcV, outV );
countV = vbslq_u32(idxV, index,countV );
index = vaddq_u32(index,delta);
/* Decrement the loop counter */
blkCnt--;
}
outV2 = vpmax_f32(vget_low_f32(outV),vget_high_f32(outV));
outV2 = vpmax_f32(outV2,outV2);
out = outV2[0];
idxV = vceqq_f32(outV, vdupq_n_f32(out));
countV = vbslq_u32(idxV, countV,maxIdx);
countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV));
countV2 = vpmin_u32(countV2,countV2);
outIndex = countV2[0];
/* Process the leftover samples when (blockSize - 4) is not a multiple of 4 */
blkCnt = (blockSize - 4 ) % 4U;
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
/* compare for the maximum value */
if (out < maxVal1)
{
/* Update the maximum value and it's index */
out = maxVal1;
outIndex = blockSize - blkCnt ;
}
/* Decrement the loop counter */
blkCnt--;
}
}
/* Store the maximum value and it's index into destination pointers */
*pResult = out;
*pIndex = outIndex;
}
#else
void arm_max_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex)
{
float32_t maxVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
uint32_t index; /* index of maximum value */
#endif
/* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that acts as reference value for comparison */
out = *pSrc++;
/* Loop unrolling */
#if defined (ARM_MATH_LOOPUNROLL)
/* Initialise index of maximum value. */
index = 0U;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
maxVal2 = *pSrc++;
/* Initialize maxVal to next consecutive values one by one */
maxVal = *pSrc++;
/* compare for the maximum value */
if (out < maxVal1)
if (out < maxVal)
{
/* Update the maximum value and its index */
out = maxVal1;
outIndex = count + 1U;
/* Update the maximum value and it's index */
out = maxVal;
outIndex = index + 1U;
}
/* compare for the maximum value */
if (out < maxVal2)
maxVal = *pSrc++;
if (out < maxVal)
{
/* Update the maximum value and its index */
out = maxVal2;
outIndex = count + 2U;
out = maxVal;
outIndex = index + 2U;
}
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
maxVal2 = *pSrc++;
/* compare for the maximum value */
if (out < maxVal1)
maxVal = *pSrc++;
if (out < maxVal)
{
/* Update the maximum value and its index */
out = maxVal1;
outIndex = count + 3U;
out = maxVal;
outIndex = index + 3U;
}
/* compare for the maximum value */
if (out < maxVal2)
maxVal = *pSrc++;
if (out < maxVal)
{
/* Update the maximum value and its index */
out = maxVal2;
outIndex = count + 4U;
out = maxVal;
outIndex = index + 4U;
}
count += 4U;
index += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* if (blockSize - 1U) is not multiple of 4 */
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
/* Run the below code for Cortex-M0 */
float32_t maxVal1, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* loop counter */
/* Initialise the index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
maxVal = *pSrc++;
/* compare for the maximum value */
if (out < maxVal1)
if (out < maxVal)
{
/* Update the maximum value and it's index */
out = maxVal1;
out = maxVal;
outIndex = blockSize - blkCnt;
}
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
@ -164,7 +265,7 @@ void arm_max_f32(
*pResult = out;
*pIndex = outIndex;
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
* @} end of Max group
@} end of Max group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_max_q15.c
* Description: Maximum value of a Q15 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,126 +29,112 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @addtogroup Max
* @{
@addtogroup Max
@{
*/
/**
* @brief Maximum value of a Q15 vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult maximum value returned here
* @param[out] *pIndex index of maximum value returned here
* @return none.
@brief Maximum value of a Q15 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult maximum value returned here
@param[out] pIndex index of maximum value returned here
@return none
*/
void arm_max_q15(
q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult,
uint32_t * pIndex)
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult,
uint32_t * pIndex)
{
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q15_t maxVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* Loop counter */
q15_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex, count; /* loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
uint32_t index; /* index of maximum value */
#endif
/* Initialise the count value. */
count = 0U;
/* Initialise the index value to zero. */
/* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that acts as reference value for comparison */
out = *pSrc++;
/* Loop unrolling */
#if defined (ARM_MATH_LOOPUNROLL)
/* Initialise index of maximum value. */
index = 0U;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
maxVal2 = *pSrc++;
/* Initialize maxVal to next consecutive values one by one */
maxVal = *pSrc++;
/* compare for the maximum value */
if (out < maxVal1)
if (out < maxVal)
{
/* Update the maximum value and its index */
out = maxVal1;
outIndex = count + 1U;
/* Update the maximum value and it's index */
out = maxVal;
outIndex = index + 1U;
}
/* compare for the maximum value */
if (out < maxVal2)
maxVal = *pSrc++;
if (out < maxVal)
{
/* Update the maximum value and its index */
out = maxVal2;
outIndex = count + 2U;
out = maxVal;
outIndex = index + 2U;
}
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
maxVal2 = *pSrc++;
/* compare for the maximum value */
if (out < maxVal1)
maxVal = *pSrc++;
if (out < maxVal)
{
/* Update the maximum value and its index */
out = maxVal1;
outIndex = count + 3U;
out = maxVal;
outIndex = index + 3U;
}
/* compare for the maximum value */
if (out < maxVal2)
maxVal = *pSrc++;
if (out < maxVal)
{
/* Update the maximum value and its index */
out = maxVal2;
outIndex = count + 4U;
out = maxVal;
outIndex = index + 4U;
}
count += 4U;
index += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* if (blockSize - 1U) is not multiple of 4 */
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
/* Run the below code for Cortex-M0 */
q15_t maxVal1, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* loop counter */
/* Initialise the index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
maxVal = *pSrc++;
/* compare for the maximum value */
if (out < maxVal1)
if (out < maxVal)
{
/* Update the maximum value and it's index */
out = maxVal1;
out = maxVal;
outIndex = blockSize - blkCnt;
}
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
@ -158,5 +144,5 @@ void arm_max_q15(
}
/**
* @} end of Max group
@} end of Max group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_max_q31.c
* Description: Maximum value of a Q31 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,126 +29,112 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @addtogroup Max
* @{
@addtogroup Max
@{
*/
/**
* @brief Maximum value of a Q31 vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult maximum value returned here
* @param[out] *pIndex index of maximum value returned here
* @return none.
@brief Maximum value of a Q31 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult maximum value returned here
@param[out] pIndex index of maximum value returned here
@return none
*/
void arm_max_q31(
q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult,
uint32_t * pIndex)
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult,
uint32_t * pIndex)
{
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t maxVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* Loop counter */
q31_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex, count; /* loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
uint32_t index; /* index of maximum value */
#endif
/* Initialise the count value. */
count = 0U;
/* Initialise the index value to zero. */
/* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that acts as reference value for comparison */
out = *pSrc++;
/* Loop unrolling */
#if defined (ARM_MATH_LOOPUNROLL)
/* Initialise index of maximum value. */
index = 0U;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
maxVal2 = *pSrc++;
/* Initialize maxVal to next consecutive values one by one */
maxVal = *pSrc++;
/* compare for the maximum value */
if (out < maxVal1)
if (out < maxVal)
{
/* Update the maximum value and its index */
out = maxVal1;
outIndex = count + 1U;
/* Update the maximum value and it's index */
out = maxVal;
outIndex = index + 1U;
}
/* compare for the maximum value */
if (out < maxVal2)
maxVal = *pSrc++;
if (out < maxVal)
{
/* Update the maximum value and its index */
out = maxVal2;
outIndex = count + 2U;
out = maxVal;
outIndex = index + 2U;
}
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
maxVal2 = *pSrc++;
/* compare for the maximum value */
if (out < maxVal1)
maxVal = *pSrc++;
if (out < maxVal)
{
/* Update the maximum value and its index */
out = maxVal1;
outIndex = count + 3U;
out = maxVal;
outIndex = index + 3U;
}
/* compare for the maximum value */
if (out < maxVal2)
maxVal = *pSrc++;
if (out < maxVal)
{
/* Update the maximum value and its index */
out = maxVal2;
outIndex = count + 4U;
out = maxVal;
outIndex = index + 4U;
}
count += 4U;
index += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* if (blockSize - 1U) is not multiple of 4 */
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
/* Run the below code for Cortex-M0 */
q31_t maxVal1, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* loop counter */
/* Initialise the index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
maxVal = *pSrc++;
/* compare for the maximum value */
if (out < maxVal1)
if (out < maxVal)
{
/* Update the maximum value and it's index */
out = maxVal1;
out = maxVal;
outIndex = blockSize - blkCnt;
}
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
@ -158,5 +144,5 @@ void arm_max_q31(
}
/**
* @} end of Max group
@} end of Max group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_max_q7.c
* Description: Maximum value of a Q7 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,126 +29,112 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @addtogroup Max
* @{
@addtogroup Max
@{
*/
/**
* @brief Maximum value of a Q7 vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult maximum value returned here
* @param[out] *pIndex index of maximum value returned here
* @return none.
@brief Maximum value of a Q7 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult maximum value returned here
@param[out] pIndex index of maximum value returned here
@return none
*/
void arm_max_q7(
q7_t * pSrc,
uint32_t blockSize,
q7_t * pResult,
uint32_t * pIndex)
const q7_t * pSrc,
uint32_t blockSize,
q7_t * pResult,
uint32_t * pIndex)
{
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q7_t maxVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* Loop counter */
q7_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex, count; /* loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
uint32_t index; /* index of maximum value */
#endif
/* Initialise the count value. */
count = 0U;
/* Initialise the index value to zero. */
/* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that acts as reference value for comparison */
out = *pSrc++;
/* Loop unrolling */
#if defined (ARM_MATH_LOOPUNROLL)
/* Initialise index of maximum value. */
index = 0U;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
maxVal2 = *pSrc++;
/* Initialize maxVal to next consecutive values one by one */
maxVal = *pSrc++;
/* compare for the maximum value */
if (out < maxVal1)
if (out < maxVal)
{
/* Update the maximum value and its index */
out = maxVal1;
outIndex = count + 1U;
/* Update the maximum value and it's index */
out = maxVal;
outIndex = index + 1U;
}
/* compare for the maximum value */
if (out < maxVal2)
maxVal = *pSrc++;
if (out < maxVal)
{
/* Update the maximum value and its index */
out = maxVal2;
outIndex = count + 2U;
out = maxVal;
outIndex = index + 2U;
}
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
maxVal2 = *pSrc++;
/* compare for the maximum value */
if (out < maxVal1)
maxVal = *pSrc++;
if (out < maxVal)
{
/* Update the maximum value and its index */
out = maxVal1;
outIndex = count + 3U;
out = maxVal;
outIndex = index + 3U;
}
/* compare for the maximum value */
if (out < maxVal2)
maxVal = *pSrc++;
if (out < maxVal)
{
/* Update the maximum value and its index */
out = maxVal2;
outIndex = count + 4U;
out = maxVal;
outIndex = index + 4U;
}
count += 4U;
index += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* if (blockSize - 1U) is not multiple of 4 */
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
/* Run the below code for Cortex-M0 */
q7_t maxVal1, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* loop counter */
/* Initialise the index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
maxVal = *pSrc++;
/* compare for the maximum value */
if (out < maxVal1)
if (out < maxVal)
{
/* Update the maximum value and it's index */
out = maxVal1;
out = maxVal;
outIndex = blockSize - blkCnt;
}
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
@ -158,5 +144,5 @@ void arm_max_q7(
}
/**
* @} end of Max group
@} end of Max group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mean_f32.c
* Description: Mean value of a floating-point vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,82 +29,70 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @defgroup mean Mean
*
* Calculates the mean of the input vector. Mean is defined as the average of the elements in the vector.
* The underlying algorithm is used:
*
* <pre>
* Result = (pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]) / blockSize;
* </pre>
*
* There are separate functions for floating-point, Q31, Q15, and Q7 data types.
@defgroup mean Mean
Calculates the mean of the input vector. Mean is defined as the average of the elements in the vector.
The following algorithm is used:
<pre>
Result = (pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]) / blockSize;
</pre>
There are separate functions for floating-point, Q31, Q15, and Q7 data types.
*/
/**
* @addtogroup mean
* @{
@addtogroup mean
@{
*/
/**
* @brief Mean value of a floating-point vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult mean value returned here
* @return none.
@brief Mean value of a floating-point vector.
@param[in] pSrc points to the input vector.
@param[in] blockSize number of samples in input vector.
@param[out] pResult mean value returned here.
@return none
*/
#if defined(ARM_MATH_NEON_EXPERIMENTAL)
void arm_mean_f32(
float32_t * pSrc,
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
float32_t sum = 0.0f; /* Temporary result storage */
uint32_t blkCnt; /* loop counter */
float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
float32x2_t sumV2;
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* Loop counter */
float32_t in1, in2, in3, in4;
float32x4_t inV;
/*loop Unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
in1 = *pSrc++;
in2 = *pSrc++;
in3 = *pSrc++;
in4 = *pSrc++;
sum += in1;
sum += in2;
sum += in3;
sum += in4;
inV = vld1q_f32(pSrc);
sumV = vaddq_f32(sumV, inV);
pSrc += 4;
/* Decrement the loop counter */
blkCnt--;
}
sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
sum = sumV2[0] + sumV2[1];
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Loop over blockSize number of values */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
blkCnt = blockSize & 3;
while (blkCnt > 0U)
{
@ -119,7 +107,60 @@ void arm_mean_f32(
/* Store the result to the destination */
*pResult = sum / (float32_t) blockSize;
}
#else
/*
 * Generic (non-NEON-experimental) mean of a floating-point vector.
 *
 * pSrc      points to the input vector
 * blockSize number of samples in the input vector
 * pResult   receives sum(pSrc[0 .. blockSize-1]) / blockSize
 *
 * Samples are accumulated strictly in index order in both build
 * configurations. blockSize is not checked: the final division is
 * performed as-is, even when blockSize is 0.
 */
void arm_mean_f32(
  const float32_t * pSrc,
        uint32_t blockSize,
        float32_t * pResult)
{
  const float32_t *pIn = pSrc;   /* Read cursor over the input vector */
  float32_t acc = 0.0f;          /* Running sum of the samples */
  uint32_t remaining;            /* Samples left for the scalar loop */

#if defined (ARM_MATH_LOOPUNROLL)
  uint32_t groups;               /* Number of complete groups of four */

  /* Unrolled part: consume four samples per iteration */
  for (groups = blockSize >> 2U; groups != 0U; groups--)
  {
    acc += pIn[0];
    acc += pIn[1];
    acc += pIn[2];
    acc += pIn[3];
    pIn += 4;
  }

  /* 0 to 3 samples are left over for the scalar loop */
  remaining = blockSize & 0x3U;
#else
  /* No unrolling: the scalar loop handles every sample */
  remaining = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  for (; remaining != 0U; remaining--)
  {
    acc += *pIn++;
  }

  /* mean = (A[0] + A[1] + ... + A[blockSize-1]) / blockSize */
  *pResult = (acc / blockSize);
}
#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
/**
* @} end of mean group
@} end of mean group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mean_q15.c
* Description: Mean value of a Q15 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,59 +29,55 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @addtogroup mean
* @{
@addtogroup mean
@{
*/
/**
* @brief Mean value of a Q15 vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult mean value returned here
* @return none.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function is implemented using a 32-bit internal accumulator.
* The input is represented in 1.15 format and is accumulated in a 32-bit
* accumulator in 17.15 format.
* There is no risk of internal overflow with this approach, and the
* full precision of intermediate result is preserved.
* Finally, the accumulator is saturated and truncated to yield a result of 1.15 format.
*
@brief Mean value of a Q15 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult mean value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 32-bit internal accumulator.
The input is represented in 1.15 format and is accumulated in a 32-bit
accumulator in 17.15 format.
There is no risk of internal overflow with this approach, and the
full precision of intermediate result is preserved.
Finally, the accumulator is truncated to yield a result of 1.15 format.
*/
void arm_mean_q15(
q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult)
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult)
{
q31_t sum = 0; /* Temporary result storage */
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
q31_t sum = 0; /* Temporary result storage */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
#if defined (ARM_MATH_LOOPUNROLL)
q31_t in;
#endif
q31_t in;
#if defined (ARM_MATH_LOOPUNROLL)
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
in = *__SIMD32(pSrc)++;
in = read_q15x2_ia ((q15_t **) &pSrc);
sum += ((in << 16U) >> 16U);
sum += (in >> 16U);
in = *__SIMD32(pSrc)++;
in = read_q15x2_ia ((q15_t **) &pSrc);
sum += ((in << 16U) >> 16U);
sum += (in >> 16U);
@ -89,32 +85,30 @@ void arm_mean_q15(
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Loop over blockSize number of values */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
/* Store the result to the destination */
*pResult = (q15_t) (sum / (q31_t)blockSize);
/* Store result to destination */
*pResult = (q15_t) (sum / (int32_t) blockSize);
}
/**
* @} end of mean group
@} end of mean group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mean_q31.c
* Description: Mean value of a Q31 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,95 +29,82 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @addtogroup mean
* @{
@addtogroup mean
@{
*/
/**
* @brief Mean value of a Q31 vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult mean value returned here
* @return none.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*\par
* The function is implemented using a 64-bit internal accumulator.
* The input is represented in 1.31 format and is accumulated in a 64-bit
* accumulator in 33.31 format.
* There is no risk of internal overflow with this approach, and the
* full precision of intermediate result is preserved.
* Finally, the accumulator is truncated to yield a result of 1.31 format.
*
@brief Mean value of a Q31 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult mean value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 64-bit internal accumulator.
The input is represented in 1.31 format and is accumulated in a 64-bit
accumulator in 33.31 format.
There is no risk of internal overflow with this approach, and the
full precision of intermediate result is preserved.
Finally, the accumulator is truncated to yield a result of 1.31 format.
*/
void arm_mean_q31(
q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
{
q63_t sum = 0; /* Temporary result storage */
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Temporary result storage */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
#if defined (ARM_MATH_LOOPUNROLL)
q31_t in1, in2, in3, in4;
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
in1 = *pSrc++;
in2 = *pSrc++;
in3 = *pSrc++;
in4 = *pSrc++;
sum += in1;
sum += in2;
sum += in3;
sum += in4;
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Loop over blockSize number of values */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
sum += *pSrc++;
sum += *pSrc++;
sum += *pSrc++;
/* Decrement the loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
/* Store the result to the destination */
*pResult = (q31_t) (sum / (int32_t) blockSize);
/* Store result to destination */
*pResult = (q31_t) (sum / blockSize);
}
/**
* @} end of mean group
@} end of mean group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mean_q7.c
* Description: Mean value of a Q7 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,57 +29,51 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @addtogroup mean
* @{
@addtogroup mean
@{
*/
/**
* @brief Mean value of a Q7 vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult mean value returned here
* @return none.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function is implemented using a 32-bit internal accumulator.
* The input is represented in 1.7 format and is accumulated in a 32-bit
* accumulator in 25.7 format.
* There is no risk of internal overflow with this approach, and the
* full precision of intermediate result is preserved.
* Finally, the accumulator is truncated to yield a result of 1.7 format.
*
@brief Mean value of a Q7 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult mean value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 32-bit internal accumulator.
The input is represented in 1.7 format and is accumulated in a 32-bit
accumulator in 25.7 format.
There is no risk of internal overflow with this approach, and the
full precision of intermediate result is preserved.
Finally, the accumulator is truncated to yield a result of 1.7 format.
*/
void arm_mean_q7(
q7_t * pSrc,
uint32_t blockSize,
q7_t * pResult)
const q7_t * pSrc,
uint32_t blockSize,
q7_t * pResult)
{
q31_t sum = 0; /* Temporary result storage */
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
q31_t sum = 0; /* Temporary result storage */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
#if defined (ARM_MATH_LOOPUNROLL)
q31_t in;
#endif
q31_t in;
#if defined (ARM_MATH_LOOPUNROLL)
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
in = *__SIMD32(pSrc)++;
in = read_q7x4_ia ((q7_t **) &pSrc);
sum += ((in << 24U) >> 24U);
sum += ((in << 16U) >> 24U);
sum += ((in << 8U) >> 24U);
@ -89,32 +83,30 @@ void arm_mean_q7(
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Loop over blockSize number of values */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
/* Store the result to the destination */
/* Store result to destination */
*pResult = (q7_t) (sum / (int32_t) blockSize);
}
/**
* @} end of mean group
@} end of mean group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_min_f32.c
* Description: Minimum value of a floating-point vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -27,136 +27,233 @@
*/
#include "arm_math.h"
#include <limits.h>
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @defgroup Min Minimum
*
* Computes the minimum value of an array of data.
* The function returns both the minimum value and its position within the array.
* There are separate functions for floating-point, Q31, Q15, and Q7 data types.
@defgroup Min Minimum
Computes the minimum value of an array of data.
The function returns both the minimum value and its position within the array.
There are separate functions for floating-point, Q31, Q15, and Q7 data types.
*/
/**
* @addtogroup Min
* @{
@addtogroup Min
@{
*/
/**
* @brief Minimum value of a floating-point vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult minimum value returned here
* @param[out] *pIndex index of minimum value returned here
* @return none.
@brief Minimum value of a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult minimum value returned here
@param[out] pIndex index of minimum value returned here
@return none
*/
#if defined(ARM_MATH_NEON)
void arm_min_f32(
float32_t * pSrc,
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex)
{
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t minVal1, minVal2, out; /* Temporary variables to store the output value. */
float32_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex, count; /* loop counter */
float32x4_t outV, srcV;
float32x2_t outV2;
uint32x4_t idxV;
uint32x4_t maxIdx={ULONG_MAX,ULONG_MAX,ULONG_MAX,ULONG_MAX};
uint32x4_t index={4,5,6,7};
uint32x4_t delta={4,4,4,4};
uint32x4_t countV={0,1,2,3};
uint32x2_t countV2;
/* Initialise the count value. */
count = 0U;
/* Initialise the index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparison */
if (blockSize <= 3)
{
out = *pSrc++;
blkCnt = blockSize - 1;
while (blkCnt > 0U)
{
/* Initialize maxVal1 to the next consecutive values one by one */
maxVal1 = *pSrc++;
/* compare for the minimum value */
if (out > maxVal1)
{
/* Update the minimum value and its index */
out = maxVal1;
outIndex = blockSize - blkCnt;
}
/* Decrement the loop counter */
blkCnt--;
}
}
else
{
outV = vld1q_f32(pSrc);
pSrc += 4;
/* Compute 4 outputs at a time */
blkCnt = (blockSize - 4 ) >> 2U;
while (blkCnt > 0U)
{
srcV = vld1q_f32(pSrc);
pSrc += 4;
idxV = vcltq_f32(srcV, outV);
outV = vbslq_f32(idxV, srcV, outV );
countV = vbslq_u32(idxV, index,countV );
index = vaddq_u32(index,delta);
/* Decrement the loop counter */
blkCnt--;
}
outV2 = vpmin_f32(vget_low_f32(outV),vget_high_f32(outV));
outV2 = vpmin_f32(outV2,outV2);
out = outV2[0];
idxV = vceqq_f32(outV, vdupq_n_f32(out));
countV = vbslq_u32(idxV, countV,maxIdx);
countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV));
countV2 = vpmin_u32(countV2,countV2);
outIndex = countV2[0];
/* Process the samples left over when (blockSize - 4) is not a multiple of 4 */
blkCnt = (blockSize - 4 ) % 4U;
while (blkCnt > 0U)
{
/* Initialize maxVal1 to the next consecutive values one by one */
maxVal1 = *pSrc++;
/* compare for the minimum value */
if (out > maxVal1)
{
/* Update the minimum value and its index */
out = maxVal1;
outIndex = blockSize - blkCnt ;
}
/* Decrement the loop counter */
blkCnt--;
}
}
/* Store the minimum value and its index into destination pointers */
*pResult = out;
*pIndex = outIndex;
}
#else
void arm_min_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex)
{
float32_t minVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
uint32_t index; /* index of maximum value */
#endif
/* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that acts as reference value for comparison */
out = *pSrc++;
/* Loop unrolling */
#if defined (ARM_MATH_LOOPUNROLL)
/* Initialise base index of the current group of four samples. */
index = 0U;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
minVal1 = *pSrc++;
minVal2 = *pSrc++;
/* Initialize minVal to next consecutive values one by one */
minVal = *pSrc++;
/* compare for the minimum value */
if (out > minVal1)
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal1;
outIndex = count + 1U;
/* Update the minimum value and its index */
out = minVal;
outIndex = index + 1U;
}
/* compare for the minimum value */
if (out > minVal2)
minVal = *pSrc++;
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal2;
outIndex = count + 2U;
out = minVal;
outIndex = index + 2U;
}
/* Initialize minVal to the next consecutive values one by one */
minVal1 = *pSrc++;
minVal2 = *pSrc++;
/* compare for the minimum value */
if (out > minVal1)
minVal = *pSrc++;
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal1;
outIndex = count + 3U;
out = minVal;
outIndex = index + 3U;
}
/* compare for the minimum value */
if (out > minVal2)
minVal = *pSrc++;
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal2;
outIndex = count + 4U;
out = minVal;
outIndex = index + 4U;
}
count += 4U;
index += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* if (blockSize - 1U) is not multiple of 4 */
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
/* Run the below code for Cortex-M0 */
float32_t minVal1, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* loop counter */
/* Initialise the index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
minVal1 = *pSrc++;
minVal = *pSrc++;
/* compare for the minimum value */
if (out > minVal1)
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal1;
out = minVal;
outIndex = blockSize - blkCnt;
}
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
@ -164,7 +261,8 @@ void arm_min_f32(
*pResult = out;
*pIndex = outIndex;
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
* @} end of Min group
@} end of Min group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_min_q15.c
* Description: Minimum value of a Q15 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,127 +29,113 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @addtogroup Min
* @{
@addtogroup Min
@{
*/
/**
* @brief Minimum value of a Q15 vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult minimum value returned here
* @param[out] *pIndex index of minimum value returned here
* @return none.
@brief Minimum value of a Q15 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult minimum value returned here
@param[out] pIndex index of minimum value returned here
@return none
*/
void arm_min_q15(
q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult,
uint32_t * pIndex)
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult,
uint32_t * pIndex)
{
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q15_t minVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* Loop counter */
q15_t minVal1, minVal2, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex, count; /* loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
uint32_t index; /* index of maximum value */
#endif
/* Initialise the count value. */
count = 0U;
/* Initialise the index value to zero. */
/* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that acts as reference value for comparison */
out = *pSrc++;
/* Loop unrolling */
#if defined (ARM_MATH_LOOPUNROLL)
/* Initialise base index of the current group of four samples. */
index = 0U;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
minVal1 = *pSrc++;
minVal2 = *pSrc++;
/* Initialize minVal to next consecutive values one by one */
minVal = *pSrc++;
/* compare for the minimum value */
if (out > minVal1)
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal1;
outIndex = count + 1U;
/* Update the minimum value and its index */
out = minVal;
outIndex = index + 1U;
}
/* compare for the minimum value */
if (out > minVal2)
minVal = *pSrc++;
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal2;
outIndex = count + 2U;
out = minVal;
outIndex = index + 2U;
}
/* Initialize minVal to the next consecutive values one by one */
minVal1 = *pSrc++;
minVal2 = *pSrc++;
/* compare for the minimum value */
if (out > minVal1)
minVal = *pSrc++;
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal1;
outIndex = count + 3U;
out = minVal;
outIndex = index + 3U;
}
/* compare for the minimum value */
if (out > minVal2)
minVal = *pSrc++;
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal2;
outIndex = count + 4U;
out = minVal;
outIndex = index + 4U;
}
count += 4U;
index += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* if (blockSize - 1U) is not multiple of 4 */
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
/* Run the below code for Cortex-M0 */
q15_t minVal1, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* loop counter */
/* Initialise the index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
minVal1 = *pSrc++;
minVal = *pSrc++;
/* compare for the minimum value */
if (out > minVal1)
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal1;
out = minVal;
outIndex = blockSize - blkCnt;
}
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
@ -159,5 +145,5 @@ void arm_min_q15(
}
/**
* @} end of Min group
@} end of Min group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_min_q31.c
* Description: Minimum value of a Q31 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,127 +29,113 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @addtogroup Min
* @{
@addtogroup Min
@{
*/
/**
* @brief Minimum value of a Q31 vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult minimum value returned here
* @param[out] *pIndex index of minimum value returned here
* @return none.
@brief Minimum value of a Q31 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult minimum value returned here
@param[out] pIndex index of minimum value returned here
@return none
*/
void arm_min_q31(
q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult,
uint32_t * pIndex)
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult,
uint32_t * pIndex)
{
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t minVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* Loop counter */
q31_t minVal1, minVal2, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex, count; /* loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
uint32_t index; /* index of maximum value */
#endif
/* Initialise the count value. */
count = 0U;
/* Initialise the index value to zero. */
/* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that acts as reference value for comparison */
out = *pSrc++;
/* Loop unrolling */
#if defined (ARM_MATH_LOOPUNROLL)
/* Initialise base index of the current group of four samples. */
index = 0U;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
minVal1 = *pSrc++;
minVal2 = *pSrc++;
/* Initialize minVal to next consecutive values one by one */
minVal = *pSrc++;
/* compare for the minimum value */
if (out > minVal1)
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal1;
outIndex = count + 1U;
/* Update the minimum value and its index */
out = minVal;
outIndex = index + 1U;
}
/* compare for the minimum value */
if (out > minVal2)
minVal = *pSrc++;
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal2;
outIndex = count + 2U;
out = minVal;
outIndex = index + 2U;
}
/* Initialize minVal to the next consecutive values one by one */
minVal1 = *pSrc++;
minVal2 = *pSrc++;
/* compare for the minimum value */
if (out > minVal1)
minVal = *pSrc++;
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal1;
outIndex = count + 3U;
out = minVal;
outIndex = index + 3U;
}
/* compare for the minimum value */
if (out > minVal2)
minVal = *pSrc++;
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal2;
outIndex = count + 4U;
out = minVal;
outIndex = index + 4U;
}
count += 4U;
index += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* if (blockSize - 1U) is not multiple of 4 */
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
/* Run the below code for Cortex-M0 */
q31_t minVal1, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* loop counter */
/* Initialise the index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
minVal1 = *pSrc++;
minVal = *pSrc++;
/* compare for the minimum value */
if (out > minVal1)
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal1;
out = minVal;
outIndex = blockSize - blkCnt;
}
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
@ -159,5 +145,5 @@ void arm_min_q31(
}
/**
* @} end of Min group
@} end of Min group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_min_q7.c
* Description: Minimum value of a Q7 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,127 +29,113 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @addtogroup Min
* @{
@addtogroup Min
@{
*/
/**
* @brief Minimum value of a Q7 vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult minimum value returned here
* @param[out] *pIndex index of minimum value returned here
* @return none.
@brief Minimum value of a Q7 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult minimum value returned here
@param[out] pIndex index of minimum value returned here
@return none
*/
void arm_min_q7(
q7_t * pSrc,
uint32_t blockSize,
q7_t * pResult,
uint32_t * pIndex)
const q7_t * pSrc,
uint32_t blockSize,
q7_t * pResult,
uint32_t * pIndex)
{
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q7_t minVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* Loop counter */
q7_t minVal1, minVal2, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex, count; /* loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
uint32_t index; /* index of maximum value */
#endif
/* Initialise the count value. */
count = 0U;
/* Initialise the index value to zero. */
/* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that acts as reference value for comparison */
out = *pSrc++;
/* Loop unrolling */
#if defined (ARM_MATH_LOOPUNROLL)
/* Initialise base index of the current group of four samples. */
index = 0U;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
minVal1 = *pSrc++;
minVal2 = *pSrc++;
/* Initialize minVal to next consecutive values one by one */
minVal = *pSrc++;
/* compare for the minimum value */
if (out > minVal1)
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal1;
outIndex = count + 1U;
/* Update the minimum value and its index */
out = minVal;
outIndex = index + 1U;
}
/* compare for the minimum value */
if (out > minVal2)
minVal = *pSrc++;
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal2;
outIndex = count + 2U;
out = minVal;
outIndex = index + 2U;
}
/* Initialize minVal to the next consecutive values one by one */
minVal1 = *pSrc++;
minVal2 = *pSrc++;
/* compare for the minimum value */
if (out > minVal1)
minVal = *pSrc++;
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal1;
outIndex = count + 3U;
out = minVal;
outIndex = index + 3U;
}
/* compare for the minimum value */
if (out > minVal2)
minVal = *pSrc++;
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal2;
outIndex = count + 4U;
out = minVal;
outIndex = index + 4U;
}
count += 4U;
index += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* if (blockSize - 1U) is not multiple of 4 */
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
/* Run the below code for Cortex-M0 */
q7_t minVal1, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* loop counter */
/* Initialise the index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
minVal1 = *pSrc++;
minVal = *pSrc++;
/* compare for the minimum value */
if (out > minVal1)
if (out > minVal)
{
/* Update the minimum value and its index */
out = minVal1;
out = minVal;
outIndex = blockSize - blkCnt;
}
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
@ -159,5 +145,5 @@ void arm_min_q7(
}
/**
* @} end of Min group
@} end of Min group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_power_f32.c
* Description: Sum of the squares of the elements of a floating-point vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,40 +29,37 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @defgroup power Power
*
* Calculates the sum of the squares of the elements in the input vector.
* The underlying algorithm is used:
*
* <pre>
* Result = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + pSrc[2] * pSrc[2] + ... + pSrc[blockSize-1] * pSrc[blockSize-1];
* </pre>
*
* There are separate functions for floating point, Q31, Q15, and Q7 data types.
@defgroup power Power
Calculates the sum of the squares of the elements in the input vector.
The underlying algorithm is used:
<pre>
Result = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + pSrc[2] * pSrc[2] + ... + pSrc[blockSize-1] * pSrc[blockSize-1];
</pre>
There are separate functions for floating point, Q31, Q15, and Q7 data types.
*/
/**
* @addtogroup power
* @{
@addtogroup power
@{
*/
/**
* @brief Sum of the squares of the elements of a floating-point vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult sum of the squares value returned here
* @return none.
*
@brief Sum of the squares of the elements of a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult sum of the squares value returned here
@return none
*/
#if defined(ARM_MATH_NEON)
void arm_power_f32(
float32_t * pSrc,
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
@ -70,45 +67,32 @@ void arm_power_f32(
float32_t in; /* Temporary variable to store input value */
uint32_t blkCnt; /* loop counter */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
float32x2_t sumV2;
float32x4_t inV;
/*loop Unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and then store the result in a temporary variable, sum. */
in = *pSrc++;
sum += in * in;
in = *pSrc++;
sum += in * in;
in = *pSrc++;
sum += in * in;
in = *pSrc++;
sum += in * in;
inV = vld1q_f32(pSrc);
sumV = vmlaq_f32(sumV, inV, inV);
pSrc += 4;
/* Decrement the loop counter */
blkCnt--;
}
sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
sum = sumV2[0] + sumV2[1];
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Loop over blockSize number of values */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
@ -123,7 +107,69 @@ void arm_power_f32(
/* Store the result to the destination */
*pResult = sum;
}
#else
/*
 * Scalar implementation: accumulates the sum of squares of the blockSize
 * samples at pSrc, in input order, and writes the total to *pResult.
 * A blockSize of zero stores 0.0f.
 */
void arm_power_f32(
  const float32_t * pSrc,
        uint32_t blockSize,
        float32_t * pResult)
{
  const float32_t *pIn = pSrc;   /* Read cursor over the input vector */
  float32_t acc = 0.0f;          /* Running sum of squares */
  float32_t x;                   /* Current input sample */
  uint32_t remaining;            /* Samples still to be handled one at a time */

#if defined (ARM_MATH_LOOPUNROLL)
  uint32_t groups;               /* Number of complete groups of four samples */

  /* Unrolled pass: four samples per iteration, accumulated in input order */
  for (groups = blockSize >> 2U; groups > 0U; groups--)
  {
    x = pIn[0];
    acc += x * x;

    x = pIn[1];
    acc += x * x;

    x = pIn[2];
    acc += x * x;

    x = pIn[3];
    acc += x * x;

    pIn += 4;
  }

  /* Zero to three samples are left after the unrolled pass */
  remaining = blockSize % 0x4U;

#else

  /* No unrolling: every sample goes through the loop below */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  for (; remaining > 0U; remaining--)
  {
    /* acc = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
    x = *pIn++;
    acc += x * x;
  }

  /* Hand the accumulated power back to the caller */
  *pResult = acc;
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
* @} end of power group
@} end of power group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_power_q15.c
* Description: Sum of the squares of the elements of a Q15 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,110 +29,104 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @addtogroup power
* @{
@addtogroup power
@{
*/
/**
* @brief Sum of the squares of the elements of a Q15 vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult sum of the squares value returned here
* @return none.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using a 64-bit internal accumulator.
* The input is represented in 1.15 format.
* Intermediate multiplication yields a 2.30 format, and this
* result is added without saturation to a 64-bit accumulator in 34.30 format.
* With 33 guard bits in the accumulator, there is no risk of overflow, and the
* full precision of the intermediate multiplication is preserved.
* Finally, the return result is in 34.30 format.
*
@brief Sum of the squares of the elements of a Q15 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult sum of the squares value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 64-bit internal accumulator.
The input is represented in 1.15 format.
Intermediate multiplication yields a 2.30 format, and this
result is added without saturation to a 64-bit accumulator in 34.30 format.
With 33 guard bits in the accumulator, there is no risk of overflow, and the
full precision of the intermediate multiplication is preserved.
Finally, the return result is in 34.30 format.
*/
void arm_power_q15(
q15_t * pSrc,
uint32_t blockSize,
q63_t * pResult)
const q15_t * pSrc,
uint32_t blockSize,
q63_t * pResult)
{
q63_t sum = 0; /* Temporary result storage */
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Temporary result storage */
q15_t in; /* Temporary variable to store input value */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t in32; /* Temporary variable to store packed input value */
#endif
q31_t in32; /* Temporary variable to store input value */
q15_t in16; /* Temporary variable to store input value */
uint32_t blkCnt; /* loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
/* loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and then store the result in a temporary variable, sum. */
in32 = *__SIMD32(pSrc)++;
sum = __SMLALD(in32, in32, sum);
in32 = *__SIMD32(pSrc)++;
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
#if defined (ARM_MATH_DSP)
in32 = read_q15x2_ia ((q15_t **) &pSrc);
sum = __SMLALD(in32, in32, sum);
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and then store the result in a temporary variable, sum. */
in16 = *pSrc++;
sum = __SMLALD(in16, in16, sum);
/* Decrement the loop counter */
blkCnt--;
}
in32 = read_q15x2_ia ((q15_t **) &pSrc);
sum = __SMLALD(in32, in32, sum);
#else
/* Run the below code for Cortex-M0 */
q15_t in; /* Temporary variable to store input value */
uint32_t blkCnt; /* loop counter */
/* Loop over blockSize number of values */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and then store the result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q31_t) in * in);
/* Decrement the loop counter */
in = *pSrc++;
sum += ((q31_t) in * in);
in = *pSrc++;
sum += ((q31_t) in * in);
in = *pSrc++;
sum += ((q31_t) in * in);
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
/* Store the results in 34.30 format */
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q31_t) in * in);
/* Decrement loop counter */
blkCnt--;
}
/* Store result in 34.30 format */
*pResult = sum;
}
/**
* @} end of power group
@} end of power group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_power_q31.c
* Description: Sum of the squares of the elements of a Q31 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,58 +29,51 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @addtogroup power
* @{
@addtogroup power
@{
*/
/**
* @brief Sum of the squares of the elements of a Q31 vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult sum of the squares value returned here
* @return none.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using a 64-bit internal accumulator.
* The input is represented in 1.31 format.
* Intermediate multiplication yields a 2.62 format, and this
* result is truncated to 2.48 format by discarding the lower 14 bits.
* The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
* With 15 guard bits in the accumulator, there is no risk of overflow, and the
* full precision of the intermediate multiplication is preserved.
* Finally, the return result is in 16.48 format.
*
@brief Sum of the squares of the elements of a Q31 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult sum of the squares value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 64-bit internal accumulator.
The input is represented in 1.31 format.
Intermediate multiplication yields a 2.62 format, and this
result is truncated to 2.48 format by discarding the lower 14 bits.
The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
With 15 guard bits in the accumulator, there is no risk of overflow, and the
full precision of the intermediate multiplication is preserved.
Finally, the return result is in 16.48 format.
*/
void arm_power_q31(
q31_t * pSrc,
uint32_t blockSize,
q63_t * pResult)
const q31_t * pSrc,
uint32_t blockSize,
q63_t * pResult)
{
q63_t sum = 0; /* Temporary result storage */
q31_t in;
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Temporary result storage */
q31_t in; /* Temporary variable to store input value */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power then shift intermediate results by 14 bits to maintain 16.48 format and then store the result in a temporary variable sum, providing 15 guard bits. */
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power then shift intermediate results by 14 bits to maintain 16.48 format and store result in a temporary variable sum, providing 15 guard bits. */
in = *pSrc++;
sum += ((q63_t) in * in) >> 14U;
@ -93,37 +86,36 @@ void arm_power_q31(
in = *pSrc++;
sum += ((q63_t) in * in) >> 14U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Loop over blockSize number of values */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and then store the result in a temporary variable, sum. */
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q63_t) in * in) >> 14U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* Store the results in 16.48 format */
/* Store results in 16.48 format */
*pResult = sum;
}
/**
* @} end of power group
@} end of power group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_power_q7.c
* Description: Sum of the squares of the elements of a Q7 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,99 +29,108 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @addtogroup power
* @{
@addtogroup power
@{
*/
/**
* @brief Sum of the squares of the elements of a Q7 vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult sum of the squares value returned here
* @return none.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using a 32-bit internal accumulator.
* The input is represented in 1.7 format.
* Intermediate multiplication yields a 2.14 format, and this
* result is added without saturation to an accumulator in 18.14 format.
* With 17 guard bits in the accumulator, there is no risk of overflow, and the
* full precision of the intermediate multiplication is preserved.
* Finally, the return result is in 18.14 format.
*
@brief Sum of the squares of the elements of a Q7 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult sum of the squares value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 32-bit internal accumulator.
The input is represented in 1.7 format.
Intermediate multiplication yields a 2.14 format, and this
result is added without saturation to an accumulator in 18.14 format.
With 17 guard bits in the accumulator, there is no risk of overflow, and the
full precision of the intermediate multiplication is preserved.
Finally, the return result is in 18.14 format.
*/
void arm_power_q7(
q7_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
const q7_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
{
q31_t sum = 0; /* Temporary result storage */
q7_t in; /* Temporary variable to store input */
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
q31_t sum = 0; /* Temporary result storage */
q7_t in; /* Temporary variable to store input value */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t in32; /* Temporary variable to store packed input value */
q31_t in1, in2; /* Temporary variables to store input value */
#endif
q31_t input1; /* Temporary variable to store packed input */
q31_t in1, in2; /* Temporary variables to store input */
#if defined (ARM_MATH_LOOPUNROLL)
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* Reading two inputs of pSrc vector and packing */
input1 = *__SIMD32(pSrc)++;
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
in1 = __SXTB16(__ROR(input1, 8));
in2 = __SXTB16(input1);
/* Compute Power and store result in a temporary variable, sum. */
#if defined (ARM_MATH_DSP)
in32 = read_q7x4_ia ((q7_t **) &pSrc);
in1 = __SXTB16(__ROR(in32, 8));
in2 = __SXTB16(in32);
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* calculate power and accumulate to accumulator */
sum = __SMLAD(in1, in1, sum);
sum = __SMLAD(in2, in2, sum);
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Loop over blockSize number of values */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and then store the result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q15_t) in * in);
/* Decrement the loop counter */
in = *pSrc++;
sum += ((q15_t) in * in);
in = *pSrc++;
sum += ((q15_t) in * in);
in = *pSrc++;
sum += ((q15_t) in * in);
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
/* Store the result in 18.14 format */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q15_t) in * in);
/* Decrement loop counter */
blkCnt--;
}
/* Store result in 18.14 format */
*pResult = sum;
}
/**
* @} end of power group
@} end of power group
*/

View file

@ -1,15 +1,15 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_rms_f32.c
* Description: Root mean square value of an array of F32 type
* Description: Root mean square value of the elements of a floating-point vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,88 +29,75 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @defgroup RMS Root mean square (RMS)
*
*
* Calculates the Root Mean Sqaure of the elements in the input vector.
* The underlying algorithm is used:
*
* <pre>
* Result = sqrt(((pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]) / blockSize));
* </pre>
*
* There are separate functions for floating point, Q31, and Q15 data types.
@defgroup RMS Root mean square (RMS)
Calculates the Root Mean Square of the elements in the input vector.
The underlying algorithm used is:
<pre>
Result = sqrt(((pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]) / blockSize));
</pre>
There are separate functions for floating point, Q31, and Q15 data types.
*/
/**
* @addtogroup RMS
* @{
@addtogroup RMS
@{
*/
/**
* @brief Root Mean Square of the elements of a floating-point vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult rms value returned here
* @return none.
*
@brief Root Mean Square of the elements of a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult root mean square value returned here
@return none
*/
#if defined(ARM_MATH_NEON)
void arm_rms_f32(
float32_t * pSrc,
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
float32_t sum = 0.0f; /* Accumulator */
float32_t in; /* Tempoprary variable to store input value */
float32_t sum = 0.0f; /* accumulator */
float32_t in; /* Temporary variable to store input value */
uint32_t blkCnt; /* loop counter */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
float32x2_t sumV2;
float32x4_t inV;
/* loop Unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute sum of the squares and then store the result in a temporary variable, sum */
in = *pSrc++;
sum += in * in;
in = *pSrc++;
sum += in * in;
in = *pSrc++;
sum += in * in;
in = *pSrc++;
sum += in * in;
/* Compute Power and then store the result in a temporary variable, sum. */
inV = vld1q_f32(pSrc);
sumV = vmlaq_f32(sumV, inV, inV);
pSrc += 4;
/* Decrement the loop counter */
blkCnt--;
}
sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
sum = sumV2[0] + sumV2[1];
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Loop over blockSize number of values */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute sum of the squares and then store the results in a temporary variable, sum */
/* compute power and then store the result in a temporary variable, sum. */
in = *pSrc++;
sum += in * in;
@ -121,7 +108,69 @@ void arm_rms_f32(
/* Compute Rms and store the result in the destination */
arm_sqrt_f32(sum / (float32_t) blockSize, pResult);
}
#else
/*
 * Scalar implementation of the floating-point RMS function, compiled when
 * ARM_MATH_NEON is not defined. Adds up the square of each of the blockSize
 * samples read from pSrc, in input order, divides the total by blockSize and
 * passes the quotient to arm_sqrt_f32 together with pResult.
 */
void arm_rms_f32(
  const float32_t * pSrc,
        uint32_t blockSize,
        float32_t * pResult)
{
  float32_t acc = 0.0f;                  /* Running sum of squares */
  uint32_t remaining;                    /* Samples left for the one-at-a-time loop */

#if defined (ARM_MATH_LOOPUNROLL)
  uint32_t groups;                       /* Complete groups of four samples */

  /* Unrolled pass: four samples per iteration, accumulated in input order */
  for (groups = blockSize >> 2U; groups > 0U; groups--)
  {
    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
    acc += pSrc[0] * pSrc[0];
    acc += pSrc[1] * pSrc[1];
    acc += pSrc[2] * pSrc[2];
    acc += pSrc[3] * pSrc[3];

    /* Step past the group just consumed */
    pSrc += 4;
  }

  /* Whatever did not fill a complete group (blockSize modulo 4) */
  remaining = blockSize % 0x4U;
#else
  /* No unrolling: every sample goes through the loop below */
  remaining = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  for (; remaining > 0U; remaining--)
  {
    const float32_t sample = *pSrc++;

    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
    acc += sample * sample;
  }

  /* Mean of the squares, then its square root, delivered through pResult */
  arm_sqrt_f32(acc / (float32_t) blockSize, pResult);
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
* @} end of RMS group
@} end of RMS group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_rms_q15.c
* Description: Root Mean Square of the elements of a Q15 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,111 +29,106 @@
#include "arm_math.h"
/**
* @addtogroup RMS
* @{
@ingroup groupStats
*/
/**
* @brief Root Mean Square of the elements of a Q15 vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult rms value returned here
* @return none.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using a 64-bit internal accumulator.
* The input is represented in 1.15 format.
* Intermediate multiplication yields a 2.30 format, and this
* result is added without saturation to a 64-bit accumulator in 34.30 format.
* With 33 guard bits in the accumulator, there is no risk of overflow, and the
* full precision of the intermediate multiplication is preserved.
* Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
* 15 bits, and then saturated to yield a result in 1.15 format.
*
@addtogroup RMS
@{
*/
/**
@brief Root Mean Square of the elements of a Q15 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult root mean square value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 64-bit internal accumulator.
The input is represented in 1.15 format.
Intermediate multiplication yields a 2.30 format, and this
result is added without saturation to a 64-bit accumulator in 34.30 format.
With 33 guard bits in the accumulator, there is no risk of overflow, and the
full precision of the intermediate multiplication is preserved.
Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
15 bits, and then saturated to yield a result in 1.15 format.
*/
void arm_rms_q15(
q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult)
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult)
{
q63_t sum = 0; /* accumulator */
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Temporary result storage */
q15_t in; /* Temporary variable to store input value */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t in32; /* Temporary variable to store input value */
#endif
q31_t in; /* temporary variable to store the input value */
q15_t in1; /* temporary variable to store the input value */
uint32_t blkCnt; /* loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
/* loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
/* Compute sum of the squares and then store the results in a temporary variable, sum */
in = *__SIMD32(pSrc)++;
sum = __SMLALD(in, in, sum);
in = *__SIMD32(pSrc)++;
sum = __SMLALD(in, in, sum);
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
/* Compute sum of the squares and then store the results in a temporary variable, sum */
in1 = *pSrc++;
sum = __SMLALD(in1, in1, sum);
/* Decrement the loop counter */
blkCnt--;
}
/* Truncating and saturating the accumulator to 1.15 format */
/* Store the result in the destination */
arm_sqrt_q15(__SSAT((sum / (q63_t)blockSize) >> 15, 16), pResult);
/* Compute sum of squares and store result in a temporary variable. */
#if defined (ARM_MATH_DSP)
in32 = read_q15x2_ia ((q15_t **) &pSrc);
sum = __SMLALD(in32, in32, sum);
in32 = read_q15x2_ia ((q15_t **) &pSrc);
sum = __SMLALD(in32, in32, sum);
#else
/* Run the below code for Cortex-M0 */
q15_t in; /* temporary variable to store the input value */
uint32_t blkCnt; /* loop counter */
/* Loop over blockSize number of values */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
/* Compute sum of the squares and then store the results in a temporary variable, sum */
in = *pSrc++;
sum += ((q31_t) in * in);
/* Decrement the loop counter */
in = *pSrc++;
sum += ((q31_t) in * in);
in = *pSrc++;
sum += ((q31_t) in * in);
in = *pSrc++;
sum += ((q31_t) in * in);
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
in = *pSrc++;
/* Compute sum of squares and store result in a temporary variable. */
sum += ((q31_t) in * in);
/* Decrement loop counter */
blkCnt--;
}
/* Truncating and saturating the accumulator to 1.15 format */
/* Store the result in the destination */
/* Store result in destination */
arm_sqrt_q15(__SSAT((sum / (q63_t)blockSize) >> 15, 16), pResult);
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of RMS group
@} end of RMS group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_rms_q31.c
* Description: Root Mean Square of the elements of a Q31 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,109 +29,96 @@
#include "arm_math.h"
/**
* @addtogroup RMS
* @{
@ingroup groupStats
*/
/**
@addtogroup RMS
@{
*/
/**
* @brief Root Mean Square of the elements of a Q31 vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult rms value returned here
* @return none.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
*\par
* The function is implemented using an internal 64-bit accumulator.
* The input is represented in 1.31 format, and intermediate multiplication
* yields a 2.62 format.
* The accumulator maintains full precision of the intermediate multiplication results,
* but provides only a single guard bit.
* There is no saturation on intermediate additions.
* If the accumulator overflows, it wraps around and distorts the result.
* In order to avoid overflows completely, the input signal must be scaled down by
* log2(blockSize) bits, as a total of blockSize additions are performed internally.
* Finally, the 2.62 accumulator is right shifted by 31 bits to yield a 1.31 format value.
*
@brief Root Mean Square of the elements of a Q31 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult root mean square value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The input is represented in 1.31 format, and intermediate multiplication
yields a 2.62 format.
The accumulator maintains full precision of the intermediate multiplication results,
but provides only a single guard bit.
There is no saturation on intermediate additions.
If the accumulator overflows, it wraps around and distorts the result.
In order to avoid overflows completely, the input signal must be scaled down by
log2(blockSize) bits, as a total of blockSize additions are performed internally.
Finally, the 2.62 accumulator is right shifted by 31 bits to yield a 1.31 format value.
*/
void arm_rms_q31(
q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
{
q63_t sum = 0; /* accumulator */
q31_t in; /* Temporary variable to store the input */
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
uint64_t sum = 0; /* Temporary result storage (can never be negative, so the type was changed from q63_t to uint64_t) */
q31_t in; /* Temporary variable to store input value */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
#if defined (ARM_MATH_LOOPUNROLL)
q31_t in1, in2, in3, in4; /* Temporary input variables */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 8 outputs at a time.
** a second loop below computes the remaining 1 to 7 samples. */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute sum of the squares and then store the result in a temporary variable, sum */
/* read two samples from source buffer */
in1 = pSrc[0];
in2 = pSrc[1];
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* calculate power and accumulate to accumulator */
sum += (q63_t) in1 *in1;
sum += (q63_t) in2 *in2;
in = *pSrc++;
/* Compute sum of squares and store result in a temporary variable, sum. */
sum += ((q63_t) in * in);
/* read two samples from source buffer */
in3 = pSrc[2];
in4 = pSrc[3];
in = *pSrc++;
sum += ((q63_t) in * in);
/* calculate power and accumulate to accumulator */
sum += (q63_t) in3 *in3;
sum += (q63_t) in4 *in4;
in = *pSrc++;
sum += ((q63_t) in * in);
in = *pSrc++;
sum += ((q63_t) in * in);
/* update source buffer to process next samples */
pSrc += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 8, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute sum of the squares and then store the results in a temporary variable, sum */
in = *pSrc++;
sum += (q63_t) in *in;
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Decrement the loop counter */
in = *pSrc++;
/* Compute sum of squares and store result in a temporary variable. */
sum += ((q63_t) in * in);
/* Decrement loop counter */
blkCnt--;
}
/* Convert data in 2.62 to 1.31 by 31 right shifts and saturate */
/* Compute Rms and store the result in the destination vector */
/* Compute Rms and store result in destination vector */
arm_sqrt_q31(clip_q63_to_q31((sum / (q63_t) blockSize) >> 31), pResult);
}
/**
* @} end of RMS group
@} end of RMS group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_std_f32.c
* Description: Standard deviation of the elements of a floating-point vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,111 +29,131 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @defgroup STD Standard deviation
*
* Calculates the standard deviation of the elements in the input vector.
* The underlying algorithm used is:
*
* <pre>
* Result = sqrt((sumOfSquares - sum<sup>2</sup> / blockSize) / (blockSize - 1))
*
* where, sumOfSquares = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]
*
* sum = pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]
* </pre>
*
* There are separate functions for floating point, Q31, and Q15 data types.
@defgroup STD Standard deviation
Calculates the standard deviation of the elements in the input vector.
The underlying algorithm used is:
<pre>
Result = sqrt((sumOfSquares - sum<sup>2</sup> / blockSize) / (blockSize - 1))
sumOfSquares = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]
sum = pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]
</pre>
There are separate functions for floating point, Q31, and Q15 data types.
*/
/**
* @addtogroup STD
* @{
@addtogroup STD
@{
*/
/**
* @brief Standard deviation of the elements of a floating-point vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult standard deviation value returned here
* @return none.
@brief Standard deviation of the elements of a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult standard deviation value returned here
@return none
*/
#if defined(ARM_MATH_NEON_EXPERIMENTAL)
void arm_std_f32(
float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
float32_t sum = 0.0f; /* Temporary result storage */
float32_t sumOfSquares = 0.0f; /* Sum of squares */
float32_t in; /* input value */
uint32_t blkCnt; /* loop counter */
#if defined (ARM_MATH_DSP)
float32_t meanOfSquares, mean, squareOfMean; /* Temporary variables */
float32_t var;
arm_var_f32(pSrc,blockSize,&var);
arm_sqrt_f32(var, pResult);
}
#else
float32_t squareOfSum; /* Square of Sum */
float32_t var; /* Temporary variance storage */
void arm_std_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
float32_t sum = 0.0f; /* Temporary result storage */
float32_t sumOfSquares = 0.0f; /* Sum of squares */
float32_t in; /* Temporary variable to store input value */
#ifndef ARM_MATH_CM0_FAMILY
float32_t meanOfSquares, mean, squareOfMean; /* Temporary variables */
#else
float32_t squareOfSum; /* Square of Sum */
float32_t var; /* Temporary variance storage */
#endif
if (blockSize == 1U)
if (blockSize <= 1U)
{
*pResult = 0;
return;
}
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
#if defined (ARM_MATH_LOOPUNROLL)
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
/* Compute Sum of squares of the input samples
* and then store the result in a temporary variable, sum. */
in = *pSrc++;
sum += in;
sumOfSquares += in * in;
in = *pSrc++;
sum += in;
sumOfSquares += in * in;
in = *pSrc++;
sum += in;
sumOfSquares += in * in;
in = *pSrc++;
sum += in;
sumOfSquares += in * in;
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* C = A[0] + A[1] + ... + A[blockSize-1] */
/* Decrement the loop counter */
in = *pSrc++;
/* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += in * in;
/* Compute sum and store result in a temporary variable, sum. */
sum += in;
in = *pSrc++;
sumOfSquares += in * in;
sum += in;
in = *pSrc++;
sumOfSquares += in * in;
sum += in;
in = *pSrc++;
sumOfSquares += in * in;
sum += in;
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
/* Compute Sum of squares of the input samples
* and then store the result in a temporary variable, sum. */
in = *pSrc++;
sum += in;
sumOfSquares += in * in;
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* C = A[0] + A[1] + ... + A[blockSize-1] */
/* Decrement the loop counter */
in = *pSrc++;
/* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += ( in * in);
/* Compute sum and store result in a temporary variable, sum. */
sum += in;
/* Decrement loop counter */
blkCnt--;
}
/* Compute Mean of squares of the input samples
* and then store the result in a temporary variable, meanOfSquares. */
#ifndef ARM_MATH_CM0_FAMILY
/* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
meanOfSquares = sumOfSquares / ((float32_t) blockSize - 1.0f);
/* Compute mean of all input values */
@ -143,44 +163,26 @@ void arm_std_f32(
squareOfMean = (mean * mean) * (((float32_t) blockSize) /
((float32_t) blockSize - 1.0f));
/* Compute standard deviation and then store the result to the destination */
/* Compute standard deviation and store result to destination */
arm_sqrt_f32((meanOfSquares - squareOfMean), pResult);
#else
/* Run the below code for Cortex-M0 */
/* Loop over blockSize number of values */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
/* Compute Sum of squares of the input samples
* and then store the result in a temporary variable, sumOfSquares. */
in = *pSrc++;
sumOfSquares += in * in;
/* C = (A[0] + A[1] + ... + A[blockSize-1]) */
/* Compute Sum of the input samples
* and then store the result in a temporary variable, sum. */
sum += in;
/* Decrement the loop counter */
blkCnt--;
}
/* Compute the square of sum */
/* Compute square of sum */
squareOfSum = ((sum * sum) / (float32_t) blockSize);
/* Compute the variance */
/* Compute variance */
var = ((sumOfSquares - squareOfSum) / (float32_t) (blockSize - 1.0f));
/* Compute standard deviation and then store the result to the destination */
/* Compute standard deviation and store result in destination */
arm_sqrt_f32(var, pResult);
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #ifndef ARM_MATH_CM0_FAMILY */
}
#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
/**
* @} end of STD group
@} end of STD group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_std_q15.c
* Description: Standard deviation of the elements of a Q15 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,146 +29,133 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @addtogroup STD
* @{
@addtogroup STD
@{
*/
/**
* @brief Standard deviation of the elements of a Q15 vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult standard deviation value returned here
* @return none.
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using a 64-bit internal accumulator.
* The input is represented in 1.15 format.
* Intermediate multiplication yields a 2.30 format, and this
* result is added without saturation to a 64-bit accumulator in 34.30 format.
* With 33 guard bits in the accumulator, there is no risk of overflow, and the
* full precision of the intermediate multiplication is preserved.
* Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
* 15 bits, and then saturated to yield a result in 1.15 format.
@brief Standard deviation of the elements of a Q15 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult standard deviation value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 64-bit internal accumulator.
The input is represented in 1.15 format.
Intermediate multiplication yields a 2.30 format, and this
result is added without saturation to a 64-bit accumulator in 34.30 format.
With 33 guard bits in the accumulator, there is no risk of overflow, and the
full precision of the intermediate multiplication is preserved.
Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
15 bits, and then saturated to yield a result in 1.15 format.
*/
void arm_std_q15(
q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult)
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult)
{
q31_t sum = 0; /* Accumulator */
q31_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
uint32_t blkCnt; /* loop counter */
q63_t sumOfSquares = 0; /* Accumulator */
#if defined (ARM_MATH_DSP)
q31_t in; /* input value */
q15_t in1; /* input value */
#else
q15_t in; /* input value */
uint32_t blkCnt; /* Loop counter */
q31_t sum = 0; /* Accumulator */
q31_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
q63_t sumOfSquares = 0; /* Sum of squares */
q15_t in; /* Temporary variable to store input value */
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t in32; /* Temporary variable to store input value */
#endif
if (blockSize == 1U)
if (blockSize <= 1U)
{
*pResult = 0;
return;
}
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
#if defined (ARM_MATH_LOOPUNROLL)
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
/* Compute Sum of squares of the input samples
* and then store the result in a temporary variable, sum. */
in = *__SIMD32(pSrc)++;
sum += ((in << 16U) >> 16U);
sum += (in >> 16U);
sumOfSquares = __SMLALD(in, in, sumOfSquares);
in = *__SIMD32(pSrc)++;
sum += ((in << 16U) >> 16U);
sum += (in >> 16U);
sumOfSquares = __SMLALD(in, in, sumOfSquares);
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* C = A[0] + A[1] + ... + A[blockSize-1] */
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
/* Compute Sum of squares of the input samples
* and then store the result in a temporary variable, sum. */
in1 = *pSrc++;
sumOfSquares = __SMLALD(in1, in1, sumOfSquares);
sum += in1;
/* Decrement the loop counter */
blkCnt--;
}
/* Compute Mean of squares of the input samples
* and then store the result in a temporary variable, meanOfSquares. */
meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U));
/* Compute square of mean */
squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
/* mean of the squares minus the square of the mean. */
/* Compute standard deviation and store the result to the destination */
arm_sqrt_q15(__SSAT((meanOfSquares - squareOfMean) >> 15U, 16U), pResult);
/* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
/* Compute sum and store result in a temporary variable, sum. */
#if defined (ARM_MATH_DSP)
in32 = read_q15x2_ia ((q15_t **) &pSrc);
sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
sum += ((in32 << 16U) >> 16U);
sum += (in32 >> 16U);
in32 = read_q15x2_ia ((q15_t **) &pSrc);
sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
sum += ((in32 << 16U) >> 16U);
sum += (in32 >> 16U);
#else
/* Run the below code for Cortex-M0 */
/* Loop over blockSize number of values */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
/* Compute Sum of squares of the input samples
* and then store the result in a temporary variable, sumOfSquares. */
in = *pSrc++;
sumOfSquares += (in * in);
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
/* Compute sum of all input values and then store the result in a temporary variable, sum. */
sum += in;
/* Decrement the loop counter */
in = *pSrc++;
sumOfSquares += (in * in);
sum += in;
in = *pSrc++;
sumOfSquares += (in * in);
sum += in;
in = *pSrc++;
sumOfSquares += (in * in);
sum += in;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
/* Compute Mean of squares of the input samples
* and then store the result in a temporary variable, meanOfSquares. */
meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U));
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* C = A[0] + A[1] + ... + A[blockSize-1] */
in = *pSrc++;
/* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += (in * in);
/* Compute sum and store result in a temporary variable, sum. */
sum += in;
/* Decrement loop counter */
blkCnt--;
}
/* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1U));
/* Compute square of mean */
squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
squareOfMean = (q31_t) ((q63_t) sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
/* mean of the squares minus the square of the mean. */
/* Compute standard deviation and store the result to the destination */
/* mean of squares minus the square of mean. */
/* Compute standard deviation and store result in destination */
arm_sqrt_q15(__SSAT((meanOfSquares - squareOfMean) >> 15U, 16U), pResult);
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of STD group
@} end of STD group
*/

View file

@ -1,15 +1,15 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_std_q31.c
* Description: Standard deviation of an array of Q31 type.
* Description: Standard deviation of the elements of a Q31 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,141 +29,119 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @addtogroup STD
* @{
@addtogroup STD
@{
*/
/**
* @brief Standard deviation of the elements of a Q31 vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult standard deviation value returned here
* @return none.
* @details
* <b>Scaling and Overflow Behavior:</b>
*
*\par
* The function is implemented using an internal 64-bit accumulator.
* The input is represented in 1.31 format, which is then downshifted by 8 bits
* which yields 1.23, and intermediate multiplication yields a 2.46 format.
* The accumulator maintains full precision of the intermediate multiplication results,
* but provides only 16 guard bits.
* There is no saturation on intermediate additions.
* If the accumulator overflows it wraps around and distorts the result.
* In order to avoid overflows completely the input signal must be scaled down by
* log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
* After division, internal variables should be Q18.46
* Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
*
@brief Standard deviation of the elements of a Q31 vector.
@param[in] pSrc points to the input vector.
@param[in] blockSize number of samples in input vector.
@param[out] pResult standard deviation value returned here.
@return none
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The input is represented in 1.31 format, which is then downshifted by 8 bits
which yields 1.23, and intermediate multiplication yields a 2.46 format.
The accumulator maintains full precision of the intermediate multiplication results,
but provides only 16 guard bits.
There is no saturation on intermediate additions.
If the accumulator overflows it wraps around and distorts the result.
In order to avoid overflows completely the input signal must be scaled down by
log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
After division, internal variables should be Q18.46
Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
*/
void arm_std_q31(
q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
{
q63_t sum = 0; /* Accumulator */
q63_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
q31_t in; /* input value */
uint32_t blkCnt; /* loop counter */
q63_t sumOfSquares = 0; /* Accumulator */
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Accumulator */
q63_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
q63_t sumOfSquares = 0; /* Sum of squares */
q31_t in; /* Temporary variable to store input value */
if (blockSize == 1U)
if (blockSize <= 1U)
{
*pResult = 0;
return;
}
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
#if defined (ARM_MATH_LOOPUNROLL)
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
/* Compute Sum of squares of the input samples
* and then store the result in a temporary variable, sum. */
in = *pSrc++ >> 8U;
sum += in;
sumOfSquares += ((q63_t) (in) * (in));
in = *pSrc++ >> 8U;
sum += in;
sumOfSquares += ((q63_t) (in) * (in));
in = *pSrc++ >> 8U;
sum += in;
sumOfSquares += ((q63_t) (in) * (in));
in = *pSrc++ >> 8U;
sum += in;
sumOfSquares += ((q63_t) (in) * (in));
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* C = A[0] + A[1] + ... + A[blockSize-1] */
/* Decrement the loop counter */
in = *pSrc++ >> 8U;
/* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += ((q63_t) (in) * (in));
/* Compute sum and store result in a temporary variable, sum. */
sum += in;
in = *pSrc++ >> 8U;
sumOfSquares += ((q63_t) (in) * (in));
sum += in;
in = *pSrc++ >> 8U;
sumOfSquares += ((q63_t) (in) * (in));
sum += in;
in = *pSrc++ >> 8U;
sumOfSquares += ((q63_t) (in) * (in));
sum += in;
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
/* Compute Sum of squares of the input samples
* and then store the result in a temporary variable, sum. */
in = *pSrc++ >> 8U;
sum += in;
sumOfSquares += ((q63_t) (in) * (in));
/* Decrement the loop counter */
blkCnt--;
}
/* Compute Mean of squares of the input samples
* and then store the result in a temporary variable, meanOfSquares. */
meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U);
#else
/* Run the below code for Cortex-M0 */
/* Loop over blockSize number of values */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
/* Compute Sum of squares of the input samples
* and then store the result in a temporary variable, sumOfSquares. */
in = *pSrc++ >> 8U;
sumOfSquares += ((q63_t) (in) * (in));
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* C = A[0] + A[1] + ... + A[blockSize-1] */
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
/* Compute sum of all input values and then store the result in a temporary variable, sum. */
in = *pSrc++ >> 8U;
/* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += ((q63_t) (in) * (in));
/* Compute sum and store result in a temporary variable, sum. */
sum += in;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* Compute Mean of squares of the input samples
* and then store the result in a temporary variable, meanOfSquares. */
meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U);
#endif /* #if defined (ARM_MATH_DSP) */
/* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
meanOfSquares = (sumOfSquares / (q63_t)(blockSize - 1U));
/* Compute square of mean */
squareOfMean = sum * sum / (q63_t)(blockSize * (blockSize - 1U));
squareOfMean = ( sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
/* Compute standard deviation and then store the result to the destination */
/* Compute standard deviation and store result in destination */
arm_sqrt_q31((meanOfSquares - squareOfMean) >> 15U, pResult);
}
/**
* @} end of STD group
@} end of STD group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_var_f32.c
* Description: Variance of the elements of a floating-point vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,153 +29,206 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @defgroup variance Variance
*
* Calculates the variance of the elements in the input vector.
* The underlying algorithm used is the direct method sometimes referred to as the two-pass method:
*
* <pre>
* Result = sum((element - meanOfElements)^2) / (numElement - 1)
*
* where, meanOfElements = ( pSrc[0] + pSrc[1] + ... + pSrc[blockSize-1] ) / blockSize
*
* </pre>
*
* There are separate functions for floating point, Q31, and Q15 data types.
@defgroup variance Variance
Calculates the variance of the elements in the input vector.
The underlying algorithm used is the direct method sometimes referred to as the two-pass method:
<pre>
Result = sum((element - meanOfElements)^2) / (numElement - 1)
meanOfElements = ( pSrc[0] + pSrc[1] + ... + pSrc[blockSize-1] ) / blockSize
</pre>
There are separate functions for floating point, Q31, and Q15 data types.
*/
/**
* @addtogroup variance
* @{
@addtogroup variance
@{
*/
/**
* @brief Variance of the elements of a floating-point vector.
* @param[in] *pSrc points to the input vector
* @param[in] blockSize length of the input vector
* @param[out] *pResult variance value returned here
* @return none.
@brief Variance of the elements of a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult variance value returned here
@return none
*/
#if defined(ARM_MATH_NEON_EXPERIMENTAL)
void arm_var_f32(
float32_t * pSrc,
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
float32_t fMean, fValue;
uint32_t blkCnt; /* loop counter */
float32_t * pInput = pSrc;
float32_t sum = 0.0f;
float32_t fSum = 0.0f;
#if defined(ARM_MATH_DSP)
float32_t in1, in2, in3, in4;
#endif
float32_t mean;
if (blockSize <= 1U)
{
*pResult = 0;
return;
}
float32_t sum = 0.0f; /* accumulator */
float32_t in; /* Temporary variable to store input value */
uint32_t blkCnt; /* loop counter */
#if defined(ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M7 */
float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
float32x2_t sumV2;
float32x4_t inV;
float32x4_t avg;
/*loop Unrolling */
blkCnt = blockSize >> 2U;
arm_mean_f32(pSrc,blockSize,&mean);
avg = vdupq_n_f32(mean);
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
in1 = *pInput++;
in2 = *pInput++;
in3 = *pInput++;
in4 = *pInput++;
blkCnt = blockSize >> 2U;
sum += in1;
sum += in2;
sum += in3;
sum += in4;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and then store the result in a temporary variable, sum. */
inV = vld1q_f32(pSrc);
inV = vsubq_f32(inV, avg);
sumV = vmlaq_f32(sumV, inV, inV);
pSrc += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
sum = sumV2[0] + sumV2[1];
#else
/* Run the below code for Cortex-M0 or Cortex-M3 */
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
/* Loop over blockSize number of values */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* compute power and then store the result in a temporary variable, sum. */
in = *pSrc++;
in = in - mean;
sum += in * in;
#endif
/* Decrement the loop counter */
blkCnt--;
}
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pInput++;
/* Variance */
*pResult = sum / (float32_t)(blockSize - 1.0f);
/* Decrement the loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
fMean = sum / (float32_t) blockSize;
pInput = pSrc;
#if defined(ARM_MATH_DSP)
/*loop Unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
fValue = *pInput++ - fMean;
fSum += fValue * fValue;
fValue = *pInput++ - fMean;
fSum += fValue * fValue;
fValue = *pInput++ - fMean;
fSum += fValue * fValue;
fValue = *pInput++ - fMean;
fSum += fValue * fValue;
/* Decrement the loop counter */
blkCnt--;
}
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 or Cortex-M3 */
/* Loop over blockSize number of values */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
fValue = *pInput++ - fMean;
fSum += fValue * fValue;
/* Decrement the loop counter */
blkCnt--;
}
/* Variance */
*pResult = fSum / (float32_t)(blockSize - 1.0f);
}
#else
/*
 * Scalar implementation, compiled when ARM_MATH_NEON_EXPERIMENTAL is not defined.
 *
 * Two passes over the input:
 *   1. accumulate the samples and divide by blockSize to obtain the mean;
 *   2. accumulate the squared deviation of every sample from that mean and
 *      divide by (blockSize - 1).
 *
 * pSrc      : input vector, read only
 * blockSize : number of samples in the input vector
 * pResult   : receives the variance; receives 0 when blockSize <= 1
 *
 * With ARM_MATH_LOOPUNROLL defined, each pass handles four samples per
 * iteration and the trailing loop picks up the 0..3 leftover samples.
 * Samples are accumulated in index order in both configurations.
 */
void arm_var_f32(
  const float32_t * pSrc,
        uint32_t blockSize,
        float32_t * pResult)
{
  const float32_t *pCur = pSrc;   /* Read cursor over the input vector */
  float32_t acc = 0.0f;           /* Pass 1: running sum of the samples */
  float32_t sqDevAcc = 0.0f;      /* Pass 2: running sum of squared deviations */
  float32_t avg;                  /* Mean of the samples */
  float32_t dev;                  /* Deviation of the current sample from the mean */
  uint32_t count;                 /* Remaining iterations of the current loop */

  /* Nothing to divide by (blockSize - 1) for fewer than two samples */
  if (blockSize <= 1U)
  {
    *pResult = 0;
    return;
  }

  /* ---- Pass 1: sum of all samples ---- */
#if defined (ARM_MATH_LOOPUNROLL)

  /* Four samples per iteration */
  for (count = blockSize >> 2U; count > 0U; count--)
  {
    acc += pCur[0];
    acc += pCur[1];
    acc += pCur[2];
    acc += pCur[3];

    pCur += 4;
  }

  /* Leftover samples are handled one at a time below */
  count = blockSize % 0x4U;

#else

  /* No unrolling: every sample goes through the loop below */
  count = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  for (; count > 0U; count--)
  {
    acc += *pCur++;
  }

  /* Mean = (A[0] + A[1] + ... + A[blockSize-1]) / blockSize */
  avg = acc / (float32_t) blockSize;

  /* ---- Pass 2: sum of squared deviations from the mean ---- */
  pCur = pSrc;

#if defined (ARM_MATH_LOOPUNROLL)

  /* Four samples per iteration */
  for (count = blockSize >> 2U; count > 0U; count--)
  {
    dev = pCur[0] - avg;
    sqDevAcc += dev * dev;

    dev = pCur[1] - avg;
    sqDevAcc += dev * dev;

    dev = pCur[2] - avg;
    sqDevAcc += dev * dev;

    dev = pCur[3] - avg;
    sqDevAcc += dev * dev;

    pCur += 4;
  }

  /* Leftover samples are handled one at a time below */
  count = blockSize % 0x4U;

#else

  /* No unrolling: every sample goes through the loop below */
  count = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  for (; count > 0U; count--)
  {
    dev = *pCur++ - avg;
    sqDevAcc += dev * dev;
  }

  /* Variance: sum of squared deviations divided by (blockSize - 1) */
  *pResult = sqDevAcc / (float32_t)(blockSize - 1.0f);
}
#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
/**
* @} end of variance group
@} end of variance group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_var_q15.c
* Description: Variance of the elements of a Q15 vector
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,144 +29,136 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @addtogroup variance
* @{
@addtogroup variance
@{
*/
/**
  @brief         Variance of the elements of a Q15 vector.
  @param[in]     pSrc       points to the input vector
  @param[in]     blockSize  number of samples in input vector
  @param[out]    pResult    variance value returned here
  @return        none

  @par           Details
                   The unbiased (N-1) estimator is evaluated as
                   sumOfSquares / (N-1)  -  sum * sum / (N * (N-1)).
                   For blockSize <= 1 the result is set to 0.

  @par           Scaling and Overflow Behavior
                   The function is implemented using a 64-bit internal accumulator.
                   The input is represented in 1.15 format.
                   Intermediate multiplication yields a 2.30 format, and this
                   result is added without saturation to a 64-bit accumulator in 34.30 format.
                   With 33 guard bits in the accumulator, there is no risk of overflow, and the
                   full precision of the intermediate multiplication is preserved.
                   The plain sum of the samples is kept in a 32-bit accumulator.
                   Finally, the difference of the two terms is shifted right by 15 bits
                   and stored in the 1.15 result. The code applies no explicit saturation
                   to that final value; it is truncated to 16 bits on assignment.
 */
void arm_var_q15(
  const q15_t * pSrc,
        uint32_t blockSize,
        q15_t * pResult)
{
        uint32_t blkCnt;                               /* Loop counter */
        q31_t sum = 0;                                 /* Sum of the samples */
        q31_t meanOfSquares, squareOfMean;             /* The two terms of the variance formula */
        q63_t sumOfSquares = 0;                        /* Sum of the squared samples */
        q63_t divisor;                                 /* N * (N - 1), held in 64 bits */
        q15_t in;                                      /* Single input sample */

#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
        q31_t in32;                                    /* Two packed input samples */
#endif

  /* The (N-1) estimator is undefined for fewer than two samples: report 0. */
  if (blockSize <= 1U)
  {
    *pResult = 0;
    return;
  }

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: process 4 samples per iteration */
  blkCnt = blockSize >> 2U;

  while (blkCnt > 0U)
  {
    /* sumOfSquares = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
    /* sum          = A[0] + A[1] + ... + A[blockSize-1] */

#if defined (ARM_MATH_DSP)
    /* Read two packed samples; the dual MAC squares and accumulates both halfwords. */
    in32 = read_q15x2_ia ((q15_t **) &pSrc);
    sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
    sum += ((in32 << 16U) >> 16U);   /* sign-extended lower halfword */
    sum += (in32 >> 16U);            /* sign-extended upper halfword */

    in32 = read_q15x2_ia ((q15_t **) &pSrc);
    sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
    sum += ((in32 << 16U) >> 16U);
    sum += (in32 >> 16U);
#else
    in = *pSrc++;
    sumOfSquares += (in * in);
    sum += in;

    in = *pSrc++;
    sumOfSquares += (in * in);
    sum += in;

    in = *pSrc++;
    sumOfSquares += (in * in);
    sum += in;

    in = *pSrc++;
    sumOfSquares += (in * in);
    sum += in;
#endif /* #if defined (ARM_MATH_DSP) */

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: process the remaining 0 to 3 samples */
  blkCnt = blockSize % 0x4U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    in = *pSrc++;

    /* A plain multiply is used here on every target. Passing a single q15_t to
     * __SMLALD would sign-extend it to 32 bits, leaving 0xFFFF in the upper
     * halfword of a negative sample; the dual MAC would then add an extra
     * (-1) * (-1) = 1 to the sum of squares for each negative sample. */
    sumOfSquares += (q63_t) in * in;

    sum += in;

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Mean of squares term: sumOfSquares / (N - 1) */
  meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1U));

  /* Widen before multiplying: N * (N - 1) no longer fits in 32 bits once N exceeds 65536. */
  divisor = (q63_t) blockSize * (q63_t)(blockSize - 1U);

  /* Square of mean term: sum * sum / (N * (N - 1)) */
  squareOfMean = (q31_t) ((q63_t) sum * sum / divisor);

  /* Mean of squares minus the square of mean, converted from 2.30 to 1.15. */
  *pResult = (meanOfSquares - squareOfMean) >> 15U;
}
/**
* @} end of variance group
@} end of variance group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_var_q31.c
* Description: Variance of an array of Q31 type
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,141 +29,119 @@
#include "arm_math.h"
/**
* @ingroup groupStats
@ingroup groupStats
*/
/**
* @addtogroup variance
* @{
@addtogroup variance
@{
*/
/**
  @brief         Variance of the elements of a Q31 vector.
  @param[in]     pSrc       points to the input vector
  @param[in]     blockSize  number of samples in input vector
  @param[out]    pResult    variance value returned here
  @return        none

  @par           Details
                   The unbiased (N-1) estimator is evaluated as
                   sumOfSquares / (N-1)  -  sum * sum / (N * (N-1)).
                   For blockSize <= 1 the result is set to 0.

  @par           Scaling and Overflow Behavior
                   The function is implemented using an internal 64-bit accumulator.
                   The input is represented in 1.31 format, which is then downshifted by 8 bits
                   which yields 1.23, and intermediate multiplication yields a 2.46 format.
                   The accumulator maintains full precision of the intermediate multiplication results,
                   but provides only a 16 guard bits.
                   There is no saturation on intermediate additions.
                   If the accumulator overflows it wraps around and distorts the result.
                   In order to avoid overflows completely the input signal must be scaled down by
                   log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
                   The square of the sum (sum * sum) is likewise formed in 64 bits without saturation.
                   After division, internal variables should be Q18.46
                   Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
                   The final value is stored without explicit saturation; it is truncated to
                   32 bits on assignment.
 */
void arm_var_q31(
  const q31_t * pSrc,
        uint32_t blockSize,
        q31_t * pResult)
{
        uint32_t blkCnt;                               /* Loop counter */
        q63_t sum = 0;                                 /* Sum of the downshifted samples */
        q63_t meanOfSquares, squareOfMean;             /* The two terms of the variance formula */
        q63_t sumOfSquares = 0;                        /* Sum of the squared downshifted samples */
        q63_t divisor;                                 /* N * (N - 1), held in 64 bits */
        q31_t in;                                      /* Input sample after the 8-bit downshift */

  /* The (N-1) estimator is undefined for fewer than two samples: report 0. */
  if (blockSize <= 1U)
  {
    *pResult = 0;
    return;
  }

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: process 4 samples per iteration */
  blkCnt = blockSize >> 2U;

  while (blkCnt > 0U)
  {
    /* sumOfSquares = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
    /* sum          = A[0] + A[1] + ... + A[blockSize-1] */

    /* Each sample is downshifted by 8 bits (1.31 -> 1.23) before use. */
    in = *pSrc++ >> 8U;
    sumOfSquares += ((q63_t) (in) * (in));
    sum += in;

    in = *pSrc++ >> 8U;
    sumOfSquares += ((q63_t) (in) * (in));
    sum += in;

    in = *pSrc++ >> 8U;
    sumOfSquares += ((q63_t) (in) * (in));
    sum += in;

    in = *pSrc++ >> 8U;
    sumOfSquares += ((q63_t) (in) * (in));
    sum += in;

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: process the remaining 0 to 3 samples */
  blkCnt = blockSize % 0x4U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    in = *pSrc++ >> 8U;

    /* Accumulate the square and the plain value of the downshifted sample. */
    sumOfSquares += ((q63_t) (in) * (in));
    sum += in;

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Mean of squares term: sumOfSquares / (N - 1) */
  meanOfSquares = (sumOfSquares / (q63_t)(blockSize - 1U));

  /* Widen before multiplying: N * (N - 1) no longer fits in 32 bits once N exceeds 65536. */
  divisor = (q63_t) blockSize * (q63_t)(blockSize - 1U);

  /* Square of mean term: sum * sum / (N * (N - 1)) */
  squareOfMean = (sum * sum / divisor);

  /* Compute variance and store result in destination */
  *pResult = (meanOfSquares - squareOfMean) >> 15U;
}
/**
* @} end of variance group
@} end of variance group
*/