Official ARM version: v5.6.0

This commit is contained in:
rihab kouki 2020-07-28 11:24:49 +01:00
parent 9f95ff5b6b
commit 96d6da4e25
2939 changed files with 339304 additions and 113320 deletions

View file

@ -0,0 +1,16 @@
cmake_minimum_required (VERSION 3.6)
project(CMSISDSPMatrix)
file(GLOB SRC "./*_*.c")
add_library(CMSISDSPMatrix STATIC ${SRC})
configdsp(CMSISDSPMatrix ..)
### Includes
target_include_directories(CMSISDSPMatrix PUBLIC "${DSP}/../../Include")

View file

@ -0,0 +1,53 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: MatrixFunctions.c
* Description: Combination of all matrix function source files.
*
* $Date: 18. March 2019
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_mat_add_f32.c"
#include "arm_mat_add_q15.c"
#include "arm_mat_add_q31.c"
#include "arm_mat_cmplx_mult_f32.c"
#include "arm_mat_cmplx_mult_q15.c"
#include "arm_mat_cmplx_mult_q31.c"
#include "arm_mat_init_f32.c"
#include "arm_mat_init_q15.c"
#include "arm_mat_init_q31.c"
#include "arm_mat_inverse_f32.c"
#include "arm_mat_inverse_f64.c"
#include "arm_mat_mult_f32.c"
#include "arm_mat_mult_fast_q15.c"
#include "arm_mat_mult_fast_q31.c"
#include "arm_mat_mult_q15.c"
#include "arm_mat_mult_q31.c"
#include "arm_mat_scale_f32.c"
#include "arm_mat_scale_q15.c"
#include "arm_mat_scale_q31.c"
#include "arm_mat_sub_f32.c"
#include "arm_mat_sub_q15.c"
#include "arm_mat_sub_q31.c"
#include "arm_mat_trans_f32.c"
#include "arm_mat_trans_q15.c"
#include "arm_mat_trans_q31.c"

View file

@ -3,13 +3,13 @@
* Title: arm_mat_add_f32.c
* Description: Floating-point matrix addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,35 +29,43 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @defgroup MatrixAdd Matrix Addition
*
* Adds two matrices.
* \image html MatrixAddition.gif "Addition of two 3 x 3 matrices"
*
* The functions check to make sure that
* <code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
* number of rows and columns.
@defgroup MatrixAdd Matrix Addition
Adds two matrices.
\image html MatrixAddition.gif "Addition of two 3 x 3 matrices"
The functions check to make sure that
<code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
number of rows and columns.
*/
/**
* @addtogroup MatrixAdd
* @{
@addtogroup MatrixAdd
@{
*/
/**
* @brief Floating-point matrix addition.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@brief Floating-point matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_NEON)
/*
Neon version is assuming the matrix is small enough.
So no blocking is used for taking into account cache effects.
For big matrix, there exist better libraries for Neon.
*/
arm_status arm_mat_add_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
@ -67,12 +75,8 @@ arm_status arm_mat_add_f32(
float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
#if defined (ARM_MATH_DSP)
float32_t inA1, inA2, inB1, inB2, out1, out2; /* temporary variables */
#endif // #if defined (ARM_MATH_DSP)
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
@ -89,69 +93,27 @@ arm_status arm_mat_add_f32(
else
#endif
{
float32x4_t vec1;
float32x4_t vec2;
float32x4_t res;
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_DSP)
/* Loop unrolling */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
/* Read values from source A */
inA1 = pIn1[0];
vec1 = vld1q_f32(pIn1);
vec2 = vld1q_f32(pIn2);
res = vaddq_f32(vec1, vec2);
vst1q_f32(pOut, res);
/* Read values from source B */
inB1 = pIn2[0];
/* Read values from source A */
inA2 = pIn1[1];
/* out = sourceA + sourceB */
out1 = inA1 + inB1;
/* Read values from source B */
inB2 = pIn2[1];
/* Read values from source A */
inA1 = pIn1[2];
/* out = sourceA + sourceB */
out2 = inA2 + inB2;
/* Read values from source B */
inB1 = pIn2[2];
/* Store result in destination */
pOut[0] = out1;
pOut[1] = out2;
/* Read values from source A */
inA2 = pIn1[3];
/* Read values from source B */
inB2 = pIn2[3];
/* out = sourceA + sourceB */
out1 = inA1 + inB1;
/* out = sourceA + sourceB */
out2 = inA2 + inB2;
/* Store result in destination */
pOut[2] = out1;
/* Store result in destination */
pOut[3] = out2;
/* update pointers to process next sampels */
/* update pointers to process next samples */
pIn1 += 4U;
pIn2 += 4U;
pOut += 4U;
@ -163,15 +125,6 @@ arm_status arm_mat_add_f32(
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
@ -184,13 +137,96 @@ arm_status arm_mat_add_f32(
/* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_add_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and store result in destination buffer. */
*pOut++ = *pInA++ + *pInB++;
*pOut++ = *pInA++ + *pInB++;
*pOut++ = *pInA++ + *pInB++;
*pOut++ = *pInA++ + *pInB++;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and store result in destination buffer. */
*pOut++ = *pInA++ + *pInB++;
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
* @} end of MatrixAdd group
@} end of MatrixAdd group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_add_q15.c
* Description: Q15 matrix addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,116 +29,114 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixAdd
* @{
@addtogroup MatrixAdd
@{
*/
/**
* @brief Q15 matrix addition.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
@brief Q15 matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
arm_status arm_mat_add_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
arm_matrix_instance_q15 * pDst)
{
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in the input matrix */
numSamples = (uint16_t) (pSrcA->numRows * pSrcA->numCols);
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
write_q15x2_ia (&pOut, __QADD16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
/* Run the below code for Cortex-M4 and Cortex-M3 */
write_q15x2_ia (&pOut, __QADD16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
#else
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
/* Loop unrolling */
blkCnt = (uint32_t) numSamples >> 2U;
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, Saturate and then store the results in the destination buffer. */
*__SIMD32(pOut)++ = __QADD16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
*__SIMD32(pOut)++ = __QADD16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
/* Decrement the loop counter */
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = (uint32_t) numSamples % 0x4U;
/* q15 pointers of input and output are initialized */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, Saturate and then store the results in the destination buffer. */
*pOut++ = (q15_t) __QADD16(*pInA++, *pInB++);
/* Decrement the loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = (uint32_t) numSamples;
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
/* q15 pointers of input and output are initialized */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, Saturate and then store the results in the destination buffer. */
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ + *pInB++), 16);
/* Decrement the loop counter */
/* Add, saturate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pOut++ = (q15_t) __QADD16(*pInA++, *pInB++);
#else
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* set status as ARM_MATH_SUCCESS */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@ -147,5 +145,5 @@ arm_status arm_mat_add_q15(
}
/**
* @} end of MatrixAdd group
@} end of MatrixAdd group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_add_q31.c
* Description: Q31 matrix addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,160 +29,104 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixAdd
* @{
@addtogroup MatrixAdd
@{
*/
/**
* @brief Q31 matrix addition.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
@brief Q31 matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
arm_status arm_mat_add_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t inA1, inB1; /* temporary variables */
#if defined (ARM_MATH_DSP)
q31_t inA2, inB2; /* temporary variables */
q31_t out1, out2; /* temporary variables */
#endif // #if defined (ARM_MATH_DSP)
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in the input matrix */
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and then store the results in the destination buffer. */
/* Read values from source A */
inA1 = pIn1[0];
/* Read values from source B */
inB1 = pIn2[0];
/* Add, saturate and store result in destination buffer. */
*pOut++ = __QADD(*pInA++, *pInB++);
/* Read values from source A */
inA2 = pIn1[1];
*pOut++ = __QADD(*pInA++, *pInB++);
/* Add and saturate */
out1 = __QADD(inA1, inB1);
*pOut++ = __QADD(*pInA++, *pInB++);
/* Read values from source B */
inB2 = pIn2[1];
*pOut++ = __QADD(*pInA++, *pInB++);
/* Read values from source A */
inA1 = pIn1[2];
/* Add and saturate */
out2 = __QADD(inA2, inB2);
/* Read values from source B */
inB1 = pIn2[2];
/* Store result in destination */
pOut[0] = out1;
pOut[1] = out2;
/* Read values from source A */
inA2 = pIn1[3];
/* Read values from source B */
inB2 = pIn2[3];
/* Add and saturate */
out1 = __QADD(inA1, inB1);
out2 = __QADD(inA2, inB2);
/* Store result in destination */
pOut[2] = out1;
pOut[3] = out2;
/* update pointers to process next sampels */
pIn1 += 4U;
pIn2 += 4U;
pOut += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and then store the results in the destination buffer. */
inA1 = *pIn1++;
inB1 = *pIn2++;
inA1 = __QADD(inA1, inB1);
/* Add, saturate and store result in destination buffer. */
*pOut++ = __QADD(*pInA++, *pInB++);
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
*pOut++ = inA1;
}
/* set status as ARM_MATH_SUCCESS */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@ -191,5 +135,5 @@ arm_status arm_mat_add_q31(
}
/**
* @} end of MatrixAdd group
@} end of MatrixAdd group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_cmplx_mult_f32.c
* Description: Floating-point matrix multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,36 +29,38 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @defgroup CmplxMatrixMult Complex Matrix Multiplication
*
* Complex Matrix multiplication is only defined if the number of columns of the
* first matrix equals the number of rows of the second matrix.
* Multiplying an <code>M x N</code> matrix with an <code>N x P</code> matrix results
* in an <code>M x P</code> matrix.
* When matrix size checking is enabled, the functions check: (1) that the inner dimensions of
* <code>pSrcA</code> and <code>pSrcB</code> are equal; and (2) that the size of the output
* matrix equals the outer dimensions of <code>pSrcA</code> and <code>pSrcB</code>.
@defgroup CmplxMatrixMult Complex Matrix Multiplication
Complex Matrix multiplication is only defined if the number of columns of the
first matrix equals the number of rows of the second matrix.
Multiplying an <code>M x N</code> matrix with an <code>N x P</code> matrix results
in an <code>M x P</code> matrix.
@par
When matrix size checking is enabled, the functions check:
- that the inner dimensions of <code>pSrcA</code> and <code>pSrcB</code> are equal;
- that the size of the output matrix equals the outer dimensions of <code>pSrcA</code> and <code>pSrcB</code>.
*/
/**
* @addtogroup CmplxMatrixMult
* @{
@addtogroup CmplxMatrixMult
@{
*/
/**
* @brief Floating-point Complex matrix multiplication.
* @param[in] *pSrcA points to the first input complex matrix structure
* @param[in] *pSrcB points to the second input complex matrix structure
* @param[out] *pDst points to output complex matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@brief Floating-point Complex matrix multiplication.
@param[in] pSrcA points to first input complex matrix structure
@param[in] pSrcB points to second input complex matrix structure
@param[out] pDst points to output complex matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_NEON)
arm_status arm_mat_cmplx_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
@ -74,14 +76,20 @@ arm_status arm_mat_cmplx_mult_f32(
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
float32_t sumReal1, sumImag1; /* accumulator */
float32_t a0, b0, c0, d0;
float32_t a1, b1, c1, d1;
float32_t a1, a1B,b1, b1B, c1, d1;
float32_t sumReal2, sumImag2; /* accumulator */
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32x4x2_t a0V, a1V;
float32x4_t accR0,accI0, accR1,accI1,tempR, tempI;
float32x2_t accum = vdup_n_f32(0);
float32_t *pIn1B = pSrcA->pData;
uint16_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
uint16_t col, i = 0U, j, rowCnt, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
float32_t sumReal1B, sumImag1B;
float32_t sumReal2B, sumImag2B;
float32_t *pxB;
#ifdef ARM_MATH_MATRIX_CHECK
@ -99,8 +107,172 @@ arm_status arm_mat_cmplx_mult_f32(
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
rowCnt = row >> 1;
/* Row loop */
while (rowCnt > 0U)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + 2 * i;
pxB = px + 2 * numColsB;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* Column loop */
while (col > 0U)
{
/* Set the variable sum, that acts as accumulator, to zero */
sumReal1 = 0.0f;
sumImag1 = 0.0f;
sumReal1B = 0.0f;
sumImag1B = 0.0f;
sumReal2 = 0.0f;
sumImag2 = 0.0f;
sumReal2B = 0.0f;
sumImag2B = 0.0f;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pIn1 = pInA;
pIn1B = pIn1 + 2*numColsA;
accR0 = vdupq_n_f32(0.0);
accI0 = vdupq_n_f32(0.0);
accR1 = vdupq_n_f32(0.0);
accI1 = vdupq_n_f32(0.0);
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2;
/* Matrix multiplication */
while (colCnt > 0U)
{
/* Reading real part of complex matrix A */
a0V = vld2q_f32(pIn1); // load & separate real/imag pSrcA (de-interleave 2)
a1V = vld2q_f32(pIn1B); // load & separate real/imag pSrcA (de-interleave 2)
pIn1 += 8;
pIn1B += 8;
tempR[0] = *pIn2;
tempI[0] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
tempR[1] = *pIn2;
tempI[1] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
tempR[2] = *pIn2;
tempI[2] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
tempR[3] = *pIn2;
tempI[3] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
accR0 = vmlaq_f32(accR0,a0V.val[0],tempR);
accR0 = vmlsq_f32(accR0,a0V.val[1],tempI);
accI0 = vmlaq_f32(accI0,a0V.val[1],tempR);
accI0 = vmlaq_f32(accI0,a0V.val[0],tempI);
accR1 = vmlaq_f32(accR1,a1V.val[0],tempR);
accR1 = vmlsq_f32(accR1,a1V.val[1],tempI);
accI1 = vmlaq_f32(accI1,a1V.val[1],tempR);
accI1 = vmlaq_f32(accI1,a1V.val[0],tempI);
/* Decrement the loop count */
colCnt--;
}
accum = vpadd_f32(vget_low_f32(accR0), vget_high_f32(accR0));
sumReal1 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(accI0), vget_high_f32(accI0));
sumImag1 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(accR1), vget_high_f32(accR1));
sumReal1B += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(accI1), vget_high_f32(accI1));
sumImag1B += accum[0] + accum[1];
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA & 3;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
a1 = *pIn1;
a1B = *pIn1B;
c1 = *pIn2;
b1 = *(pIn1 + 1U);
b1B = *(pIn1B + 1U);
d1 = *(pIn2 + 1U);
sumReal1 += a1 * c1;
sumImag1 += b1 * c1;
sumReal1B += a1B * c1;
sumImag1B += b1B * c1;
pIn1 += 2U;
pIn1B += 2U;
pIn2 += 2 * numColsB;
sumReal2 -= b1 * d1;
sumImag2 += a1 * d1;
sumReal2B -= b1B * d1;
sumImag2B += a1B * d1;
/* Decrement the loop counter */
colCnt--;
}
sumReal1 += sumReal2;
sumImag1 += sumImag2;
sumReal1B += sumReal2B;
sumImag1B += sumImag2B;
/* Store the result in the destination buffer */
*px++ = sumReal1;
*px++ = sumImag1;
*pxB++ = sumReal1B;
*pxB++ = sumImag1B;
/* Update the pointer pIn2 to point to the starting address of the next column */
j++;
pIn2 = pSrcB->pData + 2U * j;
/* Decrement the column loop counter */
col--;
}
/* Update the pointer pInA to point to the starting address of the next 2 row */
i = i + 2*numColsB;
pInA = pInA + 4 * numColsA;
/* Decrement the row loop counter */
rowCnt--;
}
rowCnt = row & 1;
while (rowCnt > 0U)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + 2 * i;
@ -114,8 +286,8 @@ arm_status arm_mat_cmplx_mult_f32(
j = 0U;
/* column loop */
do
/* Column loop */
while (col > 0U)
{
/* Set the variable sum, that acts as accumulator, to zero */
sumReal1 = 0.0f;
@ -127,94 +299,58 @@ arm_status arm_mat_cmplx_mult_f32(
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pIn1 = pInA;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
accR0 = vdupq_n_f32(0.0);
accI0 = vdupq_n_f32(0.0);
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2;
/* matrix multiplication */
/* Matrix multiplication */
while (colCnt > 0U)
{
/* Reading real part of complex matrix A */
a0 = *pIn1;
a0V = vld2q_f32(pIn1); // load & separate real/imag pSrcA (de-interleave 2)
pIn1 += 8;
/* Reading real part of complex matrix B */
c0 = *pIn2;
/* Reading imaginary part of complex matrix A */
b0 = *(pIn1 + 1U);
/* Reading imaginary part of complex matrix B */
d0 = *(pIn2 + 1U);
sumReal1 += a0 * c0;
sumImag1 += b0 * c0;
pIn1 += 2U;
tempR[0] = *pIn2;
tempI[0] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
sumReal2 -= b0 * d0;
sumImag2 += a0 * d0;
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
a1 = *pIn1;
c1 = *pIn2;
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
sumReal1 += a1 * c1;
sumImag1 += b1 * c1;
pIn1 += 2U;
tempR[1] = *pIn2;
tempI[1] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
sumReal2 -= b1 * d1;
sumImag2 += a1 * d1;
a0 = *pIn1;
c0 = *pIn2;
b0 = *(pIn1 + 1U);
d0 = *(pIn2 + 1U);
sumReal1 += a0 * c0;
sumImag1 += b0 * c0;
pIn1 += 2U;
tempR[2] = *pIn2;
tempI[2] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
sumReal2 -= b0 * d0;
sumImag2 += a0 * d0;
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
a1 = *pIn1;
c1 = *pIn2;
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
sumReal1 += a1 * c1;
sumImag1 += b1 * c1;
pIn1 += 2U;
tempR[3] = *pIn2;
tempI[3] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
sumReal2 -= b1 * d1;
sumImag2 += a1 * d1;
accR0 = vmlaq_f32(accR0,a0V.val[0],tempR);
accR0 = vmlsq_f32(accR0,a0V.val[1],tempI);
accI0 = vmlaq_f32(accI0,a0V.val[1],tempR);
accI0 = vmlaq_f32(accI0,a0V.val[0],tempI);
/* Decrement the loop count */
colCnt--;
}
accum = vpadd_f32(vget_low_f32(accR0), vget_high_f32(accR0));
sumReal1 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(accI0), vget_high_f32(accI0));
sumImag1 += accum[0] + accum[1];
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA % 0x4U;
colCnt = numColsA & 3;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
a1 = *pIn1;
c1 = *pIn2;
@ -248,13 +384,234 @@ arm_status arm_mat_cmplx_mult_f32(
/* Decrement the column loop counter */
col--;
} while (col > 0U);
}
/* Update the pointer pInA to point to the starting address of the next row */
i = i + numColsB;
pInA = pInA + 2 * numColsA;
/* Decrement the row loop counter */
rowCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_cmplx_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
float32_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
float32_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
float32_t *pOut = pDst->pData; /* Output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
float32_t sumReal, sumImag; /* Accumulator */
float32_t a1, b1, c1, d1;
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#if defined (ARM_MATH_LOOPUNROLL)
float32_t a0, b0, c0, d0;
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + 2 * i;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sumReal = 0.0f;
sumImag = 0.0f;
/* Initiate pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
#if defined (ARM_MATH_LOOPUNROLL)
/* Apply loop unrolling and compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* Reading real part of complex matrix A */
a0 = *pIn1;
/* Reading real part of complex matrix B */
c0 = *pIn2;
/* Reading imaginary part of complex matrix A */
b0 = *(pIn1 + 1U);
/* Reading imaginary part of complex matrix B */
d0 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a0 * c0;
sumImag += b0 * c0;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b0 * d0;
sumImag += a0 * d0;
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* read real and imag values from pSrcA and pSrcB buffer */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a1 * c1;
sumImag += b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b1 * d1;
sumImag += a1 * d1;
a0 = *(pIn1 );
c0 = *(pIn2 );
b0 = *(pIn1 + 1U);
d0 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a0 * c0;
sumImag += b0 * c0;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b0 * d0;
sumImag += a0 * d0;
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a1 * c1;
sumImag += b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b1 * d1;
sumImag += a1 * d1;
/* Decrement loop count */
colCnt--;
}
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA % 0x4U;
#else
/* Initialize blkCnt with number of samples */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a1 * c1;
sumImag += b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b1 * d1;
sumImag += a1 * d1;
/* Decrement loop counter */
colCnt--;
}
/* Store result in destination buffer */
*px++ = sumReal;
*px++ = sumImag;
/* Update pointer pIn2 to point to starting address of next column */
j++;
pIn2 = pSrcB->pData + 2U * j;
/* Decrement column loop counter */
col--;
} while (col > 0U);
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + 2 * numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
@ -267,6 +624,8 @@ arm_status arm_mat_cmplx_mult_f32(
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
* @} end of MatrixMult group
@} end of MatrixMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_cmplx_mat_mult_q15.c
* Description: Q15 complex matrix multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,141 +29,115 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup CmplxMatrixMult
* @{
@addtogroup CmplxMatrixMult
@{
*/
/**
* @brief Q15 Complex matrix multiplication
* @param[in] *pSrcA points to the first input complex matrix structure
* @param[in] *pSrcB points to the second input complex matrix structure
* @param[out] *pDst points to output complex matrix structure
* @param[in] *pScratch points to the array for storing intermediate results
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* \par Conditions for optimum performance
* Input, output and state buffers should be aligned by 32-bit
*
* \par Restrictions
* If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
* In this case input, output, scratch buffers should be aligned by 32-bit
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using a 64-bit internal accumulator. The inputs to the
* multiplications are in 1.15 format and multiplications yield a 2.30 result.
* The 2.30 intermediate
* results are accumulated in a 64-bit accumulator in 34.30 format. This approach
* provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
* truncated to 34.15 format by discarding the low 15 bits and then saturated to
* 1.15 format.
*
* \par
* Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function.
*
@brief Q15 Complex matrix multiplication.
@param[in] pSrcA points to first input complex matrix structure
@param[in] pSrcB points to second input complex matrix structure
@param[out] pDst points to output complex matrix structure
@param[in] pScratch points to an array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Conditions for optimum performance
Input, output and state buffers should be aligned by 32-bit
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator. The inputs to the
multiplications are in 1.15 format and multiplications yield a 2.30 result.
The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
This approach provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
*/
arm_status arm_mat_cmplx_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pScratch)
arm_matrix_instance_q15 * pDst,
q15_t * pScratch)
{
/* accumulator */
q15_t *pSrcBT = pScratch; /* input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
uint16_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
q63_t sumReal, sumImag;
q15_t *pSrcBT = pScratch; /* input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
q63_t sumReal, sumImag; /* accumulator */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#ifdef UNALIGNED_SUPPORT_DISABLE
q15_t in; /* Temporary variable to hold the input value */
q15_t a, b, c, d;
#if defined (ARM_MATH_DSP)
q31_t prod1, prod2;
q31_t pSourceA, pSourceB;
#else
q31_t in; /* Temporary variable to hold the input value */
q31_t prod1, prod2;
q31_t pSourceA, pSourceB;
#endif
q15_t a, b, c, d;
#endif /* #if defined (ARM_MATH_DSP) */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose */
do
{
/* The pointer px is set to starting address of column being processed */
px = pSrcBT + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Apply loop unrolling and exchange the columns with row elements */
col = numColsB >> 2;
/* The pointer px is set to starting address of the column being processed */
px = pSrcBT + i;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
#ifdef UNALIGNED_SUPPORT_DISABLE
/* Read two elements from the row */
in = *pInB++;
*px = in;
in = *pInB++;
px[1] = in;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Read two elements from the row */
in = *pInB++;
*px = in;
in = *pInB++;
px[1] = in;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Read two elements from the row */
in = *pInB++;
*px = in;
in = *pInB++;
px[1] = in;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Read two elements from the row */
in = *pInB++;
*px = in;
in = *pInB++;
px[1] = in;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
}
@ -171,79 +145,33 @@ arm_status arm_mat_cmplx_mult_q15(
** No loop unrolling is used. */
col = numColsB % 0x4U;
while (col > 0U)
{
/* Read two elements from the row */
in = *pInB++;
*px = in;
in = *pInB++;
px[1] = in;
#else
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
/* Initialize blkCnt with number of samples */
col = numColsB;
*__SIMD32(px) = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB * 2;
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
*__SIMD32(px) = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB * 2;
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
*__SIMD32(px) = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB * 2;
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
*__SIMD32(px) = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB * 2;
/* Decrement the column loop counter */
col--;
}
/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
col = numColsB % 0x4U;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
*__SIMD32(px) = in;
#endif
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
}
i = i + 2U;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Reset the variables for the usage in the following multiplication process */
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
@ -252,33 +180,61 @@ arm_status arm_mat_cmplx_mult_q15(
/* row loop */
do
{
/* For every row wise process, the column loop counter is to be initiated */
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the transposed pSrcB data */
/* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
/* Set variable sum, that acts as accumulator, to zero */
sumReal = 0;
sumImag = 0;
/* Apply loop unrolling and compute 2 MACs simultaneously. */
colCnt = numColsA >> 1;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
/* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i * 2;
/* Apply loop unrolling and compute 2 MACs simultaneously. */
colCnt = numColsA >> 1U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
#ifdef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = read_q15x2_ia ((q15_t **) &pInA);
pSourceB = read_q15x2_ia ((q15_t **) &pInB);
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = read_q15x2_ia ((q15_t **) &pInA);
pSourceB = read_q15x2_ia ((q15_t **) &pInB);
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
#else /* #if defined (ARM_MATH_DSP) */
/* read real and imag values from pSrcA buffer */
a = *pInA;
@ -304,53 +260,40 @@ arm_status arm_mat_cmplx_mult_q15(
pInA += 4U;
/* Multiply and Accumlates */
sumReal += (q31_t) a *c;
sumImag += (q31_t) a *d;
sumReal -= (q31_t) b *d;
sumImag += (q31_t) b *c;
sumReal += (q31_t) a * c;
sumImag += (q31_t) a * d;
sumReal -= (q31_t) b * d;
sumImag += (q31_t) b * c;
/* update pointer */
pInB += 4U;
#else
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = *__SIMD32(pInA)++;
pSourceB = *__SIMD32(pInB)++;
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
#endif /* #if defined (ARM_MATH_DSP) */
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = *__SIMD32(pInA)++;
pSourceB = *__SIMD32(pInB)++;
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
/* process odd column samples */
if ((numColsA & 0x1U) > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
#ifdef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = read_q15x2_ia ((q15_t **) &pInA);
pSourceB = read_q15x2_ia ((q15_t **) &pInB);
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
#else /* #if defined (ARM_MATH_DSP) */
/* read real and imag values from pSrcA and pSrcB buffer */
a = *pInA++;
@ -359,48 +302,32 @@ arm_status arm_mat_cmplx_mult_q15(
d = *pInB++;
/* Multiply and Accumlates */
sumReal += (q31_t) a *c;
sumImag += (q31_t) a *d;
sumReal -= (q31_t) b *d;
sumImag += (q31_t) b *c;
sumReal += (q31_t) a * c;
sumImag += (q31_t) a * d;
sumReal -= (q31_t) b * d;
sumImag += (q31_t) b * c;
#else
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = *__SIMD32(pInA)++;
pSourceB = *__SIMD32(pInB)++;
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
#endif /* #if defined (ARM_MATH_DSP) */
}
/* Saturate and store the result in the destination buffer */
/* Saturate and store result in destination buffer */
*px++ = (q15_t) (__SSAT(sumReal >> 15, 16));
*px++ = (q15_t) (__SSAT(sumImag >> 15, 16));
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
} while (col > 0U);
i = i + numColsA;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* set status as ARM_MATH_SUCCESS */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@ -409,5 +336,5 @@ arm_status arm_mat_cmplx_mult_q15(
}
/**
* @} end of MatrixMult group
@} end of MatrixMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_cmplx_mult_q31.c
* Description: Floating-point matrix multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,74 +29,69 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup CmplxMatrixMult
* @{
@addtogroup CmplxMatrixMult
@{
*/
/**
* @brief Q31 Complex matrix multiplication
* @param[in] *pSrcA points to the first input complex matrix structure
* @param[in] *pSrcB points to the second input complex matrix structure
* @param[out] *pDst points to output complex matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using an internal 64-bit accumulator.
* The accumulator has a 2.62 format and maintains full precision of the intermediate
* multiplication results but provides only a single guard bit. There is no saturation
* on intermediate additions. Thus, if the accumulator overflows it wraps around and
* distorts the result. The input signals should be scaled down to avoid intermediate
* overflows. The input is thus scaled down by log2(numColsA) bits
* to avoid overflows, as a total of numColsA additions are performed internally.
* The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
*
*
@brief Q31 Complex matrix multiplication.
@param[in] pSrcA points to first input complex matrix structure
@param[in] pSrcB points to second input complex matrix structure
@param[out] pDst points to output complex matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The accumulator has a 2.62 format and maintains full precision of the intermediate
multiplication results but provides only a single guard bit. There is no saturation
on intermediate additions. Thus, if the accumulator overflows it wraps around and
distorts the result. The input signals should be scaled down to avoid intermediate
overflows. The input is thus scaled down by log2(numColsA) bits
to avoid overflows, as a total of numColsA additions are performed internally.
The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
*/
arm_status arm_mat_cmplx_mult_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pOut = pDst->pData; /* Output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
q63_t sumReal1, sumImag1; /* accumulator */
q31_t a0, b0, c0, d0;
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
q63_t sumReal, sumImag; /* Accumulator */
q31_t a1, b1, c1, d1;
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint16_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#if defined (ARM_MATH_LOOPUNROLL)
q31_t a0, b0, c0, d0;
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
@ -119,16 +114,18 @@ arm_status arm_mat_cmplx_mult_q31(
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sumReal1 = 0.0;
sumImag1 = 0.0;
sumReal = 0.0;
sumImag = 0.0;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
/* Initiate pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
colCnt = numColsA >> 2;
#if defined (ARM_MATH_LOOPUNROLL)
/* matrix multiplication */
/* Apply loop unrolling and compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
@ -145,76 +142,74 @@ arm_status arm_mat_cmplx_mult_q31(
d0 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal1 += (q63_t) a0 *c0;
sumImag1 += (q63_t) b0 *c0;
sumReal += (q63_t) a0 * c0;
sumImag += (q63_t) b0 * c0;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal1 -= (q63_t) b0 *d0;
sumImag1 += (q63_t) a0 *d0;
sumReal -= (q63_t) b0 * d0;
sumImag += (q63_t) a0 * d0;
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* read real and imag values from pSrcA and pSrcB buffer */
a1 = *pIn1;
c1 = *pIn2;
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal1 += (q63_t) a1 *c1;
sumImag1 += (q63_t) b1 *c1;
sumReal += (q63_t) a1 * c1;
sumImag += (q63_t) b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal1 -= (q63_t) b1 *d1;
sumImag1 += (q63_t) a1 *d1;
a0 = *pIn1;
c0 = *pIn2;
sumReal -= (q63_t) b1 * d1;
sumImag += (q63_t) a1 * d1;
a0 = *(pIn1 );
c0 = *(pIn2 );
b0 = *(pIn1 + 1U);
d0 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal1 += (q63_t) a0 *c0;
sumImag1 += (q63_t) b0 *c0;
sumReal += (q63_t) a0 * c0;
sumImag += (q63_t) b0 * c0;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal1 -= (q63_t) b0 *d0;
sumImag1 += (q63_t) a0 *d0;
sumReal -= (q63_t) b0 * d0;
sumImag += (q63_t) a0 * d0;
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
a1 = *pIn1;
c1 = *pIn2;
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal1 += (q63_t) a1 *c1;
sumImag1 += (q63_t) b1 *c1;
sumReal += (q63_t) a1 * c1;
sumImag += (q63_t) b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal1 -= (q63_t) b1 *d1;
sumImag1 += (q63_t) a1 *d1;
sumReal -= (q63_t) b1 * d1;
sumImag += (q63_t) a1 * d1;
/* Decrement the loop count */
/* Decrement loop count */
colCnt--;
}
@ -222,49 +217,55 @@ arm_status arm_mat_cmplx_mult_q31(
** No loop unrolling is used. */
colCnt = numColsA % 0x4U;
#else
/* Initialize blkCnt with number of samples */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
a1 = *pIn1;
c1 = *pIn2;
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal1 += (q63_t) a1 *c1;
sumImag1 += (q63_t) b1 *c1;
sumReal += (q63_t) a1 * c1;
sumImag += (q63_t) b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal1 -= (q63_t) b1 *d1;
sumImag1 += (q63_t) a1 *d1;
sumReal -= (q63_t) b1 * d1;
sumImag += (q63_t) a1 * d1;
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
/* Store the result in the destination buffer */
*px++ = (q31_t) clip_q63_to_q31(sumReal1 >> 31);
*px++ = (q31_t) clip_q63_to_q31(sumImag1 >> 31);
/* Store result in destination buffer */
*px++ = (q31_t) clip_q63_to_q31(sumReal >> 31);
*px++ = (q31_t) clip_q63_to_q31(sumImag >> 31);
/* Update the pointer pIn2 to point to the starting address of the next column */
/* Update pointer pIn2 to point to starting address of next column */
j++;
pIn2 = pSrcB->pData + 2U * j;
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
} while (col > 0U);
/* Update the pointer pInA to point to the starting address of the next row */
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + 2 * numColsA;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
@ -278,5 +279,5 @@ arm_status arm_mat_cmplx_mult_q31(
}
/**
* @} end of MatrixMult group
@} end of MatrixMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_init_f32.c
* Description: Floating-point matrix initialization
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,31 +29,31 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @defgroup MatrixInit Matrix Initialization
*
* Initializes the underlying matrix data structure.
* The functions set the <code>numRows</code>,
* <code>numCols</code>, and <code>pData</code> fields
* of the matrix data structure.
@defgroup MatrixInit Matrix Initialization
Initializes the underlying matrix data structure.
The functions set the <code>numRows</code>,
<code>numCols</code>, and <code>pData</code> fields
of the matrix data structure.
*/
/**
* @addtogroup MatrixInit
* @{
@addtogroup MatrixInit
@{
*/
/**
* @brief Floating-point matrix initialization.
* @param[in,out] *S points to an instance of the floating-point matrix structure.
* @param[in] nRows number of rows in the matrix.
* @param[in] nColumns number of columns in the matrix.
* @param[in] *pData points to the matrix data array.
* @return none
*/
@brief Floating-point matrix initialization.
@param[in,out] S points to an instance of the floating-point matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_f32(
arm_matrix_instance_f32 * S,
@ -72,5 +72,5 @@ void arm_mat_init_f32(
}
/**
* @} end of MatrixInit group
@} end of MatrixInit group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_init_q15.c
* Description: Q15 matrix initialization
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,22 +29,22 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixInit
* @{
@addtogroup MatrixInit
@{
*/
/**
* @brief Q15 matrix initialization.
* @param[in,out] *S points to an instance of the floating-point matrix structure.
* @param[in] nRows number of rows in the matrix.
* @param[in] nColumns number of columns in the matrix.
* @param[in] *pData points to the matrix data array.
* @return none
*/
/**
@brief Q15 matrix initialization.
@param[in,out] S points to an instance of the floating-point matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_q15(
arm_matrix_instance_q15 * S,
@ -63,5 +63,5 @@ void arm_mat_init_q15(
}
/**
* @} end of MatrixInit group
@} end of MatrixInit group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_init_q31.c
* Description: Q31 matrix initialization
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,27 +29,27 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @defgroup MatrixInit Matrix Initialization
*
@defgroup MatrixInit Matrix Initialization
*/
/**
* @addtogroup MatrixInit
* @{
@addtogroup MatrixInit
@{
*/
/**
* @brief Q31 matrix initialization.
* @param[in,out] *S points to an instance of the floating-point matrix structure.
* @param[in] nRows number of rows in the matrix.
* @param[in] nColumns number of columns in the matrix.
* @param[in] *pData points to the matrix data array.
* @return none
*/
/**
@brief Q31 matrix initialization.
@param[in,out] S points to an instance of the Q31 matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_q31(
arm_matrix_instance_q31 * S,
@ -68,5 +68,5 @@ void arm_mat_init_q31(
}
/**
* @} end of MatrixInit group
@} end of MatrixInit group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_inverse_f32.c
* Description: Floating-point matrix inverse
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,47 +29,45 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @defgroup MatrixInv Matrix Inverse
*
* Computes the inverse of a matrix.
*
* The inverse is defined only if the input matrix is square and non-singular (the determinant
* is non-zero). The function checks that the input and output matrices are square and of the
* same size.
*
* Matrix inversion is numerically sensitive and the CMSIS DSP library only supports matrix
* inversion of floating-point matrices.
*
* \par Algorithm
* The Gauss-Jordan method is used to find the inverse.
* The algorithm performs a sequence of elementary row-operations until it
* reduces the input matrix to an identity matrix. Applying the same sequence
* of elementary row-operations to an identity matrix yields the inverse matrix.
* If the input matrix is singular, then the algorithm terminates and returns error status
* <code>ARM_MATH_SINGULAR</code>.
* \image html MatrixInverse.gif "Matrix Inverse of a 3 x 3 matrix using Gauss-Jordan Method"
@defgroup MatrixInv Matrix Inverse
Computes the inverse of a matrix.
The inverse is defined only if the input matrix is square and non-singular (the determinant is non-zero).
The function checks that the input and output matrices are square and of the same size.
Matrix inversion is numerically sensitive and the CMSIS DSP library only supports matrix
inversion of floating-point matrices.
@par Algorithm
The Gauss-Jordan method is used to find the inverse.
The algorithm performs a sequence of elementary row-operations until it
reduces the input matrix to an identity matrix. Applying the same sequence
of elementary row-operations to an identity matrix yields the inverse matrix.
If the input matrix is singular, then the algorithm terminates and returns error status
<code>ARM_MATH_SINGULAR</code>.
\image html MatrixInverse.gif "Matrix Inverse of a 3 x 3 matrix using Gauss-Jordan Method"
*/
/**
* @addtogroup MatrixInv
* @{
@addtogroup MatrixInv
@{
*/
/**
* @brief Floating-point matrix inverse.
* @param[in] *pSrc points to input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns
* <code>ARM_MATH_SIZE_MISMATCH</code> if the input matrix is not square or if the size
* of the output matrix does not match the size of the input matrix.
* If the input matrix is found to be singular (non-invertible), then the function returns
* <code>ARM_MATH_SINGULAR</code>. Otherwise, the function returns <code>ARM_MATH_SUCCESS</code>.
@brief Floating-point matrix inverse.
@param[in] pSrc points to input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible)
*/
#if defined(ARM_MATH_NEON)
arm_status arm_mat_inverse_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
@ -82,18 +80,17 @@ arm_status arm_mat_inverse_f32(
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
#if defined (ARM_MATH_DSP)
float32_t maxC; /* maximum value in the column */
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t Xchg, in = 0.0f, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
float32x4_t vec1;
float32x4_t vec2;
float32x4_t tmpV;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
|| (pSrc->numRows != pDst->numRows))
@ -104,43 +101,476 @@ arm_status arm_mat_inverse_f32(
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _
* | a11 a12 | 1 0 | | X11 X12 |
* | | | = | |
* |_ a21 a22 | 0 1 _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for column i is the greatest of the column.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is not the most significant of the columns, exchange that row with a row
* below it that does contain the most significant value in column i. If the most
* significant value of the column is zero, then an inverse to that matrix does not exist.
* The most significant value of the column is the absolute maximum.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
* Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
/* Loop over the number of rows */
rowCnt = numRows;
/* Making the destination matrix as identity matrix */
while (rowCnt > 0U)
{
/* Writing all zeroes in lower triangle of the destination matrix */
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0f;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0f;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0f;
j--;
}
/* Decrement the loop counter */
rowCnt--;
}
/* Loop over the number of columns of the input matrix.
All the elements in each column are processed by the row operations */
loopCnt = numCols;
/* Index modifier to navigate through the columns */
l = 0U;
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
* If it is zero then interchange the row with non zero row below.
* If there is no non zero element to replace in the rows below,
* then the matrix is Singular. */
/* Working pointer for the input matrix that points
* to the pivot element of the particular row */
pInT1 = pIn + (l * numCols);
/* Working pointer for the destination matrix that points
* to the pivot element of the particular row */
pOutT1 = pOut + (l * numCols);
/* Temporary variable to hold the pivot value */
in = *pInT1;
/* Grab the most significant value from column l */
maxC = 0;
for (i = l; i < numRows; i++)
{
maxC = *pInT1 > 0 ? (*pInT1 > maxC ? *pInT1 : maxC) : (-*pInT1 > maxC ? -*pInT1 : maxC);
pInT1 += numCols;
}
/* Update the status if the matrix is singular */
if (maxC == 0.0f)
{
return ARM_MATH_SINGULAR;
}
/* Restore pInT1 */
pInT1 = pIn;
/* Destination pointer modifier */
k = 1U;
/* Check if the pivot element is the most significant of the column */
if ( (in > 0.0f ? in : -in) != maxC)
{
/* Loop over the number rows present below */
i = numRows - (l + 1U);
while (i > 0U)
{
/* Update the input and destination pointers */
pInT2 = pInT1 + (numCols * l);
pOutT2 = pOutT1 + (numCols * k);
/* Look for the most significant element to
* replace in the rows below */
if ((*pInT2 > 0.0f ? *pInT2: -*pInT2) == maxC)
{
/* Loop over number of columns
* to the right of the pilot element */
j = numCols - l;
while (j > 0U)
{
/* Exchange the row elements of the input matrix */
Xchg = *pInT2;
*pInT2++ = *pInT1;
*pInT1++ = Xchg;
/* Decrement the loop counter */
j--;
}
/* Loop over number of columns of the destination matrix */
j = numCols;
while (j > 0U)
{
/* Exchange the row elements of the destination matrix */
Xchg = *pOutT2;
*pOutT2++ = *pOutT1;
*pOutT1++ = Xchg;
/* Decrement the loop counter */
j--;
}
/* Flag to indicate whether exchange is done or not */
flag = 1U;
/* Break after exchange is done */
break;
}
/* Update the destination pointer modifier */
k++;
/* Decrement the loop counter */
i--;
}
}
/* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0f))
{
return ARM_MATH_SINGULAR;
}
/* Points to the pivot row of input and destination matrices */
pPivotRowIn = pIn + (l * numCols);
pPivotRowDst = pOut + (l * numCols);
/* Temporary pointers to the pivot row pointers */
pInT1 = pPivotRowIn;
pInT2 = pPivotRowDst;
/* Pivot element of the row */
in = *pPivotRowIn;
tmpV = vdupq_n_f32(1.0/in);
/* Loop over number of columns
* to the right of the pilot element */
j = (numCols - l) >> 2;
while (j > 0U)
{
/* Divide each element of the row of the input matrix
* by the pivot element */
vec1 = vld1q_f32(pInT1);
vec1 = vmulq_f32(vec1, tmpV);
vst1q_f32(pInT1, vec1);
pInT1 += 4;
/* Decrement the loop counter */
j--;
}
/* Tail */
j = (numCols - l) & 3;
while (j > 0U)
{
/* Divide each element of the row of the input matrix
* by the pivot element */
in1 = *pInT1;
*pInT1++ = in1 / in;
/* Decrement the loop counter */
j--;
}
/* Loop over number of columns of the destination matrix */
j = numCols >> 2;
while (j > 0U)
{
/* Divide each element of the row of the destination matrix
* by the pivot element */
vec1 = vld1q_f32(pInT2);
vec1 = vmulq_f32(vec1, tmpV);
vst1q_f32(pInT2, vec1);
pInT2 += 4;
/* Decrement the loop counter */
j--;
}
/* Tail */
j = numCols & 3;
while (j > 0U)
{
/* Divide each element of the row of the destination matrix
* by the pivot element */
in1 = *pInT2;
*pInT2++ = in1 / in;
/* Decrement the loop counter */
j--;
}
/* Replace the rows with the sum of that row and a multiple of row i
* so that each new element in column i above row i is zero.*/
/* Temporary pointers for input and destination matrices */
pInT1 = pIn;
pInT2 = pOut;
/* index used to check for pivot element */
i = 0U;
/* Loop over number of rows */
/* to be replaced by the sum of that row and a multiple of row i */
k = numRows;
while (k > 0U)
{
/* Check for the pivot element */
if (i == l)
{
/* If the processing element is the pivot element,
only the columns to the right are to be processed */
pInT1 += numCols - l;
pInT2 += numCols;
}
else
{
/* Element of the reference row */
in = *pInT1;
tmpV = vdupq_n_f32(in);
/* Working pointers for input and destination pivot rows */
pPRT_in = pPivotRowIn;
pPRT_pDst = pPivotRowDst;
/* Loop over the number of columns to the right of the pivot element,
to replace the elements in the input matrix */
j = (numCols - l) >> 2;
while (j > 0U)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
vec1 = vld1q_f32(pInT1);
vec2 = vld1q_f32(pPRT_in);
vec1 = vmlsq_f32(vec1, tmpV, vec2);
vst1q_f32(pInT1, vec1);
pPRT_in += 4;
pInT1 += 4;
/* Decrement the loop counter */
j--;
}
/* Tail */
j = (numCols - l) & 3;
while (j > 0U)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
in1 = *pInT1;
*pInT1++ = in1 - (in * *pPRT_in++);
/* Decrement the loop counter */
j--;
}
/* Loop over the number of columns to
replace the elements in the destination matrix */
j = numCols >> 2;
while (j > 0U)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
vec1 = vld1q_f32(pInT2);
vec2 = vld1q_f32(pPRT_pDst);
vec1 = vmlsq_f32(vec1, tmpV, vec2);
vst1q_f32(pInT2, vec1);
pPRT_pDst += 4;
pInT2 += 4;
/* Decrement the loop counter */
j--;
}
/* Tail */
j = numCols & 3;
while (j > 0U)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
in1 = *pInT2;
*pInT2++ = in1 - (in * *pPRT_pDst++);
/* Decrement the loop counter */
j--;
}
}
/* Increment the temporary input pointer */
pInT1 = pInT1 + l;
/* Decrement the loop counter */
k--;
/* Increment the pivot index */
i++;
}
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
/* Increment the index modifier */
l++;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
if ((flag != 1U) && (in == 0.0f))
{
pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++)
{
if (pIn[i] != 0.0f)
break;
}
if (i == numRows * numCols)
status = ARM_MATH_SINGULAR;
}
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_inverse_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* input data matrix pointer */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
float32_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
#if defined (ARM_MATH_DSP)
float32_t maxC; /* maximum value in the column */
float32_t Xchg, in = 0.0f, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _
* | a11 a12 | 1 0 | | X11 X12 |
* | | | = | |
* |_ a21 a22 | 0 1 _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for column i is the greatest of the column.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is not the most significant of the columns, exchange that row with a row
* below it that does contain the most significant value in column i. If the most
* significant value of the column is zero, then an inverse to that matrix does not exist.
* The most significant value of the column is the absolute maximum.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
* Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
*----------------------------------------------------------------------------------------------------------------*/
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _
* | a11 a12 | 1 0 | | X11 X12 |
* | | | = | |
* |_ a21 a22 | 0 1 _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for column i is the greatest of the column.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is not the most significant of the columns, exchange that row with a row
* below it that does contain the most significant value in column i. If the most
* significant value of the column is zero, then an inverse to that matrix does not exist.
* The most significant value of the column is the absolute maximum.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
* Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
@ -170,7 +600,7 @@ arm_status arm_mat_inverse_f32(
j--;
}
/* Decrement the loop counter */
/* Decrement loop counter */
rowCnt--;
}
@ -260,7 +690,7 @@ arm_status arm_mat_inverse_f32(
*pOutT2++ = *pOutT1;
*pOutT1++ = Xchg;
/* Decrement the loop counter */
/* Decrement loop counter */
j--;
}
@ -274,7 +704,7 @@ arm_status arm_mat_inverse_f32(
/* Update the destination pointer modifier */
k++;
/* Decrement the loop counter */
/* Decrement loop counter */
i--;
}
}
@ -385,19 +815,19 @@ arm_status arm_mat_inverse_f32(
in1 = *pInT2;
*pInT2++ = in1 - (in * *pPRT_pDst++);
/* Decrement the loop counter */
/* Decrement loop counter */
j--;
}
}
/* Increment the temporary input pointer */
/* Increment temporary input pointer */
pInT1 = pInT1 + l;
/* Decrement the loop counter */
/* Decrement loop counter */
k--;
/* Increment the pivot index */
/* Increment pivot index */
i++;
}
@ -414,8 +844,6 @@ arm_status arm_mat_inverse_f32(
#else
/* Run the below code for Cortex-M0 */
float32_t Xchg, in = 0.0f; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
@ -423,50 +851,53 @@ arm_status arm_mat_inverse_f32(
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
|| (pSrc->numRows != pDst->numRows))
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _ _ _ _ _
* | | a11 a12 | | | 1 0 | | | X11 X12 |
* | | | | | | | = | |
* |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
* Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
*----------------------------------------------------------------------------------------------------------------*/
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _ _ _ _ _
* | | a11 a12 | | | 1 0 | | | X11 X12 |
* | | | | | | | = | |
* |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
* Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
@ -496,7 +927,7 @@ arm_status arm_mat_inverse_f32(
j--;
}
/* Decrement the loop counter */
/* Decrement loop counter */
rowCnt--;
}
@ -506,7 +937,7 @@ arm_status arm_mat_inverse_f32(
/* Index modifier to navigate through the columns */
l = 0U;
//for(loopCnt = 0U; loopCnt < numCols; loopCnt++)
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
@ -640,6 +1071,7 @@ arm_status arm_mat_inverse_f32(
*pInT1 = *pInT1 - (in * *pPRT_in++);
pInT1++;
}
/* Loop over the number of columns to
replace the elements in the destination matrix */
for (j = 0U; j < numCols; j++)
@ -651,19 +1083,21 @@ arm_status arm_mat_inverse_f32(
}
}
/* Increment the temporary input pointer */
/* Increment temporary input pointer */
pInT1 = pInT1 + l;
}
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
/* Increment the index modifier */
l++;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
@ -682,10 +1116,12 @@ arm_status arm_mat_inverse_f32(
status = ARM_MATH_SINGULAR;
}
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
* @} end of MatrixInv group
@} end of MatrixInv group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_inverse_f64.c
* Description: Floating-point matrix inverse
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,50 +29,28 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
* @defgroup MatrixInv Matrix Inverse
*
* Computes the inverse of a matrix.
*
* The inverse is defined only if the input matrix is square and non-singular (the determinant
* is non-zero). The function checks that the input and output matrices are square and of the
* same size.
*
* Matrix inversion is numerically sensitive and the CMSIS DSP library only supports matrix
* inversion of floating-point matrices.
*
* \par Algorithm
* The Gauss-Jordan method is used to find the inverse.
* The algorithm performs a sequence of elementary row-operations until it
* reduces the input matrix to an identity matrix. Applying the same sequence
* of elementary row-operations to an identity matrix yields the inverse matrix.
* If the input matrix is singular, then the algorithm terminates and returns error status
* <code>ARM_MATH_SINGULAR</code>.
* \image html MatrixInverse.gif "Matrix Inverse of a 3 x 3 matrix using Gauss-Jordan Method"
*/
/**
* @addtogroup MatrixInv
* @{
*/
/**
* @brief Floating-point matrix inverse.
* @param[in] *pSrc points to input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns
* <code>ARM_MATH_SIZE_MISMATCH</code> if the input matrix is not square or if the size
* of the output matrix does not match the size of the input matrix.
* If the input matrix is found to be singular (non-invertible), then the function returns
* <code>ARM_MATH_SINGULAR</code>. Otherwise, the function returns <code>ARM_MATH_SUCCESS</code>.
@brief Floating-point (64 bit) matrix inverse.
@param[in] pSrc points to input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible)
*/
arm_status arm_mat_inverse_f64(
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pDst)
arm_matrix_instance_f64 * pDst)
{
float64_t *pIn = pSrc->pData; /* input data matrix pointer */
float64_t *pOut = pDst->pData; /* output data matrix pointer */
@ -85,62 +63,61 @@ arm_status arm_mat_inverse_f64(
#if defined (ARM_MATH_DSP)
float64_t maxC; /* maximum value in the column */
/* Run the below code for Cortex-M4 and Cortex-M3 */
float64_t Xchg, in = 0.0f, in1; /* Temporary input values */
float64_t Xchg, in = 0.0, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
|| (pSrc->numRows != pDst->numRows))
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _
* | a11 a12 | 1 0 | | X11 X12 |
* | | | = | |
* |_ a21 a22 | 0 1 _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for column i is the greatest of the column.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is not the most significant of the columns, exchange that row with a row
* below it that does contain the most significant value in column i. If the most
* significant value of the column is zero, then an inverse to that matrix does not exist.
* The most significant value of the column is the absolute maximum.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
* Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
*----------------------------------------------------------------------------------------------------------------*/
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _
* | a11 a12 | 1 0 | | X11 X12 |
* | | | = | |
* |_ a21 a22 | 0 1 _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for column i is the greatest of the column.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is not the most significant of the columns, exchange that row with a row
* below it that does contain the most significant value in column i. If the most
* significant value of the column is zero, then an inverse to that matrix does not exist.
* The most significant value of the column is the absolute maximum.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
* Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
@ -155,22 +132,22 @@ arm_status arm_mat_inverse_f64(
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0f;
*pOutT1++ = 0.0;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0f;
*pOutT1++ = 1.0;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0f;
*pOutT1++ = 0.0;
j--;
}
/* Decrement the loop counter */
/* Decrement loop counter */
rowCnt--;
}
@ -208,7 +185,7 @@ arm_status arm_mat_inverse_f64(
}
/* Update the status if the matrix is singular */
if (maxC == 0.0f)
if (maxC == 0.0)
{
return ARM_MATH_SINGULAR;
}
@ -220,7 +197,7 @@ arm_status arm_mat_inverse_f64(
k = 1U;
/* Check if the pivot element is the most significant of the column */
if ( (in > 0.0f ? in : -in) != maxC)
if ( (in > 0.0 ? in : -in) != maxC)
{
/* Loop over the number rows present below */
i = numRows - (l + 1U);
@ -233,7 +210,7 @@ arm_status arm_mat_inverse_f64(
/* Look for the most significant element to
* replace in the rows below */
if ((*pInT2 > 0.0f ? *pInT2: -*pInT2) == maxC)
if ((*pInT2 > 0.0 ? *pInT2: -*pInT2) == maxC)
{
/* Loop over number of columns
* to the right of the pilot element */
@ -260,7 +237,7 @@ arm_status arm_mat_inverse_f64(
*pOutT2++ = *pOutT1;
*pOutT1++ = Xchg;
/* Decrement the loop counter */
/* Decrement loop counter */
j--;
}
@ -274,13 +251,13 @@ arm_status arm_mat_inverse_f64(
/* Update the destination pointer modifier */
k++;
/* Decrement the loop counter */
/* Decrement loop counter */
i--;
}
}
/* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0f))
if ((flag != 1U) && (in == 0.0))
{
return ARM_MATH_SINGULAR;
}
@ -385,19 +362,19 @@ arm_status arm_mat_inverse_f64(
in1 = *pInT2;
*pInT2++ = in1 - (in * *pPRT_pDst++);
/* Decrement the loop counter */
/* Decrement loop counter */
j--;
}
}
/* Increment the temporary input pointer */
/* Increment temporary input pointer */
pInT1 = pInT1 + l;
/* Decrement the loop counter */
/* Decrement loop counter */
k--;
/* Increment the pivot index */
/* Increment pivot index */
i++;
}
@ -414,59 +391,60 @@ arm_status arm_mat_inverse_f64(
#else
/* Run the below code for Cortex-M0 */
float64_t Xchg, in = 0.0f; /* Temporary input values */
float64_t Xchg, in = 0.0; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
|| (pSrc->numRows != pDst->numRows))
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _ _ _ _ _
* | | a11 a12 | | | 1 0 | | | X11 X12 |
* | | | | | | | = | |
* |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
* Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
*----------------------------------------------------------------------------------------------------------------*/
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _ _ _ _ _
* | | a11 a12 | | | 1 0 | | | X11 X12 |
* | | | | | | | = | |
* |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
* Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
@ -481,22 +459,22 @@ arm_status arm_mat_inverse_f64(
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0f;
*pOutT1++ = 0.0;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0f;
*pOutT1++ = 1.0;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0f;
*pOutT1++ = 0.0;
j--;
}
/* Decrement the loop counter */
/* Decrement loop counter */
rowCnt--;
}
@ -506,7 +484,7 @@ arm_status arm_mat_inverse_f64(
/* Index modifier to navigate through the columns */
l = 0U;
//for(loopCnt = 0U; loopCnt < numCols; loopCnt++)
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
@ -529,7 +507,7 @@ arm_status arm_mat_inverse_f64(
k = 1U;
/* Check if the pivot element is zero */
if (*pInT1 == 0.0f)
if (*pInT1 == 0.0)
{
/* Loop over the number rows present below */
for (i = (l + 1U); i < numRows; i++)
@ -540,7 +518,7 @@ arm_status arm_mat_inverse_f64(
/* Check if there is a non zero pivot element to
* replace in the rows below */
if (*pInT2 != 0.0f)
if (*pInT2 != 0.0)
{
/* Loop over number of columns
* to the right of the pilot element */
@ -572,7 +550,7 @@ arm_status arm_mat_inverse_f64(
}
/* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0f))
if ((flag != 1U) && (in == 0.0))
{
return ARM_MATH_SINGULAR;
}
@ -640,6 +618,7 @@ arm_status arm_mat_inverse_f64(
*pInT1 = *pInT1 - (in * *pPRT_in++);
pInT1++;
}
/* Loop over the number of columns to
replace the elements in the destination matrix */
for (j = 0U; j < numCols; j++)
@ -651,30 +630,32 @@ arm_status arm_mat_inverse_f64(
}
}
/* Increment the temporary input pointer */
/* Increment temporary input pointer */
pInT1 = pInT1 + l;
}
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
/* Increment the index modifier */
l++;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
if ((flag != 1U) && (in == 0.0f))
if ((flag != 1U) && (in == 0.0))
{
pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++)
{
if (pIn[i] != 0.0f)
if (pIn[i] != 0.0)
break;
}
@ -682,10 +663,11 @@ arm_status arm_mat_inverse_f64(
status = ARM_MATH_SINGULAR;
}
}
/* Return to application */
return (status);
}
/**
* @} end of MatrixInv group
@} end of MatrixInv group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_mult_f32.c
* Description: Floating-point matrix multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -62,6 +62,9 @@
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
#if defined(ARM_MATH_NEON)
#define GROUPOFROWS 8
arm_status arm_mat_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
@ -78,32 +81,225 @@ arm_status arm_mat_mult_f32(
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t in1, in2, in3, in4;
uint16_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
uint16_t col, i = 0U, j, row = numRowsA, rowCnt, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
float32x4_t a0V, a1V, a2V, a3V, a4V, a5V, a6V, a7V;
float32x4_t acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,temp;
float32x2_t accum = vdup_n_f32(0);
float32_t *pIn1B = pSrcA->pData;
float32_t *pIn1C = pSrcA->pData;
float32_t *pIn1D = pSrcA->pData;
float32_t *pIn1E = pSrcA->pData;
float32_t *pIn1F = pSrcA->pData;
float32_t *pIn1G = pSrcA->pData;
float32_t *pIn1H = pSrcA->pData;
float32_t *pxB,*pxC, *pxD, *pxE, *pxF, *pxG, *pxH; /* Temporary output data matrix pointer */
float32_t sum0,sum1, sum2,sum3, sum4, sum5 , sum6, sum7;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
/* Row loop */
rowCnt = row >> 3;
while(rowCnt > 0)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + GROUPOFROWS*i;
pxB = px + numColsB;
pxC = px + 2*numColsB;
pxD = px + 3*numColsB;
pxE = px + 4*numColsB;
pxF = px + 5*numColsB;
pxG = px + 6*numColsB;
pxH = px + 7*numColsB;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* Column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum0 = 0.0f;
sum1 = 0.0f;
sum2 = 0.0f;
sum3 = 0.0f;
sum4 = 0.0f;
sum5 = 0.0f;
sum6 = 0.0f;
sum7 = 0.0f;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pIn1 = pInA;
pIn1B = pIn1 + numColsA;
pIn1C = pIn1 + 2*numColsA;
pIn1D = pIn1 + 3*numColsA;
pIn1E = pIn1 + 4*numColsA;
pIn1F = pIn1 + 5*numColsA;
pIn1G = pIn1 + 6*numColsA;
pIn1H = pIn1 + 7*numColsA;
acc0 = vdupq_n_f32(0.0);
acc1 = vdupq_n_f32(0.0);
acc2 = vdupq_n_f32(0.0);
acc3 = vdupq_n_f32(0.0);
acc4 = vdupq_n_f32(0.0);
acc5 = vdupq_n_f32(0.0);
acc6 = vdupq_n_f32(0.0);
acc7 = vdupq_n_f32(0.0);
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* Matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
a0V = vld1q_f32(pIn1);
a1V = vld1q_f32(pIn1B);
a2V = vld1q_f32(pIn1C);
a3V = vld1q_f32(pIn1D);
a4V = vld1q_f32(pIn1E);
a5V = vld1q_f32(pIn1F);
a6V = vld1q_f32(pIn1G);
a7V = vld1q_f32(pIn1H);
pIn1 += 4;
pIn1B += 4;
pIn1C += 4;
pIn1D += 4;
pIn1E += 4;
pIn1F += 4;
pIn1G += 4;
pIn1H += 4;
temp[0] = *pIn2;
pIn2 += numColsB;
temp[1] = *pIn2;
pIn2 += numColsB;
temp[2] = *pIn2;
pIn2 += numColsB;
temp[3] = *pIn2;
pIn2 += numColsB;
acc0 = vmlaq_f32(acc0,a0V,temp);
acc1 = vmlaq_f32(acc1,a1V,temp);
acc2 = vmlaq_f32(acc2,a2V,temp);
acc3 = vmlaq_f32(acc3,a3V,temp);
acc4 = vmlaq_f32(acc4,a4V,temp);
acc5 = vmlaq_f32(acc5,a5V,temp);
acc6 = vmlaq_f32(acc6,a6V,temp);
acc7 = vmlaq_f32(acc7,a7V,temp);
/* Decrement the loop count */
colCnt--;
}
accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
sum0 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
sum1 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2));
sum2 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3));
sum3 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(acc4), vget_high_f32(acc4));
sum4 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(acc5), vget_high_f32(acc5));
sum5 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(acc6), vget_high_f32(acc6));
sum6 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(acc7), vget_high_f32(acc7));
sum7 += accum[0] + accum[1];
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA & 3;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
sum0 += *pIn1++ * (*pIn2);
sum1 += *pIn1B++ * (*pIn2);
sum2 += *pIn1C++ * (*pIn2);
sum3 += *pIn1D++ * (*pIn2);
sum4 += *pIn1E++ * (*pIn2);
sum5 += *pIn1F++ * (*pIn2);
sum6 += *pIn1G++ * (*pIn2);
sum7 += *pIn1H++ * (*pIn2);
pIn2 += numColsB;
/* Decrement the loop counter */
colCnt--;
}
/* Store the result in the destination buffer */
*px++ = sum0;
*pxB++ = sum1;
*pxC++ = sum2;
*pxD++ = sum3;
*pxE++ = sum4;
*pxF++ = sum5;
*pxG++ = sum6;
*pxH++ = sum7;
/* Update the pointer pIn2 to point to the starting address of the next column */
j++;
pIn2 = pSrcB->pData + j;
/* Decrement the column loop counter */
col--;
} while (col > 0U);
/* Update the pointer pInA to point to the starting address of the next row */
i = i + numColsB;
pInA = pInA + GROUPOFROWS*numColsA;
/* Decrement the row loop counter */
rowCnt--;
}
/*
i was the index of a group of rows computed by previous loop.
Now i is the index of a row since below code is computing row per row
and no more group of row per group of rows.
*/
i = GROUPOFROWS*i;
rowCnt = row & 7;
while(rowCnt > 0)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + i;
@ -117,7 +313,7 @@ arm_status arm_mat_mult_f32(
j = 0U;
/* column loop */
/* Column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
@ -126,43 +322,43 @@ arm_status arm_mat_mult_f32(
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pIn1 = pInA;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
acc0 = vdupq_n_f32(0.0);
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
/* Matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
in3 = *pIn2;
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
a0V = vld1q_f32(pIn1); // load & separate real/imag pSrcA (de-interleave 2)
pIn1 += 4;
temp[0] = *pIn2;
pIn2 += numColsB;
in1 = pIn1[0];
in2 = pIn1[1];
sum += in1 * in3;
in4 = *pIn2;
temp[1] = *pIn2;
pIn2 += numColsB;
temp[2] = *pIn2;
pIn2 += numColsB;
temp[3] = *pIn2;
pIn2 += numColsB;
sum += in2 * in4;
in3 = *pIn2;
pIn2 += numColsB;
in1 = pIn1[2];
in2 = pIn1[3];
sum += in1 * in3;
in4 = *pIn2;
pIn2 += numColsB;
sum += in2 * in4;
pIn1 += 4U;
acc0 = vmlaq_f32(acc0,a0V,temp);
/* Decrement the loop count */
colCnt--;
}
accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
sum += accum[0] + accum[1];
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
sum += *pIn1++ * (*pIn2);
pIn2 += numColsB;
@ -182,40 +378,67 @@ arm_status arm_mat_mult_f32(
} while (col > 0U);
/* Update the pointer pInA to point to the starting address of the next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement the row loop counter */
rowCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
/* Run the below code for Cortex-M0 */
float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
arm_status arm_mat_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
float32_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
float32_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
float32_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
float32_t *pOut = pDst->pData; /* Output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
float32_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pInA with each column in pInB */
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of the row being processed */
/* Output pointer is set to starting address of row being processed */
px = pOut + i;
/* For every row wise process, the column loop counter is to be initiated */
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
@ -224,43 +447,78 @@ arm_status arm_mat_mult_f32(
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0.0f;
/* Initialize the pointer pIn1 to point to the starting address of the row being processed */
/* Initialize pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
/* Matrix A columns number of MAC operations are to be performed */
colCnt = numColsA;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 MACs at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
sum += *pIn1++ * (*pIn2);
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement the loop counter */
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Store the result in the destination buffer */
/* Loop unrolling: Compute remaining MACs */
colCnt = numColsA % 0x4U;
#else
/* Initialize cntCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Store result in destination buffer */
*px++ = sum;
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
/* Update the pointer pIn2 to point to the starting address of the next column */
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
#endif /* #if defined (ARM_MATH_DSP) */
/* Update the pointer pInA to point to the starting address of the next row */
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@ -269,6 +527,8 @@ arm_status arm_mat_mult_f32(
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
* @} end of MatrixMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_mult_fast_q15.c
* Description: Q15 matrix multiplication (fast variant)
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,206 +29,165 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixMult
* @{
@addtogroup MatrixMult
@{
*/
/**
* @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @param[in] *pState points to the array for storing intermediate results
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The difference between the function arm_mat_mult_q15() and this fast variant is that
* the fast variant use a 32-bit rather than a 64-bit accumulator.
* The result of each 1.15 x 1.15 multiplication is truncated to
* 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
* format. Finally, the accumulator is saturated and converted to a 1.15 result.
*
* \par
* The fast version has the same overflow behavior as the standard version but provides
* less precision since it discards the low 16 bits of each multiplication result.
* In order to avoid overflows completely the input signals must be scaled down.
* Scale down one of the input matrices by log2(numColsA) bits to
* avoid overflows, as a total of numColsA additions are computed internally for each
* output element.
*
* \par
* See <code>arm_mat_mult_q15()</code> for a slower implementation of this function
* which uses 64-bit accumulation to provide higher precision.
@brief Q15 matrix multiplication (fast variant).
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@param[in] pState points to the array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The difference between the function \ref arm_mat_mult_q15() and this fast variant is that
the fast variant use a 32-bit rather than a 64-bit accumulator.
The result of each 1.15 x 1.15 multiplication is truncated to
2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
format. Finally, the accumulator is saturated and converted to a 1.15 result.
@par
The fast version has the same overflow behavior as the standard version but provides
less precision since it discards the low 16 bits of each multiplication result.
In order to avoid overflows completely the input signals must be scaled down.
Scale down one of the input matrices by log2(numColsA) bits to avoid overflows,
as a total of numColsA additions are computed internally for each output element.
@remark
Refer to \ref arm_mat_mult_q15() for a slower implementation of this function
which uses 64-bit accumulation to provide higher precision.
*/
arm_status arm_mat_mult_fast_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pState)
arm_matrix_instance_q15 * pDst,
q15_t * pState)
{
q31_t sum; /* accumulator */
q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#ifndef UNALIGNED_SUPPORT_DISABLE
q31_t in; /* Temporary variable to hold the input value */
q31_t inA1, inA2, inB1, inB2;
q31_t sum2, sum3, sum4;
q15_t *pInA2, *pInB2, *px2;
uint32_t j = 0;
q31_t sum; /* Accumulator */
q15_t *pSrcBT = pState; /* Input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* Number of rows of input matrix A */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#if defined (ARM_MATH_DSP)
q31_t in; /* Temporary variable to hold the input value */
q31_t inA1, inB1, inA2, inB2;
q31_t sum2, sum3, sum4;
q15_t *pInA2, *pInB2, *px2;
uint32_t j = 0;
#else
q15_t in; /* Temporary variable to hold the input value */
q15_t inA1, inA2, inB1, inB2;
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
q15_t in; /* Temporary variable to hold the input value */
q15_t inA1, inB1, inA2, inB2;
#endif /* #if defined (ARM_MATH_DSP) */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose */
do
{
/* Apply loop unrolling and exchange the columns with row elements */
col = numColsB >> 2;
/* The pointer px is set to starting address of the column being processed */
/* The pointer px is set to starting address of column being processed */
px = pSrcBT + i;
/* Apply loop unrolling and exchange columns with row elements */
col = numColsB >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
#ifndef UNALIGNED_SUPPORT_DISABLE
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
/* Unpack and store one element in the destination */
#if defined (ARM_MATH_DSP)
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pInB);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Unpack and store the second element in the destination */
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
/* Unpack and store one element in the destination */
in = read_q15x2_ia ((q15_t **) &pInB);
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer px to point to the next row of the transposed matrix */
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
/* Unpack and store the second element in the destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
#else
/* Read one element from the row */
in = *pInB++;
/* Store one element in the destination */
*px = in;
/* Update the pointer px to point to the next row of the transposed matrix */
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
/* Read one element from the row */
#else /* #if defined (ARM_MATH_DSP) */
/* Read one element from row */
in = *pInB++;
/* Store one element in the destination */
/* Store one element in destination */
*px = in;
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Read one element from the row */
in = *pInB++;
/* Store one element in the destination */
*px = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
/* Read one element from the row */
in = *pInB++;
/* Store one element in the destination */
*px = in;
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
/* Decrement the column loop counter */
in = *pInB++;
*px = in;
px += numRowsB;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement column loop counter */
col--;
}
@ -238,31 +197,31 @@ arm_status arm_mat_mult_fast_q15(
while (col > 0U)
{
/* Read and store the input element in the destination */
/* Read and store input element in destination */
*px = *pInB++;
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Reset the variables for the usage in the following multiplication process */
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
#ifndef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
/* Process two rows from matrix A at a time and output two rows at a time */
row = row >> 1;
row = row >> 1U;
px2 = px + numColsB;
#endif
@ -270,29 +229,28 @@ arm_status arm_mat_mult_fast_q15(
/* row loop */
while (row > 0U)
{
/* For every row wise process, the column loop counter is to be initiated */
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the transposed pSrcB data */
/* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
#ifndef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
/* Process two (transposed) columns from matrix B at a time */
col = col >> 1;
col = col >> 1U;
j = 0;
#endif
/* column loop */
while (col > 0U)
{
/* Set the variable sum, that acts as accumulator, to zero */
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate the pointer pInA to point to the starting address of the column being processed */
/* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i;
#ifndef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
sum2 = 0;
sum3 = 0;
sum4 = 0;
@ -301,56 +259,55 @@ arm_status arm_mat_mult_fast_q15(
pInB2 = pInB + numRowsB;
/* Read in two elements at once - alows dual MAC instruction */
colCnt = numColsA >> 1;
colCnt = numColsA >> 1U;
#else
colCnt = numColsA >> 2;
colCnt = numColsA >> 2U;
#endif
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
#ifndef UNALIGNED_SUPPORT_DISABLE
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
inA1 = *__SIMD32(pInA)++;
inB1 = *__SIMD32(pInB)++;
inA2 = *__SIMD32(pInA2)++;
inB2 = *__SIMD32(pInB2)++;
#if defined (ARM_MATH_DSP)
/* read real and imag values from pSrcA and pSrcB buffer */
inA1 = read_q15x2_ia ((q15_t **) &pInA);
inB1 = read_q15x2_ia ((q15_t **) &pInB);
inA2 = read_q15x2_ia ((q15_t **) &pInA2);
inB2 = read_q15x2_ia ((q15_t **) &pInB2);
/* Multiply and Accumlates */
sum = __SMLAD(inA1, inB1, sum);
sum2 = __SMLAD(inA1, inB2, sum2);
sum3 = __SMLAD(inA2, inB1, sum3);
sum4 = __SMLAD(inA2, inB2, sum4);
#else
inA1 = *pInA;
inB1 = *pInB;
/* read real and imag values from pSrcA and pSrcB buffer */
inA1 = *pInA++;
inB1 = *pInB++;
/* Multiply and Accumlates */
sum += inA1 * inB1;
inA2 = pInA[1];
inB2 = pInB[1];
inA2 = *pInA++;
inB2 = *pInB++;
sum += inA2 * inB2;
inA1 = pInA[2];
inB1 = pInB[2];
inA1 = *pInA++;
inB1 = *pInB++;
sum += inA1 * inB1;
inA2 = pInA[3];
inB2 = pInB[3];
inA2 = *pInA++;
inB2 = *pInB++;
sum += inA2 * inB2;
#endif /* #if defined (ARM_MATH_DSP) */
pInA += 4;
pInB += 4;
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
/* process odd column samples */
#ifndef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
if (numColsA & 1U) {
inA1 = *pInA++;
inB1 = *pInB++;
@ -366,44 +323,45 @@ arm_status arm_mat_mult_fast_q15(
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
sum += (q31_t) (*pInA++) * (*pInB++);
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
sum += (q31_t) *pInA++ * *pInB++;
/* Decrement loop counter */
colCnt--;
}
#endif
#endif /* #if defined (ARM_MATH_DSP) */
/* Saturate and store the result in the destination buffer */
/* Saturate and store result in destination buffer */
*px++ = (q15_t) (sum >> 15);
#ifndef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
*px++ = (q15_t) (sum2 >> 15);
*px2++ = (q15_t) (sum3 >> 15);
*px2++ = (q15_t) (sum4 >> 15);
j += numRowsB * 2;
#endif
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
}
i = i + numColsA;
#ifndef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
i = i + numColsA;
px = px2 + (numColsB & 1U);
px2 = px + numColsB;
#endif
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
}
/* Compute any remaining odd row/column below */
#ifndef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
/* Compute remaining output column */
if (numColsB & 1U) {
@ -412,7 +370,7 @@ arm_status arm_mat_mult_fast_q15(
row = numRowsA & (~0x1);
/* Point to remaining unfilled column in output matrix */
px = pDst->pData+numColsB-1;
px = pDst->pData + numColsB-1;
pInA = pSrcA->pData;
/* row loop */
@ -420,26 +378,26 @@ arm_status arm_mat_mult_fast_q15(
{
/* point to last column in matrix B */
pInB = pSrcBT + numRowsB*(numColsB-1);
pInB = pSrcBT + numRowsB * (numColsB-1);
/* Set the variable sum, that acts as accumulator, to zero */
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2;
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
inA1 = *__SIMD32(pInA)++;
inA2 = *__SIMD32(pInA)++;
inB1 = *__SIMD32(pInB)++;
inB2 = *__SIMD32(pInB)++;
inA1 = read_q15x2_ia ((q15_t **) &pInA);
inA2 = read_q15x2_ia ((q15_t **) &pInA);
inB1 = read_q15x2_ia ((q15_t **) &pInB);
inB2 = read_q15x2_ia ((q15_t **) &pInB);
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
@ -449,11 +407,11 @@ arm_status arm_mat_mult_fast_q15(
colCnt--;
}
/* Store the result in the destination buffer */
*px = (q15_t) (sum >> 15);
/* Store result in destination buffer */
*px = (q15_t) (sum >> 15);
px += numColsB;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
}
}
@ -462,7 +420,7 @@ arm_status arm_mat_mult_fast_q15(
if (numRowsA & 1U) {
/* point to last row in output matrix */
px = pDst->pData+(numColsB)*(numRowsA-1);
px = pDst->pData + (numColsB) * (numRowsA-1);
pInB = pSrcBT;
col = numColsB;
@ -471,48 +429,48 @@ arm_status arm_mat_mult_fast_q15(
/* col loop */
while (col > 0)
{
/* point to last row in matrix A */
pInA = pSrcA->pData + (numRowsA-1)*numColsA;
pInA = pSrcA->pData + (numRowsA-1) * numColsA;
/* Set the variable sum, that acts as accumulator, to zero */
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2;
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
inA1 = *__SIMD32(pInA)++;
inA2 = *__SIMD32(pInA)++;
inB1 = *__SIMD32(pInB)++;
inB2 = *__SIMD32(pInB)++;
inA1 = read_q15x2_ia ((q15_t **) &pInA);
inA2 = read_q15x2_ia ((q15_t **) &pInA);
inB1 = read_q15x2_ia ((q15_t **) &pInB);
inB2 = read_q15x2_ia ((q15_t **) &pInB);
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
colCnt = numColsA & 3U;
colCnt = numColsA % 4U;
while (colCnt > 0U) {
sum += (q31_t) (*pInA++) * (*pInB++);
colCnt--;
}
/* Store the result in the destination buffer */
*px++ = (q15_t) (sum >> 15);
/* Store result in destination buffer */
*px++ = (q15_t) (sum >> 15);
/* Decrement the col loop counter */
/* Decrement column loop counter */
col--;
}
}
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
#endif /* #if defined (ARM_MATH_DSP) */
/* set status as ARM_MATH_SUCCESS */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@ -521,5 +479,5 @@ arm_status arm_mat_mult_fast_q15(
}
/**
* @} end of MatrixMult group
@} end of MatrixMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_mult_fast_q31.c
* Description: Q31 matrix multiplication (fast variant)
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,226 +29,166 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixMult
* @{
@addtogroup MatrixMult
@{
*/
/**
* @brief Q31 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The difference between the function arm_mat_mult_q31() and this fast variant is that
* the fast variant use a 32-bit rather than a 64-bit accumulator.
* The result of each 1.31 x 1.31 multiplication is truncated to
* 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
* format. Finally, the accumulator is saturated and converted to a 1.31 result.
*
* \par
* The fast version has the same overflow behavior as the standard version but provides
* less precision since it discards the low 32 bits of each multiplication result.
* In order to avoid overflows completely the input signals must be scaled down.
* Scale down one of the input matrices by log2(numColsA) bits to
* avoid overflows, as a total of numColsA additions are computed internally for each
* output element.
*
* \par
* See <code>arm_mat_mult_q31()</code> for a slower implementation of this function
* which uses 64-bit accumulation to provide higher precision.
@brief Q31 matrix multiplication (fast variant).
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The difference between the function \ref arm_mat_mult_q31() and this fast variant is that
the fast variant use a 32-bit rather than a 64-bit accumulator.
The result of each 1.31 x 1.31 multiplication is truncated to
2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
format. Finally, the accumulator is saturated and converted to a 1.31 result.
@par
The fast version has the same overflow behavior as the standard version but provides
less precision since it discards the low 32 bits of each multiplication result.
In order to avoid overflows completely the input signals must be scaled down.
Scale down one of the input matrices by log2(numColsA) bits to avoid overflows,
as a total of numColsA additions are computed internally for each output element.
@remark
Refer to \ref arm_mat_mult_q31() for a slower implementation of this function
which uses 64-bit accumulation to provide higher precision.
*/
arm_status arm_mat_mult_fast_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
arm_matrix_instance_q31 * pDst)
{
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *px; /* Temporary output data matrix pointer */
q31_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
q31_t inA1, inB1;
#if defined (ARM_MATH_DSP)
q31_t sum2, sum3, sum4;
q31_t inA2, inB2;
q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pInA2;
q31_t *px; /* Temporary output data matrix pointer */
q31_t *px2;
q31_t sum1, sum2, sum3, sum4; /* Accumulator */
q31_t inA1, inA2, inB1, inB2;
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
px = pDst->pData;
#if defined (ARM_MATH_DSP)
row = row >> 1;
row = row >> 1U;
px2 = px + numColsB;
#endif
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
while (row > 0U)
{
/* For every row wise process, the column loop counter is to be initiated */
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pInB = pSrcB->pData;
j = 0U;
#if defined (ARM_MATH_DSP)
col = col >> 1;
#endif
col = col >> 1U;
/* column loop */
while (col > 0U)
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate data pointers */
pInA = pSrcA->pData + i;
pInB = pSrcB->pData + j;
#if defined (ARM_MATH_DSP)
sum1 = 0;
sum2 = 0;
sum3 = 0;
sum4 = 0;
/* Initiate data pointers */
pInA = pSrcA->pData + i;
pInB = pSrcB->pData + j;
pInA2 = pInA + numColsA;
colCnt = numColsA;
#else
colCnt = numColsA >> 2;
#endif
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
#if defined (ARM_MATH_DSP)
inA1 = *pInA++;
inB1 = pInB[0];
inA2 = *pInA2++;
inB2 = pInB[1];
pInB += numColsB;
sum = __SMMLA(inA1, inB1, sum);
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(inA1, inB1, sum1);
sum2 = __SMMLA(inA1, inB2, sum2);
sum3 = __SMMLA(inA2, inB1, sum3);
sum4 = __SMMLA(inA2, inB2, sum4);
#else
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* Perform the multiply-accumulates */
inB1 = *pInB;
pInB += numColsB;
inA1 = pInA[0];
sum = __SMMLA(inA1, inB1, sum);
inB1 = *pInB;
pInB += numColsB;
inA1 = pInA[1];
sum = __SMMLA(inA1, inB1, sum);
inB1 = *pInB;
pInB += numColsB;
inA1 = pInA[2];
sum = __SMMLA(inA1, inB1, sum);
inB1 = *pInB;
pInB += numColsB;
inA1 = pInA[3];
sum = __SMMLA(inA1, inB1, sum);
pInA += 4U;
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
sum2 = (q31_t) ((((q63_t) sum2 << 32) + ((q63_t) inA1 * inB2)) >> 32);
sum3 = (q31_t) ((((q63_t) sum3 << 32) + ((q63_t) inA2 * inB1)) >> 32);
sum4 = (q31_t) ((((q63_t) sum4 << 32) + ((q63_t) inA2 * inB2)) >> 32);
#endif
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
#ifdef ARM_MATH_CM0_FAMILY
/* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here. */
colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
sum = __SMMLA(*pInA++, *pInB, sum);
pInB += numColsB;
colCnt--;
}
j++;
#endif
/* Convert the result from 2.30 to 1.31 format and store in destination buffer */
*px++ = sum << 1;
#if defined (ARM_MATH_DSP)
*px++ = sum1 << 1;
*px++ = sum2 << 1;
*px2++ = sum3 << 1;
*px2++ = sum4 << 1;
j += 2;
#endif
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
}
i = i + numColsA;
i = i + (numColsA << 1U);
px = px2 + (numColsB & 1U);
px2 = px + numColsB;
#if defined (ARM_MATH_DSP)
i = i + numColsA;
px = px2 + (numColsB & 1U);
px2 = px + numColsB;
#endif
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
}
/* Compute any remaining odd row/column below */
#if defined (ARM_MATH_DSP)
/* Compute remaining output column */
if (numColsB & 1U) {
/* Avoid redundant computation of last element */
row = numRowsA & (~0x1);
row = numRowsA & (~1U);
/* Point to remaining unfilled column in output matrix */
px = pDst->pData+numColsB-1;
px = pDst->pData + numColsB-1;
pInA = pSrcA->pData;
/* row loop */
@ -258,49 +198,75 @@ arm_status arm_mat_mult_fast_q31(
/* point to last column in matrix B */
pInB = pSrcB->pData + numColsB-1;
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Set variable sum1, that acts as accumulator, to zero */
sum1 = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 columns at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
inA1 = *pInA++;
inA2 = *pInA++;
inB1 = *pInB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
sum = __SMMLA(inA1, inB1, sum);
sum = __SMMLA(inA2, inB2, sum);
inA1 = *pInA++;
inA2 = *pInA++;
inB1 = *pInB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
sum = __SMMLA(inA1, inB1, sum);
sum = __SMMLA(inA2, inB2, sum);
/* Decrement the loop counter */
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
/* Decrement loop counter */
colCnt--;
}
colCnt = numColsA & 3U;
/* Loop unrolling: Compute remaining column */
colCnt = numColsA % 4U;
#else
/* Initialize colCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U) {
sum = __SMMLA(*pInA++, *pInB, sum);
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
colCnt--;
}
/* Convert the result from 2.30 to 1.31 format and store in destination buffer */
*px = sum << 1;
*px = sum1 << 1;
px += numColsB;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
}
}
@ -309,7 +275,7 @@ arm_status arm_mat_mult_fast_q31(
if (numRowsA & 1U) {
/* point to last row in output matrix */
px = pDst->pData+(numColsB)*(numRowsA-1);
px = pDst->pData + (numColsB) * (numRowsA-1);
col = numColsB;
i = 0U;
@ -319,14 +285,16 @@ arm_status arm_mat_mult_fast_q31(
{
/* point to last row in matrix A */
pInA = pSrcA->pData + (numRowsA-1)*numColsA;
pInA = pSrcA->pData + (numRowsA-1) * numColsA;
pInB = pSrcB->pData + i;
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Set variable sum1, that acts as accumulator, to zero */
sum1 = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 columns at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
@ -337,8 +305,13 @@ arm_status arm_mat_mult_fast_q31(
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
sum = __SMMLA(inA1, inB1, sum);
sum = __SMMLA(inA2, inB2, sum);
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(inA1, inB1, sum1);
sum1 = __SMMLA(inA2, inB2, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA2 * inB2)) >> 32);
#endif
inA1 = *pInA++;
inA2 = *pInA++;
@ -346,32 +319,49 @@ arm_status arm_mat_mult_fast_q31(
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
sum = __SMMLA(inA1, inB1, sum);
sum = __SMMLA(inA2, inB2, sum);
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(inA1, inB1, sum1);
sum1 = __SMMLA(inA2, inB2, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA2 * inB2)) >> 32);
#endif
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
colCnt = numColsA & 3U;
/* Loop unrolling: Compute remaining column */
colCnt = numColsA % 4U;
#else
/* Initialize colCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U) {
sum = __SMMLA(*pInA++, *pInB, sum);
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
colCnt--;
}
/* Saturate and store the result in the destination buffer */
*px++ = sum << 1;
*px++ = sum1 << 1;
i++;
/* Decrement the col loop counter */
/* Decrement col loop counter */
col--;
}
}
#endif /* #if defined (ARM_MATH_DSP) */
/* set status as ARM_MATH_SUCCESS */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@ -380,5 +370,5 @@ arm_status arm_mat_mult_fast_q31(
}
/**
* @} end of MatrixMult group
@} end of MatrixMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_mult_q15.c
* Description: Q15 matrix multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,206 +29,129 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixMult
* @{
@addtogroup MatrixMult
@{
*/
/**
* @brief Q15 matrix multiplication
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @param[in] *pState points to the array for storing intermediate results (Unused)
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using a 64-bit internal accumulator. The inputs to the
* multiplications are in 1.15 format and multiplications yield a 2.30 result.
* The 2.30 intermediate
* results are accumulated in a 64-bit accumulator in 34.30 format. This approach
* provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
* truncated to 34.15 format by discarding the low 15 bits and then saturated to
* 1.15 format.
*
* \par
* Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
*
@brief Q15 matrix multiplication.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@param[in] pState points to the array for storing intermediate results (Unused)
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator. The inputs to the
multiplications are in 1.15 format and multiplications yield a 2.30 result.
The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
This approach provides 33 guard bits and there is no risk of overflow.
The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits
and then saturated to 1.15 format.
@par
Refer to \ref arm_mat_mult_fast_q15() for a faster but less precise version of this function.
*/
arm_status arm_mat_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pState)
arm_matrix_instance_q15 * pDst,
q15_t * pState)
{
q63_t sum; /* accumulator */
q63_t sum; /* Accumulator */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_DSP) /* != CM0 */
/* Run the below code for Cortex-M4 and Cortex-M3 */
q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
uint16_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#ifndef UNALIGNED_SUPPORT_DISABLE
q31_t in; /* Temporary variable to hold the input value */
q31_t pSourceA1, pSourceB1, pSourceA2, pSourceB2;
#else
q15_t in; /* Temporary variable to hold the input value */
q15_t inA1, inB1, inA2, inB2;
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
q15_t *pSrcBT = pState; /* Input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* Number of rows of input matrix A */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
q31_t in; /* Temporary variable to hold the input value */
q31_t inA1, inB1, inA2, inB2;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose */
do
{
/* Apply loop unrolling and exchange the columns with row elements */
col = numColsB >> 2;
/* The pointer px is set to starting address of the column being processed */
/* The pointer px is set to starting address of column being processed */
px = pSrcBT + i;
/* Apply loop unrolling and exchange columns with row elements */
col = numColsB >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
#ifndef UNALIGNED_SUPPORT_DISABLE
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pInB);
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
/* Unpack and store one element in the destination */
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Unpack and store the second element in the destination */
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pInB);
/* Unpack and store one element in the destination */
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
/* Unpack and store the second element in the destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
#else
/* Read one element from the row */
in = *pInB++;
/* Store one element in the destination */
*px = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
/* Read one element from the row */
in = *pInB++;
/* Store one element in the destination */
*px = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
/* Read one element from the row */
in = *pInB++;
/* Store one element in the destination */
*px = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
/* Read one element from the row */
in = *pInB++;
/* Store one element in the destination */
*px = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
}
@ -238,24 +161,24 @@ arm_status arm_mat_mult_q15(
while (col > 0U)
{
/* Read and store the input element in the destination */
/* Read and store input element in destination */
*px = *pInB++;
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Reset the variables for the usage in the following multiplication process */
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
@ -264,123 +187,98 @@ arm_status arm_mat_mult_q15(
/* row loop */
do
{
/* For every row wise process, the column loop counter is to be initiated */
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the transposed pSrcB data */
/* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Apply loop unrolling and compute 2 MACs simultaneously. */
colCnt = numColsA >> 2;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
/* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i;
/* Apply loop unrolling and compute 2 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
#ifndef UNALIGNED_SUPPORT_DISABLE
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA1 = *__SIMD32(pInA)++;
pSourceB1 = *__SIMD32(pInB)++;
inA1 = read_q15x2_ia ((q15_t **) &pInA);
inB1 = read_q15x2_ia ((q15_t **) &pInB);
pSourceA2 = *__SIMD32(pInA)++;
pSourceB2 = *__SIMD32(pInB)++;
inA2 = read_q15x2_ia ((q15_t **) &pInA);
inB2 = read_q15x2_ia ((q15_t **) &pInB);
/* Multiply and Accumlates */
sum = __SMLALD(pSourceA1, pSourceB1, sum);
sum = __SMLALD(pSourceA2, pSourceB2, sum);
sum = __SMLALD(inA1, inB1, sum);
sum = __SMLALD(inA2, inB2, sum);
#else
/* read real and imag values from pSrcA and pSrcB buffer */
inA1 = *pInA++;
inB1 = *pInB++;
inA2 = *pInA++;
/* Multiply and Accumlates */
sum += inA1 * inB1;
inB2 = *pInB++;
inA1 = *pInA++;
inB1 = *pInB++;
/* Multiply and Accumlates */
sum += inA2 * inB2;
inA2 = *pInA++;
inB2 = *pInB++;
/* Multiply and Accumlates */
sum += inA1 * inB1;
sum += inA2 * inB2;
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
/* process remaining column samples */
colCnt = numColsA & 3U;
colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
sum += *pInA++ * *pInB++;
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
/* Saturate and store the result in the destination buffer */
/* Saturate and store result in destination buffer */
*px = (q15_t) (__SSAT((sum >> 15), 16));
px++;
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
} while (col > 0U);
i = i + numColsA;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
#else
#else /* #if defined (ARM_MATH_DSP) */
/* Run the below code for Cortex-M0 */
q15_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
q15_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
q15_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
q15_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
q15_t *pOut = pDst->pData; /* Output data matrix pointer */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
@ -391,11 +289,10 @@ arm_status arm_mat_mult_q15(
/* Output pointer is set to starting address of the row being processed */
px = pOut + i;
/* For every row wise process, the column loop counter is to be initiated */
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
@ -404,7 +301,7 @@ arm_status arm_mat_mult_q15(
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate the pointer pIn1 to point to the starting address of pSrcA */
/* Initiate pointer pIn1 to point to starting address of pSrcA */
pIn1 = pInA;
/* Matrix A columns number of MAC operations are to be performed */
@ -413,38 +310,41 @@ arm_status arm_mat_mult_q15(
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* Perform the multiply-accumulates */
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform multiply-accumulates */
sum += (q31_t) * pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
/* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
/* Saturate and store the result in the destination buffer */
/* Convert result from 34.30 to 1.15 format and store saturated value in destination buffer */
/* Saturate and store result in destination buffer */
*px++ = (q15_t) __SSAT((sum >> 15), 16);
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
/* Update the pointer pIn2 to point to the starting address of the next column */
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
/* Update the pointer pSrcA to point to the starting address of the next row */
/* Update pointer pSrcA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
#endif /* #if defined (ARM_MATH_DSP) */
/* set status as ARM_MATH_SUCCESS */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@ -453,5 +353,5 @@ arm_status arm_mat_mult_q15(
}
/**
* @} end of MatrixMult group
@} end of MatrixMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_mult_q31.c
* Description: Q31 matrix multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,254 +29,168 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixMult
* @{
@addtogroup MatrixMult
@{
*/
/**
* @brief Q31 matrix multiplication
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using an internal 64-bit accumulator.
* The accumulator has a 2.62 format and maintains full precision of the intermediate
* multiplication results but provides only a single guard bit. There is no saturation
* on intermediate additions. Thus, if the accumulator overflows it wraps around and
* distorts the result. The input signals should be scaled down to avoid intermediate
* overflows. The input is thus scaled down by log2(numColsA) bits
* to avoid overflows, as a total of numColsA additions are performed internally.
* The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
*
* \par
* See <code>arm_mat_mult_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
*
@brief Q31 matrix multiplication.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The accumulator has a 2.62 format and maintains full precision of the intermediate
multiplication results but provides only a single guard bit. There is no saturation
on intermediate additions. Thus, if the accumulator overflows it wraps around and
distorts the result. The input signals should be scaled down to avoid intermediate
overflows. The input is thus scaled down by log2(numColsA) bits
to avoid overflows, as a total of numColsA additions are performed internally.
The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
@remark
Refer to \ref arm_mat_mult_fast_q31() for a faster but less precise implementation of this function.
*/
arm_status arm_mat_mult_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pOut = pDst->pData; /* Output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
q63_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint16_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
q31_t a0, a1, a2, a3, b0, b1, b2, b3;
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of the row being processed */
/* Output pointer is set to starting address of row being processed */
px = pOut + i;
/* For every row wise process, the column loop counter is to be initiated */
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate the pointer pIn1 to point to the starting address of pInA */
/* Initialize pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
colCnt = numColsA >> 2;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 MACs at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
b0 = *pIn2;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
a0 = *pIn1++;
a1 = *pIn1++;
b1 = *pIn2;
pIn2 += numColsB;
b2 = *pIn2;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) a0 *b0;
sum += (q63_t) a1 *b1;
a2 = *pIn1++;
a3 = *pIn1++;
b3 = *pIn2;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) a2 *b2;
sum += (q63_t) a3 *b3;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
/* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining MACs */
colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* Perform the multiply-accumulates */
sum += (q63_t) * pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement the loop counter */
colCnt--;
}
/* Convert the result from 2.62 to 1.31 format and store in destination buffer */
*px++ = (q31_t) (sum >> 31);
/* Update the pointer pIn2 to point to the starting address of the next column */
j++;
pIn2 = (pSrcB->pData) + j;
/* Decrement the column loop counter */
col--;
} while (col > 0U);
#else
/* Run the below code for Cortex-M0 */
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + i;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate the pointer pIn1 to point to the starting address of pInA */
pIn1 = pInA;
/* Matrix A columns number of MAC operations are to be performed */
/* Initialize cntCnt with number of columns */
colCnt = numColsA;
/* matrix multiplication */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += (q63_t) * pIn1++ * *pIn2;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
/* Convert the result from 2.62 to 1.31 format and store in destination buffer */
*px++ = (q31_t) clip_q63_to_q31(sum >> 31);
/* Convert result from 2.62 to 1.31 format and store in destination buffer */
*px++ = (q31_t) (sum >> 31);
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
/* Update the pointer pIn2 to point to the starting address of the next column */
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
#endif
/* Update the pointer pInA to point to the starting address of the next row */
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* set status as ARM_MATH_SUCCESS */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
* @} end of MatrixMult group
@} end of MatrixMult group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_scale_f32.c
* Description: Multiplies a floating-point matrix by a scalar
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,42 +29,42 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @defgroup MatrixScale Matrix Scale
*
* Multiplies a matrix by a scalar. This is accomplished by multiplying each element in the
* matrix by the scalar. For example:
* \image html MatrixScale.gif "Matrix Scaling of a 3 x 3 matrix"
*
* The function checks to make sure that the input and output matrices are of the same size.
*
* In the fixed-point Q15 and Q31 functions, <code>scale</code> is represented by
* a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
* The shift allows the gain of the scaling operation to exceed 1.0.
* The overall scale factor applied to the fixed-point data is
* <pre>
* scale = scaleFract * 2^shift.
* </pre>
@defgroup MatrixScale Matrix Scale
Multiplies a matrix by a scalar. This is accomplished by multiplying each element in the
matrix by the scalar. For example:
\image html MatrixScale.gif "Matrix Scaling of a 3 x 3 matrix"
The function checks to make sure that the input and output matrices are of the same size.
In the fixed-point Q15 and Q31 functions, <code>scale</code> is represented by
a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
The shift allows the gain of the scaling operation to exceed 1.0.
The overall scale factor applied to the fixed-point data is
<pre>
scale = scaleFract * 2^shift.
</pre>
*/
/**
* @addtogroup MatrixScale
* @{
@addtogroup MatrixScale
@{
*/
/**
* @brief Floating-point matrix scaling.
* @param[in] *pSrc points to input matrix structure
* @param[in] scale scale factor to be applied
* @param[out] *pDst points to output matrix structure
* @return The function returns either <code>ARM_MATH_SIZE_MISMATCH</code>
* or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
@brief Floating-point matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scale scale factor to be applied
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_NEON_EXPERIMENTAL)
arm_status arm_mat_scale_f32(
const arm_matrix_instance_f32 * pSrc,
float32_t scale,
@ -76,12 +76,10 @@ arm_status arm_mat_scale_f32(
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix scaling */
#if defined (ARM_MATH_DSP)
float32_t in1, in2, in3, in4; /* temporary variables */
float32_t out1, out2, out3, out4; /* temporary variables */
#endif // #if defined (ARM_MATH_DSP)
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
@ -93,37 +91,23 @@ arm_status arm_mat_scale_f32(
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
float32x4_t vec1;
float32x4_t res;
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Loop Unrolling */
blkCnt = numSamples >> 2;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* Scaling and results are stored in the destination buffer. */
in1 = pIn[0];
in2 = pIn[1];
in3 = pIn[2];
in4 = pIn[3];
out1 = in1 * scale;
out2 = in2 * scale;
out3 = in3 * scale;
out4 = in4 * scale;
pOut[0] = out1;
pOut[1] = out2;
pOut[2] = out3;
pOut[3] = out4;
vec1 = vld1q_f32(pIn);
res = vmulq_f32(vec1, vdupq_n_f32(scale));
vst1q_f32(pOut, res);
/* update pointers to process next sampels */
pIn += 4U;
@ -137,15 +121,6 @@ arm_status arm_mat_scale_f32(
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
@ -163,7 +138,84 @@ arm_status arm_mat_scale_f32(
/* Return to application */
return (status);
}
#else
arm_status arm_mat_scale_f32(
const arm_matrix_instance_f32 * pSrc,
float32_t scale,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* Input data matrix pointer */
float32_t *pOut = pDst->pData; /* Output data matrix pointer */
uint32_t numSamples; /* Total number of elements in the matrix */
uint32_t blkCnt; /* Loop counters */
arm_status status; /* Status of matrix scaling */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* Scale and store result in destination buffer. */
*pOut++ = (*pIn++) * scale;
*pOut++ = (*pIn++) * scale;
*pOut++ = (*pIn++) * scale;
*pOut++ = (*pIn++) * scale;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* Scale and store result in destination buffer. */
*pOut++ = (*pIn++) * scale;
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
* @} end of MatrixScale group
@} end of MatrixScale group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_scale_q15.c
* Description: Multiplies a Q15 matrix by a scalar
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,135 +29,134 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixScale
* @{
@addtogroup MatrixScale
@{
*/
/**
* @brief Q15 matrix scaling.
* @param[in] *pSrc points to input matrix
* @param[in] scaleFract fractional portion of the scale factor
* @param[in] shift number of bits to shift the result by
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
* \par
* The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
* These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
@brief Q15 matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scaleFract fractional portion of the scale factor
@param[in] shift number of bits to shift the result by
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
*/
arm_status arm_mat_scale_q15(
const arm_matrix_instance_q15 * pSrc,
q15_t scaleFract,
int32_t shift,
arm_matrix_instance_q15 * pDst)
q15_t scaleFract,
int32_t shift,
arm_matrix_instance_q15 * pDst)
{
q15_t *pIn = pSrc->pData; /* input data matrix pointer */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
int32_t totShift = 15 - shift; /* total shift to apply after scaling */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix scaling */
q15_t *pIn = pSrc->pData; /* Input data matrix pointer */
q15_t *pOut = pDst->pData; /* Output data matrix pointer */
uint32_t numSamples; /* Total number of elements in the matrix */
uint32_t blkCnt; /* Loop counter */
arm_status status; /* Status of matrix scaling */
int32_t kShift = 15 - shift; /* Total shift to apply after scaling */
#if defined (ARM_MATH_DSP)
q15_t in1, in2, in3, in4;
q31_t out1, out2, out3, out4;
q31_t inA1, inA2;
#endif // #if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t inA1, inA2;
q31_t out1, out2, out3, out4; /* Temporary output variables */
q15_t in1, in2, in3, in4; /* Temporary input variables */
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch */
if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif // #ifdef ARM_MATH_MATRIX_CHECK
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in the input matrix */
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Loop Unrolling */
blkCnt = numSamples >> 2;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Scale, saturate and then store the results in the destination buffer. */
/* Reading 2 inputs from memory */
inA1 = _SIMD32_OFFSET(pIn);
inA2 = _SIMD32_OFFSET(pIn + 2);
/* C = A * scale */
/* Scale the inputs and then store the 2 results in the destination buffer
#if defined (ARM_MATH_DSP)
/* read 2 times 2 samples at a time from source */
inA1 = read_q15x2_ia ((q15_t **) &pIn);
inA2 = read_q15x2_ia ((q15_t **) &pIn);
/* Scale inputs and store result in temporary variables
* in single cycle by packing the outputs */
out1 = (q31_t) ((q15_t) (inA1 >> 16) * scaleFract);
out2 = (q31_t) ((q15_t) inA1 * scaleFract);
out2 = (q31_t) ((q15_t) (inA1 ) * scaleFract);
out3 = (q31_t) ((q15_t) (inA2 >> 16) * scaleFract);
out4 = (q31_t) ((q15_t) inA2 * scaleFract);
out4 = (q31_t) ((q15_t) (inA2 ) * scaleFract);
out1 = out1 >> totShift;
inA1 = _SIMD32_OFFSET(pIn + 4);
out2 = out2 >> totShift;
inA2 = _SIMD32_OFFSET(pIn + 6);
out3 = out3 >> totShift;
out4 = out4 >> totShift;
/* apply shifting */
out1 = out1 >> kShift;
out2 = out2 >> kShift;
out3 = out3 >> kShift;
out4 = out4 >> kShift;
/* saturate the output */
in1 = (q15_t) (__SSAT(out1, 16));
in2 = (q15_t) (__SSAT(out2, 16));
in3 = (q15_t) (__SSAT(out3, 16));
in4 = (q15_t) (__SSAT(out4, 16));
_SIMD32_OFFSET(pOut) = __PKHBT(in2, in1, 16);
_SIMD32_OFFSET(pOut + 2) = __PKHBT(in4, in3, 16);
/* store result to destination */
write_q15x2_ia (&pOut, __PKHBT(in2, in1, 16));
write_q15x2_ia (&pOut, __PKHBT(in4, in3, 16));
/* update pointers to process next sampels */
pIn += 4U;
pOut += 4U;
#else
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
#endif
/* Decrement the numSamples loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Scale, saturate and then store the results in the destination buffer. */
*pOut++ =
(q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> totShift, 16));
/* Decrement the numSamples loop counter */
/* Scale, saturate and store result in destination buffer. */
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@ -167,5 +166,5 @@ arm_status arm_mat_scale_q15(
}
/**
* @} end of MatrixScale group
@} end of MatrixScale group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_scale_q31.c
* Description: Multiplies a Q31 matrix by a scalar
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,152 +29,125 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixScale
* @{
@addtogroup MatrixScale
@{
*/
/**
* @brief Q31 matrix scaling.
* @param[in] *pSrc points to input matrix
* @param[in] scaleFract fractional portion of the scale factor
* @param[in] shift number of bits to shift the result by
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
* \par
* The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
* These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
@brief Q31 matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scaleFract fractional portion of the scale factor
@param[in] shift number of bits to shift the result by
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
These are multiplied to yield a 2.62 intermediate result which is shifted with saturation to 1.31 format.
*/
arm_status arm_mat_scale_q31(
const arm_matrix_instance_q31 * pSrc,
q31_t scaleFract,
int32_t shift,
arm_matrix_instance_q31 * pDst)
q31_t scaleFract,
int32_t shift,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn = pSrc->pData; /* input data matrix pointer */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
int32_t totShift = shift + 1; /* shift to apply after scaling */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix scaling */
q31_t in1, in2, out1; /* temporary variabels */
#if defined (ARM_MATH_DSP)
q31_t in3, in4, out2, out3, out4; /* temporary variables */
#endif // #ifndef ARM_MAT_CM0
q31_t *pIn = pSrc->pData; /* Input data matrix pointer */
q31_t *pOut = pDst->pData; /* Output data matrix pointer */
uint32_t numSamples; /* Total number of elements in the matrix */
uint32_t blkCnt; /* Loop counter */
arm_status status; /* Status of matrix scaling */
int32_t kShift = shift + 1; /* Shift to apply after scaling */
q31_t in, out; /* Temporary variabels */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch */
if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif // #ifdef ARM_MATH_MATRIX_CHECK
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in the input matrix */
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Read values from input */
in1 = *pIn;
in2 = *(pIn + 1);
in3 = *(pIn + 2);
in4 = *(pIn + 3);
/* multiply input with scaler value */
in1 = ((q63_t) in1 * scaleFract) >> 32;
in2 = ((q63_t) in2 * scaleFract) >> 32;
in3 = ((q63_t) in3 * scaleFract) >> 32;
in4 = ((q63_t) in4 * scaleFract) >> 32;
/* Scale, saturate and store result in destination buffer. */
in = *pIn++; /* read four inputs from source */
in = ((q63_t) in * scaleFract) >> 32; /* multiply input with scaler value */
out = in << kShift; /* apply shifting */
if (in != (out >> kShift)) /* saturate the results. */
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out; /* Store result destination */
/* apply shifting */
out1 = in1 << totShift;
out2 = in2 << totShift;
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
/* saturate the results. */
if (in1 != (out1 >> totShift))
out1 = 0x7FFFFFFF ^ (in1 >> 31);
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
if (in2 != (out2 >> totShift))
out2 = 0x7FFFFFFF ^ (in2 >> 31);
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
out3 = in3 << totShift;
out4 = in4 << totShift;
*pOut = out1;
*(pOut + 1) = out2;
if (in3 != (out3 >> totShift))
out3 = 0x7FFFFFFF ^ (in3 >> 31);
if (in4 != (out4 >> totShift))
out4 = 0x7FFFFFFF ^ (in4 >> 31);
*(pOut + 2) = out3;
*(pOut + 3) = out4;
/* update pointers to process next sampels */
pIn += 4U;
pOut += 4U;
/* Decrement the numSamples loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Scale, saturate and then store the results in the destination buffer. */
in1 = *pIn++;
in2 = ((q63_t) in1 * scaleFract) >> 32;
/* Scale, saturate and store result in destination buffer. */
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
out1 = in2 << totShift;
if (in2 != (out1 >> totShift))
out1 = 0x7FFFFFFF ^ (in2 >> 31);
*pOut++ = out1;
/* Decrement the numSamples loop counter */
/* Decrement loop counter */
blkCnt--;
}
@ -187,5 +160,5 @@ arm_status arm_mat_scale_q31(
}
/**
* @} end of MatrixScale group
@} end of MatrixScale group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_sub_f32.c
* Description: Floating-point matrix subtraction
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,34 +29,36 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @defgroup MatrixSub Matrix Subtraction
*
* Subtract two matrices.
* \image html MatrixSubtraction.gif "Subraction of two 3 x 3 matrices"
*
* The functions check to make sure that
* <code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
* number of rows and columns.
@defgroup MatrixSub Matrix Subtraction
Subtract two matrices.
\image html MatrixSubtraction.gif "Subraction of two 3 x 3 matrices"
The functions check to make sure that
<code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
number of rows and columns.
*/
/**
* @addtogroup MatrixSub
* @{
@addtogroup MatrixSub
@{
*/
/**
* @brief Floating-point matrix subtraction
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@brief Floating-point matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_NEON)
arm_status arm_mat_sub_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
@ -66,11 +68,9 @@ arm_status arm_mat_sub_f32(
float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
#if defined (ARM_MATH_DSP)
float32_t inA1, inA2, inB1, inB2, out1, out2; /* temporary variables */
#endif // #if defined (ARM_MATH_DSP)
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
@ -88,71 +88,28 @@ arm_status arm_mat_sub_f32(
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
float32x4_t vec1;
float32x4_t vec2;
float32x4_t res;
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Loop Unrolling */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and then store the results in the destination buffer. */
/* Read values from source A */
inA1 = pIn1[0];
vec1 = vld1q_f32(pIn1);
vec2 = vld1q_f32(pIn2);
res = vsubq_f32(vec1, vec2);
vst1q_f32(pOut, res);
/* Read values from source B */
inB1 = pIn2[0];
/* Read values from source A */
inA2 = pIn1[1];
/* out = sourceA - sourceB */
out1 = inA1 - inB1;
/* Read values from source B */
inB2 = pIn2[1];
/* Read values from source A */
inA1 = pIn1[2];
/* out = sourceA - sourceB */
out2 = inA2 - inB2;
/* Read values from source B */
inB1 = pIn2[2];
/* Store result in destination */
pOut[0] = out1;
pOut[1] = out2;
/* Read values from source A */
inA2 = pIn1[3];
/* Read values from source B */
inB2 = pIn2[3];
/* out = sourceA - sourceB */
out1 = inA1 - inB1;
/* out = sourceA - sourceB */
out2 = inA2 - inB2;
/* Store result in destination */
pOut[2] = out1;
/* Store result in destination */
pOut[3] = out2;
/* update pointers to process next sampels */
/* Update pointers to process next samples */
pIn1 += 4U;
pIn2 += 4U;
pOut += 4U;
@ -165,14 +122,6 @@ arm_status arm_mat_sub_f32(
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
while (blkCnt > 0U)
{
@ -191,7 +140,87 @@ arm_status arm_mat_sub_f32(
/* Return to application */
return (status);
}
#else
arm_status arm_mat_sub_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and store result in destination buffer. */
*pOut++ = (*pInA++) - (*pInB++);
*pOut++ = (*pInA++) - (*pInB++);
*pOut++ = (*pInA++) - (*pInB++);
*pOut++ = (*pInA++) - (*pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and store result in destination buffer. */
*pOut++ = (*pInA++) - (*pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
* @} end of MatrixSub group
@} end of MatrixSub group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_sub_q15.c
* Description: Q15 Matrix subtraction
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,112 +29,108 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixSub
* @{
@addtogroup MatrixSub
@{
*/
/**
* @brief Q15 matrix subtraction.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
@brief Q15 matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
arm_status arm_mat_sub_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
arm_matrix_instance_q15 * pDst)
{
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in the input matrix */
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Apply loop unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract, Saturate and then store the results in the destination buffer. */
*__SIMD32(pOut)++ = __QSUB16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
*__SIMD32(pOut)++ = __QSUB16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
/* Decrement the loop counter */
/* Subtract, Saturate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
write_q15x2_ia (&pOut, __QSUB16(read_q15x2_ia ((q15_t **) &pInA), read_q15x2_ia ((q15_t **) &pInB)));
write_q15x2_ia (&pOut, __QSUB16(read_q15x2_ia ((q15_t **) &pInA), read_q15x2_ia ((q15_t **) &pInB)));
#else
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and then store the results in the destination buffer. */
*pOut++ = (q15_t) __QSUB16(*pInA++, *pInB++);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and then store the results in the destination buffer. */
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
/* Decrement the loop counter */
/* Subtract and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pOut++ = (q15_t) __QSUB16(*pInA++, *pInB++);
#else
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@ -144,5 +140,5 @@ arm_status arm_mat_sub_q15(
}
/**
* @} end of MatrixSub group
@} end of MatrixSub group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_sub_q31.c
* Description: Q31 matrix subtraction
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,157 +29,100 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixSub
* @{
@addtogroup MatrixSub
@{
*/
/**
* @brief Q31 matrix subtraction.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
*/
@brief Q31 matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
arm_status arm_mat_sub_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t inA1, inB1; /* temporary variables */
#if defined (ARM_MATH_DSP)
q31_t inA2, inB2; /* temporary variables */
q31_t out1, out2; /* temporary variables */
#endif // #if defined (ARM_MATH_DSP)
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in the input matrix */
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract, saturate and then store the results in the destination buffer. */
/* Read values from source A */
inA1 = pIn1[0];
*pOut++ = __QSUB(*pInA++, *pInB++);
/* Read values from source B */
inB1 = pIn2[0];
*pOut++ = __QSUB(*pInA++, *pInB++);
/* Read values from source A */
inA2 = pIn1[1];
*pOut++ = __QSUB(*pInA++, *pInB++);
/* Subtract and saturate */
out1 = __QSUB(inA1, inB1);
*pOut++ = __QSUB(*pInA++, *pInB++);
/* Read values from source B */
inB2 = pIn2[1];
/* Read values from source A */
inA1 = pIn1[2];
/* Subtract and saturate */
out2 = __QSUB(inA2, inB2);
/* Read values from source B */
inB1 = pIn2[2];
/* Store result in destination */
pOut[0] = out1;
pOut[1] = out2;
/* Read values from source A */
inA2 = pIn1[3];
/* Read values from source B */
inB2 = pIn2[3];
/* Subtract and saturate */
out1 = __QSUB(inA1, inB1);
/* Subtract and saturate */
out2 = __QSUB(inA2, inB2);
/* Store result in destination */
pOut[2] = out1;
pOut[3] = out2;
/* update pointers to process next samples */
pIn1 += 4U;
pIn2 += 4U;
pOut += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract, saturate and then store the results in the destination buffer. */
inA1 = *pIn1++;
inB1 = *pIn2++;
inA1 = __QSUB(inA1, inB1);
/* Subtract, saturate and store result in destination buffer. */
*pOut++ = __QSUB(*pInA++, *pInB++);
*pOut++ = inA1;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
@ -192,5 +135,5 @@ arm_status arm_mat_sub_q31(
}
/**
* @} end of MatrixSub group
@} end of MatrixSub group
*/

View file

@ -3,8 +3,8 @@
* Title: arm_mat_trans_f32.c
* Description: Floating-point matrix transpose
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
@ -26,33 +26,36 @@
* limitations under the License.
*/
/**
* @defgroup MatrixTrans Matrix Transpose
*
* Tranposes a matrix.
* Transposing an <code>M x N</code> matrix flips it around the center diagonal and results in an <code>N x M</code> matrix.
* \image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
*/
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixTrans
* @{
@defgroup MatrixTrans Matrix Transpose
Tranposes a matrix.
Transposing an <code>M x N</code> matrix flips it around the center diagonal and results in an <code>N x M</code> matrix.
\image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
*/
/**
* @brief Floating-point matrix transpose.
* @param[in] *pSrc points to the input matrix
* @param[out] *pDst points to the output matrix
* @return The function returns either <code>ARM_MATH_SIZE_MISMATCH</code>
* or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
@addtogroup MatrixTrans
@{
*/
/**
@brief Floating-point matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_NEON)
arm_status arm_mat_trans_f32(
const arm_matrix_instance_f32 * pSrc,
@ -64,17 +67,11 @@ arm_status arm_mat_trans_f32(
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nColumns = pSrc->numCols; /* number of columns */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint16_t blkCnt, i = 0U, row = nRows; /* loop counters */
uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
@ -86,41 +83,44 @@ arm_status arm_mat_trans_f32(
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
/* Row loop */
rowCnt = row >> 2;
while (rowCnt > 0U)
{
/* Loop Unrolling */
float32x4_t row0V,row1V,row2V,row3V;
float32x4x2_t ra0,ra1,rb0,rb1;
blkCnt = nColumns >> 2;
/* The pointer px is set to starting address of the column being processed */
px = pOut + i;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) /* column loop */
while (blkCnt > 0U) /* Column loop */
{
/* Read and store the input element in the destination */
*px = *pIn++;
row0V = vld1q_f32(pIn);
row1V = vld1q_f32(pIn + 1 * nColumns);
row2V = vld1q_f32(pIn + 2 * nColumns);
row3V = vld1q_f32(pIn + 3 * nColumns);
pIn += 4;
/* Update the pointer px to point to the next row of the transposed matrix */
ra0 = vzipq_f32(row0V,row2V);
ra1 = vzipq_f32(row1V,row3V);
rb0 = vzipq_f32(ra0.val[0],ra1.val[0]);
rb1 = vzipq_f32(ra0.val[1],ra1.val[1]);
vst1q_f32(px,rb0.val[0]);
px += nRows;
/* Read and store the input element in the destination */
*px = *pIn++;
/* Update the pointer px to point to the next row of the transposed matrix */
vst1q_f32(px,rb0.val[1]);
px += nRows;
/* Read and store the input element in the destination */
*px = *pIn++;
/* Update the pointer px to point to the next row of the transposed matrix */
vst1q_f32(px,rb1.val[0]);
px += nRows;
/* Read and store the input element in the destination */
*px = *pIn++;
/* Update the pointer px to point to the next row of the transposed matrix */
vst1q_f32(px,rb1.val[1]);
px += nRows;
/* Decrement the column loop counter */
@ -133,46 +133,34 @@ arm_status arm_mat_trans_f32(
while (blkCnt > 0U)
{
/* Read and store the input element in the destination */
*px = *pIn++;
/* Update the pointer px to point to the next row of the transposed matrix */
px += nRows;
*px++ = *pIn;
*px++ = *(pIn + 1 * nColumns);
*px++ = *(pIn + 2 * nColumns);
*px++ = *(pIn + 3 * nColumns);
px += (nRows - 4);
pIn++;
/* Decrement the column loop counter */
blkCnt--;
}
#else
i += 4;
pIn += 3 * nColumns;
/* Run the below code for Cortex-M0 */
/* Decrement the row loop counter */
rowCnt--;
uint16_t col, i = 0U, row = nRows; /* loop counters */
arm_status status; /* status of matrix transpose */
} /* Row loop end */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
rowCnt = row & 3;
while (rowCnt > 0U)
{
blkCnt = nColumns ;
/* The pointer px is set to starting address of the column being processed */
px = pOut + i;
/* Initialize column loop counter */
col = nColumns;
while (col > 0U)
while (blkCnt > 0U)
{
/* Read and store the input element in the destination */
*px = *pIn++;
@ -181,17 +169,11 @@ arm_status arm_mat_trans_f32(
px += nRows;
/* Decrement the column loop counter */
col--;
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
i++;
/* Decrement the row loop counter */
row--;
} while (row > 0U); /* row loop end */
rowCnt -- ;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
@ -200,6 +182,102 @@ arm_status arm_mat_trans_f32(
/* Return to application */
return (status);
}
#else
arm_status arm_mat_trans_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* input data matrix pointer */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nCols = pSrc->numCols; /* number of columns */
uint32_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* Pointer px is set to starting address of column being processed */
px = pOut + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
col = nCols >> 2U;
while (col > 0U) /* column loop */
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
/* Decrement column loop counter */
col--;
}
/* Loop unrolling: Compute remaining outputs */
col = nCols % 0x4U;
#else
/* Initialize col with number of samples */
col = nCols;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U); /* row loop end */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
* @} end of MatrixTrans group

View file

@ -3,13 +3,13 @@
* Title: arm_mat_trans_q15.c
* Description: Q15 matrix transpose
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,244 +29,154 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixTrans
* @{
@addtogroup MatrixTrans
@{
*/
/*
* @brief Q15 matrix transpose.
* @param[in] *pSrc points to the input matrix
* @param[out] *pDst points to the output matrix
* @return The function returns either <code>ARM_MATH_SIZE_MISMATCH</code>
* or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
/**
@brief Q15 matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
arm_status arm_mat_trans_q15(
const arm_matrix_instance_q15 * pSrc,
arm_matrix_instance_q15 * pDst)
arm_matrix_instance_q15 * pDst)
{
q15_t *pSrcA = pSrc->pData; /* input data matrix pointer */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of nRows */
uint16_t nColumns = pSrc->numCols; /* number of nColumns */
uint16_t col, row = nRows, i = 0U; /* row and column loop counters */
arm_status status; /* status of matrix transpose */
q15_t *pIn = pSrc->pData; /* input data matrix pointer */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nCols = pSrc->numCols; /* number of columns */
uint32_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
#ifndef UNALIGNED_SUPPORT_DISABLE
q31_t in; /* variable to hold temporary output */
#else
q15_t in;
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
#if defined (ARM_MATH_LOOPUNROLL)
q31_t in; /* variable to hold temporary output */
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
/* row loop */
do
{
/* Apply loop unrolling and exchange the columns with row elements */
col = nColumns >> 2U;
/* The pointer pOut is set to starting address of the column being processed */
/* Pointer pOut is set to starting address of column being processed */
pOut = pDst->pData + i;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
col = nCols >> 2U;
while (col > 0U) /* column loop */
{
#ifndef UNALIGNED_SUPPORT_DISABLE
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pIn);
/* Read two elements from the row */
in = *__SIMD32(pSrcA)++;
/* Unpack and store one element in the destination */
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) in;
#else
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*pOut = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pIn);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) in;
#else
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer pOut to point to the next row of the transposed matrix */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Unpack and store the second element in the destination */
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*pOut = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer pOut to point to the next row of the transposed matrix */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Read two elements from the row */
#ifndef ARM_MATH_BIG_ENDIAN
in = *__SIMD32(pSrcA)++;
#else
in = *__SIMD32(pSrcA)++;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Unpack and store one element in the destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) in;
#else
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer pOut to point to the next row of the transposed matrix */
pOut += nRows;
/* Unpack and store the second element in the destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*pOut = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
#else
/* Read one element from the row */
in = *pSrcA++;
/* Store one element in the destination */
*pOut = in;
/* Update the pointer px to point to the next row of the transposed matrix */
pOut += nRows;
/* Read one element from the row */
in = *pSrcA++;
/* Store one element in the destination */
*pOut = in;
/* Update the pointer px to point to the next row of the transposed matrix */
pOut += nRows;
/* Read one element from the row */
in = *pSrcA++;
/* Store one element in the destination */
*pOut = in;
/* Update the pointer px to point to the next row of the transposed matrix */
pOut += nRows;
/* Read one element from the row */
in = *pSrcA++;
/* Store one element in the destination */
*pOut = in;
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
/* Update the pointer pOut to point to the next row of the transposed matrix */
pOut += nRows;
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
}
/* Perform matrix transpose for last 3 samples here. */
col = nColumns % 0x4U;
/* Loop unrolling: Compute remaining outputs */
col = nCols % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize col with number of samples */
col = nCols;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* The pointer pOut is set to starting address of the column being processed */
pOut = pDst->pData + i;
/* Initialize column loop counter */
col = nColumns;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read and store the input element in the destination */
*pOut = *pSrcA++;
/* Read and store input element in destination */
*pOut = *pIn++;
/* Update the pointer pOut to point to the next row of the transposed matrix */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
} while (row > 0U); /* row loop end */
/* set status as ARM_MATH_SUCCESS */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
* @} end of MatrixTrans group
@} end of MatrixTrans group
*/

View file

@ -3,13 +3,13 @@
* Title: arm_mat_trans_q31.c
* Description: Q31 matrix transpose
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -29,163 +29,111 @@
#include "arm_math.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixTrans
* @{
@addtogroup MatrixTrans
@{
*/
/*
* @brief Q31 matrix transpose.
* @param[in] *pSrc points to the input matrix
* @param[out] *pDst points to the output matrix
* @return The function returns either <code>ARM_MATH_SIZE_MISMATCH</code>
* or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
/**
@brief Q31 matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
arm_status arm_mat_trans_q31(
const arm_matrix_instance_q31 * pSrc,
arm_matrix_instance_q31 * pDst)
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn = pSrc->pData; /* input data matrix pointer */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *pIn = pSrc->pData; /* input data matrix pointer */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of nRows */
uint16_t nColumns = pSrc->numCols; /* number of nColumns */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint16_t blkCnt, i = 0U, row = nRows; /* loop counters */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nCols = pSrc->numCols; /* number of columns */
uint32_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
/* row loop */
do
{
/* Apply loop unrolling and exchange the columns with row elements */
blkCnt = nColumns >> 2U;
/* The pointer px is set to starting address of the column being processed */
/* Pointer px is set to starting address of column being processed */
px = pOut + i;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
col = nCols >> 2U;
while (col > 0U) /* column loop */
{
/* Read and store the input element in the destination */
/* Read and store input element in destination */
*px = *pIn++;
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
/* Read and store the input element in the destination */
*px = *pIn++;
/* Update the pointer px to point to the next row of the transposed matrix */
px += nRows;
/* Read and store the input element in the destination */
*px = *pIn++;
/* Update the pointer px to point to the next row of the transposed matrix */
px += nRows;
/* Read and store the input element in the destination */
*px = *pIn++;
/* Update the pointer px to point to the next row of the transposed matrix */
px += nRows;
/* Decrement the column loop counter */
blkCnt--;
}
/* Perform matrix transpose for last 3 samples here. */
blkCnt = nColumns % 0x4U;
while (blkCnt > 0U)
{
/* Read and store the input element in the destination */
*px = *pIn++;
/* Update the pointer px to point to the next row of the transposed matrix */
px += nRows;
/* Decrement the column loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
uint16_t col, i = 0U, row = nRows; /* loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* The pointer px is set to starting address of the column being processed */
px = pOut + i;
/* Initialize column loop counter */
col = nColumns;
while (col > 0U)
{
/* Read and store the input element in the destination */
*px = *pIn++;
/* Update the pointer px to point to the next row of the transposed matrix */
px += nRows;
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Loop unrolling: Compute remaining outputs */
col = nCols % 0x4U;
#else
/* Initialize col with number of samples */
col = nCols;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
}
while (row > 0U); /* row loop end */
} while (row > 0U); /* row loop end */
/* set status as ARM_MATH_SUCCESS */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@ -194,5 +142,5 @@ arm_status arm_mat_trans_q31(
}
/**
* @} end of MatrixTrans group
@} end of MatrixTrans group
*/