Official ARM version: v5.6.0

2020-07-28 11:24:49 +01:00 · 2020-07-28 11:24:49 +01:00 · 96d6da4e25
commit 96d6da4e25
parent 9f95ff5b6b
2939 changed files with 339304 additions and 113320 deletions
--- a/DSP/Source/BasicMathFunctions/BasicMathFunctions.c
+++ b/DSP/Source/BasicMathFunctions/BasicMathFunctions.c
@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        BasicMathFunctions.c
+ * Description:  Combination of all basic math function source files.
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_abs_f32.c"
+#include "arm_abs_q15.c"
+#include "arm_abs_q31.c"
+#include "arm_abs_q7.c"
+#include "arm_add_f32.c"
+#include "arm_add_q15.c"
+#include "arm_add_q31.c"
+#include "arm_add_q7.c"
+#include "arm_dot_prod_f32.c"
+#include "arm_dot_prod_q15.c"
+#include "arm_dot_prod_q31.c"
+#include "arm_dot_prod_q7.c"
+#include "arm_mult_f32.c"
+#include "arm_mult_q15.c"
+#include "arm_mult_q31.c"
+#include "arm_mult_q7.c"
+#include "arm_negate_f32.c"
+#include "arm_negate_q15.c"
+#include "arm_negate_q31.c"
+#include "arm_negate_q7.c"
+#include "arm_offset_f32.c"
+#include "arm_offset_q15.c"
+#include "arm_offset_q31.c"
+#include "arm_offset_q7.c"
+#include "arm_scale_f32.c"
+#include "arm_scale_q15.c"
+#include "arm_scale_q31.c"
+#include "arm_scale_q7.c"
+#include "arm_shift_q15.c"
+#include "arm_shift_q31.c"
+#include "arm_shift_q7.c"
+#include "arm_sub_f32.c"
+#include "arm_sub_q15.c"
+#include "arm_sub_q31.c"
+#include "arm_sub_q7.c"
--- a/DSP/Source/BasicMathFunctions/CMakeLists.txt
+++ b/DSP/Source/BasicMathFunctions/CMakeLists.txt
@ -0,0 +1,16 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPBasicMath)
+
+
+file(GLOB SRC "./*_*.c")
+
+add_library(CMSISDSPBasicMath STATIC ${SRC})
+
+configdsp(CMSISDSPBasicMath ..)
+
+### Includes
+target_include_directories(CMSISDSPBasicMath PUBLIC "${DSP}/../../Include")
+
+
+
--- a/DSP/Source/BasicMathFunctions/arm_abs_f32.c
+++ b/DSP/Source/BasicMathFunctions/arm_abs_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_abs_f32.c
 * Description:  Floating-point vector absolute value
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -30,124 +30,117 @@
 #include <math.h>

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @defgroup BasicAbs Vector Absolute Value
- *
- * Computes the absolute value of a vector on an element-by-element basis.
- *
- * <pre>
- *     pDst[n] = abs(pSrc[n]),   0 <= n < blockSize.
- * </pre>
- *
- * The functions support in-place computation allowing the source and
- * destination pointers to reference the same memory buffer.
- * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+  @defgroup BasicAbs Vector Absolute Value
+
+  Computes the absolute value of a vector on an element-by-element basis.
+
+  <pre>
+      pDst[n] = abs(pSrc[n]),   0 <= n < blockSize.
+  </pre>
+
+  The functions support in-place computation allowing the source and
+  destination pointers to reference the same memory buffer.
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */

 /**
- * @addtogroup BasicAbs
- * @{
+  @addtogroup BasicAbs
+  @{
 */

 /**
- * @brief Floating-point vector absolute value.
- * @param[in]       *pSrc points to the input buffer
- * @param[out]      *pDst points to the output buffer
- * @param[in]       blockSize number of samples in each vector
- * @return none.
+  @brief         Floating-point vector absolute value.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
 */

 void arm_abs_f32(
-  float32_t * pSrc,
-  float32_t * pDst,
-  uint32_t blockSize)
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */

-#if defined (ARM_MATH_DSP)
+#if defined(ARM_MATH_NEON)
+    float32x4_t vec1;
+    float32x4_t res;

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  float32_t in1, in2, in3, in4;                  /* temporary variables */
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;

-  /*loop Unrolling */
+    while (blkCnt > 0U)
+    {
+        /* C = |A| */
+
+    	/* Calculate absolute values and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrc);
+        res = vabsq_f32(vec1);
+        vst1q_f32(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+        
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = |A| */
-    /* Calculate absolute and then store the results in the destination buffer. */
-    /* read sample from source */
-    in1 = *pSrc;
-    in2 = *(pSrc + 1);
-    in3 = *(pSrc + 2);

-    /* find absolute value */
-    in1 = fabsf(in1);
+    /* Calculate absolute and store result in destination buffer. */
+    *pDst++ = fabsf(*pSrc++);

-    /* read sample from source */
-    in4 = *(pSrc + 3);
+    *pDst++ = fabsf(*pSrc++);

-    /* find absolute value */
-    in2 = fabsf(in2);
+    *pDst++ = fabsf(*pSrc++);

-    /* read sample from source */
-    *pDst = in1;
+    *pDst++ = fabsf(*pSrc++);

-    /* find absolute value */
-    in3 = fabsf(in3);
-
-    /* find absolute value */
-    in4 = fabsf(in4);
-
-    /* store result to destination */
-    *(pDst + 1) = in2;
-
-    /* store result to destination */
-    *(pDst + 2) = in3;
-
-    /* store result to destination */
-    *(pDst + 3) = in4;
-
-
-    /* Update source pointer to process next sampels */
-    pSrc += 4U;
-
-    /* Update destination pointer to process next sampels */
-    pDst += 4U;
-
-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /*   #if defined (ARM_MATH_DSP)   */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */

  while (blkCnt > 0U)
  {
    /* C = |A| */
-    /* Calculate absolute and then store the results in the destination buffer. */
+
+    /* Calculate absolute and store result in destination buffer. */
    *pDst++ = fabsf(*pSrc++);

-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of BasicAbs group
+  @} end of BasicAbs group
 */
--- a/DSP/Source/BasicMathFunctions/arm_abs_q15.c
+++ b/DSP/Source/BasicMathFunctions/arm_abs_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_abs_q15.c
 * Description:  Q15 vector absolute value
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,139 +29,104 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup BasicAbs
- * @{
+  @addtogroup BasicAbs
+  @{
 */

 /**
- * @brief Q15 vector absolute value.
- * @param[in]       *pSrc points to the input buffer
- * @param[out]      *pDst points to the output buffer
- * @param[in]       blockSize number of samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
+  @brief         Q15 vector absolute value.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
 */

 void arm_abs_q15(
-  q15_t * pSrc,
-  q15_t * pDst,
-  uint32_t blockSize)
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */
+        q15_t in;                                      /* Temporary input variable */

-#if defined (ARM_MATH_DSP)
-  __SIMD32_TYPE *simd;
+#if defined (ARM_MATH_LOOPUNROLL)

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-
-  q15_t in1;                                     /* Input value1 */
-  q15_t in2;                                     /* Input value2 */
-
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
-  simd = __SIMD32_CONST(pDst);
  while (blkCnt > 0U)
  {
    /* C = |A| */
-    /* Read two inputs */
-    in1 = *pSrc++;
-    in2 = *pSrc++;
-
-
-    /* Store the Absolute result in the destination buffer by packing the two values, in a single cycle */
-#ifndef  ARM_MATH_BIG_ENDIAN
-    *simd++ =
-      __PKHBT(((in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1)),
-              ((in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2)), 16);

+    /* Calculate absolute of input (if -1 then saturated to 0x7fff) and store result in destination buffer. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
 #else
+    *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif

-
-    *simd++ =
-      __PKHBT(((in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2)),
-              ((in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1)), 16);
-
-#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */
-
-    in1 = *pSrc++;
-    in2 = *pSrc++;
-
-
-#ifndef  ARM_MATH_BIG_ENDIAN
-
-    *simd++ =
-      __PKHBT(((in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1)),
-              ((in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2)), 16);
-
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
 #else
+    *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif

+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif

-    *simd++ =
-      __PKHBT(((in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2)),
-              ((in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1)), 16);
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif

-#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */
-
-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }
-  pDst = (q15_t *)simd;

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C = |A| */
-    /* Read the input */
-    in1 = *pSrc++;
-
-    /* Calculate absolute value of input and then store the result in the destination buffer. */
-    *pDst++ = (in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1);
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
-
-  q15_t in;                                      /* Temporary input variable */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
  while (blkCnt > 0U)
  {
    /* C = |A| */
-    /* Read the input */
+
+    /* Calculate absolute of input (if -1 then saturated to 0x7fff) and store result in destination buffer. */
    in = *pSrc++;
-
-    /* Calculate absolute value of input and then store the result in the destination buffer. */
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
+#else
    *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif

-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of BasicAbs group
+  @} end of BasicAbs group
 */
--- a/DSP/Source/BasicMathFunctions/arm_abs_q31.c
+++ b/DSP/Source/BasicMathFunctions/arm_abs_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_abs_q31.c
 * Description:  Q31 vector absolute value
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,90 +29,104 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup BasicAbs
- * @{
+  @addtogroup BasicAbs
+  @{
 */

-
 /**
- * @brief Q31 vector absolute value.
- * @param[in]       *pSrc points to the input buffer
- * @param[out]      *pDst points to the output buffer
- * @param[in]       blockSize number of samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
+  @brief         Q31 vector absolute value.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
 */

 void arm_abs_q31(
-  q31_t * pSrc,
-  q31_t * pDst,
-  uint32_t blockSize)
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
-  q31_t in;                                      /* Input value */
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t in;                                      /* Temporary variable */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  q31_t in1, in2, in3, in4;
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = |A| */
-    /* Calculate absolute of input (if -1 then saturated to 0x7fffffff) and then store the results in the destination buffer. */
-    in1 = *pSrc++;
-    in2 = *pSrc++;
-    in3 = *pSrc++;
-    in4 = *pSrc++;

-    *pDst++ = (in1 > 0) ? in1 : (q31_t)__QSUB(0, in1);
-    *pDst++ = (in2 > 0) ? in2 : (q31_t)__QSUB(0, in2);
-    *pDst++ = (in3 > 0) ? in3 : (q31_t)__QSUB(0, in3);
-    *pDst++ = (in4 > 0) ? in4 : (q31_t)__QSUB(0, in4);
+    /* Calculate absolute of input (if -1 then saturated to 0x7fffffff) and store result in destination buffer. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif

-    /* Decrement the loop counter */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /*   #if defined (ARM_MATH_DSP)   */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* C = |A| */
-    /* Calculate absolute value of the input (if -1 then saturated to 0x7fffffff) and then store the results in the destination buffer. */
-    in = *pSrc++;
-    *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);

-    /* Decrement the loop counter */
+    /* Calculate absolute of input (if -1 then saturated to 0x7fffffff) and store result in destination buffer. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

 }

 /**
- * @} end of BasicAbs group
+  @} end of BasicAbs group
 */
--- a/DSP/Source/BasicMathFunctions/arm_abs_q7.c
+++ b/DSP/Source/BasicMathFunctions/arm_abs_q7.c
@ -3,13 +3,13 @@
 * Title:        arm_abs_q7.c
 * Description:  Q7 vector absolute value
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,117 +29,106 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup BasicAbs
- * @{
+  @addtogroup BasicAbs
+  @{
 */

 /**
- * @brief Q7 vector absolute value.
- * @param[in]       *pSrc points to the input buffer
- * @param[out]      *pDst points to the output buffer
- * @param[in]       blockSize number of samples in each vector
- * @return none.
- *
- * \par Conditions for optimum performance
- *  Input and output buffers should be aligned by 32-bit
- *
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
+  @brief         Q7 vector absolute value.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Conditions for optimum performance
+                   Input and output buffers should be aligned by 32-bit
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
 */

 void arm_abs_q7(
-  q7_t * pSrc,
-  q7_t * pDst,
-  uint32_t blockSize)
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
-  q7_t in;                                       /* Input value1 */
+        uint32_t blkCnt;                               /* Loop counter */
+        q7_t in;                                       /* Temporary input variable */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  q31_t in1, in2, in3, in4;                      /* temporary input variables */
-  q31_t out1, out2, out3, out4;                  /* temporary output variables */
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = |A| */
-    /* Read inputs */
-    in1 = (q31_t) * pSrc;
-    in2 = (q31_t) * (pSrc + 1);
-    in3 = (q31_t) * (pSrc + 2);

-    /* find absolute value */
-    out1 = (in1 > 0) ? in1 : (q31_t)__QSUB8(0, in1);
+    /* Calculate absolute of input (if -1 then saturated to 0x7f) and store result in destination buffer. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q7_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif

-    /* read input */
-    in4 = (q31_t) * (pSrc + 3);
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q7_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif

-    /* find absolute value */
-    out2 = (in2 > 0) ? in2 : (q31_t)__QSUB8(0, in2);
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q7_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif

-    /* store result to destination */
-    *pDst = (q7_t) out1;
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q7_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif

-    /* find absolute value */
-    out3 = (in3 > 0) ? in3 : (q31_t)__QSUB8(0, in3);
-
-    /* find absolute value */
-    out4 = (in4 > 0) ? in4 : (q31_t)__QSUB8(0, in4);
-
-    /* store result to destination */
-    *(pDst + 1) = (q7_t) out2;
-
-    /* store result to destination */
-    *(pDst + 2) = (q7_t) out3;
-
-    /* store result to destination */
-    *(pDst + 3) = (q7_t) out4;
-
-    /* update pointers to process next samples */
-    pSrc += 4U;
-    pDst += 4U;
-
-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;
+
 #else

-  /* Run the below code for Cortex-M0 */
+  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /* #define ARM_MATH_CM0_FAMILY */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* C = |A| */
-    /* Read the input */
+
+    /* Calculate absolute of input (if -1 then saturated to 0x7f) and store result in destination buffer. */
    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q7_t) __QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif

-    /* Store the Absolute result in the destination buffer */
-    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? 0x7f : -in);
-
-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of BasicAbs group
+  @} end of BasicAbs group
 */
--- a/DSP/Source/BasicMathFunctions/arm_add_f32.c
+++ b/DSP/Source/BasicMathFunctions/arm_add_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_add_f32.c
 * Description:  Floating-point vector addition
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,110 +29,117 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @defgroup BasicAdd Vector Addition
- *
- * Element-by-element addition of two vectors.
- *
- * <pre>
- *     pDst[n] = pSrcA[n] + pSrcB[n],   0 <= n < blockSize.
- * </pre>
- *
- * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+  @defgroup BasicAdd Vector Addition
+
+  Element-by-element addition of two vectors.
+
+  <pre>
+      pDst[n] = pSrcA[n] + pSrcB[n],   0 <= n < blockSize.
+  </pre>
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */

 /**
- * @addtogroup BasicAdd
- * @{
+  @addtogroup BasicAdd
+  @{
 */

 /**
- * @brief Floating-point vector addition.
- * @param[in]       *pSrcA points to the first input vector
- * @param[in]       *pSrcB points to the second input vector
- * @param[out]      *pDst points to the output vector
- * @param[in]       blockSize number of samples in each vector
- * @return none.
+  @brief         Floating-point vector addition.
+  @param[in]     pSrcA      points to first input vector
+  @param[in]     pSrcB      points to second input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
 */

 void arm_add_f32(
-  float32_t * pSrcA,
-  float32_t * pSrcB,
-  float32_t * pDst,
-  uint32_t blockSize)
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */

-#if defined (ARM_MATH_DSP)
+#if defined(ARM_MATH_NEON)
+    float32x4_t vec1;
+    float32x4_t vec2;
+    float32x4_t res;

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  float32_t inA1, inA2, inA3, inA4;              /* temporary input variabels */
-  float32_t inB1, inB2, inB3, inB4;              /* temporary input variables */
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;

-  /*loop Unrolling */
+    while (blkCnt > 0U)
+    {
+        /* C = A + B */
+
+    	/* Add and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrcA);
+        vec2 = vld1q_f32(pSrcB);
+        res = vaddq_f32(vec1, vec2);
+        vst1q_f32(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4; 
+        pDst += 4;
+        
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A + B */
-    /* Add and then store the results in the destination buffer. */

-    /* read four inputs from sourceA and four inputs from sourceB */
-    inA1 = *pSrcA;
-    inB1 = *pSrcB;
-    inA2 = *(pSrcA + 1);
-    inB2 = *(pSrcB + 1);
-    inA3 = *(pSrcA + 2);
-    inB3 = *(pSrcB + 2);
-    inA4 = *(pSrcA + 3);
-    inB4 = *(pSrcB + 3);
+    /* Add and store result in destination buffer. */
+    *pDst++ = (*pSrcA++) + (*pSrcB++);
+    *pDst++ = (*pSrcA++) + (*pSrcB++);
+    *pDst++ = (*pSrcA++) + (*pSrcB++);
+    *pDst++ = (*pSrcA++) + (*pSrcB++);

-    /* C = A + B */
-    /* add and store result to destination */
-    *pDst = inA1 + inB1;
-    *(pDst + 1) = inA2 + inB2;
-    *(pDst + 2) = inA3 + inB3;
-    *(pDst + 3) = inA4 + inB4;
-
-    /* update pointers to process next samples */
-    pSrcA += 4U;
-    pSrcB += 4U;
-    pDst += 4U;
-
-
-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */

  while (blkCnt > 0U)
  {
    /* C = A + B */
-    /* Add and then store the results in the destination buffer. */
+
+    /* Add and store result in destination buffer. */
    *pDst++ = (*pSrcA++) + (*pSrcB++);

-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of BasicAdd group
+  @} end of BasicAdd group
 */
--- a/DSP/Source/BasicMathFunctions/arm_add_q15.c
+++ b/DSP/Source/BasicMathFunctions/arm_add_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_add_q15.c
 * Description:  Q15 vector addition
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,100 +29,98 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup BasicAdd
- * @{
+  @addtogroup BasicAdd
+  @{
 */

 /**
- * @brief Q15 vector addition.
- * @param[in]       *pSrcA points to the first input vector
- * @param[in]       *pSrcB points to the second input vector
- * @param[out]      *pDst points to the output vector
- * @param[in]       blockSize number of samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+  @brief         Q15 vector addition.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */

 void arm_add_q15(
-  q15_t * pSrcA,
-  q15_t * pSrcB,
-  q15_t * pDst,
-  uint32_t blockSize)
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)

 #if defined (ARM_MATH_DSP)
+  q31_t inA1, inA2;
+  q31_t inB1, inB2;
+#endif

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  q31_t inA1, inA2, inB1, inB2;
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A + B */
-    /* Add and then store the results in the destination buffer. */
-    inA1 = *__SIMD32(pSrcA)++;
-    inA2 = *__SIMD32(pSrcA)++;
-    inB1 = *__SIMD32(pSrcB)++;
-    inB2 = *__SIMD32(pSrcB)++;

-    *__SIMD32(pDst)++ = __QADD16(inA1, inB1);
-    *__SIMD32(pDst)++ = __QADD16(inA2, inB2);
+#if defined (ARM_MATH_DSP)
+    /* read 2 times 2 samples at a time from sourceA */
+    inA1 = read_q15x2_ia ((q15_t **) &pSrcA);
+    inA2 = read_q15x2_ia ((q15_t **) &pSrcA);
+    /* read 2 times 2 samples at a time from sourceB */
+    inB1 = read_q15x2_ia ((q15_t **) &pSrcB);
+    inB2 = read_q15x2_ia ((q15_t **) &pSrcB);

-    /* Decrement the loop counter */
+    /* Add and store 2 times 2 samples at a time */
+    write_q15x2_ia (&pDst, __QADD16(inA1, inB1));
+    write_q15x2_ia (&pDst, __QADD16(inA2, inB2));
+#else
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C = A + B */
-    /* Add and then store the results in the destination buffer. */
-    *pDst++ = (q15_t) __QADD16(*pSrcA++, *pSrcB++);
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
-
-
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
  while (blkCnt > 0U)
  {
    /* C = A + B */
-    /* Add and then store the results in the destination buffer. */
-    *pDst++ = (q15_t) __SSAT(((q31_t) * pSrcA++ + *pSrcB++), 16);

-    /* Decrement the loop counter */
+    /* Add and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (q15_t) __QADD16(*pSrcA++, *pSrcB++);
+#else
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
-
 }

 /**
- * @} end of BasicAdd group
+  @} end of BasicAdd group
 */
--- a/DSP/Source/BasicMathFunctions/arm_add_q31.c
+++ b/DSP/Source/BasicMathFunctions/arm_add_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_add_q31.c
 * Description:  Q31 vector addition
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,108 +29,80 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup BasicAdd
- * @{
+  @addtogroup BasicAdd
+  @{
 */

-
 /**
- * @brief Q31 vector addition.
- * @param[in]       *pSrcA points to the first input vector
- * @param[in]       *pSrcB points to the second input vector
- * @param[out]      *pDst points to the output vector
- * @param[in]       blockSize number of samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
+  @brief         Q31 vector addition.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */

 void arm_add_q31(
-  q31_t * pSrcA,
-  q31_t * pSrcB,
-  q31_t * pDst,
-  uint32_t blockSize)
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  q31_t inA1, inA2, inA3, inA4;
-  q31_t inB1, inB2, inB3, inB4;
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A + B */
-    /* Add and then store the results in the destination buffer. */
-    inA1 = *pSrcA++;
-    inA2 = *pSrcA++;
-    inB1 = *pSrcB++;
-    inB2 = *pSrcB++;

-    inA3 = *pSrcA++;
-    inA4 = *pSrcA++;
-    inB3 = *pSrcB++;
-    inB4 = *pSrcB++;
-
-    *pDst++ = __QADD(inA1, inB1);
-    *pDst++ = __QADD(inA2, inB2);
-    *pDst++ = __QADD(inA3, inB3);
-    *pDst++ = __QADD(inA4, inB4);
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
-  blkCnt = blockSize % 0x4U;
-
-  while (blkCnt > 0U)
-  {
-    /* C = A + B */
-    /* Add and then store the results in the destination buffer. */
+    /* Add and store result in destination buffer. */
    *pDst++ = __QADD(*pSrcA++, *pSrcB++);

-    /* Decrement the loop counter */
+    *pDst++ = __QADD(*pSrcA++, *pSrcB++);
+
+    *pDst++ = __QADD(*pSrcA++, *pSrcB++);
+
+    *pDst++ = __QADD(*pSrcA++, *pSrcB++);
+
+    /* Decrement loop counter */
    blkCnt--;
  }

+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
 #else

-  /* Run the below code for Cortex-M0 */
-
-
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
  while (blkCnt > 0U)
  {
    /* C = A + B */
-    /* Add and then store the results in the destination buffer. */
-    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrcA++ + *pSrcB++);

-    /* Decrement the loop counter */
+    /* Add and store result in destination buffer. */
+    *pDst++ = __QADD(*pSrcA++, *pSrcB++);
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of BasicAdd group
+  @} end of BasicAdd group
 */
--- a/DSP/Source/BasicMathFunctions/arm_add_q7.c
+++ b/DSP/Source/BasicMathFunctions/arm_add_q7.c
@ -3,13 +3,13 @@
 * Title:        arm_add_q7.c
 * Description:  Q7 vector addition
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,94 +29,81 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup BasicAdd
- * @{
+  @addtogroup BasicAdd
+  @{
 */

 /**
- * @brief Q7 vector addition.
- * @param[in]       *pSrcA points to the first input vector
- * @param[in]       *pSrcB points to the second input vector
- * @param[out]      *pDst points to the output vector
- * @param[in]       blockSize number of samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
+  @brief         Q7 vector addition.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */

 void arm_add_q7(
-  q7_t * pSrcA,
-  q7_t * pSrcB,
-  q7_t * pDst,
-  uint32_t blockSize)
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A + B */
-    /* Add and then store the results in the destination buffer. */
-    *__SIMD32(pDst)++ = __QADD8(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);

-    /* Decrement the loop counter */
+#if defined (ARM_MATH_DSP)
+    /* Add and store result in destination buffer (4 samples at a time). */
+    write_q7x4_ia (&pDst, __QADD8 (read_q7x4_ia ((q7_t **) &pSrcA), read_q7x4_ia ((q7_t **) &pSrcB)));
+#else
+    *pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
+    *pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
+    *pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
+    *pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C = A + B */
-    /* Add and then store the results in the destination buffer. */
-    *pDst++ = (q7_t) __SSAT(*pSrcA++ + *pSrcB++, 8);
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
-
-
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
  while (blkCnt > 0U)
  {
    /* C = A + B */
-    /* Add and then store the results in the destination buffer. */
-    *pDst++ = (q7_t) __SSAT((q15_t) * pSrcA++ + *pSrcB++, 8);

-    /* Decrement the loop counter */
+    /* Add and store result in destination buffer. */
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ + *pSrcB++, 8);
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
-
 }

 /**
- * @} end of BasicAdd group
+  @} end of BasicAdd group
 */
--- a/DSP/Source/BasicMathFunctions/arm_dot_prod_f32.c
+++ b/DSP/Source/BasicMathFunctions/arm_dot_prod_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_dot_prod_f32.c
 * Description:  Floating-point dot product
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,95 +29,135 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @defgroup dot_prod Vector Dot Product
- *
- * Computes the dot product of two vectors.
- * The vectors are multiplied element-by-element and then summed.
- *
- * <pre>
- *     sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
- * </pre>
- *
- * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+  @defgroup BasicDotProd Vector Dot Product
+
+  Computes the dot product of two vectors.
+  The vectors are multiplied element-by-element and then summed.
+
+  <pre>
+      sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
+  </pre>
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */

 /**
- * @addtogroup dot_prod
- * @{
+  @addtogroup BasicDotProd
+  @{
 */

 /**
- * @brief Dot product of floating-point vectors.
- * @param[in]       *pSrcA points to the first input vector
- * @param[in]       *pSrcB points to the second input vector
- * @param[in]       blockSize number of samples in each vector
- * @param[out]      *result output result returned here
- * @return none.
+  @brief         Dot product of floating-point vectors.
+  @param[in]     pSrcA      points to the first input vector.
+  @param[in]     pSrcB      points to the second input vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @param[out]    result     output result returned here.
+  @return        none
 */

-
 void arm_dot_prod_f32(
-  float32_t * pSrcA,
-  float32_t * pSrcB,
-  uint32_t blockSize,
-  float32_t * result)
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        uint32_t blockSize,
+        float32_t * result)
 {
-  float32_t sum = 0.0f;                          /* Temporary result storage */
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */
+        float32_t sum = 0.0f;                          /* Temporary return variable */

+#if defined(ARM_MATH_NEON)
+    float32x4_t vec1;
+    float32x4_t vec2;
+    float32x4_t res;
+    float32x4_t accum = vdupq_n_f32(0);    

-#if defined (ARM_MATH_DSP)
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  /*loop Unrolling */
+    vec1 = vld1q_f32(pSrcA);
+    vec2 = vld1q_f32(pSrcB);
+
+    while (blkCnt > 0U)
+    {
+        /* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */
+        /* Calculate dot product and then store the result in a temporary buffer. */
+        
+	accum = vmlaq_f32(accum, vec1, vec2);
+	
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4; 
+
+        vec1 = vld1q_f32(pSrcA);
+        vec2 = vld1q_f32(pSrcB);
+        
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+    
+#if __aarch64__
+    sum = vpadds_f32(vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)));
+#else
+    sum = (vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)))[0] + (vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)))[1];
+#endif    
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
-    /* Calculate dot product and then store the result in a temporary buffer */
-    sum += (*pSrcA++) * (*pSrcB++);
-    sum += (*pSrcA++) * (*pSrcB++);
-    sum += (*pSrcA++) * (*pSrcB++);
+
+    /* Calculate dot product and store result in a temporary buffer. */
    sum += (*pSrcA++) * (*pSrcB++);

-    /* Decrement the loop counter */
+    sum += (*pSrcA++) * (*pSrcB++);
+
+    sum += (*pSrcA++) * (*pSrcB++);
+
+    sum += (*pSrcA++) * (*pSrcB++);
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /* #if defined (ARM_MATH_DSP) */
-
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */

  while (blkCnt > 0U)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
-    /* Calculate dot product and then store the result in a temporary buffer. */
+
+    /* Calculate dot product and store result in a temporary buffer. */
    sum += (*pSrcA++) * (*pSrcB++);

-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }
-  /* Store the result back in the destination buffer */
+
+  /* Store result in destination buffer */
  *result = sum;
 }

 /**
- * @} end of dot_prod group
+  @} end of BasicDotProd group
 */
--- a/DSP/Source/BasicMathFunctions/arm_dot_prod_q15.c
+++ b/DSP/Source/BasicMathFunctions/arm_dot_prod_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_dot_prod_q15.c
 * Description:  Q15 dot product
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,100 +29,92 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup dot_prod
- * @{
+  @addtogroup BasicDotProd
+  @{
 */

 /**
- * @brief Dot product of Q15 vectors.
- * @param[in]       *pSrcA points to the first input vector
- * @param[in]       *pSrcB points to the second input vector
- * @param[in]       blockSize number of samples in each vector
- * @param[out]      *result output result returned here
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The intermediate multiplications are in 1.15 x 1.15 = 2.30 format and these
- * results are added to a 64-bit accumulator in 34.30 format.
- * Nonsaturating additions are used and given that there are 33 guard bits in the accumulator
- * there is no risk of overflow.
- * The return result is in 34.30 format.
+  @brief         Dot product of Q15 vectors.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[in]     blockSize  number of samples in each vector
+  @param[out]    result     output result returned here
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The intermediate multiplications are in 1.15 x 1.15 = 2.30 format and these
+                   results are added to a 64-bit accumulator in 34.30 format.
+                   Nonsaturating additions are used and given that there are 33 guard bits in the accumulator
+                   there is no risk of overflow.
+                   The return result is in 34.30 format.
 */

 void arm_dot_prod_q15(
-  q15_t * pSrcA,
-  q15_t * pSrcB,
-  uint32_t blockSize,
-  q63_t * result)
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        uint32_t blockSize,
+        q63_t * result)
 {
-  q63_t sum = 0;                                 /* Temporary result storage */
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */
+        q63_t sum = 0;                                 /* Temporary return variable */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
-    /* Calculate dot product and then store the result in a temporary buffer. */
-    sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);
-    sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);

-    /* Decrement the loop counter */
+#if defined (ARM_MATH_DSP)
+    /* Calculate dot product and store result in a temporary buffer. */
+    sum = __SMLALD(read_q15x2_ia ((q15_t **) &pSrcA), read_q15x2_ia ((q15_t **) &pSrcB), sum);
+    sum = __SMLALD(read_q15x2_ia ((q15_t **) &pSrcA), read_q15x2_ia ((q15_t **) &pSrcB), sum);
+#else
+    sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
+    sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
+    sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
+    sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
-    /* Calculate dot product and then store the results in a temporary buffer. */
-    sum = __SMLALD(*pSrcA++, *pSrcB++, sum);
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
-
 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
  while (blkCnt > 0U)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
-    /* Calculate dot product and then store the results in a temporary buffer. */
-    sum += (q63_t) ((q31_t) * pSrcA++ * *pSrcB++);

-    /* Decrement the loop counter */
+    /* Calculate dot product and store result in a temporary buffer. */
+//#if defined (ARM_MATH_DSP)
+//    sum  = __SMLALD(*pSrcA++, *pSrcB++, sum);
+//#else
+    sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
+//#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
-  /* Store the result in the destination buffer in 34.30 format */
+  /* Store result in destination buffer in 34.30 format */
  *result = sum;
-
 }

 /**
- * @} end of dot_prod group
+  @} end of BasicDotProd group
 */
--- a/DSP/Source/BasicMathFunctions/arm_dot_prod_q31.c
+++ b/DSP/Source/BasicMathFunctions/arm_dot_prod_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_dot_prod_q31.c
 * Description:  Q31 dot product
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,103 +29,87 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup dot_prod
- * @{
+  @addtogroup BasicDotProd
+  @{
 */

 /**
- * @brief Dot product of Q31 vectors.
- * @param[in]       *pSrcA points to the first input vector
- * @param[in]       *pSrcB points to the second input vector
- * @param[in]       blockSize number of samples in each vector
- * @param[out]      *result output result returned here
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
- * are truncated to 2.48 format by discarding the lower 14 bits.
- * The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
- * There are 15 guard bits in the accumulator and there is no risk of overflow as long as
- * the length of the vectors is less than 2^16 elements.
- * The return result is in 16.48 format.
+  @brief         Dot product of Q31 vectors.
+  @param[in]     pSrcA      points to the first input vector.
+  @param[in]     pSrcB      points to the second input vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @param[out]    result     output result returned here.
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
+                   are truncated to 2.48 format by discarding the lower 14 bits.
+                   The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
+                   There are 15 guard bits in the accumulator and there is no risk of overflow as long as
+                   the length of the vectors is less than 2^16 elements.
+                   The return result is in 16.48 format.
 */

 void arm_dot_prod_q31(
-  q31_t * pSrcA,
-  q31_t * pSrcB,
-  uint32_t blockSize,
-  q63_t * result)
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        uint32_t blockSize,
+        q63_t * result)
 {
-  q63_t sum = 0;                                 /* Temporary result storage */
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */
+        q63_t sum = 0;                                 /* Temporary return variable */

+#if defined (ARM_MATH_LOOPUNROLL)

-#if defined (ARM_MATH_DSP)
-
-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  q31_t inA1, inA2, inA3, inA4;
-  q31_t inB1, inB2, inB3, inB4;
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
-    /* Calculate dot product and then store the result in a temporary buffer. */
-    inA1 = *pSrcA++;
-    inA2 = *pSrcA++;
-    inA3 = *pSrcA++;
-    inA4 = *pSrcA++;
-    inB1 = *pSrcB++;
-    inB2 = *pSrcB++;
-    inB3 = *pSrcB++;
-    inB4 = *pSrcB++;

-    sum += ((q63_t) inA1 * inB1) >> 14U;
-    sum += ((q63_t) inA2 * inB2) >> 14U;
-    sum += ((q63_t) inA3 * inB3) >> 14U;
-    sum += ((q63_t) inA4 * inB4) >> 14U;
+    /* Calculate dot product and store result in a temporary buffer. */
+    sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;

-    /* Decrement the loop counter */
+    sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
+
+    sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
+
+    sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /* #if defined (ARM_MATH_DSP) */
-
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
-    /* Calculate dot product and then store the result in a temporary buffer. */
-    sum += ((q63_t) * pSrcA++ * *pSrcB++) >> 14U;

-    /* Decrement the loop counter */
+    /* Calculate dot product and store result in a temporary buffer. */
+    sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* Store the result in the destination buffer in 16.48 format */
+  /* Store result in destination buffer in 16.48 format */
  *result = sum;
 }

 /**
- * @} end of dot_prod group
+  @} end of BasicDotProd group
 */
--- a/DSP/Source/BasicMathFunctions/arm_dot_prod_q7.c
+++ b/DSP/Source/BasicMathFunctions/arm_dot_prod_q7.c
@ -3,13 +3,13 @@
 * Title:        arm_dot_prod_q7.c
 * Description:  Q7 dot product
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,61 +29,58 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup dot_prod
- * @{
+  @addtogroup BasicDotProd
+  @{
 */

 /**
- * @brief Dot product of Q7 vectors.
- * @param[in]       *pSrcA points to the first input vector
- * @param[in]       *pSrcB points to the second input vector
- * @param[in]       blockSize number of samples in each vector
- * @param[out]      *result output result returned here
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The intermediate multiplications are in 1.7 x 1.7 = 2.14 format and these
- * results are added to an accumulator in 18.14 format.
- * Nonsaturating additions are used and there is no danger of wrap around as long as
- * the vectors are less than 2^18 elements long.
- * The return result is in 18.14 format.
+  @brief         Dot product of Q7 vectors.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[in]     blockSize  number of samples in each vector
+  @param[out]    result     output result returned here
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The intermediate multiplications are in 1.7 x 1.7 = 2.14 format and these
+                   results are added to an accumulator in 18.14 format.
+                   Nonsaturating additions are used and there is no danger of wrap around as long as
+                   the vectors are less than 2^18 elements long.
+                   The return result is in 18.14 format.
 */

 void arm_dot_prod_q7(
-  q7_t * pSrcA,
-  q7_t * pSrcB,
-  uint32_t blockSize,
-  q31_t * result)
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        uint32_t blockSize,
+        q31_t * result)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t sum = 0;                                 /* Temporary return variable */

-  q31_t sum = 0;                                 /* Temporary variables to store output */
+#if defined (ARM_MATH_LOOPUNROLL)

 #if defined (ARM_MATH_DSP)
+  q31_t input1, input2;                          /* Temporary variables */
+  q31_t inA1, inA2, inB1, inB2;                  /* Temporary variables */
+#endif

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-
-  q31_t input1, input2;                          /* Temporary variables to store input */
-  q31_t inA1, inA2, inB1, inB2;                  /* Temporary variables to store input */
-
-
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
+    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+#if defined (ARM_MATH_DSP)
    /* read 4 samples at a time from sourceA */
-    input1 = *__SIMD32(pSrcA)++;
+    input1 = read_q7x4_ia ((q7_t **) &pSrcA);
    /* read 4 samples at a time from sourceB */
-    input2 = *__SIMD32(pSrcB)++;
+    input2 = read_q7x4_ia ((q7_t **) &pSrcB);

    /* extract two q7_t samples to q15_t samples */
    inA1 = __SXTB16(__ROR(input1, 8));
@ -97,51 +94,46 @@ void arm_dot_prod_q7(
    /* multiply and accumulate two samples at a time */
    sum = __SMLAD(inA1, inB1, sum);
    sum = __SMLAD(inA2, inB2, sum);
+#else
+    sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
+    sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
+    sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
+    sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
+#endif

-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
-    /* Dot product and then store the results in a temporary buffer. */
-    sum = __SMLAD(*pSrcA++, *pSrcB++, sum);
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
-
-
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
  while (blkCnt > 0U)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
-    /* Dot product and then store the results in a temporary buffer. */
-    sum += (q31_t) ((q15_t) * pSrcA++ * *pSrcB++);

-    /* Decrement the loop counter */
+    /* Calculate dot product and store result in a temporary buffer. */
+//#if defined (ARM_MATH_DSP)
+//    sum  = __SMLAD(*pSrcA++, *pSrcB++, sum);
+//#else
+    sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
+//#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
-
-  /* Store the result in the destination buffer in 18.14 format */
+  /* Store result in destination buffer in 18.14 format */
  *result = sum;
 }

 /**
- * @} end of dot_prod group
+  @} end of BasicDotProd group
 */
--- a/DSP/Source/BasicMathFunctions/arm_mult_f32.c
+++ b/DSP/Source/BasicMathFunctions/arm_mult_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_mult_f32.c
 * Description:  Floating-point vector multiplication
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,134 +29,120 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @defgroup BasicMult Vector Multiplication
- *
- * Element-by-element multiplication of two vectors.
- *
- * <pre>
- *     pDst[n] = pSrcA[n] * pSrcB[n],   0 <= n < blockSize.
- * </pre>
- *
- * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+  @defgroup BasicMult Vector Multiplication
+
+  Element-by-element multiplication of two vectors.
+
+  <pre>
+      pDst[n] = pSrcA[n] * pSrcB[n],   0 <= n < blockSize.
+  </pre>
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */

 /**
- * @addtogroup BasicMult
- * @{
+  @addtogroup BasicMult
+  @{
 */

 /**
- * @brief Floating-point vector multiplication.
- * @param[in]       *pSrcA points to the first input vector
- * @param[in]       *pSrcB points to the second input vector
- * @param[out]      *pDst points to the output vector
- * @param[in]       blockSize number of samples in each vector
- * @return none.
+  @brief         Floating-point vector multiplication.
+  @param[in]     pSrcA      points to the first input vector.
+  @param[in]     pSrcB      points to the second input vector.
+  @param[out]    pDst       points to the output vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @return        none
 */

 void arm_mult_f32(
-  float32_t * pSrcA,
-  float32_t * pSrcB,
-  float32_t * pDst,
-  uint32_t blockSize)
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counters */
-#if defined (ARM_MATH_DSP)
+    uint32_t blkCnt;                               /* Loop counter */

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  float32_t inA1, inA2, inA3, inA4;              /* temporary input variables */
-  float32_t inB1, inB2, inB3, inB4;              /* temporary input variables */
-  float32_t out1, out2, out3, out4;              /* temporary output variables */
+#if defined(ARM_MATH_NEON)
+    float32x4_t vec1;
+    float32x4_t vec2;
+    float32x4_t res;

-  /* loop Unrolling */
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A * B */
+
+    	/* Multiply the inputs and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrcA);
+        vec2 = vld1q_f32(pSrcB);
+        res = vmulq_f32(vec1, vec2);
+        vst1q_f32(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4; 
+        pDst += 4;
+        
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A * B */
-    /* Multiply the inputs and store the results in output buffer */
-    /* read sample from sourceA */
-    inA1 = *pSrcA;
-    /* read sample from sourceB */
-    inB1 = *pSrcB;
-    /* read sample from sourceA */
-    inA2 = *(pSrcA + 1);
-    /* read sample from sourceB */
-    inB2 = *(pSrcB + 1);

-    /* out = sourceA * sourceB */
-    out1 = inA1 * inB1;
+    /* Multiply inputs and store result in destination buffer. */
+    *pDst++ = (*pSrcA++) * (*pSrcB++);

-    /* read sample from sourceA */
-    inA3 = *(pSrcA + 2);
-    /* read sample from sourceB */
-    inB3 = *(pSrcB + 2);
+    *pDst++ = (*pSrcA++) * (*pSrcB++);

-    /* out = sourceA * sourceB */
-    out2 = inA2 * inB2;
+    *pDst++ = (*pSrcA++) * (*pSrcB++);

-    /* read sample from sourceA */
-    inA4 = *(pSrcA + 3);
+    *pDst++ = (*pSrcA++) * (*pSrcB++);

-    /* store result to destination buffer */
-    *pDst = out1;
-
-    /* read sample from sourceB */
-    inB4 = *(pSrcB + 3);
-
-    /* out = sourceA * sourceB */
-    out3 = inA3 * inB3;
-
-    /* store result to destination buffer */
-    *(pDst + 1) = out2;
-
-    /* out = sourceA * sourceB */
-    out4 = inA4 * inB4;
-    /* store result to destination buffer */
-    *(pDst + 2) = out3;
-    /* store result to destination buffer */
-    *(pDst + 3) = out4;
-
-
-    /* update pointers to process next samples */
-    pSrcA += 4U;
-    pSrcB += 4U;
-    pDst += 4U;
-
-    /* Decrement the blockSize loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */

  while (blkCnt > 0U)
  {
    /* C = A * B */
-    /* Multiply the inputs and store the results in output buffer */
+
+    /* Multiply input and store result in destination buffer. */
    *pDst++ = (*pSrcA++) * (*pSrcB++);

-    /* Decrement the blockSize loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of BasicMult group
+  @} end of BasicMult group
 */
--- a/DSP/Source/BasicMathFunctions/arm_mult_q15.c
+++ b/DSP/Source/BasicMathFunctions/arm_mult_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_mult_q15.c
 * Description:  Q15 vector multiplication
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,65 +29,65 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup BasicMult
- * @{
+  @addtogroup BasicMult
+  @{
 */

-
 /**
- * @brief           Q15 vector multiplication
- * @param[in]       *pSrcA points to the first input vector
- * @param[in]       *pSrcB points to the second input vector
- * @param[out]      *pDst points to the output vector
- * @param[in]       blockSize number of samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+  @brief         Q15 vector multiplication
+  @param[in]     pSrcA      points to first input vector
+  @param[in]     pSrcB      points to second input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */

 void arm_mult_q15(
-  q15_t * pSrcA,
-  q15_t * pSrcB,
-  q15_t * pDst,
-  uint32_t blockSize)
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counters */
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)

 #if defined (ARM_MATH_DSP)
+  q31_t inA1, inA2, inB1, inB2;                  /* Temporary input variables */
+  q15_t out1, out2, out3, out4;                  /* Temporary output variables */
+  q31_t mul1, mul2, mul3, mul4;                  /* Temporary variables */
+#endif

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  q31_t inA1, inA2, inB1, inB2;                  /* temporary input variables */
-  q15_t out1, out2, out3, out4;                  /* temporary output variables */
-  q31_t mul1, mul2, mul3, mul4;                  /* temporary variables */
-
-  /* loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
-    /* read two samples at a time from sourceA */
-    inA1 = *__SIMD32(pSrcA)++;
-    /* read two samples at a time from sourceB */
-    inB1 = *__SIMD32(pSrcB)++;
-    /* read two samples at a time from sourceA */
-    inA2 = *__SIMD32(pSrcA)++;
-    /* read two samples at a time from sourceB */
-    inB2 = *__SIMD32(pSrcB)++;
+    /* C = A * B */
+
+#if defined (ARM_MATH_DSP)
+    /* read 2 samples at a time from sourceA */
+    inA1 = read_q15x2_ia ((q15_t **) &pSrcA);
+    /* read 2 samples at a time from sourceB */
+    inB1 = read_q15x2_ia ((q15_t **) &pSrcB);
+    /* read 2 samples at a time from sourceA */
+    inA2 = read_q15x2_ia ((q15_t **) &pSrcA);
+    /* read 2 samples at a time from sourceB */
+    inB2 = read_q15x2_ia ((q15_t **) &pSrcB);

    /* multiply mul = sourceA * sourceB */
    mul1 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
-    mul2 = (q31_t) ((q15_t) inA1 * (q15_t) inB1);
+    mul2 = (q31_t) ((q15_t) (inA1      ) * (q15_t) (inB1      ));
    mul3 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB2 >> 16));
-    mul4 = (q31_t) ((q15_t) inA2 * (q15_t) inB2);
+    mul4 = (q31_t) ((q15_t) (inA2      ) * (q15_t) (inB2      ));

    /* saturate result to 16 bit */
    out1 = (q15_t) __SSAT(mul1 >> 15, 16);
@ -95,48 +95,49 @@ void arm_mult_q15(
    out3 = (q15_t) __SSAT(mul3 >> 15, 16);
    out4 = (q15_t) __SSAT(mul4 >> 15, 16);

-    /* store the result */
+    /* store result to destination */
 #ifndef ARM_MATH_BIG_ENDIAN
-
-    *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
-    *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
-
+    write_q15x2_ia (&pDst, __PKHBT(out2, out1, 16));
+    write_q15x2_ia (&pDst, __PKHBT(out4, out3, 16));
 #else
-
-    *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
-    *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
-
+    write_q15x2_ia (&pDst, __PKHBT(out1, out2, 16));
+    write_q15x2_ia (&pDst, __PKHBT(out3, out4, 16));
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */

-    /* Decrement the blockSize loop counter */
+#else
+    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /* #if defined (ARM_MATH_DSP) */
-
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* C = A * B */
-    /* Multiply the inputs and store the result in the destination buffer */
+
+    /* Multiply inputs and store result in destination buffer. */
    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);

-    /* Decrement the blockSize loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of BasicMult group
+  @} end of BasicMult group
 */
--- a/DSP/Source/BasicMathFunctions/arm_mult_q31.c
+++ b/DSP/Source/BasicMathFunctions/arm_mult_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_mult_q31.c
 * Description:  Q31 vector multiplication
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,120 +29,91 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup BasicMult
- * @{
+  @addtogroup BasicMult
+  @{
 */

 /**
- * @brief Q31 vector multiplication.
- * @param[in]       *pSrcA points to the first input vector
- * @param[in]       *pSrcB points to the second input vector
- * @param[out]      *pDst points to the output vector
- * @param[in]       blockSize number of samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
+  @brief         Q31 vector multiplication.
+  @param[in]     pSrcA      points to the first input vector.
+  @param[in]     pSrcB      points to the second input vector.
+  @param[out]    pDst       points to the output vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
 */

 void arm_mult_q31(
-  q31_t * pSrcA,
-  q31_t * pSrcB,
-  q31_t * pDst,
-  uint32_t blockSize)
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counters */
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t out;                                     /* Temporary output variable */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  q31_t inA1, inA2, inA3, inA4;                  /* temporary input variables */
-  q31_t inB1, inB2, inB3, inB4;                  /* temporary input variables */
-  q31_t out1, out2, out3, out4;                  /* temporary output variables */
-
-  /* loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A * B */
-    /* Multiply the inputs and then store the results in the destination buffer. */
-    inA1 = *pSrcA++;
-    inA2 = *pSrcA++;
-    inA3 = *pSrcA++;
-    inA4 = *pSrcA++;
-    inB1 = *pSrcB++;
-    inB2 = *pSrcB++;
-    inB3 = *pSrcB++;
-    inB4 = *pSrcB++;

-    out1 = ((q63_t) inA1 * inB1) >> 32;
-    out2 = ((q63_t) inA2 * inB2) >> 32;
-    out3 = ((q63_t) inA3 * inB3) >> 32;
-    out4 = ((q63_t) inA4 * inB4) >> 32;
+    /* Multiply inputs and store result in destination buffer. */
+    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+    out = __SSAT(out, 31);
+    *pDst++ = out << 1U;

-    out1 = __SSAT(out1, 31);
-    out2 = __SSAT(out2, 31);
-    out3 = __SSAT(out3, 31);
-    out4 = __SSAT(out4, 31);
+    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+    out = __SSAT(out, 31);
+    *pDst++ = out << 1U;

-    *pDst++ = out1 << 1U;
-    *pDst++ = out2 << 1U;
-    *pDst++ = out3 << 1U;
-    *pDst++ = out4 << 1U;
+    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+    out = __SSAT(out, 31);
+    *pDst++ = out << 1U;

-    /* Decrement the blockSize loop counter */
+    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+    out = __SSAT(out, 31);
+    *pDst++ = out << 1U;
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C = A * B */
-    /* Multiply the inputs and then store the results in the destination buffer. */
-    inA1 = *pSrcA++;
-    inB1 = *pSrcB++;
-    out1 = ((q63_t) inA1 * inB1) >> 32;
-    out1 = __SSAT(out1, 31);
-    *pDst++ = out1 << 1U;
-
-    /* Decrement the blockSize loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* C = A * B */
-    /* Multiply the inputs and then store the results in the destination buffer. */
-    *pDst++ =
-      (q31_t) clip_q63_to_q31(((q63_t) (*pSrcA++) * (*pSrcB++)) >> 31);

-    /* Decrement the blockSize loop counter */
+    /* Multiply inputs and store result in destination buffer. */
+    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+    out = __SSAT(out, 31);
+    *pDst++ = out << 1U;
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
 }

 /**
- * @} end of BasicMult group
+  @} end of BasicMult group
 */
--- a/DSP/Source/BasicMathFunctions/arm_mult_q7.c
+++ b/DSP/Source/BasicMathFunctions/arm_mult_q7.c
@ -3,13 +3,13 @@
 * Title:        arm_mult_q7.c
 * Description:  Q7 vector multiplication
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,87 +29,91 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup BasicMult
- * @{
+  @addtogroup BasicMult
+  @{
 */

 /**
- * @brief           Q7 vector multiplication
- * @param[in]       *pSrcA points to the first input vector
- * @param[in]       *pSrcB points to the second input vector
- * @param[out]      *pDst points to the output vector
- * @param[in]       blockSize number of samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
+  @brief         Q7 vector multiplication
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */

 void arm_mult_q7(
-  q7_t * pSrcA,
-  q7_t * pSrcB,
-  q7_t * pDst,
-  uint32_t blockSize)
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counters */
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)

 #if defined (ARM_MATH_DSP)
+  q7_t out1, out2, out3, out4;                   /* Temporary output variables */
+#endif

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  q7_t out1, out2, out3, out4;                   /* Temporary variables to store the product */
-
-  /* loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A * B */
-    /* Multiply the inputs and store the results in temporary variables */
+
+#if defined (ARM_MATH_DSP)
+    /* Multiply inputs and store results in temporary variables */
    out1 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    out2 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    out3 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    out4 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);

-    /* Store the results of 4 inputs in the destination buffer in single cycle by packing */
-    *__SIMD32(pDst)++ = __PACKq7(out1, out2, out3, out4);
+    /* Pack and store result in destination buffer (in single write) */
+    write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4));
+#else
+    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+#endif

-    /* Decrement the blockSize loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /* #if defined (ARM_MATH_DSP) */
-
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* C = A * B */
-    /* Multiply the inputs and store the result in the destination buffer */
+
+    /* Multiply input and store result in destination buffer. */
    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);

-    /* Decrement the blockSize loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of BasicMult group
+  @} end of BasicMult group
 */
--- a/DSP/Source/BasicMathFunctions/arm_negate_f32.c
+++ b/DSP/Source/BasicMathFunctions/arm_negate_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_negate_f32.c
 * Description:  Negates floating-point vectors
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,106 +29,117 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @defgroup negate Vector Negate
- *
- * Negates the elements of a vector.
- *
- * <pre>
- *     pDst[n] = -pSrc[n],   0 <= n < blockSize.
- * </pre>
- *
- * The functions support in-place computation allowing the source and
- * destination pointers to reference the same memory buffer.
- * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+  @defgroup BasicNegate Vector Negate
+
+  Negates the elements of a vector.
+
+  <pre>
+      pDst[n] = -pSrc[n],   0 <= n < blockSize.
+  </pre>
+
+  The functions support in-place computation allowing the source and
+  destination pointers to reference the same memory buffer.
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */

 /**
- * @addtogroup negate
- * @{
+  @addtogroup BasicNegate
+  @{
 */

 /**
- * @brief  Negates the elements of a floating-point vector.
- * @param[in]  *pSrc points to the input vector
- * @param[out]  *pDst points to the output vector
- * @param[in]  blockSize number of samples in the vector
- * @return none.
+  @brief         Negates the elements of a floating-point vector.
+  @param[in]     pSrc       points to input vector.
+  @param[out]    pDst       points to output vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @return        none
 */

 void arm_negate_f32(
-  float32_t * pSrc,
-  float32_t * pDst,
-  uint32_t blockSize)
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */

+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+    float32x4_t vec1;
+    float32x4_t res;

-#if defined (ARM_MATH_DSP)
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  float32_t in1, in2, in3, in4;                  /* temporary variables */
+    while (blkCnt > 0U)
+    {
+        /* C = -A */

-  /*loop Unrolling */
-  blkCnt = blockSize >> 2U;
+    	/* Negate and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrc);
+        res = vnegq_f32(vec1);
+        vst1q_f32(pDst, res);

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
-  while (blkCnt > 0U)
-  {
-    /* read inputs from source */
-    in1 = *pSrc;
-    in2 = *(pSrc + 1);
-    in3 = *(pSrc + 2);
-    in4 = *(pSrc + 3);
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+        
+        /* Decrement the loop counter */
+        blkCnt--;
+    }

-    /* negate the input */
-    in1 = -in1;
-    in2 = -in2;
-    in3 = -in3;
-    in4 = -in4;
-
-    /* store the result to destination */
-    *pDst = in1;
-    *(pDst + 1) = in2;
-    *(pDst + 2) = in3;
-    *(pDst + 3) = in4;
-
-    /* update pointers to process next samples */
-    pSrc += 4U;
-    pDst += 4U;
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
-  blkCnt = blockSize % 0x4U;
+    /* Tail */
+    blkCnt = blockSize & 0x3;

 #else
+#if defined (ARM_MATH_LOOPUNROLL)

-  /* Run the below code for Cortex-M0 */
-
-  /* Initialize blkCnt with number of samples */
-  blkCnt = blockSize;
-
-#endif /* #if defined (ARM_MATH_DSP) */
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;

  while (blkCnt > 0U)
  {
    /* C = -A */
-    /* Negate and then store the results in the destination buffer. */
+
+    /* Negate and store result in destination buffer. */
    *pDst++ = -*pSrc++;

-    /* Decrement the loop counter */
+    *pDst++ = -*pSrc++;
+
+    *pDst++ = -*pSrc++;
+
+    *pDst++ = -*pSrc++;
+
+    /* Decrement loop counter */
    blkCnt--;
  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = -A */
+
+    /* Negate and store result in destination buffer. */
+    *pDst++ = -*pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
 }

 /**
- * @} end of negate group
+  @} end of BasicNegate group
 */
--- a/DSP/Source/BasicMathFunctions/arm_negate_q15.c
+++ b/DSP/Source/BasicMathFunctions/arm_negate_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_negate_q15.c
 * Description:  Negates Q15 vectors
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,103 +29,98 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup negate
- * @{
+  @addtogroup BasicNegate
+  @{
 */

 /**
- * @brief  Negates the elements of a Q15 vector.
- * @param[in]  *pSrc points to the input vector
- * @param[out]  *pDst points to the output vector
- * @param[in]  blockSize number of samples in the vector
- * @return none.
- *
- * \par Conditions for optimum performance
- *  Input and output buffers should be aligned by 32-bit
- *
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
+  @brief         Negates the elements of a Q15 vector.
+  @param[in]     pSrc       points to the input vector.
+  @param[out]    pDst       points to the output vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @return        none
+
+  @par           Conditions for optimum performance
+                   Input and output buffers should be aligned by 32-bit
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
 */

 void arm_negate_q15(
-  q15_t * pSrc,
-  q15_t * pDst,
-  uint32_t blockSize)
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
-  q15_t in;
+        uint32_t blkCnt;                               /* Loop counter */
+        q15_t in;                                      /* Temporary input variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)

 #if defined (ARM_MATH_DSP)
+  q31_t in1;                                    /* Temporary input variables */
+#endif

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-
-  q31_t in1, in2;                                /* Temporary variables */
-
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = -A */
-    /* Read two inputs at a time */
-    in1 = _SIMD32_OFFSET(pSrc);
-    in2 = _SIMD32_OFFSET(pSrc + 2);

-    /* negate two samples at a time */
-    in1 = __QSUB16(0, in1);
+#if defined (ARM_MATH_DSP)
+    /* Negate and store result in destination buffer (2 samples at a time). */
+    in1 = read_q15x2_ia ((q15_t **) &pSrc);
+    write_q15x2_ia (&pDst, __QSUB16(0, in1));

-    /* negate two samples at a time */
-    in2 = __QSUB16(0, in2);
+    in1 = read_q15x2_ia ((q15_t **) &pSrc);
+    write_q15x2_ia (&pDst, __QSUB16(0, in1));
+#else
+    in = *pSrc++;
+    *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;

-    /* store the result to destination 2 samples at a time */
-    _SIMD32_OFFSET(pDst) = in1;
-    /* store the result to destination 2 samples at a time */
-    _SIMD32_OFFSET(pDst + 2) = in2;
+    in = *pSrc++;
+    *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;

+    in = *pSrc++;
+    *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;

-    /* update pointers to process next samples */
-    pSrc += 4U;
-    pDst += 4U;
+    in = *pSrc++;
+    *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
+#endif

-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* C = -A */
-    /* Negate and then store the result in the destination buffer. */
-    in = *pSrc++;
-    *pDst++ = (in == (q15_t) 0x8000) ? 0x7fff : -in;

-    /* Decrement the loop counter */
+    /* Negate and store result in destination buffer. */
+    in = *pSrc++;
+    *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
+
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of negate group
+  @} end of BasicNegate group
 */
--- a/DSP/Source/BasicMathFunctions/arm_negate_q31.c
+++ b/DSP/Source/BasicMathFunctions/arm_negate_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_negate_q31.c
 * Description:  Negates Q31 vectors
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,89 +29,104 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup negate
- * @{
+  @addtogroup BasicNegate
+  @{
 */

 /**
- * @brief  Negates the elements of a Q31 vector.
- * @param[in]  *pSrc points to the input vector
- * @param[out]  *pDst points to the output vector
- * @param[in]  blockSize number of samples in the vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
+  @brief         Negates the elements of a Q31 vector.
+  @param[in]     pSrc       points to the input vector.
+  @param[out]    pDst       points to the output vector.
+  @param[in]     blockSize   number of samples in each vector.
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
 */

 void arm_negate_q31(
-  q31_t * pSrc,
-  q31_t * pDst,
-  uint32_t blockSize)
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
 {
-  q31_t in;                                      /* Temporary variable */
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t in;                                      /* Temporary input variable */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  q31_t in1, in2, in3, in4;
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = -A */
-    /* Negate and then store the results in the destination buffer. */
-    in1 = *pSrc++;
-    in2 = *pSrc++;
-    in3 = *pSrc++;
-    in4 = *pSrc++;

-    *pDst++ = __QSUB(0, in1);
-    *pDst++ = __QSUB(0, in2);
-    *pDst++ = __QSUB(0, in3);
-    *pDst++ = __QSUB(0, in4);
+    /* Negate and store result in destination buffer. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QSUB(0, in);
+#else
+    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif

-    /* Decrement the loop counter */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QSUB(0, in);
+#else
+    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QSUB(0, in);
+#else
+    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QSUB(0, in);
+#else
+    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /* #if defined (ARM_MATH_DSP) */
-
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* C = -A */
-    /* Negate and then store the result in the destination buffer. */
-    in = *pSrc++;
-    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;

-    /* Decrement the loop counter */
+    /* Negate and store result in destination buffer. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QSUB(0, in);
+#else
+    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of negate group
+  @} end of BasicNegate group
 */
--- a/DSP/Source/BasicMathFunctions/arm_negate_q7.c
+++ b/DSP/Source/BasicMathFunctions/arm_negate_q7.c
@ -3,13 +3,13 @@
 * Title:        arm_negate_q7.c
 * Description:  Negates Q7 vectors
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,85 +29,98 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup negate
- * @{
+  @addtogroup BasicNegate
+  @{
 */

 /**
- * @brief  Negates the elements of a Q7 vector.
- * @param[in]  *pSrc points to the input vector
- * @param[out]  *pDst points to the output vector
- * @param[in]  blockSize number of samples in the vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
+  @brief         Negates the elements of a Q7 vector.
+  @param[in]     pSrc       points to the input vector.
+  @param[out]    pDst       points to the output vector.
+  @param[in]     blockSize   number of samples in each vector.
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q7 value -1 (0x80) is saturated to the maximum allowable positive value 0x7F.
 */

 void arm_negate_q7(
-  q7_t * pSrc,
-  q7_t * pDst,
-  uint32_t blockSize)
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
-  q7_t in;
+        uint32_t blkCnt;                               /* Loop counter */
+        q7_t in;                                       /* Temporary input variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)

 #if defined (ARM_MATH_DSP)
+  q31_t in1;                                    /* Temporary input variable */
+#endif

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  q31_t input;                                   /* Input values1-4 */
-  q31_t zero = 0x00000000;
-
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = -A */
-    /* Read four inputs */
-    input = *__SIMD32(pSrc)++;

-    /* Store the Negated results in the destination buffer in a single cycle by packing the results */
-    *__SIMD32(pDst)++ = __QSUB8(zero, input);
+#if defined (ARM_MATH_DSP)
+    /* Negate and store result in destination buffer (4 samples at a time). */
+    in1 = read_q7x4_ia ((q7_t **) &pSrc);
+    write_q7x4_ia (&pDst, __QSUB8(0, in1));
+#else
+    in = *pSrc++;
+    *pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;

-    /* Decrement the loop counter */
+    in = *pSrc++;
+    *pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
+
+    in = *pSrc++;
+    *pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
+
+    in = *pSrc++;
+    *pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* C = -A */
-    /* Negate and then store the results in the destination buffer. */ \
-      in = *pSrc++;
-    *pDst++ = (in == (q7_t) 0x80) ? 0x7f : -in;

-    /* Decrement the loop counter */
+    /* Negate and store result in destination buffer. */
+    in = *pSrc++;
+
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (q7_t) __QSUB(0, in);
+#else
+    *pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of negate group
+  @} end of BasicNegate group
 */
--- a/DSP/Source/BasicMathFunctions/arm_offset_f32.c
+++ b/DSP/Source/BasicMathFunctions/arm_offset_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_offset_f32.c
 * Description:  Floating-point vector offset
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,126 +29,119 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @defgroup offset Vector Offset
- *
- * Adds a constant offset to each element of a vector.
- *
- * <pre>
- *     pDst[n] = pSrc[n] + offset,   0 <= n < blockSize.
- * </pre>
- *
- * The functions support in-place computation allowing the source and
- * destination pointers to reference the same memory buffer.
- * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+  @defgroup BasicOffset Vector Offset
+
+  Adds a constant offset to each element of a vector.
+
+  <pre>
+      pDst[n] = pSrc[n] + offset,   0 <= n < blockSize.
+  </pre>
+
+  The functions support in-place computation allowing the source and
+  destination pointers to reference the same memory buffer.
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */

 /**
- * @addtogroup offset
- * @{
+  @addtogroup BasicOffset
+  @{
 */

 /**
- * @brief  Adds a constant offset to a floating-point vector.
- * @param[in]  *pSrc points to the input vector
- * @param[in]  offset is the offset to be added
- * @param[out]  *pDst points to the output vector
- * @param[in]  blockSize number of samples in the vector
- * @return none.
+  @brief         Adds a constant offset to a floating-point vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     offset     is the offset to be added
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
 */

-
 void arm_offset_f32(
-  float32_t * pSrc,
-  float32_t offset,
-  float32_t * pDst,
-  uint32_t blockSize)
+  const float32_t * pSrc,
+        float32_t offset,
+        float32_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */

-#if defined (ARM_MATH_DSP)
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+    float32x4_t vec1;
+    float32x4_t res;

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  float32_t in1, in2, in3, in4;
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;

-  /*loop Unrolling */
+    while (blkCnt > 0U)
+    {
+        /* C = A + offset */
+ 
+        /* Add offset and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrc);
+        res = vaddq_f32(vec1,vdupq_n_f32(offset));
+        vst1q_f32(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+        
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A + offset */
-    /* Add offset and then store the results in the destination buffer. */
-    /* read samples from source */
-    in1 = *pSrc;
-    in2 = *(pSrc + 1);

-    /* add offset to input */
-    in1 = in1 + offset;
+    /* Add offset and store result in destination buffer. */
+    *pDst++ = (*pSrc++) + offset;

-    /* read samples from source */
-    in3 = *(pSrc + 2);
+    *pDst++ = (*pSrc++) + offset;

-    /* add offset to input */
-    in2 = in2 + offset;
+    *pDst++ = (*pSrc++) + offset;

-    /* read samples from source */
-    in4 = *(pSrc + 3);
+    *pDst++ = (*pSrc++) + offset;

-    /* add offset to input */
-    in3 = in3 + offset;
-
-    /* store result to destination */
-    *pDst = in1;
-
-    /* add offset to input */
-    in4 = in4 + offset;
-
-    /* store result to destination */
-    *(pDst + 1) = in2;
-
-    /* store result to destination */
-    *(pDst + 2) = in3;
-
-    /* store result to destination */
-    *(pDst + 3) = in4;
-
-    /* update pointers to process next samples */
-    pSrc += 4U;
-    pDst += 4U;
-
-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */

  while (blkCnt > 0U)
  {
    /* C = A + offset */
-    /* Add offset and then store the result in the destination buffer. */
+
+    /* Add offset and store result in destination buffer. */
    *pDst++ = (*pSrc++) + offset;

-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of offset group
+  @} end of BasicOffset group
 */
--- a/DSP/Source/BasicMathFunctions/arm_offset_q15.c
+++ b/DSP/Source/BasicMathFunctions/arm_offset_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_offset_q15.c
 * Description:  Q15 vector offset
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,96 +29,93 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup offset
- * @{
+  @addtogroup BasicOffset
+  @{
 */

 /**
- * @brief  Adds a constant offset to a Q15 vector.
- * @param[in]  *pSrc points to the input vector
- * @param[in]  offset is the offset to be added
- * @param[out]  *pDst points to the output vector
- * @param[in]  blockSize number of samples in the vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+  @brief         Adds a constant offset to a Q15 vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     offset     is the offset to be added
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */

 void arm_offset_q15(
-  q15_t * pSrc,
-  q15_t offset,
-  q15_t * pDst,
-  uint32_t blockSize)
+  const q15_t * pSrc,
+        q15_t offset,
+        q15_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)

 #if defined (ARM_MATH_DSP)
-
-/* Run the below code for Cortex-M4 and Cortex-M3 */
  q31_t offset_packed;                           /* Offset packed to 32 bit */

-
-  /*loop Unrolling */
-  blkCnt = blockSize >> 2U;
-
  /* Offset is packed to 32 bit in order to use SIMD32 for addition */
  offset_packed = __PKHBT(offset, offset, 16);
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A + offset */
-    /* Add offset and then store the results in the destination buffer, 2 samples at a time. */
-    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offset_packed);
-    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offset_packed);

-    /* Decrement the loop counter */
+#if defined (ARM_MATH_DSP)
+    /* Add offset and store result in destination buffer (2 samples at a time). */
+    write_q15x2_ia (&pDst, __QADD16(read_q15x2_ia ((q15_t **) &pSrc), offset_packed));
+    write_q15x2_ia (&pDst, __QADD16(read_q15x2_ia ((q15_t **) &pSrc), offset_packed));
+#else
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C = A + offset */
-    /* Add offset and then store the results in the destination buffer. */
-    *pDst++ = (q15_t) __QADD16(*pSrc++, offset);
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
  while (blkCnt > 0U)
  {
    /* C = A + offset */
-    /* Add offset and then store the results in the destination buffer. */
-    *pDst++ = (q15_t) __SSAT(((q31_t) * pSrc++ + offset), 16);

-    /* Decrement the loop counter */
+    /* Add offset and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (q15_t) __QADD16(*pSrc++, offset);
+#else
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of offset group
+  @} end of BasicOffset group
 */
--- a/DSP/Source/BasicMathFunctions/arm_offset_q31.c
+++ b/DSP/Source/BasicMathFunctions/arm_offset_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_offset_q31.c
 * Description:  Q31 vector offset
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,100 +29,100 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup offset
- * @{
+  @addtogroup BasicOffset
+  @{
 */

 /**
- * @brief  Adds a constant offset to a Q31 vector.
- * @param[in]  *pSrc points to the input vector
- * @param[in]  offset is the offset to be added
- * @param[out]  *pDst points to the output vector
- * @param[in]  blockSize number of samples in the vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
+  @brief         Adds a constant offset to a Q31 vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     offset     is the offset to be added
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */

 void arm_offset_q31(
-  q31_t * pSrc,
-  q31_t offset,
-  q31_t * pDst,
-  uint32_t blockSize)
+  const q31_t * pSrc,
+        q31_t offset,
+        q31_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  q31_t in1, in2, in3, in4;
-
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A + offset */
-    /* Add offset and then store the results in the destination buffer. */
-    in1 = *pSrc++;
-    in2 = *pSrc++;
-    in3 = *pSrc++;
-    in4 = *pSrc++;

-    *pDst++ = __QADD(in1, offset);
-    *pDst++ = __QADD(in2, offset);
-    *pDst++ = __QADD(in3, offset);
-    *pDst++ = __QADD(in4, offset);
+    /* Add offset and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QADD(*pSrc++, offset);
+#else
+    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
+#endif

-    /* Decrement the loop counter */
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QADD(*pSrc++, offset);
+#else
+    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
+#endif
+
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QADD(*pSrc++, offset);
+#else
+    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
+#endif
+
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QADD(*pSrc++, offset);
+#else
+    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C = A + offset */
-    /* Add offset and then store the result in the destination buffer. */
-    *pDst++ = __QADD(*pSrc++, offset);
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
  while (blkCnt > 0U)
  {
    /* C = A + offset */
-    /* Add offset and then store the result in the destination buffer. */
-    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);

-    /* Decrement the loop counter */
+    /* Add offset and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QADD(*pSrc++, offset);
+#else
+    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of offset group
+  @} end of BasicOffset group
 */
--- a/DSP/Source/BasicMathFunctions/arm_offset_q7.c
+++ b/DSP/Source/BasicMathFunctions/arm_offset_q7.c
@ -3,13 +3,13 @@
 * Title:        arm_offset_q7.c
 * Description:  Q7 vector offset
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,95 +29,88 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup offset
- * @{
+  @addtogroup BasicOffset
+  @{
 */

 /**
- * @brief  Adds a constant offset to a Q7 vector.
- * @param[in]  *pSrc points to the input vector
- * @param[in]  offset is the offset to be added
- * @param[out]  *pDst points to the output vector
- * @param[in]  blockSize number of samples in the vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
+  @brief         Adds a constant offset to a Q7 vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     offset     is the offset to be added
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */

 void arm_offset_q7(
-  q7_t * pSrc,
-  q7_t offset,
-  q7_t * pDst,
-  uint32_t blockSize)
+  const q7_t * pSrc,
+        q7_t offset,
+        q7_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)

 #if defined (ARM_MATH_DSP)
-
-/* Run the below code for Cortex-M4 and Cortex-M3 */
  q31_t offset_packed;                           /* Offset packed to 32 bit */

-
-  /*loop Unrolling */
-  blkCnt = blockSize >> 2U;
-
  /* Offset is packed to 32 bit in order to use SIMD32 for addition */
  offset_packed = __PACKq7(offset, offset, offset, offset);
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A + offset */
-    /* Add offset and then store the results in the destination bufferfor 4 samples at a time. */
-    *__SIMD32(pDst)++ = __QADD8(*__SIMD32(pSrc)++, offset_packed);

-    /* Decrement the loop counter */
+#if defined (ARM_MATH_DSP)
+    /* Add offset and store result in destination buffer (4 samples at a time). */
+    write_q7x4_ia (&pDst, __QADD8(read_q7x4_ia ((q7_t **) &pSrc), offset_packed));
+#else
+    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
+    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
+    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
+    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C = A + offset */
-    /* Add offset and then store the result in the destination buffer. */
-    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
  while (blkCnt > 0U)
  {
    /* C = A + offset */
-    /* Add offset and then store the result in the destination buffer. */
-    *pDst++ = (q7_t) __SSAT((q15_t) * pSrc++ + offset, 8);

-    /* Decrement the loop counter */
+    /* Add offset and store result in destination buffer. */
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of offset group
+  @} end of BasicOffset group
 */
--- a/DSP/Source/BasicMathFunctions/arm_scale_f32.c
+++ b/DSP/Source/BasicMathFunctions/arm_scale_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_scale_f32.c
 * Description:  Multiplies a floating-point vector by a scalar
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,129 +29,131 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @defgroup scale Vector Scale
- *
- * Multiply a vector by a scalar value.  For floating-point data, the algorithm used is:
- *
- * <pre>
- *     pDst[n] = pSrc[n] * scale,   0 <= n < blockSize.
- * </pre>
- *
- * In the fixed-point Q7, Q15, and Q31 functions, <code>scale</code> is represented by
- * a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
- * The shift allows the gain of the scaling operation to exceed 1.0.
- * The algorithm used with fixed-point data is:
- *
- * <pre>
- *     pDst[n] = (pSrc[n] * scaleFract) << shift,   0 <= n < blockSize.
- * </pre>
- *
- * The overall scale factor applied to the fixed-point data is
- * <pre>
- *     scale = scaleFract * 2^shift.
- * </pre>
- *
- * The functions support in-place computation allowing the source and destination
- * pointers to reference the same memory buffer.
+  @defgroup BasicScale Vector Scale
+
+  Multiply a vector by a scalar value.  For floating-point data, the algorithm used is:
+
+  <pre>
+      pDst[n] = pSrc[n] * scale,   0 <= n < blockSize.
+  </pre>
+
+  In the fixed-point Q7, Q15, and Q31 functions, <code>scale</code> is represented by
+  a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
+  The shift allows the gain of the scaling operation to exceed 1.0.
+  The algorithm used with fixed-point data is:
+
+  <pre>
+      pDst[n] = (pSrc[n] * scaleFract) << shift,   0 <= n < blockSize.
+  </pre>
+
+  The overall scale factor applied to the fixed-point data is
+  <pre>
+      scale = scaleFract * 2^shift.
+  </pre>
+
+  The functions support in-place computation allowing the source and destination
+  pointers to reference the same memory buffer.
 */

 /**
- * @addtogroup scale
- * @{
+  @addtogroup BasicScale
+  @{
 */

 /**
- * @brief Multiplies a floating-point vector by a scalar.
- * @param[in]       *pSrc points to the input vector
- * @param[in]       scale scale factor to be applied
- * @param[out]      *pDst points to the output vector
- * @param[in]       blockSize number of samples in the vector
- * @return none.
+  @brief         Multiplies a floating-point vector by a scalar.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     scale      scale factor to be applied
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
 */

-
 void arm_scale_f32(
-  float32_t * pSrc,
-  float32_t scale,
-  float32_t * pDst,
-  uint32_t blockSize)
+  const float32_t *pSrc,
+        float32_t scale,
+        float32_t *pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
-#if defined (ARM_MATH_DSP)
+  uint32_t blkCnt;                               /* Loop counter */
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+    float32x4_t vec1;
+    float32x4_t res;

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  float32_t in1, in2, in3, in4;                  /* temporary variabels */
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;

-  /*loop Unrolling */
+    while (blkCnt > 0U)
+    {
+        /* C = A * scale */
+
+    	/* Scale the input and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrc);
+        res = vmulq_f32(vec1, vdupq_n_f32(scale));
+        vst1q_f32(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4; 
+        pDst += 4;
+        
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A * scale */
-    /* Scale the input and then store the results in the destination buffer. */
-    /* read input samples from source */
-    in1 = *pSrc;
-    in2 = *(pSrc + 1);

-    /* multiply with scaling factor */
-    in1 = in1 * scale;
+    /* Scale input and store result in destination buffer. */
+    *pDst++ = (*pSrc++) * scale;

-    /* read input sample from source */
-    in3 = *(pSrc + 2);
+    *pDst++ = (*pSrc++) * scale;

-    /* multiply with scaling factor */
-    in2 = in2 * scale;
+    *pDst++ = (*pSrc++) * scale;

-    /* read input sample from source */
-    in4 = *(pSrc + 3);
+    *pDst++ = (*pSrc++) * scale;

-    /* multiply with scaling factor */
-    in3 = in3 * scale;
-    in4 = in4 * scale;
-    /* store the result to destination */
-    *pDst = in1;
-    *(pDst + 1) = in2;
-    *(pDst + 2) = in3;
-    *(pDst + 3) = in4;
-
-    /* update pointers to process next samples */
-    pSrc += 4U;
-    pDst += 4U;
-
-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */

  while (blkCnt > 0U)
  {
    /* C = A * scale */
-    /* Scale the input and then store the result in the destination buffer. */
+
+    /* Scale input and store result in destination buffer. */
    *pDst++ = (*pSrc++) * scale;

-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of scale group
+  @} end of BasicScale group
 */
--- a/DSP/Source/BasicMathFunctions/arm_scale_q15.c
+++ b/DSP/Source/BasicMathFunctions/arm_scale_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_scale_q15.c
 * Description:  Multiplies a Q15 vector by a scalar
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,66 +29,66 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup scale
- * @{
+  @addtogroup BasicScale
+  @{
 */

 /**
- * @brief Multiplies a Q15 vector by a scalar.
- * @param[in]       *pSrc points to the input vector
- * @param[in]       scaleFract fractional portion of the scale value
- * @param[in]       shift number of bits to shift the result by
- * @param[out]      *pDst points to the output vector
- * @param[in]       blockSize number of samples in the vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
- * These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
- */
+  @brief         Multiplies a Q15 vector by a scalar.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     scaleFract fractional portion of the scale value
+  @param[in]     shift      number of bits to shift the result by
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none

+  @par           Scaling and Overflow Behavior
+                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
+                   These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
+ */

 void arm_scale_q15(
-  q15_t * pSrc,
-  q15_t scaleFract,
-  int8_t shift,
-  q15_t * pDst,
-  uint32_t blockSize)
+  const q15_t *pSrc,
+        q15_t scaleFract,
+        int8_t shift,
+        q15_t *pDst,
+        uint32_t blockSize)
 {
-  int8_t kShift = 15 - shift;                    /* shift to apply after scaling */
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */
+        int8_t kShift = 15 - shift;                    /* Shift to apply after scaling */

+#if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
+  q31_t inA1, inA2;
+  q31_t out1, out2, out3, out4;                  /* Temporary output variables */
+  q15_t in1, in2, in3, in4;                      /* Temporary input variables */
+#endif
+#endif

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  q15_t in1, in2, in3, in4;
-  q31_t inA1, inA2;                              /* Temporary variables */
-  q31_t out1, out2, out3, out4;
+#if defined (ARM_MATH_LOOPUNROLL)

-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
-    /* Reading 2 inputs from memory */
-    inA1 = *__SIMD32(pSrc)++;
-    inA2 = *__SIMD32(pSrc)++;
-
    /* C = A * scale */
-    /* Scale the inputs and then store the 2 results in the destination buffer
+
+#if defined (ARM_MATH_DSP)
+    /* read 2 times 2 samples at a time from source */
+    inA1 = read_q15x2_ia ((q15_t **) &pSrc);
+    inA2 = read_q15x2_ia ((q15_t **) &pSrc);
+
+    /* Scale inputs and store result in temporary variables
     * in single cycle by packing the outputs */
    out1 = (q31_t) ((q15_t) (inA1 >> 16) * scaleFract);
-    out2 = (q31_t) ((q15_t) inA1 * scaleFract);
+    out2 = (q31_t) ((q15_t) (inA1      ) * scaleFract);
    out3 = (q31_t) ((q15_t) (inA2 >> 16) * scaleFract);
-    out4 = (q31_t) ((q15_t) inA2 * scaleFract);
+    out4 = (q31_t) ((q15_t) (inA2      ) * scaleFract);

    /* apply shifting */
    out1 = out1 >> kShift;
@ -102,49 +102,43 @@ void arm_scale_q15(
    in3 = (q15_t) (__SSAT(out3, 16));
    in4 = (q15_t) (__SSAT(out4, 16));

-    /* store the result to destination */
-    *__SIMD32(pDst)++ = __PKHBT(in2, in1, 16);
-    *__SIMD32(pDst)++ = __PKHBT(in4, in3, 16);
+    /* store result to destination */
+    write_q15x2_ia (&pDst, __PKHBT(in2, in1, 16));
+    write_q15x2_ia (&pDst, __PKHBT(in4, in3, 16));
+#else
+    *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
+    *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
+    *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
+    *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
+#endif

-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C = A * scale */
-    /* Scale the input and then store the result in the destination buffer. */
-    *pDst++ = (q15_t) (__SSAT(((*pSrc++) * scaleFract) >> kShift, 16));
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
  while (blkCnt > 0U)
  {
    /* C = A * scale */
-    /* Scale the input and then store the result in the destination buffer. */
-    *pDst++ = (q15_t) (__SSAT(((q31_t) * pSrc++ * scaleFract) >> kShift, 16));

-    /* Decrement the loop counter */
+    /* Scale input and store result in destination buffer. */
+    *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of scale group
+  @} end of BasicScale group
 */
--- a/DSP/Source/BasicMathFunctions/arm_scale_q31.c
+++ b/DSP/Source/BasicMathFunctions/arm_scale_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_scale_q31.c
 * Description:  Multiplies a Q31 vector by a scalar
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,199 +29,163 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup scale
- * @{
+  @addtogroup BasicScale
+  @{
 */

 /**
- * @brief Multiplies a Q31 vector by a scalar.
- * @param[in]       *pSrc points to the input vector
- * @param[in]       scaleFract fractional portion of the scale value
- * @param[in]       shift number of bits to shift the result by
- * @param[out]      *pDst points to the output vector
- * @param[in]       blockSize number of samples in the vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
- * These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
+  @brief         Multiplies a Q31 vector by a scalar.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     scaleFract fractional portion of the scale value
+  @param[in]     shift      number of bits to shift the result by
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
+                   These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
 */

 void arm_scale_q31(
-  q31_t * pSrc,
-  q31_t scaleFract,
-  int8_t shift,
-  q31_t * pDst,
-  uint32_t blockSize)
+  const q31_t *pSrc,
+        q31_t scaleFract,
+        int8_t shift,
+        q31_t *pDst,
+        uint32_t blockSize)
 {
-  int8_t kShift = shift + 1;                     /* Shift to apply after scaling */
-  int8_t sign = (kShift & 0x80);
-  uint32_t blkCnt;                               /* loop counter */
-  q31_t in, out;
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t in, out;                                 /* Temporary variables */
+        int8_t kShift = shift + 1;                     /* Shift to apply after scaling */
+        int8_t sign = (kShift & 0x80);

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-
-  q31_t in1, in2, in3, in4;                      /* temporary input variables */
-  q31_t out1, out2, out3, out4;                  /* temporary output variabels */
-
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

  if (sign == 0U)
  {
-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
    while (blkCnt > 0U)
    {
-      /* read four inputs from source */
-      in1 = *pSrc;
-      in2 = *(pSrc + 1);
-      in3 = *(pSrc + 2);
-      in4 = *(pSrc + 3);
+      /* C = A * scale */

-      /* multiply input with scaler value */
-      in1 = ((q63_t) in1 * scaleFract) >> 32;
-      in2 = ((q63_t) in2 * scaleFract) >> 32;
-      in3 = ((q63_t) in3 * scaleFract) >> 32;
-      in4 = ((q63_t) in4 * scaleFract) >> 32;
+      /* Scale input and store result in destination buffer. */
+      in = *pSrc++;                                /* read input from source */
+      in = ((q63_t) in * scaleFract) >> 32;        /* multiply input with scaler value */
+      out = in << kShift;                          /* apply shifting */
+      if (in != (out >> kShift))                   /* saturate the result */
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;                               /* Store result destination */

-      /* apply shifting */
-      out1 = in1 << kShift;
-      out2 = in2 << kShift;
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in << kShift;
+      if (in != (out >> kShift))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;

-      /* saturate the results. */
-      if (in1 != (out1 >> kShift))
-        out1 = 0x7FFFFFFF ^ (in1 >> 31);
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in << kShift;
+      if (in != (out >> kShift))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;

-      if (in2 != (out2 >> kShift))
-        out2 = 0x7FFFFFFF ^ (in2 >> 31);
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in << kShift;
+      if (in != (out >> kShift))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;

-      out3 = in3 << kShift;
-      out4 = in4 << kShift;
-
-      *pDst = out1;
-      *(pDst + 1) = out2;
-
-      if (in3 != (out3 >> kShift))
-        out3 = 0x7FFFFFFF ^ (in3 >> 31);
-
-      if (in4 != (out4 >> kShift))
-        out4 = 0x7FFFFFFF ^ (in4 >> 31);
-
-      /* Store result destination */
-      *(pDst + 2) = out3;
-      *(pDst + 3) = out4;
-
-      /* Update pointers to process next sampels */
-      pSrc += 4U;
-      pDst += 4U;
-
-      /* Decrement the loop counter */
+      /* Decrement loop counter */
      blkCnt--;
    }
-
  }
  else
  {
-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
    while (blkCnt > 0U)
    {
-      /* read four inputs from source */
-      in1 = *pSrc;
-      in2 = *(pSrc + 1);
-      in3 = *(pSrc + 2);
-      in4 = *(pSrc + 3);
+      /* C = A * scale */

-      /* multiply input with scaler value */
-      in1 = ((q63_t) in1 * scaleFract) >> 32;
-      in2 = ((q63_t) in2 * scaleFract) >> 32;
-      in3 = ((q63_t) in3 * scaleFract) >> 32;
-      in4 = ((q63_t) in4 * scaleFract) >> 32;
+      /* Scale input and store result in destination buffer. */
+      in = *pSrc++;                                /* read four inputs from source */
+      in = ((q63_t) in * scaleFract) >> 32;        /* multiply input with scaler value */
+      out = in >> -kShift;                         /* apply shifting */
+      *pDst++ = out;                               /* Store result destination */

-      /* apply shifting */
-      out1 = in1 >> -kShift;
-      out2 = in2 >> -kShift;
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in >> -kShift;
+      *pDst++ = out;

-      out3 = in3 >> -kShift;
-      out4 = in4 >> -kShift;
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in >> -kShift;
+      *pDst++ = out;

-      /* Store result destination */
-      *pDst = out1;
-      *(pDst + 1) = out2;
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in >> -kShift;
+      *pDst++ = out;

-      *(pDst + 2) = out3;
-      *(pDst + 3) = out4;
-
-      /* Update pointers to process next sampels */
-      pSrc += 4U;
-      pDst += 4U;
-
-      /* Decrement the loop counter */
+      /* Decrement loop counter */
      blkCnt--;
    }
  }
-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

-  if (sign == 0)
+  if (sign == 0U)
  {
-	  while (blkCnt > 0U)
-	  {
-		/* C = A * scale */
-		/* Scale the input and then store the result in the destination buffer. */
-		in = *pSrc++;
-		in = ((q63_t) in * scaleFract) >> 32;
+    while (blkCnt > 0U)
+    {
+      /* C = A * scale */

-		out = in << kShift;
+      /* Scale input and store result in destination buffer. */
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in << kShift;
+      if (in != (out >> kShift))
+          out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;

-		if (in != (out >> kShift))
-			out = 0x7FFFFFFF ^ (in >> 31);
-
-		*pDst++ = out;
-
-		/* Decrement the loop counter */
-		blkCnt--;
-	  }
+      /* Decrement loop counter */
+      blkCnt--;
+    }
  }
  else
  {
-	  while (blkCnt > 0U)
-	  {
-		/* C = A * scale */
-		/* Scale the input and then store the result in the destination buffer. */
-		in = *pSrc++;
-		in = ((q63_t) in * scaleFract) >> 32;
+    while (blkCnt > 0U)
+    {
+      /* C = A * scale */

-		out = in >> -kShift;
-
-		*pDst++ = out;
-
-		/* Decrement the loop counter */
-		blkCnt--;
-	  }
+      /* Scale input and store result in destination buffer. */
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in >> -kShift;
+      *pDst++ = out;

+      /* Decrement loop counter */
+      blkCnt--;
+    }
  }
+
 }

 /**
- * @} end of scale group
+  @} end of BasicScale group
 */
--- a/DSP/Source/BasicMathFunctions/arm_scale_q7.c
+++ b/DSP/Source/BasicMathFunctions/arm_scale_q7.c
@ -3,13 +3,13 @@
 * Title:        arm_scale_q7.c
 * Description:  Multiplies a Q7 vector by a scalar
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,109 +29,101 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup scale
- * @{
+  @addtogroup BasicScale
+  @{
 */

 /**
- * @brief Multiplies a Q7 vector by a scalar.
- * @param[in]       *pSrc points to the input vector
- * @param[in]       scaleFract fractional portion of the scale value
- * @param[in]       shift number of bits to shift the result by
- * @param[out]      *pDst points to the output vector
- * @param[in]       blockSize number of samples in the vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.7 format.
- * These are multiplied to yield a 2.14 intermediate result and this is shifted with saturation to 1.7 format.
+  @brief         Multiplies a Q7 vector by a scalar.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     scaleFract fractional portion of the scale value
+  @param[in]     shift      number of bits to shift the result by
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.7 format.
+                   These are multiplied to yield a 2.14 intermediate result and this is shifted with saturation to 1.7 format.
 */

 void arm_scale_q7(
-  q7_t * pSrc,
-  q7_t scaleFract,
-  int8_t shift,
-  q7_t * pDst,
-  uint32_t blockSize)
+  const q7_t * pSrc,
+        q7_t scaleFract,
+        int8_t shift,
+        q7_t * pDst,
+        uint32_t blockSize)
 {
-  int8_t kShift = 7 - shift;                     /* shift to apply after scaling */
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */
+        int8_t kShift = 7 - shift;                     /* Shift to apply after scaling */
+
+#if defined (ARM_MATH_LOOPUNROLL)

 #if defined (ARM_MATH_DSP)
+  q7_t in1,  in2,  in3,  in4;                    /* Temporary input variables */
+  q7_t out1, out2, out3, out4;                   /* Temporary output variables */
+#endif

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  q7_t in1, in2, in3, in4, out1, out2, out3, out4;      /* Temporary variables to store input & output */
-
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-
-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
+    /* C = A * scale */
+
+#if defined (ARM_MATH_DSP)
    /* Reading 4 inputs from memory */
    in1 = *pSrc++;
    in2 = *pSrc++;
    in3 = *pSrc++;
    in4 = *pSrc++;

-    /* C = A * scale */
-    /* Scale the inputs and then store the results in the temporary variables. */
+    /* Scale inputs and store result in the temporary variable. */
    out1 = (q7_t) (__SSAT(((in1) * scaleFract) >> kShift, 8));
    out2 = (q7_t) (__SSAT(((in2) * scaleFract) >> kShift, 8));
    out3 = (q7_t) (__SSAT(((in3) * scaleFract) >> kShift, 8));
    out4 = (q7_t) (__SSAT(((in4) * scaleFract) >> kShift, 8));

-    /* Packing the individual outputs into 32bit and storing in
-     * destination buffer in single write */
-    *__SIMD32(pDst)++ = __PACKq7(out1, out2, out3, out4);
+    /* Pack and store result in destination buffer (in single write) */
+    write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4));
+#else
+    *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
+    *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
+    *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
+    *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
+#endif

-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C = A * scale */
-    /* Scale the input and then store the result in the destination buffer. */
-    *pDst++ = (q7_t) (__SSAT(((*pSrc++) * scaleFract) >> kShift, 8));
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
  while (blkCnt > 0U)
  {
    /* C = A * scale */
-    /* Scale the input and then store the result in the destination buffer. */
-    *pDst++ = (q7_t) (__SSAT((((q15_t) * pSrc++ * scaleFract) >> kShift), 8));

-    /* Decrement the loop counter */
+    /* Scale input and store result in destination buffer. */
+    *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of scale group
+  @} end of BasicScale group
 */
--- a/DSP/Source/BasicMathFunctions/arm_shift_q15.c
+++ b/DSP/Source/BasicMathFunctions/arm_shift_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_shift_q15.c
 * Description:  Shifts the elements of a Q15 vector by a specified number of bits
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,208 +29,173 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup shift
- * @{
+  @addtogroup BasicShift
+  @{
 */

 /**
- * @brief  Shifts the elements of a Q15 vector a specified number of bits.
- * @param[in]  *pSrc points to the input vector
- * @param[in]  shiftBits number of bits to shift.  A positive value shifts left; a negative value shifts right.
- * @param[out]  *pDst points to the output vector
- * @param[in]  blockSize number of samples in the vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+  @brief         Shifts the elements of a Q15 vector a specified number of bits
+  @param[in]     pSrc       points to the input vector
+  @param[in]     shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */

 void arm_shift_q15(
-  q15_t * pSrc,
-  int8_t shiftBits,
-  q15_t * pDst,
-  uint32_t blockSize)
+  const q15_t * pSrc,
+        int8_t shiftBits,
+        q15_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
-  uint8_t sign;                                  /* Sign of shiftBits */
+        uint32_t blkCnt;                               /* Loop counter */
+        uint8_t sign = (shiftBits & 0x80);             /* Sign of shiftBits */
+
+#if defined (ARM_MATH_LOOPUNROLL)

 #if defined (ARM_MATH_DSP)
+  q15_t in1, in2;                                /* Temporary input variables */
+#endif

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-
-  q15_t in1, in2;                                /* Temporary variables */
-
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* Getting the sign of shiftBits */
-  sign = (shiftBits & 0x80);
-
  /* If the shift value is positive then do right shift else left shift */
  if (sign == 0U)
  {
-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
    while (blkCnt > 0U)
    {
-      /* Read 2 inputs */
+      /* C = A << shiftBits */
+
+#if defined (ARM_MATH_DSP)
+      /* read 2 samples from source */
      in1 = *pSrc++;
      in2 = *pSrc++;
-      /* C = A << shiftBits */
+
      /* Shift the inputs and then store the results in the destination buffer. */
-#ifndef  ARM_MATH_BIG_ENDIAN
-
-      *__SIMD32(pDst)++ = __PKHBT(__SSAT((in1 << shiftBits), 16),
-                                  __SSAT((in2 << shiftBits), 16), 16);
-
+#ifndef ARM_MATH_BIG_ENDIAN
+      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in1 << shiftBits), 16),
+                                     __SSAT((in2 << shiftBits), 16), 16));
 #else
+      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in2 << shiftBits), 16),
+                                      __SSAT((in1 << shiftBits), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

-      *__SIMD32(pDst)++ = __PKHBT(__SSAT((in2 << shiftBits), 16),
-                                  __SSAT((in1 << shiftBits), 16), 16);
-
-#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */
-
+      /* read 2 samples from source */
      in1 = *pSrc++;
      in2 = *pSrc++;

-#ifndef  ARM_MATH_BIG_ENDIAN
-
-      *__SIMD32(pDst)++ = __PKHBT(__SSAT((in1 << shiftBits), 16),
-                                  __SSAT((in2 << shiftBits), 16), 16);
+#ifndef ARM_MATH_BIG_ENDIAN
+      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in1 << shiftBits), 16),
+                                     __SSAT((in2 << shiftBits), 16), 16));
+#else
+      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in2 << shiftBits), 16),
+                                     __SSAT((in1 << shiftBits), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

 #else
+      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
+      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
+      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
+      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
+#endif

-      *__SIMD32(pDst)++ = __PKHBT(__SSAT((in2 << shiftBits), 16),
-                                  __SSAT((in1 << shiftBits), 16), 16);
-
-#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */
-
-      /* Decrement the loop counter */
-      blkCnt--;
-    }
-
-    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-     ** No loop unrolling is used. */
-    blkCnt = blockSize % 0x4U;
-
-    while (blkCnt > 0U)
-    {
-      /* C = A << shiftBits */
-      /* Shift and then store the results in the destination buffer. */
-      *pDst++ = __SSAT((*pSrc++ << shiftBits), 16);
-
-      /* Decrement the loop counter */
+      /* Decrement loop counter */
      blkCnt--;
    }
  }
  else
  {
-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
-    while (blkCnt > 0U)
-    {
-      /* Read 2 inputs */
-      in1 = *pSrc++;
-      in2 = *pSrc++;
-
-      /* C = A >> shiftBits */
-      /* Shift the inputs and then store the results in the destination buffer. */
-#ifndef  ARM_MATH_BIG_ENDIAN
-
-      *__SIMD32(pDst)++ = __PKHBT((in1 >> -shiftBits),
-                                  (in2 >> -shiftBits), 16);
-
-#else
-
-      *__SIMD32(pDst)++ = __PKHBT((in2 >> -shiftBits),
-                                  (in1 >> -shiftBits), 16);
-
-#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */
-
-      in1 = *pSrc++;
-      in2 = *pSrc++;
-
-#ifndef  ARM_MATH_BIG_ENDIAN
-
-      *__SIMD32(pDst)++ = __PKHBT((in1 >> -shiftBits),
-                                  (in2 >> -shiftBits), 16);
-
-#else
-
-      *__SIMD32(pDst)++ = __PKHBT((in2 >> -shiftBits),
-                                  (in1 >> -shiftBits), 16);
-
-#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */
-
-      /* Decrement the loop counter */
-      blkCnt--;
-    }
-
-    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-     ** No loop unrolling is used. */
-    blkCnt = blockSize % 0x4U;
-
    while (blkCnt > 0U)
    {
      /* C = A >> shiftBits */
+
+#if defined (ARM_MATH_DSP)
+      /* read 2 samples from source */
+      in1 = *pSrc++;
+      in2 = *pSrc++;
+
      /* Shift the inputs and then store the results in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+      write_q15x2_ia (&pDst, __PKHBT((in1 >> -shiftBits),
+                                     (in2 >> -shiftBits), 16));
+#else
+      write_q15x2_ia (&pDst, __PKHBT((in2 >> -shiftBits),
+                                     (in1 >> -shiftBits), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+      /* read 2 samples from source */
+      in1 = *pSrc++;
+      in2 = *pSrc++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+      write_q15x2_ia (&pDst, __PKHBT((in1 >> -shiftBits),
+                                     (in2 >> -shiftBits), 16));
+#else
+      write_q15x2_ia (&pDst, __PKHBT((in2 >> -shiftBits),
+                                     (in1 >> -shiftBits), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+#else
      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+#endif

-      /* Decrement the loop counter */
+      /* Decrement loop counter */
      blkCnt--;
    }
  }

+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
 #else

-  /* Run the below code for Cortex-M0 */
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;

-  /* Getting the sign of shiftBits */
-  sign = (shiftBits & 0x80);
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  /* If the shift value is positive then do right shift else left shift */
  if (sign == 0U)
  {
-    /* Initialize blkCnt with number of samples */
-    blkCnt = blockSize;
-
    while (blkCnt > 0U)
    {
      /* C = A << shiftBits */
-      /* Shift and then store the results in the destination buffer. */
-      *pDst++ = __SSAT(((q31_t) * pSrc++ << shiftBits), 16);

-      /* Decrement the loop counter */
+      /* Shift input and store result in destination buffer. */
+      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
+
+      /* Decrement loop counter */
      blkCnt--;
    }
  }
  else
  {
-    /* Initialize blkCnt with number of samples */
-    blkCnt = blockSize;
-
    while (blkCnt > 0U)
    {
      /* C = A >> shiftBits */
-      /* Shift the inputs and then store the results in the destination buffer. */
+
+      /* Shift input and store result in destination buffer. */
      *pDst++ = (*pSrc++ >> -shiftBits);

-      /* Decrement the loop counter */
+      /* Decrement loop counter */
      blkCnt--;
    }
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of shift group
+  @} end of BasicShift group
 */
--- a/DSP/Source/BasicMathFunctions/arm_shift_q31.c
+++ b/DSP/Source/BasicMathFunctions/arm_shift_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_shift_q31.c
 * Description:  Shifts the elements of a Q31 vector by a specified number of bits
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,163 +29,153 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */
 /**
- * @defgroup shift Vector Shift
- *
- * Shifts the elements of a fixed-point vector by a specified number of bits.
- * There are separate functions for Q7, Q15, and Q31 data types.
- * The underlying algorithm used is:
- *
- * <pre>
- *     pDst[n] = pSrc[n] << shift,   0 <= n < blockSize.
- * </pre>
- *
- * If <code>shift</code> is positive then the elements of the vector are shifted to the left.
- * If <code>shift</code> is negative then the elements of the vector are shifted to the right.
- *
- * The functions support in-place computation allowing the source and destination
- * pointers to reference the same memory buffer.
+  @defgroup BasicShift Vector Shift
+
+  Shifts the elements of a fixed-point vector by a specified number of bits.
+  There are separate functions for Q7, Q15, and Q31 data types.
+  The underlying algorithm used is:
+
+  <pre>
+      pDst[n] = pSrc[n] << shift,   0 <= n < blockSize.
+  </pre>
+
+  If <code>shift</code> is positive then the elements of the vector are shifted to the left.
+  If <code>shift</code> is negative then the elements of the vector are shifted to the right.
+
+  The functions support in-place computation allowing the source and destination
+  pointers to reference the same memory buffer.
 */

 /**
- * @addtogroup shift
- * @{
+  @addtogroup BasicShift
+  @{
 */

 /**
- * @brief  Shifts the elements of a Q31 vector a specified number of bits.
- * @param[in]  *pSrc points to the input vector
- * @param[in]  shiftBits number of bits to shift.  A positive value shifts left; a negative value shifts right.
- * @param[out]  *pDst points to the output vector
- * @param[in]  blockSize number of samples in the vector
- * @return none.
- *
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
+  @brief         Shifts the elements of a Q31 vector a specified number of bits.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in the vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */

 void arm_shift_q31(
-  q31_t * pSrc,
-  int8_t shiftBits,
-  q31_t * pDst,
-  uint32_t blockSize)
+  const q31_t * pSrc,
+        int8_t shiftBits,
+        q31_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
-  uint8_t sign = (shiftBits & 0x80);             /* Sign of shiftBits */
+        uint32_t blkCnt;                               /* Loop counter */
+        uint8_t sign = (shiftBits & 0x80);             /* Sign of shiftBits */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-  q31_t in1, in2, in3, in4;                      /* Temporary input variables */
-  q31_t out1, out2, out3, out4;                  /* Temporary output variables */
+  q31_t in, out;                                 /* Temporary variables */

-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-
+  /* If the shift value is positive then do right shift else left shift */
  if (sign == 0U)
  {
-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
    while (blkCnt > 0U)
    {
-      /* C = A  << shiftBits */
-      /* Shift the input and then store the results in the destination buffer. */
-      in1 = *pSrc;
-      in2 = *(pSrc + 1);
-      out1 = in1 << shiftBits;
-      in3 = *(pSrc + 2);
-      out2 = in2 << shiftBits;
-      in4 = *(pSrc + 3);
-      if (in1 != (out1 >> shiftBits))
-        out1 = 0x7FFFFFFF ^ (in1 >> 31);
+      /* C = A << shiftBits */

-      if (in2 != (out2 >> shiftBits))
-        out2 = 0x7FFFFFFF ^ (in2 >> 31);
+      /* Shift input and store result in destination buffer. */
+      in = *pSrc++;
+      out = in << shiftBits;
+      if (in != (out >> shiftBits))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;

-      *pDst = out1;
-      out3 = in3 << shiftBits;
-      *(pDst + 1) = out2;
-      out4 = in4 << shiftBits;
+      in = *pSrc++;
+      out = in << shiftBits;
+      if (in != (out >> shiftBits))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;

-      if (in3 != (out3 >> shiftBits))
-        out3 = 0x7FFFFFFF ^ (in3 >> 31);
+      in = *pSrc++;
+      out = in << shiftBits;
+      if (in != (out >> shiftBits))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;

-      if (in4 != (out4 >> shiftBits))
-        out4 = 0x7FFFFFFF ^ (in4 >> 31);
+      in = *pSrc++;
+      out = in << shiftBits;
+      if (in != (out >> shiftBits))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;

-      *(pDst + 2) = out3;
-      *(pDst + 3) = out4;
-
-      /* Update destination pointer to process next sampels */
-      pSrc += 4U;
-      pDst += 4U;
-
-      /* Decrement the loop counter */
+      /* Decrement loop counter */
      blkCnt--;
    }
  }
  else
  {
-
-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
    while (blkCnt > 0U)
    {
-      /* C = A >>  shiftBits */
-      /* Shift the input and then store the results in the destination buffer. */
-      in1 = *pSrc;
-      in2 = *(pSrc + 1);
-      in3 = *(pSrc + 2);
-      in4 = *(pSrc + 3);
+      /* C = A >> shiftBits */

-      *pDst = (in1 >> -shiftBits);
-      *(pDst + 1) = (in2 >> -shiftBits);
-      *(pDst + 2) = (in3 >> -shiftBits);
-      *(pDst + 3) = (in4 >> -shiftBits);
-
-
-      pSrc += 4U;
-      pDst += 4U;
+      /* Shift input and store results in destination buffer. */
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);

+      /* Decrement loop counter */
      blkCnt--;
    }
-
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

-
-  while (blkCnt > 0U)
+  /* If the shift value is positive then do right shift else left shift */
+  if (sign == 0U)
  {
-    /* C = A (>> or <<) shiftBits */
-    /* Shift the input and then store the result in the destination buffer. */
-    *pDst++ = (sign == 0U) ? clip_q63_to_q31((q63_t) * pSrc++ << shiftBits) :
-      (*pSrc++ >> -shiftBits);
+    while (blkCnt > 0U)
+    {
+      /* C = A << shiftBits */

-    /* Decrement the loop counter */
-    blkCnt--;
+      /* Shift input and store result in destination buffer. */
+      *pDst++ = clip_q63_to_q31((q63_t) *pSrc++ << shiftBits);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A >> shiftBits */

+      /* Shift input and store result in destination buffer. */
+      *pDst++ = (*pSrc++ >> -shiftBits);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }

 }

 /**
- * @} end of shift group
+  @} end of BasicShift group
 */
--- a/DSP/Source/BasicMathFunctions/arm_shift_q7.c
+++ b/DSP/Source/BasicMathFunctions/arm_shift_q7.c
@ -3,13 +3,13 @@
 * Title:        arm_shift_q7.c
 * Description:  Processing function for the Q7 Shifting
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,180 +29,147 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup shift
- * @{
+  @addtogroup BasicShift
+  @{
 */

-
 /**
- * @brief  Shifts the elements of a Q7 vector a specified number of bits.
- * @param[in]  *pSrc points to the input vector
- * @param[in]  shiftBits number of bits to shift.  A positive value shifts left; a negative value shifts right.
- * @param[out]  *pDst points to the output vector
- * @param[in]  blockSize number of samples in the vector
- * @return none.
- *
- * \par Conditions for optimum performance
- *  Input and output buffers should be aligned by 32-bit
- *
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q7 range [0x8 0x7F] will be saturated.
+  @brief         Shifts the elements of a Q7 vector a specified number of bits
+  @param[in]     pSrc       points to the input vector
+  @param[in]     shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           onditions for optimum performance
+                   Input and output buffers should be aligned by 32-bit
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */

 void arm_shift_q7(
-  q7_t * pSrc,
-  int8_t shiftBits,
-  q7_t * pDst,
-  uint32_t blockSize)
+  const q7_t * pSrc,
+        int8_t shiftBits,
+        q7_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
-  uint8_t sign;                                  /* Sign of shiftBits */
+        uint32_t blkCnt;                               /* Loop counter */
+        uint8_t sign = (shiftBits & 0x80);             /* Sign of shiftBits */
+
+#if defined (ARM_MATH_LOOPUNROLL)

 #if defined (ARM_MATH_DSP)
+  q7_t in1,  in2,  in3,  in4;                    /* Temporary input variables */
+#endif

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  q7_t in1;                                      /* Input value1 */
-  q7_t in2;                                      /* Input value2 */
-  q7_t in3;                                      /* Input value3 */
-  q7_t in4;                                      /* Input value4 */
-
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* Getting the sign of shiftBits */
-  sign = (shiftBits & 0x80);
-
  /* If the shift value is positive then do right shift else left shift */
  if (sign == 0U)
  {
-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
    while (blkCnt > 0U)
    {
      /* C = A << shiftBits */
+
+#if defined (ARM_MATH_DSP)
      /* Read 4 inputs */
-      in1 = *pSrc;
-      in2 = *(pSrc + 1);
-      in3 = *(pSrc + 2);
-      in4 = *(pSrc + 3);
+      in1 = *pSrc++;
+      in2 = *pSrc++;
+      in3 = *pSrc++;
+      in4 = *pSrc++;

-      /* Store the Shifted result in the destination buffer in single cycle by packing the outputs */
-      *__SIMD32(pDst)++ = __PACKq7(__SSAT((in1 << shiftBits), 8),
-                                   __SSAT((in2 << shiftBits), 8),
-                                   __SSAT((in3 << shiftBits), 8),
-                                   __SSAT((in4 << shiftBits), 8));
-      /* Update source pointer to process next sampels */
-      pSrc += 4U;
+    /* Pack and store result in destination buffer (in single write) */
+      write_q7x4_ia (&pDst, __PACKq7(__SSAT((in1 << shiftBits), 8),
+                                     __SSAT((in2 << shiftBits), 8),
+                                     __SSAT((in3 << shiftBits), 8),
+                                     __SSAT((in4 << shiftBits), 8) ));
+#else
+      *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
+      *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
+      *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
+      *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
+#endif

-      /* Decrement the loop counter */
-      blkCnt--;
-    }
-
-    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-     ** No loop unrolling is used. */
-    blkCnt = blockSize % 0x4U;
-
-    while (blkCnt > 0U)
-    {
-      /* C = A << shiftBits */
-      /* Shift the input and then store the result in the destination buffer. */
-      *pDst++ = (q7_t) __SSAT((*pSrc++ << shiftBits), 8);
-
-      /* Decrement the loop counter */
+      /* Decrement loop counter */
      blkCnt--;
    }
  }
  else
  {
-    shiftBits = -shiftBits;
-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
    while (blkCnt > 0U)
    {
      /* C = A >> shiftBits */
+
+#if defined (ARM_MATH_DSP)
      /* Read 4 inputs */
-      in1 = *pSrc;
-      in2 = *(pSrc + 1);
-      in3 = *(pSrc + 2);
-      in4 = *(pSrc + 3);
-
-      /* Store the Shifted result in the destination buffer in single cycle by packing the outputs */
-      *__SIMD32(pDst)++ = __PACKq7((in1 >> shiftBits), (in2 >> shiftBits),
-                                   (in3 >> shiftBits), (in4 >> shiftBits));
-
-
-      pSrc += 4U;
-
-      /* Decrement the loop counter */
-      blkCnt--;
-    }
-
-    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-     ** No loop unrolling is used. */
-    blkCnt = blockSize % 0x4U;
-
-    while (blkCnt > 0U)
-    {
-      /* C = A >> shiftBits */
-      /* Shift the input and then store the result in the destination buffer. */
      in1 = *pSrc++;
-      *pDst++ = (in1 >> shiftBits);
+      in2 = *pSrc++;
+      in3 = *pSrc++;
+      in4 = *pSrc++;

-      /* Decrement the loop counter */
+    /* Pack and store result in destination buffer (in single write) */
+      write_q7x4_ia (&pDst, __PACKq7((in1 >> -shiftBits),
+                                     (in2 >> -shiftBits),
+                                     (in3 >> -shiftBits),
+                                     (in4 >> -shiftBits) ));
+#else
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+#endif
+
+      /* Decrement loop counter */
      blkCnt--;
    }
  }

+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
 #else

-  /* Run the below code for Cortex-M0 */
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;

-  /* Getting the sign of shiftBits */
-  sign = (shiftBits & 0x80);
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  /* If the shift value is positive then do right shift else left shift */
  if (sign == 0U)
  {
-    /* Initialize blkCnt with number of samples */
-    blkCnt = blockSize;
-
    while (blkCnt > 0U)
    {
      /* C = A << shiftBits */
-      /* Shift the input and then store the result in the destination buffer. */
-      *pDst++ = (q7_t) __SSAT(((q15_t) * pSrc++ << shiftBits), 8);

-      /* Decrement the loop counter */
+      /* Shift input and store result in destination buffer. */
+      *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
+
+      /* Decrement loop counter */
      blkCnt--;
    }
  }
  else
  {
-    /* Initialize blkCnt with number of samples */
-    blkCnt = blockSize;
-
    while (blkCnt > 0U)
    {
      /* C = A >> shiftBits */
-      /* Shift the input and then store the result in the destination buffer. */
+
+      /* Shift input and store result in destination buffer. */
      *pDst++ = (*pSrc++ >> -shiftBits);

-      /* Decrement the loop counter */
+      /* Decrement loop counter */
      blkCnt--;
    }
  }

-#endif /* #if defined (ARM_MATH_DSP) */
 }

 /**
- * @} end of shift group
+  @} end of BasicShift group
 */
--- a/DSP/Source/BasicMathFunctions/arm_sub_f32.c
+++ b/DSP/Source/BasicMathFunctions/arm_sub_f32.c
@ -1,15 +1,15 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_sub_f32.c
- * Description:  Floating-point vector subtraction.
+ * Description:  Floating-point vector subtraction
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,110 +29,120 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @defgroup BasicSub Vector Subtraction
- *
- * Element-by-element subtraction of two vectors.
- *
- * <pre>
- *     pDst[n] = pSrcA[n] - pSrcB[n],   0 <= n < blockSize.
- * </pre>
- *
- * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+  @defgroup BasicSub Vector Subtraction
+
+  Element-by-element subtraction of two vectors.
+
+  <pre>
+      pDst[n] = pSrcA[n] - pSrcB[n],   0 <= n < blockSize.
+  </pre>
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */

 /**
- * @addtogroup BasicSub
- * @{
+  @addtogroup BasicSub
+  @{
 */

-
 /**
- * @brief Floating-point vector subtraction.
- * @param[in]       *pSrcA points to the first input vector
- * @param[in]       *pSrcB points to the second input vector
- * @param[out]      *pDst points to the output vector
- * @param[in]       blockSize number of samples in each vector
- * @return none.
+  @brief         Floating-point vector subtraction.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
 */

 void arm_sub_f32(
-  float32_t * pSrcA,
-  float32_t * pSrcB,
-  float32_t * pDst,
-  uint32_t blockSize)
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */

-#if defined (ARM_MATH_DSP)
+#if defined(ARM_MATH_NEON)
+    float32x4_t vec1;
+    float32x4_t vec2;
+    float32x4_t res;

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  float32_t inA1, inA2, inA3, inA4;              /* temporary variables */
-  float32_t inB1, inB2, inB3, inB4;              /* temporary variables */
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;

-  /*loop Unrolling */
+    while (blkCnt > 0U)
+    {
+        /* C = A - B */
+
+        /* Subtract and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrcA);
+        vec2 = vld1q_f32(pSrcB);
+        res = vsubq_f32(vec1, vec2);
+        vst1q_f32(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4; 
+        pDst += 4;
+        
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A - B */
-    /* Subtract and then store the results in the destination buffer. */
-    /* Read 4 input samples from sourceA and sourceB */
-    inA1 = *pSrcA;
-    inB1 = *pSrcB;
-    inA2 = *(pSrcA + 1);
-    inB2 = *(pSrcB + 1);
-    inA3 = *(pSrcA + 2);
-    inB3 = *(pSrcB + 2);
-    inA4 = *(pSrcA + 3);
-    inB4 = *(pSrcB + 3);

-    /* dst = srcA - srcB */
-    /* subtract and store the result */
-    *pDst = inA1 - inB1;
-    *(pDst + 1) = inA2 - inB2;
-    *(pDst + 2) = inA3 - inB3;
-    *(pDst + 3) = inA4 - inB4;
+    /* Subtract and store result in destination buffer. */
+    *pDst++ = (*pSrcA++) - (*pSrcB++);

+    *pDst++ = (*pSrcA++) - (*pSrcB++);

-    /* Update pointers to process next sampels */
-    pSrcA += 4U;
-    pSrcB += 4U;
-    pDst += 4U;
+    *pDst++ = (*pSrcA++) - (*pSrcB++);

-    /* Decrement the loop counter */
+    *pDst++ = (*pSrcA++) - (*pSrcB++);
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */

  while (blkCnt > 0U)
  {
    /* C = A - B */
-    /* Subtract and then store the results in the destination buffer. */
+
+    /* Subtract and store result in destination buffer. */
    *pDst++ = (*pSrcA++) - (*pSrcB++);

-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of BasicSub group
+  @} end of BasicSub group
 */
--- a/DSP/Source/BasicMathFunctions/arm_sub_q15.c
+++ b/DSP/Source/BasicMathFunctions/arm_sub_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_sub_q15.c
 * Description:  Q15 vector subtraction
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,100 +29,98 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup BasicSub
- * @{
+  @addtogroup BasicSub
+  @{
 */

 /**
- * @brief Q15 vector subtraction.
- * @param[in]       *pSrcA points to the first input vector
- * @param[in]       *pSrcB points to the second input vector
- * @param[out]      *pDst points to the output vector
- * @param[in]       blockSize number of samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+  @brief         Q15 vector subtraction.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */

 void arm_sub_q15(
-  q15_t * pSrcA,
-  q15_t * pSrcB,
-  q15_t * pDst,
-  uint32_t blockSize)
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */

+#if defined (ARM_MATH_LOOPUNROLL)

 #if defined (ARM_MATH_DSP)
-
-/* Run the below code for Cortex-M4 and Cortex-M3 */
  q31_t inA1, inA2;
  q31_t inB1, inB2;
+#endif

-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A - B */
-    /* Subtract and then store the results in the destination buffer two samples at a time. */
-    inA1 = *__SIMD32(pSrcA)++;
-    inA2 = *__SIMD32(pSrcA)++;
-    inB1 = *__SIMD32(pSrcB)++;
-    inB2 = *__SIMD32(pSrcB)++;

-    *__SIMD32(pDst)++ = __QSUB16(inA1, inB1);
-    *__SIMD32(pDst)++ = __QSUB16(inA2, inB2);
+#if defined (ARM_MATH_DSP)
+    /* read 2 times 2 samples at a time from sourceA */
+    inA1 = read_q15x2_ia ((q15_t **) &pSrcA);
+    inA2 = read_q15x2_ia ((q15_t **) &pSrcA);
+    /* read 2 times 2 samples at a time from sourceB */
+    inB1 = read_q15x2_ia ((q15_t **) &pSrcB);
+    inB2 = read_q15x2_ia ((q15_t **) &pSrcB);

-    /* Decrement the loop counter */
+    /* Subtract and store 2 times 2 samples at a time */
+    write_q15x2_ia (&pDst, __QSUB16(inA1, inB1));
+    write_q15x2_ia (&pDst, __QSUB16(inA2, inB2));
+#else
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C = A - B */
-    /* Subtract and then store the result in the destination buffer. */
-    *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
  while (blkCnt > 0U)
  {
    /* C = A - B */
-    /* Subtract and then store the result in the destination buffer. */
-    *pDst++ = (q15_t) __SSAT(((q31_t) * pSrcA++ - *pSrcB++), 16);

-    /* Decrement the loop counter */
+    /* Subtract and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
+#else
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
-
 }

 /**
- * @} end of BasicSub group
+  @} end of BasicSub group
 */
--- a/DSP/Source/BasicMathFunctions/arm_sub_q31.c
+++ b/DSP/Source/BasicMathFunctions/arm_sub_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_sub_q31.c
 * Description:  Q31 vector subtraction
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,106 +29,80 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup BasicSub
- * @{
+  @addtogroup BasicSub
+  @{
 */

 /**
- * @brief Q31 vector subtraction.
- * @param[in]       *pSrcA points to the first input vector
- * @param[in]       *pSrcB points to the second input vector
- * @param[out]      *pDst points to the output vector
- * @param[in]       blockSize number of samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
+  @brief         Q31 vector subtraction.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */

 void arm_sub_q31(
-  q31_t * pSrcA,
-  q31_t * pSrcB,
-  q31_t * pDst,
-  uint32_t blockSize)
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */

+#if defined (ARM_MATH_LOOPUNROLL)

-#if defined (ARM_MATH_DSP)
-
-/* Run the below code for Cortex-M4 and Cortex-M3 */
-  q31_t inA1, inA2, inA3, inA4;
-  q31_t inB1, inB2, inB3, inB4;
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A - B */
-    /* Subtract and then store the results in the destination buffer. */
-    inA1 = *pSrcA++;
-    inA2 = *pSrcA++;
-    inB1 = *pSrcB++;
-    inB2 = *pSrcB++;

-    inA3 = *pSrcA++;
-    inA4 = *pSrcA++;
-    inB3 = *pSrcB++;
-    inB4 = *pSrcB++;
-
-    *pDst++ = __QSUB(inA1, inB1);
-    *pDst++ = __QSUB(inA2, inB2);
-    *pDst++ = __QSUB(inA3, inB3);
-    *pDst++ = __QSUB(inA4, inB4);
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
-  blkCnt = blockSize % 0x4U;
-
-  while (blkCnt > 0U)
-  {
-    /* C = A - B */
-    /* Subtract and then store the result in the destination buffer. */
+    /* Subtract and store result in destination buffer. */
    *pDst++ = __QSUB(*pSrcA++, *pSrcB++);

-    /* Decrement the loop counter */
+    *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
+
+    *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
+
+    *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-#else
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;

-  /* Run the below code for Cortex-M0 */
+#else

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
  while (blkCnt > 0U)
  {
    /* C = A - B */
-    /* Subtract and then store the result in the destination buffer. */
-    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrcA++ - *pSrcB++);

-    /* Decrement the loop counter */
+    /* Subtract and store result in destination buffer. */
+    *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of BasicSub group
+  @} end of BasicSub group
 */
--- a/DSP/Source/BasicMathFunctions/arm_sub_q7.c
+++ b/DSP/Source/BasicMathFunctions/arm_sub_q7.c
@ -3,13 +3,13 @@
 * Title:        arm_sub_q7.c
 * Description:  Q7 vector subtraction
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,91 +29,81 @@
 #include "arm_math.h"

 /**
- * @ingroup groupMath
+  @ingroup groupMath
 */

 /**
- * @addtogroup BasicSub
- * @{
+  @addtogroup BasicSub
+  @{
 */

 /**
- * @brief Q7 vector subtraction.
- * @param[in]       *pSrcA points to the first input vector
- * @param[in]       *pSrcB points to the second input vector
- * @param[out]      *pDst points to the output vector
- * @param[in]       blockSize number of samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
+  @brief         Q7 vector subtraction.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
 */

 void arm_sub_q7(
-  q7_t * pSrcA,
-  q7_t * pSrcB,
-  q7_t * pDst,
-  uint32_t blockSize)
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-/* Run the below code for Cortex-M4 and Cortex-M3 */
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A - B */
-    /* Subtract and then store the results in the destination buffer 4 samples at a time. */
-    *__SIMD32(pDst)++ = __QSUB8(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);

-    /* Decrement the loop counter */
+#if defined (ARM_MATH_DSP)
+    /* Subtract and store result in destination buffer (4 samples at a time). */
+    write_q7x4_ia (&pDst, __QSUB8(read_q7x4_ia ((q7_t **) &pSrcA), read_q7x4_ia ((q7_t **) &pSrcB)));
+#else
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C = A - B */
-    /* Subtract and then store the result in the destination buffer. */
-    *pDst++ = __SSAT(*pSrcA++ - *pSrcB++, 8);
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
-
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
  while (blkCnt > 0U)
  {
    /* C = A - B */
-    /* Subtract and then store the result in the destination buffer. */
-    *pDst++ = (q7_t) __SSAT((q15_t) * pSrcA++ - *pSrcB++, 8);

-    /* Decrement the loop counter */
+    /* Subtract and store result in destination buffer. */
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
-
 }

 /**
- * @} end of BasicSub group
+  @} end of BasicSub group
 */
--- a/DSP/Source/CMakeLists.txt
+++ b/DSP/Source/CMakeLists.txt
@ -0,0 +1,223 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSP)
+
+# Needed to find the config modules
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/..)
+
+# Select which parts of the CMSIS-DSP must be compiled.
+# There are some dependencies between the parts but they are not tracked
+# by this cmake. So, enabling some functions may require to enable some
+# other ones.
+option(BASICMATH            "Basic Math Functions"              ON)
+option(COMPLEXMATH          "Complex Math Functions"            ON)
+option(CONTROLLER           "Controller Functions"              ON)
+option(FASTMATH             "Fast Math Functions"               ON)
+option(FILTERING            "Filtering Functions"               ON)
+option(MATRIX               "Matrix Functions"                  ON)
+option(STATISTICS           "Statistics Functions"              ON)
+option(SUPPORT              "Support Functions"                 ON)
+option(TRANSFORM            "Transform Functions"               ON)
+
+# When OFF it is the default behavior : all tables are included.
+option(CONFIGTABLE          "Configuration of table allowed"    OFF)
+
+# When CONFIGTABLE is ON, select if all interpolation tables must be included
+option(ALLFAST              "All interpolation tables included" OFF)
+# When CONFIGTABLE is ON, select if all FFT tables must be included
+option(ALLFFT               "All fft tables included"           OFF)
+
+# Features which require inclusion of a data table.
+# Since some tables may be big, the corresponding feature can be
+# disabled.
+# Those options are taken into account only when CONFIGTABLE is ON
+option(ARM_COS_F32          "cos f32"                           OFF)
+option(ARM_COS_Q31          "cos q31"                           OFF)
+option(ARM_COS_Q15          "cos q15"                           OFF)
+option(ARM_SIN_F32          "sin f32"                           OFF)
+option(ARM_SIN_Q31          "sin q31"                           OFF)
+option(ARM_SIN_Q15          "sin q15"                           OFF)
+option(ARM_SIN_COS_F32      "sin cos f32"                       OFF)
+option(ARM_SIN_COS_Q31      "sin cos q31"                       OFF)
+             
+option(ARM_LMS_NORM_Q31     "lms norm q31"                      OFF)
+option(ARM_LMS_NORM_Q15     "lms norm q15"                      OFF)
+             
+option(CFFT_F32_16          "cfft f32 16"                       OFF)
+option(CFFT_F32_32          "cfft f32 32"                       OFF)
+option(CFFT_F32_64          "cfft f32 64"                       OFF)
+option(CFFT_F32_128         "cfft f32 128"                      OFF)
+option(CFFT_F32_256         "cfft f32 256"                      OFF)
+option(CFFT_F32_512         "cfft f32 512"                      OFF)
+option(CFFT_F32_1024        "cfft f32 1024"                     OFF)
+option(CFFT_F32_2048        "cfft f32 2048"                     OFF)
+option(CFFT_F32_4096        "cfft f32 4096"                     OFF)
+                  
+option(CFFT_Q31_16          "cfft q31 16"                       OFF)
+option(CFFT_Q31_32          "cfft q31 32"                       OFF)
+option(CFFT_Q31_64          "cfft q31 64"                       OFF)
+option(CFFT_Q31_128         "cfft q31 128"                      OFF)
+option(CFFT_Q31_256         "cfft q31 256"                      OFF)
+option(CFFT_Q31_512         "cfft q31 512"                      OFF)
+option(CFFT_Q31_1024        "cfft q31 1024"                     OFF)
+option(CFFT_Q31_2048        "cfft q31 2048"                     OFF)
+option(CFFT_Q31_4096        "cfft q31 4096"                     OFF)
+                  
+option(CFFT_Q15_16          "cfft q15 16"                       OFF)
+option(CFFT_Q15_32          "cfft q15 32"                       OFF)
+option(CFFT_Q15_64          "cfft q15 64"                       OFF)
+option(CFFT_Q15_128         "cfft q15 128"                      OFF)
+option(CFFT_Q15_256         "cfft q15 256"                      OFF)
+option(CFFT_Q15_512         "cfft q15 512"                      OFF)
+option(CFFT_Q15_1024        "cfft q15 1024"                     OFF)
+option(CFFT_Q15_2048        "cfft q15 2048"                     OFF)
+option(CFFT_Q15_4096        "cfft q15 4096"                     OFF)
+             
+option(RFFT_FAST_F32_32     "rfft fast f32 32"                  OFF)
+option(RFFT_FAST_F32_64     "rfft fast f32 64"                  OFF)
+option(RFFT_FAST_F32_128    "rfft fast f32 128"                 OFF)
+option(RFFT_FAST_F32_256    "rfft fast f32 256"                 OFF)
+option(RFFT_FAST_F32_512    "rfft fast f32 512"                 OFF)
+option(RFFT_FAST_F32_1024   "rfft fast f32 1024"                OFF)
+option(RFFT_FAST_F32_2048   "rfft fast f32 2048"                OFF)
+option(RFFT_FAST_F32_4096   "rfft fast f32 4096"                OFF)
+             
+             
+option(RFFT_F32_128         "rfft f32 128"                      OFF)
+option(RFFT_F32_512         "rfft f32 512"                      OFF)
+option(RFFT_F32_2048        "rfft f32 2048"                     OFF)
+option(RFFT_F32_8192        "rfft f32 8192"                     OFF)
+             
+option(RFFT_Q31_32          "rfft q31 32"                       OFF)
+option(RFFT_Q31_64          "rfft q31 64"                       OFF)
+option(RFFT_Q31_128         "rfft q31 128"                      OFF)
+option(RFFT_Q31_256         "rfft q31 256"                      OFF)
+option(RFFT_Q31_512         "rfft q31 512"                      OFF)
+option(RFFT_Q31_1024        "rfft q31 1024"                     OFF)
+option(RFFT_Q31_2048        "rfft q31 2048"                     OFF)
+option(RFFT_Q31_4096        "rfft q31 4096"                     OFF)
+option(RFFT_Q31_8192        "rfft q31 8192"                     OFF)
+             
+option(RFFT_Q15_32          "rfft q15 32"                       OFF)
+option(RFFT_Q15_64          "rfft q15 64"                       OFF)
+option(RFFT_Q15_128         "rfft q15 128"                      OFF)
+option(RFFT_Q15_256         "rfft q15 256"                      OFF)
+option(RFFT_Q15_512         "rfft q15 512"                      OFF)
+option(RFFT_Q15_1024        "rfft q15 1024"                     OFF)
+option(RFFT_Q15_2048        "rfft q15 2048"                     OFF)
+option(RFFT_Q15_4096        "rfft q15 4096"                     OFF)
+option(RFFT_Q15_8192        "rfft q15 8192"                     OFF)
+             
+option(DCT4_F32_128          "dct4 f32 128"                     OFF)
+option(DCT4_F32_512          "dct4 f32 512"                     OFF)
+option(DCT4_F32_2048         "dct4 f32 2048"                    OFF)
+option(DCT4_F32_8192         "dct4 f32 8192"                    OFF)
+             
+option(DCT4_Q31_128          "dct4 q31 128"                     OFF)
+option(DCT4_Q31_512          "dct4 q31 512"                     OFF)
+option(DCT4_Q31_2048         "dct4 q31 2048"                    OFF)
+option(DCT4_Q31_8192         "dct4 q31 8192"                    OFF)
+             
+option(DCT4_Q15_128          "dct4 q15 128"                     OFF)
+option(DCT4_Q15_512          "dct4 q15 512"                     OFF)
+option(DCT4_Q15_2048         "dct4 q15 2048"                    OFF)
+option(DCT4_Q15_8192         "dct4 q15 8192"                    OFF)
+
+
+###########################
+#
+# CMSIS DSP
+#
+###########################
+
+# DSP Sources
+SET(DSP ".")
+
+add_library(CMSISDSP INTERFACE)
+
+include(config)
+
+
+if (BASICMATH)
+  add_subdirectory(BasicMathFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPBasicMath)
+endif()
+
+if (COMPLEXMATH)
+  add_subdirectory(ComplexMathFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPComplexMath)
+endif()
+
+if (CONTROLLER)
+  add_subdirectory(ControllerFunctions)
+  # Fast tables inclusion is allowed
+  if (CONFIGTABLE)
+    target_compile_definitions(CMSISDSPController PUBLIC ARM_FAST_ALLOW_TABLES) 
+  endif()
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPController)
+endif()
+
+if (FASTMATH)
+  add_subdirectory(FastMathFunctions)
+  # Fast tables inclusion is allowed
+  if (CONFIGTABLE)
+    target_compile_definitions(CMSISDSPFastMath PUBLIC ARM_FAST_ALLOW_TABLES) 
+  endif()
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPFastMath)
+endif()
+
+if (FILTERING)
+  add_subdirectory(FilteringFunctions)
+  # Fast tables inclusion is allowed
+  if (CONFIGTABLE)
+    target_compile_definitions(CMSISDSPFiltering PUBLIC ARM_FAST_ALLOW_TABLES)
+  endif() 
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPFiltering)
+endif()
+
+if (MATRIX)
+  add_subdirectory(MatrixFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPMatrix)
+endif()
+
+if (STATISTICS)
+  add_subdirectory(StatisticsFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPStatistics)
+endif()
+
+if (SUPPORT)
+  add_subdirectory(SupportFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPSupport)
+endif()
+
+if (TRANSFORM)
+  add_subdirectory(TransformFunctions)
+  # FFT tables inclusion is allowed
+  if (CONFIGTABLE)
+    target_compile_definitions(CMSISDSPTransform PUBLIC ARM_FFT_ALLOW_TABLES)
+  endif()
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPTransform)
+endif()
+
+if (FILTERING OR CONTROLLER OR FASTMATH OR TRANSFORM)
+  add_subdirectory(CommonTables)
+  if (TRANSFORM)
+    # FFT tables inclusion is allowed
+    if (CONFIGTABLE)
+      target_compile_definitions(CMSISDSPCommon PUBLIC ARM_FFT_ALLOW_TABLES) 
+    endif()
+  endif()
+  if (FILTERING OR CONTROLLER OR FASTMATH)
+    # Select which tables to include
+    if (CONFIGTABLE)
+      target_compile_definitions(CMSISDSPCommon PUBLIC ARM_FAST_ALLOW_TABLES) 
+    endif()
+  endif()
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPCommon)
+endif()
+
+### Includes
+target_include_directories(CMSISDSP INTERFACE "${DSP}/../Include")
+
+
+
--- a/DSP/Source/CommonTables/CMakeLists.txt
+++ b/DSP/Source/CommonTables/CMakeLists.txt
@ -0,0 +1,31 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPCommon)
+
+
+
+add_library(CMSISDSPCommon STATIC arm_common_tables.c)
+
+if (CONFIGTABLE AND ALLFFT)
+    target_compile_definitions(CMSISDSPCommon PUBLIC ARM_ALL_FFT_TABLES) 
+endif()
+
+if (CONFIGTABLE AND ALLFAST)
+    target_compile_definitions(CMSISDSPCommon PUBLIC ARM_ALL_FAST_TABLES) 
+endif()
+
+include(fft)
+fft(CMSISDSPCommon)
+
+include(interpol)
+interpol(CMSISDSPCommon)
+
+target_sources(CMSISDSPCommon PRIVATE arm_const_structs.c)
+
+configdsp(CMSISDSPCommon ..)
+
+### Includes
+target_include_directories(CMSISDSPCommon PUBLIC "${DSP}/../../Include")
+
+
+
--- a/DSP/Source/CommonTables/CommonTables.c
+++ b/DSP/Source/CommonTables/CommonTables.c
@ -0,0 +1,31 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        CommonTables.c
+ * Description:  Combination of all common table source files.
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_common_tables.c"
+#include "arm_const_structs.c"
+
--- a/DSP/Source/CommonTables/arm_common_tables.c
+++ b/DSP/Source/CommonTables/arm_common_tables.c
--- a/DSP/Source/CommonTables/arm_const_structs.c
+++ b/DSP/Source/CommonTables/arm_const_structs.c
@ -29,351 +29,458 @@

 #include "arm_const_structs.h"

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) 
+
 /* Floating-point structs */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len16 = {
-	16, twiddleCoef_16, armBitRevIndexTable16, ARMBITREVINDEXTABLE_16_TABLE_LENGTH
+  16, twiddleCoef_16, armBitRevIndexTable16, ARMBITREVINDEXTABLE_16_TABLE_LENGTH
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len32 = {
-	32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH
+  32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len64 = {
-	64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH
+  64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len128 = {
-	128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH
+  128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len256 = {
-	256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH
+  256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len512 = {
-	512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH
+  512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len1024 = {
-	1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH
+  1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len2048 = {
-	2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH
+  2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len4096 = {
-	4096, twiddleCoef_4096, armBitRevIndexTable4096, ARMBITREVINDEXTABLE_4096_TABLE_LENGTH
+  4096, twiddleCoef_4096, armBitRevIndexTable4096, ARMBITREVINDEXTABLE_4096_TABLE_LENGTH
 };
+#endif

 /* Fixed-point structs */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len16 = {
-	16, twiddleCoef_16_q31, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
+  16, twiddleCoef_16_q31, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len32 = {
-	32, twiddleCoef_32_q31, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
+  32, twiddleCoef_32_q31, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len64 = {
-	64, twiddleCoef_64_q31, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
+  64, twiddleCoef_64_q31, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len128 = {
-	128, twiddleCoef_128_q31, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
+  128, twiddleCoef_128_q31, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len256 = {
-	256, twiddleCoef_256_q31, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
+  256, twiddleCoef_256_q31, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len512 = {
-	512, twiddleCoef_512_q31, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
+  512, twiddleCoef_512_q31, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len1024 = {
-	1024, twiddleCoef_1024_q31, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
+  1024, twiddleCoef_1024_q31, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len2048 = {
-	2048, twiddleCoef_2048_q31, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
+  2048, twiddleCoef_2048_q31, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len4096 = {
-	4096, twiddleCoef_4096_q31, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
+  4096, twiddleCoef_4096_q31, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len16 = {
-	16, twiddleCoef_16_q15, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
+  16, twiddleCoef_16_q15, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len32 = {
-	32, twiddleCoef_32_q15, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
+  32, twiddleCoef_32_q15, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len64 = {
-	64, twiddleCoef_64_q15, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
+  64, twiddleCoef_64_q15, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len128 = {
-	128, twiddleCoef_128_q15, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
+  128, twiddleCoef_128_q15, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len256 = {
-	256, twiddleCoef_256_q15, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
+  256, twiddleCoef_256_q15, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len512 = {
-	512, twiddleCoef_512_q15, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
+  512, twiddleCoef_512_q15, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len1024 = {
-	1024, twiddleCoef_1024_q15, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
+  1024, twiddleCoef_1024_q15, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len2048 = {
-	2048, twiddleCoef_2048_q15, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
+  2048, twiddleCoef_2048_q15, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len4096 = {
-	4096, twiddleCoef_4096_q15, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
+  4096, twiddleCoef_4096_q15, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
 };
+#endif

 /* Structure for real-value inputs */
 /* Floating-point structs */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len32 = {
-	{ 16, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_16_TABLE_LENGTH },
-	32U,
-	(float32_t *)twiddleCoef_rfft_32
+  { 16, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_16_TABLE_LENGTH },
+  32U,
+  (float32_t *)twiddleCoef_rfft_32
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len64 = {
-	 { 32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH },
-	64U,
-	(float32_t *)twiddleCoef_rfft_64
+   { 32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH },
+  64U,
+  (float32_t *)twiddleCoef_rfft_64
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len128 = {
-	{ 64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH },
-	128U,
-	(float32_t *)twiddleCoef_rfft_128
+  { 64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH },
+  128U,
+  (float32_t *)twiddleCoef_rfft_128
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len256 = {
-	{ 128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH },
-	256U,
-	(float32_t *)twiddleCoef_rfft_256
+  { 128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH },
+  256U,
+  (float32_t *)twiddleCoef_rfft_256
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len512 = {
-	{ 256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH },
-	512U,
-	(float32_t *)twiddleCoef_rfft_512
+  { 256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH },
+  512U,
+  (float32_t *)twiddleCoef_rfft_512
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len1024 = {
-	{ 512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH },
-	1024U,
-	(float32_t *)twiddleCoef_rfft_1024
+  { 512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH },
+  1024U,
+  (float32_t *)twiddleCoef_rfft_1024
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len2048 = {
-	{ 1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH },
-	2048U,
-	(float32_t *)twiddleCoef_rfft_2048
+  { 1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH },
+  2048U,
+  (float32_t *)twiddleCoef_rfft_2048
 };
+#endif 

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len4096 = {
-	{ 2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH },
-	4096U,
-	(float32_t *)twiddleCoef_rfft_4096
+  { 2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH },
+  4096U,
+  (float32_t *)twiddleCoef_rfft_4096
 };
+#endif

 /* Fixed-point structs */
 /* q31_t */
-extern const q31_t realCoefAQ31[8192];
-extern const q31_t realCoefBQ31[8192];

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len32 = {
-	32U,
-	0,
-	1,
-	256U,
-	(q31_t*)realCoefAQ31,
-	(q31_t*)realCoefBQ31,
-	&arm_cfft_sR_q31_len16
+  32U,
+  0,
+  1,
+  256U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len16
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len64 = {
-	64U,
-	0,
-	1,
-	128U,
-	(q31_t*)realCoefAQ31,
-	(q31_t*)realCoefBQ31,
-	&arm_cfft_sR_q31_len32
+  64U,
+  0,
+  1,
+  128U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len32
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len128 = {
-	128U,
-	0,
-	1,
-	64U,
-	(q31_t*)realCoefAQ31,
-	(q31_t*)realCoefBQ31,
-	&arm_cfft_sR_q31_len64
+  128U,
+  0,
+  1,
+  64U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len64
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len256 = {
-	256U,
-	0,
-	1,
-	32U,
-	(q31_t*)realCoefAQ31,
-	(q31_t*)realCoefBQ31,
-	&arm_cfft_sR_q31_len128
+  256U,
+  0,
+  1,
+  32U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len128
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len512 = {
-	512U,
-	0,
-	1,
-	16U,
-	(q31_t*)realCoefAQ31,
-	(q31_t*)realCoefBQ31,
-	&arm_cfft_sR_q31_len256
+  512U,
+  0,
+  1,
+  16U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len256
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len1024 = {
-	1024U,
-	0,
-	1,
-	8U,
-	(q31_t*)realCoefAQ31,
-	(q31_t*)realCoefBQ31,
-	&arm_cfft_sR_q31_len512
+  1024U,
+  0,
+  1,
+  8U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len512
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len2048 = {
-	2048U,
-	0,
-	1,
-	4U,
-	(q31_t*)realCoefAQ31,
-	(q31_t*)realCoefBQ31,
-	&arm_cfft_sR_q31_len1024
+  2048U,
+  0,
+  1,
+  4U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len1024
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len4096 = {
-	4096U,
-	0,
-	1,
-	2U,
-	(q31_t*)realCoefAQ31,
-	(q31_t*)realCoefBQ31,
-	&arm_cfft_sR_q31_len2048
+  4096U,
+  0,
+  1,
+  2U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len2048
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len8192 = {
-	8192U,
-	0,
-	1,
-	1U,
-	(q31_t*)realCoefAQ31,
-	(q31_t*)realCoefBQ31,
-	&arm_cfft_sR_q31_len4096
+  8192U,
+  0,
+  1,
+  1U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len4096
 };
+#endif

 /* q15_t */
-extern const q15_t realCoefAQ15[8192];
-extern const q15_t realCoefBQ15[8192];
-
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len32 = {
-	32U,
-	0,
-	1,
-	256U,
-	(q15_t*)realCoefAQ15,
-	(q15_t*)realCoefBQ15,
-	&arm_cfft_sR_q15_len16
+  32U,
+  0,
+  1,
+  256U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len16
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len64 = {
-	64U,
-	0,
-	1,
-	128U,
-	(q15_t*)realCoefAQ15,
-	(q15_t*)realCoefBQ15,
-	&arm_cfft_sR_q15_len32
+  64U,
+  0,
+  1,
+  128U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len32
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len128 = {
-	128U,
-	0,
-	1,
-	64U,
-	(q15_t*)realCoefAQ15,
-	(q15_t*)realCoefBQ15,
-	&arm_cfft_sR_q15_len64
+  128U,
+  0,
+  1,
+  64U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len64
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len256 = {
-	256U,
-	0,
-	1,
-	32U,
-	(q15_t*)realCoefAQ15,
-	(q15_t*)realCoefBQ15,
-	&arm_cfft_sR_q15_len128
+  256U,
+  0,
+  1,
+  32U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len128
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len512 = {
-	512U,
-	0,
-	1,
-	16U,
-	(q15_t*)realCoefAQ15,
-	(q15_t*)realCoefBQ15,
-	&arm_cfft_sR_q15_len256
+  512U,
+  0,
+  1,
+  16U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len256
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len1024 = {
-	1024U,
-	0,
-	1,
-	8U,
-	(q15_t*)realCoefAQ15,
-	(q15_t*)realCoefBQ15,
-	&arm_cfft_sR_q15_len512
+  1024U,
+  0,
+  1,
+  8U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len512
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len2048 = {
-	2048U,
-	0,
-	1,
-	4U,
-	(q15_t*)realCoefAQ15,
-	(q15_t*)realCoefBQ15,
-	&arm_cfft_sR_q15_len1024
+  2048U,
+  0,
+  1,
+  4U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len1024
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len4096 = {
-	4096U,
-	0,
-	1,
-	2U,
-	(q15_t*)realCoefAQ15,
-	(q15_t*)realCoefBQ15,
-	&arm_cfft_sR_q15_len2048
+  4096U,
+  0,
+  1,
+  2U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len2048
 };
+#endif

+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len8192 = {
-	8192U,
-	0,
-	1,
-	1U,
-	(q15_t*)realCoefAQ15,
-	(q15_t*)realCoefBQ15,
-	&arm_cfft_sR_q15_len4096
+  8192U,
+  0,
+  1,
+  1U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len4096
 };
+#endif
+
+#endif
--- a/DSP/Source/ComplexMathFunctions/CMakeLists.txt
+++ b/DSP/Source/ComplexMathFunctions/CMakeLists.txt
@ -0,0 +1,16 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPComplexMath)
+
+
+file(GLOB SRC "./*_*.c")
+
+add_library(CMSISDSPComplexMath STATIC ${SRC})
+
+configdsp(CMSISDSPComplexMath ..)
+
+### Includes
+target_include_directories(CMSISDSPComplexMath PUBLIC "${DSP}/../../Include")
+
+
+
--- a/DSP/Source/ComplexMathFunctions/ComplexMathFunctions.c
+++ b/DSP/Source/ComplexMathFunctions/ComplexMathFunctions.c
@ -0,0 +1,46 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        CompexMathFunctions.c
+ * Description:  Combination of all comlex math function source files.
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_cmplx_conj_f32.c"
+#include "arm_cmplx_conj_q15.c"
+#include "arm_cmplx_conj_q31.c"
+#include "arm_cmplx_dot_prod_f32.c"
+#include "arm_cmplx_dot_prod_q15.c"
+#include "arm_cmplx_dot_prod_q31.c"
+#include "arm_cmplx_mag_f32.c"
+#include "arm_cmplx_mag_q15.c"
+#include "arm_cmplx_mag_q31.c"
+#include "arm_cmplx_mag_squared_f32.c"
+#include "arm_cmplx_mag_squared_q15.c"
+#include "arm_cmplx_mag_squared_q31.c"
+#include "arm_cmplx_mult_cmplx_f32.c"
+#include "arm_cmplx_mult_cmplx_q15.c"
+#include "arm_cmplx_mult_cmplx_q31.c"
+#include "arm_cmplx_mult_real_f32.c"
+#include "arm_cmplx_mult_real_q15.c"
+#include "arm_cmplx_mult_real_q31.c"
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_conj_f32.c
 * Description:  Floating-point complex conjugate
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,143 +29,133 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @defgroup cmplx_conj Complex Conjugate
- *
- * Conjugates the elements of a complex data vector.
- *
- * The <code>pSrc</code> points to the source data and
- * <code>pDst</code> points to the where the result should be written.
- * <code>numSamples</code> specifies the number of complex samples
- * and the data in each array is stored in an interleaved fashion
- * (real, imag, real, imag, ...).
- * Each array has a total of <code>2*numSamples</code> values.
- * The underlying algorithm is used:
- *
- * <pre>
- * for(n=0; n<numSamples; n++) {
- *     pDst[(2*n)+0)] = pSrc[(2*n)+0];     // real part
- *     pDst[(2*n)+1)] = -pSrc[(2*n)+1];    // imag part
- * }
- * </pre>
- *
- * There are separate functions for floating-point, Q15, and Q31 data types.
+  @defgroup cmplx_conj Complex Conjugate
+
+  Conjugates the elements of a complex data vector.
+
+  The <code>pSrc</code> points to the source data and
+  <code>pDst</code> points to the destination data where the result should be written.
+  <code>numSamples</code> specifies the number of complex samples
+  and the data in each array is stored in an interleaved fashion
+  (real, imag, real, imag, ...).
+  Each array has a total of <code>2*numSamples</code> values.
+
+  The underlying algorithm is used:
+  <pre>
+  for (n = 0; n < numSamples; n++) {
+      pDst[(2*n)  ] =  pSrc[(2*n)  ];    // real part
+      pDst[(2*n)+1] = -pSrc[(2*n)+1];    // imag part
+  }
+  </pre>
+
+  There are separate functions for floating-point, Q15, and Q31 data types.
 */

 /**
- * @addtogroup cmplx_conj
- * @{
+  @addtogroup cmplx_conj
+  @{
 */

 /**
- * @brief  Floating-point complex conjugate.
- * @param  *pSrc points to the input vector
- * @param  *pDst points to the output vector
- * @param  numSamples number of complex samples in each vector
- * @return none.
+  @brief         Floating-point complex conjugate.
+  @param[in]     pSrc        points to the input vector
+  @param[out]    pDst        points to the output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
 */

+
 void arm_cmplx_conj_f32(
-  float32_t * pSrc,
-  float32_t * pDst,
-  uint32_t numSamples)
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples)
 {
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */

-#if defined (ARM_MATH_DSP)
+#if defined(ARM_MATH_NEON)
+   float32x4_t zero;
+   float32x4x2_t vec;

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  float32_t inR1, inR2, inR3, inR4;
-  float32_t inI1, inI2, inI3, inI4;
+   zero = vdupq_n_f32(0.0);

-  /*loop Unrolling */
+   /* Compute 4 outputs at a time */
+   blkCnt = numSamples >> 2U;
+
+   while (blkCnt > 0U)
+   {
+     /* C[0]+jC[1] = A[0]+(-1)*jA[1] */
+     /* Calculate Complex Conjugate and then store the results in the destination buffer. */
+     vec = vld2q_f32(pSrc);
+     vec.val[1] = vsubq_f32(zero,vec.val[1]);
+     vst2q_f32(pDst,vec);
+
+     /* Increment pointers */
+     pSrc += 8;
+     pDst += 8;
+        
+     /* Decrement the loop counter */
+     blkCnt--;
+   }
+
+   /* Tail */
+   blkCnt = numSamples & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
-    /* C[0]+jC[1] = A[0]+ j (-1) A[1] */
-    /* Calculate Complex Conjugate and then store the results in the destination buffer. */
-    /* read real input samples */
-    inR1 = pSrc[0];
-    /* store real samples to destination */
-    pDst[0] = inR1;
-    inR2 = pSrc[2];
-    pDst[2] = inR2;
-    inR3 = pSrc[4];
-    pDst[4] = inR3;
-    inR4 = pSrc[6];
-    pDst[6] = inR4;
+    /* C[0] + jC[1] = A[0]+ j(-1)A[1] */

-    /* read imaginary input samples */
-    inI1 = pSrc[1];
-    inI2 = pSrc[3];
+    /* Calculate Complex Conjugate and store result in destination buffer. */
+    *pDst++ =  *pSrc++;
+    *pDst++ = -*pSrc++;

-    /* conjugate input */
-    inI1 = -inI1;
+    *pDst++ =  *pSrc++;
+    *pDst++ = -*pSrc++;

-    /* read imaginary input samples */
-    inI3 = pSrc[5];
+    *pDst++ =  *pSrc++;
+    *pDst++ = -*pSrc++;

-    /* conjugate input */
-    inI2 = -inI2;
+    *pDst++ =  *pSrc++;
+    *pDst++ = -*pSrc++;

-    /* read imaginary input samples */
-    inI4 = pSrc[7];
-
-    /* conjugate input */
-    inI3 = -inI3;
-
-    /* store imaginary samples to destination */
-    pDst[1] = inI1;
-    pDst[3] = inI2;
-
-    /* conjugate input */
-    inI4 = -inI4;
-
-    /* store imaginary samples to destination */
-    pDst[5] = inI3;
-
-    /* increment source pointer by 8 to process next sampels */
-    pSrc += 8U;
-
-    /* store imaginary sample to destination */
-    pDst[7] = inI4;
-
-    /* increment destination pointer by 8 to store next samples */
-    pDst += 8U;
-
-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
+  /* Initialize blkCnt with number of samples */
  blkCnt = numSamples;

-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined (ARM_MATH_NEON) */

  while (blkCnt > 0U)
  {
-    /* realOut + j (imagOut) = realIn + j (-1) imagIn */
-    /* Calculate Complex Conjugate and then store the results in the destination buffer. */
-    *pDst++ = *pSrc++;
+    /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
+
+    /* Calculate Complex Conjugate and store result in destination buffer. */
+    *pDst++ =  *pSrc++;
    *pDst++ = -*pSrc++;

-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of cmplx_conj group
+  @} end of cmplx_conj group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_conj_q15.c
 * Description:  Q15 complex conjugate
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,68 +29,66 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @addtogroup cmplx_conj
- * @{
+  @addtogroup cmplx_conj
+  @{
 */

 /**
- * @brief  Q15 complex conjugate.
- * @param  *pSrc points to the input vector
- * @param  *pDst points to the output vector
- * @param  numSamples number of complex samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
+  @brief         Q15 complex conjugate.
+  @param[in]     pSrc        points to the input vector
+  @param[out]    pDst        points to the output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
 */

 void arm_cmplx_conj_q15(
-  q15_t * pSrc,
-  q15_t * pDst,
-  uint32_t numSamples)
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples)
 {
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t in1;                                     /* Temporary input variable */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+        q31_t in2, in3, in4;                           /* Temporary input variables */
+#endif

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  uint32_t blkCnt;                               /* loop counter */
-  q31_t in1, in2, in3, in4;
-  q31_t zero = 0;

-  /*loop Unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
-    /* C[0]+jC[1] = A[0]+ j (-1) A[1] */
-    /* Calculate Complex Conjugate and then store the results in the destination buffer. */
-    in1 = *__SIMD32(pSrc)++;
-    in2 = *__SIMD32(pSrc)++;
-    in3 = *__SIMD32(pSrc)++;
-    in4 = *__SIMD32(pSrc)++;
+    /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
+
+    /* Calculate Complex Conjugate and store result in destination buffer. */
+
+    #if defined (ARM_MATH_DSP)
+    in1 = read_q15x2_ia ((q15_t **) &pSrc);
+    in2 = read_q15x2_ia ((q15_t **) &pSrc);
+    in3 = read_q15x2_ia ((q15_t **) &pSrc);
+    in4 = read_q15x2_ia ((q15_t **) &pSrc);

 #ifndef ARM_MATH_BIG_ENDIAN
-
-    in1 = __QASX(zero, in1);
-    in2 = __QASX(zero, in2);
-    in3 = __QASX(zero, in3);
-    in4 = __QASX(zero, in4);
-
+    in1 = __QASX(0, in1);
+    in2 = __QASX(0, in2);
+    in3 = __QASX(0, in3);
+    in4 = __QASX(0, in4);
 #else
-
-    in1 = __QSAX(zero, in1);
-    in2 = __QSAX(zero, in2);
-    in3 = __QSAX(zero, in3);
-    in4 = __QSAX(zero, in4);
-
+    in1 = __QSAX(0, in1);
+    in2 = __QSAX(0, in2);
+    in3 = __QSAX(0, in3);
+    in4 = __QSAX(0, in4);
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    in1 = ((uint32_t) in1 >> 16) | ((uint32_t) in1 << 16);
@ -98,52 +96,62 @@ void arm_cmplx_conj_q15(
    in3 = ((uint32_t) in3 >> 16) | ((uint32_t) in3 << 16);
    in4 = ((uint32_t) in4 >> 16) | ((uint32_t) in4 << 16);

-    *__SIMD32(pDst)++ = in1;
-    *__SIMD32(pDst)++ = in2;
-    *__SIMD32(pDst)++ = in3;
-    *__SIMD32(pDst)++ = in4;
+    write_q15x2_ia (&pDst, in1);
+    write_q15x2_ia (&pDst, in2);
+    write_q15x2_ia (&pDst, in3);
+    write_q15x2_ia (&pDst, in4);
+#else
+    *pDst++ =  *pSrc++;
+    in1 = *pSrc++;
+    *pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;

-    /* Decrement the loop counter */
+    *pDst++ =  *pSrc++;
+    in1 = *pSrc++;
+    *pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
+
+    *pDst++ =  *pSrc++;
+    in1 = *pSrc++;
+    *pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
+
+    *pDst++ =  *pSrc++;
+    in1 = *pSrc++;
+    *pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C[0]+jC[1] = A[0]+ j (-1) A[1] */
-    /* Calculate Complex Conjugate and then store the results in the destination buffer. */
-    *pDst++ = *pSrc++;
-    *pDst++ = __SSAT(-*pSrc++, 16);
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
 #else

-  q15_t in;
+  /* Initialize blkCnt with number of samples */
+  blkCnt = numSamples;

-  /* Run the below code for Cortex-M0 */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

-  while (numSamples > 0U)
+  while (blkCnt > 0U)
  {
-    /* realOut + j (imagOut) = realIn+ j (-1) imagIn */
-    /* Calculate Complex Conjugate and then store the results in the destination buffer. */
-    *pDst++ = *pSrc++;
-    in = *pSrc++;
-    *pDst++ = (in == (q15_t) 0x8000) ? 0x7fff : -in;
+    /* C[0] + jC[1] = A[0]+ j(-1)A[1] */

-    /* Decrement the loop counter */
-    numSamples--;
+    /* Calculate Complex Conjugate and store result in destination buffer. */
+    *pDst++ =  *pSrc++;
+    in1 = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __SSAT(-in1, 16);
+#else
+    *pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of cmplx_conj group
+  @} end of cmplx_conj group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_conj_q31.c
 * Description:  Q31 complex conjugate
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,141 +29,109 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @addtogroup cmplx_conj
- * @{
+  @addtogroup cmplx_conj
+  @{
 */

 /**
- * @brief  Q31 complex conjugate.
- * @param  *pSrc points to the input vector
- * @param  *pDst points to the output vector
- * @param  numSamples number of complex samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
+  @brief         Q31 complex conjugate.
+  @param[in]     pSrc        points to the input vector
+  @param[out]    pDst        points to the output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
 */

 void arm_cmplx_conj_q31(
-  q31_t * pSrc,
-  q31_t * pDst,
-  uint32_t numSamples)
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples)
 {
-  uint32_t blkCnt;                               /* loop counter */
-  q31_t in;                                      /* Input value */
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t in;                                      /* Temporary input variable */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  q31_t inR1, inR2, inR3, inR4;                  /* Temporary real variables */
-  q31_t inI1, inI2, inI3, inI4;                  /* Temporary imaginary variables */
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
-    /* C[0]+jC[1] = A[0]+ j (-1) A[1] */
-    /* Calculate Complex Conjugate and then store the results in the destination buffer. */
-    /* Saturated to 0x7fffffff if the input is -1(0x80000000) */
-    /* read real input sample */
-    inR1 = pSrc[0];
-    /* store real input sample */
-    pDst[0] = inR1;
+    /* C[0] + jC[1] = A[0]+ j(-1)A[1] */

-    /* read imaginary input sample */
-    inI1 = pSrc[1];
+    /* Calculate Complex Conjugate and store result in destination buffer. */
+    *pDst++ =  *pSrc++;
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QSUB(0, in);
+#else
+    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif

-    /* read real input sample */
-    inR2 = pSrc[2];
-    /* store real input sample */
-    pDst[2] = inR2;
+    *pDst++ =  *pSrc++;
+    in =  *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QSUB(0, in);
+#else
+    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif

-    /* read imaginary input sample */
-    inI2 = pSrc[3];
+    *pDst++ =  *pSrc++;
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QSUB(0, in);
+#else
+    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif

-    /* negate imaginary input sample */
-    inI1 = __QSUB(0, inI1);
+    *pDst++ =  *pSrc++;
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QSUB(0, in);
+#else
+    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif

-    /* read real input sample */
-    inR3 = pSrc[4];
-    /* store real input sample */
-    pDst[4] = inR3;
-
-    /* read imaginary input sample */
-    inI3 = pSrc[5];
-
-    /* negate imaginary input sample */
-    inI2 = __QSUB(0, inI2);
-
-    /* read real input sample */
-    inR4 = pSrc[6];
-    /* store real input sample */
-    pDst[6] = inR4;
-
-    /* negate imaginary input sample */
-    inI3 = __QSUB(0, inI3);
-
-    /* store imaginary input sample */
-    inI4 = pSrc[7];
-
-    /* store imaginary input samples */
-    pDst[1] = inI1;
-
-    /* negate imaginary input sample */
-    inI4 = __QSUB(0, inI4);
-
-    /* store imaginary input samples */
-    pDst[3] = inI2;
-
-    /* increment source pointer by 8 to proecess next samples */
-    pSrc += 8U;
-
-    /* store imaginary input samples */
-    pDst[5] = inI3;
-    pDst[7] = inI4;
-
-    /* increment destination pointer by 8 to process next samples */
-    pDst += 8U;
-
-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
+  /* Initialize blkCnt with number of samples */
  blkCnt = numSamples;

-
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
-    /* C[0]+jC[1] = A[0]+ j (-1) A[1] */
-    /* Calculate Complex Conjugate and then store the results in the destination buffer. */
-    /* Saturated to 0x7fffffff if the input is -1(0x80000000) */
-    *pDst++ = *pSrc++;
-    in = *pSrc++;
-    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+    /* C[0] + jC[1] = A[0]+ j(-1)A[1] */

-    /* Decrement the loop counter */
+    /* Calculate Complex Conjugate and store result in destination buffer. */
+    *pDst++ =  *pSrc++;
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = __QSUB(0, in);
+#else
+    *pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of cmplx_conj group
+  @} end of cmplx_conj group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_dot_prod_f32.c
 * Description:  Floating-point complex dot product
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,163 +29,205 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @defgroup cmplx_dot_prod Complex Dot Product
- *
- * Computes the dot product of two complex vectors.
- * The vectors are multiplied element-by-element and then summed.
- *
- * The <code>pSrcA</code> points to the first complex input vector and
- * <code>pSrcB</code> points to the second complex input vector.
- * <code>numSamples</code> specifies the number of complex samples
- * and the data in each array is stored in an interleaved fashion
- * (real, imag, real, imag, ...).
- * Each array has a total of <code>2*numSamples</code> values.
- *
- * The underlying algorithm is used:
- * <pre>
- * realResult=0;
- * imagResult=0;
- * for(n=0; n<numSamples; n++) {
- *     realResult += pSrcA[(2*n)+0]*pSrcB[(2*n)+0] - pSrcA[(2*n)+1]*pSrcB[(2*n)+1];
- *     imagResult += pSrcA[(2*n)+0]*pSrcB[(2*n)+1] + pSrcA[(2*n)+1]*pSrcB[(2*n)+0];
- * }
- * </pre>
- *
- * There are separate functions for floating-point, Q15, and Q31 data types.
+  @defgroup cmplx_dot_prod Complex Dot Product
+
+  Computes the dot product of two complex vectors.
+  The vectors are multiplied element-by-element and then summed.
+
+  The <code>pSrcA</code> points to the first complex input vector and
+  <code>pSrcB</code> points to the second complex input vector.
+  <code>numSamples</code> specifies the number of complex samples
+  and the data in each array is stored in an interleaved fashion
+  (real, imag, real, imag, ...).
+  Each array has a total of <code>2*numSamples</code> values.
+
+  The underlying algorithm is used:
+
+  <pre>
+  realResult = 0;
+  imagResult = 0;
+  for (n = 0; n < numSamples; n++) {
+      realResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
+      imagResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
+  }
+  </pre>
+
+  There are separate functions for floating-point, Q15, and Q31 data types.
 */

 /**
- * @addtogroup cmplx_dot_prod
- * @{
+  @addtogroup cmplx_dot_prod
+  @{
 */

 /**
- * @brief  Floating-point complex dot product
- * @param  *pSrcA points to the first input vector
- * @param  *pSrcB points to the second input vector
- * @param  numSamples number of complex samples in each vector
- * @param  *realResult real part of the result returned here
- * @param  *imagResult imaginary part of the result returned here
- * @return none.
+  @brief         Floating-point complex dot product.
+  @param[in]     pSrcA       points to the first input vector
+  @param[in]     pSrcB       points to the second input vector
+  @param[in]     numSamples  number of samples in each vector
+  @param[out]    realResult  real part of the result returned here
+  @param[out]    imagResult  imaginary part of the result returned here
+  @return        none
 */

 void arm_cmplx_dot_prod_f32(
-  float32_t * pSrcA,
-  float32_t * pSrcB,
-  uint32_t numSamples,
-  float32_t * realResult,
-  float32_t * imagResult)
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        uint32_t numSamples,
+        float32_t * realResult,
+        float32_t * imagResult)
 {
-  float32_t real_sum = 0.0f, imag_sum = 0.0f;    /* Temporary result storage */
-  float32_t a0,b0,c0,d0;
+        uint32_t blkCnt;                               /* Loop counter */
+        float32_t real_sum = 0.0f, imag_sum = 0.0f;    /* Temporary result variables */
+        float32_t a0,b0,c0,d0;

-#if defined (ARM_MATH_DSP)
+#if defined(ARM_MATH_NEON)
+    float32x4x2_t vec1,vec2,vec3,vec4;
+    float32x4_t accR,accI;
+    float32x2_t accum = vdup_n_f32(0);

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  uint32_t blkCnt;                               /* loop counter */
+    accR = vdupq_n_f32(0.0);
+    accI = vdupq_n_f32(0.0);

-  /*loop Unrolling */
+    /* Loop unrolling: Compute 8 outputs at a time */
+    blkCnt = numSamples >> 3U;
+
+    while (blkCnt > 0U)
+    {
+	/* C = (A[0]+jA[1])*(B[0]+jB[1]) + ...  */
+        /* Calculate dot product and then store the result in a temporary buffer. */
+
+	vec1 = vld2q_f32(pSrcA);
+        vec2 = vld2q_f32(pSrcB);
+
+	/* Increment pointers */
+        pSrcA += 8;
+        pSrcB += 8;
+
+	/* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
+        accR = vmlaq_f32(accR,vec1.val[0],vec2.val[0]);
+        accR = vmlsq_f32(accR,vec1.val[1],vec2.val[1]);
+
+	/* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
+        accI = vmlaq_f32(accI,vec1.val[1],vec2.val[0]);
+        accI = vmlaq_f32(accI,vec1.val[0],vec2.val[1]);
+
+        vec3 = vld2q_f32(pSrcA);
+        vec4 = vld2q_f32(pSrcB);
+	
+	/* Increment pointers */
+        pSrcA += 8;
+        pSrcB += 8;
+
+	/* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
+        accR = vmlaq_f32(accR,vec3.val[0],vec4.val[0]);
+        accR = vmlsq_f32(accR,vec3.val[1],vec4.val[1]);
+
+	/* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
+        accI = vmlaq_f32(accI,vec3.val[1],vec4.val[0]);
+        accI = vmlaq_f32(accI,vec3.val[0],vec4.val[1]);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    accum = vpadd_f32(vget_low_f32(accR), vget_high_f32(accR));
+    real_sum += accum[0] + accum[1];
+
+    accum = vpadd_f32(vget_low_f32(accI), vget_high_f32(accI));
+    imag_sum += accum[0] + accum[1];
+
+    /* Tail */
+    blkCnt = numSamples & 0x7;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
+    a0 = *pSrcA++;
+    b0 = *pSrcA++;
+    c0 = *pSrcB++;
+    d0 = *pSrcB++;

-      real_sum += a0 * c0;
-      imag_sum += a0 * d0;
-      real_sum -= b0 * d0;
-      imag_sum += b0 * c0;
+    real_sum += a0 * c0;
+    imag_sum += a0 * d0;
+    real_sum -= b0 * d0;
+    imag_sum += b0 * c0;

-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
+    a0 = *pSrcA++;
+    b0 = *pSrcA++;
+    c0 = *pSrcB++;
+    d0 = *pSrcB++;

-      real_sum += a0 * c0;
-      imag_sum += a0 * d0;
-      real_sum -= b0 * d0;
-      imag_sum += b0 * c0;
+    real_sum += a0 * c0;
+    imag_sum += a0 * d0;
+    real_sum -= b0 * d0;
+    imag_sum += b0 * c0;

-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
+    a0 = *pSrcA++;
+    b0 = *pSrcA++;
+    c0 = *pSrcB++;
+    d0 = *pSrcB++;

-      real_sum += a0 * c0;
-      imag_sum += a0 * d0;
-      real_sum -= b0 * d0;
-      imag_sum += b0 * c0;
+    real_sum += a0 * c0;
+    imag_sum += a0 * d0;
+    real_sum -= b0 * d0;
+    imag_sum += b0 * c0;

-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
+    a0 = *pSrcA++;
+    b0 = *pSrcA++;
+    c0 = *pSrcB++;
+    d0 = *pSrcB++;

-      real_sum += a0 * c0;
-      imag_sum += a0 * d0;
-      real_sum -= b0 * d0;
-      imag_sum += b0 * c0;
+    real_sum += a0 * c0;
+    imag_sum += a0 * d0;
+    real_sum -= b0 * d0;
+    imag_sum += b0 * c0;

-      /* Decrement the loop counter */
-      blkCnt--;
+    /* Decrement loop counter */
+    blkCnt--;
  }

-  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
-  blkCnt = numSamples & 0x3U;
-
-  while (blkCnt > 0U)
-  {
-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
-
-      real_sum += a0 * c0;
-      imag_sum += a0 * d0;
-      real_sum -= b0 * d0;
-      imag_sum += b0 * c0;
-
-      /* Decrement the loop counter */
-      blkCnt--;
-  }
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = numSamples % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
+  /* Initialize blkCnt with number of samples */
+  blkCnt = numSamples;

-  while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
+
+  while (blkCnt > 0U)
  {
-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
+    a0 = *pSrcA++;
+    b0 = *pSrcA++;
+    c0 = *pSrcB++;
+    d0 = *pSrcB++;

-      real_sum += a0 * c0;
-      imag_sum += a0 * d0;
-      real_sum -= b0 * d0;
-      imag_sum += b0 * c0;
+    real_sum += a0 * c0;
+    imag_sum += a0 * d0;
+    real_sum -= b0 * d0;
+    imag_sum += b0 * c0;

-      /* Decrement the loop counter */
-      numSamples--;
+    /* Decrement loop counter */
+    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
-  /* Store the real and imaginary results in the destination buffers */
+  /* Store real and imaginary result in destination buffer. */
  *realResult = real_sum;
  *imagResult = imag_sum;
 }

 /**
- * @} end of cmplx_dot_prod group
+  @} end of cmplx_dot_prod group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_dot_prod_q15.c
 * Description:  Processing function for the Q15 Complex Dot product
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,143 +29,120 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @addtogroup cmplx_dot_prod
- * @{
+  @addtogroup cmplx_dot_prod
+  @{
 */

 /**
- * @brief  Q15 complex dot product
- * @param  *pSrcA points to the first input vector
- * @param  *pSrcB points to the second input vector
- * @param  numSamples number of complex samples in each vector
- * @param  *realResult real part of the result returned here
- * @param  *imagResult imaginary part of the result returned here
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The intermediate 1.15 by 1.15 multiplications are performed with full precision and yield a 2.30 result.
- * These are accumulated in a 64-bit accumulator with 34.30 precision.
- * As a final step, the accumulators are converted to 8.24 format.
- * The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
+  @brief         Q15 complex dot product.
+  @param[in]     pSrcA       points to the first input vector
+  @param[in]     pSrcB       points to the second input vector
+  @param[in]     numSamples  number of samples in each vector
+  @param[out]    realResult  real part of the result returned here
+  @param[out]    imagResult  imaginary part of the result returned her
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function is implemented using an internal 64-bit accumulator.
+                   The intermediate 1.15 by 1.15 multiplications are performed with full precision and yield a 2.30 result.
+                   These are accumulated in a 64-bit accumulator with 34.30 precision.
+                   As a final step, the accumulators are converted to 8.24 format.
+                   The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
 */

 void arm_cmplx_dot_prod_q15(
-  q15_t * pSrcA,
-  q15_t * pSrcB,
-  uint32_t numSamples,
-  q31_t * realResult,
-  q31_t * imagResult)
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        uint32_t numSamples,
+        q31_t * realResult,
+        q31_t * imagResult)
 {
-  q63_t real_sum = 0, imag_sum = 0;              /* Temporary result storage */
-  q15_t a0,b0,c0,d0;
+        uint32_t blkCnt;                               /* Loop counter */
+        q63_t real_sum = 0, imag_sum = 0;              /* Temporary result variables */
+        q15_t a0,b0,c0,d0;

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  uint32_t blkCnt;                               /* loop counter */
-
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
+    a0 = *pSrcA++;
+    b0 = *pSrcA++;
+    c0 = *pSrcB++;
+    d0 = *pSrcB++;

-      real_sum += (q31_t)a0 * c0;
-      imag_sum += (q31_t)a0 * d0;
-      real_sum -= (q31_t)b0 * d0;
-      imag_sum += (q31_t)b0 * c0;
+    real_sum += (q31_t)a0 * c0;
+    imag_sum += (q31_t)a0 * d0;
+    real_sum -= (q31_t)b0 * d0;
+    imag_sum += (q31_t)b0 * c0;

-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
+    a0 = *pSrcA++;
+    b0 = *pSrcA++;
+    c0 = *pSrcB++;
+    d0 = *pSrcB++;

-      real_sum += (q31_t)a0 * c0;
-      imag_sum += (q31_t)a0 * d0;
-      real_sum -= (q31_t)b0 * d0;
-      imag_sum += (q31_t)b0 * c0;
+    real_sum += (q31_t)a0 * c0;
+    imag_sum += (q31_t)a0 * d0;
+    real_sum -= (q31_t)b0 * d0;
+    imag_sum += (q31_t)b0 * c0;

-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
+    a0 = *pSrcA++;
+    b0 = *pSrcA++;
+    c0 = *pSrcB++;
+    d0 = *pSrcB++;

-      real_sum += (q31_t)a0 * c0;
-      imag_sum += (q31_t)a0 * d0;
-      real_sum -= (q31_t)b0 * d0;
-      imag_sum += (q31_t)b0 * c0;
+    real_sum += (q31_t)a0 * c0;
+    imag_sum += (q31_t)a0 * d0;
+    real_sum -= (q31_t)b0 * d0;
+    imag_sum += (q31_t)b0 * c0;

-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
+    a0 = *pSrcA++;
+    b0 = *pSrcA++;
+    c0 = *pSrcB++;
+    d0 = *pSrcB++;

-      real_sum += (q31_t)a0 * c0;
-      imag_sum += (q31_t)a0 * d0;
-      real_sum -= (q31_t)b0 * d0;
-      imag_sum += (q31_t)b0 * c0;
+    real_sum += (q31_t)a0 * c0;
+    imag_sum += (q31_t)a0 * d0;
+    real_sum -= (q31_t)b0 * d0;
+    imag_sum += (q31_t)b0 * c0;

-      /* Decrement the loop counter */
-      blkCnt--;
+    /* Decrement loop counter */
+    blkCnt--;
  }

-  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

-  while (blkCnt > 0U)
-  {
-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
-
-      real_sum += (q31_t)a0 * c0;
-      imag_sum += (q31_t)a0 * d0;
-      real_sum -= (q31_t)b0 * d0;
-      imag_sum += (q31_t)b0 * c0;
-
-      /* Decrement the loop counter */
-      blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
+  /* Initialize blkCnt with number of samples */
+  blkCnt = numSamples;

-  while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
  {
-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
+    a0 = *pSrcA++;
+    b0 = *pSrcA++;
+    c0 = *pSrcB++;
+    d0 = *pSrcB++;

-      real_sum += a0 * c0;
-      imag_sum += a0 * d0;
-      real_sum -= b0 * d0;
-      imag_sum += b0 * c0;
+    real_sum += (q31_t)a0 * c0;
+    imag_sum += (q31_t)a0 * d0;
+    real_sum -= (q31_t)b0 * d0;
+    imag_sum += (q31_t)b0 * c0;

-
-      /* Decrement the loop counter */
-      numSamples--;
+    /* Decrement loop counter */
+    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
-  /* Store the real and imaginary results in 8.24 format  */
+  /* Store real and imaginary result in 8.24 format  */
  /* Convert real data in 34.30 to 8.24 by 6 right shifts */
  *realResult = (q31_t) (real_sum >> 6);
  /* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
@ -173,5 +150,5 @@ void arm_cmplx_dot_prod_q15(
 }

 /**
- * @} end of cmplx_dot_prod group
+  @} end of cmplx_dot_prod group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_dot_prod_q31.c
 * Description:  Q31 complex dot product
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,147 +29,125 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @addtogroup cmplx_dot_prod
- * @{
+  @addtogroup cmplx_dot_prod
+  @{
 */

 /**
- * @brief  Q31 complex dot product
- * @param  *pSrcA points to the first input vector
- * @param  *pSrcB points to the second input vector
- * @param  numSamples number of complex samples in each vector
- * @param  *realResult real part of the result returned here
- * @param  *imagResult imaginary part of the result returned here
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
- * The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
- * Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768.
- * The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format.
- * Input down scaling is not required.
+  @brief         Q31 complex dot product.
+  @param[in]     pSrcA       points to the first input vector
+  @param[in]     pSrcB       points to the second input vector
+  @param[in]     numSamples  number of samples in each vector
+  @param[out]    realResult  real part of the result returned here
+  @param[out]    imagResult  imaginary part of the result returned here
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function is implemented using an internal 64-bit accumulator.
+                   The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
+                   The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
+                   Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768.
+                   The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format.
+                   Input down scaling is not required.
 */

 void arm_cmplx_dot_prod_q31(
-  q31_t * pSrcA,
-  q31_t * pSrcB,
-  uint32_t numSamples,
-  q63_t * realResult,
-  q63_t * imagResult)
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        uint32_t numSamples,
+        q63_t * realResult,
+        q63_t * imagResult)
 {
-  q63_t real_sum = 0, imag_sum = 0;              /* Temporary result storage */
-  q31_t a0,b0,c0,d0;
+        uint32_t blkCnt;                               /* Loop counter */
+        q63_t real_sum = 0, imag_sum = 0;              /* Temporary result variables */
+        q31_t a0,b0,c0,d0;

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  uint32_t blkCnt;                               /* loop counter */
-
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
+    a0 = *pSrcA++;
+    b0 = *pSrcA++;
+    c0 = *pSrcB++;
+    d0 = *pSrcB++;

-      real_sum += ((q63_t)a0 * c0) >> 14;
-      imag_sum += ((q63_t)a0 * d0) >> 14;
-      real_sum -= ((q63_t)b0 * d0) >> 14;
-      imag_sum += ((q63_t)b0 * c0) >> 14;
+    real_sum += ((q63_t)a0 * c0) >> 14;
+    imag_sum += ((q63_t)a0 * d0) >> 14;
+    real_sum -= ((q63_t)b0 * d0) >> 14;
+    imag_sum += ((q63_t)b0 * c0) >> 14;

-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
+    a0 = *pSrcA++;
+    b0 = *pSrcA++;
+    c0 = *pSrcB++;
+    d0 = *pSrcB++;

-      real_sum += ((q63_t)a0 * c0) >> 14;
-      imag_sum += ((q63_t)a0 * d0) >> 14;
-      real_sum -= ((q63_t)b0 * d0) >> 14;
-      imag_sum += ((q63_t)b0 * c0) >> 14;
+    real_sum += ((q63_t)a0 * c0) >> 14;
+    imag_sum += ((q63_t)a0 * d0) >> 14;
+    real_sum -= ((q63_t)b0 * d0) >> 14;
+    imag_sum += ((q63_t)b0 * c0) >> 14;

-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
+    a0 = *pSrcA++;
+    b0 = *pSrcA++;
+    c0 = *pSrcB++;
+    d0 = *pSrcB++;

-      real_sum += ((q63_t)a0 * c0) >> 14;
-      imag_sum += ((q63_t)a0 * d0) >> 14;
-      real_sum -= ((q63_t)b0 * d0) >> 14;
-      imag_sum += ((q63_t)b0 * c0) >> 14;
+    real_sum += ((q63_t)a0 * c0) >> 14;
+    imag_sum += ((q63_t)a0 * d0) >> 14;
+    real_sum -= ((q63_t)b0 * d0) >> 14;
+    imag_sum += ((q63_t)b0 * c0) >> 14;

-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
+    a0 = *pSrcA++;
+    b0 = *pSrcA++;
+    c0 = *pSrcB++;
+    d0 = *pSrcB++;

-      real_sum += ((q63_t)a0 * c0) >> 14;
-      imag_sum += ((q63_t)a0 * d0) >> 14;
-      real_sum -= ((q63_t)b0 * d0) >> 14;
-      imag_sum += ((q63_t)b0 * c0) >> 14;
+    real_sum += ((q63_t)a0 * c0) >> 14;
+    imag_sum += ((q63_t)a0 * d0) >> 14;
+    real_sum -= ((q63_t)b0 * d0) >> 14;
+    imag_sum += ((q63_t)b0 * c0) >> 14;

-      /* Decrement the loop counter */
-      blkCnt--;
+    /* Decrement loop counter */
+    blkCnt--;
  }

-  /* If the numSamples  is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

-  while (blkCnt > 0U)
-  {
-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
-
-      real_sum += ((q63_t)a0 * c0) >> 14;
-      imag_sum += ((q63_t)a0 * d0) >> 14;
-      real_sum -= ((q63_t)b0 * d0) >> 14;
-      imag_sum += ((q63_t)b0 * c0) >> 14;
-
-      /* Decrement the loop counter */
-      blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
+  /* Initialize blkCnt with number of samples */
+  blkCnt = numSamples;

-  while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
  {
-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
+    a0 = *pSrcA++;
+    b0 = *pSrcA++;
+    c0 = *pSrcB++;
+    d0 = *pSrcB++;

-      real_sum += ((q63_t)a0 * c0) >> 14;
-      imag_sum += ((q63_t)a0 * d0) >> 14;
-      real_sum -= ((q63_t)b0 * d0) >> 14;
-      imag_sum += ((q63_t)b0 * c0) >> 14;
+    real_sum += ((q63_t)a0 * c0) >> 14;
+    imag_sum += ((q63_t)a0 * d0) >> 14;
+    real_sum -= ((q63_t)b0 * d0) >> 14;
+    imag_sum += ((q63_t)b0 * c0) >> 14;

-      /* Decrement the loop counter */
-      numSamples--;
+    /* Decrement loop counter */
+    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
-  /* Store the real and imaginary results in 16.48 format  */
+  /* Store real and imaginary result in 16.48 format  */
  *realResult = real_sum;
  *imagResult = imag_sum;
 }

 /**
- * @} end of cmplx_dot_prod group
+  @} end of cmplx_dot_prod group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_mag_f32.c
 * Description:  Floating-point complex magnitude
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,125 +29,160 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @defgroup cmplx_mag Complex Magnitude
- *
- * Computes the magnitude of the elements of a complex data vector.
- *
- * The <code>pSrc</code> points to the source data and
- * <code>pDst</code> points to the where the result should be written.
- * <code>numSamples</code> specifies the number of complex samples
- * in the input array and the data is stored in an interleaved fashion
- * (real, imag, real, imag, ...).
- * The input array has a total of <code>2*numSamples</code> values;
- * the output array has a total of <code>numSamples</code> values.
- * The underlying algorithm is used:
- *
- * <pre>
- * for(n=0; n<numSamples; n++) {
- *     pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
- * }
- * </pre>
- *
- * There are separate functions for floating-point, Q15, and Q31 data types.
+  @defgroup cmplx_mag Complex Magnitude
+
+  Computes the magnitude of the elements of a complex data vector.
+
+  The <code>pSrc</code> points to the source data and
+  <code>pDst</code> points to the where the result should be written.
+  <code>numSamples</code> specifies the number of complex samples
+  in the input array and the data is stored in an interleaved fashion
+  (real, imag, real, imag, ...).
+  The input array has a total of <code>2*numSamples</code> values;
+  the output array has a total of <code>numSamples</code> values.
+
+  The underlying algorithm is used:
+
+  <pre>
+  for (n = 0; n < numSamples; n++) {
+      pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
+  }
+  </pre>
+
+  There are separate functions for floating-point, Q15, and Q31 data types.
 */

 /**
- * @addtogroup cmplx_mag
- * @{
- */
-/**
- * @brief Floating-point complex magnitude.
- * @param[in]       *pSrc points to complex input buffer
- * @param[out]      *pDst points to real output buffer
- * @param[in]       numSamples number of complex samples in the input vector
- * @return none.
- *
+  @addtogroup cmplx_mag
+  @{
 */

+/**
+  @brief         Floating-point complex magnitude.
+  @param[in]     pSrc        points to input vector
+  @param[out]    pDst        points to output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
+ */

 void arm_cmplx_mag_f32(
-  float32_t * pSrc,
-  float32_t * pDst,
-  uint32_t numSamples)
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples)
 {
-  float32_t realIn, imagIn;                      /* Temporary variables to hold input values */
-
-#if defined (ARM_MATH_DSP)
-
-  /* Run the below code for Cortex-M4 and Cortex-M3 */
  uint32_t blkCnt;                               /* loop counter */
+  float32_t real, imag;                      /* Temporary variables to hold input values */

-  /*loop Unrolling */
-  blkCnt = numSamples >> 2U;
+#if defined(ARM_MATH_NEON)
+
+  float32x4x2_t vecA;
+  float32x4_t vRealA;
+  float32x4_t vImagA;
+  float32x4_t vMagSqA;
+
+  float32x4x2_t vecB;
+  float32x4_t vRealB;
+  float32x4_t vImagB;
+  float32x4_t vMagSqB;
+
+  /* Loop unrolling: Compute 8 outputs at a time */
+  blkCnt = numSamples >> 3;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
+    /* out = sqrt((real * real) + (imag * imag)) */

-    /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
-    realIn = *pSrc++;
-    imagIn = *pSrc++;
-    /* store the result in the destination buffer. */
-    arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
+    vecA = vld2q_f32(pSrc);
+    pSrc += 8;

-    realIn = *pSrc++;
-    imagIn = *pSrc++;
-    arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
+    vecB = vld2q_f32(pSrc);
+    pSrc += 8;

-    realIn = *pSrc++;
-    imagIn = *pSrc++;
-    arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
+    vRealA = vmulq_f32(vecA.val[0], vecA.val[0]);
+    vImagA = vmulq_f32(vecA.val[1], vecA.val[1]);
+    vMagSqA = vaddq_f32(vRealA, vImagA);

-    realIn = *pSrc++;
-    imagIn = *pSrc++;
-    arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
+    vRealB = vmulq_f32(vecB.val[0], vecB.val[0]);
+    vImagB = vmulq_f32(vecB.val[1], vecB.val[1]);
+    vMagSqB = vaddq_f32(vRealB, vImagB);

+    /* Store the result in the destination buffer. */
+    vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(vMagSqA));
+    pDst += 4;
+
+    vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(vMagSqB));
+    pDst += 4;

    /* Decrement the loop counter */
    blkCnt--;
  }

-  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
-  blkCnt = numSamples % 0x4U;
-
-  while (blkCnt > 0U)
-  {
-    /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
-    realIn = *pSrc++;
-    imagIn = *pSrc++;
-    /* store the result in the destination buffer. */
-    arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
+  blkCnt = numSamples & 7;

 #else

-  /* Run the below code for Cortex-M0 */
+#if defined (ARM_MATH_LOOPUNROLL)

-  while (numSamples > 0U)
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = numSamples >> 2U;
+
+  while (blkCnt > 0U)
  {
-    /* out = sqrt((real * real) + (imag * imag)) */
-    realIn = *pSrc++;
-    imagIn = *pSrc++;
-    /* store the result in the destination buffer. */
-    arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
+    /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */

-    /* Decrement the loop counter */
-    numSamples--;
+    real = *pSrc++;
+    imag = *pSrc++;
+
+    /* store result in destination buffer. */
+    arm_sqrt_f32((real * real) + (imag * imag), pDst++);
+
+    real = *pSrc++;
+    imag = *pSrc++;
+    arm_sqrt_f32((real * real) + (imag * imag), pDst++);
+
+    real = *pSrc++;
+    imag = *pSrc++;
+    arm_sqrt_f32((real * real) + (imag * imag), pDst++);
+
+    real = *pSrc++;
+    imag = *pSrc++;
+    arm_sqrt_f32((real * real) + (imag * imag), pDst++);
+
+    /* Decrement loop counter */
+    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = numSamples % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
+
+  while (blkCnt > 0U)
+  {
+    /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
+
+    real = *pSrc++;
+    imag = *pSrc++;
+
+    /* store result in destination buffer. */
+    arm_sqrt_f32((real * real) + (imag * imag), pDst++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }

 }

 /**
- * @} end of cmplx_mag group
+  @} end of cmplx_mag group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_mag_q15.c
 * Description:  Q15 complex magnitude
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,113 +29,134 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @addtogroup cmplx_mag
- * @{
+  @addtogroup cmplx_mag
+  @{
 */

-
 /**
- * @brief  Q15 complex magnitude
- * @param  *pSrc points to the complex input vector
- * @param  *pDst points to the real output vector
- * @param  numSamples number of complex samples in the input vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function implements 1.15 by 1.15 multiplications and finally output is converted into 2.14 format.
+  @brief         Q15 complex magnitude.
+  @param[in]     pSrc        points to input vector
+  @param[out]    pDst        points to output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function implements 1.15 by 1.15 multiplications and finally output is converted into 2.14 format.
 */

 void arm_cmplx_mag_q15(
-  q15_t * pSrc,
-  q15_t * pDst,
-  uint32_t numSamples)
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples)
 {
-  q31_t acc0, acc1;                              /* Accumulators */
+        uint32_t blkCnt;                               /* Loop counter */

 #if defined (ARM_MATH_DSP)
+        q31_t in;
+        q31_t acc0;                                    /* Accumulators */
+#else
+       q15_t real, imag;                              /* Temporary input variables */
+       q31_t acc0, acc1;                              /* Accumulators */
+#endif

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  uint32_t blkCnt;                               /* loop counter */
-  q31_t in1, in2, in3, in4;
-  q31_t acc2, acc3;
+#if defined (ARM_MATH_LOOPUNROLL)

-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
-  while (blkCnt > 0U)
-  {
-
-    /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
-    in1 = *__SIMD32(pSrc)++;
-    in2 = *__SIMD32(pSrc)++;
-    in3 = *__SIMD32(pSrc)++;
-    in4 = *__SIMD32(pSrc)++;
-
-    acc0 = __SMUAD(in1, in1);
-    acc1 = __SMUAD(in2, in2);
-    acc2 = __SMUAD(in3, in3);
-    acc3 = __SMUAD(in4, in4);
-
-    /* store the result in 2.14 format in the destination buffer. */
-    arm_sqrt_q15((q15_t) ((acc0) >> 17), pDst++);
-    arm_sqrt_q15((q15_t) ((acc1) >> 17), pDst++);
-    arm_sqrt_q15((q15_t) ((acc2) >> 17), pDst++);
-    arm_sqrt_q15((q15_t) ((acc3) >> 17), pDst++);
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
-  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
-  blkCnt = numSamples % 0x4U;
-
  while (blkCnt > 0U)
  {
    /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
-    in1 = *__SIMD32(pSrc)++;
-    acc0 = __SMUAD(in1, in1);

-    /* store the result in 2.14 format in the destination buffer. */
+#if defined (ARM_MATH_DSP)
+    in = read_q15x2_ia ((q15_t **) &pSrc);
+    acc0 = __SMUAD(in, in);
+    /* store result in 2.14 format in destination buffer. */
    arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);

-    /* Decrement the loop counter */
+    in = read_q15x2_ia ((q15_t **) &pSrc);
+    acc0 = __SMUAD(in, in);
+    arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
+
+    in = read_q15x2_ia ((q15_t **) &pSrc);
+    acc0 = __SMUAD(in, in);
+    arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
+
+    in = read_q15x2_ia ((q15_t **) &pSrc);
+    acc0 = __SMUAD(in, in);
+    arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
+#else
+    real = *pSrc++;
+    imag = *pSrc++;
+    acc0 = ((q31_t) real * real);
+    acc1 = ((q31_t) imag * imag);
+
+    /* store result in 2.14 format in destination buffer. */
+    arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
+
+    real = *pSrc++;
+    imag = *pSrc++;
+    acc0 = ((q31_t) real * real);
+    acc1 = ((q31_t) imag * imag);
+    arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
+
+    real = *pSrc++;
+    imag = *pSrc++;
+    acc0 = ((q31_t) real * real);
+    acc1 = ((q31_t) imag * imag);
+    arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
+
+    real = *pSrc++;
+    imag = *pSrc++;
+    acc0 = ((q31_t) real * real);
+    acc1 = ((q31_t) imag * imag);
+    arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
+#endif /* #if defined (ARM_MATH_DSP) */
+
+    /* Decrement loop counter */
    blkCnt--;
  }

+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = numSamples % 0x4U;
+
 #else

-  /* Run the below code for Cortex-M0 */
-  q15_t real, imag;                              /* Temporary variables to hold input values */
+  /* Initialize blkCnt with number of samples */
+  blkCnt = numSamples;

-  while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
  {
-    /* out = sqrt(real * real + imag * imag) */
+    /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
+
+#if defined (ARM_MATH_DSP)
+    in = read_q15x2_ia ((q15_t **) &pSrc);
+    acc0 = __SMUAD(in, in);
+
+    /* store result in 2.14 format in destination buffer. */
+    arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
+#else
    real = *pSrc++;
    imag = *pSrc++;
+    acc0 = ((q31_t) real * real);
+    acc1 = ((q31_t) imag * imag);

-    acc0 = (real * real);
-    acc1 = (imag * imag);
-
-    /* store the result in 2.14 format in the destination buffer. */
+    /* store result in 2.14 format in destination buffer. */
    arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
+#endif

-    /* Decrement the loop counter */
-    numSamples--;
+    /* Decrement loop counter */
+    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of cmplx_mag group
+  @} end of cmplx_mag group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_mag_q31.c
 * Description:  Q31 complex magnitude
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,145 +29,102 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @addtogroup cmplx_mag
- * @{
+  @addtogroup cmplx_mag
+  @{
 */

 /**
- * @brief  Q31 complex magnitude
- * @param  *pSrc points to the complex input vector
- * @param  *pDst points to the real output vector
- * @param  numSamples number of complex samples in the input vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function implements 1.31 by 1.31 multiplications and finally output is converted into 2.30 format.
- * Input down scaling is not required.
+  @brief         Q31 complex magnitude.
+  @param[in]     pSrc        points to input vector
+  @param[out]    pDst        points to output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function implements 1.31 by 1.31 multiplications and finally output is converted into 2.30 format.
+                   Input down scaling is not required.
 */

 void arm_cmplx_mag_q31(
-  q31_t * pSrc,
-  q31_t * pDst,
-  uint32_t numSamples)
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples)
 {
-  q31_t real, imag;                              /* Temporary variables to hold input values */
-  q31_t acc0, acc1;                              /* Accumulators */
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t real, imag;                              /* Temporary input variables */
+        q31_t acc0, acc1;                              /* Accumulators */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  q31_t real1, real2, imag1, imag2;              /* Temporary variables to hold input values */
-  q31_t out1, out2, out3, out4;                  /* Accumulators */
-  q63_t mul1, mul2, mul3, mul4;                  /* Temporary variables */
-
-
-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
-  while (blkCnt > 0U)
-  {
-    /* read complex input from source buffer */
-    real1 = pSrc[0];
-    imag1 = pSrc[1];
-    real2 = pSrc[2];
-    imag2 = pSrc[3];
-
-    /* calculate power of input values */
-    mul1 = (q63_t) real1 *real1;
-    mul2 = (q63_t) imag1 *imag1;
-    mul3 = (q63_t) real2 *real2;
-    mul4 = (q63_t) imag2 *imag2;
-
-    /* get the result to 3.29 format */
-    out1 = (q31_t) (mul1 >> 33);
-    out2 = (q31_t) (mul2 >> 33);
-    out3 = (q31_t) (mul3 >> 33);
-    out4 = (q31_t) (mul4 >> 33);
-
-    /* add real and imaginary accumulators */
-    out1 = out1 + out2;
-    out3 = out3 + out4;
-
-    /* read complex input from source buffer */
-    real1 = pSrc[4];
-    imag1 = pSrc[5];
-    real2 = pSrc[6];
-    imag2 = pSrc[7];
-
-    /* calculate square root */
-    arm_sqrt_q31(out1, &pDst[0]);
-
-    /* calculate power of input values */
-    mul1 = (q63_t) real1 *real1;
-
-    /* calculate square root */
-    arm_sqrt_q31(out3, &pDst[1]);
-
-    /* calculate power of input values */
-    mul2 = (q63_t) imag1 *imag1;
-    mul3 = (q63_t) real2 *real2;
-    mul4 = (q63_t) imag2 *imag2;
-
-    /* get the result to 3.29 format */
-    out1 = (q31_t) (mul1 >> 33);
-    out2 = (q31_t) (mul2 >> 33);
-    out3 = (q31_t) (mul3 >> 33);
-    out4 = (q31_t) (mul4 >> 33);
-
-    /* add real and imaginary accumulators */
-    out1 = out1 + out2;
-    out3 = out3 + out4;
-
-    /* calculate square root */
-    arm_sqrt_q31(out1, &pDst[2]);
-
-    /* increment destination by 8 to process next samples */
-    pSrc += 8U;
-
-    /* calculate square root */
-    arm_sqrt_q31(out3, &pDst[3]);
-
-    /* increment destination by 4 to process next samples */
-    pDst += 4U;
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
-  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
-  blkCnt = numSamples % 0x4U;
-
-#else
-
-  /* Run the below code for Cortex-M0 */
-  blkCnt = numSamples;
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
  while (blkCnt > 0U)
  {
    /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
+
    real = *pSrc++;
    imag = *pSrc++;
    acc0 = (q31_t) (((q63_t) real * real) >> 33);
    acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
-    /* store the result in 2.30 format in the destination buffer. */
+
+    /* store result in 2.30 format in destination buffer. */
    arm_sqrt_q31(acc0 + acc1, pDst++);

-    /* Decrement the loop counter */
+    real = *pSrc++;
+    imag = *pSrc++;
+    acc0 = (q31_t) (((q63_t) real * real) >> 33);
+    acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
+    arm_sqrt_q31(acc0 + acc1, pDst++);
+
+    real = *pSrc++;
+    imag = *pSrc++;
+    acc0 = (q31_t) (((q63_t) real * real) >> 33);
+    acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
+    arm_sqrt_q31(acc0 + acc1, pDst++);
+
+    real = *pSrc++;
+    imag = *pSrc++;
+    acc0 = (q31_t) (((q63_t) real * real) >> 33);
+    acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
+    arm_sqrt_q31(acc0 + acc1, pDst++);
+
+    /* Decrement loop counter */
    blkCnt--;
  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = numSamples % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
+
+    real = *pSrc++;
+    imag = *pSrc++;
+    acc0 = (q31_t) (((q63_t) real * real) >> 33);
+    acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
+
+    /* store result in 2.30 format in destination buffer. */
+    arm_sqrt_q31(acc0 + acc1, pDst++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
 }

 /**
- * @} end of cmplx_mag group
+  @} end of cmplx_mag group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_mag_squared_f32.c
 * Description:  Floating-point complex magnitude squared
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,176 +29,156 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @defgroup cmplx_mag_squared Complex Magnitude Squared
- *
- * Computes the magnitude squared of the elements of a complex data vector.
- *
- * The <code>pSrc</code> points to the source data and
- * <code>pDst</code> points to the where the result should be written.
- * <code>numSamples</code> specifies the number of complex samples
- * in the input array and the data is stored in an interleaved fashion
- * (real, imag, real, imag, ...).
- * The input array has a total of <code>2*numSamples</code> values;
- * the output array has a total of <code>numSamples</code> values.
- *
- * The underlying algorithm is used:
- *
- * <pre>
- * for(n=0; n<numSamples; n++) {
- *     pDst[n] = pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2;
- * }
- * </pre>
- *
- * There are separate functions for floating-point, Q15, and Q31 data types.
+  @defgroup cmplx_mag_squared Complex Magnitude Squared
+
+  Computes the magnitude squared of the elements of a complex data vector.
+
+  The <code>pSrc</code> points to the source data and
+  <code>pDst</code> points to the where the result should be written.
+  <code>numSamples</code> specifies the number of complex samples
+  in the input array and the data is stored in an interleaved fashion
+  (real, imag, real, imag, ...).
+  The input array has a total of <code>2*numSamples</code> values;
+  the output array has a total of <code>numSamples</code> values.
+
+  The underlying algorithm is used:
+
+  <pre>
+  for (n = 0; n < numSamples; n++) {
+      pDst[n] = pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2;
+  }
+  </pre>
+
+  There are separate functions for floating-point, Q15, and Q31 data types.
 */

 /**
- * @addtogroup cmplx_mag_squared
- * @{
+  @addtogroup cmplx_mag_squared
+  @{
 */

-
 /**
- * @brief  Floating-point complex magnitude squared
- * @param[in]  *pSrc points to the complex input vector
- * @param[out]  *pDst points to the real output vector
- * @param[in]  numSamples number of complex samples in the input vector
- * @return none.
+  @brief         Floating-point complex magnitude squared.
+  @param[in]     pSrc        points to input vector
+  @param[out]    pDst        points to output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
 */

 void arm_cmplx_mag_squared_f32(
-  float32_t * pSrc,
-  float32_t * pDst,
-  uint32_t numSamples)
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples)
 {
-  float32_t real, imag;                          /* Temporary variables to store real and imaginary values */
-  uint32_t blkCnt;                               /* loop counter */
+        uint32_t blkCnt;                               /* Loop counter */
+        float32_t real, imag;                          /* Temporary input variables */

-#if defined (ARM_MATH_DSP)
-  float32_t real1, real2, real3, real4;          /* Temporary variables to hold real values */
-  float32_t imag1, imag2, imag3, imag4;          /* Temporary variables to hold imaginary values */
-  float32_t mul1, mul2, mul3, mul4;              /* Temporary variables */
-  float32_t mul5, mul6, mul7, mul8;              /* Temporary variables */
-  float32_t out1, out2, out3, out4;              /* Temporary variables to hold output values */
+#if defined(ARM_MATH_NEON)
+  float32x4x2_t vecA;
+  float32x4_t vRealA;
+  float32x4_t vImagA;
+  float32x4_t vMagSqA;

-  /*loop Unrolling */
-  blkCnt = numSamples >> 2U;
+  float32x4x2_t vecB;
+  float32x4_t vRealB;
+  float32x4_t vImagB;
+  float32x4_t vMagSqB;
+
+  /* Loop unrolling: Compute 8 outputs at a time */
+  blkCnt = numSamples >> 3;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
-    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
-    /* read real input sample from source buffer */
-    real1 = pSrc[0];
-    /* read imaginary input sample from source buffer */
-    imag1 = pSrc[1];
+    /* out = sqrt((real * real) + (imag * imag)) */

-    /* calculate power of real value */
-    mul1 = real1 * real1;
+    vecA = vld2q_f32(pSrc);
+    pSrc += 8;

-    /* read real input sample from source buffer */
-    real2 = pSrc[2];
+    vRealA = vmulq_f32(vecA.val[0], vecA.val[0]);
+    vImagA = vmulq_f32(vecA.val[1], vecA.val[1]);
+    vMagSqA = vaddq_f32(vRealA, vImagA);

-    /* calculate power of imaginary value */
-    mul2 = imag1 * imag1;
+    vecB = vld2q_f32(pSrc);
+    pSrc += 8;

-    /* read imaginary input sample from source buffer */
-    imag2 = pSrc[3];
+    vRealB = vmulq_f32(vecB.val[0], vecB.val[0]);
+    vImagB = vmulq_f32(vecB.val[1], vecB.val[1]);
+    vMagSqB = vaddq_f32(vRealB, vImagB);

-    /* calculate power of real value */
-    mul3 = real2 * real2;
+    /* Store the result in the destination buffer. */
+    vst1q_f32(pDst, vMagSqA);
+    pDst += 4;

-    /* read real input sample from source buffer */
-    real3 = pSrc[4];
-
-    /* calculate power of imaginary value */
-    mul4 = imag2 * imag2;
-
-    /* read imaginary input sample from source buffer */
-    imag3 = pSrc[5];
-
-    /* calculate power of real value */
-    mul5 = real3 * real3;
-    /* calculate power of imaginary value */
-    mul6 = imag3 * imag3;
-
-    /* read real input sample from source buffer */
-    real4 = pSrc[6];
-
-    /* accumulate real and imaginary powers */
-    out1 = mul1 + mul2;
-
-    /* read imaginary input sample from source buffer */
-    imag4 = pSrc[7];
-
-    /* accumulate real and imaginary powers */
-    out2 = mul3 + mul4;
-
-    /* calculate power of real value */
-    mul7 = real4 * real4;
-    /* calculate power of imaginary value */
-    mul8 = imag4 * imag4;
-
-    /* store output to destination */
-    pDst[0] = out1;
-
-    /* accumulate real and imaginary powers */
-    out3 = mul5 + mul6;
-
-    /* store output to destination */
-    pDst[1] = out2;
-
-    /* accumulate real and imaginary powers */
-    out4 = mul7 + mul8;
-
-    /* store output to destination */
-    pDst[2] = out3;
-
-    /* increment destination pointer by 8 to process next samples */
-    pSrc += 8U;
-
-    /* store output to destination */
-    pDst[3] = out4;
-
-    /* increment destination pointer by 4 to process next samples */
-    pDst += 4U;
+    vst1q_f32(pDst, vMagSqB);
+    pDst += 4;

    /* Decrement the loop counter */
    blkCnt--;
  }

-  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  blkCnt = numSamples & 7;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = numSamples >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
+
+    real = *pSrc++;
+    imag = *pSrc++;
+    *pDst++ = (real * real) + (imag * imag);
+
+    real = *pSrc++;
+    imag = *pSrc++;
+    *pDst++ = (real * real) + (imag * imag);
+
+    real = *pSrc++;
+    imag = *pSrc++;
+    *pDst++ = (real * real) + (imag * imag);
+
+    real = *pSrc++;
+    imag = *pSrc++;
+    *pDst++ = (real * real) + (imag * imag);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
-
+  /* Initialize blkCnt with number of samples */
  blkCnt = numSamples;

-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */

  while (blkCnt > 0U)
  {
    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
+
    real = *pSrc++;
    imag = *pSrc++;

-    /* out = (real * real) + (imag * imag) */
-    /* store the result in the destination buffer. */
+    /* store result in destination buffer. */
    *pDst++ = (real * real) + (imag * imag);

-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of cmplx_mag_squared group
+  @} end of cmplx_mag_squared group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_mag_squared_q15.c
 * Description:  Q15 complex magnitude squared
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,108 +29,133 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @addtogroup cmplx_mag_squared
- * @{
+  @addtogroup cmplx_mag_squared
+  @{
 */

 /**
- * @brief  Q15 complex magnitude squared
- * @param  *pSrc points to the complex input vector
- * @param  *pDst points to the real output vector
- * @param  numSamples number of complex samples in the input vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
+  @brief         Q15 complex magnitude squared.
+  @param[in]     pSrc        points to input vector
+  @param[out]    pDst        points to output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
 */

 void arm_cmplx_mag_squared_q15(
-  q15_t * pSrc,
-  q15_t * pDst,
-  uint32_t numSamples)
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples)
 {
-  q31_t acc0, acc1;                              /* Accumulators */
+        uint32_t blkCnt;                               /* Loop counter */

 #if defined (ARM_MATH_DSP)
+        q31_t in;
+        q31_t acc0;                                    /* Accumulators */
+#else
+        q15_t real, imag;                              /* Temporary input variables */
+        q31_t acc0, acc1;                              /* Accumulators */
+#endif

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  uint32_t blkCnt;                               /* loop counter */
-  q31_t in1, in2, in3, in4;
-  q31_t acc2, acc3;
+#if defined (ARM_MATH_LOOPUNROLL)

-  /*loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
-    in1 = *__SIMD32(pSrc)++;
-    in2 = *__SIMD32(pSrc)++;
-    in3 = *__SIMD32(pSrc)++;
-    in4 = *__SIMD32(pSrc)++;

-    acc0 = __SMUAD(in1, in1);
-    acc1 = __SMUAD(in2, in2);
-    acc2 = __SMUAD(in3, in3);
-    acc3 = __SMUAD(in4, in4);
-
-    /* store the result in 3.13 format in the destination buffer. */
+#if defined (ARM_MATH_DSP)
+    in = read_q15x2_ia ((q15_t **) &pSrc);
+    acc0 = __SMUAD(in, in);
+    /* store result in 3.13 format in destination buffer. */
    *pDst++ = (q15_t) (acc0 >> 17);
-    *pDst++ = (q15_t) (acc1 >> 17);
-    *pDst++ = (q15_t) (acc2 >> 17);
-    *pDst++ = (q15_t) (acc3 >> 17);

-    /* Decrement the loop counter */
+    in = read_q15x2_ia ((q15_t **) &pSrc);
+    acc0 = __SMUAD(in, in);
+    *pDst++ = (q15_t) (acc0 >> 17);
+
+    in = read_q15x2_ia ((q15_t **) &pSrc);
+    acc0 = __SMUAD(in, in);
+    *pDst++ = (q15_t) (acc0 >> 17);
+
+    in = read_q15x2_ia ((q15_t **) &pSrc);
+    acc0 = __SMUAD(in, in);
+    *pDst++ = (q15_t) (acc0 >> 17);
+#else
+    real = *pSrc++;
+    imag = *pSrc++;
+    acc0 = ((q31_t) real * real);
+    acc1 = ((q31_t) imag * imag);
+    /* store result in 3.13 format in destination buffer. */
+    *pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
+
+    real = *pSrc++;
+    imag = *pSrc++;
+    acc0 = ((q31_t) real * real);
+    acc1 = ((q31_t) imag * imag);
+    *pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
+
+    real = *pSrc++;
+    imag = *pSrc++;
+    acc0 = ((q31_t) real * real);
+    acc1 = ((q31_t) imag * imag);
+    *pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
+
+    real = *pSrc++;
+    imag = *pSrc++;
+    acc0 = ((q31_t) real * real);
+    acc1 = ((q31_t) imag * imag);
+    *pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
+#endif /* #if defined (ARM_MATH_DSP) */
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
-    in1 = *__SIMD32(pSrc)++;
-    acc0 = __SMUAD(in1, in1);
-
-    /* store the result in 3.13 format in the destination buffer. */
-    *pDst++ = (q15_t) (acc0 >> 17);
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
-  q15_t real, imag;                              /* Temporary variables to store real and imaginary values */
+  /* Initialize blkCnt with number of samples */
+  blkCnt = numSamples;

-  while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
  {
-    /* out = ((real * real) + (imag * imag)) */
+    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
+
+#if defined (ARM_MATH_DSP)
+    in = read_q15x2_ia ((q15_t **) &pSrc);
+    acc0 = __SMUAD(in, in);
+
+    /* store result in 3.13 format in destination buffer. */
+    *pDst++ = (q15_t) (acc0 >> 17);
+#else
    real = *pSrc++;
    imag = *pSrc++;
-    acc0 = (real * real);
-    acc1 = (imag * imag);
-    /* store the result in 3.13 format in the destination buffer. */
+    acc0 = ((q31_t) real * real);
+    acc1 = ((q31_t) imag * imag);
+
+    /* store result in 3.13 format in destination buffer. */
    *pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
+#endif

-    /* Decrement the loop counter */
-    numSamples--;
+    /* Decrement loop counter */
+    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of cmplx_mag_squared group
+  @} end of cmplx_mag_squared group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_mag_squared_q31.c
 * Description:  Q31 complex magnitude squared
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,55 +29,43 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @addtogroup cmplx_mag_squared
- * @{
+  @addtogroup cmplx_mag_squared
+  @{
 */

-
 /**
- * @brief  Q31 complex magnitude squared
- * @param  *pSrc points to the complex input vector
- * @param  *pDst points to the real output vector
- * @param  numSamples number of complex samples in the input vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
- * Input down scaling is not required.
+  @brief         Q31 complex magnitude squared.
+  @param[in]     pSrc        points to input vector
+  @param[out]    pDst        points to output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
+                   Input down scaling is not required.
 */

 void arm_cmplx_mag_squared_q31(
-  q31_t * pSrc,
-  q31_t * pDst,
-  uint32_t numSamples)
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples)
 {
-  q31_t real, imag;                              /* Temporary variables to store real and imaginary values */
-  q31_t acc0, acc1;                              /* Accumulators */
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t real, imag;                              /* Temporary input variables */
+        q31_t acc0, acc1;                              /* Accumulators */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  uint32_t blkCnt;                               /* loop counter */
-
-  /* loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
-    real = *pSrc++;
-    imag = *pSrc++;
-    acc0 = (q31_t) (((q63_t) real * real) >> 33);
-    acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
-    /* store the result in 3.29 format in the destination buffer. */
-    *pDst++ = acc0 + acc1;

    real = *pSrc++;
    imag = *pSrc++;
@ -90,60 +78,52 @@ void arm_cmplx_mag_squared_q31(
    imag = *pSrc++;
    acc0 = (q31_t) (((q63_t) real * real) >> 33);
    acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
-    /* store the result in 3.29 format in the destination buffer. */
    *pDst++ = acc0 + acc1;

    real = *pSrc++;
    imag = *pSrc++;
    acc0 = (q31_t) (((q63_t) real * real) >> 33);
    acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
-    /* store the result in 3.29 format in the destination buffer. */
    *pDst++ = acc0 + acc1;

-    /* Decrement the loop counter */
+    real = *pSrc++;
+    imag = *pSrc++;
+    acc0 = (q31_t) (((q63_t) real * real) >> 33);
+    acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
+    *pDst++ = acc0 + acc1;
+
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
-    real = *pSrc++;
-    imag = *pSrc++;
-    acc0 = (q31_t) (((q63_t) real * real) >> 33);
-    acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
-    /* store the result in 3.29 format in the destination buffer. */
-    *pDst++ = acc0 + acc1;
-
-    /* Decrement the loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
+  /* Initialize blkCnt with number of samples */
+  blkCnt = numSamples;

-  while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
  {
-    /* out = ((real * real) + (imag * imag)) */
+    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
+
    real = *pSrc++;
    imag = *pSrc++;
    acc0 = (q31_t) (((q63_t) real * real) >> 33);
    acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
-    /* store the result in 3.29 format in the destination buffer. */
+
+    /* store result in 3.29 format in destination buffer. */
    *pDst++ = acc0 + acc1;

-    /* Decrement the loop counter */
-    numSamples--;
+    /* Decrement loop counter */
+    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of cmplx_mag_squared group
+  @} end of cmplx_mag_squared group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_mult_cmplx_f32.c
 * Description:  Floating-point complex-by-complex multiplication
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,168 +29,166 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @defgroup CmplxByCmplxMult Complex-by-Complex Multiplication
- *
- * Multiplies a complex vector by another complex vector and generates a complex result.
- * The data in the complex arrays is stored in an interleaved fashion
- * (real, imag, real, imag, ...).
- * The parameter <code>numSamples</code> represents the number of complex
- * samples processed.  The complex arrays have a total of <code>2*numSamples</code>
- * real values.
- *
- * The underlying algorithm is used:
- *
- * <pre>
- * for(n=0; n<numSamples; n++) {
- *     pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
- *     pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
- * }
- * </pre>
- *
- * There are separate functions for floating-point, Q15, and Q31 data types.
+  @defgroup CmplxByCmplxMult Complex-by-Complex Multiplication
+
+  Multiplies a complex vector by another complex vector and generates a complex result.
+  The data in the complex arrays is stored in an interleaved fashion
+  (real, imag, real, imag, ...).
+  The parameter <code>numSamples</code> represents the number of complex
+  samples processed.  The complex arrays have a total of <code>2*numSamples</code>
+  real values.
+
+  The underlying algorithm is used:
+
+  <pre>
+  for (n = 0; n < numSamples; n++) {
+      pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
+      pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
+  }
+  </pre>
+
+  There are separate functions for floating-point, Q15, and Q31 data types.
 */

 /**
- * @addtogroup CmplxByCmplxMult
- * @{
+  @addtogroup CmplxByCmplxMult
+  @{
 */

-
 /**
- * @brief  Floating-point complex-by-complex multiplication
- * @param[in]  *pSrcA points to the first input vector
- * @param[in]  *pSrcB points to the second input vector
- * @param[out]  *pDst  points to the output vector
- * @param[in]  numSamples number of complex samples in each vector
- * @return none.
+  @brief         Floating-point complex-by-complex multiplication.
+  @param[in]     pSrcA       points to first input vector
+  @param[in]     pSrcB       points to second input vector
+  @param[out]    pDst        points to output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
 */

 void arm_cmplx_mult_cmplx_f32(
-  float32_t * pSrcA,
-  float32_t * pSrcB,
-  float32_t * pDst,
-  uint32_t numSamples)
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t numSamples)
 {
-  float32_t a1, b1, c1, d1;                      /* Temporary variables to store real and imaginary values */
-  uint32_t blkCnt;                               /* loop counters */
+    uint32_t blkCnt;                               /* Loop counter */
+    float32_t a, b, c, d;  /* Temporary variables to store real and imaginary values */

-#if defined (ARM_MATH_DSP)
+#if defined(ARM_MATH_NEON)
+    float32x4x2_t va, vb;
+    float32x4_t real, imag;
+    float32x4x2_t outCplx;

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  float32_t a2, b2, c2, d2;                      /* Temporary variables to store real and imaginary values */
-  float32_t acc1, acc2, acc3, acc4;
+    /* Compute 4 outputs at a time */
+    blkCnt = numSamples >> 2U;

+    while (blkCnt > 0U)
+    {
+        va = vld2q_f32(pSrcA);  // load & separate real/imag pSrcA (de-interleave 2)
+        vb = vld2q_f32(pSrcB);  // load & separate real/imag pSrcB

-  /* loop Unrolling */
+	/* Increment pointers */
+        pSrcA += 8;
+        pSrcB += 8;
+	
+	/* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
+        outCplx.val[0] = vmulq_f32(va.val[0], vb.val[0]);
+        outCplx.val[0] = vmlsq_f32(outCplx.val[0], va.val[1], vb.val[1]);
+
+	/* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
+        outCplx.val[1] = vmulq_f32(va.val[0], vb.val[1]);
+        outCplx.val[1] = vmlaq_f32(outCplx.val[1], va.val[1], vb.val[0]);
+
+        vst2q_f32(pDst, outCplx);
+
+	/* Increment pointer */
+        pDst += 8;
+
+	/* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = numSamples & 3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
-    /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
-    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
-    a1 = *pSrcA;                /* A[2 * i] */
-    c1 = *pSrcB;                /* B[2 * i] */
+    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
+    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

-    b1 = *(pSrcA + 1);          /* A[2 * i + 1] */
-    acc1 = a1 * c1;             /* acc1 = A[2 * i] * B[2 * i] */
+    a = *pSrcA++;
+    b = *pSrcA++;
+    c = *pSrcB++;
+    d = *pSrcB++;
+    /* store result in destination buffer. */
+    *pDst++ = (a * c) - (b * d);
+    *pDst++ = (a * d) + (b * c);

-    a2 = *(pSrcA + 2);          /* A[2 * i + 2] */
-    acc2 = (b1 * c1);           /* acc2 = A[2 * i + 1] * B[2 * i] */
+    a = *pSrcA++;
+    b = *pSrcA++;
+    c = *pSrcB++;
+    d = *pSrcB++;
+    *pDst++ = (a * c) - (b * d);
+    *pDst++ = (a * d) + (b * c);

-    d1 = *(pSrcB + 1);          /* B[2 * i + 1] */
-    c2 = *(pSrcB + 2);          /* B[2 * i + 2] */
-    acc1 -= b1 * d1;            /* acc1 =      A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1] */
+    a = *pSrcA++;
+    b = *pSrcA++;
+    c = *pSrcB++;
+    d = *pSrcB++;
+    *pDst++ = (a * c) - (b * d);
+    *pDst++ = (a * d) + (b * c);

-    d2 = *(pSrcB + 3);          /* B[2 * i + 3] */
-    acc3 = a2 * c2;             /* acc3 =       A[2 * i + 2] * B[2 * i + 2] */
+    a = *pSrcA++;
+    b = *pSrcA++;
+    c = *pSrcB++;
+    d = *pSrcB++;
+    *pDst++ = (a * c) - (b * d);
+    *pDst++ = (a * d) + (b * c);

-    b2 = *(pSrcA + 3);          /* A[2 * i + 3] */
-    acc2 += (a1 * d1);          /* acc2 =      A[2 * i + 1] * B[2 * i] + A[2 * i] * B[2 * i + 1] */
-
-    a1 = *(pSrcA + 4);          /* A[2 * i + 4] */
-    acc4 = (a2 * d2);           /* acc4 =   A[2 * i + 2] * B[2 * i + 3] */
-
-    c1 = *(pSrcB + 4);          /* B[2 * i + 4] */
-    acc3 -= (b2 * d2);          /* acc3 =       A[2 * i + 2] * B[2 * i + 2] - A[2 * i + 3] * B[2 * i + 3] */
-    *pDst = acc1;               /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1] */
-
-    b1 = *(pSrcA + 5);          /* A[2 * i + 5] */
-    acc4 += b2 * c2;            /* acc4 =   A[2 * i + 2] * B[2 * i + 3] + A[2 * i + 3] * B[2 * i + 2] */
-
-    *(pDst + 1) = acc2;         /* C[2 * i + 1] = A[2 * i + 1] * B[2 * i] + A[2 * i] * B[2 * i + 1]  */
-    acc1 = (a1 * c1);
-
-    d1 = *(pSrcB + 5);
-    acc2 = (b1 * c1);
-
-    *(pDst + 2) = acc3;
-    *(pDst + 3) = acc4;
-
-    a2 = *(pSrcA + 6);
-    acc1 -= (b1 * d1);
-
-    c2 = *(pSrcB + 6);
-    acc2 += (a1 * d1);
-
-    b2 = *(pSrcA + 7);
-    acc3 = (a2 * c2);
-
-    d2 = *(pSrcB + 7);
-    acc4 = (b2 * c2);
-
-    *(pDst + 4) = acc1;
-    pSrcA += 8U;
-
-    acc3 -= (b2 * d2);
-    acc4 += (a2 * d2);
-
-    *(pDst + 5) = acc2;
-    pSrcB += 8U;
-
-    *(pDst + 6) = acc3;
-    *(pDst + 7) = acc4;
-
-    pDst += 8U;
-
-    /* Decrement the numSamples loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
+  /* Initialize blkCnt with number of samples */
  blkCnt = numSamples;

-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */

  while (blkCnt > 0U)
  {
-    /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
-    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
-    a1 = *pSrcA++;
-    b1 = *pSrcA++;
-    c1 = *pSrcB++;
-    d1 = *pSrcB++;
+    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
+    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

-    /* store the result in the destination buffer. */
-    *pDst++ = (a1 * c1) - (b1 * d1);
-    *pDst++ = (a1 * d1) + (b1 * c1);
+    a = *pSrcA++;
+    b = *pSrcA++;
+    c = *pSrcB++;
+    d = *pSrcB++;

-    /* Decrement the numSamples loop counter */
+    /* store result in destination buffer. */
+    *pDst++ = (a * c) - (b * d);
+    *pDst++ = (a * d) + (b * c);
+
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of CmplxByCmplxMult group
+  @} end of CmplxByCmplxMult group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_mult_cmplx_q15.c
 * Description:  Q15 complex-by-complex multiplication
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,153 +29,108 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @addtogroup CmplxByCmplxMult
- * @{
+  @addtogroup CmplxByCmplxMult
+  @{
 */

 /**
- * @brief  Q15 complex-by-complex multiplication
- * @param[in]  *pSrcA points to the first input vector
- * @param[in]  *pSrcB points to the second input vector
- * @param[out]  *pDst  points to the output vector
- * @param[in]  numSamples number of complex samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
+  @brief         Q15 complex-by-complex multiplication.
+  @param[in]     pSrcA       points to first input vector
+  @param[in]     pSrcB       points to second input vector
+  @param[out]    pDst        points to output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
 */

 void arm_cmplx_mult_cmplx_q15(
-  q15_t * pSrcA,
-  q15_t * pSrcB,
-  q15_t * pDst,
-  uint32_t numSamples)
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t numSamples)
 {
-  q15_t a, b, c, d;                              /* Temporary variables to store real and imaginary values */
+        uint32_t blkCnt;                               /* Loop counter */
+        q15_t a, b, c, d;                              /* Temporary variables */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  uint32_t blkCnt;                               /* loop counters */
-
-  /* loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
-    /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
-    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
-    a = *pSrcA++;
-    b = *pSrcA++;
-    c = *pSrcB++;
-    d = *pSrcB++;
-
-    /* store the result in 3.13 format in the destination buffer. */
-    *pDst++ =
-      (q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
-    /* store the result in 3.13 format in the destination buffer. */
-    *pDst++ =
-      (q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
+    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
+    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
-
-    /* store the result in 3.13 format in the destination buffer. */
-    *pDst++ =
-      (q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
-    /* store the result in 3.13 format in the destination buffer. */
-    *pDst++ =
-      (q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
+    /* store result in 3.13 format in destination buffer. */
+    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
+    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
-
-    /* store the result in 3.13 format in the destination buffer. */
-    *pDst++ =
-      (q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
-    /* store the result in 3.13 format in the destination buffer. */
-    *pDst++ =
-      (q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
+    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
+    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
+    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
+    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );

-    /* store the result in 3.13 format in the destination buffer. */
-    *pDst++ =
-      (q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
-    /* store the result in 3.13 format in the destination buffer. */
-    *pDst++ =
-      (q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
+    a = *pSrcA++;
+    b = *pSrcA++;
+    c = *pSrcB++;
+    d = *pSrcB++;
+    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
+    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );

-    /* Decrement the blockSize loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
-    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
-    a = *pSrcA++;
-    b = *pSrcA++;
-    c = *pSrcB++;
-    d = *pSrcB++;
-
-    /* store the result in 3.13 format in the destination buffer. */
-    *pDst++ =
-      (q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
-    /* store the result in 3.13 format in the destination buffer. */
-    *pDst++ =
-      (q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
-
-    /* Decrement the blockSize loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
+  /* Initialize blkCnt with number of samples */
+  blkCnt = numSamples;

-  while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
  {
-    /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
-    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
+    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
+    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
+
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;

-    /* store the result in 3.13 format in the destination buffer. */
-    *pDst++ =
-      (q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
-    /* store the result in 3.13 format in the destination buffer. */
-    *pDst++ =
-      (q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
+    /* store result in 3.13 format in destination buffer. */
+    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
+    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );

-    /* Decrement the blockSize loop counter */
-    numSamples--;
+    /* Decrement loop counter */
+    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of CmplxByCmplxMult group
+  @} end of CmplxByCmplxMult group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_mult_cmplx_q31.c
 * Description:  Q31 complex-by-complex multiplication
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,286 +29,109 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @addtogroup CmplxByCmplxMult
- * @{
+  @addtogroup CmplxByCmplxMult
+  @{
 */

-
 /**
- * @brief  Q31 complex-by-complex multiplication
- * @param[in]  *pSrcA points to the first input vector
- * @param[in]  *pSrcB points to the second input vector
- * @param[out]  *pDst  points to the output vector
- * @param[in]  numSamples number of complex samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
- * Input down scaling is not required.
+  @brief         Q31 complex-by-complex multiplication.
+  @param[in]     pSrcA       points to first input vector
+  @param[in]     pSrcB       points to second input vector
+  @param[out]    pDst        points to output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
+                   Input down scaling is not required.
 */

 void arm_cmplx_mult_cmplx_q31(
-  q31_t * pSrcA,
-  q31_t * pSrcB,
-  q31_t * pDst,
-  uint32_t numSamples)
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t numSamples)
 {
-  q31_t a, b, c, d;                              /* Temporary variables to store real and imaginary values */
-  uint32_t blkCnt;                               /* loop counters */
-  q31_t mul1, mul2, mul3, mul4;
-  q31_t out1, out2;
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t a, b, c, d;                              /* Temporary variables */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-
-  /* loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
-    /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
-    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
-    a = *pSrcA++;
-    b = *pSrcA++;
-    c = *pSrcB++;
-    d = *pSrcB++;
-
-    mul1 = (q31_t) (((q63_t) a * c) >> 32);
-    mul2 = (q31_t) (((q63_t) b * d) >> 32);
-    mul3 = (q31_t) (((q63_t) a * d) >> 32);
-    mul4 = (q31_t) (((q63_t) b * c) >> 32);
-
-    mul1 = (mul1 >> 1);
-    mul2 = (mul2 >> 1);
-    mul3 = (mul3 >> 1);
-    mul4 = (mul4 >> 1);
-
-    out1 = mul1 - mul2;
-    out2 = mul3 + mul4;
-
-    /* store the real result in 3.29 format in the destination buffer. */
-    *pDst++ = out1;
-    /* store the imag result in 3.29 format in the destination buffer. */
-    *pDst++ = out2;
+    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
+    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
-
-    mul1 = (q31_t) (((q63_t) a * c) >> 32);
-    mul2 = (q31_t) (((q63_t) b * d) >> 32);
-    mul3 = (q31_t) (((q63_t) a * d) >> 32);
-    mul4 = (q31_t) (((q63_t) b * c) >> 32);
-
-    mul1 = (mul1 >> 1);
-    mul2 = (mul2 >> 1);
-    mul3 = (mul3 >> 1);
-    mul4 = (mul4 >> 1);
-
-    out1 = mul1 - mul2;
-    out2 = mul3 + mul4;
-
-    /* store the real result in 3.29 format in the destination buffer. */
-    *pDst++ = out1;
-    /* store the imag result in 3.29 format in the destination buffer. */
-    *pDst++ = out2;
+    /* store result in 3.29 format in destination buffer. */
+    *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
+    *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
-
-    mul1 = (q31_t) (((q63_t) a * c) >> 32);
-    mul2 = (q31_t) (((q63_t) b * d) >> 32);
-    mul3 = (q31_t) (((q63_t) a * d) >> 32);
-    mul4 = (q31_t) (((q63_t) b * c) >> 32);
-
-    mul1 = (mul1 >> 1);
-    mul2 = (mul2 >> 1);
-    mul3 = (mul3 >> 1);
-    mul4 = (mul4 >> 1);
-
-    out1 = mul1 - mul2;
-    out2 = mul3 + mul4;
-
-    /* store the real result in 3.29 format in the destination buffer. */
-    *pDst++ = out1;
-    /* store the imag result in 3.29 format in the destination buffer. */
-    *pDst++ = out2;
+    *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
+    *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
+    *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
+    *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );

-    mul1 = (q31_t) (((q63_t) a * c) >> 32);
-    mul2 = (q31_t) (((q63_t) b * d) >> 32);
-    mul3 = (q31_t) (((q63_t) a * d) >> 32);
-    mul4 = (q31_t) (((q63_t) b * c) >> 32);
+    a = *pSrcA++;
+    b = *pSrcA++;
+    c = *pSrcB++;
+    d = *pSrcB++;
+    *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
+    *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );

-    mul1 = (mul1 >> 1);
-    mul2 = (mul2 >> 1);
-    mul3 = (mul3 >> 1);
-    mul4 = (mul4 >> 1);
-
-    out1 = mul1 - mul2;
-    out2 = mul3 + mul4;
-
-    /* store the real result in 3.29 format in the destination buffer. */
-    *pDst++ = out1;
-    /* store the imag result in 3.29 format in the destination buffer. */
-    *pDst++ = out2;
-
-    /* Decrement the blockSize loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
-    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
-    a = *pSrcA++;
-    b = *pSrcA++;
-    c = *pSrcB++;
-    d = *pSrcB++;
-
-    mul1 = (q31_t) (((q63_t) a * c) >> 32);
-    mul2 = (q31_t) (((q63_t) b * d) >> 32);
-    mul3 = (q31_t) (((q63_t) a * d) >> 32);
-    mul4 = (q31_t) (((q63_t) b * c) >> 32);
-
-    mul1 = (mul1 >> 1);
-    mul2 = (mul2 >> 1);
-    mul3 = (mul3 >> 1);
-    mul4 = (mul4 >> 1);
-
-    out1 = mul1 - mul2;
-    out2 = mul3 + mul4;
-
-    /* store the real result in 3.29 format in the destination buffer. */
-    *pDst++ = out1;
-    /* store the imag result in 3.29 format in the destination buffer. */
-    *pDst++ = out2;
-
-    /* Decrement the blockSize loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
+  /* Initialize blkCnt with number of samples */
+  blkCnt = numSamples;

-  /* loop Unrolling */
-  blkCnt = numSamples >> 1U;
-
-  /* First part of the processing with loop unrolling.  Compute 2 outputs at a time.
-   ** a second loop below computes the remaining 1 sample. */
-  while (blkCnt > 0U)
-  {
-    /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
-    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
-    a = *pSrcA++;
-    b = *pSrcA++;
-    c = *pSrcB++;
-    d = *pSrcB++;
-
-    mul1 = (q31_t) (((q63_t) a * c) >> 32);
-    mul2 = (q31_t) (((q63_t) b * d) >> 32);
-    mul3 = (q31_t) (((q63_t) a * d) >> 32);
-    mul4 = (q31_t) (((q63_t) b * c) >> 32);
-
-    mul1 = (mul1 >> 1);
-    mul2 = (mul2 >> 1);
-    mul3 = (mul3 >> 1);
-    mul4 = (mul4 >> 1);
-
-    out1 = mul1 - mul2;
-    out2 = mul3 + mul4;
-
-    /* store the real result in 3.29 format in the destination buffer. */
-    *pDst++ = out1;
-    /* store the imag result in 3.29 format in the destination buffer. */
-    *pDst++ = out2;
-
-    a = *pSrcA++;
-    b = *pSrcA++;
-    c = *pSrcB++;
-    d = *pSrcB++;
-
-    mul1 = (q31_t) (((q63_t) a * c) >> 32);
-    mul2 = (q31_t) (((q63_t) b * d) >> 32);
-    mul3 = (q31_t) (((q63_t) a * d) >> 32);
-    mul4 = (q31_t) (((q63_t) b * c) >> 32);
-
-    mul1 = (mul1 >> 1);
-    mul2 = (mul2 >> 1);
-    mul3 = (mul3 >> 1);
-    mul4 = (mul4 >> 1);
-
-    out1 = mul1 - mul2;
-    out2 = mul3 + mul4;
-
-    /* store the real result in 3.29 format in the destination buffer. */
-    *pDst++ = out1;
-    /* store the imag result in 3.29 format in the destination buffer. */
-    *pDst++ = out2;
-
-    /* Decrement the blockSize loop counter */
-    blkCnt--;
-  }
-
-  /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
-   ** No loop unrolling is used. */
-  blkCnt = numSamples % 0x2U;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
-    /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
-    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
+    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
+    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
+
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;

-    mul1 = (q31_t) (((q63_t) a * c) >> 32);
-    mul2 = (q31_t) (((q63_t) b * d) >> 32);
-    mul3 = (q31_t) (((q63_t) a * d) >> 32);
-    mul4 = (q31_t) (((q63_t) b * c) >> 32);
+    /* store result in 3.29 format in destination buffer. */
+    *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
+    *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );

-    mul1 = (mul1 >> 1);
-    mul2 = (mul2 >> 1);
-    mul3 = (mul3 >> 1);
-    mul4 = (mul4 >> 1);
-
-    out1 = mul1 - mul2;
-    out2 = mul3 + mul4;
-
-    /* store the real result in 3.29 format in the destination buffer. */
-    *pDst++ = out1;
-    /* store the imag result in 3.29 format in the destination buffer. */
-    *pDst++ = out2;
-
-    /* Decrement the blockSize loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of CmplxByCmplxMult group
+  @} end of CmplxByCmplxMult group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_mult_real_f32.c
 * Description:  Floating-point complex by real multiplication
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,185 +29,141 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @defgroup CmplxByRealMult Complex-by-Real Multiplication
- *
- * Multiplies a complex vector by a real vector and generates a complex result.
- * The data in the complex arrays is stored in an interleaved fashion
- * (real, imag, real, imag, ...).
- * The parameter <code>numSamples</code> represents the number of complex
- * samples processed.  The complex arrays have a total of <code>2*numSamples</code>
- * real values while the real array has a total of <code>numSamples</code>
- * real values.
- *
- * The underlying algorithm is used:
- *
- * <pre>
- * for(n=0; n<numSamples; n++) {
- *     pCmplxDst[(2*n)+0] = pSrcCmplx[(2*n)+0] * pSrcReal[n];
- *     pCmplxDst[(2*n)+1] = pSrcCmplx[(2*n)+1] * pSrcReal[n];
- * }
- * </pre>
- *
- * There are separate functions for floating-point, Q15, and Q31 data types.
+  @defgroup CmplxByRealMult Complex-by-Real Multiplication
+
+  Multiplies a complex vector by a real vector and generates a complex result.
+  The data in the complex arrays is stored in an interleaved fashion
+  (real, imag, real, imag, ...).
+  The parameter <code>numSamples</code> represents the number of complex
+  samples processed.  The complex arrays have a total of <code>2*numSamples</code>
+  real values while the real array has a total of <code>numSamples</code>
+  real values.
+
+  The underlying algorithm is used:
+
+  <pre>
+  for (n = 0; n < numSamples; n++) {
+      pCmplxDst[(2*n)+0] = pSrcCmplx[(2*n)+0] * pSrcReal[n];
+      pCmplxDst[(2*n)+1] = pSrcCmplx[(2*n)+1] * pSrcReal[n];
+  }
+  </pre>
+
+  There are separate functions for floating-point, Q15, and Q31 data types.
 */

 /**
- * @addtogroup CmplxByRealMult
- * @{
+  @addtogroup CmplxByRealMult
+  @{
 */

-
 /**
- * @brief  Floating-point complex-by-real multiplication
- * @param[in]  *pSrcCmplx points to the complex input vector
- * @param[in]  *pSrcReal points to the real input vector
- * @param[out]  *pCmplxDst points to the complex output vector
- * @param[in]  numSamples number of samples in each vector
- * @return none.
+  @brief         Floating-point complex-by-real multiplication.
+  @param[in]     pSrcCmplx   points to complex input vector
+  @param[in]     pSrcReal    points to real input vector
+  @param[out]    pCmplxDst   points to complex output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
 */

 void arm_cmplx_mult_real_f32(
-  float32_t * pSrcCmplx,
-  float32_t * pSrcReal,
-  float32_t * pCmplxDst,
-  uint32_t numSamples)
+  const float32_t * pSrcCmplx,
+  const float32_t * pSrcReal,
+        float32_t * pCmplxDst,
+        uint32_t numSamples)
 {
-  float32_t in;                                  /* Temporary variable to store input value */
-  uint32_t blkCnt;                               /* loop counters */
+        uint32_t blkCnt;                               /* Loop counter */
+        float32_t in;                                  /* Temporary variable */

-#if defined (ARM_MATH_DSP)
+#if defined(ARM_MATH_NEON)
+    float32x4_t r;
+    float32x4x2_t ab,outCplx;

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  float32_t inA1, inA2, inA3, inA4;              /* Temporary variables to hold input data */
-  float32_t inA5, inA6, inA7, inA8;              /* Temporary variables to hold input data */
-  float32_t inB1, inB2, inB3, inB4;              /* Temporary variables to hold input data */
-  float32_t out1, out2, out3, out4;              /* Temporary variables to hold output data */
-  float32_t out5, out6, out7, out8;              /* Temporary variables to hold output data */
+    /* Compute 4 outputs at a time */
+    blkCnt = numSamples >> 2U;

-  /* loop Unrolling */
+    while (blkCnt > 0U)
+    {
+        ab = vld2q_f32(pSrcCmplx);  // load & separate real/imag pSrcA (de-interleave 2)
+        r = vld1q_f32(pSrcReal);  // load & separate real/imag pSrcB
+
+	/* Increment pointers */
+        pSrcCmplx += 8;
+        pSrcReal += 4;
+
+        outCplx.val[0] = vmulq_f32(ab.val[0], r);
+        outCplx.val[1] = vmulq_f32(ab.val[1], r);
+
+        vst2q_f32(pCmplxDst, outCplx);
+        pCmplxDst += 8;
+
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = numSamples & 3;
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
-    /* C[2 * i] = A[2 * i] * B[i].            */
-    /* C[2 * i + 1] = A[2 * i + 1] * B[i].        */
-    /* read input from complex input buffer */
-    inA1 = pSrcCmplx[0];
-    inA2 = pSrcCmplx[1];
-    /* read input from real input buffer */
-    inB1 = pSrcReal[0];
+    /* C[2 * i    ] = A[2 * i    ] * B[i]. */
+    /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */

-    /* read input from complex input buffer */
-    inA3 = pSrcCmplx[2];
+    in = *pSrcReal++;
+    /* store result in destination buffer. */
+    *pCmplxDst++ = *pSrcCmplx++ * in;
+    *pCmplxDst++ = *pSrcCmplx++ * in;

-    /* multiply complex buffer real input with real buffer input */
-    out1 = inA1 * inB1;
+    in = *pSrcReal++;
+    *pCmplxDst++ = *pSrcCmplx++ * in;
+    *pCmplxDst++ = *pSrcCmplx++ * in;

-    /* read input from complex input buffer */
-    inA4 = pSrcCmplx[3];
+    in = *pSrcReal++;
+    *pCmplxDst++ = *pSrcCmplx++ * in;
+    *pCmplxDst++ = *pSrcCmplx++ * in;

-    /* multiply complex buffer imaginary input with real buffer input */
-    out2 = inA2 * inB1;
+    in = *pSrcReal++;
+    *pCmplxDst++ = *pSrcCmplx++* in;
+    *pCmplxDst++ = *pSrcCmplx++ * in;

-    /* read input from real input buffer */
-    inB2 = pSrcReal[1];
-    /* read input from complex input buffer */
-    inA5 = pSrcCmplx[4];
-
-    /* multiply complex buffer real input with real buffer input */
-    out3 = inA3 * inB2;
-
-    /* read input from complex input buffer */
-    inA6 = pSrcCmplx[5];
-    /* read input from real input buffer */
-    inB3 = pSrcReal[2];
-
-    /* multiply complex buffer imaginary input with real buffer input */
-    out4 = inA4 * inB2;
-
-    /* read input from complex input buffer */
-    inA7 = pSrcCmplx[6];
-
-    /* multiply complex buffer real input with real buffer input */
-    out5 = inA5 * inB3;
-
-    /* read input from complex input buffer */
-    inA8 = pSrcCmplx[7];
-
-    /* multiply complex buffer imaginary input with real buffer input */
-    out6 = inA6 * inB3;
-
-    /* read input from real input buffer */
-    inB4 = pSrcReal[3];
-
-    /* store result to destination bufer */
-    pCmplxDst[0] = out1;
-
-    /* multiply complex buffer real input with real buffer input */
-    out7 = inA7 * inB4;
-
-    /* store result to destination bufer */
-    pCmplxDst[1] = out2;
-
-    /* multiply complex buffer imaginary input with real buffer input */
-    out8 = inA8 * inB4;
-
-    /* store result to destination bufer */
-    pCmplxDst[2] = out3;
-    pCmplxDst[3] = out4;
-    pCmplxDst[4] = out5;
-
-    /* incremnet complex input buffer by 8 to process next samples */
-    pSrcCmplx += 8U;
-
-    /* store result to destination bufer */
-    pCmplxDst[5] = out6;
-
-    /* increment real input buffer by 4 to process next samples */
-    pSrcReal += 4U;
-
-    /* store result to destination bufer */
-    pCmplxDst[6] = out7;
-    pCmplxDst[7] = out8;
-
-    /* increment destination buffer by 8 to process next sampels */
-    pCmplxDst += 8U;
-
-    /* Decrement the numSamples loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

 #else

-  /* Run the below code for Cortex-M0 */
+  /* Initialize blkCnt with number of samples */
  blkCnt = numSamples;

-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */

  while (blkCnt > 0U)
  {
-    /* C[2 * i] = A[2 * i] * B[i].            */
-    /* C[2 * i + 1] = A[2 * i + 1] * B[i].        */
-    in = *pSrcReal++;
-    /* store the result in the destination buffer. */
-    *pCmplxDst++ = (*pSrcCmplx++) * (in);
-    *pCmplxDst++ = (*pSrcCmplx++) * (in);
+    /* C[2 * i    ] = A[2 * i    ] * B[i]. */
+    /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */

-    /* Decrement the numSamples loop counter */
+    in = *pSrcReal++;
+    /* store result in destination buffer. */
+    *pCmplxDst++ = *pSrcCmplx++ * in;
+    *pCmplxDst++ = *pSrcCmplx++ * in;
+
+    /* Decrement loop counter */
    blkCnt--;
  }
+
 }

 /**
- * @} end of CmplxByRealMult group
+  @} end of CmplxByRealMult group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_mult_real_q15.c
 * Description:  Q15 complex by real multiplication
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,77 +29,71 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @addtogroup CmplxByRealMult
- * @{
+  @addtogroup CmplxByRealMult
+  @{
 */

-
 /**
- * @brief  Q15 complex-by-real multiplication
- * @param[in]  *pSrcCmplx points to the complex input vector
- * @param[in]  *pSrcReal points to the real input vector
- * @param[out]  *pCmplxDst points to the complex output vector
- * @param[in]  numSamples number of samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+  @brief         Q15 complex-by-real multiplication.
+  @param[in]     pSrcCmplx   points to complex input vector
+  @param[in]     pSrcReal    points to real input vector
+  @param[out]    pCmplxDst   points to complex output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */

 void arm_cmplx_mult_real_q15(
-  q15_t * pSrcCmplx,
-  q15_t * pSrcReal,
-  q15_t * pCmplxDst,
-  uint32_t numSamples)
+  const q15_t * pSrcCmplx,
+  const q15_t * pSrcReal,
+        q15_t * pCmplxDst,
+        uint32_t numSamples)
 {
-  q15_t in;                                      /* Temporary variable to store input value */
+        uint32_t blkCnt;                               /* Loop counter */
+        q15_t in;                                      /* Temporary variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)

 #if defined (ARM_MATH_DSP)
+        q31_t inA1, inA2;                              /* Temporary variables to hold input data */
+        q31_t inB1;                                    /* Temporary variables to hold input data */
+        q15_t out1, out2, out3, out4;                  /* Temporary variables to hold output data */
+        q31_t mul1, mul2, mul3, mul4;                  /* Temporary variables to hold intermediate data */
+#endif

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  uint32_t blkCnt;                               /* loop counters */
-  q31_t inA1, inA2;                              /* Temporary variables to hold input data */
-  q31_t inB1;                                    /* Temporary variables to hold input data */
-  q15_t out1, out2, out3, out4;                  /* Temporary variables to hold output data */
-  q31_t mul1, mul2, mul3, mul4;                  /* Temporary variables to hold intermediate data */
-
-  /* loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
-    /* C[2 * i] = A[2 * i] * B[i].            */
-    /* C[2 * i + 1] = A[2 * i + 1] * B[i].        */
-    /* read complex number both real and imaginary from complex input buffer */
-    inA1 = *__SIMD32(pSrcCmplx)++;
-    /* read two real values at a time from real input buffer */
-    inB1 = *__SIMD32(pSrcReal)++;
-    /* read complex number both real and imaginary from complex input buffer */
-    inA2 = *__SIMD32(pSrcCmplx)++;
+    /* C[2 * i    ] = A[2 * i    ] * B[i]. */
+    /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
+
+#if defined (ARM_MATH_DSP)
+    /* read 2 complex numbers both real and imaginary from complex input buffer */
+    inA1 = read_q15x2_ia ((q15_t **) &pSrcCmplx);
+    inA2 = read_q15x2_ia ((q15_t **) &pSrcCmplx);
+    /* read 2 real values at a time from real input buffer */
+    inB1 = read_q15x2_ia ((q15_t **) &pSrcReal);

    /* multiply complex number with real numbers */
 #ifndef ARM_MATH_BIG_ENDIAN
-
-    mul1 = (q31_t) ((q15_t) (inA1) * (q15_t) (inB1));
+    mul1 = (q31_t) ((q15_t) (inA1)       * (q15_t) (inB1));
    mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1));
-    mul3 = (q31_t) ((q15_t) (inA2) * (q15_t) (inB1 >> 16));
+    mul3 = (q31_t) ((q15_t) (inA2)       * (q15_t) (inB1 >> 16));
    mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB1 >> 16));
-
 #else
-
    mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
-    mul1 = (q31_t) ((q15_t) inA1 * (q15_t) (inB1 >> 16));
+    mul1 = (q31_t) ((q15_t) inA1         * (q15_t) (inB1 >> 16));
    mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) inB1);
-    mul3 = (q31_t) ((q15_t) inA2 * (q15_t) inB1);
-
+    mul3 = (q31_t) ((q15_t) inA2         * (q15_t) inB1);
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* saturate the result */
@ -109,27 +103,23 @@ void arm_cmplx_mult_real_q15(
    out4 = (q15_t) __SSAT(mul4 >> 15U, 16);

    /* pack real and imaginary outputs and store them to destination */
-    *__SIMD32(pCmplxDst)++ = __PKHBT(out1, out2, 16);
-    *__SIMD32(pCmplxDst)++ = __PKHBT(out3, out4, 16);
+    write_q15x2_ia (&pCmplxDst, __PKHBT(out1, out2, 16));
+    write_q15x2_ia (&pCmplxDst, __PKHBT(out3, out4, 16));

-    inA1 = *__SIMD32(pSrcCmplx)++;
-    inB1 = *__SIMD32(pSrcReal)++;
-    inA2 = *__SIMD32(pSrcCmplx)++;
+    inA1 = read_q15x2_ia ((q15_t **) &pSrcCmplx);
+    inA2 = read_q15x2_ia ((q15_t **) &pSrcCmplx);
+    inB1 = read_q15x2_ia ((q15_t **) &pSrcReal);

 #ifndef ARM_MATH_BIG_ENDIAN
-
-    mul1 = (q31_t) ((q15_t) (inA1) * (q15_t) (inB1));
+    mul1 = (q31_t) ((q15_t) (inA1)       * (q15_t) (inB1));
    mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1));
-    mul3 = (q31_t) ((q15_t) (inA2) * (q15_t) (inB1 >> 16));
+    mul3 = (q31_t) ((q15_t) (inA2)       * (q15_t) (inB1 >> 16));
    mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB1 >> 16));
-
 #else
-
    mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
-    mul1 = (q31_t) ((q15_t) inA1 * (q15_t) (inB1 >> 16));
+    mul1 = (q31_t) ((q15_t) inA1         * (q15_t) (inB1 >> 16));
    mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) inB1);
    mul3 = (q31_t) ((q15_t) inA2 * (q15_t) inB1);
-
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    out1 = (q15_t) __SSAT(mul1 >> 15U, 16);
@ -137,55 +127,56 @@ void arm_cmplx_mult_real_q15(
    out3 = (q15_t) __SSAT(mul3 >> 15U, 16);
    out4 = (q15_t) __SSAT(mul4 >> 15U, 16);

-    *__SIMD32(pCmplxDst)++ = __PKHBT(out1, out2, 16);
-    *__SIMD32(pCmplxDst)++ = __PKHBT(out3, out4, 16);
-
-    /* Decrement the numSamples loop counter */
-    blkCnt--;
-  }
-
-  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
-  blkCnt = numSamples % 0x4U;
-
-  while (blkCnt > 0U)
-  {
-    /* C[2 * i] = A[2 * i] * B[i].            */
-    /* C[2 * i + 1] = A[2 * i + 1] * B[i].        */
+    write_q15x2_ia (&pCmplxDst, __PKHBT(out1, out2, 16));
+    write_q15x2_ia (&pCmplxDst, __PKHBT(out3, out4, 16));
+#else
    in = *pSrcReal++;
-    /* store the result in the destination buffer. */
-    *pCmplxDst++ =
-      (q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
-    *pCmplxDst++ =
-      (q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
+    *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+    *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);

-    /* Decrement the numSamples loop counter */
+    in = *pSrcReal++;
+    *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+    *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+
+    in = *pSrcReal++;
+    *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+    *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+
+    in = *pSrcReal++;
+    *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+    *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+#endif
+
+    /* Decrement loop counter */
    blkCnt--;
  }

+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = numSamples % 0x4U;
+
 #else

-  /* Run the below code for Cortex-M0 */
+  /* Initialize blkCnt with number of samples */
+  blkCnt = numSamples;

-  while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
  {
-    /* realOut = realA * realB.            */
-    /* imagOut = imagA * realB.                */
+    /* C[2 * i    ] = A[2 * i    ] * B[i]. */
+    /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
+
    in = *pSrcReal++;
    /* store the result in the destination buffer. */
-    *pCmplxDst++ =
-      (q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
-    *pCmplxDst++ =
-      (q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
+    *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+    *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);

-    /* Decrement the numSamples loop counter */
-    numSamples--;
+    /* Decrement loop counter */
+    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of CmplxByRealMult group
+  @} end of CmplxByRealMult group
 */
--- a/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
+++ b/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_cmplx_mult_real_q31.c
 * Description:  Q31 complex by real multiplication
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,183 +29,120 @@
 #include "arm_math.h"

 /**
- * @ingroup groupCmplxMath
+  @ingroup groupCmplxMath
 */

 /**
- * @addtogroup CmplxByRealMult
- * @{
+  @addtogroup CmplxByRealMult
+  @{
 */

-
 /**
- * @brief  Q31 complex-by-real multiplication
- * @param[in]  *pSrcCmplx points to the complex input vector
- * @param[in]  *pSrcReal points to the real input vector
- * @param[out]  *pCmplxDst points to the complex output vector
- * @param[in]  numSamples number of samples in each vector
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
+  @brief         Q31 complex-by-real multiplication.
+  @param[in]     pSrcCmplx   points to complex input vector
+  @param[in]     pSrcReal    points to real input vector
+  @param[out]    pCmplxDst   points to complex output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
 */

 void arm_cmplx_mult_real_q31(
-  q31_t * pSrcCmplx,
-  q31_t * pSrcReal,
-  q31_t * pCmplxDst,
-  uint32_t numSamples)
+  const q31_t * pSrcCmplx,
+  const q31_t * pSrcReal,
+        q31_t * pCmplxDst,
+        uint32_t numSamples)
 {
-  q31_t inA1;                                    /* Temporary variable to store input value */
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t in;                                      /* Temporary variable */

-#if defined (ARM_MATH_DSP)
+#if defined (ARM_MATH_LOOPUNROLL)

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-  uint32_t blkCnt;                               /* loop counters */
-  q31_t inA2, inA3, inA4;                        /* Temporary variables to hold input data */
-  q31_t inB1, inB2;                              /* Temporary variabels to hold input data */
-  q31_t out1, out2, out3, out4;                  /* Temporary variables to hold output data */
-
-  /* loop Unrolling */
+  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
-    /* C[2 * i] = A[2 * i] * B[i].            */
-    /* C[2 * i + 1] = A[2 * i + 1] * B[i].        */
-    /* read real input from complex input buffer */
-    inA1 = *pSrcCmplx++;
-    inA2 = *pSrcCmplx++;
-    /* read input from real input bufer */
-    inB1 = *pSrcReal++;
-    inB2 = *pSrcReal++;
-    /* read imaginary input from complex input buffer */
-    inA3 = *pSrcCmplx++;
-    inA4 = *pSrcCmplx++;
+    /* C[2 * i    ] = A[2 * i    ] * B[i]. */
+    /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */

-    /* multiply complex input with real input */
-    out1 = ((q63_t) inA1 * inB1) >> 32;
-    out2 = ((q63_t) inA2 * inB1) >> 32;
-    out3 = ((q63_t) inA3 * inB2) >> 32;
-    out4 = ((q63_t) inA4 * inB2) >> 32;
+    in = *pSrcReal++;
+#if defined (ARM_MATH_DSP)
+    /* store saturated result in 1.31 format to destination buffer */
+    *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+    *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+#else
+    /* store result in destination buffer. */
+    *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+    *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+#endif

-    /* sature the result */
-    out1 = __SSAT(out1, 31);
-    out2 = __SSAT(out2, 31);
-    out3 = __SSAT(out3, 31);
-    out4 = __SSAT(out4, 31);
+    in = *pSrcReal++;
+#if defined (ARM_MATH_DSP)
+    *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+    *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+#else
+    *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+    *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+#endif

-    /* get result in 1.31 format */
-    out1 = out1 << 1;
-    out2 = out2 << 1;
-    out3 = out3 << 1;
-    out4 = out4 << 1;
+    in = *pSrcReal++;
+#if defined (ARM_MATH_DSP)
+    *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+    *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+#else
+    *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+    *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+#endif

-    /* store the result to destination buffer */
-    *pCmplxDst++ = out1;
-    *pCmplxDst++ = out2;
-    *pCmplxDst++ = out3;
-    *pCmplxDst++ = out4;
+    in = *pSrcReal++;
+#if defined (ARM_MATH_DSP)
+    *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+    *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+#else
+    *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+    *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+#endif

-    /* read real input from complex input buffer */
-    inA1 = *pSrcCmplx++;
-    inA2 = *pSrcCmplx++;
-    /* read input from real input bufer */
-    inB1 = *pSrcReal++;
-    inB2 = *pSrcReal++;
-    /* read imaginary input from complex input buffer */
-    inA3 = *pSrcCmplx++;
-    inA4 = *pSrcCmplx++;
-
-    /* multiply complex input with real input */
-    out1 = ((q63_t) inA1 * inB1) >> 32;
-    out2 = ((q63_t) inA2 * inB1) >> 32;
-    out3 = ((q63_t) inA3 * inB2) >> 32;
-    out4 = ((q63_t) inA4 * inB2) >> 32;
-
-    /* sature the result */
-    out1 = __SSAT(out1, 31);
-    out2 = __SSAT(out2, 31);
-    out3 = __SSAT(out3, 31);
-    out4 = __SSAT(out4, 31);
-
-    /* get result in 1.31 format */
-    out1 = out1 << 1;
-    out2 = out2 << 1;
-    out3 = out3 << 1;
-    out4 = out4 << 1;
-
-    /* store the result to destination buffer */
-    *pCmplxDst++ = out1;
-    *pCmplxDst++ = out2;
-    *pCmplxDst++ = out3;
-    *pCmplxDst++ = out4;
-
-    /* Decrement the numSamples loop counter */
+    /* Decrement loop counter */
    blkCnt--;
  }

-  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
+  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

-  while (blkCnt > 0U)
-  {
-    /* C[2 * i] = A[2 * i] * B[i].            */
-    /* C[2 * i + 1] = A[2 * i + 1] * B[i].        */
-    /* read real input from complex input buffer */
-    inA1 = *pSrcCmplx++;
-    inA2 = *pSrcCmplx++;
-    /* read input from real input bufer */
-    inB1 = *pSrcReal++;
-
-    /* multiply complex input with real input */
-    out1 = ((q63_t) inA1 * inB1) >> 32;
-    out2 = ((q63_t) inA2 * inB1) >> 32;
-
-    /* sature the result */
-    out1 = __SSAT(out1, 31);
-    out2 = __SSAT(out2, 31);
-
-    /* get result in 1.31 format */
-    out1 = out1 << 1;
-    out2 = out2 << 1;
-
-    /* store the result to destination buffer */
-    *pCmplxDst++ = out1;
-    *pCmplxDst++ = out2;
-
-    /* Decrement the numSamples loop counter */
-    blkCnt--;
-  }
-
 #else

-  /* Run the below code for Cortex-M0 */
+  /* Initialize blkCnt with number of samples */
+  blkCnt = numSamples;

-  while (numSamples > 0U)
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
  {
-    /* realOut = realA * realB.            */
-    /* imagReal = imagA * realB.               */
-    inA1 = *pSrcReal++;
-    /* store the result in the destination buffer. */
-    *pCmplxDst++ =
-      (q31_t) clip_q63_to_q31(((q63_t) * pSrcCmplx++ * inA1) >> 31);
-    *pCmplxDst++ =
-      (q31_t) clip_q63_to_q31(((q63_t) * pSrcCmplx++ * inA1) >> 31);
+    /* C[2 * i    ] = A[2 * i    ] * B[i]. */
+    /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */

-    /* Decrement the numSamples loop counter */
-    numSamples--;
+    in = *pSrcReal++;
+#if defined (ARM_MATH_DSP)
+    /* store saturated result in 1.31 format to destination buffer */
+    *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+    *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+#else
+    /* store result in destination buffer. */
+    *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+    *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of CmplxByRealMult group
+  @} end of CmplxByRealMult group
 */
--- a/DSP/Source/ControllerFunctions/CMakeLists.txt
+++ b/DSP/Source/ControllerFunctions/CMakeLists.txt
@ -0,0 +1,37 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPController)
+
+add_library(CMSISDSPController STATIC)
+
+configdsp(CMSISDSPController ..)
+
+include(interpol)
+interpol(CMSISDSPController)
+
+if (CONFIGTABLE AND ALLFAST)
+    target_compile_definitions(CMSISDSPController PUBLIC ARM_ALL_FAST_TABLES)  
+endif()
+
+target_sources(CMSISDSPController PRIVATE arm_pid_init_f32.c)
+target_sources(CMSISDSPController PRIVATE arm_pid_init_q15.c)
+target_sources(CMSISDSPController PRIVATE arm_pid_init_q31.c)
+target_sources(CMSISDSPController PRIVATE arm_pid_reset_f32.c)
+target_sources(CMSISDSPController PRIVATE arm_pid_reset_q15.c)
+target_sources(CMSISDSPController PRIVATE arm_pid_reset_q31.c)
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_COS_F32)
+target_sources(CMSISDSPController PRIVATE arm_sin_cos_f32.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_COS_Q31)
+target_sources(CMSISDSPController PRIVATE arm_sin_cos_q31.c)
+endif()
+
+
+
+### Includes
+target_include_directories(CMSISDSPController PUBLIC "${DSP}/../../Include")
+
+
+
--- a/DSP/Source/ControllerFunctions/ControllerFunctions.c
+++ b/DSP/Source/ControllerFunctions/ControllerFunctions.c
@ -0,0 +1,37 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        ControllerFunctions.c
+ * Description:  Combination of all controller function source files.
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_pid_init_f32.c"
+#include "arm_pid_init_q15.c"
+#include "arm_pid_init_q31.c"
+#include "arm_pid_reset_f32.c"
+#include "arm_pid_reset_q15.c"
+#include "arm_pid_reset_q31.c"
+#include "arm_sin_cos_f32.c"
+#include "arm_sin_cos_q31.c"
+
--- a/DSP/Source/ControllerFunctions/arm_pid_init_f32.c
+++ b/DSP/Source/ControllerFunctions/arm_pid_init_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_pid_init_f32.c
 * Description:  Floating-point PID Control initialization function
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -28,29 +28,30 @@

 #include "arm_math.h"

- /**
- * @addtogroup PID
- * @{
+/**
+  @addtogroup PID
+  @{
 */

 /**
- * @brief  Initialization function for the floating-point PID Control.
- * @param[in,out] *S points to an instance of the PID structure.
- * @param[in]     resetStateFlag  flag to reset the state. 0 = no change in state & 1 = reset the state.
- * @return none.
- * \par Description:
- * \par
- * The <code>resetStateFlag</code> specifies whether to set state to zero or not. \n
- * The function computes the structure fields: <code>A0</code>, <code>A1</code> <code>A2</code>
- * using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
- * also sets the state variables to all zeros.
+  @brief         Initialization function for the floating-point PID Control.
+  @param[in,out] S               points to an instance of the PID structure
+  @param[in]     resetStateFlag
+                   - value = 0: no change in state
+                   - value = 1: reset state
+  @return        none
+
+  @par           Details
+                   The <code>resetStateFlag</code> specifies whether to set state to zero or not. \n
+                   The function computes the structure fields: <code>A0</code>, <code>A1</code> <code>A2</code>
+                   using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
+                   also sets the state variables to all zeros.
 */

 void arm_pid_init_f32(
  arm_pid_instance_f32 * S,
  int32_t resetStateFlag)
 {
-
  /* Derived coefficient A0 */
  S->A0 = S->Kp + S->Ki + S->Kd;

@ -63,12 +64,12 @@ void arm_pid_init_f32(
  /* Check whether state needs reset or not */
  if (resetStateFlag)
  {
-    /* Clear the state buffer.  The size will be always 3 samples */
+    /* Reset state to zero, The size will be always 3 samples */
    memset(S->state, 0, 3U * sizeof(float32_t));
  }

 }

 /**
- * @} end of PID group
+  @} end of PID group
 */
--- a/DSP/Source/ControllerFunctions/arm_pid_init_q15.c
+++ b/DSP/Source/ControllerFunctions/arm_pid_init_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_pid_init_q15.c
 * Description:  Q15 PID Control initialization function
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -28,22 +28,24 @@

 #include "arm_math.h"

- /**
- * @addtogroup PID
- * @{
+/**
+  @addtogroup PID
+  @{
 */

 /**
- * @details
- * @param[in,out] *S points to an instance of the Q15 PID structure.
- * @param[in]     resetStateFlag  flag to reset the state. 0 = no change in state 1 = reset the state.
- * @return none.
- * \par Description:
- * \par
- * The <code>resetStateFlag</code> specifies whether to set state to zero or not. \n
- * The function computes the structure fields: <code>A0</code>, <code>A1</code> <code>A2</code>
- * using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
- * also sets the state variables to all zeros.
+  @brief         Initialization function for the Q15 PID Control.
+  @param[in,out] S               points to an instance of the Q15 PID structure
+  @param[in]     resetStateFlag
+                   - value = 0: no change in state
+                   - value = 1: reset state
+  @return        none
+
+  @par           Details
+                   The <code>resetStateFlag</code> specifies whether to set state to zero or not. \n
+                   The function computes the structure fields: <code>A0</code>, <code>A1</code> <code>A2</code>
+                   using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
+                   also sets the state variables to all zeros.
 */

 void arm_pid_init_q15(
@ -53,35 +55,20 @@ void arm_pid_init_q15(

 #if defined (ARM_MATH_DSP)

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-
  /* Derived coefficient A0 */
  S->A0 = __QADD16(__QADD16(S->Kp, S->Ki), S->Kd);

  /* Derived coefficients and pack into A1 */

 #ifndef  ARM_MATH_BIG_ENDIAN
-
  S->A1 = __PKHBT(-__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), S->Kd, 16);
-
 #else
-
  S->A1 = __PKHBT(S->Kd, -__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), 16);
-
-#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
-
-  /* Check whether state needs reset or not */
-  if (resetStateFlag)
-  {
-    /* Clear the state buffer.  The size will be always 3 samples */
-    memset(S->state, 0, 3U * sizeof(q15_t));
-  }
+#endif

 #else

-  /* Run the below code for Cortex-M0 */
-
-  q31_t temp;                                    /*to store the sum */
+  q31_t temp;                                    /* to store the sum */

  /* Derived coefficient A0 */
  temp = S->Kp + S->Ki + S->Kd;
@ -92,19 +79,17 @@ void arm_pid_init_q15(
  S->A1 = (q15_t) __SSAT(temp, 16);
  S->A2 = S->Kd;

-
+#endif /* #if defined (ARM_MATH_DSP) */

  /* Check whether state needs reset or not */
  if (resetStateFlag)
  {
-    /* Clear the state buffer.  The size will be always 3 samples */
+    /* Reset state to zero, The size will be always 3 samples */
    memset(S->state, 0, 3U * sizeof(q15_t));
  }

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

 /**
- * @} end of PID group
+  @} end of PID group
 */
--- a/DSP/Source/ControllerFunctions/arm_pid_init_q31.c
+++ b/DSP/Source/ControllerFunctions/arm_pid_init_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_pid_init_q31.c
 * Description:  Q31 PID Control initialization function
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -28,22 +28,24 @@

 #include "arm_math.h"

- /**
- * @addtogroup PID
- * @{
+/**
+  @addtogroup PID
+  @{
 */

 /**
- * @brief  Initialization function for the Q31 PID Control.
- * @param[in,out] *S points to an instance of the Q31 PID structure.
- * @param[in]     resetStateFlag  flag to reset the state. 0 = no change in state 1 = reset the state.
- * @return none.
- * \par Description:
- * \par
- * The <code>resetStateFlag</code> specifies whether to set state to zero or not. \n
- * The function computes the structure fields: <code>A0</code>, <code>A1</code> <code>A2</code>
- * using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
- * also sets the state variables to all zeros.
+  @brief         Initialization function for the Q31 PID Control.
+  @param[in,out] S               points to an instance of the Q31 PID structure
+  @param[in]     resetStateFlag
+                   - value = 0: no change in state
+                   - value = 1: reset state
+  @return        none
+
+  @par           Details
+                   The <code>resetStateFlag</code> specifies whether to set state to zero or not. \n
+                   The function computes the structure fields: <code>A0</code>, <code>A1</code> <code>A2</code>
+                   using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd)
+                   also sets the state variables to all zeros.
 */

 void arm_pid_init_q31(
@ -53,20 +55,15 @@ void arm_pid_init_q31(

 #if defined (ARM_MATH_DSP)

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-
  /* Derived coefficient A0 */
  S->A0 = __QADD(__QADD(S->Kp, S->Ki), S->Kd);

  /* Derived coefficient A1 */
  S->A1 = -__QADD(__QADD(S->Kd, S->Kd), S->Kp);

-
 #else

-  /* Run the below code for Cortex-M0 */
-
-  q31_t temp;
+  q31_t temp;                                    /* to store the sum */

  /* Derived coefficient A0 */
  temp = clip_q63_to_q31((q63_t) S->Kp + S->Ki);
@ -84,12 +81,12 @@ void arm_pid_init_q31(
  /* Check whether state needs reset or not */
  if (resetStateFlag)
  {
-    /* Clear the state buffer.  The size will be always 3 samples */
+    /* Reset state to zero, The size will be always 3 samples */
    memset(S->state, 0, 3U * sizeof(q31_t));
  }

 }

 /**
- * @} end of PID group
+  @} end of PID group
 */
--- a/DSP/Source/ControllerFunctions/arm_pid_reset_f32.c
+++ b/DSP/Source/ControllerFunctions/arm_pid_reset_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_pid_reset_f32.c
 * Description:  Floating-point PID Control reset function
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -28,26 +28,27 @@

 #include "arm_math.h"

- /**
- * @addtogroup PID
- * @{
+/**
+  @addtogroup PID
+  @{
 */

 /**
-* @brief  Reset function for the floating-point PID Control.
-* @param[in] *S	Instance pointer of PID control data structure.
-* @return none.
-* \par Description:
-* The function resets the state buffer to zeros.
-*/
+  @brief         Reset function for the floating-point PID Control.
+  @param[in,out] S  points to an instance of the floating-point PID structure
+  @return        none
+
+  @par           Details
+                   The function resets the state buffer to zeros.
+ */
+
 void arm_pid_reset_f32(
  arm_pid_instance_f32 * S)
 {
-
-  /* Clear the state buffer.  The size will be always 3 samples */
+  /* Reset state to zero, The size will be always 3 samples */
  memset(S->state, 0, 3U * sizeof(float32_t));
 }

 /**
- * @} end of PID group
+  @} end of PID group
 */
--- a/DSP/Source/ControllerFunctions/arm_pid_reset_q15.c
+++ b/DSP/Source/ControllerFunctions/arm_pid_reset_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_pid_reset_q15.c
 * Description:  Q15 PID Control reset function
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -28,18 +28,20 @@

 #include "arm_math.h"

- /**
- * @addtogroup PID
- * @{
+/**
+  @addtogroup PID
+  @{
 */

 /**
-* @brief  Reset function for the Q15 PID Control.
-* @param[in] *S		Instance pointer of PID control data structure.
-* @return none.
-* \par Description:
-* The function resets the state buffer to zeros.
-*/
+  @brief         Reset function for the Q15 PID Control.
+  @param[in,out] S  points to an instance of the Q15 PID structure
+  @return        none
+
+  @par           Details
+                   The function resets the state buffer to zeros.
+ */
+
 void arm_pid_reset_q15(
  arm_pid_instance_q15 * S)
 {
@ -48,5 +50,5 @@ void arm_pid_reset_q15(
 }

 /**
- * @} end of PID group
+  @} end of PID group
 */
--- a/DSP/Source/ControllerFunctions/arm_pid_reset_q31.c
+++ b/DSP/Source/ControllerFunctions/arm_pid_reset_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_pid_reset_q31.c
 * Description:  Q31 PID Control reset function
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -28,26 +28,27 @@

 #include "arm_math.h"

- /**
- * @addtogroup PID
- * @{
+/**
+  @addtogroup PID
+  @{
 */

 /**
-* @brief  Reset function for the Q31 PID Control.
-* @param[in] *S	Instance pointer of PID control data structure.
-* @return none.
-* \par Description:
-* The function resets the state buffer to zeros.
-*/
+  @brief         Reset function for the Q31 PID Control.
+  @param[in,out] S  points to an instance of the Q31 PID structure
+  @return        none
+
+  @par           Details
+                   The function resets the state buffer to zeros.
+ */
+
 void arm_pid_reset_q31(
  arm_pid_instance_q31 * S)
 {
-
-  /* Clear the state buffer.  The size will be always 3 samples */
+  /* Reset state to zero, The size will be always 3 samples */
  memset(S->state, 0, 3U * sizeof(q31_t));
 }

 /**
- * @} end of PID group
+  @} end of PID group
 */
--- a/DSP/Source/ControllerFunctions/arm_sin_cos_f32.c
+++ b/DSP/Source/ControllerFunctions/arm_sin_cos_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_sin_cos_f32.c
 * Description:  Sine and Cosine calculation for floating-point values
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -30,115 +30,117 @@
 #include "arm_common_tables.h"

 /**
- * @ingroup groupController
+  @ingroup groupController
 */

 /**
- * @defgroup SinCos Sine Cosine
- *
- * Computes the trigonometric sine and cosine values using a combination of table lookup
- * and linear interpolation.
- * There are separate functions for Q31 and floating-point data types.
- * The input to the floating-point version is in degrees while the
- * fixed-point Q31 have a scaled input with the range
- * [-1 0.9999] mapping to [-180 +180] degrees.
- *
- * The floating point function also allows values that are out of the usual range. When this happens, the function will
- * take extra time to adjust the input value to the range of [-180 180].
- *
- * The result is accurate to 5 digits after the decimal point.
- *
- * The implementation is based on table lookup using 360 values together with linear interpolation.
- * The steps used are:
- *  -# Calculation of the nearest integer table index.
- *  -# Compute the fractional portion (fract) of the input.
- *  -# Fetch the value corresponding to \c index from sine table to \c y0 and also value from \c index+1 to \c y1.
- *  -# Sine value is computed as <code> *psinVal = y0 + (fract * (y1 - y0))</code>.
- *  -# Fetch the value corresponding to \c index from cosine table to \c y0 and also value from \c index+1 to \c y1.
- *  -# Cosine value is computed as <code> *pcosVal = y0 + (fract * (y1 - y0))</code>.
- */
+  @defgroup SinCos Sine Cosine

- /**
- * @addtogroup SinCos
- * @{
+  Computes the trigonometric sine and cosine values using a combination of table lookup
+  and linear interpolation.
+  There are separate functions for Q31 and floating-point data types.
+  The input to the floating-point version is in degrees while the
+  fixed-point Q31 have a scaled input with the range
+  [-1 0.9999] mapping to [-180 +180] degrees.
+
+  The floating point function also allows values that are out of the usual range. When this happens, the function will
+  take extra time to adjust the input value to the range of [-180 180].
+
+  The result is accurate to 5 digits after the decimal point.
+
+  The implementation is based on table lookup using 360 values together with linear interpolation.
+  The steps used are:
+   -# Calculation of the nearest integer table index.
+   -# Compute the fractional portion (fract) of the input.
+   -# Fetch the value corresponding to \c index from sine table to \c y0 and also value from \c index+1 to \c y1.
+   -# Sine value is computed as <code> *psinVal = y0 + (fract * (y1 - y0))</code>.
+   -# Fetch the value corresponding to \c index from cosine table to \c y0 and also value from \c index+1 to \c y1.
+   -# Cosine value is computed as <code> *pcosVal = y0 + (fract * (y1 - y0))</code>.
 */

 /**
- * @brief  Floating-point sin_cos function.
- * @param[in]  theta    input value in degrees
- * @param[out] *pSinVal points to the processed sine output.
- * @param[out] *pCosVal points to the processed cos output.
- * @return none.
+  @addtogroup SinCos
+  @{
+ */
+
+/**
+  @brief         Floating-point sin_cos function.
+  @param[in]     theta    input value in degrees
+  @param[out]    pSinVal  points to processed sine output
+  @param[out]    pCosVal  points to processed cosine output
+  @return        none
 */

 void arm_sin_cos_f32(
-                      float32_t theta,
-                      float32_t * pSinVal,
-                      float32_t * pCosVal)
+  float32_t theta,
+  float32_t * pSinVal,
+  float32_t * pCosVal)
 {
-    float32_t fract, in;                             /* Temporary variables for input, output */
-    uint16_t indexS, indexC;                         /* Index variable */
-    float32_t f1, f2, d1, d2;                        /* Two nearest output values */
-    float32_t findex, Dn, Df, temp;
+  float32_t fract, in;                             /* Temporary input, output variables */
+  uint16_t indexS, indexC;                         /* Index variable */
+  float32_t f1, f2, d1, d2;                        /* Two nearest output values */
+  float32_t Dn, Df;
+  float32_t temp, findex;

-    /* input x is in degrees */
-    /* Scale the input, divide input by 360, for cosine add 0.25 (pi/2) to read sine table */
-    in = theta * 0.00277777777778f;
+  /* input x is in degrees */
+  /* Scale input, divide input by 360, for cosine add 0.25 (pi/2) to read sine table */
+  in = theta * 0.00277777777778f;

-    if (in < 0.0f)
-    {
-        in = -in;
-    }
+  if (in < 0.0f)
+  {
+    in = -in;
+  }

-    in = in - (int32_t)in;
+  in = in - (int32_t)in;

-    /* Calculation of index of the table */
-    findex = (float32_t) FAST_MATH_TABLE_SIZE * in;
-    indexS = ((uint16_t)findex) & 0x1ff;
-    indexC = (indexS + (FAST_MATH_TABLE_SIZE / 4)) & 0x1ff;
+  /* Calculate the nearest index */
+  findex = (float32_t)FAST_MATH_TABLE_SIZE * in;
+  indexS = ((uint16_t)findex) & 0x1ff;
+  indexC = (indexS + (FAST_MATH_TABLE_SIZE / 4)) & 0x1ff;

-    /* fractional value calculation */
-    fract = findex - (float32_t) indexS;
+  /* Calculation of fractional value */
+  fract = findex - (float32_t) indexS;

-    /* Read two nearest values of input value from the cos & sin tables */
-    f1 = sinTable_f32[indexC+0];
-    f2 = sinTable_f32[indexC+1];
-    d1 = -sinTable_f32[indexS+0];
-    d2 = -sinTable_f32[indexS+1];
+  /* Read two nearest values of input value from the cos & sin tables */
+  f1 =  sinTable_f32[indexC  ];
+  f2 =  sinTable_f32[indexC+1];
+  d1 = -sinTable_f32[indexS  ];
+  d2 = -sinTable_f32[indexS+1];

-    temp = (1.0f - fract) * f1 + fract * f2;
+  temp = (1.0f - fract) * f1 + fract * f2;

-    Dn = 0.0122718463030f; // delta between the two points (fixed), in this case 2*pi/FAST_MATH_TABLE_SIZE
-    Df = f2 - f1;          // delta between the values of the functions
+  Dn = 0.0122718463030f; /* delta between the two points (fixed), in this case 2*pi/FAST_MATH_TABLE_SIZE */
+  Df = f2 - f1;          /* delta between the values of the functions */

-    temp = Dn *(d1 + d2) - 2 * Df;
-    temp = fract * temp + (3 * Df - (d2 + 2 * d1) * Dn);
-    temp = fract * temp + d1 * Dn;
+  temp = Dn * (d1 + d2) - 2 * Df;
+  temp = fract * temp + (3 * Df - (d2 + 2 * d1) * Dn);
+  temp = fract * temp + d1 * Dn;

-    /* Calculation of cosine value */
-    *pCosVal = fract * temp + f1;
+  /* Calculation of cosine value */
+  *pCosVal = fract * temp + f1;

-    /* Read two nearest values of input value from the cos & sin tables */
-    f1 = sinTable_f32[indexS+0];
-    f2 = sinTable_f32[indexS+1];
-    d1 = sinTable_f32[indexC+0];
-    d2 = sinTable_f32[indexC+1];
+  /* Read two nearest values of input value from the cos & sin tables */
+  f1 = sinTable_f32[indexS  ];
+  f2 = sinTable_f32[indexS+1];
+  d1 = sinTable_f32[indexC  ];
+  d2 = sinTable_f32[indexC+1];

-    temp = (1.0f - fract) * f1 + fract * f2;
+  temp = (1.0f - fract) * f1 + fract * f2;

-    Df = f2 - f1; // delta between the values of the functions
-    temp = Dn*(d1 + d2) - 2*Df;
-    temp = fract*temp + (3*Df - (d2 + 2*d1)*Dn);
-    temp = fract*temp + d1*Dn;
+  Df = f2 - f1; // delta between the values of the functions
+  temp = Dn * (d1 + d2) - 2 * Df;
+  temp = fract * temp + (3 * Df - (d2 + 2 * d1) * Dn);
+  temp = fract * temp + d1 * Dn;

-    /* Calculation of sine value */
-    *pSinVal = fract*temp + f1;
+  /* Calculation of sine value */
+  *pSinVal = fract * temp + f1;

-    if (theta < 0.0f)
-    {
-        *pSinVal = -*pSinVal;
-    }
+  if (theta < 0.0f)
+  {
+    *pSinVal = -*pSinVal;
+  }
 }
+
 /**
- * @} end of SinCos group
+  @} end of SinCos group
 */
--- a/DSP/Source/ControllerFunctions/arm_sin_cos_q31.c
+++ b/DSP/Source/ControllerFunctions/arm_sin_cos_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_sin_cos_q31.c
 * Description:  Cosine & Sine calculation for Q31 values
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -30,23 +30,22 @@
 #include "arm_common_tables.h"

 /**
- * @ingroup groupController
- */
-
- /**
- * @addtogroup SinCos
- * @{
+  @ingroup groupController
 */

 /**
- * @brief  Q31 sin_cos function.
- * @param[in]  theta    scaled input value in degrees
- * @param[out] *pSinVal points to the processed sine output.
- * @param[out] *pCosVal points to the processed cosine output.
- * @return none.
- *
- * The Q31 input value is in the range [-1 0.999999] and is mapped to a degree value in the range [-180 179].
- *
+  @addtogroup SinCos
+  @{
+ */
+
+/**
+  @brief         Q31 sin_cos function.
+  @param[in]     theta    scaled input value in degrees
+  @param[out]    pSinVal  points to processed sine output
+  @param[out]    pCosVal  points to processed cosine output
+  @return        none
+
+  The Q31 input value is in the range [-1 0.999999] and is mapped to a degree value in the range [-180 179].
 */

 void arm_sin_cos_q31(
@ -54,9 +53,9 @@ void arm_sin_cos_q31(
  q31_t * pSinVal,
  q31_t * pCosVal)
 {
-  q31_t fract;                                 /* Temporary variables for input, output */
-  uint16_t indexS, indexC;                     /* Index variable */
-  q31_t f1, f2, d1, d2;                        /* Two nearest output values */
+  q31_t fract;                                     /* Temporary input, output variables */
+  uint16_t indexS, indexC;                         /* Index variable */
+  q31_t f1, f2, d1, d2;                            /* Two nearest output values */
  q31_t Dn, Df;
  q63_t temp;

@ -68,43 +67,44 @@ void arm_sin_cos_q31(
  fract = (theta - (indexS << CONTROLLER_Q31_SHIFT)) << 8;

  /* Read two nearest values of input value from the cos & sin tables */
-  f1 = sinTable_q31[indexC+0];
-  f2 = sinTable_q31[indexC+1];
-  d1 = -sinTable_q31[indexS+0];
+  f1 =  sinTable_q31[indexC  ];
+  f2 =  sinTable_q31[indexC+1];
+  d1 = -sinTable_q31[indexS  ];
  d2 = -sinTable_q31[indexS+1];

-  Dn = 0x1921FB5; // delta between the two points (fixed), in this case 2*pi/FAST_MATH_TABLE_SIZE
-  Df = f2 - f1; // delta between the values of the functions
-  temp = Dn*((q63_t)d1 + d2);
+  Dn = 0x1921FB5; /* delta between the two points (fixed), in this case 2*pi/FAST_MATH_TABLE_SIZE */
+  Df = f2 - f1;   /* delta between the values of the functions */
+
+  temp = Dn * ((q63_t)d1 + d2);
  temp = temp - ((q63_t)Df << 32);
-  temp = (q63_t)fract*(temp >> 31);
-  temp = temp + ((3*(q63_t)Df << 31) - (d2 + ((q63_t)d1 << 1))*Dn);
-  temp = (q63_t)fract*(temp >> 31);
-  temp = temp + (q63_t)d1*Dn;
-  temp = (q63_t)fract*(temp >> 31);
+  temp = (q63_t)fract * (temp >> 31);
+  temp = temp + ((3 * (q63_t)Df << 31) - (d2 + ((q63_t)d1 << 1)) * Dn);
+  temp = (q63_t)fract * (temp >> 31);
+  temp = temp + (q63_t)d1 * Dn;
+  temp = (q63_t)fract * (temp >> 31);

  /* Calculation of cosine value */
  *pCosVal = clip_q63_to_q31((temp >> 31) + (q63_t)f1);

  /* Read two nearest values of input value from the cos & sin tables */
-  f1 = sinTable_q31[indexS+0];
+  f1 = sinTable_q31[indexS ];
  f2 = sinTable_q31[indexS+1];
-  d1 = sinTable_q31[indexC+0];
+  d1 = sinTable_q31[indexC ];
  d2 = sinTable_q31[indexC+1];

  Df = f2 - f1; // delta between the values of the functions
-  temp = Dn*((q63_t)d1 + d2);
+  temp = Dn * ((q63_t)d1 + d2);
  temp = temp - ((q63_t)Df << 32);
-  temp = (q63_t)fract*(temp >> 31);
-  temp = temp + ((3*(q63_t)Df << 31) - (d2 + ((q63_t)d1 << 1))*Dn);
-  temp = (q63_t)fract*(temp >> 31);
-  temp = temp + (q63_t)d1*Dn;
-  temp = (q63_t)fract*(temp >> 31);
+  temp = (q63_t)fract * (temp >> 31);
+  temp = temp + ((3 * (q63_t)Df << 31) - (d2 + ((q63_t)d1 << 1)) * Dn);
+  temp = (q63_t)fract * (temp >> 31);
+  temp = temp + (q63_t)d1 * Dn;
+  temp = (q63_t)fract * (temp >> 31);

  /* Calculation of sine value */
  *pSinVal = clip_q63_to_q31((temp >> 31) + (q63_t)f1);
 }

 /**
- * @} end of SinCos group
+  @} end of SinCos group
 */
--- a/DSP/Source/FastMathFunctions/CMakeLists.txt
+++ b/DSP/Source/FastMathFunctions/CMakeLists.txt
@ -0,0 +1,51 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPFastMath)
+
+
+file(GLOB SRC "./*_*.c")
+
+add_library(CMSISDSPFastMath STATIC)
+
+include(interpol)
+interpol(CMSISDSPFastMath)
+
+if (CONFIGTABLE AND ALLFAST)
+    target_compile_definitions(CMSISDSPFastMath PUBLIC ARM_ALL_FAST_TABLES)  
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_COS_F32)
+target_sources(CMSISDSPFastMath PRIVATE arm_cos_f32.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_COS_Q15)
+target_sources(CMSISDSPFastMath PRIVATE arm_cos_q15.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_COS_Q31)
+target_sources(CMSISDSPFastMath PRIVATE arm_cos_q31.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_F32)
+target_sources(CMSISDSPFastMath PRIVATE arm_sin_f32.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_Q15)
+target_sources(CMSISDSPFastMath PRIVATE arm_sin_q15.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_Q31)
+target_sources(CMSISDSPFastMath PRIVATE arm_sin_q31.c)
+endif()
+
+target_sources(CMSISDSPFastMath PRIVATE arm_sqrt_q15.c)
+target_sources(CMSISDSPFastMath PRIVATE arm_sqrt_q31.c)
+
+
+configdsp(CMSISDSPFastMath ..)
+
+### Includes
+target_include_directories(CMSISDSPFastMath PUBLIC "${DSP}/../../Include")
+
+
+
--- a/DSP/Source/FastMathFunctions/FastMathFunctions.c
+++ b/DSP/Source/FastMathFunctions/FastMathFunctions.c
@ -0,0 +1,37 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        FastMathFunctions.c
+ * Description:  Combination of all fast math function source files.
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_cos_f32.c"
+#include "arm_cos_q15.c"
+#include "arm_cos_q31.c"
+#include "arm_sin_f32.c"
+#include "arm_sin_q15.c"
+#include "arm_sin_q31.c"
+#include "arm_sqrt_q15.c"
+#include "arm_sqrt_q31.c"
+
--- a/DSP/Source/FastMathFunctions/arm_cos_f32.c
+++ b/DSP/Source/FastMathFunctions/arm_cos_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_cos_f32.c
 * Description:  Fast cosine calculation for floating-point values
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -28,56 +28,57 @@

 #include "arm_math.h"
 #include "arm_common_tables.h"
+
 /**
- * @ingroup groupFastMath
+  @ingroup groupFastMath
 */

 /**
- * @defgroup cos Cosine
- *
- * Computes the trigonometric cosine function using a combination of table lookup
- * and linear interpolation.  There are separate functions for
- * Q15, Q31, and floating-point data types.
- * The input to the floating-point version is in radians and in the range [0 2*pi) while the
- * fixed-point Q15 and Q31 have a scaled input with the range
- * [0 +0.9999] mapping to [0 2*pi).  The fixed-point range is chosen so that a
- * value of 2*pi wraps around to 0.
- *
- * The implementation is based on table lookup using 256 values together with linear interpolation.
- * The steps used are:
- *  -# Calculation of the nearest integer table index
- *  -# Compute the fractional portion (fract) of the table index.
- *  -# The final result equals <code>(1.0f-fract)*a + fract*b;</code>
- *
- * where
- * <pre>
- *    b=Table[index+0];
- *    c=Table[index+1];
- * </pre>
- */
+  @defgroup cos Cosine

- /**
- * @addtogroup cos
- * @{
+  Computes the trigonometric cosine function using a combination of table lookup
+  and linear interpolation.  There are separate functions for
+  Q15, Q31, and floating-point data types.
+  The input to the floating-point version is in radians while the
+  fixed-point Q15 and Q31 have a scaled input with the range
+  [0 +0.9999] mapping to [0 2*pi).  The fixed-point range is chosen so that a
+  value of 2*pi wraps around to 0.
+
+  The implementation is based on table lookup using 256 values together with linear interpolation.
+  The steps used are:
+   -# Calculation of the nearest integer table index
+   -# Compute the fractional portion (fract) of the table index.
+   -# The final result equals <code>(1.0f-fract)*a + fract*b;</code>
+
+  where
+  <pre>
+     b = Table[index];
+     c = Table[index+1];
+  </pre>
 */

 /**
- * @brief  Fast approximation to the trigonometric cosine function for floating-point data.
- * @param[in] x input value in radians.
- * @return cos(x).
+  @addtogroup cos
+  @{
+ */
+
+/**
+  @brief         Fast approximation to the trigonometric cosine function for floating-point data.
+  @param[in]     x  input value in radians
+  @return        cos(x)
 */

 float32_t arm_cos_f32(
  float32_t x)
 {
-  float32_t cosVal, fract, in;                   /* Temporary variables for input, output */
+  float32_t cosVal, fract, in;                   /* Temporary input, output variables */
  uint16_t index;                                /* Index variable */
  float32_t a, b;                                /* Two nearest output values */
  int32_t n;
  float32_t findex;

  /* input x is in radians */
-  /* Scale the input to [0 1] range from [0 2*PI] , divide input by 2*pi, add 0.25 (pi/2) to read sine table */
+  /* Scale input to [0 1] range from [0 2*PI] , divide input by 2*pi, add 0.25 (pi/2) to read sine table */
  in = x * 0.159154943092f + 0.25f;

  /* Calculation of floor value of input */
@ -93,8 +94,14 @@ float32_t arm_cos_f32(
  in = in - (float32_t) n;

  /* Calculation of index of the table */
-  findex = (float32_t) FAST_MATH_TABLE_SIZE * in;
-  index = ((uint16_t)findex) & 0x1ff;
+  findex = (float32_t)FAST_MATH_TABLE_SIZE * in;
+  index = (uint16_t)findex;
+
+  /* when "in" is exactly 1, we need to rotate the index down to 0 */
+  if (index >= FAST_MATH_TABLE_SIZE) {
+    index = 0;
+    findex -= (float32_t)FAST_MATH_TABLE_SIZE;
+  }

  /* fractional value calculation */
  fract = findex - (float32_t) index;
@ -104,12 +111,12 @@ float32_t arm_cos_f32(
  b = sinTable_f32[index+1];

  /* Linear interpolation process */
-  cosVal = (1.0f-fract)*a + fract*b;
+  cosVal = (1.0f - fract) * a + fract * b;

-  /* Return the output value */
+  /* Return output value */
  return (cosVal);
 }

 /**
- * @} end of cos group
+  @} end of cos group
 */
--- a/DSP/Source/FastMathFunctions/arm_cos_q15.c
+++ b/DSP/Source/FastMathFunctions/arm_cos_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_cos_q15.c
 * Description:  Fast cosine calculation for Q15 values
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -30,36 +30,35 @@
 #include "arm_common_tables.h"

 /**
- * @ingroup groupFastMath
- */
-
- /**
- * @addtogroup cos
- * @{
+  @ingroup groupFastMath
 */

 /**
- * @brief Fast approximation to the trigonometric cosine function for Q15 data.
- * @param[in] x Scaled input value in radians.
- * @return  cos(x).
- *
- * The Q15 input value is in the range [0 +0.9999] and is mapped to a radian
- * value in the range [0 2*pi).
+  @addtogroup cos
+  @{
+ */
+
+/**
+  @brief         Fast approximation to the trigonometric cosine function for Q15 data.
+  @param[in]     x  Scaled input value in radians
+  @return        cos(x)
+
+  The Q15 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*PI).
 */

 q15_t arm_cos_q15(
  q15_t x)
 {
-  q15_t cosVal;                                  /* Temporary variables for input, output */
-  int32_t index;                                 /* Index variables */
-  q15_t a, b;                                    /* Four nearest output values */
+  q15_t cosVal;                                  /* Temporary input, output variables */
+  int32_t index;                                 /* Index variable */
+  q15_t a, b;                                    /* Two nearest output values */
  q15_t fract;                                   /* Temporary values for fractional values */

  /* add 0.25 (pi/2) to read sine table */
  x = (uint16_t)x + 0x2000;
  if (x < 0)
-  {   /* convert negative numbers to corresponding positive ones */
-      x = (uint16_t)x + 0x8000;
+  { /* convert negative numbers to corresponding positive ones */
+    x = (uint16_t)x + 0x8000;
  }

  /* Calculate the nearest index */
@ -73,12 +72,13 @@ q15_t arm_cos_q15(
  b = sinTable_q15[index+1];

  /* Linear interpolation process */
-  cosVal = (q31_t)(0x8000-fract)*a >> 16;
-  cosVal = (q15_t)((((q31_t)cosVal << 16) + ((q31_t)fract*b)) >> 16);
+  cosVal = (q31_t) (0x8000 - fract) * a >> 16;
+  cosVal = (q15_t) ((((q31_t) cosVal << 16) + ((q31_t) fract * b)) >> 16);

-  return cosVal << 1;
+  /* Return output value */
+  return (cosVal << 1);
 }

 /**
- * @} end of cos group
+  @} end of cos group
 */
--- a/DSP/Source/FastMathFunctions/arm_cos_q31.c
+++ b/DSP/Source/FastMathFunctions/arm_cos_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_cos_q31.c
 * Description:  Fast cosine calculation for Q31 values
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -30,36 +30,35 @@
 #include "arm_common_tables.h"

 /**
- * @ingroup groupFastMath
- */
-
- /**
- * @addtogroup cos
- * @{
+  @ingroup groupFastMath
 */

 /**
- * @brief Fast approximation to the trigonometric cosine function for Q31 data.
- * @param[in] x Scaled input value in radians.
- * @return  cos(x).
- *
- * The Q31 input value is in the range [0 +0.9999] and is mapped to a radian
- * value in the range [0 2*pi).
+  @addtogroup cos
+  @{
+ */
+
+/**
+  @brief         Fast approximation to the trigonometric cosine function for Q31 data.
+  @param[in]     x  Scaled input value in radians
+  @return        cos(x)
+
+  The Q31 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*PI).
 */

 q31_t arm_cos_q31(
  q31_t x)
 {
-  q31_t cosVal;                                  /* Temporary variables for input, output */
-  int32_t index;                                 /* Index variables */
-  q31_t a, b;                                    /* Four nearest output values */
+  q31_t cosVal;                                  /* Temporary input, output variables */
+  int32_t index;                                 /* Index variable */
+  q31_t a, b;                                    /* Two nearest output values */
  q31_t fract;                                   /* Temporary values for fractional values */

  /* add 0.25 (pi/2) to read sine table */
  x = (uint32_t)x + 0x20000000;
  if (x < 0)
-  {   /* convert negative numbers to corresponding positive ones */
-      x = (uint32_t)x + 0x80000000;
+  { /* convert negative numbers to corresponding positive ones */
+    x = (uint32_t)x + 0x80000000;
  }

  /* Calculate the nearest index */
@ -73,12 +72,13 @@ q31_t arm_cos_q31(
  b = sinTable_q31[index+1];

  /* Linear interpolation process */
-  cosVal = (q63_t)(0x80000000-fract)*a >> 32;
-  cosVal = (q31_t)((((q63_t)cosVal << 32) + ((q63_t)fract*b)) >> 32);
+  cosVal = (q63_t) (0x80000000 - fract) * a >> 32;
+  cosVal = (q31_t) ((((q63_t) cosVal << 32) + ((q63_t) fract * b)) >> 32);

-  return cosVal << 1;
+  /* Return output value */
+  return (cosVal << 1);
 }

 /**
- * @} end of cos group
+  @} end of cos group
 */
--- a/DSP/Source/FastMathFunctions/arm_sin_f32.c
+++ b/DSP/Source/FastMathFunctions/arm_sin_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_sin_f32.c
 * Description:  Fast sine calculation for floating-point values
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -28,70 +28,64 @@

 #include "arm_math.h"
 #include "arm_common_tables.h"
-#include <math.h>

 /**
- * @ingroup groupFastMath
+  @ingroup groupFastMath
 */

 /**
- * @defgroup sin Sine
- *
- * Computes the trigonometric sine function using a combination of table lookup
- * and linear interpolation.  There are separate functions for
- * Q15, Q31, and floating-point data types.
- * The input to the floating-point version is in radians and in the range [0 2*pi) while the
- * fixed-point Q15 and Q31 have a scaled input with the range
- * [0 +0.9999] mapping to [0 2*pi).  The fixed-point range is chosen so that a
- * value of 2*pi wraps around to 0.
- *
- * The implementation is based on table lookup using 256 values together with linear interpolation.
- * The steps used are:
- *  -# Calculation of the nearest integer table index
- *  -# Compute the fractional portion (fract) of the table index.
- *  -# The final result equals <code>(1.0f-fract)*a + fract*b;</code>
- *
- * where
- * <pre>
- *    b=Table[index+0];
- *    c=Table[index+1];
- * </pre>
+  @defgroup sin Sine
+
+  Computes the trigonometric sine function using a combination of table lookup
+  and linear interpolation.  There are separate functions for
+  Q15, Q31, and floating-point data types.
+  The input to the floating-point version is in radians while the
+  fixed-point Q15 and Q31 have a scaled input with the range
+  [0 +0.9999] mapping to [0 2*pi).  The fixed-point range is chosen so that a
+  value of 2*pi wraps around to 0.
+
+  The implementation is based on table lookup using 256 values together with linear interpolation.
+  The steps used are:
+   -# Calculation of the nearest integer table index
+   -# Compute the fractional portion (fract) of the table index.
+   -# The final result equals <code>(1.0f-fract)*a + fract*b;</code>
+
+  where
+  <pre>
+     b = Table[index];
+     c = Table[index+1];
+  </pre>
 */

 /**
- * @addtogroup sin
- * @{
+  @addtogroup sin
+  @{
 */

 /**
- * @brief  Fast approximation to the trigonometric sine function for floating-point data.
- * @param[in] x input value in radians.
- * @return  sin(x).
+  @brief         Fast approximation to the trigonometric sine function for floating-point data.
+  @param[in]     x  input value in radians.
+  @return        sin(x)
 */

 float32_t arm_sin_f32(
  float32_t x)
 {
-  float32_t sinVal, fract, in;                           /* Temporary variables for input, output */
-  uint16_t index;                                        /* Index variable */
-  float32_t a, b;                                        /* Two nearest output values */
+  float32_t sinVal, fract, in;                   /* Temporary input, output variables */
+  uint16_t index;                                /* Index variable */
+  float32_t a, b;                                /* Two nearest output values */
  int32_t n;
  float32_t findex;

-  /* Special case for small negative inputs */
-  if ((x < 0.0f) && (x >= -1.9e-7f)) {
-     return x;
-  }
-
  /* input x is in radians */
-  /* Scale the input to [0 1] range from [0 2*PI] , divide input by 2*pi */
+  /* Scale input to [0 1] range from [0 2*PI] , divide input by 2*pi */
  in = x * 0.159154943092f;

  /* Calculation of floor value of input */
  n = (int32_t) in;

  /* Make negative values towards -infinity */
-  if (x < 0.0f)
+  if (in < 0.0f)
  {
    n--;
  }
@ -100,9 +94,14 @@ float32_t arm_sin_f32(
  in = in - (float32_t) n;

  /* Calculation of index of the table */
-  findex = (float32_t) FAST_MATH_TABLE_SIZE * in;
+  findex = (float32_t)FAST_MATH_TABLE_SIZE * in;
+  index = (uint16_t)findex;

-  index = ((uint16_t)findex) & 0x1ff;
+  /* when "in" is exactly 1, we need to rotate the index down to 0 */
+  if (index >= FAST_MATH_TABLE_SIZE) {
+    index = 0;
+    findex -= (float32_t)FAST_MATH_TABLE_SIZE;
+  }

  /* fractional value calculation */
  fract = findex - (float32_t) index;
@ -112,12 +111,12 @@ float32_t arm_sin_f32(
  b = sinTable_f32[index+1];

  /* Linear interpolation process */
-  sinVal = (1.0f-fract)*a + fract*b;
+  sinVal = (1.0f - fract) * a + fract * b;

-  /* Return the output value */
+  /* Return output value */
  return (sinVal);
 }

 /**
- * @} end of sin group
+  @} end of sin group
 */
--- a/DSP/Source/FastMathFunctions/arm_sin_q15.c
+++ b/DSP/Source/FastMathFunctions/arm_sin_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_sin_q15.c
 * Description:  Fast sine calculation for Q15 values
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -30,28 +30,28 @@
 #include "arm_common_tables.h"

 /**
- * @ingroup groupFastMath
- */
-
- /**
- * @addtogroup sin
- * @{
+  @ingroup groupFastMath
 */

 /**
- * @brief Fast approximation to the trigonometric sine function for Q15 data.
- * @param[in] x Scaled input value in radians.
- * @return  sin(x).
- *
- * The Q15 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*pi).
+  @addtogroup sin
+  @{
+ */
+
+/**
+  @brief         Fast approximation to the trigonometric sine function for Q15 data.
+  @param[in]     x  Scaled input value in radians
+  @return        sin(x)
+
+  The Q15 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*PI).
 */

 q15_t arm_sin_q15(
  q15_t x)
 {
-  q15_t sinVal;                                  /* Temporary variables for input, output */
-  int32_t index;                                 /* Index variables */
-  q15_t a, b;                                    /* Four nearest output values */
+  q15_t sinVal;                                  /* Temporary input, output variables */
+  int32_t index;                                 /* Index variable */
+  q15_t a, b;                                    /* Two nearest output values */
  q15_t fract;                                   /* Temporary values for fractional values */

  /* Calculate the nearest index */
@ -65,12 +65,13 @@ q15_t arm_sin_q15(
  b = sinTable_q15[index+1];

  /* Linear interpolation process */
-  sinVal = (q31_t)(0x8000-fract)*a >> 16;
-  sinVal = (q15_t)((((q31_t)sinVal << 16) + ((q31_t)fract*b)) >> 16);
+  sinVal = (q31_t) (0x8000 - fract) * a >> 16;
+  sinVal = (q15_t) ((((q31_t) sinVal << 16) + ((q31_t) fract * b)) >> 16);

-  return sinVal << 1;
+  /* Return output value */
+  return (sinVal << 1);
 }

 /**
- * @} end of sin group
+  @} end of sin group
 */
--- a/DSP/Source/FastMathFunctions/arm_sin_q31.c
+++ b/DSP/Source/FastMathFunctions/arm_sin_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_sin_q31.c
 * Description:  Fast sine calculation for Q31 values
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -30,27 +30,28 @@
 #include "arm_common_tables.h"

 /**
- * @ingroup groupFastMath
- */
-
- /**
- * @addtogroup sin
- * @{
+  @ingroup groupFastMath
 */

 /**
- * @brief Fast approximation to the trigonometric sine function for Q31 data.
- * @param[in] x Scaled input value in radians.
- * @return  sin(x).
- *
- * The Q31 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*pi). */
+  @addtogroup sin
+  @{
+ */
+
+/**
+  @brief         Fast approximation to the trigonometric sine function for Q31 data.
+  @param[in]     x  Scaled input value in radians
+  @return        sin(x)
+
+  The Q31 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*PI).
+ */

 q31_t arm_sin_q31(
  q31_t x)
 {
  q31_t sinVal;                                  /* Temporary variables for input, output */
-  int32_t index;                                 /* Index variables */
-  q31_t a, b;                                    /* Four nearest output values */
+  int32_t index;                                 /* Index variable */
+  q31_t a, b;                                    /* Two nearest output values */
  q31_t fract;                                   /* Temporary values for fractional values */

  /* Calculate the nearest index */
@ -64,12 +65,13 @@ q31_t arm_sin_q31(
  b = sinTable_q31[index+1];

  /* Linear interpolation process */
-  sinVal = (q63_t)(0x80000000-fract)*a >> 32;
-  sinVal = (q31_t)((((q63_t)sinVal << 32) + ((q63_t)fract*b)) >> 32);
+  sinVal = (q63_t) (0x80000000 - fract) * a >> 32;
+  sinVal = (q31_t) ((((q63_t) sinVal << 32) + ((q63_t) fract * b)) >> 32);

-  return sinVal << 1;
+  /* Return output value */
+  return (sinVal << 1);
 }

 /**
- * @} end of sin group
+  @} end of sin group
 */
--- a/DSP/Source/FastMathFunctions/arm_sqrt_q15.c
+++ b/DSP/Source/FastMathFunctions/arm_sqrt_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_sqrt_q15.c
 * Description:  Q15 square root function
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,31 +29,30 @@
 #include "arm_math.h"
 #include "arm_common_tables.h"

-
 /**
- * @ingroup groupFastMath
+  @ingroup groupFastMath
 */

 /**
- * @addtogroup SQRT
- * @{
+  @addtogroup SQRT
+  @{
 */

-  /**
-   * @brief  Q15 square root function.
-   * @param[in]   in     input value.  The range of the input value is [0 +1) or 0x0000 to 0x7FFF.
-   * @param[out]  *pOut  square root of input value.
-   * @return The function returns ARM_MATH_SUCCESS if the input value is positive
-   * and ARM_MATH_ARGUMENT_ERROR if the input is negative.  For
-   * negative inputs, the function returns *pOut = 0.
-   */
+/**
+  @brief         Q15 square root function.
+  @param[in]     in    input value.  The range of the input value is [0 +1) or 0x0000 to 0x7FFF
+  @param[out]    pOut  points to square root of input value
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : input value is positive
+                   - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
+ */

 arm_status arm_sqrt_q15(
  q15_t in,
  q15_t * pOut)
 {
-  q15_t number, temp1, var1, signBits1, half;
  q31_t bits_val1;
+  q15_t number, temp1, var1, signBits1, half;
  float32_t temp_float1;
  union
  {
@ -85,7 +84,7 @@ arm_status arm_sqrt_q15(

    /* Convert to float */
    temp_float1 = number * 3.051757812500000e-005f;
-    /*Store as integer */
+    /* Store as integer */
    tempconv.floatval = temp_float1;
    bits_val1 = tempconv.fracval;
    /* Subtract the shifted value from the magic number to give intial guess */
@ -135,10 +134,11 @@ arm_status arm_sqrt_q15(
  else
  {
    *pOut = 0;
+
    return (ARM_MATH_ARGUMENT_ERROR);
  }
 }

 /**
- * @} end of SQRT group
+  @} end of SQRT group
 */
--- a/DSP/Source/FastMathFunctions/arm_sqrt_q31.c
+++ b/DSP/Source/FastMathFunctions/arm_sqrt_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_sqrt_q31.c
 * Description:  Q31 square root function
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -30,33 +30,34 @@
 #include "arm_common_tables.h"

 /**
- * @ingroup groupFastMath
+  @ingroup groupFastMath
 */

 /**
- * @addtogroup SQRT
- * @{
+  @addtogroup SQRT
+  @{
 */

 /**
- * @brief Q31 square root function.
- * @param[in]   in    input value.  The range of the input value is [0 +1) or 0x00000000 to 0x7FFFFFFF.
- * @param[out]  *pOut square root of input value.
- * @return The function returns ARM_MATH_SUCCESS if the input value is positive
- * and ARM_MATH_ARGUMENT_ERROR if the input is negative.  For
- * negative inputs, the function returns *pOut = 0.
+  @brief         Q31 square root function.
+  @param[in]     in    input value.  The range of the input value is [0 +1) or 0x00000000 to 0x7FFFFFFF
+  @param[out]    pOut  points to square root of input value
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : input value is positive
+                   - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
 */

 arm_status arm_sqrt_q31(
  q31_t in,
  q31_t * pOut)
 {
-  q31_t number, temp1, bits_val1, var1, signBits1, half;
+  q31_t bits_val1;
+  q31_t number, temp1, var1, signBits1, half;
  float32_t temp_float1;
  union
  {
-      q31_t fracval;
-      float32_t floatval;
+    q31_t fracval;
+    float32_t floatval;
  } tempconv;

  number = in;
@ -81,9 +82,9 @@ arm_status arm_sqrt_q31(
    /* Store the number for later use */
    temp1 = number;

-    /*Convert to float */
+    /* Convert to float */
    temp_float1 = number * 4.6566128731e-010f;
-    /*Store as integer */
+    /* Store as integer */
    tempconv.floatval = temp_float1;
    bits_val1 = tempconv.fracval;
    /* Subtract the shifted value from the magic number to give intial guess */
@ -133,10 +134,11 @@ arm_status arm_sqrt_q31(
  else
  {
    *pOut = 0;
+
    return (ARM_MATH_ARGUMENT_ERROR);
  }
 }

 /**
- * @} end of SQRT group
+  @} end of SQRT group
 */
--- a/DSP/Source/FilteringFunctions/CMakeLists.txt
+++ b/DSP/Source/FilteringFunctions/CMakeLists.txt
@ -0,0 +1,128 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPFiltering)
+
+
+add_library(CMSISDSPFiltering STATIC)
+
+include(interpol)
+interpol(CMSISDSPFiltering)
+
+configdsp(CMSISDSPFiltering ..)
+
+if (CONFIGTABLE AND ALLFAST)
+target_compile_definitions(CMSISDSPFiltering PUBLIC ARM_ALL_FAST_TABLES)  
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_LMS_NORM_Q31)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_init_q31.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_LMS_NORM_Q15)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_init_q15.c)
+endif()
+
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_32x64_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_32x64_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_fast_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_fast_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_init_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df1_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df2T_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df2T_f64.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df2T_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_df2T_init_f64.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_stereo_df2T_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_biquad_cascade_stereo_df2T_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_fast_opt_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_fast_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_fast_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_opt_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_opt_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_fast_opt_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_fast_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_fast_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_opt_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_opt_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_partial_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_conv_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_fast_opt_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_fast_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_fast_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_opt_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_opt_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_correlate_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_fast_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_fast_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_init_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_decimate_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_fast_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_fast_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_init_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_init_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_init_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_interpolate_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_init_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_lattice_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_init_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_init_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_fir_sparse_q7.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_init_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_iir_lattice_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_init_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_init_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_init_f32.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_norm_q31.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_q15.c)
+target_sources(CMSISDSPFiltering PRIVATE arm_lms_q31.c)
+
+
+### Includes
+target_include_directories(CMSISDSPFiltering PUBLIC "${DSP}/../../Include")
+
+
+
--- a/DSP/Source/FilteringFunctions/FilteringFunctions.c
+++ b/DSP/Source/FilteringFunctions/FilteringFunctions.c
@ -0,0 +1,127 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        FilteringFunctions.c
+ * Description:  Combination of all filtering function source files.
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_biquad_cascade_df1_32x64_init_q31.c"
+#include "arm_biquad_cascade_df1_32x64_q31.c"
+#include "arm_biquad_cascade_df1_f32.c"
+#include "arm_biquad_cascade_df1_fast_q15.c"
+#include "arm_biquad_cascade_df1_fast_q31.c"
+#include "arm_biquad_cascade_df1_init_f32.c"
+#include "arm_biquad_cascade_df1_init_q15.c"
+#include "arm_biquad_cascade_df1_init_q31.c"
+#include "arm_biquad_cascade_df1_q15.c"
+#include "arm_biquad_cascade_df1_q31.c"
+#include "arm_biquad_cascade_df2T_f32.c"
+#include "arm_biquad_cascade_df2T_f64.c"
+#include "arm_biquad_cascade_df2T_init_f32.c"
+#include "arm_biquad_cascade_df2T_init_f64.c"
+#include "arm_biquad_cascade_stereo_df2T_f32.c"
+#include "arm_biquad_cascade_stereo_df2T_init_f32.c"
+#include "arm_conv_f32.c"
+#include "arm_conv_fast_opt_q15.c"
+#include "arm_conv_fast_q15.c"
+#include "arm_conv_fast_q31.c"
+#include "arm_conv_opt_q15.c"
+#include "arm_conv_opt_q7.c"
+#include "arm_conv_partial_f32.c"
+#include "arm_conv_partial_fast_opt_q15.c"
+#include "arm_conv_partial_fast_q15.c"
+#include "arm_conv_partial_fast_q31.c"
+#include "arm_conv_partial_opt_q15.c"
+#include "arm_conv_partial_opt_q7.c"
+#include "arm_conv_partial_q15.c"
+#include "arm_conv_partial_q31.c"
+#include "arm_conv_partial_q7.c"
+#include "arm_conv_q15.c"
+#include "arm_conv_q31.c"
+#include "arm_conv_q7.c"
+#include "arm_correlate_f32.c"
+#include "arm_correlate_fast_opt_q15.c"
+#include "arm_correlate_fast_q15.c"
+#include "arm_correlate_fast_q31.c"
+#include "arm_correlate_opt_q15.c"
+#include "arm_correlate_opt_q7.c"
+#include "arm_correlate_q15.c"
+#include "arm_correlate_q31.c"
+#include "arm_correlate_q7.c"
+#include "arm_fir_decimate_f32.c"
+#include "arm_fir_decimate_fast_q15.c"
+#include "arm_fir_decimate_fast_q31.c"
+#include "arm_fir_decimate_init_f32.c"
+#include "arm_fir_decimate_init_q15.c"
+#include "arm_fir_decimate_init_q31.c"
+#include "arm_fir_decimate_q15.c"
+#include "arm_fir_decimate_q31.c"
+#include "arm_fir_f32.c"
+#include "arm_fir_fast_q15.c"
+#include "arm_fir_fast_q31.c"
+#include "arm_fir_init_f32.c"
+#include "arm_fir_init_q15.c"
+#include "arm_fir_init_q31.c"
+#include "arm_fir_init_q7.c"
+#include "arm_fir_interpolate_f32.c"
+#include "arm_fir_interpolate_init_f32.c"
+#include "arm_fir_interpolate_init_q15.c"
+#include "arm_fir_interpolate_init_q31.c"
+#include "arm_fir_interpolate_q15.c"
+#include "arm_fir_interpolate_q31.c"
+#include "arm_fir_lattice_f32.c"
+#include "arm_fir_lattice_init_f32.c"
+#include "arm_fir_lattice_init_q15.c"
+#include "arm_fir_lattice_init_q31.c"
+#include "arm_fir_lattice_q15.c"
+#include "arm_fir_lattice_q31.c"
+#include "arm_fir_q15.c"
+#include "arm_fir_q31.c"
+#include "arm_fir_q7.c"
+#include "arm_fir_sparse_f32.c"
+#include "arm_fir_sparse_init_f32.c"
+#include "arm_fir_sparse_init_q15.c"
+#include "arm_fir_sparse_init_q31.c"
+#include "arm_fir_sparse_init_q7.c"
+#include "arm_fir_sparse_q15.c"
+#include "arm_fir_sparse_q31.c"
+#include "arm_fir_sparse_q7.c"
+#include "arm_iir_lattice_f32.c"
+#include "arm_iir_lattice_init_f32.c"
+#include "arm_iir_lattice_init_q15.c"
+#include "arm_iir_lattice_init_q31.c"
+#include "arm_iir_lattice_q15.c"
+#include "arm_iir_lattice_q31.c"
+#include "arm_lms_f32.c"
+#include "arm_lms_init_f32.c"
+#include "arm_lms_init_q15.c"
+#include "arm_lms_init_q31.c"
+#include "arm_lms_norm_f32.c"
+#include "arm_lms_norm_init_f32.c"
+#include "arm_lms_norm_init_q15.c"
+#include "arm_lms_norm_init_q31.c"
+#include "arm_lms_norm_q15.c"
+#include "arm_lms_norm_q31.c"
+#include "arm_lms_q15.c"
+#include "arm_lms_q31.c"
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_biquad_cascade_df1_32x64_init_q31.c
 * Description:  High precision Q31 Biquad cascade filter initialization function
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,53 +29,49 @@
 #include "arm_math.h"

 /**
- * @ingroup groupFilters
+  @ingroup groupFilters
 */

 /**
- * @addtogroup BiquadCascadeDF1_32x64
- * @{
+  @addtogroup BiquadCascadeDF1_32x64
+  @{
 */

 /**
- * @details
- *
- * @param[in,out] *S           	points to an instance of the high precision Q31 Biquad cascade filter structure.
- * @param[in]     numStages     number of 2nd order stages in the filter.
- * @param[in]     *pCoeffs      points to the filter coefficients.
- * @param[in]     *pState       points to the state buffer.
- * @param[in]     postShift     Shift to be applied after the accumulator.  Varies according to the coefficients format.
- * @return        none
- *
- * <b>Coefficient and State Ordering:</b>
- *
- * \par
- * The coefficients are stored in the array <code>pCoeffs</code> in the following order:
- * <pre>
- *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
- * </pre>
- * where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
- * <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
- * and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
- *
- * \par
- * The <code>pState</code> points to state variables array and size of each state variable is 1.63 format.
- * Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
- * The state variables are arranged in the state array as:
- * <pre>
- *     {x[n-1], x[n-2], y[n-1], y[n-2]}
- * </pre>
- * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
- * The state array has a total length of <code>4*numStages</code> values.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
+  @brief         Initialization function for the Q31 Biquad cascade 32x64 filter.
+  @param[in,out] S           points to an instance of the high precision Q31 Biquad cascade filter structure
+  @param[in]     numStages    number of 2nd order stages in the filter
+  @param[in]     pCoeffs      points to the filter coefficients
+  @param[in]     pState       points to the state buffer
+  @param[in]     postShift    Shift to be applied after the accumulator.  Varies according to the coefficients format
+  @return        none
+
+  @par           Coefficient and State Ordering
+                   The coefficients are stored in the array <code>pCoeffs</code> in the following order:
+  <pre>
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  </pre>
+                   where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
+                   <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
+                   and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
+  @par
+                   The <code>pState</code> points to state variables array and size of each state variable is 1.63 format.
+                   Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
+                   The state variables are arranged in the state array as:
+  <pre>
+      {x[n-1], x[n-2], y[n-1], y[n-2]}
+  </pre>
+                   The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+                   The state array has a total length of <code>4*numStages</code> values.
+                   The state variables are updated after each block of data is processed; the coefficients are untouched.
 */

 void arm_biquad_cas_df1_32x64_init_q31(
-  arm_biquad_cas_df1_32x64_ins_q31 * S,
-  uint8_t numStages,
-  q31_t * pCoeffs,
-  q63_t * pState,
-  uint8_t postShift)
+        arm_biquad_cas_df1_32x64_ins_q31 * S,
+        uint8_t numStages,
+  const q31_t * pCoeffs,
+        q63_t * pState,
+        uint8_t postShift)
 {
  /* Assign filter stages */
  S->numStages = numStages;
@ -94,5 +90,5 @@ void arm_biquad_cas_df1_32x64_init_q31(
 }

 /**
- * @} end of BiquadCascadeDF1_32x64 group
+  @} end of BiquadCascadeDF1_32x64 group
 */
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_biquad_cascade_df1_32x64_q31.c
 * Description:  High precision Q31 Biquad cascade filter processing function
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,174 +29,169 @@
 #include "arm_math.h"

 /**
- * @ingroup groupFilters
+  @ingroup groupFilters
 */

 /**
- * @defgroup BiquadCascadeDF1_32x64 High Precision Q31 Biquad Cascade Filter
- *
- * This function implements a high precision Biquad cascade filter which operates on
- * Q31 data values.  The filter coefficients are in 1.31 format and the state variables
- * are in 1.63 format.  The double precision state variables reduce quantization noise
- * in the filter and provide a cleaner output.
- * These filters are particularly useful when implementing filters in which the
- * singularities are close to the unit circle.  This is common for low pass or high
- * pass filters with very low cutoff frequencies.
- *
- * The function operates on blocks of input and output data
- * and each call to the function processes <code>blockSize</code> samples through
- * the filter. <code>pSrc</code> and <code>pDst</code> points to input and output arrays
- * containing <code>blockSize</code> Q31 values.
- *
- * \par Algorithm
- * Each Biquad stage implements a second order filter using the difference equation:
- * <pre>
- *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
- * </pre>
- * A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
- * \image html Biquad.gif "Single Biquad filter stage"
- * Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
- * Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
- * Pay careful attention to the sign of the feedback coefficients.
- * Some design tools use the difference equation
- * <pre>
- *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]
- * </pre>
- * In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
- *
- * \par
- * Higher order filters are realized as a cascade of second order sections.
- * <code>numStages</code> refers to the number of second order stages used.
- * For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
- * \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
- * A 9th order filter would be realized with <code>numStages=5</code> second order stages with the coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
- *
- * \par
- * The <code>pState</code> points to state variables array .
- * Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code> and each state variable in 1.63 format to improve precision.
- * The state variables are arranged in the array as:
- * <pre>
- *     {x[n-1], x[n-2], y[n-1], y[n-2]}
- * </pre>
- *
- * \par
- * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
- * The state array has a total length of <code>4*numStages</code> values of data in 1.63 format.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
- *
- * \par Instance Structure
- * The coefficients and state variables for a filter are stored together in an instance data structure.
- * A separate instance structure must be defined for each filter.
- * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
- *
- * \par Init Function
- * There is also an associated initialization function which performs the following operations:
- * - Sets the values of the internal structure fields.
- * - Zeros out the values in the state buffer.
- * To do this manually without calling the init function, assign the follow subfields of the instance structure:
- * numStages, pCoeffs, postShift, pState. Also set all of the values in pState to zero.
- *
- * \par
- * Use of the initialization function is optional.
- * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
- * To place an instance structure into a const data section, the instance structure must be manually initialized.
- * Set the values in the state buffer to zeros before static initialization.
- * For example, to statically initialize the filter instance structure use
- * <pre>
- *     arm_biquad_cas_df1_32x64_ins_q31 S1 = {numStages, pState, pCoeffs, postShift};
- * </pre>
- * where <code>numStages</code> is the number of Biquad stages in the filter; <code>pState</code> is the address of the state buffer;
- * <code>pCoeffs</code> is the address of the coefficient buffer; <code>postShift</code> shift to be applied which is described in detail below.
- * \par Fixed-Point Behavior
- * Care must be taken while using Biquad Cascade 32x64 filter function.
- * Following issues must be considered:
- * - Scaling of coefficients
- * - Filter gain
- * - Overflow and saturation
- *
- * \par
- * Filter coefficients are represented as fractional values and
- * restricted to lie in the range <code>[-1 +1)</code>.
- * The processing function has an additional scaling parameter <code>postShift</code>
- * which allows the filter coefficients to exceed the range <code>[+1 -1)</code>.
- * At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
- * \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
- * This essentially scales the filter coefficients by <code>2^postShift</code>.
- * For example, to realize the coefficients
- * <pre>
- *    {1.5, -0.8, 1.2, 1.6, -0.9}
- * </pre>
- * set the Coefficient array to:
- * <pre>
- *    {0.75, -0.4, 0.6, 0.8, -0.45}
- * </pre>
- * and set <code>postShift=1</code>
- *
- * \par
- * The second thing to keep in mind is the gain through the filter.
- * The frequency response of a Biquad filter is a function of its coefficients.
- * It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies.
- * This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter.
- * To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed.
- *
- * \par
- * The third item to consider is the overflow and saturation behavior of the fixed-point Q31 version.
- * This is described in the function specific documentation below.
+  @defgroup BiquadCascadeDF1_32x64 High Precision Q31 Biquad Cascade Filter
+
+  This function implements a high precision Biquad cascade filter which operates on
+  Q31 data values.  The filter coefficients are in 1.31 format and the state variables
+  are in 1.63 format.  The double precision state variables reduce quantization noise
+  in the filter and provide a cleaner output.
+  These filters are particularly useful when implementing filters in which the
+  singularities are close to the unit circle.  This is common for low pass or high
+  pass filters with very low cutoff frequencies.
+
+  The function operates on blocks of input and output data
+  and each call to the function processes <code>blockSize</code> samples through
+  the filter. <code>pSrc</code> and <code>pDst</code> points to input and output arrays
+  containing <code>blockSize</code> Q31 values.
+
+  @par           Algorithm
+                   Each Biquad stage implements a second order filter using the difference equation:
+  <pre>
+      y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+  </pre>
+                   A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
+                   \image html Biquad.gif "Single Biquad filter stage"
+                   Coefficients <code>b0, b1 and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
+                   Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
+                   Pay careful attention to the sign of the feedback coefficients.
+                   Some design tools use the difference equation
+  <pre>
+      y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]
+  </pre>
+                   In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
+  @par
+                   Higher order filters are realized as a cascade of second order sections.
+                   <code>numStages</code> refers to the number of second order stages used.
+                   For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
+                   \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
+                   A 9th order filter would be realized with <code>numStages=5</code> second order stages
+                   with the coefficients for one of the stages configured as a first order filter
+                   (<code>b2=0</code> and <code>a2=0</code>).
+  @par
+                   The <code>pState</code> points to state variables array.
+                   Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code> and each state variable in 1.63 format to improve precision.
+                   The state variables are arranged in the array as:
+  <pre>
+      {x[n-1], x[n-2], y[n-1], y[n-2]}
+  </pre>
+  @par
+                   The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+                   The state array has a total length of <code>4*numStages</code> values of data in 1.63 format.
+                   The state variables are updated after each block of data is processed, the coefficients are untouched.
+
+  @par           Instance Structure
+                   The coefficients and state variables for a filter are stored together in an instance data structure.
+                   A separate instance structure must be defined for each filter.
+                   Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+
+  @par           Init Function
+                   There is also an associated initialization function which performs the following operations:
+                   - Sets the values of the internal structure fields.
+                   - Zeros out the values in the state buffer.
+                   To do this manually without calling the init function, assign the follow subfields of the instance structure:
+                   numStages, pCoeffs, postShift, pState. Also set all of the values in pState to zero.
+
+  @par
+                   Use of the initialization function is optional.
+                   However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+                   To place an instance structure into a const data section, the instance structure must be manually initialized.
+                   Set the values in the state buffer to zeros before static initialization.
+                   For example, to statically initialize the filter instance structure use
+  <pre>
+      arm_biquad_cas_df1_32x64_ins_q31 S1 = {numStages, pState, pCoeffs, postShift};
+  </pre>
+                   where <code>numStages</code> is the number of Biquad stages in the filter;
+                   <code>pState</code> is the address of the state buffer;
+                   <code>pCoeffs</code> is the address of the coefficient buffer;
+                   <code>postShift</code> shift to be applied which is described in detail below.
+  @par           Fixed-Point Behavior
+                   Care must be taken while using Biquad Cascade 32x64 filter function.
+                   Following issues must be considered:
+                   - Scaling of coefficients
+                   - Filter gain
+                   - Overflow and saturation
+
+  @par
+                   Filter coefficients are represented as fractional values and
+                   restricted to lie in the range <code>[-1 +1)</code>.
+                   The processing function has an additional scaling parameter <code>postShift</code>
+                   which allows the filter coefficients to exceed the range <code>[+1 -1)</code>.
+                   At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
+                   \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
+                   This essentially scales the filter coefficients by <code>2^postShift</code>.
+                   For example, to realize the coefficients
+  <pre>
+     {1.5, -0.8, 1.2, 1.6, -0.9}
+  </pre>
+                   set the Coefficient array to:
+  <pre>
+     {0.75, -0.4, 0.6, 0.8, -0.45}
+  </pre>
+                   and set <code>postShift=1</code>
+  @par
+                   The second thing to keep in mind is the gain through the filter.
+                   The frequency response of a Biquad filter is a function of its coefficients.
+                   It is possible for the gain through the filter to exceed 1.0 meaning that the
+                   filter increases the amplitude of certain frequencies.
+                   This means that an input signal with amplitude < 1.0 may result in an output > 1.0
+                   and these are saturated or overflowed based on the implementation of the filter.
+                   To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0
+                   or the input signal must be scaled down so that the combination of input and filter are never overflowed.
+  @par
+                   The third item to consider is the overflow and saturation behavior of the fixed-point Q31 version.
+                   This is described in the function specific documentation below.
 */

 /**
- * @addtogroup BiquadCascadeDF1_32x64
- * @{
+  @addtogroup BiquadCascadeDF1_32x64
+  @{
 */

 /**
- * @details
+  @brief         Processing function for the Q31 Biquad cascade 32x64 filter.
+  @param[in]     S         points to an instance of the high precision Q31 Biquad cascade filter
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize number of samples to process
+  @return        none

- * @param[in]  *S points to an instance of the high precision Q31 Biquad cascade filter.
- * @param[in]  *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in]  blockSize number of samples to process.
- * @return none.
- *
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * Thus, if the accumulator result overflows it wraps around rather than clip.
- * In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
- * After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to
- * 1.31 format by discarding the low 32 bits.
- *
- * \par
- * Two related functions are provided in the CMSIS DSP library.
- * <code>arm_biquad_cascade_df1_q31()</code> implements a Biquad cascade with 32-bit coefficients and state variables with a Q63 accumulator.
- * <code>arm_biquad_cascade_df1_fast_q31()</code> implements a Biquad cascade with 32-bit coefficients and state variables with a Q31 accumulator.
+  @par           Details
+                   The function is implemented using an internal 64-bit accumulator.
+                   The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+                   Thus, if the accumulator result overflows it wraps around rather than clip.
+                   In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
+                   After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to
+                   1.31 format by discarding the low 32 bits.
+  @par
+                   Two related functions are provided in the CMSIS DSP library.
+                   - \ref arm_biquad_cascade_df1_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q63 accumulator.
+                   - \ref arm_biquad_cascade_df1_fast_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q31 accumulator.
 */

 void arm_biquad_cas_df1_32x64_q31(
  const arm_biquad_cas_df1_32x64_ins_q31 * S,
-  q31_t * pSrc,
-  q31_t * pDst,
-  uint32_t blockSize)
+        q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
 {
-  q31_t *pIn = pSrc;                             /*  input pointer initialization  */
-  q31_t *pOut = pDst;                            /*  output pointer initialization */
-  q63_t *pState = S->pState;                     /*  state pointer initialization  */
-  q31_t *pCoeffs = S->pCoeffs;                   /*  coeff pointer initialization  */
-  q63_t acc;                                     /*  accumulator                   */
-  q31_t Xn1, Xn2;                                /*  Input Filter state variables        */
-  q63_t Yn1, Yn2;                                /*  Output Filter state variables        */
-  q31_t b0, b1, b2, a1, a2;                      /*  Filter coefficients           */
-  q31_t Xn;                                      /*  temporary input               */
-  int32_t shift = (int32_t) S->postShift + 1;    /*  Shift to be applied to the output */
-  uint32_t sample, stage = S->numStages;         /*  loop counters                     */
-  q31_t acc_l, acc_h;                            /*  temporary output               */
-  uint32_t uShift = ((uint32_t) S->postShift + 1U);
-  uint32_t lShift = 32U - uShift;                /*  Shift to be applied to the output */
-
-
-#if defined (ARM_MATH_DSP)
-
-  /* Run the below code for Cortex-M4 and Cortex-M3 */
+        q31_t *pIn = pSrc;                             /* input pointer initialization */
+        q31_t *pOut = pDst;                            /* output pointer initialization */
+        q63_t *pState = S->pState;                     /* state pointer initialization */
+  const q31_t *pCoeffs = S->pCoeffs;                   /* coeff pointer initialization */
+        q63_t acc;                                     /* accumulator */
+        q31_t Xn1, Xn2;                                /* Input Filter state variables */
+        q63_t Yn1, Yn2;                                /* Output Filter state variables */
+        q31_t b0, b1, b2, a1, a2;                      /* Filter coefficients */
+        q31_t Xn;                                      /* temporary input */
+        int32_t shift = (int32_t) S->postShift + 1;    /* Shift to be applied to the output */
+        uint32_t sample, stage = S->numStages;         /* loop counters */
+        q31_t acc_l, acc_h;                            /* temporary output */
+        uint32_t uShift = ((uint32_t) S->postShift + 1U);
+        uint32_t lShift = 32U - uShift;                /* Shift to be applied to the output */

  do
  {
@ -213,16 +208,16 @@ void arm_biquad_cas_df1_32x64_q31(
    Yn1 = pState[2];
    Yn2 = pState[3];

+#if defined (ARM_MATH_LOOPUNROLL)
+
    /* Apply loop unrolling and compute 4 output values simultaneously. */
-    /* The variable acc hold output value that is being computed and
-     * stored in the destination buffer
+    /* Variable acc hold output value that is being computed and stored in destination buffer
     * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
     */

+    /* Loop unrolling: Compute 4 outputs at a time */
    sample = blockSize >> 2U;

-    /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
    while (sample > 0U)
    {
      /* Read the input */
@ -231,13 +226,13 @@ void arm_biquad_cas_df1_32x64_q31(
      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */

      /* acc =  b0 * x[n] */
-      acc = (q63_t) Xn *b0;
+      acc = (q63_t) Xn * b0;

      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) Xn1 *b1;
+      acc += (q63_t) Xn1 * b1;

      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) Xn2 *b2;
+      acc += (q63_t) Xn2 * b2;

      /* acc +=  a1 * y[n-1] */
      acc += mult32x64(Yn1, a1);
@ -266,13 +261,13 @@ void arm_biquad_cas_df1_32x64_q31(
      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */

      /* acc +=  b1 * x[n-1] */
-      acc = (q63_t) Xn *b1;
+      acc = (q63_t) Xn * b1;

      /* acc =  b0 * x[n] */
-      acc += (q63_t) Xn2 *b0;
+      acc += (q63_t) Xn2 * b0;

      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) Xn1 *b2;
+      acc += (q63_t) Xn1 * b2;

      /* acc +=  a1 * y[n-1] */
      acc += mult32x64(Yn2, a1);
@ -302,13 +297,13 @@ void arm_biquad_cas_df1_32x64_q31(
      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */

      /* acc =  b0 * x[n] */
-      acc = (q63_t) Xn1 *b0;
+      acc = (q63_t) Xn1 * b0;

      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) Xn2 *b1;
+      acc += (q63_t) Xn2 * b1;

      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) Xn *b2;
+      acc += (q63_t) Xn * b2;

      /* acc +=  a1 * y[n-1] */
      acc += mult32x64(Yn1, a1);
@ -336,13 +331,13 @@ void arm_biquad_cas_df1_32x64_q31(

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
-      acc = (q63_t) Xn *b0;
+      acc = (q63_t) Xn * b0;

      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) Xn1 *b1;
+      acc += (q63_t) Xn1 * b1;

      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) Xn2 *b2;
+      acc += (q63_t) Xn2 * b2;

      /* acc +=  a1 * y[n-1] */
      acc += mult32x64(Yn2, a1);
@ -366,139 +361,55 @@ void arm_biquad_cas_df1_32x64_q31(
      *(pOut + 3U) = acc_h;

      /* Every time after the output is computed state should be updated. */
-      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc    */
+      /* The states should be updated as: */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
      Xn2 = Xn1;
      Xn1 = Xn;

      /* update output pointer */
      pOut += 4U;

-      /* decrement the loop counter */
+      /* decrement loop counter */
      sample--;
    }

-    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-     ** No loop unrolling is used. */
-    sample = (blockSize & 0x3U);
-
-    while (sample > 0U)
-    {
-      /* Read the input */
-      Xn = *pIn++;
-
-      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
-      /* acc =  b0 * x[n] */
-      acc = (q63_t) Xn *b0;
-      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) Xn1 *b1;
-      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) Xn2 *b2;
-      /* acc +=  a1 * y[n-1] */
-      acc += mult32x64(Yn1, a1);
-      /* acc +=  a2 * y[n-2] */
-      acc += mult32x64(Yn2, a2);
-
-      /* Every time after the output is computed state should be updated. */
-      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc    */
-      Xn2 = Xn1;
-      Xn1 = Xn;
-      Yn2 = Yn1;
-      /* The result is converted to 1.63, Yn1 variable is reused  */
-      Yn1 = acc << shift;
-
-      /* Calc lower part of acc */
-      acc_l = acc & 0xffffffff;
-
-      /* Calc upper part of acc */
-      acc_h = (acc >> 32) & 0xffffffff;
-
-      /* Apply shift for lower part of acc and upper part of acc */
-      acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
-
-      /* Store the output in the destination buffer in 1.31 format. */
-      *pOut++ = acc_h;
-      /* Yn1 = acc << shift; */
-
-      /* Store the output in the destination buffer in 1.31 format. */
-/*      *pOut++ = (q31_t) (acc >> (32 - shift));  */
-
-      /* decrement the loop counter */
-      sample--;
-    }
-
-    /*  The first stage output is given as input to the second stage. */
-    pIn = pDst;
-
-    /* Reset to destination buffer working pointer */
-    pOut = pDst;
-
-    /*  Store the updated state variables back into the pState array */
-    /*  Store the updated state variables back into the pState array */
-    *pState++ = (q63_t) Xn1;
-    *pState++ = (q63_t) Xn2;
-    *pState++ = Yn1;
-    *pState++ = Yn2;
-
-  } while (--stage);
+    /* Loop unrolling: Compute remaining outputs */
+    sample = blockSize & 0x3U;

 #else

-  /* Run the below code for Cortex-M0 */
-
-  do
-  {
-    /* Reading the coefficients */
-    b0 = *pCoeffs++;
-    b1 = *pCoeffs++;
-    b2 = *pCoeffs++;
-    a1 = *pCoeffs++;
-    a2 = *pCoeffs++;
-
-    /* Reading the state values */
-    Xn1 = pState[0];
-    Xn2 = pState[1];
-    Yn1 = pState[2];
-    Yn2 = pState[3];
-
-    /* The variable acc hold output value that is being computed and
-     * stored in the destination buffer
-     * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
-     */
-
+    /* Initialize blkCnt with number of samples */
    sample = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
    while (sample > 0U)
    {
      /* Read the input */
      Xn = *pIn++;

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+
      /* acc =  b0 * x[n] */
-      acc = (q63_t) Xn *b0;
+      acc = (q63_t) Xn * b0;
      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) Xn1 *b1;
+      acc += (q63_t) Xn1 * b1;
      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) Xn2 *b2;
+      acc += (q63_t) Xn2 * b2;
      /* acc +=  a1 * y[n-1] */
      acc += mult32x64(Yn1, a1);
      /* acc +=  a2 * y[n-2] */
      acc += mult32x64(Yn2, a2);

      /* Every time after the output is computed state should be updated. */
-      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc    */
+      /* The states should be updated as: */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
      Xn2 = Xn1;
      Xn1 = Xn;
      Yn2 = Yn1;
@ -517,17 +428,16 @@ void arm_biquad_cas_df1_32x64_q31(

      /* Store the output in the destination buffer in 1.31 format. */
      *pOut++ = acc_h;
-
      /* Yn1 = acc << shift; */

      /* Store the output in the destination buffer in 1.31 format. */
-      /* *pOut++ = (q31_t) (acc >> (32 - shift)); */
+/*    *pOut++ = (q31_t) (acc >> (32 - shift));  */

-      /* decrement the loop counter */
+      /* decrement loop counter */
      sample--;
    }

-    /*  The first stage output is given as input to the second stage. */
+    /* The first stage output is given as input to the second stage. */
    pIn = pDst;

    /* Reset to destination buffer working pointer */
@ -541,9 +451,8 @@ void arm_biquad_cas_df1_32x64_q31(

  } while (--stage);

-#endif /*    #if defined (ARM_MATH_DSP)     */
 }

-  /**
-   * @} end of BiquadCascadeDF1_32x64 group
-   */
+/**
+  @} end of BiquadCascadeDF1_32x64 group
+ */
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_biquad_cascade_df1_f32.c
 * Description:  Processing function for the floating-point Biquad cascade DirectFormI(DF1) filter
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,159 +29,308 @@
 #include "arm_math.h"

 /**
- * @ingroup groupFilters
+  @ingroup groupFilters
 */

 /**
- * @defgroup BiquadCascadeDF1 Biquad Cascade IIR Filters Using Direct Form I Structure
- *
- * This set of functions implements arbitrary order recursive (IIR) filters.
- * The filters are implemented as a cascade of second order Biquad sections.
- * The functions support Q15, Q31 and floating-point data types.
- * Fast version of Q15 and Q31 also supported on CortexM4 and Cortex-M3.
- *
- * The functions operate on blocks of input and output data and each call to the function
- * processes <code>blockSize</code> samples through the filter.
- * <code>pSrc</code> points to the array of input data and
- * <code>pDst</code> points to the array of output data.
- * Both arrays contain <code>blockSize</code> values.
- *
- * \par Algorithm
- * Each Biquad stage implements a second order filter using the difference equation:
- * <pre>
- *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
- * </pre>
- * A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
- * \image html Biquad.gif "Single Biquad filter stage"
- * Coefficients <code>b0, b1 and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
- * Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
- * Pay careful attention to the sign of the feedback coefficients.
- * Some design tools use the difference equation
- * <pre>
- *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]
- * </pre>
- * In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
- *
- * \par
- * Higher order filters are realized as a cascade of second order sections.
- * <code>numStages</code> refers to the number of second order stages used.
- * For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
- * \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
- * A 9th order filter would be realized with <code>numStages=5</code> second order stages with the coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
- *
- * \par
- * The <code>pState</code> points to state variables array.
- * Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
- * The state variables are arranged in the <code>pState</code> array as:
- * <pre>
- *     {x[n-1], x[n-2], y[n-1], y[n-2]}
- * </pre>
- *
- * \par
- * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
- * The state array has a total length of <code>4*numStages</code> values.
- * The state variables are updated after each block of data is processed, the coefficients are untouched.
- *
- * \par Instance Structure
- * The coefficients and state variables for a filter are stored together in an instance data structure.
- * A separate instance structure must be defined for each filter.
- * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
- * There are separate instance structure declarations for each of the 3 supported data types.
- *
- * \par Init Functions
- * There is also an associated initialization function for each data type.
- * The initialization function performs following operations:
- * - Sets the values of the internal structure fields.
- * - Zeros out the values in the state buffer.
- * To do this manually without calling the init function, assign the follow subfields of the instance structure:
- * numStages, pCoeffs, pState. Also set all of the values in pState to zero.
- *
- * \par
- * Use of the initialization function is optional.
- * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
- * To place an instance structure into a const data section, the instance structure must be manually initialized.
- * Set the values in the state buffer to zeros before static initialization.
- * The code below statically initializes each of the 3 different data type filter instance structures
- * <pre>
- *     arm_biquad_casd_df1_inst_f32 S1 = {numStages, pState, pCoeffs};
- *     arm_biquad_casd_df1_inst_q15 S2 = {numStages, pState, pCoeffs, postShift};
- *     arm_biquad_casd_df1_inst_q31 S3 = {numStages, pState, pCoeffs, postShift};
- * </pre>
- * where <code>numStages</code> is the number of Biquad stages in the filter; <code>pState</code> is the address of the state buffer;
- * <code>pCoeffs</code> is the address of the coefficient buffer; <code>postShift</code> shift to be applied.
- *
- * \par Fixed-Point Behavior
- * Care must be taken when using the Q15 and Q31 versions of the Biquad Cascade filter functions.
- * Following issues must be considered:
- * - Scaling of coefficients
- * - Filter gain
- * - Overflow and saturation
- *
- * \par
- * <b>Scaling of coefficients: </b>
- * Filter coefficients are represented as fractional values and
- * coefficients are restricted to lie in the range <code>[-1 +1)</code>.
- * The fixed-point functions have an additional scaling parameter <code>postShift</code>
- * which allow the filter coefficients to exceed the range <code>[+1 -1)</code>.
- * At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
- * \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
- * This essentially scales the filter coefficients by <code>2^postShift</code>.
- * For example, to realize the coefficients
- * <pre>
- *    {1.5, -0.8, 1.2, 1.6, -0.9}
- * </pre>
- * set the pCoeffs array to:
- * <pre>
- *    {0.75, -0.4, 0.6, 0.8, -0.45}
- * </pre>
- * and set <code>postShift=1</code>
- *
- * \par
- * <b>Filter gain: </b>
- * The frequency response of a Biquad filter is a function of its coefficients.
- * It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies.
- * This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter.
- * To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed.
- *
- * \par
- * <b>Overflow and saturation: </b>
- * For Q15 and Q31 versions, it is described separately as part of the function specific documentation below.
+  @defgroup BiquadCascadeDF1 Biquad Cascade IIR Filters Using Direct Form I Structure
+
+  This set of functions implements arbitrary order recursive (IIR) filters.
+  The filters are implemented as a cascade of second order Biquad sections.
+  The functions support Q15, Q31 and floating-point data types.
+  Fast version of Q15 and Q31 also available.
+
+  The functions operate on blocks of input and output data and each call to the function
+  processes <code>blockSize</code> samples through the filter.
+  <code>pSrc</code> points to the array of input data and
+  <code>pDst</code> points to the array of output data.
+  Both arrays contain <code>blockSize</code> values.
+
+  @par           Algorithm
+                   Each Biquad stage implements a second order filter using the difference equation:
+  <pre>
+      y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+  </pre>
+                  A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
+                  \image html Biquad.gif "Single Biquad filter stage"
+                  Coefficients <code>b0, b1 and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
+                  Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
+                  Pay careful attention to the sign of the feedback coefficients.
+                  Some design tools use the difference equation
+  <pre>
+      y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]
+  </pre>
+                  In this case the feedback coefficients <code>a1</code> and <code>a2</code>
+                  must be negated when used with the CMSIS DSP Library.
+
+  @par
+                   Higher order filters are realized as a cascade of second order sections.
+                   <code>numStages</code> refers to the number of second order stages used.
+                   For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
+                   \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
+                   A 9th order filter would be realized with <code>numStages=5</code> second order stages with the coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
+
+  @par
+                   The <code>pState</code> points to state variables array.
+                   Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
+                   The state variables are arranged in the <code>pState</code> array as:
+  <pre>
+      {x[n-1], x[n-2], y[n-1], y[n-2]}
+  </pre>
+
+  @par
+                   The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+                   The state array has a total length of <code>4*numStages</code> values.
+                   The state variables are updated after each block of data is processed, the coefficients are untouched.
+
+  @par           Instance Structure
+                   The coefficients and state variables for a filter are stored together in an instance data structure.
+                   A separate instance structure must be defined for each filter.
+                   Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+                   There are separate instance structure declarations for each of the 3 supported data types.
+
+  @par           Init Function
+                   There is also an associated initialization function for each data type.
+                   The initialization function performs following operations:
+                   - Sets the values of the internal structure fields.
+                   - Zeros out the values in the state buffer.
+                   To do this manually without calling the init function, assign the follow subfields of the instance structure:
+                   numStages, pCoeffs, pState. Also set all of the values in pState to zero.
+
+  @par
+                   Use of the initialization function is optional.
+                   However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+                   To place an instance structure into a const data section, the instance structure must be manually initialized.
+                   Set the values in the state buffer to zeros before static initialization.
+                   The code below statically initializes each of the 3 different data type filter instance structures
+  <pre>
+      arm_biquad_casd_df1_inst_f32 S1 = {numStages, pState, pCoeffs};
+      arm_biquad_casd_df1_inst_q15 S2 = {numStages, pState, pCoeffs, postShift};
+      arm_biquad_casd_df1_inst_q31 S3 = {numStages, pState, pCoeffs, postShift};
+  </pre>
+                   where <code>numStages</code> is the number of Biquad stages in the filter;
+                   <code>pState</code> is the address of the state buffer;
+                   <code>pCoeffs</code> is the address of the coefficient buffer;
+                   <code>postShift</code> shift to be applied.
+
+  @par           Fixed-Point Behavior
+                   Care must be taken when using the Q15 and Q31 versions of the Biquad Cascade filter functions.
+                   Following issues must be considered:
+                   - Scaling of coefficients
+                   - Filter gain
+                   - Overflow and saturation
+
+  @par           Scaling of coefficients
+                   Filter coefficients are represented as fractional values and
+                   coefficients are restricted to lie in the range <code>[-1 +1)</code>.
+                   The fixed-point functions have an additional scaling parameter <code>postShift</code>
+                   which allow the filter coefficients to exceed the range <code>[+1 -1)</code>.
+                   At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
+                   \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
+                   This essentially scales the filter coefficients by <code>2^postShift</code>.
+                   For example, to realize the coefficients
+  <pre>
+     {1.5, -0.8, 1.2, 1.6, -0.9}
+  </pre>
+                   set the pCoeffs array to:
+  <pre>
+     {0.75, -0.4, 0.6, 0.8, -0.45}
+  </pre>
+                   and set <code>postShift=1</code>
+
+  @par           Filter gain
+                   The frequency response of a Biquad filter is a function of its coefficients.
+                   It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies.
+                   This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter.
+                   To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed.
+
+  @par           Overflow and saturation
+                   For Q15 and Q31 versions, it is described separately as part of the function specific documentation below.
 */

 /**
- * @addtogroup BiquadCascadeDF1
- * @{
+  @addtogroup BiquadCascadeDF1
+  @{
 */

 /**
- * @param[in]  *S         points to an instance of the floating-point Biquad cascade structure.
- * @param[in]  *pSrc      points to the block of input data.
- * @param[out] *pDst      points to the block of output data.
- * @param[in]  blockSize  number of samples to process per call.
- * @return     none.
- *
+  @brief         Processing function for the floating-point Biquad cascade filter.
+  @param[in]     S         points to an instance of the floating-point Biquad cascade structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize  number of samples to process
+  @return        none
 */

+#if defined(ARM_MATH_NEON) 
 void arm_biquad_cascade_df1_f32(
  const arm_biquad_casd_df1_inst_f32 * S,
-  float32_t * pSrc,
+  const float32_t * pSrc,
  float32_t * pDst,
  uint32_t blockSize)
 {
-  float32_t *pIn = pSrc;                         /*  source pointer            */
+
+  const float32_t *pIn = pSrc;                         /*  source pointer            */
  float32_t *pOut = pDst;                        /*  destination pointer       */
  float32_t *pState = S->pState;                 /*  pState pointer            */
-  float32_t *pCoeffs = S->pCoeffs;               /*  coefficient pointer       */
+  const float32_t *pCoeffs = S->pCoeffs;               /*  coefficient pointer       */
  float32_t acc;                                 /*  Simulates the accumulator */
-  float32_t b0, b1, b2, a1, a2;                  /*  Filter coefficients       */
-  float32_t Xn1, Xn2, Yn1, Yn2;                  /*  Filter pState variables   */
-  float32_t Xn;                                  /*  temporary input           */
+  
  uint32_t sample, stage = S->numStages;         /*  loop counters             */

+  float32x4_t Xn;
+  float32x2_t Yn;
+  float32x2_t a;
+  float32x4_t b;
+  
+  float32x4_t x,tmp;
+  float32x2_t t;
+  float32x2x2_t y;

-#if defined (ARM_MATH_DSP)
+  float32_t Xns;

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
+  while (stage > 0U)
+  {
+    /* Reading the coefficients */
+    Xn = vld1q_f32(pState);
+    Yn = vld1_f32(pState + 2);
+
+    b = vld1q_f32(pCoeffs);
+    b = vrev64q_f32(b);  
+    b = vcombine_f32(vget_high_f32(b), vget_low_f32(b));
+
+    a = vld1_f32(pCoeffs + 3);
+    a = vrev64_f32(a);
+    b[0] = 0.0;
+    pCoeffs += 5;
+    
+    /* Reading the pState values */
+   
+    /* Apply loop unrolling and compute 4 output values simultaneously. */
+    /*      The variable acc hold output values that are being computed:
+     *
+     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+     */
+
+    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+     ** a second loop below computes the remaining 1 to 3 samples. */
+    sample = blockSize >> 2U;
+
+    while (sample > 0U)
+    {
+      /* Read the first 4 inputs */
+      x = vld1q_f32(pIn);
+
+      pIn += 4;
+
+      tmp = vextq_f32(Xn, x, 1);
+      t = vmul_f32(vget_high_f32(b), vget_high_f32(tmp));
+      t = vmla_f32(t, vget_low_f32(b), vget_low_f32(tmp));
+      t = vmla_f32(t, a, Yn);
+      t = vpadd_f32(t, t);
+      Yn = vext_f32(Yn, t, 1);
+
+      tmp = vextq_f32(Xn, x, 2);
+      t = vmul_f32(vget_high_f32(b), vget_high_f32(tmp));
+      t = vmla_f32(t, vget_low_f32(b), vget_low_f32(tmp));
+      t = vmla_f32(t, a, Yn);
+      t = vpadd_f32(t, t);
+      Yn = vext_f32(Yn, t, 1);
+
+      y.val[0] = Yn;
+
+      tmp = vextq_f32(Xn, x, 3);
+      t = vmul_f32(vget_high_f32(b), vget_high_f32(tmp));
+      t = vmla_f32(t, vget_low_f32(b), vget_low_f32(tmp));
+      t = vmla_f32(t, a, Yn);
+      t = vpadd_f32(t, t);
+      Yn = vext_f32(Yn, t, 1);
+
+      Xn = x;
+      t = vmul_f32(vget_high_f32(b), vget_high_f32(Xn));
+      t = vmla_f32(t, vget_low_f32(b), vget_low_f32(Xn));
+      t = vmla_f32(t, a, Yn);
+      t = vpadd_f32(t, t);
+      Yn = vext_f32(Yn, t, 1);
+      
+      y.val[1] = Yn;
+
+      tmp = vcombine_f32(y.val[0], y.val[1]);
+
+      /* Store the 4 outputs and increment the pointer */
+      vst1q_f32(pOut, tmp);
+      pOut += 4;
+
+      /* Decrement the loop counter */
+      sample--;
+    }
+
+    /* If the block size is not a multiple of 4, compute any remaining output samples here.
+     ** No loop unrolling is used. */
+    sample = blockSize & 0x3U;
+
+    while (sample > 0U)
+    {
+      /* Read the input */
+      Xns = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      acc =  (b[1] * Xn[2]) + (b[2] * Xn[3]) + (b[3] * Xns) + (a[0] * Yn[0]) + (a[1] * Yn[1]);
+
+      /* Store the result in the accumulator in the destination buffer. */
+      *pOut++ = acc;
+
+      /* Every time after the output is computed state should be updated. */
+      /* The states should be updated as:    */
+      /* Xn2 = Xn1   */
+      /* Xn1 = Xn    */
+      /* Yn2 = Yn1   */
+      /* Yn1 = acc   */
+      Xn[2] = Xn[3];
+      Xn[3] = Xns;
+      Yn[0] = Yn[1];
+      Yn[1] = acc;
+
+      /* Decrement the loop counter */
+      sample--;
+
+    }
+
+    vst1q_f32(pState,vcombine_f32(vrev64_f32(vget_high_f32(Xn)),vrev64_f32(Yn)));
+    pState += 4;
+    /*  Store the updated state variables back into the pState array */
+   
+    /*  The first stage goes from the input buffer to the output buffer. */
+    /*  Subsequent numStages  occur in-place in the output buffer */
+    pIn = pDst;
+
+    /* Reset the output pointer */
+    pOut = pDst;
+
+    /* Decrement the loop counter */
+    stage--;
+  }
+}
+
+#else
+void arm_biquad_cascade_df1_f32(
+  const arm_biquad_casd_df1_inst_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+  const float32_t *pIn = pSrc;                         /* Source pointer */
+        float32_t *pOut = pDst;                        /* Destination pointer */
+        float32_t *pState = S->pState;                 /* pState pointer */
+  const float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+        float32_t acc;                                 /* Accumulator */
+        float32_t b0, b1, b2, a1, a2;                  /* Filter coefficients */
+        float32_t Xn1, Xn2, Yn1, Yn2;                  /* Filter pState variables */
+        float32_t Xn;                                  /* Temporary input */
+        uint32_t sample, stage = S->numStages;         /* Loop counters */

  do
  {
@ -198,19 +347,20 @@ void arm_biquad_cascade_df1_f32(
    Yn1 = pState[2];
    Yn2 = pState[3];

+#if defined (ARM_MATH_LOOPUNROLL)
+
    /* Apply loop unrolling and compute 4 output values simultaneously. */
-    /*      The variable acc hold output values that are being computed:
+    /* Variable acc hold output values that are being computed:
     *
-     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1]   + a2 * y[n-2]
-     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1]   + a2 * y[n-2]
-     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1]   + a2 * y[n-2]
-     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1]   + a2 * y[n-2]
+     * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+     * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+     * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+     * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
     */

+    /* Loop unrolling: Compute 4 outputs at a time */
    sample = blockSize >> 2U;

-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
    while (sample > 0U)
    {
      /* Read the first input */
@ -219,15 +369,15 @@ void arm_biquad_cascade_df1_f32(
      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      Yn2 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);

-      /* Store the result in the accumulator in the destination buffer. */
+      /* Store output in destination buffer. */
      *pOut++ = Yn2;

      /* Every time after the output is computed state should be updated. */
-      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc   */
+      /* The states should be updated as: */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */

      /* Read the second input */
      Xn2 = *pIn++;
@ -235,15 +385,15 @@ void arm_biquad_cascade_df1_f32(
      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      Yn1 = (b0 * Xn2) + (b1 * Xn) + (b2 * Xn1) + (a1 * Yn2) + (a2 * Yn1);

-      /* Store the result in the accumulator in the destination buffer. */
+      /* Store output in destination buffer. */
      *pOut++ = Yn1;

      /* Every time after the output is computed state should be updated. */
-      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc   */
+      /* The states should be updated as: */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */

      /* Read the third input */
      Xn1 = *pIn++;
@ -251,15 +401,15 @@ void arm_biquad_cascade_df1_f32(
      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      Yn2 = (b0 * Xn1) + (b1 * Xn2) + (b2 * Xn) + (a1 * Yn1) + (a2 * Yn2);

-      /* Store the result in the accumulator in the destination buffer. */
+      /* Store output in destination buffer. */
      *pOut++ = Yn2;

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as: */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc   */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */

      /* Read the forth input */
      Xn = *pIn++;
@ -267,97 +417,32 @@ void arm_biquad_cascade_df1_f32(
      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      Yn1 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn2) + (a2 * Yn1);

-      /* Store the result in the accumulator in the destination buffer. */
+      /* Store output in destination buffer. */
      *pOut++ = Yn1;

      /* Every time after the output is computed state should be updated. */
-      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc   */
+      /* The states should be updated as: */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
      Xn2 = Xn1;
      Xn1 = Xn;

-      /* decrement the loop counter */
+      /* decrement loop counter */
      sample--;
-
    }

-    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-     ** No loop unrolling is used. */
+    /* Loop unrolling: Compute remaining outputs */
    sample = blockSize & 0x3U;

-    while (sample > 0U)
-    {
-      /* Read the input */
-      Xn = *pIn++;
-
-      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-      acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
-
-      /* Store the result in the accumulator in the destination buffer. */
-      *pOut++ = acc;
-
-      /* Every time after the output is computed state should be updated. */
-      /* The states should be updated as:    */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc   */
-      Xn2 = Xn1;
-      Xn1 = Xn;
-      Yn2 = Yn1;
-      Yn1 = acc;
-
-      /* decrement the loop counter */
-      sample--;
-
-    }
-
-    /*  Store the updated state variables back into the pState array */
-    *pState++ = Xn1;
-    *pState++ = Xn2;
-    *pState++ = Yn1;
-    *pState++ = Yn2;
-
-    /*  The first stage goes from the input buffer to the output buffer. */
-    /*  Subsequent numStages  occur in-place in the output buffer */
-    pIn = pDst;
-
-    /* Reset the output pointer */
-    pOut = pDst;
-
-    /* decrement the loop counter */
-    stage--;
-
-  } while (stage > 0U);
-
 #else

-  /* Run the below code for Cortex-M0 */
-
-  do
-  {
-    /* Reading the coefficients */
-    b0 = *pCoeffs++;
-    b1 = *pCoeffs++;
-    b2 = *pCoeffs++;
-    a1 = *pCoeffs++;
-    a2 = *pCoeffs++;
-
-    /* Reading the pState values */
-    Xn1 = pState[0];
-    Xn2 = pState[1];
-    Yn1 = pState[2];
-    Yn2 = pState[3];
-
-    /*      The variables acc holds the output value that is computed:
-     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1]   + a2 * y[n-2]
-     */
-
+    /* Initialize blkCnt with number of samples */
    sample = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
    while (sample > 0U)
    {
      /* Read the input */
@ -366,47 +451,45 @@ void arm_biquad_cascade_df1_f32(
      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);

-      /* Store the result in the accumulator in the destination buffer. */
+      /* Store output in destination buffer. */
      *pOut++ = acc;

      /* Every time after the output is computed state should be updated. */
-      /* The states should be updated as:    */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc   */
+      /* The states should be updated as: */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
      Xn2 = Xn1;
      Xn1 = Xn;
      Yn2 = Yn1;
      Yn1 = acc;

-      /* decrement the loop counter */
+      /* decrement loop counter */
      sample--;
    }

-    /*  Store the updated state variables back into the pState array */
+    /* Store the updated state variables back into the pState array */
    *pState++ = Xn1;
    *pState++ = Xn2;
    *pState++ = Yn1;
    *pState++ = Yn2;

-    /*  The first stage goes from the input buffer to the output buffer. */
-    /*  Subsequent numStages  occur in-place in the output buffer */
+    /* The first stage goes from the input buffer to the output buffer. */
+    /* Subsequent numStages occur in-place in the output buffer */
    pIn = pDst;

-    /* Reset the output pointer */
+    /* Reset output pointer */
    pOut = pDst;

-    /* decrement the loop counter */
+    /* decrement loop counter */
    stage--;

  } while (stage > 0U);

-#endif /* #if defined (ARM_MATH_DSP) */
-
 }

-
-  /**
-   * @} end of BiquadCascadeDF1 group
-   */
+#endif /* #if defined(ARM_MATH_NEON) */
+/**
+  @} end of BiquadCascadeDF1 group
+ */
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_biquad_cascade_df1_fast_q15.c
 * Description:  Fast processing function for the Q15 Biquad cascade filter
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,90 +29,87 @@
 #include "arm_math.h"

 /**
- * @ingroup groupFilters
+  @ingroup groupFilters
 */

 /**
- * @addtogroup BiquadCascadeDF1
- * @{
+  @addtogroup BiquadCascadeDF1
+  @{
 */

 /**
- * @details
- * @param[in]  *S points to an instance of the Q15 Biquad cascade structure.
- * @param[in]  *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data.
- * @param[in]  blockSize number of samples to process per call.
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * This fast version uses a 32-bit accumulator with 2.30 format.
- * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * Thus, if the accumulator result overflows it wraps around and distorts the result.
- * In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25).
- * The 2.30 accumulator is then shifted by <code>postShift</code> bits and the result truncated to 1.15 format by discarding the low 16 bits.
- *
- * \par
- * Refer to the function <code>arm_biquad_cascade_df1_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.  Both the slow and the fast versions use the same instance structure.
- * Use the function <code>arm_biquad_cascade_df1_init_q15()</code> to initialize the filter structure.
- *
+  @brief         Processing function for the Q15 Biquad cascade filter (fast variant).
+  @param[in]     S         points to an instance of the Q15 Biquad cascade structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize number of samples to process per call
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   This fast version uses a 32-bit accumulator with 2.30 format.
+                   The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+                   Thus, if the accumulator result overflows it wraps around and distorts the result.
+                   In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25).
+                   The 2.30 accumulator is then shifted by <code>postShift</code> bits and the result truncated to 1.15 format by discarding the low 16 bits.
+ @remark
+                   Refer to \ref arm_biquad_cascade_df1_q15() for a slower implementation of this function
+                   which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure.
+                   Use the function \ref arm_biquad_cascade_df1_init_q15() to initialize the filter structure.
 */

 void arm_biquad_cascade_df1_fast_q15(
  const arm_biquad_casd_df1_inst_q15 * S,
-  q15_t * pSrc,
-  q15_t * pDst,
-  uint32_t blockSize)
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
 {
-  q15_t *pIn = pSrc;                             /*  Source pointer                               */
-  q15_t *pOut = pDst;                            /*  Destination pointer                          */
-  q31_t in;                                      /*  Temporary variable to hold input value       */
-  q31_t out;                                     /*  Temporary variable to hold output value      */
-  q31_t b0;                                      /*  Temporary variable to hold bo value          */
-  q31_t b1, a1;                                  /*  Filter coefficients                          */
-  q31_t state_in, state_out;                     /*  Filter state variables                       */
-  q31_t acc;                                     /*  Accumulator                                  */
-  int32_t shift = (int32_t) (15 - S->postShift); /*  Post shift                                   */
-  q15_t *pState = S->pState;                     /*  State pointer                                */
-  q15_t *pCoeffs = S->pCoeffs;                   /*  Coefficient pointer                          */
-  uint32_t sample, stage = S->numStages;         /*  Stage loop counter                           */
-
-
+  const q15_t *pIn = pSrc;                             /* Source pointer */
+        q15_t *pOut = pDst;                            /* Destination pointer */
+        q15_t *pState = S->pState;                     /* State pointer */
+  const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+        q31_t acc;                                     /* Accumulator */
+        q31_t in;                                      /* Temporary variable to hold input value */
+        q31_t out;                                     /* Temporary variable to hold output value */
+        q31_t b0;                                      /* Temporary variable to hold bo value */
+        q31_t b1, a1;                                  /* Filter coefficients */
+        q31_t state_in, state_out;                     /* Filter state variables */
+        int32_t shift = (int32_t) (15 - S->postShift); /* Post shift */
+        uint32_t sample, stage = S->numStages;         /* Loop counters */

  do
  {
-
    /* Read the b0 and 0 coefficients using SIMD  */
-    b0 = *__SIMD32(pCoeffs)++;
+    b0 = read_q15x2_ia ((q15_t **) &pCoeffs);

    /* Read the b1 and b2 coefficients using SIMD */
-    b1 = *__SIMD32(pCoeffs)++;
+    b1 = read_q15x2_ia ((q15_t **) &pCoeffs);

    /* Read the a1 and a2 coefficients using SIMD */
-    a1 = *__SIMD32(pCoeffs)++;
+    a1 = read_q15x2_ia ((q15_t **) &pCoeffs);

    /* Read the input state values from the state buffer:  x[n-1], x[n-2] */
-    state_in = *__SIMD32(pState)++;
+    state_in = read_q15x2_ia (&pState);

    /* Read the output state values from the state buffer:  y[n-1], y[n-2] */
-    state_out = *__SIMD32(pState)--;
+    state_out = read_q15x2_da (&pState);
+
+#if defined (ARM_MATH_LOOPUNROLL)

    /* Apply loop unrolling and compute 2 output values simultaneously. */
-    /*      The variable acc hold output values that are being computed:
+    /* Variable acc hold output values that are being computed:
     *
-     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
-     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+     * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+     * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
     */
+
+    /* Loop unrolling: Compute 2 outputs at a time */
    sample = blockSize >> 1U;

-    /* First part of the processing with loop unrolling.  Compute 2 outputs at a time.
-     ** a second loop below computes the remaining 1 sample. */
    while (sample > 0U)
    {

      /* Read the input */
-      in = *__SIMD32(pIn)++;
+      in = read_q15x2_ia ((q15_t **) &pIn);

      /* out =  b0 * x[n] + 0 * 0 */
      out = __SMUAD(b0, in);
@ -126,24 +123,20 @@ void arm_biquad_cascade_df1_fast_q15(

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc   */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
      /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
      /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */

 #ifndef  ARM_MATH_BIG_ENDIAN
-
-      state_in = __PKHBT(in, state_in, 16);
+      state_in  = __PKHBT(in, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
-
 #else
-
-      state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
+      state_in  = __PKHBT(state_in >> 16, (in >> 16), 16);
      state_out = __PKHBT(state_out >> 16, (out), 16);
-
-#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
+#endif /* #ifndef  ARM_MATH_BIG_ENDIAN */

      /* out =  b0 * x[n] + 0 * 0 */
      out = __SMUADX(b0, in);
@ -155,64 +148,53 @@ void arm_biquad_cascade_df1_fast_q15(
      /* The result is converted from 3.29 to 1.31 and then saturation is applied */
      out = __SSAT((acc >> shift), 16);

-
      /* Store the output in the destination buffer. */
-
 #ifndef  ARM_MATH_BIG_ENDIAN
-
-      *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16);
-
+      write_q15x2_ia (&pOut, __PKHBT(state_out, out, 16));
 #else
-
-      *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16);
-
+      write_q15x2_ia (&pOut, __PKHBT(out, state_out >> 16, 16));
 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc   */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
      /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
      /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
-
 #ifndef  ARM_MATH_BIG_ENDIAN
-
-      state_in = __PKHBT(in >> 16, state_in, 16);
+      state_in  = __PKHBT(in >> 16, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
+#else
+      state_in  = __PKHBT(state_in >> 16, in, 16);
+      state_out = __PKHBT(state_out >> 16, out, 16);
+#endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
+
+      /* Decrement loop counter */
+      sample--;
+    }
+
+    /* Loop unrolling: Compute remaining outputs */
+    sample = (blockSize & 0x1U);

 #else

-      state_in = __PKHBT(state_in >> 16, in, 16);
-      state_out = __PKHBT(state_out >> 16, out, 16);
+    /* Initialize blkCnt with number of samples */
+    sample = blockSize;

-#endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

-
-      /* Decrement the loop counter */
-      sample--;
-
-    }
-
-    /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
-     ** No loop unrolling is used. */
-
-    if ((blockSize & 0x1U) != 0U)
+    while (sample > 0U)
    {
      /* Read the input */
      in = *pIn++;

      /* out =  b0 * x[n] + 0 * 0 */
-
 #ifndef  ARM_MATH_BIG_ENDIAN
-
      out = __SMUAD(b0, in);
-
 #else
-
      out = __SMUADX(b0, in);
-
 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */

      /* acc =  b1 * x[n-1], acc +=  b2 * x[n-2] + out */
@ -228,46 +210,41 @@ void arm_biquad_cascade_df1_fast_q15(

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc   */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
      /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
      /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
-
 #ifndef  ARM_MATH_BIG_ENDIAN
-
      state_in = __PKHBT(in, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
-
 #else
-
      state_in = __PKHBT(state_in >> 16, in, 16);
      state_out = __PKHBT(state_out >> 16, out, 16);
-
 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */

+      /* decrement loop counter */
+      sample--;
    }

-    /*  The first stage goes from the input buffer to the output buffer.  */
-    /*  Subsequent (numStages - 1) occur in-place in the output buffer  */
+    /* The first stage goes from the input buffer to the output buffer. */
+    /* Subsequent (numStages - 1) occur in-place in the output buffer */
    pIn = pDst;

    /* Reset the output pointer */
    pOut = pDst;

-    /*  Store the updated state variables back into the state array */
-    *__SIMD32(pState)++ = state_in;
-    *__SIMD32(pState)++ = state_out;
+    /* Store the updated state variables back into the state array */
+    write_q15x2_ia(&pState, state_in);
+    write_q15x2_ia(&pState, state_out);

-
-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    stage--;

  } while (stage > 0U);
 }

-
 /**
- * @} end of BiquadCascadeDF1 group
+  @} end of BiquadCascadeDF1 group
 */
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_biquad_cascade_df1_fast_q31.c
 * Description:  Processing function for the Q31 Fast Biquad cascade DirectFormI(DF1) filter
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,55 +29,52 @@
 #include "arm_math.h"

 /**
- * @ingroup groupFilters
+  @ingroup groupFilters
 */

 /**
- * @addtogroup BiquadCascadeDF1
- * @{
+  @addtogroup BiquadCascadeDF1
+  @{
 */

 /**
- * @details
- *
- * @param[in]  *S        points to an instance of the Q31 Biquad cascade structure.
- * @param[in]  *pSrc     points to the block of input data.
- * @param[out] *pDst     points to the block of output data.
- * @param[in]  blockSize number of samples to process per call.
- * @return     none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
- * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
- * These intermediate results are added to a 2.30 accumulator.
- * Finally, the accumulator is saturated and converted to a 1.31 result.
- * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
- * In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25). Use the intialization function
- * arm_biquad_cascade_df1_init_q31() to initialize filter structure.
- *
- * \par
- * Refer to the function <code>arm_biquad_cascade_df1_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.  Both the slow and the fast versions use the same instance structure.
- * Use the function <code>arm_biquad_cascade_df1_init_q31()</code> to initialize the filter structure.
+  @brief         Processing function for the Q31 Biquad cascade filter (fast variant).
+  @param[in]     S         points to an instance of the Q31 Biquad cascade structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize number of samples to process per call
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   This function is optimized for speed at the expense of fixed-point precision and overflow protection.
+                   The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
+                   These intermediate results are added to a 2.30 accumulator.
+                   Finally, the accumulator is saturated and converted to a 1.31 result.
+                   The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
+                   In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25). Use the intialization function
+                   arm_biquad_cascade_df1_init_q31() to initialize filter structure.
+  @remark
+                   Refer to \ref arm_biquad_cascade_df1_q31() for a slower implementation of this function
+                   which uses 64-bit accumulation to provide higher precision. Both the slow and the fast versions use the same instance structure.
+                   Use the function \ref arm_biquad_cascade_df1_init_q31() to initialize the filter structure.
 */

 void arm_biquad_cascade_df1_fast_q31(
  const arm_biquad_casd_df1_inst_q31 * S,
-  q31_t * pSrc,
-  q31_t * pDst,
-  uint32_t blockSize)
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
 {
-  q31_t acc = 0;                                 /*  accumulator                   */
-  q31_t Xn1, Xn2, Yn1, Yn2;                      /*  Filter state variables        */
-  q31_t b0, b1, b2, a1, a2;                      /*  Filter coefficients           */
-  q31_t *pIn = pSrc;                             /*  input pointer initialization  */
-  q31_t *pOut = pDst;                            /*  output pointer initialization */
-  q31_t *pState = S->pState;                     /*  pState pointer initialization */
-  q31_t *pCoeffs = S->pCoeffs;                   /*  coeff pointer initialization  */
-  q31_t Xn;                                      /*  temporary input               */
-  int32_t shift = (int32_t) S->postShift + 1;    /*  Shift to be applied to the output */
-  uint32_t sample, stage = S->numStages;         /*  loop counters                     */
-
+  const q31_t *pIn = pSrc;                             /* Source pointer */
+        q31_t *pOut = pDst;                            /* Destination pointer */
+        q31_t *pState = S->pState;                     /* pState pointer */
+  const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+        q31_t acc = 0;                                 /* Accumulator */
+        q31_t b0, b1, b2, a1, a2;                      /* Filter coefficients */
+        q31_t Xn1, Xn2, Yn1, Yn2;                      /* Filter pState variables */
+        q31_t Xn;                                      /* Temporary input */
+        int32_t shift = (int32_t) S->postShift + 1;    /* Shift to be applied to the output */
+        uint32_t sample, stage = S->numStages;         /* Loop counters */

  do
  {
@ -88,22 +85,23 @@ void arm_biquad_cascade_df1_fast_q31(
    a1 = *pCoeffs++;
    a2 = *pCoeffs++;

-    /* Reading the state values */
+    /* Reading the pState values */
    Xn1 = pState[0];
    Xn2 = pState[1];
    Yn1 = pState[2];
    Yn2 = pState[3];

+#if defined (ARM_MATH_LOOPUNROLL)
+
    /* Apply loop unrolling and compute 4 output values simultaneously. */
-    /*      The variables acc ... acc3 hold output values that are being computed:
+    /* Variables acc ... acc3 hold output values that are being computed:
     *
-     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+     * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
     */

+    /* Loop unrolling: Compute 4 outputs at a time */
    sample = blockSize >> 2U;

-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
    while (sample > 0U)
    {
      /* Read the input */
@ -111,19 +109,19 @@ void arm_biquad_cascade_df1_fast_q31(

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
-      /*acc = (q31_t) (((q63_t) b1 * Xn1) >> 32);*/
+      /* acc = (q31_t) (((q63_t) b1 * Xn1) >> 32);*/
      mult_32x32_keep32_R(acc, b1, Xn1);
      /* acc +=  b1 * x[n-1] */
-      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b0 * (Xn))) >> 32);*/
+      /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b0 * (Xn))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b0, Xn);
      /* acc +=  b[2] * x[n-2] */
-      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
+      /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b2, Xn2);
      /* acc +=  a1 * y[n-1] */
-      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
+      /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a1, Yn1);
      /* acc +=  a2 * y[n-2] */
-      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
+      /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a2, Yn2);

      /* The result is converted to 1.31 , Yn2 variable is reused */
@ -137,19 +135,19 @@ void arm_biquad_cascade_df1_fast_q31(

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
-      /*acc = (q31_t) (((q63_t) b0 * (Xn2)) >> 32);*/
+      /* acc = (q31_t) (((q63_t) b0 * (Xn2)) >> 32);*/
      mult_32x32_keep32_R(acc, b0, Xn2);
      /* acc +=  b1 * x[n-1] */
-      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn))) >> 32);*/
+      /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b1, Xn);
      /* acc +=  b[2] * x[n-2] */
-      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn1))) >> 32);*/
+      /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn1))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b2, Xn1);
      /* acc +=  a1 * y[n-1] */
-      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32);*/
+      /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a1, Yn2);
      /* acc +=  a2 * y[n-2] */
-      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32);*/
+      /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a2, Yn1);

      /* The result is converted to 1.31, Yn1 variable is reused  */
@ -163,19 +161,19 @@ void arm_biquad_cascade_df1_fast_q31(

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
-      /*acc = (q31_t) (((q63_t) b0 * (Xn1)) >> 32);*/
+      /* acc = (q31_t) (((q63_t) b0 * (Xn1)) >> 32);*/
      mult_32x32_keep32_R(acc, b0, Xn1);
      /* acc +=  b1 * x[n-1] */
-      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn2))) >> 32);*/
+      /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn2))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b1, Xn2);
      /* acc +=  b[2] * x[n-2] */
-      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn))) >> 32);*/
+      /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b2, Xn);
      /* acc +=  a1 * y[n-1] */
-      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
+      /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a1, Yn1);
      /* acc +=  a2 * y[n-2] */
-      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
+      /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a2, Yn2);

      /* The result is converted to 1.31, Yn2 variable is reused  */
@ -190,7 +188,7 @@ void arm_biquad_cascade_df1_fast_q31(

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
-      /*acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
+      /* acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
      mult_32x32_keep32_R(acc, b0, Xn);
      /* acc +=  b1 * x[n-1] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/
@ -207,47 +205,53 @@ void arm_biquad_cascade_df1_fast_q31(

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
+      /* Xn2 = Xn1 */
      Xn2 = Xn1;

      /* The result is converted to 1.31, Yn1 variable is reused  */
      Yn1 = acc << shift;

-      /* Xn1 = Xn     */
+      /* Xn1 = Xn */
      Xn1 = Xn;

      /* Store the output in the destination buffer. */
      *(pOut + 3U) = Yn1;
      pOut += 4U;

-      /* decrement the loop counter */
+      /* decrement loop counter */
      sample--;
    }

-    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-     ** No loop unrolling is used. */
+    /* Loop unrolling: Compute remaining outputs */
    sample = (blockSize & 0x3U);

-   while (sample > 0U)
-   {
+#else
+
+    /* Initialize blkCnt with number of samples */
+    sample = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (sample > 0U)
+    {
      /* Read the input */
      Xn = *pIn++;

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
-      /*acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
+      /* acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
      mult_32x32_keep32_R(acc, b0, Xn);
      /* acc +=  b1 * x[n-1] */
-      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/
+      /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b1, Xn1);
      /* acc +=  b[2] * x[n-2] */
-      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
+      /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b2, Xn2);
      /* acc +=  a1 * y[n-1] */
-      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
+      /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a1, Yn1);
      /* acc +=  a2 * y[n-2] */
-      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
+      /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a2, Yn2);

      /* The result is converted to 1.31  */
@ -255,10 +259,10 @@ void arm_biquad_cascade_df1_fast_q31(

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc    */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
      Xn2 = Xn1;
      Xn1 = Xn;
      Yn2 = Yn1;
@ -267,18 +271,18 @@ void arm_biquad_cascade_df1_fast_q31(
      /* Store the output in the destination buffer. */
      *pOut++ = acc;

-      /* decrement the loop counter */
+      /* decrement loop counter */
      sample--;
-   }
+    }

-    /*  The first stage goes from the input buffer to the output buffer. */
-    /*  Subsequent stages occur in-place in the output buffer */
+    /* The first stage goes from the input buffer to the output buffer. */
+    /* Subsequent stages occur in-place in the output buffer */
    pIn = pDst;

    /* Reset to destination pointer */
    pOut = pDst;

-    /*  Store the updated state variables back into the pState array */
+    /* Store the updated state variables back into the pState array */
    *pState++ = Xn1;
    *pState++ = Xn2;
    *pState++ = Yn1;
@ -288,5 +292,5 @@ void arm_biquad_cascade_df1_fast_q31(
 }

 /**
-  * @} end of BiquadCascadeDF1 group
-  */
+  @} end of BiquadCascadeDF1 group
+ */
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_biquad_cascade_df1_init_f32.c
 * Description:  Floating-point Biquad cascade DirectFormI(DF1) filter initialization function
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,55 +29,49 @@
 #include "arm_math.h"

 /**
- * @ingroup groupFilters
+  @ingroup groupFilters
 */

 /**
- * @addtogroup BiquadCascadeDF1
- * @{
+  @addtogroup BiquadCascadeDF1
+  @{
 */

 /**
- * @details
- * @brief  Initialization function for the floating-point Biquad cascade filter.
- * @param[in,out] *S           points to an instance of the floating-point Biquad cascade structure.
- * @param[in]     numStages    number of 2nd order stages in the filter.
- * @param[in]     *pCoeffs     points to the filter coefficients array.
- * @param[in]     *pState      points to the state array.
- * @return        none
- *
- *
- * <b>Coefficient and State Ordering:</b>
- *
- * \par
- * The coefficients are stored in the array <code>pCoeffs</code> in the following order:
- * <pre>
- *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
- * </pre>
- *
- * \par
- * where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
- * <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
- * and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
- *
- * \par
- * The <code>pState</code> is a pointer to state array.
- * Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
- * The state variables are arranged in the <code>pState</code> array as:
- * <pre>
- *     {x[n-1], x[n-2], y[n-1], y[n-2]}
- * </pre>
- * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
- * The state array has a total length of <code>4*numStages</code> values.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
- *
+  @brief         Initialization function for the floating-point Biquad cascade filter.
+  @param[in,out] S           points to an instance of the floating-point Biquad cascade structure.
+  @param[in]     numStages   number of 2nd order stages in the filter.
+  @param[in]     pCoeffs     points to the filter coefficients.
+  @param[in]     pState      points to the state buffer.
+  @return        none
+
+  @par           Coefficient and State Ordering
+                   The coefficients are stored in the array <code>pCoeffs</code> in the following order:
+  <pre>
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  </pre>
+
+  @par
+                   where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
+                   <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
+                   and so on. The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
+  @par
+                   The <code>pState</code> is a pointer to state array.
+                   Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
+                   The state variables are arranged in the <code>pState</code> array as:
+  <pre>
+      {x[n-1], x[n-2], y[n-1], y[n-2]}
+  </pre>
+                   The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+                   The state array has a total length of <code>4*numStages</code> values.
+                   The state variables are updated after each block of data is processed; the coefficients are untouched.
 */

 void arm_biquad_cascade_df1_init_f32(
-  arm_biquad_casd_df1_inst_f32 * S,
-  uint8_t numStages,
-  float32_t * pCoeffs,
-  float32_t * pState)
+        arm_biquad_casd_df1_inst_f32 * S,
+        uint8_t numStages,
+  const float32_t * pCoeffs,
+        float32_t * pState)
 {
  /* Assign filter stages */
  S->numStages = numStages;
@ -93,5 +87,5 @@ void arm_biquad_cascade_df1_init_f32(
 }

 /**
- * @} end of BiquadCascadeDF1 group
+  @} end of BiquadCascadeDF1 group
 */
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_biquad_cascade_df1_init_q15.c
 * Description:  Q15 Biquad cascade DirectFormI(DF1) filter initialization function
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,54 +29,51 @@
 #include "arm_math.h"

 /**
- * @ingroup groupFilters
+  @ingroup groupFilters
 */

 /**
- * @addtogroup BiquadCascadeDF1
- * @{
+  @addtogroup BiquadCascadeDF1
+  @{
 */

 /**
- * @details
- *
- * @param[in,out] *S           points to an instance of the Q15 Biquad cascade structure.
- * @param[in]     numStages    number of 2nd order stages in the filter.
- * @param[in]     *pCoeffs     points to the filter coefficients.
- * @param[in]     *pState      points to the state buffer.
- * @param[in]     postShift    Shift to be applied to the accumulator result. Varies according to the coefficients format
- * @return        none
- *
- * <b>Coefficient and State Ordering:</b>
- *
- * \par
- * The coefficients are stored in the array <code>pCoeffs</code> in the following order:
- * <pre>
- *     {b10, 0, b11, b12, a11, a12, b20, 0, b21, b22, a21, a22, ...}
- * </pre>
- * where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
- * <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
- * and so on.  The <code>pCoeffs</code> array contains a total of <code>6*numStages</code> values.
- * The zero coefficient between <code>b1</code> and <code>b2</code> facilities  use of 16-bit SIMD instructions on the Cortex-M4.
- *
- * \par
- * The state variables are stored in the array <code>pState</code>.
- * Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
- * The state variables are arranged in the <code>pState</code> array as:
- * <pre>
- *     {x[n-1], x[n-2], y[n-1], y[n-2]}
- * </pre>
- * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
- * The state array has a total length of <code>4*numStages</code> values.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
+  @brief         Initialization function for the Q15 Biquad cascade filter.
+  @param[in,out] S           points to an instance of the Q15 Biquad cascade structure.
+  @param[in]     numStages   number of 2nd order stages in the filter.
+  @param[in]     pCoeffs     points to the filter coefficients.
+  @param[in]     pState      points to the state buffer.
+  @param[in]     postShift   Shift to be applied to the accumulator result. Varies according to the coefficients format
+  @return        none
+
+  @par           Coefficient and State Ordering
+                   The coefficients are stored in the array <code>pCoeffs</code> in the following order:
+  <pre>
+      {b10, 0, b11, b12, a11, a12, b20, 0, b21, b22, a21, a22, ...}
+  </pre>
+  @par
+                   where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
+                   <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
+                   and so on.  The <code>pCoeffs</code> array contains a total of <code>6*numStages</code> values.
+                   The zero coefficient between <code>b1</code> and <code>b2</code> facilities  use of 16-bit SIMD instructions on the Cortex-M4.
+  @par
+                   The state variables are stored in the array <code>pState</code>.
+                   Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
+                   The state variables are arranged in the <code>pState</code> array as:
+  <pre>
+      {x[n-1], x[n-2], y[n-1], y[n-2]}
+  </pre>
+                   The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+                   The state array has a total length of <code>4*numStages</code> values.
+                   The state variables are updated after each block of data is processed; the coefficients are untouched.
 */

 void arm_biquad_cascade_df1_init_q15(
-  arm_biquad_casd_df1_inst_q15 * S,
-  uint8_t numStages,
-  q15_t * pCoeffs,
-  q15_t * pState,
-  int8_t postShift)
+        arm_biquad_casd_df1_inst_q15 * S,
+        uint8_t numStages,
+  const q15_t * pCoeffs,
+        q15_t * pState,
+        int8_t postShift)
 {
  /* Assign filter stages */
  S->numStages = numStages;
@ -95,5 +92,5 @@ void arm_biquad_cascade_df1_init_q15(
 }

 /**
- * @} end of BiquadCascadeDF1 group
+  @} end of BiquadCascadeDF1 group
 */
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_biquad_cascade_df1_init_q31.c
 * Description:  Q31 Biquad cascade DirectFormI(DF1) filter initialization function
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,53 +29,50 @@
 #include "arm_math.h"

 /**
- * @ingroup groupFilters
+  @ingroup groupFilters
 */

 /**
- * @addtogroup BiquadCascadeDF1
- * @{
+  @addtogroup BiquadCascadeDF1
+  @{
 */

 /**
- * @details
- *
- * @param[in,out] *S           points to an instance of the Q31 Biquad cascade structure.
- * @param[in]     numStages    number of 2nd order stages in the filter.
- * @param[in]     *pCoeffs     points to the filter coefficients buffer.
- * @param[in]     *pState      points to the state buffer.
- * @param[in]     postShift    Shift to be applied after the accumulator.  Varies according to the coefficients format
- * @return        none
- *
- * <b>Coefficient and State Ordering:</b>
- *
- * \par
- * The coefficients are stored in the array <code>pCoeffs</code> in the following order:
- * <pre>
- *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
- * </pre>
- * where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
- * <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
- * and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
- *
- * \par
- * The <code>pState</code> points to state variables array.
- * Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
- * The state variables are arranged in the <code>pState</code> array as:
- * <pre>
- *     {x[n-1], x[n-2], y[n-1], y[n-2]}
- * </pre>
- * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
- * The state array has a total length of <code>4*numStages</code> values.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
+  @brief         Initialization function for the Q31 Biquad cascade filter.
+  @param[in,out] S           points to an instance of the Q31 Biquad cascade structure.
+  @param[in]     numStages   number of 2nd order stages in the filter.
+  @param[in]     pCoeffs     points to the filter coefficients.
+  @param[in]     pState      points to the state buffer.
+  @param[in]     postShift   Shift to be applied after the accumulator.  Varies according to the coefficients format
+  @return        none
+
+  @par           Coefficient and State Ordering
+                   The coefficients are stored in the array <code>pCoeffs</code> in the following order:
+  <pre>
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  </pre>
+  @par
+                   where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
+                   <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
+                   and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
+  @par
+                   The <code>pState</code> points to state variables array.
+                   Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
+                   The state variables are arranged in the <code>pState</code> array as:
+  <pre>
+      {x[n-1], x[n-2], y[n-1], y[n-2]}
+  </pre>
+                   The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+                   The state array has a total length of <code>4*numStages</code> values.
+                   The state variables are updated after each block of data is processed; the coefficients are untouched.
 */

 void arm_biquad_cascade_df1_init_q31(
-  arm_biquad_casd_df1_inst_q31 * S,
-  uint8_t numStages,
-  q31_t * pCoeffs,
-  q31_t * pState,
-  int8_t postShift)
+        arm_biquad_casd_df1_inst_q31 * S,
+        uint8_t numStages,
+  const q31_t * pCoeffs,
+        q31_t * pState,
+        int8_t postShift)
 {
  /* Assign filter stages */
  S->numStages = numStages;
@ -94,5 +91,5 @@ void arm_biquad_cascade_df1_init_q31(
 }

 /**
- * @} end of BiquadCascadeDF1 group
+  @} end of BiquadCascadeDF1 group
 */
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
@ -3,13 +3,13 @@
 * Title:        arm_biquad_cascade_df1_q15.c
 * Description:  Processing function for the Q15 Biquad cascade DirectFormI(DF1) filter
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,79 +29,74 @@
 #include "arm_math.h"

 /**
- * @ingroup groupFilters
+  @ingroup groupFilters
 */

 /**
- * @addtogroup BiquadCascadeDF1
- * @{
+  @addtogroup BiquadCascadeDF1
+  @{
 */

 /**
- * @brief Processing function for the Q15 Biquad cascade filter.
- * @param[in]  *S points to an instance of the Q15 Biquad cascade structure.
- * @param[in]  *pSrc points to the block of input data.
- * @param[out] *pDst points to the location where the output result is written.
- * @param[in]  blockSize number of samples to process per call.
- * @return none.
- *
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
- * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
- * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
- * The accumulator is then shifted by <code>postShift</code> bits to truncate the result to 1.15 format by discarding the low 16 bits.
- * Finally, the result is saturated to 1.15 format.
- *
- * \par
- * Refer to the function <code>arm_biquad_cascade_df1_fast_q15()</code> for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4.
+  @brief         Processing function for the Q15 Biquad cascade filter.
+  @param[in]     S         points to an instance of the Q15 Biquad cascade structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the location where the output result is written
+  @param[in]     blockSize number of samples to process
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function is implemented using a 64-bit internal accumulator.
+                   Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+                   The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+                   There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+                   The accumulator is then shifted by <code>postShift</code> bits to truncate the result to 1.15 format by discarding the low 16 bits.
+                   Finally, the result is saturated to 1.15 format.
+  @remark
+                   Refer to \ref arm_biquad_cascade_df1_fast_q15() for a faster but less precise implementation of this filter.
 */

 void arm_biquad_cascade_df1_q15(
  const arm_biquad_casd_df1_inst_q15 * S,
-  q15_t * pSrc,
-  q15_t * pDst,
-  uint32_t blockSize)
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
 {


 #if defined (ARM_MATH_DSP)

-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-
-  q15_t *pIn = pSrc;                             /*  Source pointer                               */
-  q15_t *pOut = pDst;                            /*  Destination pointer                          */
-  q31_t in;                                      /*  Temporary variable to hold input value       */
-  q31_t out;                                     /*  Temporary variable to hold output value      */
-  q31_t b0;                                      /*  Temporary variable to hold bo value          */
-  q31_t b1, a1;                                  /*  Filter coefficients                          */
-  q31_t state_in, state_out;                     /*  Filter state variables                       */
-  q31_t acc_l, acc_h;
-  q63_t acc;                                     /*  Accumulator                                  */
-  int32_t lShift = (15 - (int32_t) S->postShift);       /*  Post shift                                   */
-  q15_t *pState = S->pState;                     /*  State pointer                                */
-  q15_t *pCoeffs = S->pCoeffs;                   /*  Coefficient pointer                          */
-  uint32_t sample, stage = (uint32_t) S->numStages;     /*  Stage loop counter                           */
-  int32_t uShift = (32 - lShift);
+  const q15_t *pIn = pSrc;                             /* Source pointer */
+        q15_t *pOut = pDst;                            /* Destination pointer */
+        q31_t in;                                      /* Temporary variable to hold input value */
+        q31_t out;                                     /* Temporary variable to hold output value */
+        q31_t b0;                                      /* Temporary variable to hold bo value */
+        q31_t b1, a1;                                  /* Filter coefficients */
+        q31_t state_in, state_out;                     /* Filter state variables */
+        q31_t acc_l, acc_h;
+        q63_t acc;                                     /* Accumulator */
+        q15_t *pState = S->pState;                     /* State pointer */
+  const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+        int32_t lShift = (15 - (int32_t) S->postShift);       /* Post shift */
+        uint32_t sample, stage = (uint32_t) S->numStages;     /* Stage loop counter */
+        int32_t uShift = (32 - lShift);

  do
  {
    /* Read the b0 and 0 coefficients using SIMD  */
-    b0 = *__SIMD32(pCoeffs)++;
+    b0 = read_q15x2_ia ((q15_t **) &pCoeffs);

    /* Read the b1 and b2 coefficients using SIMD */
-    b1 = *__SIMD32(pCoeffs)++;
+    b1 = read_q15x2_ia ((q15_t **) &pCoeffs);

    /* Read the a1 and a2 coefficients using SIMD */
-    a1 = *__SIMD32(pCoeffs)++;
+    a1 = read_q15x2_ia ((q15_t **) &pCoeffs);

    /* Read the input state values from the state buffer:  x[n-1], x[n-2] */
-    state_in = *__SIMD32(pState)++;
+    state_in = read_q15x2_ia (&pState);

    /* Read the output state values from the state buffer:  y[n-1], y[n-2] */
-    state_out = *__SIMD32(pState)--;
+    state_out = read_q15x2_da (&pState);

    /* Apply loop unrolling and compute 2 output values simultaneously. */
    /*      The variable acc hold output values that are being computed:
@ -117,7 +112,7 @@ void arm_biquad_cascade_df1_q15(
    {

      /* Read the input */
-      in = *__SIMD32(pIn)++;
+      in = read_q15x2_ia ((q15_t **) &pIn);

      /* out =  b0 * x[n] + 0 * 0 */
      out = __SMUAD(b0, in);
@ -141,23 +136,19 @@ void arm_biquad_cascade_df1_q15(

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc   */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
      /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
      /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */

 #ifndef  ARM_MATH_BIG_ENDIAN
-
-      state_in = __PKHBT(in, state_in, 16);
+      state_in  = __PKHBT(in, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
-
 #else
-
-      state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
+      state_in  = __PKHBT(state_in >> 16, (in >> 16), 16);
      state_out = __PKHBT(state_out >> 16, (out), 16);
-
 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */

      /* out =  b0 * x[n] + 0 * 0 */
@ -180,41 +171,30 @@ void arm_biquad_cascade_df1_q15(
      out = __SSAT(out, 16);

      /* Store the output in the destination buffer. */
-
 #ifndef  ARM_MATH_BIG_ENDIAN
-
-      *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16);
-
+      write_q15x2_ia (&pOut, __PKHBT(state_out, out, 16));
 #else
-
-      *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16);
-
+      write_q15x2_ia (&pOut, __PKHBT(out, state_out >> 16, 16));
 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc   */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
      /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
      /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
 #ifndef  ARM_MATH_BIG_ENDIAN
-
-      state_in = __PKHBT(in >> 16, state_in, 16);
+      state_in  = __PKHBT(in >> 16, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
-
 #else
-
-      state_in = __PKHBT(state_in >> 16, in, 16);
+      state_in  = __PKHBT(state_in >> 16, in, 16);
      state_out = __PKHBT(state_out >> 16, out, 16);
-
 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */

-
-      /* Decrement the loop counter */
+      /* Decrement loop counter */
      sample--;
-
    }

    /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
@ -226,15 +206,10 @@ void arm_biquad_cascade_df1_q15(
      in = *pIn++;

      /* out =  b0 * x[n] + 0 * 0 */
-
 #ifndef  ARM_MATH_BIG_ENDIAN
-
      out = __SMUAD(b0, in);
-
 #else
-
      out = __SMUADX(b0, in);
-
 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */

      /* acc =  b1 * x[n-1] + b2 * x[n-2] + out */
@ -259,58 +234,49 @@ void arm_biquad_cascade_df1_q15(

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc   */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
      /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
      /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
-
 #ifndef  ARM_MATH_BIG_ENDIAN
-
      state_in = __PKHBT(in, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
-
 #else
-
      state_in = __PKHBT(state_in >> 16, in, 16);
      state_out = __PKHBT(state_out >> 16, out, 16);
-
 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
-
    }

-    /*  The first stage goes from the input wire to the output wire.  */
-    /*  Subsequent numStages occur in-place in the output wire  */
+    /* The first stage goes from the input wire to the output wire.  */
+    /* Subsequent numStages occur in-place in the output wire  */
    pIn = pDst;

    /* Reset the output pointer */
    pOut = pDst;

-    /*  Store the updated state variables back into the state array */
-    *__SIMD32(pState)++ = state_in;
-    *__SIMD32(pState)++ = state_out;
+    /* Store the updated state variables back into the state array */
+    write_q15x2_ia (&pState, state_in);
+    write_q15x2_ia (&pState, state_out);

-
-    /* Decrement the loop counter */
+    /* Decrement loop counter */
    stage--;

  } while (stage > 0U);

 #else

-  /* Run the below code for Cortex-M0 */
-
-  q15_t *pIn = pSrc;                             /*  Source pointer                               */
-  q15_t *pOut = pDst;                            /*  Destination pointer                          */
-  q15_t b0, b1, b2, a1, a2;                      /*  Filter coefficients           */
-  q15_t Xn1, Xn2, Yn1, Yn2;                      /*  Filter state variables        */
-  q15_t Xn;                                      /*  temporary input               */
-  q63_t acc;                                     /*  Accumulator                                  */
-  int32_t shift = (15 - (int32_t) S->postShift); /*  Post shift                                   */
-  q15_t *pState = S->pState;                     /*  State pointer                                */
-  q15_t *pCoeffs = S->pCoeffs;                   /*  Coefficient pointer                          */
-  uint32_t sample, stage = (uint32_t) S->numStages;     /*  Stage loop counter                           */
+  const q15_t *pIn = pSrc;                             /* Source pointer */
+        q15_t *pOut = pDst;                            /* Destination pointer */
+        q15_t b0, b1, b2, a1, a2;                      /* Filter coefficients */
+        q15_t Xn1, Xn2, Yn1, Yn2;                      /* Filter state variables */
+        q15_t Xn;                                      /* temporary input */
+        q63_t acc;                                     /* Accumulator */
+        int32_t shift = (15 - (int32_t) S->postShift); /* Post shift */
+        q15_t *pState = S->pState;                     /* State pointer */
+  const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+        uint32_t sample, stage = (uint32_t) S->numStages;     /* Stage loop counter */

  do
  {
@ -328,7 +294,7 @@ void arm_biquad_cascade_df1_q15(
    Yn1 = pState[2];
    Yn2 = pState[3];

-    /*      The variables acc holds the output value that is computed:
+    /* The variables acc holds the output value that is computed:
     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
     */

@ -357,10 +323,10 @@ void arm_biquad_cascade_df1_q15(

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc    */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
      Xn2 = Xn1;
      Xn1 = Xn;
      Yn2 = Yn1;
@ -392,7 +358,6 @@ void arm_biquad_cascade_df1_q15(

 }

-
 /**
- * @} end of BiquadCascadeDF1 group
+  @} end of BiquadCascadeDF1 group
 */
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
@ -3,13 +3,13 @@
 * Title:        arm_biquad_cascade_df1_q31.c
 * Description:  Processing function for the Q31 Biquad cascade filter
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,59 +29,54 @@
 #include "arm_math.h"

 /**
- * @ingroup groupFilters
+  @ingroup groupFilters
 */

 /**
- * @addtogroup BiquadCascadeDF1
- * @{
+  @addtogroup BiquadCascadeDF1
+  @{
 */

 /**
- * @brief Processing function for the Q31 Biquad cascade filter.
- * @param[in]  *S         points to an instance of the Q31 Biquad cascade structure.
- * @param[in]  *pSrc      points to the block of input data.
- * @param[out] *pDst      points to the block of output data.
- * @param[in]  blockSize  number of samples to process per call.
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * Thus, if the accumulator result overflows it wraps around rather than clip.
- * In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
- * After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to
- * 1.31 format by discarding the low 32 bits.
- *
- * \par
- * Refer to the function <code>arm_biquad_cascade_df1_fast_q31()</code> for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4.
+  @brief         Processing function for the Q31 Biquad cascade filter.
+  @param[in]     S         points to an instance of the Q31 Biquad cascade structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize  number of samples to process
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function is implemented using an internal 64-bit accumulator.
+                   The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+                   Thus, if the accumulator result overflows it wraps around rather than clip.
+                   In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
+                   After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to
+                   1.31 format by discarding the low 32 bits.
+  @remark
+                   Refer to \ref arm_biquad_cascade_df1_fast_q31() for a faster but less precise implementation of this filter.
 */

 void arm_biquad_cascade_df1_q31(
  const arm_biquad_casd_df1_inst_q31 * S,
-  q31_t * pSrc,
-  q31_t * pDst,
-  uint32_t blockSize)
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
 {
-  q63_t acc;                                     /*  accumulator                   */
-  uint32_t uShift = ((uint32_t) S->postShift + 1U);
-  uint32_t lShift = 32U - uShift;                /*  Shift to be applied to the output */
-  q31_t *pIn = pSrc;                             /*  input pointer initialization  */
-  q31_t *pOut = pDst;                            /*  output pointer initialization */
-  q31_t *pState = S->pState;                     /*  pState pointer initialization */
-  q31_t *pCoeffs = S->pCoeffs;                   /*  coeff pointer initialization  */
-  q31_t Xn1, Xn2, Yn1, Yn2;                      /*  Filter state variables        */
-  q31_t b0, b1, b2, a1, a2;                      /*  Filter coefficients           */
-  q31_t Xn;                                      /*  temporary input               */
-  uint32_t sample, stage = S->numStages;         /*  loop counters                     */
+  const q31_t *pIn = pSrc;                             /* Source pointer */
+        q31_t *pOut = pDst;                            /* Destination pointer */
+        q31_t *pState = S->pState;                     /* pState pointer */
+  const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+        q63_t acc;                                     /* Accumulator */
+        q31_t b0, b1, b2, a1, a2;                      /* Filter coefficients */
+        q31_t Xn1, Xn2, Yn1, Yn2;                      /* Filter pState variables */
+        q31_t Xn;                                      /* Temporary input */
+        uint32_t uShift = ((uint32_t) S->postShift + 1U);
+        uint32_t lShift = 32U - uShift;                /* Shift to be applied to the output */
+        uint32_t sample, stage = S->numStages;         /* Loop counters */

-
-#if defined (ARM_MATH_DSP)
-
-  q31_t acc_l, acc_h;                            /*  temporary output variables    */
-
-  /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
+        q31_t acc_l, acc_h;                            /* temporary output variables */
+#endif

  do
  {
@ -92,301 +87,161 @@ void arm_biquad_cascade_df1_q31(
    a1 = *pCoeffs++;
    a2 = *pCoeffs++;

-    /* Reading the state values */
+    /* Reading the pState values */
    Xn1 = pState[0];
    Xn2 = pState[1];
    Yn1 = pState[2];
    Yn2 = pState[3];

+#if defined (ARM_MATH_LOOPUNROLL)
+
    /* Apply loop unrolling and compute 4 output values simultaneously. */
-    /*      The variable acc hold output values that are being computed:
+    /* Variable acc hold output values that are being computed:
     *
-     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+     * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
     */

+    /* Loop unrolling: Compute 4 outputs at a time */
    sample = blockSize >> 2U;

-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
    while (sample > 0U)
    {
-      /* Read the input */
+      /* Read the first input */
      Xn = *pIn++;

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
-      /* acc =  b0 * x[n] */
-      acc = (q63_t) b0 *Xn;
-      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) b1 *Xn1;
-      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) b2 *Xn2;
-      /* acc +=  a1 * y[n-1] */
-      acc += (q63_t) a1 *Yn1;
-      /* acc +=  a2 * y[n-2] */
-      acc += (q63_t) a2 *Yn2;
+      acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);

      /* The result is converted to 1.31 , Yn2 variable is reused */
-
-      /* Calc lower part of acc */
-      acc_l = acc & 0xffffffff;
-
-      /* Calc upper part of acc */
-      acc_h = (acc >> 32) & 0xffffffff;
+      acc_l = (acc      ) & 0xffffffff; /* Calc lower part of acc */
+      acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */

      /* Apply shift for lower part of acc and upper part of acc */
      Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift;

-      /* Store the output in the destination buffer. */
+      /* Store output in destination buffer. */
      *pOut++ = Yn2;

      /* Read the second input */
      Xn2 = *pIn++;

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
-      /* acc =  b0 * x[n] */
-      acc = (q63_t) b0 *Xn2;
-      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) b1 *Xn;
-      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) b2 *Xn1;
-      /* acc +=  a1 * y[n-1] */
-      acc += (q63_t) a1 *Yn2;
-      /* acc +=  a2 * y[n-2] */
-      acc += (q63_t) a2 *Yn1;
-
+      acc = ((q63_t) b0 * Xn2) + ((q63_t) b1 * Xn) + ((q63_t) b2 * Xn1) + ((q63_t) a1 * Yn2) + ((q63_t) a2 * Yn1);

      /* The result is converted to 1.31, Yn1 variable is reused  */
-
-      /* Calc lower part of acc */
-      acc_l = acc & 0xffffffff;
-
-      /* Calc upper part of acc */
-      acc_h = (acc >> 32) & 0xffffffff;
-
+      acc_l = (acc      ) & 0xffffffff; /* Calc lower part of acc */
+      acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */

      /* Apply shift for lower part of acc and upper part of acc */
      Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift;

-      /* Store the output in the destination buffer. */
+      /* Store output in destination buffer. */
      *pOut++ = Yn1;

-      /* Read the third input  */
+      /* Read the third input */
      Xn1 = *pIn++;

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
-      /* acc =  b0 * x[n] */
-      acc = (q63_t) b0 *Xn1;
-      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) b1 *Xn2;
-      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) b2 *Xn;
-      /* acc +=  a1 * y[n-1] */
-      acc += (q63_t) a1 *Yn1;
-      /* acc +=  a2 * y[n-2] */
-      acc += (q63_t) a2 *Yn2;
+      acc = ((q63_t) b0 * Xn1) + ((q63_t) b1 * Xn2) + ((q63_t) b2 * Xn) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);

      /* The result is converted to 1.31, Yn2 variable is reused  */
-      /* Calc lower part of acc */
-      acc_l = acc & 0xffffffff;
-
-      /* Calc upper part of acc */
-      acc_h = (acc >> 32) & 0xffffffff;
-
+      acc_l = (acc      ) & 0xffffffff; /* Calc lower part of acc */
+      acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */

      /* Apply shift for lower part of acc and upper part of acc */
      Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift;

-      /* Store the output in the destination buffer. */
+      /* Store output in destination buffer. */
      *pOut++ = Yn2;

      /* Read the forth input */
      Xn = *pIn++;

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
-      /* acc =  b0 * x[n] */
-      acc = (q63_t) b0 *Xn;
-      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) b1 *Xn1;
-      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) b2 *Xn2;
-      /* acc +=  a1 * y[n-1] */
-      acc += (q63_t) a1 *Yn2;
-      /* acc +=  a2 * y[n-2] */
-      acc += (q63_t) a2 *Yn1;
+      acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn2) + ((q63_t) a2 * Yn1);

      /* The result is converted to 1.31, Yn1 variable is reused  */
-      /* Calc lower part of acc */
-      acc_l = acc & 0xffffffff;
-
-      /* Calc upper part of acc */
-      acc_h = (acc >> 32) & 0xffffffff;
+      acc_l = (acc      ) & 0xffffffff; /* Calc lower part of acc */
+      acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */

      /* Apply shift for lower part of acc and upper part of acc */
      Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift;

-      /* Every time after the output is computed state should be updated. */
-      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc    */
-      Xn2 = Xn1;
-      Xn1 = Xn;
-
-      /* Store the output in the destination buffer. */
+      /* Store output in destination buffer. */
      *pOut++ = Yn1;

-      /* decrement the loop counter */
-      sample--;
-    }
-
-    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-     ** No loop unrolling is used. */
-    sample = (blockSize & 0x3U);
-
-    while (sample > 0U)
-    {
-      /* Read the input */
-      Xn = *pIn++;
-
-      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
-      /* acc =  b0 * x[n] */
-      acc = (q63_t) b0 *Xn;
-      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) b1 *Xn1;
-      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) b2 *Xn2;
-      /* acc +=  a1 * y[n-1] */
-      acc += (q63_t) a1 *Yn1;
-      /* acc +=  a2 * y[n-2] */
-      acc += (q63_t) a2 *Yn2;
-
-      /* The result is converted to 1.31  */
-      acc = acc >> lShift;
-
      /* Every time after the output is computed state should be updated. */
-      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc    */
+      /* The states should be updated as: */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
      Xn2 = Xn1;
      Xn1 = Xn;
-      Yn2 = Yn1;
-      Yn1 = (q31_t) acc;

-      /* Store the output in the destination buffer. */
-      *pOut++ = (q31_t) acc;
-
-      /* decrement the loop counter */
+      /* decrement loop counter */
      sample--;
    }

-    /*  The first stage goes from the input buffer to the output buffer. */
-    /*  Subsequent stages occur in-place in the output buffer */
-    pIn = pDst;
-
-    /* Reset to destination pointer */
-    pOut = pDst;
-
-    /*  Store the updated state variables back into the pState array */
-    *pState++ = Xn1;
-    *pState++ = Xn2;
-    *pState++ = Yn1;
-    *pState++ = Yn2;
-
-  } while (--stage);
+    /* Loop unrolling: Compute remaining outputs */
+    sample = blockSize & 0x3U;

 #else

-  /* Run the below code for Cortex-M0 */
-
-  do
-  {
-    /* Reading the coefficients */
-    b0 = *pCoeffs++;
-    b1 = *pCoeffs++;
-    b2 = *pCoeffs++;
-    a1 = *pCoeffs++;
-    a2 = *pCoeffs++;
-
-    /* Reading the state values */
-    Xn1 = pState[0];
-    Xn2 = pState[1];
-    Yn1 = pState[2];
-    Yn2 = pState[3];
-
-    /*      The variables acc holds the output value that is computed:
-     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
-     */
-
+    /* Initialize blkCnt with number of samples */
    sample = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
    while (sample > 0U)
    {
      /* Read the input */
      Xn = *pIn++;

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-      /* acc =  b0 * x[n] */
-      acc = (q63_t) b0 *Xn;
-
-      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) b1 *Xn1;
-      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) b2 *Xn2;
-      /* acc +=  a1 * y[n-1] */
-      acc += (q63_t) a1 *Yn1;
-      /* acc +=  a2 * y[n-2] */
-      acc += (q63_t) a2 *Yn2;
+      acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);

      /* The result is converted to 1.31  */
      acc = acc >> lShift;

+      /* Store output in destination buffer. */
+      *pOut++ = (q31_t) acc;
+
      /* Every time after the output is computed state should be updated. */
-      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc    */
+      /* The states should be updated as: */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
      Xn2 = Xn1;
      Xn1 = Xn;
      Yn2 = Yn1;
      Yn1 = (q31_t) acc;

-      /* Store the output in the destination buffer. */
-      *pOut++ = (q31_t) acc;
-
-      /* decrement the loop counter */
+      /* decrement loop counter */
      sample--;
    }

-    /*  The first stage goes from the input buffer to the output buffer. */
-    /*  Subsequent stages occur in-place in the output buffer */
-    pIn = pDst;
-
-    /* Reset to destination pointer */
-    pOut = pDst;
-
-    /*  Store the updated state variables back into the pState array */
+    /* Store the updated state variables back into the pState array */
    *pState++ = Xn1;
    *pState++ = Xn2;
    *pState++ = Yn1;
    *pState++ = Yn2;

-  } while (--stage);
+    /* The first stage goes from the input buffer to the output buffer. */
+    /* Subsequent numStages occur in-place in the output buffer */
+    pIn = pDst;
+
+    /* Reset output pointer */
+    pOut = pDst;
+
+    /* decrement loop counter */
+    stage--;
+
+  } while (stage > 0U);

-#endif /*  #if defined (ARM_MATH_DSP) */
 }

-
-
-
 /**
-  * @} end of BiquadCascadeDF1 group
-  */
+  @} end of BiquadCascadeDF1 group
+ */
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_biquad_cascade_df2T_f32.c
 * Description:  Processing function for floating-point transposed direct form II Biquad cascade filter
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,381 +29,141 @@
 #include "arm_math.h"

 /**
-* @ingroup groupFilters
+  @ingroup groupFilters
 */

 /**
-* @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
-*
-* This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
-* The filters are implemented as a cascade of second order Biquad sections.
-* These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
-* Only floating-point data is supported.
-*
-* This function operate on blocks of input and output data and each call to the function
-* processes <code>blockSize</code> samples through the filter.
-* <code>pSrc</code> points to the array of input data and
-* <code>pDst</code> points to the array of output data.
-* Both arrays contain <code>blockSize</code> values.
-*
-* \par Algorithm
-* Each Biquad stage implements a second order filter using the difference equation:
-* <pre>
-*    y[n] = b0 * x[n] + d1
-*    d1 = b1 * x[n] + a1 * y[n] + d2
-*    d2 = b2 * x[n] + a2 * y[n]
-* </pre>
-* where d1 and d2 represent the two state values.
-*
-* \par
-* A Biquad filter using a transposed Direct Form II structure is shown below.
-* \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"
-* Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
-* Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
-* Pay careful attention to the sign of the feedback coefficients.
-* Some design tools flip the sign of the feedback coefficients:
-* <pre>
-*    y[n] = b0 * x[n] + d1;
-*    d1 = b1 * x[n] - a1 * y[n] + d2;
-*    d2 = b2 * x[n] - a2 * y[n];
-* </pre>
-* In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
-*
-* \par
-* Higher order filters are realized as a cascade of second order sections.
-* <code>numStages</code> refers to the number of second order stages used.
-* For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
-* A 9th order filter would be realized with <code>numStages=5</code> second order stages with the
-* coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
-*
-* \par
-* <code>pState</code> points to the state variable array.
-* Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.
-* The state variables are arranged in the <code>pState</code> array as:
-* <pre>
-*     {d11, d12, d21, d22, ...}
-* </pre>
-* where <code>d1x</code> refers to the state variables for the first Biquad and
-* <code>d2x</code> refers to the state variables for the second Biquad.
-* The state array has a total length of <code>2*numStages</code> values.
-* The state variables are updated after each block of data is processed; the coefficients are untouched.
-*
-* \par
-* The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
-* The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
-* That is why the Direct Form I structure supports Q15 and Q31 data types.
-* The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables <code>d1</code> and <code>d2</code>.
-* Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
-* The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
-*
-* \par Instance Structure
-* The coefficients and state variables for a filter are stored together in an instance data structure.
-* A separate instance structure must be defined for each filter.
-* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
-*
-* \par Init Functions
-* There is also an associated initialization function.
-* The initialization function performs following operations:
-* - Sets the values of the internal structure fields.
-* - Zeros out the values in the state buffer.
-* To do this manually without calling the init function, assign the follow subfields of the instance structure:
-* numStages, pCoeffs, pState. Also set all of the values in pState to zero.
-*
-* \par
-* Use of the initialization function is optional.
-* However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
-* To place an instance structure into a const data section, the instance structure must be manually initialized.
-* Set the values in the state buffer to zeros before static initialization.
-* For example, to statically initialize the instance structure use
-* <pre>
-*     arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};
-* </pre>
-* where <code>numStages</code> is the number of Biquad stages in the filter; <code>pState</code> is the address of the state buffer.
-* <code>pCoeffs</code> is the address of the coefficient buffer;
-*
-*/
+  @addtogroup BiquadCascadeDF2T
+  @{
+ */

 /**
-* @addtogroup BiquadCascadeDF2T
-* @{
-*/
+  @brief         Processing function for the floating-point transposed direct form II Biquad cascade filter.
+  @param[in]     S         points to an instance of the filter data structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize number of samples to process
+  @return        none
+ */

-/**
-* @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
-* @param[in]  *S        points to an instance of the filter data structure.
-* @param[in]  *pSrc     points to the block of input data.
-* @param[out] *pDst     points to the block of output data
-* @param[in]  blockSize number of samples to process.
-* @return none.
-*/
+#if defined(ARM_MATH_NEON) 

-
-LOW_OPTIMIZATION_ENTER
 void arm_biquad_cascade_df2T_f32(
-const arm_biquad_cascade_df2T_instance_f32 * S,
-float32_t * pSrc,
-float32_t * pDst,
-uint32_t blockSize)
+  const arm_biquad_cascade_df2T_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
 {
-
-   float32_t *pIn = pSrc;                         /*  source pointer            */
+   const float32_t *pIn = pSrc;                   /*  source pointer            */
   float32_t *pOut = pDst;                        /*  destination pointer       */
   float32_t *pState = S->pState;                 /*  State pointer             */
-   float32_t *pCoeffs = S->pCoeffs;               /*  coefficient pointer       */
+   const float32_t *pCoeffs = S->pCoeffs;         /*  coefficient pointer       */
   float32_t acc1;                                /*  accumulator               */
   float32_t b0, b1, b2, a1, a2;                  /*  Filter coefficients       */
   float32_t Xn1;                                 /*  temporary input           */
   float32_t d1, d2;                              /*  state variables           */
-   uint32_t sample, stage = S->numStages;         /*  loop counters             */
+   uint32_t sample, stageCnt,stage = S->numStages;         /*   loop counters   */

-#if defined(ARM_MATH_CM7)

-   float32_t Xn2, Xn3, Xn4, Xn5, Xn6, Xn7, Xn8;   /*  Input State variables     */
-   float32_t Xn9, Xn10, Xn11, Xn12, Xn13, Xn14, Xn15, Xn16;
-   float32_t acc2, acc3, acc4, acc5, acc6, acc7;  /*  Simulates the accumulator */
-   float32_t acc8, acc9, acc10, acc11, acc12, acc13, acc14, acc15, acc16;
+   float32_t Xn2, Xn3, Xn4;                       /*  Input State variables     */
+   float32_t acc2, acc3, acc4;                    /*  accumulator               */

-   do
+
+   float32_t p0, p1, p2, p3, p4, A1;
+
+   float32x4_t XnV, YnV;
+   float32x4x2_t dV;
+   float32x4_t zeroV = vdupq_n_f32(0.0);
+   float32x4_t t1,t2,t3,t4,b1V,b2V,a1V,a2V,s;
+
+   /* Loop unrolling. Compute 4 outputs at a time */
+   stageCnt = stage >> 2;
+
+   while (stageCnt > 0U)
   {
      /* Reading the coefficients */
-      b0 = pCoeffs[0];
-      b1 = pCoeffs[1];
-      b2 = pCoeffs[2];
-      a1 = pCoeffs[3];
-      /* Apply loop unrolling and compute 16 output values simultaneously. */
-      sample = blockSize >> 4U;
-      a2 = pCoeffs[4];
+      t1 = vld1q_f32(pCoeffs);
+      pCoeffs += 4;

-      /*Reading the state values */
-      d1 = pState[0];
-      d2 = pState[1];
+      t2 = vld1q_f32(pCoeffs);
+      pCoeffs += 4;

-      pCoeffs += 5U;
+      t3 = vld1q_f32(pCoeffs);
+      pCoeffs += 4;

+      t4 = vld1q_f32(pCoeffs);
+      pCoeffs += 4;

-      /* First part of the processing with loop unrolling.  Compute 16 outputs at a time.
-       ** a second loop below computes the remaining 1 to 15 samples. */
+      b1V = vld1q_f32(pCoeffs);
+      pCoeffs += 4;
+
+      b2V = vld1q_f32(pCoeffs);
+      pCoeffs += 4;
+
+      a1V = vld1q_f32(pCoeffs);
+      pCoeffs += 4;
+
+      a2V = vld1q_f32(pCoeffs);
+      pCoeffs += 4;
+
+      /* Reading the state values */
+      dV = vld2q_f32(pState);
+
+      sample = blockSize;
+      
      while (sample > 0U) {
-
         /* y[n] = b0 * x[n] + d1 */
         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
         /* d2 = b2 * x[n] + a2 * y[n] */

-         /* Read the first 2 inputs. 2 cycles */
-         Xn1  = pIn[0 ];
-         Xn2  = pIn[1 ];
+         XnV = vdupq_n_f32(*pIn++);

-         /* Sample 1. 5 cycles */
-         Xn3  = pIn[2 ];
-         acc1 = b0 * Xn1 + d1;
+         s = dV.val[0];
+         YnV = s;

-         Xn4  = pIn[3 ];
-         d1 = b1 * Xn1 + d2;
+         s = vextq_f32(zeroV,dV.val[0],3);
+         YnV = vmlaq_f32(YnV, t1, s);

-         Xn5  = pIn[4 ];
-         d2 = b2 * Xn1;
+         s = vextq_f32(zeroV,dV.val[0],2);
+         YnV = vmlaq_f32(YnV, t2, s);

-         Xn6  = pIn[5 ];
-         d1 += a1 * acc1;
+         s = vextq_f32(zeroV,dV.val[0],1);
+         YnV = vmlaq_f32(YnV, t3, s);

-         Xn7  = pIn[6 ];
-         d2 += a2 * acc1;
+         YnV = vmlaq_f32(YnV, t4, XnV);

-         /* Sample 2. 5 cycles */
-         Xn8  = pIn[7 ];
-         acc2 = b0 * Xn2 + d1;
+         s = vextq_f32(XnV,YnV,3);

-         Xn9  = pIn[8 ];
-         d1 = b1 * Xn2 + d2;
+         dV.val[0] = vmlaq_f32(dV.val[1], s, b1V);
+         dV.val[0] = vmlaq_f32(dV.val[0], YnV, a1V);

-         Xn10 = pIn[9 ];
-         d2 = b2 * Xn2;
+         dV.val[1] = vmulq_f32(s, b2V);
+         dV.val[1] = vmlaq_f32(dV.val[1], YnV, a2V);

-         Xn11 = pIn[10];
-         d1 += a1 * acc2;
-
-         Xn12 = pIn[11];
-         d2 += a2 * acc2;
-
-         /* Sample 3. 5 cycles */
-         Xn13 = pIn[12];
-         acc3 = b0 * Xn3 + d1;
-
-         Xn14 = pIn[13];
-         d1 = b1 * Xn3 + d2;
-
-         Xn15 = pIn[14];
-         d2 = b2 * Xn3;
-
-         Xn16 = pIn[15];
-         d1 += a1 * acc3;
-
-         pIn += 16;
-         d2 += a2 * acc3;
-
-         /* Sample 4. 5 cycles */
-         acc4 = b0 * Xn4 + d1;
-         d1 = b1 * Xn4 + d2;
-         d2 = b2 * Xn4;
-         d1 += a1 * acc4;
-         d2 += a2 * acc4;
-
-         /* Sample 5. 5 cycles */
-         acc5 = b0 * Xn5 + d1;
-         d1 = b1 * Xn5 + d2;
-         d2 = b2 * Xn5;
-         d1 += a1 * acc5;
-         d2 += a2 * acc5;
-
-         /* Sample 6. 5 cycles */
-         acc6 = b0 * Xn6 + d1;
-         d1 = b1 * Xn6 + d2;
-         d2 = b2 * Xn6;
-         d1 += a1 * acc6;
-         d2 += a2 * acc6;
-
-         /* Sample 7. 5 cycles */
-         acc7 = b0 * Xn7 + d1;
-         d1 = b1 * Xn7 + d2;
-         d2 = b2 * Xn7;
-         d1 += a1 * acc7;
-         d2 += a2 * acc7;
-
-         /* Sample 8. 5 cycles */
-         acc8 = b0 * Xn8 + d1;
-         d1 = b1 * Xn8 + d2;
-         d2 = b2 * Xn8;
-         d1 += a1 * acc8;
-         d2 += a2 * acc8;
-
-         /* Sample 9. 5 cycles */
-         acc9 = b0 * Xn9 + d1;
-         d1 = b1 * Xn9 + d2;
-         d2 = b2 * Xn9;
-         d1 += a1 * acc9;
-         d2 += a2 * acc9;
-
-         /* Sample 10. 5 cycles */
-         acc10 = b0 * Xn10 + d1;
-         d1 = b1 * Xn10 + d2;
-         d2 = b2 * Xn10;
-         d1 += a1 * acc10;
-         d2 += a2 * acc10;
-
-         /* Sample 11. 5 cycles */
-         acc11 = b0 * Xn11 + d1;
-         d1 = b1 * Xn11 + d2;
-         d2 = b2 * Xn11;
-         d1 += a1 * acc11;
-         d2 += a2 * acc11;
-
-         /* Sample 12. 5 cycles */
-         acc12 = b0 * Xn12 + d1;
-         d1 = b1 * Xn12 + d2;
-         d2 = b2 * Xn12;
-         d1 += a1 * acc12;
-         d2 += a2 * acc12;
-
-         /* Sample 13. 5 cycles */
-         acc13 = b0 * Xn13 + d1;
-         d1 = b1 * Xn13 + d2;
-         d2 = b2 * Xn13;
-
-         pOut[0 ] = acc1 ;
-         d1 += a1 * acc13;
-
-         pOut[1 ] = acc2 ;
-         d2 += a2 * acc13;
-
-         /* Sample 14. 5 cycles */
-         pOut[2 ] = acc3 ;
-         acc14 = b0 * Xn14 + d1;
-
-         pOut[3 ] = acc4 ;
-         d1 = b1 * Xn14 + d2;
-
-         pOut[4 ] = acc5 ;
-         d2 = b2 * Xn14;
-
-         pOut[5 ] = acc6 ;
-         d1 += a1 * acc14;
-
-         pOut[6 ] = acc7 ;
-         d2 += a2 * acc14;
-
-         /* Sample 15. 5 cycles */
-         pOut[7 ] = acc8 ;
-         pOut[8 ] = acc9 ;
-         acc15 = b0 * Xn15 + d1;
-
-         pOut[9 ] = acc10;
-         d1 = b1 * Xn15 + d2;
-
-         pOut[10] = acc11;
-         d2 = b2 * Xn15;
-
-         pOut[11] = acc12;
-         d1 += a1 * acc15;
-
-         pOut[12] = acc13;
-         d2 += a2 * acc15;
-
-         /* Sample 16. 5 cycles */
-         pOut[13] = acc14;
-         acc16 = b0 * Xn16 + d1;
-
-         pOut[14] = acc15;
-         d1 = b1 * Xn16 + d2;
-
-         pOut[15] = acc16;
-         d2 = b2 * Xn16;
+         *pOut++ = YnV[3];

         sample--;
-         d1 += a1 * acc16;
-
-         pOut += 16;
-         d2 += a2 * acc16;
      }
-
-      sample = blockSize & 0xFu;
-      while (sample > 0U) {
-         Xn1 = *pIn;
-         acc1 = b0 * Xn1 + d1;
-
-         pIn++;
-         d1 = b1 * Xn1 + d2;
-
-         *pOut = acc1;
-         d2 = b2 * Xn1;
-
-         pOut++;
-         d1 += a1 * acc1;
-
-         sample--;
-         d2 += a2 * acc1;
-      }
-
+     
      /* Store the updated state variables back into the state array */
-      pState[0] = d1;
+      vst2q_f32(pState,dV);
+      pState += 8;
+
      /* The current stage input is given as the output to the next stage */
      pIn = pDst;

-      pState[1] = d2;
-      /* decrement the loop counter */
-      stage--;
-
-      pState += 2U;
-
      /*Reset the output working pointer */
      pOut = pDst;

-   } while (stage > 0U);
+      /* decrement the loop counter */
+      stageCnt--;

-#elif defined(ARM_MATH_CM0_FAMILY)
+   } 

-   /* Run the below code for Cortex-M0 */
-
-   do
+   /* Tail */
+   stageCnt = stage & 3;
+   
+   while (stageCnt > 0U)
   {
      /* Reading the coefficients */
      b0 = *pCoeffs++;
@ -416,7 +176,6 @@ uint32_t blockSize)
      d1 = pState[0];
      d2 = pState[1];

-
      sample = blockSize;

      while (sample > 0U)
@ -452,139 +211,313 @@ uint32_t blockSize)
      pOut = pDst;

      /* decrement the loop counter */
-      stage--;
+      stageCnt--;
+   }
+}
+#else
+LOW_OPTIMIZATION_ENTER
+void arm_biquad_cascade_df2T_f32(
+  const arm_biquad_cascade_df2T_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+  const float32_t *pIn = pSrc;                         /* Source pointer */
+        float32_t *pOut = pDst;                        /* Destination pointer */
+        float32_t *pState = S->pState;                 /* State pointer */
+  const float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+        float32_t acc1;                                /* Accumulator */
+        float32_t b0, b1, b2, a1, a2;                  /* Filter coefficients */
+        float32_t Xn1;                                 /* Temporary input */
+        float32_t d1, d2;                              /* State variables */
+        uint32_t sample, stage = S->numStages;         /* Loop counters */

-   } while (stage > 0U);
+  do
+  {
+     /* Reading the coefficients */
+     b0 = pCoeffs[0];
+     b1 = pCoeffs[1];
+     b2 = pCoeffs[2];
+     a1 = pCoeffs[3];
+     a2 = pCoeffs[4];
+
+     /* Reading the state values */
+     d1 = pState[0];
+     d2 = pState[1];
+
+     pCoeffs += 5U;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+     /* Loop unrolling: Compute 16 outputs at a time */
+     sample = blockSize >> 4U;
+
+     while (sample > 0U) {
+
+       /* y[n] = b0 * x[n] + d1 */
+       /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+       /* d2 = b2 * x[n] + a2 * y[n] */
+
+/*  1 */
+       Xn1 = *pIn++;
+
+       acc1 = b0 * Xn1 + d1;
+
+       d1 = b1 * Xn1 + d2;
+       d1 += a1 * acc1;
+
+       d2 = b2 * Xn1;
+       d2 += a2 * acc1;
+
+       *pOut++ = acc1;
+
+/*  2 */
+         Xn1 = *pIn++;
+
+        acc1 = b0 * Xn1 + d1;
+
+        d1 = b1 * Xn1 + d2;
+        d1 += a1 * acc1;
+
+        d2 = b2 * Xn1;
+        d2 += a2 * acc1;
+
+        *pOut++ = acc1;
+
+/*  3 */
+         Xn1 = *pIn++;
+
+        acc1 = b0 * Xn1 + d1;
+
+        d1 = b1 * Xn1 + d2;
+        d1 += a1 * acc1;
+
+        d2 = b2 * Xn1;
+        d2 += a2 * acc1;
+
+        *pOut++ = acc1;
+
+/*  4 */
+         Xn1 = *pIn++;
+
+        acc1 = b0 * Xn1 + d1;
+
+        d1 = b1 * Xn1 + d2;
+        d1 += a1 * acc1;
+
+        d2 = b2 * Xn1;
+        d2 += a2 * acc1;
+
+        *pOut++ = acc1;
+
+/*  5 */
+         Xn1 = *pIn++;
+
+        acc1 = b0 * Xn1 + d1;
+
+        d1 = b1 * Xn1 + d2;
+        d1 += a1 * acc1;
+
+        d2 = b2 * Xn1;
+        d2 += a2 * acc1;
+
+        *pOut++ = acc1;
+
+/*  6 */
+         Xn1 = *pIn++;
+
+        acc1 = b0 * Xn1 + d1;
+
+        d1 = b1 * Xn1 + d2;
+        d1 += a1 * acc1;
+
+        d2 = b2 * Xn1;
+        d2 += a2 * acc1;
+
+        *pOut++ = acc1;
+
+/*  7 */
+         Xn1 = *pIn++;
+
+        acc1 = b0 * Xn1 + d1;
+
+        d1 = b1 * Xn1 + d2;
+        d1 += a1 * acc1;
+
+        d2 = b2 * Xn1;
+        d2 += a2 * acc1;
+
+        *pOut++ = acc1;
+
+/*  8 */
+         Xn1 = *pIn++;
+
+        acc1 = b0 * Xn1 + d1;
+
+        d1 = b1 * Xn1 + d2;
+        d1 += a1 * acc1;
+
+        d2 = b2 * Xn1;
+        d2 += a2 * acc1;
+
+        *pOut++ = acc1;
+
+/*  9 */
+         Xn1 = *pIn++;
+
+        acc1 = b0 * Xn1 + d1;
+
+        d1 = b1 * Xn1 + d2;
+        d1 += a1 * acc1;
+
+        d2 = b2 * Xn1;
+        d2 += a2 * acc1;
+
+        *pOut++ = acc1;
+
+/* 10 */
+         Xn1 = *pIn++;
+
+        acc1 = b0 * Xn1 + d1;
+
+        d1 = b1 * Xn1 + d2;
+        d1 += a1 * acc1;
+
+        d2 = b2 * Xn1;
+        d2 += a2 * acc1;
+
+        *pOut++ = acc1;
+
+/* 11 */
+         Xn1 = *pIn++;
+
+        acc1 = b0 * Xn1 + d1;
+
+        d1 = b1 * Xn1 + d2;
+        d1 += a1 * acc1;
+
+        d2 = b2 * Xn1;
+        d2 += a2 * acc1;
+
+        *pOut++ = acc1;
+
+/* 12 */
+         Xn1 = *pIn++;
+
+        acc1 = b0 * Xn1 + d1;
+
+        d1 = b1 * Xn1 + d2;
+        d1 += a1 * acc1;
+
+        d2 = b2 * Xn1;
+        d2 += a2 * acc1;
+
+        *pOut++ = acc1;
+
+/* 13 */
+         Xn1 = *pIn++;
+
+        acc1 = b0 * Xn1 + d1;
+
+        d1 = b1 * Xn1 + d2;
+        d1 += a1 * acc1;
+
+        d2 = b2 * Xn1;
+        d2 += a2 * acc1;
+
+        *pOut++ = acc1;
+
+/* 14 */
+         Xn1 = *pIn++;
+
+        acc1 = b0 * Xn1 + d1;
+
+        d1 = b1 * Xn1 + d2;
+        d1 += a1 * acc1;
+
+        d2 = b2 * Xn1;
+        d2 += a2 * acc1;
+
+        *pOut++ = acc1;
+
+/* 15 */
+         Xn1 = *pIn++;
+
+        acc1 = b0 * Xn1 + d1;
+
+        d1 = b1 * Xn1 + d2;
+        d1 += a1 * acc1;
+
+        d2 = b2 * Xn1;
+        d2 += a2 * acc1;
+
+        *pOut++ = acc1;
+
+/* 16 */
+         Xn1 = *pIn++;
+
+        acc1 = b0 * Xn1 + d1;
+
+        d1 = b1 * Xn1 + d2;
+        d1 += a1 * acc1;
+
+        d2 = b2 * Xn1;
+        d2 += a2 * acc1;
+
+        *pOut++ = acc1;
+
+        /* decrement loop counter */
+        sample--;
+      }
+
+      /* Loop unrolling: Compute remaining outputs */
+      sample = blockSize & 0xFU;

 #else

-   float32_t Xn2, Xn3, Xn4;                  	  /*  Input State variables     */
-   float32_t acc2, acc3, acc4;              		  /*  accumulator               */
+      /* Initialize blkCnt with number of samples */
+      sample = blockSize;

+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

-   float32_t p0, p1, p2, p3, p4, A1;
-
-   /* Run the below code for Cortex-M4 and Cortex-M3 */
-   do
-   {
-      /* Reading the coefficients */
-      b0 = *pCoeffs++;
-      b1 = *pCoeffs++;
-      b2 = *pCoeffs++;
-      a1 = *pCoeffs++;
-      a2 = *pCoeffs++;
-
-
-      /*Reading the state values */
-      d1 = pState[0];
-      d2 = pState[1];
-
-      /* Apply loop unrolling and compute 4 output values simultaneously. */
-      sample = blockSize >> 2U;
-
-      /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
      while (sample > 0U) {
+        Xn1 = *pIn++;

-         /* y[n] = b0 * x[n] + d1 */
-         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
-         /* d2 = b2 * x[n] + a2 * y[n] */
+        acc1 = b0 * Xn1 + d1;

-         /* Read the four inputs */
-         Xn1 = pIn[0];
-         Xn2 = pIn[1];
-         Xn3 = pIn[2];
-         Xn4 = pIn[3];
-         pIn += 4;
+        d1 = b1 * Xn1 + d2;
+        d1 += a1 * acc1;

-         p0 = b0 * Xn1;
-         p1 = b1 * Xn1;
-         acc1 = p0 + d1;
-         p0 = b0 * Xn2;
-         p3 = a1 * acc1;
-         p2 = b2 * Xn1;
-         A1 = p1 + p3;
-         p4 = a2 * acc1;
-         d1 = A1 + d2;
-         d2 = p2 + p4;
+        d2 = b2 * Xn1;
+        d2 += a2 * acc1;

-         p1 = b1 * Xn2;
-         acc2 = p0 + d1;
-         p0 = b0 * Xn3;
-         p3 = a1 * acc2;
-         p2 = b2 * Xn2;
-         A1 = p1 + p3;
-         p4 = a2 * acc2;
-         d1 = A1 + d2;
-         d2 = p2 + p4;
+        *pOut++ = acc1;

-         p1 = b1 * Xn3;
-         acc3 = p0 + d1;
-         p0 = b0 * Xn4;
-         p3 = a1 * acc3;
-         p2 = b2 * Xn3;
-         A1 = p1 + p3;
-         p4 = a2 * acc3;
-         d1 = A1 + d2;
-         d2 = p2 + p4;
-
-         acc4 = p0 + d1;
-         p1 = b1 * Xn4;
-         p3 = a1 * acc4;
-         p2 = b2 * Xn4;
-         A1 = p1 + p3;
-         p4 = a2 * acc4;
-         d1 = A1 + d2;
-         d2 = p2 + p4;
-
-         pOut[0] = acc1;
-         pOut[1] = acc2;
-         pOut[2] = acc3;
-         pOut[3] = acc4;
-		 pOut += 4;
-
-         sample--;
-      }
-
-      sample = blockSize & 0x3U;
-      while (sample > 0U) {
-         Xn1 = *pIn++;
-
-         p0 = b0 * Xn1;
-         p1 = b1 * Xn1;
-         acc1 = p0 + d1;
-         p3 = a1 * acc1;
-         p2 = b2 * Xn1;
-         A1 = p1 + p3;
-         p4 = a2 * acc1;
-         d1 = A1 + d2;
-         d2 = p2 + p4;
-
-         *pOut++ = acc1;
-
-         sample--;
+        /* decrement loop counter */
+        sample--;
      }

      /* Store the updated state variables back into the state array */
-      *pState++ = d1;
-      *pState++ = d2;
+      pState[0] = d1;
+      pState[1] = d2;
+
+      pState += 2U;

      /* The current stage input is given as the output to the next stage */
      pIn = pDst;

-      /*Reset the output working pointer */
+      /* Reset the output working pointer */
      pOut = pDst;

-      /* decrement the loop counter */
+      /* decrement loop counter */
      stage--;

   } while (stage > 0U);

-#endif
-
 }
 LOW_OPTIMIZATION_EXIT
+#endif /* #if defined(ARM_MATH_NEON) */

 /**
-   * @} end of BiquadCascadeDF2T group
-   */
+  @} end of BiquadCascadeDF2T group
+ */
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c
@ -3,13 +3,13 @@
 * Title:        arm_biquad_cascade_df2T_f64.c
 * Description:  Processing function for floating-point transposed direct form II Biquad cascade filter
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,139 +29,128 @@
 #include "arm_math.h"

 /**
-* @ingroup groupFilters
+  @ingroup groupFilters
 */

 /**
-* @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
-*
-* This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
-* The filters are implemented as a cascade of second order Biquad sections.
-* These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
-* Only floating-point data is supported.
-*
-* This function operate on blocks of input and output data and each call to the function
-* processes <code>blockSize</code> samples through the filter.
-* <code>pSrc</code> points to the array of input data and
-* <code>pDst</code> points to the array of output data.
-* Both arrays contain <code>blockSize</code> values.
-*
-* \par Algorithm
-* Each Biquad stage implements a second order filter using the difference equation:
-* <pre>
-*    y[n] = b0 * x[n] + d1
-*    d1 = b1 * x[n] + a1 * y[n] + d2
-*    d2 = b2 * x[n] + a2 * y[n]
-* </pre>
-* where d1 and d2 represent the two state values.
-*
-* \par
-* A Biquad filter using a transposed Direct Form II structure is shown below.
-* \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"
-* Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
-* Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
-* Pay careful attention to the sign of the feedback coefficients.
-* Some design tools flip the sign of the feedback coefficients:
-* <pre>
-*    y[n] = b0 * x[n] + d1;
-*    d1 = b1 * x[n] - a1 * y[n] + d2;
-*    d2 = b2 * x[n] - a2 * y[n];
-* </pre>
-* In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
-*
-* \par
-* Higher order filters are realized as a cascade of second order sections.
-* <code>numStages</code> refers to the number of second order stages used.
-* For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
-* A 9th order filter would be realized with <code>numStages=5</code> second order stages with the
-* coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
-*
-* \par
-* <code>pState</code> points to the state variable array.
-* Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.
-* The state variables are arranged in the <code>pState</code> array as:
-* <pre>
-*     {d11, d12, d21, d22, ...}
-* </pre>
-* where <code>d1x</code> refers to the state variables for the first Biquad and
-* <code>d2x</code> refers to the state variables for the second Biquad.
-* The state array has a total length of <code>2*numStages</code> values.
-* The state variables are updated after each block of data is processed; the coefficients are untouched.
-*
-* \par
-* The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
-* The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
-* That is why the Direct Form I structure supports Q15 and Q31 data types.
-* The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables <code>d1</code> and <code>d2</code>.
-* Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
-* The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
-*
-* \par Instance Structure
-* The coefficients and state variables for a filter are stored together in an instance data structure.
-* A separate instance structure must be defined for each filter.
-* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
-*
-* \par Init Functions
-* There is also an associated initialization function.
-* The initialization function performs following operations:
-* - Sets the values of the internal structure fields.
-* - Zeros out the values in the state buffer.
-* To do this manually without calling the init function, assign the follow subfields of the instance structure:
-* numStages, pCoeffs, pState. Also set all of the values in pState to zero.
-*
-* \par
-* Use of the initialization function is optional.
-* However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
-* To place an instance structure into a const data section, the instance structure must be manually initialized.
-* Set the values in the state buffer to zeros before static initialization.
-* For example, to statically initialize the instance structure use
-* <pre>
-*     arm_biquad_cascade_df2T_instance_f64 S1 = {numStages, pState, pCoeffs};
-* </pre>
-* where <code>numStages</code> is the number of Biquad stages in the filter; <code>pState</code> is the address of the state buffer.
-* <code>pCoeffs</code> is the address of the coefficient buffer;
-*
+  @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
+
+  This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
+  The filters are implemented as a cascade of second order Biquad sections.
+  These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
+  Only floating-point data is supported.
+
+  This function operate on blocks of input and output data and each call to the function
+  processes <code>blockSize</code> samples through the filter.
+  <code>pSrc</code> points to the array of input data and
+  <code>pDst</code> points to the array of output data.
+  Both arrays contain <code>blockSize</code> values.
+
+  @par           Algorithm
+                   Each Biquad stage implements a second order filter using the difference equation:
+  <pre>
+     y[n] = b0 * x[n] + d1
+     d1 = b1 * x[n] + a1 * y[n] + d2
+     d2 = b2 * x[n] + a2 * y[n]
+  </pre>
+                   where d1 and d2 represent the two state values.
+  @par
+                   A Biquad filter using a transposed Direct Form II structure is shown below.
+                   \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"
+                   Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
+                   Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
+                   Pay careful attention to the sign of the feedback coefficients.
+                   Some design tools flip the sign of the feedback coefficients:
+  <pre>
+     y[n] = b0 * x[n] + d1;
+     d1 = b1 * x[n] - a1 * y[n] + d2;
+     d2 = b2 * x[n] - a2 * y[n];
+  </pre>
+                   In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
+  @par
+                   Higher order filters are realized as a cascade of second order sections.
+                   <code>numStages</code> refers to the number of second order stages used.
+                   For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
+                   A 9th order filter would be realized with <code>numStages=5</code> second order stages with the
+                   coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
+  @par
+                   <code>pState</code> points to the state variable array.
+                   Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.
+                   The state variables are arranged in the <code>pState</code> array as:
+  <pre>
+      {d11, d12, d21, d22, ...}
+  </pre>
+                   where <code>d1x</code> refers to the state variables for the first Biquad and
+                   <code>d2x</code> refers to the state variables for the second Biquad.
+                   The state array has a total length of <code>2*numStages</code> values.
+                   The state variables are updated after each block of data is processed; the coefficients are untouched.
+  @par
+                   The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
+                   The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
+                   That is why the Direct Form I structure supports Q15 and Q31 data types.
+                   The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables <code>d1</code> and <code>d2</code>.
+                   Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
+                   The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
+
+  @par           Instance Structure
+                   The coefficients and state variables for a filter are stored together in an instance data structure.
+                   A separate instance structure must be defined for each filter.
+                   Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+
+  @par           Init Functions
+                   There is also an associated initialization function.
+                   The initialization function performs following operations:
+                   - Sets the values of the internal structure fields.
+                   - Zeros out the values in the state buffer.
+                   To do this manually without calling the init function, assign the follow subfields of the instance structure:
+                   numStages, pCoeffs, pState. Also set all of the values in pState to zero.
+  @par
+                   Use of the initialization function is optional.
+                   However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+                   To place an instance structure into a const data section, the instance structure must be manually initialized.
+                   Set the values in the state buffer to zeros before static initialization.
+                   For example, to statically initialize the instance structure use
+  <pre>
+      arm_biquad_cascade_df2T_instance_f64 S1 = {numStages, pState, pCoeffs};
+      arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};
+  </pre>
+                   where <code>numStages</code> is the number of Biquad stages in the filter;
+                   <code>pState</code> is the address of the state buffer.
+                   <code>pCoeffs</code> is the address of the coefficient buffer;
 */

 /**
-* @addtogroup BiquadCascadeDF2T
-* @{
-*/
+  @addtogroup BiquadCascadeDF2T
+  @{
+ */

 /**
-* @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
-* @param[in]  *S        points to an instance of the filter data structure.
-* @param[in]  *pSrc     points to the block of input data.
-* @param[out] *pDst     points to the block of output data
-* @param[in]  blockSize number of samples to process.
-* @return none.
-*/
-
+  @brief         Processing function for the floating-point transposed direct form II Biquad cascade filter.
+  @param[in]     S         points to an instance of the filter data structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize number of samples to process
+  @return        none
+ */

 LOW_OPTIMIZATION_ENTER
 void arm_biquad_cascade_df2T_f64(
-const arm_biquad_cascade_df2T_instance_f64 * S,
-float64_t * pSrc,
-float64_t * pDst,
-uint32_t blockSize)
+  const arm_biquad_cascade_df2T_instance_f64 * S,
+        float64_t * pSrc,
+        float64_t * pDst,
+        uint32_t blockSize)
 {

-   float64_t *pIn = pSrc;                         /*  source pointer            */
-   float64_t *pOut = pDst;                        /*  destination pointer       */
-   float64_t *pState = S->pState;                 /*  State pointer             */
-   float64_t *pCoeffs = S->pCoeffs;               /*  coefficient pointer       */
-   float64_t acc1;                                /*  accumulator               */
-   float64_t b0, b1, b2, a1, a2;                  /*  Filter coefficients       */
-   float64_t Xn1;                                 /*  temporary input           */
-   float64_t d1, d2;                              /*  state variables           */
-   uint32_t sample, stage = S->numStages;         /*  loop counters             */
+        float64_t *pIn = pSrc;                         /* Source pointer */
+        float64_t *pOut = pDst;                        /* Destination pointer */
+        float64_t *pState = S->pState;                 /* State pointer */
+        float64_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+        float64_t acc1;                                /* Accumulator */
+        float64_t b0, b1, b2, a1, a2;                  /* Filter coefficients */
+        float64_t Xn1;                                 /* Temporary input */
+        float64_t d1, d2;                              /* State variables */
+        uint32_t sample, stage = S->numStages;         /* Loop counters */

-#if defined(ARM_MATH_CM7)
-
-   float64_t Xn2, Xn3, Xn4, Xn5, Xn6, Xn7, Xn8;   /*  Input State variables     */
-   float64_t Xn9, Xn10, Xn11, Xn12, Xn13, Xn14, Xn15, Xn16;
-   float64_t acc2, acc3, acc4, acc5, acc6, acc7;  /*  Simulates the accumulator */
-   float64_t acc8, acc9, acc10, acc11, acc12, acc13, acc14, acc15, acc16;

   do
   {
@ -170,421 +159,285 @@ uint32_t blockSize)
      b1 = pCoeffs[1];
      b2 = pCoeffs[2];
      a1 = pCoeffs[3];
-      /* Apply loop unrolling and compute 16 output values simultaneously. */
-      sample = blockSize >> 4U;
      a2 = pCoeffs[4];

-      /*Reading the state values */
+      /* Reading the state values */
      d1 = pState[0];
      d2 = pState[1];

      pCoeffs += 5U;

+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 16 outputs at a time */
+      sample = blockSize >> 4U;

-      /* First part of the processing with loop unrolling.  Compute 16 outputs at a time.
-       ** a second loop below computes the remaining 1 to 15 samples. */
      while (sample > 0U) {

         /* y[n] = b0 * x[n] + d1 */
         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
         /* d2 = b2 * x[n] + a2 * y[n] */

-         /* Read the first 2 inputs. 2 cycles */
-         Xn1  = pIn[0 ];
-         Xn2  = pIn[1 ];
+/*  1 */
+         Xn1 = *pIn++;

-         /* Sample 1. 5 cycles */
-         Xn3  = pIn[2 ];
         acc1 = b0 * Xn1 + d1;

-         Xn4  = pIn[3 ];
         d1 = b1 * Xn1 + d2;
-
-         Xn5  = pIn[4 ];
-         d2 = b2 * Xn1;
-
-         Xn6  = pIn[5 ];
         d1 += a1 * acc1;

-         Xn7  = pIn[6 ];
+         d2 = b2 * Xn1;
         d2 += a2 * acc1;

-         /* Sample 2. 5 cycles */
-         Xn8  = pIn[7 ];
-         acc2 = b0 * Xn2 + d1;
+         *pOut++ = acc1;

-         Xn9  = pIn[8 ];
-         d1 = b1 * Xn2 + d2;

-         Xn10 = pIn[9 ];
-         d2 = b2 * Xn2;
+/*  2 */
+         Xn1 = *pIn++;

-         Xn11 = pIn[10];
-         d1 += a1 * acc2;
+         acc1 = b0 * Xn1 + d1;

-         Xn12 = pIn[11];
-         d2 += a2 * acc2;
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;

-         /* Sample 3. 5 cycles */
-         Xn13 = pIn[12];
-         acc3 = b0 * Xn3 + d1;
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;

-         Xn14 = pIn[13];
-         d1 = b1 * Xn3 + d2;
+         *pOut++ = acc1;

-         Xn15 = pIn[14];
-         d2 = b2 * Xn3;
+/*  3 */
+         Xn1 = *pIn++;

-         Xn16 = pIn[15];
-         d1 += a1 * acc3;
+         acc1 = b0 * Xn1 + d1;

-         pIn += 16;
-         d2 += a2 * acc3;
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;

-         /* Sample 4. 5 cycles */
-         acc4 = b0 * Xn4 + d1;
-         d1 = b1 * Xn4 + d2;
-         d2 = b2 * Xn4;
-         d1 += a1 * acc4;
-         d2 += a2 * acc4;
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;

-         /* Sample 5. 5 cycles */
-         acc5 = b0 * Xn5 + d1;
-         d1 = b1 * Xn5 + d2;
-         d2 = b2 * Xn5;
-         d1 += a1 * acc5;
-         d2 += a2 * acc5;
+         *pOut++ = acc1;

-         /* Sample 6. 5 cycles */
-         acc6 = b0 * Xn6 + d1;
-         d1 = b1 * Xn6 + d2;
-         d2 = b2 * Xn6;
-         d1 += a1 * acc6;
-         d2 += a2 * acc6;
+/*  4 */
+         Xn1 = *pIn++;

-         /* Sample 7. 5 cycles */
-         acc7 = b0 * Xn7 + d1;
-         d1 = b1 * Xn7 + d2;
-         d2 = b2 * Xn7;
-         d1 += a1 * acc7;
-         d2 += a2 * acc7;
+         acc1 = b0 * Xn1 + d1;

-         /* Sample 8. 5 cycles */
-         acc8 = b0 * Xn8 + d1;
-         d1 = b1 * Xn8 + d2;
-         d2 = b2 * Xn8;
-         d1 += a1 * acc8;
-         d2 += a2 * acc8;
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;

-         /* Sample 9. 5 cycles */
-         acc9 = b0 * Xn9 + d1;
-         d1 = b1 * Xn9 + d2;
-         d2 = b2 * Xn9;
-         d1 += a1 * acc9;
-         d2 += a2 * acc9;
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;

-         /* Sample 10. 5 cycles */
-         acc10 = b0 * Xn10 + d1;
-         d1 = b1 * Xn10 + d2;
-         d2 = b2 * Xn10;
-         d1 += a1 * acc10;
-         d2 += a2 * acc10;
+         *pOut++ = acc1;

-         /* Sample 11. 5 cycles */
-         acc11 = b0 * Xn11 + d1;
-         d1 = b1 * Xn11 + d2;
-         d2 = b2 * Xn11;
-         d1 += a1 * acc11;
-         d2 += a2 * acc11;
+/*  5 */
+         Xn1 = *pIn++;

-         /* Sample 12. 5 cycles */
-         acc12 = b0 * Xn12 + d1;
-         d1 = b1 * Xn12 + d2;
-         d2 = b2 * Xn12;
-         d1 += a1 * acc12;
-         d2 += a2 * acc12;
+         acc1 = b0 * Xn1 + d1;

-         /* Sample 13. 5 cycles */
-         acc13 = b0 * Xn13 + d1;
-         d1 = b1 * Xn13 + d2;
-         d2 = b2 * Xn13;
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;

-         pOut[0 ] = acc1 ;
-         d1 += a1 * acc13;
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;

-         pOut[1 ] = acc2 ;
-         d2 += a2 * acc13;
+         *pOut++ = acc1;

-         /* Sample 14. 5 cycles */
-         pOut[2 ] = acc3 ;
-         acc14 = b0 * Xn14 + d1;
+/*  6 */
+         Xn1 = *pIn++;

-         pOut[3 ] = acc4 ;
-         d1 = b1 * Xn14 + d2;
+         acc1 = b0 * Xn1 + d1;

-         pOut[4 ] = acc5 ;
-         d2 = b2 * Xn14;
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;

-         pOut[5 ] = acc6 ;
-         d1 += a1 * acc14;
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;

-         pOut[6 ] = acc7 ;
-         d2 += a2 * acc14;
+         *pOut++ = acc1;

-         /* Sample 15. 5 cycles */
-         pOut[7 ] = acc8 ;
-         pOut[8 ] = acc9 ;
-         acc15 = b0 * Xn15 + d1;
+/*  7 */
+         Xn1 = *pIn++;

-         pOut[9 ] = acc10;
-         d1 = b1 * Xn15 + d2;
+         acc1 = b0 * Xn1 + d1;

-         pOut[10] = acc11;
-         d2 = b2 * Xn15;
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;

-         pOut[11] = acc12;
-         d1 += a1 * acc15;
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;

-         pOut[12] = acc13;
-         d2 += a2 * acc15;
+         *pOut++ = acc1;

-         /* Sample 16. 5 cycles */
-         pOut[13] = acc14;
-         acc16 = b0 * Xn16 + d1;
+/*  8 */
+         Xn1 = *pIn++;

-         pOut[14] = acc15;
-         d1 = b1 * Xn16 + d2;
+         acc1 = b0 * Xn1 + d1;

-         pOut[15] = acc16;
-         d2 = b2 * Xn16;
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;

+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
+
+/*  9 */
+         Xn1 = *pIn++;
+
+         acc1 = b0 * Xn1 + d1;
+
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
+
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
+
+/* 10 */
+         Xn1 = *pIn++;
+
+         acc1 = b0 * Xn1 + d1;
+
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
+
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
+
+/* 11 */
+         Xn1 = *pIn++;
+
+         acc1 = b0 * Xn1 + d1;
+
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
+
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
+
+/* 12 */
+         Xn1 = *pIn++;
+
+         acc1 = b0 * Xn1 + d1;
+
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
+
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
+
+/* 13 */
+         Xn1 = *pIn++;
+
+         acc1 = b0 * Xn1 + d1;
+
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
+
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
+
+/* 14 */
+         Xn1 = *pIn++;
+
+         acc1 = b0 * Xn1 + d1;
+
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
+
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
+
+/* 15 */
+         Xn1 = *pIn++;
+
+         acc1 = b0 * Xn1 + d1;
+
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
+
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
+
+/* 16 */
+         Xn1 = *pIn++;
+
+         acc1 = b0 * Xn1 + d1;
+
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
+
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
+
+         /* decrement loop counter */
         sample--;
-         d1 += a1 * acc16;
-
-         pOut += 16;
-         d2 += a2 * acc16;
      }

-      sample = blockSize & 0xFu;
+    /* Loop unrolling: Compute remaining outputs */
+      sample = blockSize & 0xFU;
+
+#else
+
+    /* Initialize blkCnt with number of samples */
+    sample = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
      while (sample > 0U) {
-         Xn1 = *pIn;
+         Xn1 = *pIn++;
+
         acc1 = b0 * Xn1 + d1;

-         pIn++;
         d1 = b1 * Xn1 + d2;
-
-         *pOut = acc1;
-         d2 = b2 * Xn1;
-
-         pOut++;
         d1 += a1 * acc1;

-         sample--;
+         d2 = b2 * Xn1;
         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
+
+         /* decrement loop counter */
+         sample--;
      }

      /* Store the updated state variables back into the state array */
      pState[0] = d1;
-      /* The current stage input is given as the output to the next stage */
-      pIn = pDst;
-
      pState[1] = d2;
-      /* decrement the loop counter */
-      stage--;

      pState += 2U;

-      /*Reset the output working pointer */
-      pOut = pDst;
-
-   } while (stage > 0U);
-
-#elif defined(ARM_MATH_CM0_FAMILY)
-
-   /* Run the below code for Cortex-M0 */
-
-   do
-   {
-      /* Reading the coefficients */
-      b0 = *pCoeffs++;
-      b1 = *pCoeffs++;
-      b2 = *pCoeffs++;
-      a1 = *pCoeffs++;
-      a2 = *pCoeffs++;
-
-      /*Reading the state values */
-      d1 = pState[0];
-      d2 = pState[1];
-
-
-      sample = blockSize;
-
-      while (sample > 0U)
-      {
-         /* Read the input */
-         Xn1 = *pIn++;
-
-         /* y[n] = b0 * x[n] + d1 */
-         acc1 = (b0 * Xn1) + d1;
-
-         /* Store the result in the accumulator in the destination buffer. */
-         *pOut++ = acc1;
-
-         /* Every time after the output is computed state should be updated. */
-         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
-         d1 = ((b1 * Xn1) + (a1 * acc1)) + d2;
-
-         /* d2 = b2 * x[n] + a2 * y[n] */
-         d2 = (b2 * Xn1) + (a2 * acc1);
-
-         /* decrement the loop counter */
-         sample--;
-      }
-
-      /* Store the updated state variables back into the state array */
-      *pState++ = d1;
-      *pState++ = d2;
-
      /* The current stage input is given as the output to the next stage */
      pIn = pDst;

-      /*Reset the output working pointer */
+      /* Reset the output working pointer */
      pOut = pDst;

-      /* decrement the loop counter */
+      /* decrement loop counter */
      stage--;

   } while (stage > 0U);

-#else
-
-   float64_t Xn2, Xn3, Xn4;                  	  /*  Input State variables     */
-   float64_t acc2, acc3, acc4;              		  /*  accumulator               */
-
-
-   float64_t p0, p1, p2, p3, p4, A1;
-
-   /* Run the below code for Cortex-M4 and Cortex-M3 */
-   do
-   {
-      /* Reading the coefficients */
-      b0 = *pCoeffs++;
-      b1 = *pCoeffs++;
-      b2 = *pCoeffs++;
-      a1 = *pCoeffs++;
-      a2 = *pCoeffs++;
-
-
-      /*Reading the state values */
-      d1 = pState[0];
-      d2 = pState[1];
-
-      /* Apply loop unrolling and compute 4 output values simultaneously. */
-      sample = blockSize >> 2U;
-
-      /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
-      while (sample > 0U) {
-
-         /* y[n] = b0 * x[n] + d1 */
-         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
-         /* d2 = b2 * x[n] + a2 * y[n] */
-
-         /* Read the four inputs */
-         Xn1 = pIn[0];
-         Xn2 = pIn[1];
-         Xn3 = pIn[2];
-         Xn4 = pIn[3];
-         pIn += 4;
-
-         p0 = b0 * Xn1;
-         p1 = b1 * Xn1;
-         acc1 = p0 + d1;
-         p0 = b0 * Xn2;
-         p3 = a1 * acc1;
-         p2 = b2 * Xn1;
-         A1 = p1 + p3;
-         p4 = a2 * acc1;
-         d1 = A1 + d2;
-         d2 = p2 + p4;
-
-         p1 = b1 * Xn2;
-         acc2 = p0 + d1;
-         p0 = b0 * Xn3;
-         p3 = a1 * acc2;
-         p2 = b2 * Xn2;
-         A1 = p1 + p3;
-         p4 = a2 * acc2;
-         d1 = A1 + d2;
-         d2 = p2 + p4;
-
-         p1 = b1 * Xn3;
-         acc3 = p0 + d1;
-         p0 = b0 * Xn4;
-         p3 = a1 * acc3;
-         p2 = b2 * Xn3;
-         A1 = p1 + p3;
-         p4 = a2 * acc3;
-         d1 = A1 + d2;
-         d2 = p2 + p4;
-
-         acc4 = p0 + d1;
-         p1 = b1 * Xn4;
-         p3 = a1 * acc4;
-         p2 = b2 * Xn4;
-         A1 = p1 + p3;
-         p4 = a2 * acc4;
-         d1 = A1 + d2;
-         d2 = p2 + p4;
-
-         pOut[0] = acc1;
-         pOut[1] = acc2;
-         pOut[2] = acc3;
-         pOut[3] = acc4;
-				 pOut += 4;
-
-         sample--;
-      }
-
-      sample = blockSize & 0x3U;
-      while (sample > 0U) {
-         Xn1 = *pIn++;
-
-         p0 = b0 * Xn1;
-         p1 = b1 * Xn1;
-         acc1 = p0 + d1;
-         p3 = a1 * acc1;
-         p2 = b2 * Xn1;
-         A1 = p1 + p3;
-         p4 = a2 * acc1;
-         d1 = A1 + d2;
-         d2 = p2 + p4;
-
-         *pOut++ = acc1;
-
-         sample--;
-      }
-
-      /* Store the updated state variables back into the state array */
-      *pState++ = d1;
-      *pState++ = d2;
-
-      /* The current stage input is given as the output to the next stage */
-      pIn = pDst;
-
-      /*Reset the output working pointer */
-      pOut = pDst;
-
-      /* decrement the loop counter */
-      stage--;
-
-   } while (stage > 0U);
-
-#endif
-
 }
 LOW_OPTIMIZATION_EXIT

 /**
-   * @} end of BiquadCascadeDF2T group
-   */
+  @} end of BiquadCascadeDF2T group
+ */
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_biquad_cascade_df2T_init_f32.c
 * Description:  Initialization function for floating-point transposed direct form II Biquad cascade filter
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,47 +29,169 @@
 #include "arm_math.h"

 /**
- * @ingroup groupFilters
+  @ingroup groupFilters
 */

 /**
- * @addtogroup BiquadCascadeDF2T
- * @{
+  @addtogroup BiquadCascadeDF2T
+  @{
 */

 /**
- * @brief  Initialization function for the floating-point transposed direct form II Biquad cascade filter.
- * @param[in,out] *S           points to an instance of the filter data structure.
- * @param[in]     numStages    number of 2nd order stages in the filter.
- * @param[in]     *pCoeffs     points to the filter coefficients.
- * @param[in]     *pState      points to the state buffer.
- * @return        none
- *
- * <b>Coefficient and State Ordering:</b>
- * \par
- * The coefficients are stored in the array <code>pCoeffs</code> in the following order:
- * <pre>
- *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
- * </pre>
- *
- * \par
- * where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
- * <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
- * and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
- *
- * \par
- * The <code>pState</code> is a pointer to state array.
- * Each Biquad stage has 2 state variables <code>d1,</code> and <code>d2</code>.
- * The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
- * The state array has a total length of <code>2*numStages</code> values.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
+  @brief         Initialization function for the floating-point transposed direct form II Biquad cascade filter.
+  @param[in,out] S           points to an instance of the filter data structure.
+  @param[in]     numStages   number of 2nd order stages in the filter.
+  @param[in]     pCoeffs     points to the filter coefficients.
+  @param[in]     pState      points to the state buffer.
+  @return        none
+
+  @par           Coefficient and State Ordering
+                   The coefficients are stored in the array <code>pCoeffs</code> in the following order
+                   in the not Neon version.
+  <pre>
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  </pre>
+                   
+  @par
+                   where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
+                   <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
+                   and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
+
+                   For Neon version, this array is bigger. If numstages = 4x + y, then the array has size:
+                   32*x + 5*y
+                   and it must be initialized using the function
+                   arm_biquad_cascade_df2T_compute_coefs_f32 which is taking the
+                   standard array coefficient as parameters.
+
+                   But, an array of 8*numstages is a good approximation.
+
+                   Then, the initialization can be done with:
+  <pre>
+                   arm_biquad_cascade_df2T_init_f32(&SNeon, nbCascade, neonCoefs, stateNeon);
+                   arm_biquad_cascade_df2T_compute_coefs_f32(&SNeon,nbCascade,coefs);
+  </pre>
+
+  @par             In this example, neonCoefs is a bigger array of size 8 * numStages.
+                   coefs is the standard array:
+
+  <pre>
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  </pre>
+
+
+  @par
+                   The <code>pState</code> is a pointer to state array.
+                   Each Biquad stage has 2 state variables <code>d1,</code> and <code>d2</code>.
+                   The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
+                   The state array has a total length of <code>2*numStages</code> values.
+                   The state variables are updated after each block of data is processed; the coefficients are untouched.
 */

-void arm_biquad_cascade_df2T_init_f32(
+#if defined(ARM_MATH_NEON) 
+/*
+
+Must be called after initializing the biquad instance.
+pCoeffs has size 5 * nbCascade
+Whereas the pCoeffs for the init has size (4*4 + 4*4)* nbCascade 
+
+So this pCoeffs is the one which would be used for the not Neon version.
+The pCoeffs passed in init is bigger than the one for the not Neon version.
+
+*/
+void arm_biquad_cascade_df2T_compute_coefs_f32(
  arm_biquad_cascade_df2T_instance_f32 * S,
  uint8_t numStages,
-  float32_t * pCoeffs,
-  float32_t * pState)
+  float32_t * pCoeffs)
+{
+   uint8_t cnt;
+   float32_t *pDstCoeffs;
+   float32_t b0[4],b1[4],b2[4],a1[4],a2[4];
+
+   pDstCoeffs = S->pCoeffs;
+
+   cnt = numStages >> 2; 
+   while(cnt > 0)
+   {
+      for(int i=0;i<4;i++)
+      {
+        b0[i] = pCoeffs[0];
+        b1[i] = pCoeffs[1];
+        b2[i] = pCoeffs[2];
+        a1[i] = pCoeffs[3];
+        a2[i] = pCoeffs[4];
+        pCoeffs += 5;
+      }
+
+      /* Vec 1 */
+      *pDstCoeffs++ = 0;
+      *pDstCoeffs++ = b0[1];
+      *pDstCoeffs++ = b0[2];
+      *pDstCoeffs++ = b0[3];
+
+      /* Vec 2 */
+      *pDstCoeffs++ = 0;
+      *pDstCoeffs++ = 0;
+      *pDstCoeffs++ = b0[1] * b0[2];
+      *pDstCoeffs++ = b0[2] * b0[3];
+
+      /* Vec 3 */
+      *pDstCoeffs++ = 0;
+      *pDstCoeffs++ = 0;
+      *pDstCoeffs++ = 0;
+      *pDstCoeffs++ = b0[1] * b0[2] * b0[3];
+      
+      /* Vec 4 */
+      *pDstCoeffs++ = b0[0];
+      *pDstCoeffs++ = b0[0] * b0[1];
+      *pDstCoeffs++ = b0[0] * b0[1] * b0[2];
+      *pDstCoeffs++ = b0[0] * b0[1] * b0[2] * b0[3];
+
+      /* Vec 5 */
+      *pDstCoeffs++ = b1[0];
+      *pDstCoeffs++ = b1[1];
+      *pDstCoeffs++ = b1[2];
+      *pDstCoeffs++ = b1[3];
+
+      /* Vec 6 */
+      *pDstCoeffs++ = b2[0];
+      *pDstCoeffs++ = b2[1];
+      *pDstCoeffs++ = b2[2];
+      *pDstCoeffs++ = b2[3];
+
+      /* Vec 7 */
+      *pDstCoeffs++ = a1[0];
+      *pDstCoeffs++ = a1[1];
+      *pDstCoeffs++ = a1[2];
+      *pDstCoeffs++ = a1[3];
+
+      /* Vec 8 */
+      *pDstCoeffs++ = a2[0];
+      *pDstCoeffs++ = a2[1];
+      *pDstCoeffs++ = a2[2];
+      *pDstCoeffs++ = a2[3];
+
+      cnt--;
+   }
+
+   cnt = numStages & 0x3;
+   while(cnt > 0)
+   {
+      *pDstCoeffs++ = *pCoeffs++;
+      *pDstCoeffs++ = *pCoeffs++;
+      *pDstCoeffs++ = *pCoeffs++;
+      *pDstCoeffs++ = *pCoeffs++;
+      *pDstCoeffs++ = *pCoeffs++;
+      cnt--;
+   }
+
+}
+#endif 
+
+void arm_biquad_cascade_df2T_init_f32(
+        arm_biquad_cascade_df2T_instance_f32 * S,
+        uint8_t numStages,
+  const float32_t * pCoeffs,
+        float32_t * pState)
 {
  /* Assign filter stages */
  S->numStages = numStages;
@ -85,5 +207,5 @@ void arm_biquad_cascade_df2T_init_f32(
 }

 /**
- * @} end of BiquadCascadeDF2T group
+  @} end of BiquadCascadeDF2T group
 */
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c
@ -3,13 +3,13 @@
 * Title:        arm_biquad_cascade_df2T_init_f64.c
 * Description:  Initialization function for floating-point transposed direct form II Biquad cascade filter
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,47 +29,44 @@
 #include "arm_math.h"

 /**
- * @ingroup groupFilters
+  @ingroup groupFilters
 */

 /**
- * @addtogroup BiquadCascadeDF2T
- * @{
+  @addtogroup BiquadCascadeDF2T
+  @{
 */

 /**
- * @brief  Initialization function for the floating-point transposed direct form II Biquad cascade filter.
- * @param[in,out] *S           points to an instance of the filter data structure.
- * @param[in]     numStages    number of 2nd order stages in the filter.
- * @param[in]     *pCoeffs     points to the filter coefficients.
- * @param[in]     *pState      points to the state buffer.
- * @return        none
- *
- * <b>Coefficient and State Ordering:</b>
- * \par
- * The coefficients are stored in the array <code>pCoeffs</code> in the following order:
- * <pre>
- *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
- * </pre>
- *
- * \par
- * where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
- * <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
- * and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
- *
- * \par
- * The <code>pState</code> is a pointer to state array.
- * Each Biquad stage has 2 state variables <code>d1,</code> and <code>d2</code>.
- * The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
- * The state array has a total length of <code>2*numStages</code> values.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
+  @brief         Initialization function for the floating-point transposed direct form II Biquad cascade filter.
+  @param[in,out] S           points to an instance of the filter data structure
+  @param[in]     numStages   number of 2nd order stages in the filter
+  @param[in]     pCoeffs     points to the filter coefficients
+  @param[in]     pState      points to the state buffer
+  @return        none
+
+  @par           Coefficient and State Ordering
+                   The coefficients are stored in the array <code>pCoeffs</code> in the following order:
+  <pre>
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  </pre>
+  @par
+                   where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
+                   <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
+                   and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
+  @par
+                   The <code>pState</code> is a pointer to state array.
+                   Each Biquad stage has 2 state variables <code>d1,</code> and <code>d2</code>.
+                   The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
+                   The state array has a total length of <code>2*numStages</code> values.
+                   The state variables are updated after each block of data is processed; the coefficients are untouched.
 */

 void arm_biquad_cascade_df2T_init_f64(
-  arm_biquad_cascade_df2T_instance_f64 * S,
-  uint8_t numStages,
-  float64_t * pCoeffs,
-  float64_t * pState)
+        arm_biquad_cascade_df2T_instance_f64 * S,
+        uint8_t numStages,
+        float64_t * pCoeffs,
+        float64_t * pState)
 {
  /* Assign filter stages */
  S->numStages = numStages;
@ -85,5 +82,5 @@ void arm_biquad_cascade_df2T_init_f64(
 }

 /**
- * @} end of BiquadCascadeDF2T group
+  @} end of BiquadCascadeDF2T group
 */
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_biquad_cascade_stereo_df2T_f32.c
 * Description:  Processing function for floating-point transposed direct form II Biquad cascade filter. 2 channels
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,139 +29,39 @@
 #include "arm_math.h"

 /**
-* @ingroup groupFilters
+  @ingroup groupFilters
 */

 /**
-* @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
-*
-* This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
-* The filters are implemented as a cascade of second order Biquad sections.
-* These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
-* Only floating-point data is supported.
-*
-* This function operate on blocks of input and output data and each call to the function
-* processes <code>blockSize</code> samples through the filter.
-* <code>pSrc</code> points to the array of input data and
-* <code>pDst</code> points to the array of output data.
-* Both arrays contain <code>blockSize</code> values.
-*
-* \par Algorithm
-* Each Biquad stage implements a second order filter using the difference equation:
-* <pre>
-*    y[n] = b0 * x[n] + d1
-*    d1 = b1 * x[n] + a1 * y[n] + d2
-*    d2 = b2 * x[n] + a2 * y[n]
-* </pre>
-* where d1 and d2 represent the two state values.
-*
-* \par
-* A Biquad filter using a transposed Direct Form II structure is shown below.
-* \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"
-* Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
-* Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
-* Pay careful attention to the sign of the feedback coefficients.
-* Some design tools flip the sign of the feedback coefficients:
-* <pre>
-*    y[n] = b0 * x[n] + d1;
-*    d1 = b1 * x[n] - a1 * y[n] + d2;
-*    d2 = b2 * x[n] - a2 * y[n];
-* </pre>
-* In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
-*
-* \par
-* Higher order filters are realized as a cascade of second order sections.
-* <code>numStages</code> refers to the number of second order stages used.
-* For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
-* A 9th order filter would be realized with <code>numStages=5</code> second order stages with the
-* coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
-*
-* \par
-* <code>pState</code> points to the state variable array.
-* Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.
-* The state variables are arranged in the <code>pState</code> array as:
-* <pre>
-*     {d11, d12, d21, d22, ...}
-* </pre>
-* where <code>d1x</code> refers to the state variables for the first Biquad and
-* <code>d2x</code> refers to the state variables for the second Biquad.
-* The state array has a total length of <code>2*numStages</code> values.
-* The state variables are updated after each block of data is processed; the coefficients are untouched.
-*
-* \par
-* The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
-* The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
-* That is why the Direct Form I structure supports Q15 and Q31 data types.
-* The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables <code>d1</code> and <code>d2</code>.
-* Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
-* The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
-*
-* \par Instance Structure
-* The coefficients and state variables for a filter are stored together in an instance data structure.
-* A separate instance structure must be defined for each filter.
-* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
-*
-* \par Init Functions
-* There is also an associated initialization function.
-* The initialization function performs following operations:
-* - Sets the values of the internal structure fields.
-* - Zeros out the values in the state buffer.
-* To do this manually without calling the init function, assign the follow subfields of the instance structure:
-* numStages, pCoeffs, pState. Also set all of the values in pState to zero.
-*
-* \par
-* Use of the initialization function is optional.
-* However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
-* To place an instance structure into a const data section, the instance structure must be manually initialized.
-* Set the values in the state buffer to zeros before static initialization.
-* For example, to statically initialize the instance structure use
-* <pre>
-*     arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};
-* </pre>
-* where <code>numStages</code> is the number of Biquad stages in the filter; <code>pState</code> is the address of the state buffer.
-* <code>pCoeffs</code> is the address of the coefficient buffer;
-*
-*/
+  @addtogroup BiquadCascadeDF2T
+  @{
+ */

 /**
-* @addtogroup BiquadCascadeDF2T
-* @{
-*/
-
-/**
-* @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
-* @param[in]  *S        points to an instance of the filter data structure.
-* @param[in]  *pSrc     points to the block of input data.
-* @param[out] *pDst     points to the block of output data
-* @param[in]  blockSize number of samples to process.
-* @return none.
-*/
-
+  @brief         Processing function for the floating-point transposed direct form II Biquad cascade filter.
+  @param[in]     S         points to an instance of the filter data structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize number of samples to process
+  @return        none
+ */

 LOW_OPTIMIZATION_ENTER
 void arm_biquad_cascade_stereo_df2T_f32(
-const arm_biquad_cascade_stereo_df2T_instance_f32 * S,
-float32_t * pSrc,
-float32_t * pDst,
-uint32_t blockSize)
+  const arm_biquad_cascade_stereo_df2T_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
 {
-
-    float32_t *pIn = pSrc;                         /*  source pointer            */
-    float32_t *pOut = pDst;                        /*  destination pointer       */
-    float32_t *pState = S->pState;                 /*  State pointer             */
-    float32_t *pCoeffs = S->pCoeffs;               /*  coefficient pointer       */
-    float32_t acc1a, acc1b;                        /*  accumulator               */
-    float32_t b0, b1, b2, a1, a2;                  /*  Filter coefficients       */
-    float32_t Xn1a, Xn1b;                          /*  temporary input           */
-    float32_t d1a, d2a, d1b, d2b;                  /*  state variables           */
-    uint32_t sample, stage = S->numStages;         /*  loop counters             */
-
-#if defined(ARM_MATH_CM7)
-
-    float32_t Xn2a, Xn3a, Xn4a, Xn5a, Xn6a, Xn7a, Xn8a;         /*  Input State variables     */
-    float32_t Xn2b, Xn3b, Xn4b, Xn5b, Xn6b, Xn7b, Xn8b;         /*  Input State variables     */
-    float32_t acc2a, acc3a, acc4a, acc5a, acc6a, acc7a, acc8a;  /*  Simulates the accumulator */
-    float32_t acc2b, acc3b, acc4b, acc5b, acc6b, acc7b, acc8b;  /*  Simulates the accumulator */
+  const float32_t *pIn = pSrc;                         /* Source pointer */
+        float32_t *pOut = pDst;                        /* Destination pointer */
+        float32_t *pState = S->pState;                 /* State pointer */
+  const float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+        float32_t acc1a, acc1b;                        /* Accumulator */
+        float32_t b0, b1, b2, a1, a2;                  /* Filter coefficients */
+        float32_t Xn1a, Xn1b;                          /* Temporary input */
+        float32_t d1a, d2a, d1b, d2b;                  /* State variables */
+        uint32_t sample, stage = S->numStages;         /* Loop counters */

    do
    {
@ -170,11 +70,9 @@ uint32_t blockSize)
        b1 = pCoeffs[1];
        b2 = pCoeffs[2];
        a1 = pCoeffs[3];
-        /* Apply loop unrolling and compute 8 output values simultaneously. */
-        sample = blockSize >> 3U;
        a2 = pCoeffs[4];

-        /*Reading the state values */
+        /* Reading the state values */
        d1a = pState[0];
        d2a = pState[1];
        d1b = pState[2];
@ -182,215 +80,182 @@ uint32_t blockSize)

        pCoeffs += 5U;

-        /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.
-        ** a second loop below computes the remaining 1 to 7 samples. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 8 outputs at a time */
+        sample = blockSize >> 3U;
+
        while (sample > 0U) {
+          /* y[n] = b0 * x[n] + d1 */
+          /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+          /* d2 = b2 * x[n] + a2 * y[n] */

-            /* y[n] = b0 * x[n] + d1 */
-            /* d1 = b1 * x[n] + a1 * y[n] + d2 */
-            /* d2 = b2 * x[n] + a2 * y[n] */
+/*  1 */
+          Xn1a = *pIn++; /* Channel a */
+          Xn1b = *pIn++; /* Channel b */

-            /* Read the first 2 inputs. 2 cycles */
-            Xn1a  = pIn[0 ];
-            Xn1b  = pIn[1 ];
+          acc1a = (b0 * Xn1a) + d1a;
+          acc1b = (b0 * Xn1b) + d1b;

-            /* Sample 1. 5 cycles */
-            Xn2a  = pIn[2 ];
-            acc1a = b0 * Xn1a + d1a;
+          *pOut++ = acc1a;
+          *pOut++ = acc1b;

-            Xn2b  = pIn[3 ];
-            d1a = b1 * Xn1a + d2a;
+          d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+          d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;

-            Xn3a  = pIn[4 ];
-            d2a = b2 * Xn1a;
+          d2a = (b2 * Xn1a) + (a2 * acc1a);
+          d2b = (b2 * Xn1b) + (a2 * acc1b);

-            Xn3b  = pIn[5 ];
-            d1a += a1 * acc1a;
+/*  2 */
+          Xn1a = *pIn++; /* Channel a */
+          Xn1b = *pIn++; /* Channel b */

-            Xn4a  = pIn[6 ];
-            d2a += a2 * acc1a;
+          acc1a = (b0 * Xn1a) + d1a;
+          acc1b = (b0 * Xn1b) + d1b;

-            /* Sample 2. 5 cycles */
-            Xn4b  = pIn[7 ];
-            acc1b = b0 * Xn1b + d1b;
+          *pOut++ = acc1a;
+          *pOut++ = acc1b;

-            Xn5a  = pIn[8 ];
-            d1b = b1 * Xn1b + d2b;
+          d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+          d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;

-            Xn5b = pIn[9 ];
-            d2b = b2 * Xn1b;
+          d2a = (b2 * Xn1a) + (a2 * acc1a);
+          d2b = (b2 * Xn1b) + (a2 * acc1b);

-            Xn6a = pIn[10];
-            d1b += a1 * acc1b;
+/*  3 */
+          Xn1a = *pIn++; /* Channel a */
+          Xn1b = *pIn++; /* Channel b */

-            Xn6b = pIn[11];
-            d2b += a2 * acc1b;
+          acc1a = (b0 * Xn1a) + d1a;
+          acc1b = (b0 * Xn1b) + d1b;

-            /* Sample 3. 5 cycles */
-            Xn7a = pIn[12];
-            acc2a = b0 * Xn2a + d1a;
+          *pOut++ = acc1a;
+          *pOut++ = acc1b;

-            Xn7b = pIn[13];
-            d1a = b1 * Xn2a + d2a;
+          d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+          d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;

-            Xn8a = pIn[14];
-            d2a = b2 * Xn2a;
+          d2a = (b2 * Xn1a) + (a2 * acc1a);
+          d2b = (b2 * Xn1b) + (a2 * acc1b);

-            Xn8b = pIn[15];
-            d1a += a1 * acc2a;
+/*  4 */
+          Xn1a = *pIn++; /* Channel a */
+          Xn1b = *pIn++; /* Channel b */

-            pIn += 16;
-            d2a += a2 * acc2a;
+          acc1a = (b0 * Xn1a) + d1a;
+          acc1b = (b0 * Xn1b) + d1b;

-            /* Sample 4. 5 cycles */
-            acc2b = b0 * Xn2b + d1b;
-            d1b = b1 * Xn2b + d2b;
-            d2b = b2 * Xn2b;
-            d1b += a1 * acc2b;
-            d2b += a2 * acc2b;
+          *pOut++ = acc1a;
+          *pOut++ = acc1b;

-            /* Sample 5. 5 cycles */
-            acc3a = b0 * Xn3a + d1a;
-            d1a = b1 * Xn3a + d2a;
-            d2a = b2 * Xn3a;
-            d1a += a1 * acc3a;
-            d2a += a2 * acc3a;
+          d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+          d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;

-            /* Sample 6. 5 cycles */
-            acc3b = b0 * Xn3b + d1b;
-            d1b = b1 * Xn3b + d2b;
-            d2b = b2 * Xn3b;
-            d1b += a1 * acc3b;
-            d2b += a2 * acc3b;
+          d2a = (b2 * Xn1a) + (a2 * acc1a);
+          d2b = (b2 * Xn1b) + (a2 * acc1b);

-            /* Sample 7. 5 cycles */
-            acc4a = b0 * Xn4a + d1a;
-            d1a = b1 * Xn4a + d2a;
-            d2a = b2 * Xn4a;
-            d1a += a1 * acc4a;
-            d2a += a2 * acc4a;
+/*  5 */
+          Xn1a = *pIn++; /* Channel a */
+          Xn1b = *pIn++; /* Channel b */

-            /* Sample 8. 5 cycles */
-            acc4b = b0 * Xn4b + d1b;
-            d1b = b1 * Xn4b + d2b;
-            d2b = b2 * Xn4b;
-            d1b += a1 * acc4b;
-            d2b += a2 * acc4b;
+          acc1a = (b0 * Xn1a) + d1a;
+          acc1b = (b0 * Xn1b) + d1b;

-            /* Sample 9. 5 cycles */
-            acc5a = b0 * Xn5a + d1a;
-            d1a = b1 * Xn5a + d2a;
-            d2a = b2 * Xn5a;
-            d1a += a1 * acc5a;
-            d2a += a2 * acc5a;
+          *pOut++ = acc1a;
+          *pOut++ = acc1b;

-            /* Sample 10. 5 cycles */
-            acc5b = b0 * Xn5b + d1b;
-            d1b = b1 * Xn5b + d2b;
-            d2b = b2 * Xn5b;
-            d1b += a1 * acc5b;
-            d2b += a2 * acc5b;
+          d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+          d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;

-            /* Sample 11. 5 cycles */
-            acc6a = b0 * Xn6a + d1a;
-            d1a = b1 * Xn6a + d2a;
-            d2a = b2 * Xn6a;
-            d1a += a1 * acc6a;
-            d2a += a2 * acc6a;
+          d2a = (b2 * Xn1a) + (a2 * acc1a);
+          d2b = (b2 * Xn1b) + (a2 * acc1b);

-            /* Sample 12. 5 cycles */
-            acc6b = b0 * Xn6b + d1b;
-            d1b = b1 * Xn6b + d2b;
-            d2b = b2 * Xn6b;
-            d1b += a1 * acc6b;
-            d2b += a2 * acc6b;
+/*  6 */
+          Xn1a = *pIn++; /* Channel a */
+          Xn1b = *pIn++; /* Channel b */

-            /* Sample 13. 5 cycles */
-            acc7a = b0 * Xn7a + d1a;
-            d1a = b1 * Xn7a + d2a;
+          acc1a = (b0 * Xn1a) + d1a;
+          acc1b = (b0 * Xn1b) + d1b;

-            pOut[0 ] = acc1a ;
-            d2a = b2 * Xn7a;
+          *pOut++ = acc1a;
+          *pOut++ = acc1b;

-            pOut[1 ] = acc1b ;
-            d1a += a1 * acc7a;
+          d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+          d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;

-            pOut[2 ] = acc2a ;
-            d2a += a2 * acc7a;
+          d2a = (b2 * Xn1a) + (a2 * acc1a);
+          d2b = (b2 * Xn1b) + (a2 * acc1b);

-            /* Sample 14. 5 cycles */
-            pOut[3 ] = acc2b ;
-            acc7b = b0 * Xn7b + d1b;
+/*  7 */
+          Xn1a = *pIn++; /* Channel a */
+          Xn1b = *pIn++; /* Channel b */

-            pOut[4 ] = acc3a ;
-            d1b = b1 * Xn7b + d2b;
+          acc1a = (b0 * Xn1a) + d1a;
+          acc1b = (b0 * Xn1b) + d1b;

-            pOut[5 ] = acc3b ;
-            d2b = b2 * Xn7b;
+          *pOut++ = acc1a;
+          *pOut++ = acc1b;

-            pOut[6 ] = acc4a ;
-            d1b += a1 * acc7b;
+          d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+          d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;

-            pOut[7 ] = acc4b ;
-            d2b += a2 * acc7b;
+          d2a = (b2 * Xn1a) + (a2 * acc1a);
+          d2b = (b2 * Xn1b) + (a2 * acc1b);

-            /* Sample 15. 5 cycles */
-            pOut[8 ] = acc5a ;
-            acc8a = b0 * Xn8a + d1a;
+/*  8 */
+          Xn1a = *pIn++; /* Channel a */
+          Xn1b = *pIn++; /* Channel b */

-            pOut[9 ] = acc5b;
-            d1a = b1 * Xn8a + d2a;
+          acc1a = (b0 * Xn1a) + d1a;
+          acc1b = (b0 * Xn1b) + d1b;

-            pOut[10] = acc6a;
-            d2a = b2 * Xn8a;
+          *pOut++ = acc1a;
+          *pOut++ = acc1b;

-            pOut[11] = acc6b;
-            d1a += a1 * acc8a;
+          d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+          d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;

-            pOut[12] = acc7a;
-            d2a += a2 * acc8a;
+          d2a = (b2 * Xn1a) + (a2 * acc1a);
+          d2b = (b2 * Xn1b) + (a2 * acc1b);

-            /* Sample 16. 5 cycles */
-            pOut[13] = acc7b;
-            acc8b = b0 * Xn8b + d1b;
-
-            pOut[14] = acc8a;
-            d1b = b1 * Xn8b + d2b;
-
-            pOut[15] = acc8b;
-            d2b = b2 * Xn8b;
-
-            sample--;
-            d1b += a1 * acc8b;
-
-            pOut += 16;
-            d2b += a2 * acc8b;
+          /* decrement loop counter */
+          sample--;
        }

+        /* Loop unrolling: Compute remaining outputs */
        sample = blockSize & 0x7U;
+
+#else
+
+        /* Initialize blkCnt with number of samples */
+        sample = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
        while (sample > 0U) {
-            /* Read the input */
-            Xn1a = *pIn++; //Channel a
-            Xn1b = *pIn++; //Channel b
+          /* Read the input */
+          Xn1a = *pIn++; /* Channel a */
+          Xn1b = *pIn++; /* Channel b */

-            /* y[n] = b0 * x[n] + d1 */
-            acc1a = (b0 * Xn1a) + d1a;
-            acc1b = (b0 * Xn1b) + d1b;
+          /* y[n] = b0 * x[n] + d1 */
+          acc1a = (b0 * Xn1a) + d1a;
+          acc1b = (b0 * Xn1b) + d1b;

-            /* Store the result in the accumulator in the destination buffer. */
-            *pOut++ = acc1a;
-            *pOut++ = acc1b;
+          /* Store the result in the accumulator in the destination buffer. */
+          *pOut++ = acc1a;
+          *pOut++ = acc1b;

-            /* Every time after the output is computed state should be updated. */
-            /* d1 = b1 * x[n] + a1 * y[n] + d2 */
-            d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
-            d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+          /* Every time after the output is computed state should be updated. */
+          /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+          d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+          d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;

-            /* d2 = b2 * x[n] + a2 * y[n] */
-            d2a = (b2 * Xn1a) + (a2 * acc1a);
-            d2b = (b2 * Xn1b) + (a2 * acc1b);
+          /* d2 = b2 * x[n] + a2 * y[n] */
+          d2a = (b2 * Xn1a) + (a2 * acc1a);
+          d2b = (b2 * Xn1b) + (a2 * acc1b);

-            sample--;
+          /* decrement loop counter */
+          sample--;
        }

        /* Store the updated state variables back into the state array */
@ -400,271 +265,21 @@ uint32_t blockSize)
        pState[2] = d1b;
        pState[3] = d2b;

-        /* The current stage input is given as the output to the next stage */
-        pIn = pDst;
-        /* decrement the loop counter */
-        stage--;
-
        pState += 4U;
-        /*Reset the output working pointer */
-        pOut = pDst;
-
-    } while (stage > 0U);
-
-#elif defined(ARM_MATH_CM0_FAMILY)
-
-    /* Run the below code for Cortex-M0 */
-
-    do
-    {
-        /* Reading the coefficients */
-        b0 = *pCoeffs++;
-        b1 = *pCoeffs++;
-        b2 = *pCoeffs++;
-        a1 = *pCoeffs++;
-        a2 = *pCoeffs++;
-
-        /*Reading the state values */
-        d1a = pState[0];
-        d2a = pState[1];
-        d1b = pState[2];
-        d2b = pState[3];
-
-
-        sample = blockSize;
-
-        while (sample > 0U)
-        {
-            /* Read the input */
-            Xn1a = *pIn++; //Channel a
-            Xn1b = *pIn++; //Channel b
-
-            /* y[n] = b0 * x[n] + d1 */
-            acc1a = (b0 * Xn1a) + d1a;
-            acc1b = (b0 * Xn1b) + d1b;
-
-            /* Store the result in the accumulator in the destination buffer. */
-            *pOut++ = acc1a;
-            *pOut++ = acc1b;
-
-            /* Every time after the output is computed state should be updated. */
-            /* d1 = b1 * x[n] + a1 * y[n] + d2 */
-            d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
-            d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
-
-            /* d2 = b2 * x[n] + a2 * y[n] */
-            d2a = (b2 * Xn1a) + (a2 * acc1a);
-            d2b = (b2 * Xn1b) + (a2 * acc1b);
-
-            /* decrement the loop counter */
-            sample--;
-        }
-
-        /* Store the updated state variables back into the state array */
-        *pState++ = d1a;
-        *pState++ = d2a;
-        *pState++ = d1b;
-        *pState++ = d2b;

        /* The current stage input is given as the output to the next stage */
        pIn = pDst;

-        /*Reset the output working pointer */
+        /* Reset the output working pointer */
        pOut = pDst;

-        /* decrement the loop counter */
+        /* Decrement the loop counter */
        stage--;

    } while (stage > 0U);

-#else
-
-    float32_t Xn2a, Xn3a, Xn4a;                          /*  Input State variables     */
-    float32_t Xn2b, Xn3b, Xn4b;                          /*  Input State variables     */
-    float32_t acc2a, acc3a, acc4a;                       /*  accumulator               */
-    float32_t acc2b, acc3b, acc4b;                       /*  accumulator               */
-    float32_t p0a, p1a, p2a, p3a, p4a, A1a;
-    float32_t p0b, p1b, p2b, p3b, p4b, A1b;
-
-    /* Run the below code for Cortex-M4 and Cortex-M3 */
-    do
-    {
-        /* Reading the coefficients */
-        b0 = *pCoeffs++;
-        b1 = *pCoeffs++;
-        b2 = *pCoeffs++;
-        a1 = *pCoeffs++;
-        a2 = *pCoeffs++;
-
-        /*Reading the state values */
-        d1a = pState[0];
-        d2a = pState[1];
-        d1b = pState[2];
-        d2b = pState[3];
-
-        /* Apply loop unrolling and compute 4 output values simultaneously. */
-        sample = blockSize >> 2U;
-
-        /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-        ** a second loop below computes the remaining 1 to 3 samples. */
-        while (sample > 0U) {
-
-            /* y[n] = b0 * x[n] + d1 */
-            /* d1 = b1 * x[n] + a1 * y[n] + d2 */
-            /* d2 = b2 * x[n] + a2 * y[n] */
-
-            /* Read the four inputs */
-            Xn1a = pIn[0];
-            Xn1b = pIn[1];
-            Xn2a = pIn[2];
-            Xn2b = pIn[3];
-            Xn3a = pIn[4];
-            Xn3b = pIn[5];
-            Xn4a = pIn[6];
-            Xn4b = pIn[7];
-            pIn += 8;
-
-            p0a = b0 * Xn1a;
-            p0b = b0 * Xn1b;
-            p1a = b1 * Xn1a;
-            p1b = b1 * Xn1b;
-            acc1a = p0a + d1a;
-            acc1b = p0b + d1b;
-            p0a = b0 * Xn2a;
-            p0b = b0 * Xn2b;
-            p3a = a1 * acc1a;
-            p3b = a1 * acc1b;
-            p2a = b2 * Xn1a;
-            p2b = b2 * Xn1b;
-            A1a = p1a + p3a;
-            A1b = p1b + p3b;
-            p4a = a2 * acc1a;
-            p4b = a2 * acc1b;
-            d1a = A1a + d2a;
-            d1b = A1b + d2b;
-            d2a = p2a + p4a;
-            d2b = p2b + p4b;
-
-            p1a = b1 * Xn2a;
-            p1b = b1 * Xn2b;
-            acc2a = p0a + d1a;
-            acc2b = p0b + d1b;
-            p0a = b0 * Xn3a;
-            p0b = b0 * Xn3b;
-            p3a = a1 * acc2a;
-            p3b = a1 * acc2b;
-            p2a = b2 * Xn2a;
-            p2b = b2 * Xn2b;
-            A1a = p1a + p3a;
-            A1b = p1b + p3b;
-            p4a = a2 * acc2a;
-            p4b = a2 * acc2b;
-            d1a = A1a + d2a;
-            d1b = A1b + d2b;
-            d2a = p2a + p4a;
-            d2b = p2b + p4b;
-
-            p1a = b1 * Xn3a;
-            p1b = b1 * Xn3b;
-            acc3a = p0a + d1a;
-            acc3b = p0b + d1b;
-            p0a = b0 * Xn4a;
-            p0b = b0 * Xn4b;
-            p3a = a1 * acc3a;
-            p3b = a1 * acc3b;
-            p2a = b2 * Xn3a;
-            p2b = b2 * Xn3b;
-            A1a = p1a + p3a;
-            A1b = p1b + p3b;
-            p4a = a2 * acc3a;
-            p4b = a2 * acc3b;
-            d1a = A1a + d2a;
-            d1b = A1b + d2b;
-            d2a = p2a + p4a;
-            d2b = p2b + p4b;
-
-            acc4a = p0a + d1a;
-            acc4b = p0b + d1b;
-            p1a = b1 * Xn4a;
-            p1b = b1 * Xn4b;
-            p3a = a1 * acc4a;
-            p3b = a1 * acc4b;
-            p2a = b2 * Xn4a;
-            p2b = b2 * Xn4b;
-            A1a = p1a + p3a;
-            A1b = p1b + p3b;
-            p4a = a2 * acc4a;
-            p4b = a2 * acc4b;
-            d1a = A1a + d2a;
-            d1b = A1b + d2b;
-            d2a = p2a + p4a;
-            d2b = p2b + p4b;
-
-            pOut[0] = acc1a;
-            pOut[1] = acc1b;
-            pOut[2] = acc2a;
-            pOut[3] = acc2b;
-            pOut[4] = acc3a;
-            pOut[5] = acc3b;
-            pOut[6] = acc4a;
-            pOut[7] = acc4b;
-            pOut += 8;
-
-            sample--;
-        }
-
-        sample = blockSize & 0x3U;
-        while (sample > 0U) {
-            Xn1a = *pIn++;
-            Xn1b = *pIn++;
-
-            p0a = b0 * Xn1a;
-            p0b = b0 * Xn1b;
-            p1a = b1 * Xn1a;
-            p1b = b1 * Xn1b;
-            acc1a = p0a + d1a;
-            acc1b = p0b + d1b;
-            p3a = a1 * acc1a;
-            p3b = a1 * acc1b;
-            p2a = b2 * Xn1a;
-            p2b = b2 * Xn1b;
-            A1a = p1a + p3a;
-            A1b = p1b + p3b;
-            p4a = a2 * acc1a;
-            p4b = a2 * acc1b;
-            d1a = A1a + d2a;
-            d1b = A1b + d2b;
-            d2a = p2a + p4a;
-            d2b = p2b + p4b;
-
-            *pOut++ = acc1a;
-            *pOut++ = acc1b;
-
-            sample--;
-        }
-
-        /* Store the updated state variables back into the state array */
-        *pState++ = d1a;
-        *pState++ = d2a;
-        *pState++ = d1b;
-        *pState++ = d2b;
-
-        /* The current stage input is given as the output to the next stage */
-        pIn = pDst;
-
-        /*Reset the output working pointer */
-        pOut = pDst;
-
-        /* decrement the loop counter */
-        stage--;
-
-    } while (stage > 0U);
-
-#endif
-
 }
 LOW_OPTIMIZATION_EXIT
-
 /**
-   * @} end of BiquadCascadeDF2T group
-   */
+  @} end of BiquadCascadeDF2T group
+ */
--- a/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f32.c
+++ b/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f32.c
@ -3,13 +3,13 @@
 * Title:        arm_biquad_cascade_stereo_df2T_init_f32.c
 * Description:  Initialization function for floating-point transposed direct form II Biquad cascade filter
 *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@ -29,47 +29,44 @@
 #include "arm_math.h"

 /**
- * @ingroup groupFilters
+  @ingroup groupFilters
 */

 /**
- * @addtogroup BiquadCascadeDF2T
- * @{
+  @addtogroup BiquadCascadeDF2T
+  @{
 */

 /**
- * @brief  Initialization function for the floating-point transposed direct form II Biquad cascade filter.
- * @param[in,out] *S           points to an instance of the filter data structure.
- * @param[in]     numStages    number of 2nd order stages in the filter.
- * @param[in]     *pCoeffs     points to the filter coefficients.
- * @param[in]     *pState      points to the state buffer.
- * @return        none
- *
- * <b>Coefficient and State Ordering:</b>
- * \par
- * The coefficients are stored in the array <code>pCoeffs</code> in the following order:
- * <pre>
- *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
- * </pre>
- *
- * \par
- * where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
- * <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
- * and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
- *
- * \par
- * The <code>pState</code> is a pointer to state array.
- * Each Biquad stage has 2 state variables <code>d1,</code> and <code>d2</code> for each channel.
- * The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
- * The state array has a total length of <code>2*numStages</code> values.
- * The state variables are updated after each block of data is processed; the coefficients are untouched.
+  @brief         Initialization function for the floating-point transposed direct form II Biquad cascade filter.
+  @param[in,out] S           points to an instance of the filter data structure.
+  @param[in]     numStages   number of 2nd order stages in the filter.
+  @param[in]     pCoeffs     points to the filter coefficients.
+  @param[in]     pState      points to the state buffer.
+  @return        none
+
+  @par           Coefficient and State Ordering
+                   The coefficients are stored in the array <code>pCoeffs</code> in the following order:
+  <pre>
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  </pre>
+  @par
+                   where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
+                   <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
+                   and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
+  @par
+                   The <code>pState</code> is a pointer to state array.
+                   Each Biquad stage has 2 state variables <code>d1,</code> and <code>d2</code> for each channel.
+                   The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
+                   The state array has a total length of <code>2*numStages</code> values.
+                   The state variables are updated after each block of data is processed; the coefficients are untouched.
 */

 void arm_biquad_cascade_stereo_df2T_init_f32(
-  arm_biquad_cascade_stereo_df2T_instance_f32 * S,
-  uint8_t numStages,
-  float32_t * pCoeffs,
-  float32_t * pState)
+        arm_biquad_cascade_stereo_df2T_instance_f32 * S,
+        uint8_t numStages,
+  const float32_t * pCoeffs,
+        float32_t * pState)
 {
  /* Assign filter stages */
  S->numStages = numStages;
@ -85,5 +82,5 @@ void arm_biquad_cascade_stereo_df2T_init_f32(
 }

 /**
- * @} end of BiquadCascadeDF2T group
+  @} end of BiquadCascadeDF2T group
 */
--- a/Show more
+++ b/Show more