/ Drivers / CMSIS / DSP / Source / FilteringFunctions / arm_fir_f32.c
arm_fir_f32.c
   1  /* ----------------------------------------------------------------------
   2   * Project:      CMSIS DSP Library
   3   * Title:        arm_fir_f32.c
   4   * Description:  Floating-point FIR filter processing function
   5   *
   6   * $Date:        23 April 2021
   7   * $Revision:    V1.9.0
   8   *
   9   * Target Processor: Cortex-M and Cortex-A cores
  10   * -------------------------------------------------------------------- */
  11  /*
  12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  13   *
  14   * SPDX-License-Identifier: Apache-2.0
  15   *
  16   * Licensed under the Apache License, Version 2.0 (the License); you may
  17   * not use this file except in compliance with the License.
  18   * You may obtain a copy of the License at
  19   *
  20   * www.apache.org/licenses/LICENSE-2.0
  21   *
  22   * Unless required by applicable law or agreed to in writing, software
  23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25   * See the License for the specific language governing permissions and
  26   * limitations under the License.
  27   */
  28  
  29  #include "dsp/filtering_functions.h"
  30  
  31  /**
  32    @ingroup groupFilters
  33   */
  34  
  35  /**
  36    @defgroup FIR Finite Impulse Response (FIR) Filters
  37  
  38    This set of functions implements Finite Impulse Response (FIR) filters
  39    for Q7, Q15, Q31, and floating-point data types.  Fast versions of Q15 and Q31 are also provided.
  40    The functions operate on blocks of input and output data and each call to the function processes
  41    <code>blockSize</code> samples through the filter.  <code>pSrc</code> and
   42    <code>pDst</code> point to the input and output arrays, each containing <code>blockSize</code> values.
  43  
  44    @par           Algorithm
  45                     The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations.
  46                     Each filter coefficient <code>b[n]</code> is multiplied by a state variable which equals a previous input sample <code>x[n]</code>.
  47    <pre>
  48        y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
  49    </pre>
  50    @par
  51                     \image html FIR.GIF "Finite Impulse Response filter"
  52    @par
  53                     <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
  54                     Coefficients are stored in time reversed order.
  55    @par
  56    <pre>
   57        {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
  58    </pre>
  59    @par
  60                     <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
  61                     Samples in the state buffer are stored in the following order.
  62    @par
  63    <pre>
   64        {x[n-numTaps+1], x[n-numTaps+2], x[n-numTaps+3], ..., x[n-1], x[n](==pSrc[0]), x[n+1](==pSrc[1]), ..., x[n+blockSize-1](==pSrc[blockSize-1])}
  65    </pre>
  66    @par
  67                     Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code>.
  68                     The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters,
  69                     to be avoided and yields a significant speed improvement.
  70                     The state variables are updated after each block of data is processed; the coefficients are untouched.
  71  
  72    @par           Instance Structure
  73                     The coefficients and state variables for a filter are stored together in an instance data structure.
  74                     A separate instance structure must be defined for each filter.
  75                     Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
  76                     There are separate instance structure declarations for each of the 4 supported data types.
  77  
  78    @par           Initialization Functions
  79                     There is also an associated initialization function for each data type.
  80                     The initialization function performs the following operations:
  81                     - Sets the values of the internal structure fields.
  82                     - Zeros out the values in the state buffer.
   83                     To do this manually without calling the init function, assign the following subfields of the instance structure:
  84                     numTaps, pCoeffs, pState. Also set all of the values in pState to zero.
  85    @par
  86                     Use of the initialization function is optional.
  87                     However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
  88                     To place an instance structure into a const data section, the instance structure must be manually initialized.
  89                     Set the values in the state buffer to zeros before static initialization.
  90                     The code below statically initializes each of the 4 different data type filter instance structures
  91    <pre>
  92        arm_fir_instance_f32 S = {numTaps, pState, pCoeffs};
  93        arm_fir_instance_q31 S = {numTaps, pState, pCoeffs};
  94        arm_fir_instance_q15 S = {numTaps, pState, pCoeffs};
  95        arm_fir_instance_q7 S =  {numTaps, pState, pCoeffs};
  96    </pre>
  97                     where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer;
  98                     <code>pCoeffs</code> is the address of the coefficient buffer.
  99    @par          Initialization of Helium version
 100                   For Helium version the array of coefficients must be padded with zero to contain
 101                   a full number of lanes.
 102  
 103                   The array length L must be a multiple of x. L = x * a :
 104                   - x is 4  for f32
 105                   - x is 4  for q31
 106                   - x is 4  for f16 (so managed like the f32 version and not like the q15 one)
 107                   - x is 8  for q15
 108                   - x is 16 for q7
 109  
 110                   The additional coefficients 
 111                   (x * a - numTaps) must be set to 0.
 112                   numTaps is still set to its right value in the init function. It means that
  113                   the implementation may need to read more coefficients than numTaps because of the vectorization and
 114                   to avoid having to manage too many different cases in the code.
 115  
 116                  
 117    @par          Helium state buffer
 118                   The state buffer must contain some additional temporary data
 119                   used during the computation but which is not the state of the FIR.
 120                   The first A samples are temporary data.
 121                   The remaining samples are the state of the FIR filter.
 122    @par                 
 123                   So the state buffer has size <code> numTaps + A + blockSize - 1 </code> :
 124                   - A is blockSize for f32
 125                   - A is 8*ceil(blockSize/8) for f16
 126                   - A is 8*ceil(blockSize/4) for q31
 127                   - A is 0 for other datatypes (q15 and q7)
 128  
 129  
 130    @par           Fixed-Point Behavior
 131                     Care must be taken when using the fixed-point versions of the FIR filter functions.
 132                     In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
 133                     Refer to the function specific documentation below for usage guidelines.
 134  
 135   */
 136  
 137  /**
 138    @addtogroup FIR
 139    @{
 140   */
 141  
 142  /**
 143    @brief         Processing function for floating-point FIR filter.
 144    @param[in]     S          points to an instance of the floating-point FIR filter structure
 145    @param[in]     pSrc       points to the block of input data
 146    @param[out]    pDst       points to the block of output data
 147    @param[in]     blockSize  number of samples to process
 148    @return        none
 149   */
 150  
 151  #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 152  
 153  #define FIR_F32_MAX_COEF_BLK        8
 154  
 155  #define FIR_F32_CORE(pSamples, c, NB_TAPS)                                 \
 156          vecAcc0 = vdupq_n_f32(0.0f);                                       \
 157          for (int i = 0; i < NB_TAPS; i++) {                                \
 158              vecIn0 = vld1q(&pSamples[i]);                                  \
 159              vecAcc0 = vfmaq(vecAcc0, vecIn0, c[i]);                        \
 160          }
 161  
 162  
 163  #define NB_TAPS 4
 164  __STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, 
 165    const float32_t * __restrict pSrc, 
 166    float32_t * __restrict pDst, uint32_t blockSize)
 167  {
 168      float32_t *pRefStatePtr = S->pState + blockSize;
 169      float32_t      *pState = pRefStatePtr; /* State pointer */
 170      const float32_t *pCoeffs = S->pCoeffs;      /* Coefficient pointer */
 171      float32_t      *pStateCur;  /* Points to the current sample of the state */
 172      const float32_t *pSamples;  /* Temporary pointer to the sample buffer */
 173      float32_t      *pOutput;    /* Temporary pointer to the output buffer */
 174      const float32_t *pTempSrc;  /* Temporary pointer to the source data */
 175      float32_t      *pTempDest;  /* Temporary pointer to the destination buffer */
 176      uint32_t        numTaps = S->numTaps;       /* Number of filter coefficients in the filter */
 177      int32_t         blkCnt;
 178      float32x4_t         vecIn0;
 179      float32x4_t         vecAcc0;
 180      float32_t       c[NB_TAPS];
 181      const float32_t *pCoeffsCur = pCoeffs;
 182  
 183      /*
 184       * pState points to state array which contains previous frame (numTaps - 1) samples
 185       * pStateCur points to the location where the new input data should be written
 186       */
 187      pStateCur = &(pState[(numTaps - 1u)]);
 188      pTempSrc = pSrc;
 189  
 190      pSamples = pState;
 191      pOutput = pDst;
 192  
 193      for (int i = 0; i < NB_TAPS; i++)
 194          c[i] = *pCoeffsCur++;
 195  
 196      blkCnt = blockSize >> 2;
 197      while (blkCnt > 0) {
 198          /*
 199           * Save 4 input samples in the history buffer
 200           */
 201          vst1q(pStateCur, vld1q(pTempSrc));
 202          pStateCur += 4;
 203          pTempSrc += 4;
 204  
 205          FIR_F32_CORE(pSamples, c, NB_TAPS);
 206  
 207          vst1q(pOutput, vecAcc0);
 208  
 209          pOutput += 4;
 210          pSamples += 4;
 211  
 212          blkCnt--;
 213      }
 214  
 215      blkCnt = blockSize & 3;
 216      if (blkCnt)
 217      {
 218          mve_pred16_t    p0 = vctp32q(blkCnt);
 219  
 220          vst1q(pStateCur, vld1q(pTempSrc));
 221          pStateCur += 4;
 222          pTempSrc += 4;
 223  
 224          FIR_F32_CORE(pSamples, c, NB_TAPS);
 225  
 226          vstrwq_p_f32(pOutput, vecAcc0, p0);
 227      }
 228  
 229      /*
 230       * Copy the samples back into the history buffer start
 231       */
 232      pTempSrc = &pState[blockSize];
 233      pTempDest = pState;
 234  
 235      blkCnt = numTaps - 1;
 236      do {
 237          mve_pred16_t    p = vctp32q(blkCnt);
 238  
 239          vstrwq_p_f32(pTempDest, vldrwq_z_f32(pTempSrc, p), p);
 240          pTempSrc += 4;
 241          pTempDest += 4;
 242          blkCnt -= 4;
 243      }
 244      while (blkCnt > 0);
 245  }
 246  #undef NB_TAPS
 247  
 248  __STATIC_INLINE void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, 
 249    const float32_t * __restrict pSrc, 
 250    float32_t * __restrict pDst, uint32_t blockSize)
 251  {
 252      float32_t *pRefStatePtr = S->pState + blockSize;
 253      float32_t *pState = pRefStatePtr;      /* State pointer */
 254      const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
 255      const float32_t *pSamples;          /* Temporary pointer to the sample buffer */
 256      const float32_t *pTempSrc;          /* Temporary pointer to the source data */
 257      float32_t *pTempDest;               /* Temporary pointer to the destination buffer */
 258      uint32_t  numTaps = S->numTaps;     /* Number of filter coefficients in the filter */
 259      int32_t  blkCnt;
 260      float32_t c0, c1, c2, c3;
 261      float32_t c4, c5, c6, c7;
 262  
 263  
 264      pTempSrc = pSrc;
 265      pTempDest = &(pState[(numTaps - 1u)]);
 266      int cnt = blockSize;
 267      do {
 268          mve_pred16_t p0 = vctp32q(cnt);
 269          vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
 270          pTempDest += 4;
 271          pTempSrc += 4;
 272          cnt -= 4;
 273      } while(cnt > 0);
 274  
 275  
 276  
 277      pSamples = pState;
 278      c0 = *pCoeffs++;
 279      c1 = *pCoeffs++;
 280      c2 = *pCoeffs++;
 281      c3 = *pCoeffs++;
 282      c4 = *pCoeffs++;
 283      c5 = *pCoeffs++;
 284      c6 = *pCoeffs++;
 285      c7 = *pCoeffs++;
 286  
 287      cnt = blockSize >> 2;
 288      while(cnt > 0) 
 289      {
 290          float32x4_t vecAcc0;
 291          float32x4_t vecIn0;
 292  
 293          vecIn0 = vld1q(pSamples);
 294          vecAcc0 = vmulq(vecIn0, c0);
 295          vecIn0 = vld1q(&pSamples[1]);
 296          vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
 297          vecIn0 = vld1q(&pSamples[2]);
 298          vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
 299          vecIn0 = vld1q(&pSamples[3]);
 300          vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
 301          vecIn0 = vld1q(&pSamples[4]);
 302          vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
 303          vecIn0 = vld1q(&pSamples[5]);
 304          vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
 305          vecIn0 = vld1q(&pSamples[6]);
 306          vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
 307          vecIn0 = vld1q(&pSamples[7]);
 308          vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
 309          pSamples += 4;
 310          vst1q(pDst, vecAcc0);
 311          cnt--;
 312          pDst += 4;
 313      }
 314  
 315      cnt = blockSize & 3;
 316      if (cnt > 0) 
 317      {
 318          float32x4_t vecAcc0;
 319          float32x4_t vecIn0;
 320  
 321          mve_pred16_t p0 = vctp32q(cnt);
 322  
 323          vecIn0 = vld1q(pSamples);
 324          vecAcc0 = vmulq(vecIn0, c0);
 325          vecIn0 = vld1q(&pSamples[1]);
 326          vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
 327          vecIn0 = vld1q(&pSamples[2]);
 328          vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
 329          vecIn0 = vld1q(&pSamples[3]);
 330          vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
 331          vecIn0 = vld1q(&pSamples[4]);
 332          vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
 333          vecIn0 = vld1q(&pSamples[5]);
 334          vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
 335          vecIn0 = vld1q(&pSamples[6]);
 336          vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
 337          vecIn0 = vld1q(&pSamples[7]);
 338          vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
 339          vstrwq_p_f32(pDst, vecAcc0,p0);
 340      }
 341  
 342  
 343      /*
 344       * Copy the samples back into the history buffer start
 345       */
 346      pTempSrc = &pState[blockSize];
 347      pTempDest = pState;
 348      blkCnt = numTaps;
 349      while (blkCnt > 0)
 350      {
 351          *pTempDest++ = *pTempSrc++;
 352          blkCnt--;
 353      }
 354  }
 355  
 356  
 357  
 358  void arm_fir_f32(
 359  const arm_fir_instance_f32 * S,
 360  const float32_t * pSrc,
 361  float32_t * pDst,
 362  uint32_t blockSize)
 363  {
 364      /* 
 365         S->pState is the arm_fir_partial_accu
 366         S->pState + blockSize is the FIR state
 367      */
 368      float32_t *pRefStatePtr = S->pState + blockSize;
 369      float32_t *pState = pRefStatePtr ;      /* State pointer */
 370      const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
 371      const float32_t *pSamples;          /* Temporary pointer to the sample buffer */
 372      float32_t *pOutput;                 /* Temporary pointer to the output buffer */
 373      const float32_t *pTempSrc;          /* Temporary pointer to the source data */
 374      float32_t *pTempDest;               /* Temporary pointer to the destination buffer */
 375      uint32_t  numTaps = S->numTaps;     /* Number of filter coefficients in the filter */
 376      uint32_t  blkCnt;
 377      float32_t c0, c1, c2, c3;
 378      float32_t c4, c5, c6, c7;
 379  
 380      /*
 381       * [1 to 8 taps] specialized routines
 382       */
 383      if (numTaps <= 4)
 384      {
 385          arm_fir_f32_1_4_mve(S, pSrc, pDst, blockSize);
 386          return;
 387      }
 388      else if (numTaps <= 8)
 389      {
 390          arm_fir_f32_5_8_mve(S, pSrc, pDst, blockSize);
 391          return;
 392      }
 393  
 394      pTempSrc = pSrc;
 395      pTempDest = &(pState[(numTaps - 1u)]);
 396      int cnt = blockSize;
 397      do {
 398          mve_pred16_t p0 = vctp32q(cnt);
 399          vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
 400          pTempDest += 4;
 401          pTempSrc += 4;
 402          cnt -= 4;
 403      } while(cnt > 0);
 404  
 405      float32_t *partial_accu_ptr = S->pState;
 406  
 407      pSamples = pState;
 408      c0 = *pCoeffs++;
 409      c1 = *pCoeffs++;
 410      c2 = *pCoeffs++;
 411      c3 = *pCoeffs++;
 412      c4 = *pCoeffs++;
 413      c5 = *pCoeffs++;
 414      c6 = *pCoeffs++;
 415      c7 = *pCoeffs++;
 416  
 417      cnt = blockSize >> 2;
 418      while(cnt > 0) {
 419          float32x4_t vecAcc0;
 420          float32x4_t vecIn0;
 421  
 422          vecIn0 = vld1q(pSamples);
 423          vecAcc0 = vmulq(vecIn0, c0);
 424          vecIn0 = vld1q(&pSamples[1]);
 425          vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
 426          vecIn0 = vld1q(&pSamples[2]);
 427          vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
 428          vecIn0 = vld1q(&pSamples[3]);
 429          vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
 430          vecIn0 = vld1q(&pSamples[4]);
 431          vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
 432          vecIn0 = vld1q(&pSamples[5]);
 433          vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
 434          vecIn0 = vld1q(&pSamples[6]);
 435          vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
 436          vecIn0 = vld1q(&pSamples[7]);
 437          vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
 438          pSamples += 4;
 439          vst1q(partial_accu_ptr, vecAcc0);
 440          cnt--;
 441          partial_accu_ptr += 4;
 442      }
 443  
 444      cnt = blockSize & 3;
 445      if (cnt > 0) 
 446      {
 447          float32x4_t vecAcc0;
 448          float32x4_t vecIn0;
 449  
 450          mve_pred16_t p0 = vctp32q(cnt);
 451  
 452          vecIn0 = vld1q(pSamples);
 453          vecAcc0 = vmulq(vecIn0, c0);
 454          vecIn0 = vld1q(&pSamples[1]);
 455          vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
 456          vecIn0 = vld1q(&pSamples[2]);
 457          vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
 458          vecIn0 = vld1q(&pSamples[3]);
 459          vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
 460          vecIn0 = vld1q(&pSamples[4]);
 461          vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
 462          vecIn0 = vld1q(&pSamples[5]);
 463          vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
 464          vecIn0 = vld1q(&pSamples[6]);
 465          vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
 466          vecIn0 = vld1q(&pSamples[7]);
 467          vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
 468          vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
 469      }
 470  
 471      int localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
 472      int sample_offset = FIR_F32_MAX_COEF_BLK;
 473      while (localTaps > FIR_F32_MAX_COEF_BLK) {
 474          c0 = *pCoeffs++;
 475          c1 = *pCoeffs++;
 476          c2 = *pCoeffs++;
 477          c3 = *pCoeffs++;
 478          c4 = *pCoeffs++;
 479          c5 = *pCoeffs++;
 480          c6 = *pCoeffs++;
 481          c7 = *pCoeffs++;
 482  
 483          partial_accu_ptr = S->pState;
 484          pSamples = pState + sample_offset;
 485          int cnt = blockSize >> 2;
 486          while(cnt > 0) {
 487              float32x4_t vecAcc0;
 488              float32x4_t vecIn0;
 489  
 490              vecIn0 = vld1q(pSamples);
 491              vecAcc0 = vmulq(vecIn0, c0);
 492              vecIn0 = vld1q(&pSamples[1]);
 493              vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
 494              vecIn0 = vld1q(&pSamples[2]);
 495              vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
 496              vecIn0 = vld1q(&pSamples[3]);
 497              vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
 498              vecIn0 = vld1q(&pSamples[4]);
 499              vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
 500              vecIn0 = vld1q(&pSamples[5]);
 501              vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
 502              vecIn0 = vld1q(&pSamples[6]);
 503              vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
 504              vecIn0 = vld1q(&pSamples[7]);
 505              vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
 506              pSamples += 4;
 507              vecAcc0 += vld1q_f32(partial_accu_ptr);
 508              vst1q(partial_accu_ptr, vecAcc0);
 509              cnt--;
 510              partial_accu_ptr += 4;
 511          }
 512  
 513          cnt = blockSize & 3;
 514          if (cnt > 0) {
 515              float32x4_t vecAcc0;
 516              float32x4_t vecIn0;
 517  
 518              mve_pred16_t p0 = vctp32q(cnt);
 519  
 520              vecIn0 = vld1q(pSamples);
 521              vecAcc0 = vmulq(vecIn0, c0);
 522              vecIn0 = vld1q(&pSamples[1]);
 523              vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
 524              vecIn0 = vld1q(&pSamples[2]);
 525              vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
 526              vecIn0 = vld1q(&pSamples[3]);
 527              vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
 528              vecIn0 = vld1q(&pSamples[4]);
 529              vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
 530              vecIn0 = vld1q(&pSamples[5]);
 531              vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
 532              vecIn0 = vld1q(&pSamples[6]);
 533              vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
 534              vecIn0 = vld1q(&pSamples[7]);
 535              vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
 536              vecAcc0 += vld1q_f32(partial_accu_ptr);
 537              vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
 538          }
 539  
 540          localTaps -= FIR_F32_MAX_COEF_BLK;
 541          sample_offset += FIR_F32_MAX_COEF_BLK;
 542      }
 543  
 544      pSamples = pState + sample_offset;
 545  
 546      if (localTaps > 4) {
 547          c0 = *pCoeffs++;
 548          c1 = *pCoeffs++;
 549          c2 = *pCoeffs++;
 550          c3 = *pCoeffs++;
 551          c4 = *pCoeffs++;
 552          c5 = *pCoeffs++;
 553          c6 = *pCoeffs++;
 554          c7 = *pCoeffs++;
 555          pOutput = pDst;
 556  
 557          partial_accu_ptr = S->pState;
 558          cnt = blockSize  >> 2;
 559          while(cnt > 0) {
 560              float32x4_t vecAcc0;
 561              float32x4_t vecIn0;
 562  
 563              vecIn0 = vld1q(pSamples);
 564              vecAcc0 = vmulq(vecIn0, c0);
 565              vecIn0 = vld1q(&pSamples[1]);
 566              vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
 567              vecIn0 = vld1q(&pSamples[2]);
 568              vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
 569              vecIn0 = vld1q(&pSamples[3]);
 570              vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
 571              vecIn0 = vld1q(&pSamples[4]);
 572              vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
 573              vecIn0 = vld1q(&pSamples[5]);
 574              vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
 575              vecIn0 = vld1q(&pSamples[6]);
 576              vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
 577              vecIn0 = vld1q(&pSamples[7]);
 578              vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
 579              pSamples += 4;
 580              float32x4_t pap = vld1q_f32(partial_accu_ptr);
 581              vst1q(pOutput, vecAcc0+pap);
 582              cnt--;
 583              partial_accu_ptr += 4;
 584              pOutput += 4;
 585          }
 586  
 587          cnt = blockSize  & 3;
 588          if (cnt > 0) {
 589              float32x4_t vecAcc0;
 590              float32x4_t vecIn0;
 591  
 592              mve_pred16_t p0 = vctp32q(cnt);
 593  
 594              vecIn0 = vld1q(pSamples);
 595              vecAcc0 = vmulq(vecIn0, c0);
 596              vecIn0 = vld1q(&pSamples[1]);
 597              vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
 598              vecIn0 = vld1q(&pSamples[2]);
 599              vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
 600              vecIn0 = vld1q(&pSamples[3]);
 601              vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
 602              vecIn0 = vld1q(&pSamples[4]);
 603              vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
 604              vecIn0 = vld1q(&pSamples[5]);
 605              vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
 606              vecIn0 = vld1q(&pSamples[6]);
 607              vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
 608              vecIn0 = vld1q(&pSamples[7]);
 609              vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
 610              float32x4_t pap = vld1q_f32(partial_accu_ptr);
 611              vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
 612              pOutput += cnt;
 613          }
 614      }
 615      else {
 616          c0 = *pCoeffs++;
 617          c1 = *pCoeffs++;
 618          c2 = *pCoeffs++;
 619          c3 = *pCoeffs++;
 620          pOutput = pDst;
 621  
 622          partial_accu_ptr = S->pState;
 623          cnt = blockSize >> 2;
 624          while(cnt > 0) {
 625              float32x4_t vecAcc0;
 626              float32x4_t vecIn0;
 627  
 628              vecIn0 = vld1q(pSamples);
 629              vecAcc0 = vmulq(vecIn0, c0);
 630              vecIn0 = vld1q(&pSamples[1]);
 631              vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
 632              vecIn0 = vld1q(&pSamples[2]);
 633              vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
 634              vecIn0 = vld1q(&pSamples[3]);
 635              vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
 636              pSamples += 4;
 637              float32x4_t pap = vld1q_f32(partial_accu_ptr);
 638              vst1q(pOutput, vecAcc0+pap);
 639              cnt--;
 640              partial_accu_ptr += 4;
 641              pOutput += 4;
 642          }
 643  
 644          cnt = blockSize & 3;
 645          if (cnt > 0) {
 646              float32x4_t vecAcc0;
 647              float32x4_t vecIn0;
 648  
 649              mve_pred16_t p0 = vctp32q(cnt);
 650  
 651              vecIn0 = vld1q(pSamples);
 652              vecAcc0 = vmulq(vecIn0, c0);
 653              vecIn0 = vld1q(&pSamples[1]);
 654              vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
 655              vecIn0 = vld1q(&pSamples[2]);
 656              vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
 657              vecIn0 = vld1q(&pSamples[3]);
 658              vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
 659              float32x4_t pap = vld1q_f32(partial_accu_ptr);
 660              vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
 661              pOutput += cnt;
 662          }
 663      }
 664  
 665      /*
 666       * Copy the samples back into the history buffer start
 667       */
 668      pTempSrc = &pRefStatePtr[blockSize];
 669      pTempDest = pRefStatePtr;
 670  
 671      blkCnt = numTaps >> 2;
 672      while (blkCnt > 0)
 673      {
 674          vst1q(pTempDest, vld1q(pTempSrc));
 675          pTempSrc += 4;
 676          pTempDest += 4;
 677          blkCnt--;
 678      }
 679      blkCnt = numTaps & 3;
 680      if (blkCnt > 0)
 681      {
 682          mve_pred16_t p0 = vctp32q(blkCnt);
 683          vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
 684      }
 685  }
 686  
 687  #else
 688  #if defined(ARM_MATH_NEON)
 689  
 690  void arm_fir_f32(
 691  const arm_fir_instance_f32 * S,
 692  const float32_t * pSrc,
 693  float32_t * pDst,
 694  uint32_t blockSize)
 695  {
 696     float32_t *pState = S->pState;                 /* State pointer */
 697     const float32_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
 698     float32_t *pStateCurnt;                        /* Points to the current sample of the state */
 699     float32_t *px;                                 /* Temporary pointers for state buffer */
 700     const float32_t *pb;                           /* Temporary pointers for coefficient buffer */
 701     uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
 702     uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
 703  
 704     float32x4_t accv0,accv1,samples0,samples1,x0,x1,x2,xa,xb,b;
 705     float32_t acc;
 706  
 707     /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
 708     /* pStateCurnt points to the location where the new input data should be written */
 709     pStateCurnt = &(S->pState[(numTaps - 1U)]);
 710  
 711     /* Loop unrolling */
 712     blkCnt = blockSize >> 3;
 713  
 714     while (blkCnt > 0U)
 715     {
 716        /* Copy 8 samples at a time into state buffers */
 717        samples0 = vld1q_f32(pSrc);
 718        vst1q_f32(pStateCurnt,samples0);
 719  
 720        pStateCurnt += 4;
 721        pSrc += 4 ;
 722  
 723        samples1 = vld1q_f32(pSrc);
 724        vst1q_f32(pStateCurnt,samples1);
 725  
 726        pStateCurnt += 4;
 727        pSrc += 4 ;
 728  
 729        /* Set the accumulators to zero */
 730        accv0 = vdupq_n_f32(0);
 731        accv1 = vdupq_n_f32(0);
 732  
 733        /* Initialize state pointer */
 734        px = pState;
 735  
 736        /* Initialize coefficient pointer */
 737        pb = pCoeffs;
 738  
 739        /* Loop unroling */
 740        i = numTaps >> 2;
 741  
 742        /* Perform the multiply-accumulates */
 743        x0 = vld1q_f32(px);
 744        x1 = vld1q_f32(px + 4);
 745  
 746        while(i > 0)
 747        {
 748           /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
 749           x2 = vld1q_f32(px + 8);
 750           b = vld1q_f32(pb);
 751           xa = x0;
 752           xb = x1;
 753           accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 0));
 754           accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 0));
 755  
 756           xa = vextq_f32(x0,x1,1);
 757           xb = vextq_f32(x1,x2,1);
 758  
 759           accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 1));
 760           accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 1));
 761  
 762           xa = vextq_f32(x0,x1,2);
 763           xb = vextq_f32(x1,x2,2);
 764  
 765           accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 2));
 766           accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 2));
 767  
 768           xa = vextq_f32(x0,x1,3);
 769           xb = vextq_f32(x1,x2,3);
 770  
 771           accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 3));
 772           accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 3));
 773  
 774           pb += 4;
 775           x0 = x1;
 776           x1 = x2;
 777           px += 4;
 778           i--;
 779  
 780        }
 781  
 782        /* Tail */
 783        i = numTaps & 3;
 784        x2 = vld1q_f32(px + 8);
 785  
 786        /* Perform the multiply-accumulates */
 787        switch(i)
 788        {
 789           case 3:
 790           {
 791             accv0 = vmlaq_n_f32(accv0,x0,*pb);
 792             accv1 = vmlaq_n_f32(accv1,x1,*pb);
 793  
 794             pb++;
 795  
 796             xa = vextq_f32(x0,x1,1);
 797             xb = vextq_f32(x1,x2,1);
 798  
 799             accv0 = vmlaq_n_f32(accv0,xa,*pb);
 800             accv1 = vmlaq_n_f32(accv1,xb,*pb);
 801  
 802             pb++;
 803  
 804             xa = vextq_f32(x0,x1,2);
 805             xb = vextq_f32(x1,x2,2);
 806  
 807             accv0 = vmlaq_n_f32(accv0,xa,*pb);
 808             accv1 = vmlaq_n_f32(accv1,xb,*pb);
 809  
 810           }
 811           break;
 812           case 2:
 813           {
 814             accv0 = vmlaq_n_f32(accv0,x0,*pb);
 815             accv1 = vmlaq_n_f32(accv1,x1,*pb);
 816  
 817             pb++;
 818  
 819             xa = vextq_f32(x0,x1,1);
 820             xb = vextq_f32(x1,x2,1);
 821  
 822             accv0 = vmlaq_n_f32(accv0,xa,*pb);
 823             accv1 = vmlaq_n_f32(accv1,xb,*pb);
 824  
 825           }
 826           break;
 827           case 1:
 828           {
 829  
 830             accv0 = vmlaq_n_f32(accv0,x0,*pb);
 831             accv1 = vmlaq_n_f32(accv1,x1,*pb);
 832  
 833           }
 834           break;
 835           default:
 836           break;
 837        }
 838  
 839        /* The result is stored in the destination buffer. */
 840        vst1q_f32(pDst,accv0);
 841        pDst += 4;
 842        vst1q_f32(pDst,accv1);
 843        pDst += 4;
 844  
 845        /* Advance state pointer by 8 for the next 8 samples */
 846        pState = pState + 8;
 847  
 848        blkCnt--;
 849     }
 850  
 851     /* Tail */
 852     blkCnt = blockSize & 0x7;
 853  
 854     while (blkCnt > 0U)
 855     {
 856        /* Copy one sample at a time into state buffer */
 857        *pStateCurnt++ = *pSrc++;
 858  
 859        /* Set the accumulator to zero */
 860        acc = 0.0f;
 861  
 862        /* Initialize state pointer */
 863        px = pState;
 864  
 865        /* Initialize Coefficient pointer */
 866        pb = pCoeffs;
 867  
 868        i = numTaps;
 869  
 870        /* Perform the multiply-accumulates */
 871        do
 872        {
 873           /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
 874           acc += *px++ * *pb++;
 875           i--;
 876  
 877        } while (i > 0U);
 878  
 879        /* The result is stored in the destination buffer. */
 880        *pDst++ = acc;
 881  
 882        /* Advance state pointer by 1 for the next sample */
 883        pState = pState + 1;
 884  
 885        blkCnt--;
 886     }
 887  
 888     /* Processing is complete.
 889     ** Now copy the last numTaps - 1 samples to the starting of the state buffer.
 890     ** This prepares the state buffer for the next function call. */
 891  
 892     /* Points to the start of the state buffer */
 893     pStateCurnt = S->pState;
 894  
 895     /* Copy numTaps - 1 values */
 896     tapCnt = numTaps - 1U;
 897  
 898     /* Copy data */
 899     while (tapCnt > 0U)
 900     {
 901        *pStateCurnt++ = *pState++;
 902  
 903        /* Decrement the loop counter */
 904        tapCnt--;
 905     }
 906  
 907  }
 908  #else
 909  void arm_fir_f32(
 910    const arm_fir_instance_f32 * S,
 911    const float32_t * pSrc,
 912          float32_t * pDst,
 913          uint32_t blockSize)
 914  {
      /*
       * Plain C implementation, compiled from the #else branch of the
       * ARM_MATH_NEON conditional.
       *
       * For each of the blockSize input samples read from pSrc, one output is
       * written to pDst:
       *
       *    pDst[n] = sum over k in [0, numTaps) of pCoeffs[k] * state[n + k]
       *
       * where state[] is the S->pState buffer and input sample n is stored at
       * state[numTaps - 1 + n]. pCoeffs[numTaps - 1] therefore multiplies the
       * newest sample of the window and pCoeffs[0] the oldest one.
       *
       * The function writes blockSize samples starting at
       * S->pState[numTaps - 1], so the state buffer must hold at least
       * numTaps + blockSize - 1 values. Before returning, the last
       * numTaps - 1 samples are moved to the start of the buffer so that the
       * next call sees them as history.
       *
       * NOTE(review): numTaps is presumably >= 1; for numTaps == 0 the unsigned
       * expression (numTaps - 1U) wraps around - confirm the init function or
       * the callers guarantee this.
       */
 915          float32_t *pState = S->pState;                 /* Start of the state window for the next output */
 916    const float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
 917          float32_t *pStateCurnt;                        /* Write position for new input samples in the state buffer */
 918          float32_t *px;                                 /* Temporary read pointer into the state buffer */
 919    const float32_t *pb;                                 /* Temporary read pointer into the coefficient buffer */
 920          float32_t acc0;                                /* Accumulator */
 921          uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
 922          uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
 923  
 924  #if defined (ARM_MATH_LOOPUNROLL)
 925          float32_t acc1, acc2, acc3, acc4, acc5, acc6, acc7;     /* Accumulators for outputs 1..7 of a group of 8 */
 926          float32_t x0, x1, x2, x3, x4, x5, x6, x7;               /* Sliding window of 8 state values */
 927          float32_t c0;                                           /* Coefficient of the tap being processed */
 928  #endif
 929  
 930    /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
 931    /* pStateCurnt points to the location where the new input data should be written */
 932    pStateCurnt = &(S->pState[(numTaps - 1U)]);
 933  
 934  #if defined (ARM_MATH_LOOPUNROLL)
 935  
 936    /* Loop unrolling: Compute 8 output values simultaneously.
 937     * The variables acc0 ... acc7 hold output values that are being computed:
 938     *
 939     *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
 940     *    acc1 =  b[numTaps-1] * x[n-numTaps]   + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
 941     *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps]   + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
 942     *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
        *
        *    acc4 ... acc7 follow the same pattern, each shifted by one further sample.
 943     */
 944  
      /* Number of complete groups of 8 output samples */
 945    blkCnt = blockSize >> 3U;
 946  
 947    while (blkCnt > 0U)
 948    {
 949      /* Copy the first 4 of the 8 new input samples into the state buffer. */
 950      *pStateCurnt++ = *pSrc++;
 951      *pStateCurnt++ = *pSrc++;
 952      *pStateCurnt++ = *pSrc++;
 953      *pStateCurnt++ = *pSrc++;
 954  
 955      /* Set all accumulators to zero */
 956      acc0 = 0.0f;
 957      acc1 = 0.0f;
 958      acc2 = 0.0f;
 959      acc3 = 0.0f;
 960      acc4 = 0.0f;
 961      acc5 = 0.0f;
 962      acc6 = 0.0f;
 963      acc7 = 0.0f;
 964  
 965      /* Initialize state pointer */
 966      px = pState;
 967  
 968      /* Initialize coefficient pointer */
 969      pb = pCoeffs;
 970  
 971      /* Copy the remaining 4 new input samples. This is separated from the others to avoid
 972       * a call to __aeabi_memmove which would be slower
 973       */
 974      *pStateCurnt++ = *pSrc++;
 975      *pStateCurnt++ = *pSrc++;
 976      *pStateCurnt++ = *pSrc++;
 977      *pStateCurnt++ = *pSrc++;
 978  
 979      /* Preload the first 7 samples of the state window into x0..x6; x7 is read inside the tap loops */
 980      x0 = *px++;
 981      x1 = *px++;
 982      x2 = *px++;
 983      x3 = *px++;
 984      x4 = *px++;
 985      x5 = *px++;
 986      x6 = *px++;
 987  
 988      /* Loop unrolling: process 8 taps at a time. */
 989      tapCnt = numTaps >> 3U;
 990  
 991      while (tapCnt > 0U)
 992      {
 993        /* Tap 1 of 8: read the next coefficient */
 994        c0 = *(pb++);
 995  
 996        /* Read the next state sample into x7, completing the 8-sample window x0..x7 */
 997        x7 = *(px++);
 998  
 999        /* acc0 += c0 * x0 (oldest sample of this window) */
1000        acc0 += x0 * c0;
1001  
1002        /* acc1 += c0 * x1 */
1003        acc1 += x1 * c0;
1004  
1005        /* acc2 += c0 * x2 */
1006        acc2 += x2 * c0;
1007  
1008        /* acc3 += c0 * x3 */
1009        acc3 += x3 * c0;
1010  
1011        /* acc4 += c0 * x4 */
1012        acc4 += x4 * c0;
1013  
1014        /* acc5 += c0 * x5 */
1015        acc5 += x5 * c0;
1016  
1017        /* acc6 += c0 * x6 */
1018        acc6 += x6 * c0;
1019  
1020        /* acc7 += c0 * x7 (sample just read) */
1021        acc7 += x7 * c0;
1022  
1023        /* Tap 2 of 8: read the next coefficient */
1024        c0 = *(pb++);
1025  
1026        /* Read the next state sample; x0 is no longer needed and now holds the newest sample */
1027        x0 = *(px++);
1028  
1029        /* Perform the multiply-accumulates on the window x1..x7, x0 */
1030        acc0 += x1 * c0;
1031        acc1 += x2 * c0;
1032        acc2 += x3 * c0;
1033        acc3 += x4 * c0;
1034        acc4 += x5 * c0;
1035        acc5 += x6 * c0;
1036        acc6 += x7 * c0;
1037        acc7 += x0 * c0;
1038  
1039        /* Tap 3 of 8: read the next coefficient */
1040        c0 = *(pb++);
1041  
1042        /* Read the next state sample into x1 */
1043        x1 = *(px++);
1044  
1045        /* Perform the multiply-accumulates on the window x2..x7, x0, x1 */
1046        acc0 += x2 * c0;
1047        acc1 += x3 * c0;
1048        acc2 += x4 * c0;
1049        acc3 += x5 * c0;
1050        acc4 += x6 * c0;
1051        acc5 += x7 * c0;
1052        acc6 += x0 * c0;
1053        acc7 += x1 * c0;
1054  
1055        /* Tap 4 of 8: read the next coefficient */
1056        c0 = *(pb++);
1057  
1058        /* Read the next state sample into x2 */
1059        x2 = *(px++);
1060  
1061        /* Perform the multiply-accumulates on the window x3..x7, x0..x2 */
1062        acc0 += x3 * c0;
1063        acc1 += x4 * c0;
1064        acc2 += x5 * c0;
1065        acc3 += x6 * c0;
1066        acc4 += x7 * c0;
1067        acc5 += x0 * c0;
1068        acc6 += x1 * c0;
1069        acc7 += x2 * c0;
1070  
1071        /* Tap 5 of 8: read the next coefficient */
1072        c0 = *(pb++);
1073  
1074        /* Read the next state sample into x3 */
1075        x3 = *(px++);
1076        /* Perform the multiply-accumulates on the window x4..x7, x0..x3 */
1077        acc0 += x4 * c0;
1078        acc1 += x5 * c0;
1079        acc2 += x6 * c0;
1080        acc3 += x7 * c0;
1081        acc4 += x0 * c0;
1082        acc5 += x1 * c0;
1083        acc6 += x2 * c0;
1084        acc7 += x3 * c0;
1085  
1086        /* Tap 6 of 8: read the next coefficient */
1087        c0 = *(pb++);
1088  
1089        /* Read the next state sample into x4 */
1090        x4 = *(px++);
1091  
1092        /* Perform the multiply-accumulates on the window x5..x7, x0..x4 */
1093        acc0 += x5 * c0;
1094        acc1 += x6 * c0;
1095        acc2 += x7 * c0;
1096        acc3 += x0 * c0;
1097        acc4 += x1 * c0;
1098        acc5 += x2 * c0;
1099        acc6 += x3 * c0;
1100        acc7 += x4 * c0;
1101  
1102        /* Tap 7 of 8: read the next coefficient */
1103        c0 = *(pb++);
1104  
1105        /* Read the next state sample into x5 */
1106        x5 = *(px++);
1107  
1108        /* Perform the multiply-accumulates on the window x6, x7, x0..x5 */
1109        acc0 += x6 * c0;
1110        acc1 += x7 * c0;
1111        acc2 += x0 * c0;
1112        acc3 += x1 * c0;
1113        acc4 += x2 * c0;
1114        acc5 += x3 * c0;
1115        acc6 += x4 * c0;
1116        acc7 += x5 * c0;
1117  
1118        /* Tap 8 of 8: read the next coefficient */
1119        c0 = *(pb++);
1120  
1121        /* Read the next state sample into x6 */
1122        x6 = *(px++);
1123  
1124        /* Perform the multiply-accumulates on the window x7, x0..x6 */
1125        acc0 += x7 * c0;
1126        acc1 += x0 * c0;
1127        acc2 += x1 * c0;
1128        acc3 += x2 * c0;
1129        acc4 += x3 * c0;
1130        acc5 += x4 * c0;
1131        acc6 += x5 * c0;
1132        acc7 += x6 * c0;
1133  
1134        /* x0..x6 again hold 7 consecutive samples in order for the next group; decrement loop counter */
1135        tapCnt--;
1136      }
1137  
1138      /* Loop unrolling: process the remaining (numTaps % 8) taps one at a time */
1139      tapCnt = numTaps % 0x8U;
1140  
1141      while (tapCnt > 0U)
1142      {
1143        /* Read the next coefficient */
1144        c0 = *(pb++);
1145  
1146        /* Fetch 1 state variable */
1147        x7 = *(px++);
1148  
1149        /* Perform the multiply-accumulates */
1150        acc0 += x0 * c0;
1151        acc1 += x1 * c0;
1152        acc2 += x2 * c0;
1153        acc3 += x3 * c0;
1154        acc4 += x4 * c0;
1155        acc5 += x5 * c0;
1156        acc6 += x6 * c0;
1157        acc7 += x7 * c0;
1158  
1159        /* Slide the sample window by one position for the next tap */
1160        x0 = x1;
1161        x1 = x2;
1162        x2 = x3;
1163        x3 = x4;
1164        x4 = x5;
1165        x5 = x6;
1166        x6 = x7;
1167  
1168        /* Decrement loop counter */
1169        tapCnt--;
1170      }
1171  
1172      /* Advance the state pointer by 8 to process the next group of 8 samples */
1173      pState = pState + 8;
1174  
1175      /* Store the results held in the 8 accumulators in the destination buffer. */
1176      *pDst++ = acc0;
1177      *pDst++ = acc1;
1178      *pDst++ = acc2;
1179      *pDst++ = acc3;
1180      *pDst++ = acc4;
1181      *pDst++ = acc5;
1182      *pDst++ = acc6;
1183      *pDst++ = acc7;
1184  
1185  
1186      /* Decrement loop counter */
1187      blkCnt--;
1188    }
1189  
1190    /* Loop unrolling: Compute remaining output samples */
1191    blkCnt = blockSize % 0x8U;
1192  
1193  #else
1194  
1195    /* Initialize blkCnt with number of samples */
1196    blkCnt = blockSize;
1197  
1198  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1199  
      /* One output sample per iteration */
1200    while (blkCnt > 0U)
1201    {
1202      /* Copy one sample at a time into state buffer */
1203      *pStateCurnt++ = *pSrc++;
1204  
1205      /* Set the accumulator to zero */
1206      acc0 = 0.0f;
1207  
1208      /* Initialize state pointer */
1209      px = pState;
1210  
1211      /* Initialize Coefficient pointer */
1212      pb = pCoeffs;
1213  
1214      i = numTaps;
1215  
1216      /* Perform the multiply-accumulates */
1217      while (i > 0U)
1218      {
1219        /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
1220        acc0 += *px++ * *pb++;
1221  
1222        i--;
1223      }
1224  
1225      /* Store result in destination buffer. */
1226      *pDst++ = acc0;
1227  
1228      /* Advance state pointer by 1 for the next sample */
1229      pState = pState + 1U;
1230  
1231      /* Decrement loop counter */
1232      blkCnt--;
1233    }
1234  
1235    /* Processing is complete.
1236       Now copy the last numTaps - 1 samples to the start of the state buffer.
1237       This prepares the state buffer for the next function call. */
1238  
1239    /* Points to the start of the state buffer */
1240    pStateCurnt = S->pState;
1241  
1242  #if defined (ARM_MATH_LOOPUNROLL)
1243  
1244    /* Loop unrolling: copy 4 state samples at a time */
1245    tapCnt = (numTaps - 1U) >> 2U;
1246  
1247    /* Copy data */
1248    while (tapCnt > 0U)
1249    {
1250      *pStateCurnt++ = *pState++;
1251      *pStateCurnt++ = *pState++;
1252      *pStateCurnt++ = *pState++;
1253      *pStateCurnt++ = *pState++;
1254  
1255      /* Decrement loop counter */
1256      tapCnt--;
1257    }
1258  
1259    /* Calculate remaining number of copies */
1260    tapCnt = (numTaps - 1U) % 0x4U;
1261  
1262  #else
1263  
1264    /* Initialize tapCnt with the number of samples to copy (numTaps - 1) */
1265    tapCnt = (numTaps - 1U);
1266  
1267  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1268  
1269    /* Copy remaining data */
1270    while (tapCnt > 0U)
1271    {
1272      *pStateCurnt++ = *pState++;
1273  
1274      /* Decrement loop counter */
1275      tapCnt--;
1276    }
1277  
1278  }
1279  
1280  #endif /* #if defined(ARM_MATH_NEON) */
1281  #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
1282  
1283  /**
1284  * @} end of FIR group
1285  */