/ Drivers / CMSIS / DSP / Source / FilteringFunctions / arm_fir_q31.c
arm_fir_q31.c
   1  /* ----------------------------------------------------------------------
   2   * Project:      CMSIS DSP Library
   3   * Title:        arm_fir_q31.c
   4   * Description:  Q31 FIR filter processing function
   5   *
   6   * $Date:        23 April 2021
   7   * $Revision:    V1.9.0
   8   *
   9   * Target Processor: Cortex-M and Cortex-A cores
  10   * -------------------------------------------------------------------- */
  11  /*
  12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  13   *
  14   * SPDX-License-Identifier: Apache-2.0
  15   *
  16   * Licensed under the Apache License, Version 2.0 (the License); you may
  17   * not use this file except in compliance with the License.
  18   * You may obtain a copy of the License at
  19   *
  20   * www.apache.org/licenses/LICENSE-2.0
  21   *
  22   * Unless required by applicable law or agreed to in writing, software
  23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25   * See the License for the specific language governing permissions and
  26   * limitations under the License.
  27   */
  28  
  29  #include "dsp/filtering_functions.h"
  30  
  31  
  32  /**
  33    @ingroup groupFilters
  34   */
  35  
  36  /**
  37    @addtogroup FIR
  38    @{
  39   */
  40  
  41  /**
  42    @brief         Processing function for Q31 FIR filter.
  43    @param[in]     S          points to an instance of the Q31 FIR filter structure
  44    @param[in]     pSrc       points to the block of input data
  45    @param[out]    pDst       points to the block of output data
  46    @param[in]     blockSize  number of samples to process
  47    @return        none
  48  
  49    @par           Scaling and Overflow Behavior
  50                     The function is implemented using an internal 64-bit accumulator.
  51                     The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
  52                     Thus, if the accumulator result overflows, it wraps around rather than clipping.
  53                     In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
  54                     After all multiply-accumulates are performed, the 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
  55  
  56   @remark
  57                     Refer to \ref arm_fir_fast_q31() for a faster but less precise implementation of this filter.
  58   */
  59  #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
  60  
  61  #include "arm_helium_utils.h"
  62  
  63  
  64  #define FIR_Q31_CORE(nbAcc, nbVecTaps, pSample, vecCoeffs)                 \
  65          for (int j = 0; j < nbAcc; j++) {                                  \
  66              const q31_t    *pSmp = &pSamples[j];                           \
  67              q31x4_t         vecIn0;                                        \
  68              q63_t           acc[4];                                        \
  69                                                                             \
  70              acc[j] = 0;                                                    \
  71              for (int i = 0; i < nbVecTaps; i++) {                          \
  72                  vecIn0 = vld1q(pSmp + 4 * i);                  \
  73                  acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]);       \
  74              }                                                              \
  75              *pOutput++ = (q31_t)asrl(acc[j], 23);                          \
  76          }
  77  
  78  
  79  #define FIR_Q31_CORE_STR_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs)     \
  80          for (int j = 0; j < nbAcc; j++) {                                  \
  81              const q31_t    *pSmp = &pSamples[j];                           \
  82              q31x4_t         vecIn0;                                        \
  83                                                                             \
  84              acc[j] = 0;                                                    \
  85              for (int i = 0; i < nbVecTaps; i++) {                          \
  86                  vecIn0 = vld1q(pSmp + 4 * i);                  \
  87                  acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]);       \
  88              }                                                              \
  89              *arm_fir_partial_accu_ptr++ = acc[j];                          \
  90          }
  91  
  92  
  93  #define FIR_Q31_CORE_LD_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs)      \
  94          for (int j = 0; j < nbAcc; j++) {                                  \
  95              const q31_t    *pSmp = &pSamples[j];                           \
  96              q31x4_t         vecIn0;                                        \
  97                                                                             \
  98              acc[j] = *arm_fir_partial_accu_ptr++;                          \
  99                                                                             \
 100              for (int i = 0; i < nbVecTaps; i++) {                          \
 101                  vecIn0 = vld1q(pSmp + 4 * i);                  \
 102                  acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]);       \
 103              }                                                              \
 104              *pOutput++ = (q31_t)asrl(acc[j], 23);                          \
 105          }
 106  
 107                        
 108  #define FIR_Q31_MAIN_CORE()                                                              \
 109  {                                                                                        \
 110      q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);                          \
 111      q31_t      *pState = pRefStatePtr; /* State pointer */                               \
 112      const q31_t *pCoeffs = S->pCoeffs;  /* Coefficient pointer */                        \
 113      q31_t       *pStateCur;             /* Points to the current sample of the state */  \
 114      const q31_t *pSamples;              /* Temporary pointer to the sample buffer */     \
 115      q31_t       *pOutput;               /* Temporary pointer to the output buffer */     \
 116      const q31_t *pTempSrc;              /* Temporary pointer to the source data */       \
 117      q31_t       *pTempDest;             /* Temporary pointer to the destination buffer */\
 118      uint32_t     numTaps = S->numTaps;  /* Number of filter coefficients in the filter */\
 119      int32_t      blkCnt;                                                                 \
 120                                                                                           \
 121      /*                                                                                   \
 122       * load coefs                                                                        \
 123       */                                                                                  \
 124      q31x4_t         vecCoeffs[NBVECTAPS];                                                \
 125                                                                                           \
 126      for (int i = 0; i < NBVECTAPS; i++)                                                  \
 127          vecCoeffs[i] = vld1q(pCoeffs + 4 * i);                                           \
 128                                                                                           \
 129      /*                                                                                   \
 130       * pState points to state array which contains previous frame (numTaps - 1) samples  \
 131       * pStateCur points to the location where the new input data should be written       \
 132       */                                                                                  \
 133      pStateCur = &(pState[(numTaps - 1u)]);                                               \
 134      pTempSrc = pSrc;                                                                     \
 135      pSamples = pState;                                                                   \
 136      pOutput = pDst;                                                                      \
 137                                                                                           \
 138      blkCnt = blockSize >> 2;                                                             \
 139      while (blkCnt > 0) {                                                                 \
 140          /*                                                                               \
 141           * Save 4 input samples in the history buffer                                    \
 142           */                                                                              \
 143          vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc));                                     \
 144          pStateCur += 4;                                                                  \
 145          pTempSrc += 4;                                                                   \
 146                                                                                           \
 147          FIR_Q31_CORE(4, NBVECTAPS, pSamples, vecCoeffs);                                 \
 148                                                                                           \
 149          pSamples += 4;                                                                   \
 150          /*                                                                               \
 151           * Decrement the sample block loop counter                                       \
 152           */                                                                              \
 153          blkCnt--;                                                                        \
 154      }                                                                                    \
 155                                                                                           \
 156      /* tail */                                                                           \
 157      int32_t        residual = blockSize & 3;                                             \
 158      switch (residual) {                                                                  \
 159        case 3:                                                                            \
 160            {                                                                              \
 161                for (int i = 0; i < residual; i++)                                         \
 162                    *pStateCur++ = *pTempSrc++;                                            \
 163                                                                                           \
 164                FIR_Q31_CORE(3, NBVECTAPS, pSamples, vecCoeffs);                           \
 165            }                                                                              \
 166            break;                                                                         \
 167                                                                                           \
 168        case 2:                                                                            \
 169            {                                                                              \
 170                for (int i = 0; i < residual; i++)                                         \
 171                    *pStateCur++ = *pTempSrc++;                                            \
 172                                                                                           \
 173                 FIR_Q31_CORE(2, NBVECTAPS, pSamples, vecCoeffs);                          \
 174            }                                                                              \
 175            break;                                                                         \
 176                                                                                           \
 177        case 1:                                                                            \
 178            {                                                                              \
 179                for (int i = 0; i < residual; i++)                                         \
 180                    *pStateCur++ = *pTempSrc++;                                            \
 181                                                                                           \
 182                FIR_Q31_CORE(1, NBVECTAPS, pSamples, vecCoeffs);                           \
 183            }                                                                              \
 184            break;                                                                         \
 185      }                                                                                    \
 186                                                                                           \
 187      /*                                                                                   \
 188       * Copy the samples back into the history buffer start                               \
 189       */                                                                                  \
 190      pTempSrc = &pState[blockSize];                                                       \
 191      pTempDest = pState;                                                                  \
 192                                                                                           \
 193      blkCnt =(numTaps - 1) >> 2;                                                          \
 194      while (blkCnt > 0)                                                                   \
 195      {                                                                                    \
 196          vstrwq_s32(pTempDest, vldrwq_s32(pTempSrc));                                     \
 197          pTempSrc += 4;                                                                   \
 198          pTempDest += 4;                                                                  \
 199          blkCnt--;                                                                        \
 200      }                                                                                    \
 201      blkCnt = (numTaps - 1) & 3;                                                          \
 202      if (blkCnt > 0)                                                                      \
 203      {                                                                                    \
 204          mve_pred16_t p0 = vctp32q(blkCnt);                                               \
 205          vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p0), p0);                         \
 206      }                                                                                    \
 207  }
 208  
 209  static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, 
 210      const q31_t * __restrict pSrc, 
 211      q31_t * __restrict pDst, uint32_t blockSize)
 212  {
 213      q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
 214      q31_t      *pState = pRefStatePtr; /* State pointer */
 215      const q31_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
 216      q31_t    *pStateCur;        /* Points to the current sample of the state */
 217      const q31_t    *pSamples;         /* Temporary pointer to the sample buffer */
 218      q31_t    *pOutput;          /* Temporary pointer to the output buffer */
 219      const q31_t    *pTempSrc;         /* Temporary pointer to the source data */
 220      q31_t    *pTempDest;        /* Temporary pointer to the destination buffer */
 221      uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
 222      uint32_t  blkCnt;
 223      q31x4_t vecIn0;
 224  
 225  
 226      /*
 227       * pState points to state array which contains previous frame (numTaps - 1) samples
 228       * pStateCur points to the location where the new input data should be written
 229       */
 230      pStateCur = &(pState[(numTaps - 1u)]);
 231      pTempSrc = pSrc;
 232      pSamples = pState;
 233      pOutput = pDst;
 234  
 235      q63_t     acc0=0, acc1=0, acc2=0, acc3=0;
 236      /*
 237       * load 4 coefs
 238       */
 239      q31x4_t vecCoeffs = *(q31x4_t *) pCoeffs;
 240  
 241      blkCnt = blockSize >> 2;
 242      while (blkCnt > 0U)
 243      {
 244          const q31_t    *pSamplesTmp = pSamples;
 245  
 246          /*
 247           * Save 4 input samples in the history buffer
 248           */
 249          vst1q(pStateCur, vld1q(pTempSrc));
 250          pStateCur += 4;
 251          pTempSrc += 4;
 252  
 253          vecIn0 = vld1q(pSamplesTmp);
 254          acc0 = vrmlaldavhq(vecIn0, vecCoeffs);
 255  
 256          vecIn0 = vld1q(&pSamplesTmp[1]);
 257          acc1 = vrmlaldavhq(vecIn0, vecCoeffs);
 258  
 259          vecIn0 = vld1q(&pSamplesTmp[2]);
 260          acc2 = vrmlaldavhq(vecIn0, vecCoeffs);
 261  
 262          vecIn0 = vld1q(&pSamplesTmp[3]);
 263          acc3 = vrmlaldavhq(vecIn0, vecCoeffs);
 264  
 265          acc0 = asrl(acc0, 23);
 266          acc1 = asrl(acc1, 23);
 267          acc2 = asrl(acc2, 23);
 268          acc3 = asrl(acc3, 23);
 269  
 270          *pOutput++ = (q31_t) acc0;
 271          *pOutput++ = (q31_t) acc1;
 272          *pOutput++ = (q31_t) acc2;
 273          *pOutput++ = (q31_t) acc3;
 274  
 275          pSamples += 4;
 276          /*
 277           * Decrement the sample block loop counter
 278           */
 279          blkCnt--;
 280      }
 281  
 282      uint32_t  residual = blockSize & 3;
 283      switch (residual)
 284      {
 285      case 3:
 286          {
 287              /*
 288               * Save 4 input samples in the history buffer
 289               */
 290              *(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
 291              pStateCur += 4;
 292              pTempSrc += 4;
 293  
 294              vecIn0 = vld1q(pSamples);
 295              acc0 = vrmlaldavhq(vecIn0, vecCoeffs);
 296  
 297              vecIn0 = vld1q(&pSamples[1]);
 298              acc1 = vrmlaldavhq(vecIn0, vecCoeffs);
 299  
 300              vecIn0 = vld1q(&pSamples[2]);
 301              acc2 = vrmlaldavhq(vecIn0, vecCoeffs);
 302  
 303              acc0 = asrl(acc0, 23);
 304              acc1 = asrl(acc1, 23);
 305              acc2 = asrl(acc2, 23);
 306  
 307              *pOutput++ = (q31_t) acc0;
 308              *pOutput++ = (q31_t) acc1;
 309              *pOutput++ = (q31_t) acc2;
 310          }
 311          break;
 312  
 313      case 2:
 314          {
 315              /*
 316               * Save 4 input samples in the history buffer
 317               */
 318              vst1q(pStateCur, vld1q(pTempSrc));
 319              pStateCur += 4;
 320              pTempSrc += 4;
 321  
 322              vecIn0 = vld1q(pSamples);
 323              acc0 = vrmlaldavhq(vecIn0, vecCoeffs);
 324  
 325              vecIn0 = vld1q(&pSamples[1]);
 326              acc1 = vrmlaldavhq(vecIn0, vecCoeffs);
 327  
 328              acc0 = asrl(acc0, 23);
 329              acc1 = asrl(acc1, 23);
 330  
 331              *pOutput++ = (q31_t) acc0;
 332              *pOutput++ = (q31_t) acc1;
 333          }
 334          break;
 335  
 336      case 1:
 337          {
 338              /*
 339               * Save 4 input samples in the history buffer
 340               */
 341              vst1q(pStateCur, vld1q(pTempSrc));
 342              pStateCur += 4;
 343              pTempSrc += 4;
 344  
 345              vecIn0 = vld1q(pSamples);
 346              acc0 = vrmlaldavhq(vecIn0, vecCoeffs);
 347  
 348              acc0 = asrl(acc0, 23);
 349  
 350              *pOutput++ = (q31_t) acc0;
 351          }
 352          break;
 353      }
 354  
 355      /*
 356       * Copy the samples back into the history buffer start
 357       */
 358      pTempSrc = &pState[blockSize];
 359      pTempDest = pState;
 360  
 361      blkCnt = (numTaps-1) >> 2;
 362      while (blkCnt > 0U)
 363      {
 364          vst1q(pTempDest, vld1q(pTempSrc));
 365          pTempSrc += 4;
 366          pTempDest += 4;
 367          blkCnt--;
 368      }
 369      blkCnt = (numTaps-1) & 3;
 370      if (blkCnt > 0U)
 371      {
 372          mve_pred16_t p0 = vctp32q(blkCnt);
 373          vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0);
 374      }
 375  }
 376  
 377  
 378  
 379  static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, 
 380      const q31_t * __restrict pSrc, 
 381      q31_t * __restrict pDst, uint32_t blockSize)
 382  {
 383      #define NBTAPS 8
 384      #define NBVECTAPS (NBTAPS / 4)
 385      FIR_Q31_MAIN_CORE();
 386      #undef NBVECTAPS
 387      #undef NBTAPS
 388  }
 389  
 390  
 391  static void arm_fir_q31_9_12_mve(const arm_fir_instance_q31 * S, 
 392      const q31_t * __restrict pSrc, 
 393      q31_t * __restrict pDst, uint32_t blockSize)
 394  {
 395      #define NBTAPS 12
 396      #define NBVECTAPS (NBTAPS / 4)
 397      FIR_Q31_MAIN_CORE();
 398      #undef NBVECTAPS
 399      #undef NBTAPS
 400  }
 401  
 402  
 403  static void arm_fir_q31_13_16_mve(const arm_fir_instance_q31 * S, 
 404      const q31_t * __restrict pSrc, 
 405      q31_t * __restrict pDst, uint32_t blockSize)
 406  {
 407      #define NBTAPS 16
 408      #define NBVECTAPS (NBTAPS / 4)
 409      FIR_Q31_MAIN_CORE();
 410      #undef NBVECTAPS
 411      #undef NBTAPS
 412  }
 413  
 414  
 415  static void arm_fir_q31_17_20_mve(const arm_fir_instance_q31 * S, 
 416      const q31_t * __restrict pSrc, 
 417      q31_t * __restrict pDst, uint32_t blockSize)
 418  {
 419      #define NBTAPS 20
 420      #define NBVECTAPS (NBTAPS / 4)
 421      FIR_Q31_MAIN_CORE();
 422      #undef NBVECTAPS
 423      #undef NBTAPS
 424  }
 425  
 426  
 427  static void arm_fir_q31_21_24_mve(const arm_fir_instance_q31 * S, 
 428      const q31_t * __restrict pSrc, 
 429      q31_t * __restrict pDst, uint32_t blockSize)
 430  {
 431      #define NBTAPS 24
 432      #define NBVECTAPS (NBTAPS / 4)
 433      FIR_Q31_MAIN_CORE();
 434      #undef NBVECTAPS
 435      #undef NBTAPS
 436  }
 437  
 438  
 439  static void arm_fir_q31_25_28_mve(const arm_fir_instance_q31 * S, 
 440      const q31_t * __restrict pSrc, 
 441      q31_t * __restrict pDst, uint32_t blockSize)
 442  {
 443      #define NBTAPS 28
 444      #define NBVECTAPS (NBTAPS / 4)
 445      FIR_Q31_MAIN_CORE();
 446      #undef NBVECTAPS
 447      #undef NBTAPS
 448  }
 449  
 450  static void arm_fir_q31_29_32_mve(const arm_fir_instance_q31 * S, 
 451      const q31_t * __restrict pSrc, 
 452      q31_t * __restrict pDst,
 453                                 uint32_t blockSize)
 454  {
 455      q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
 456      q31_t      *pState = pRefStatePtr; /* State pointer */
 457      const q31_t    *pCoeffs = S->pCoeffs;       /* Coefficient pointer */
 458      q31_t          *pStateCur;  /* Points to the current sample of the state */
 459      const q31_t    *pSamples;   /* Temporary pointer to the sample buffer */
 460      q31_t          *pOutput;    /* Temporary pointer to the output buffer */
 461      const q31_t    *pTempSrc;   /* Temporary pointer to the source data */
 462      q31_t          *pTempDest;  /* Temporary pointer to the destination buffer */
 463      uint32_t        numTaps = S->numTaps;       /* Number of filter coefficients in the filter */
 464      int32_t         blkCnt;
 465      q63_t           acc0, acc1, acc2, acc3;
 466  
 467  #define MAX_VECT_BATCH 7
 468  
 469      /*
 470       * pre-load 28 1st coefs
 471       */
 472      q31x4_t         vecCoeffs0 = vld1q(pCoeffs + 4 * 0);
 473      q31x4_t         vecCoeffs1 = vld1q(pCoeffs + 4 * 1);
 474      q31x4_t         vecCoeffs2 = vld1q(pCoeffs + 4 * 2);
 475      q31x4_t         vecCoeffs3 = vld1q(pCoeffs + 4 * 3);
 476      q31x4_t         vecCoeffs4 = vld1q(pCoeffs + 4 * 4);
 477      q31x4_t         vecCoeffs5 = vld1q(pCoeffs + 4 * 5);
 478      q31x4_t         vecCoeffs6 = vld1q(pCoeffs + 4 * 6);
 479  
 480      /*
 481       * pState points to state array which contains previous frame (numTaps - 1) samples
 482       * pStateCur points to the location where the new input data should be written
 483       */
 484      pStateCur = &(pState[(numTaps - 1u)]);
 485      pTempSrc = pSrc;
 486      pSamples = pState;
 487  
 488      q63_t          *arm_fir_partial_accu_ptr = (q63_t*)S->pState;
 489  
 490      blkCnt = blockSize >> 2;
 491      while (blkCnt > 0) {
 492          /*
 493           * Save 4 input samples in the history buffer
 494           */
 495          vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc));
 496          pStateCur += 4;
 497          pTempSrc += 4;
 498  
 499          const q31_t    *pSmp;
 500          q31x4_t         vecIn0;
 501  
 502          pSmp = &pSamples[0];
 503  
 504          vecIn0 = vld1q(pSmp);
 505          acc0 = vrmlaldavhq(vecIn0, vecCoeffs0);
 506          vecIn0 = vld1q(pSmp + 4 * 1);
 507          acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs1);
 508          vecIn0 = vld1q(pSmp + 4 * 2);
 509          acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs2);
 510          vecIn0 = vld1q(pSmp + 4 * 3);
 511          acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs3);
 512          vecIn0 = vld1q(pSmp + 4 * 4);
 513          acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs4);
 514          vecIn0 = vld1q(pSmp + 4 * 5);
 515          acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5);
 516          vecIn0 = vld1q(pSmp + 4 * 6);
 517          acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs6);
 518  
 519          *arm_fir_partial_accu_ptr++ = acc0;
 520  
 521          pSmp = &pSamples[1];
 522  
 523          vecIn0 = vld1q(pSmp);
 524          acc1 = vrmlaldavhq(vecIn0, vecCoeffs0);
 525          vecIn0 = vld1q(pSmp + 4 * 1);
 526          acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs1);
 527          vecIn0 = vld1q(pSmp + 4 * 2);
 528          acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs2);
 529          vecIn0 = vld1q(pSmp + 4 * 3);
 530          acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs3);
 531          vecIn0 = vld1q(pSmp + 4 * 4);
 532          acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs4);
 533          vecIn0 = vld1q(pSmp + 4 * 5);
 534          acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5);
 535          vecIn0 = vld1q(pSmp + 4 * 6);
 536          acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs6);
 537  
 538          *arm_fir_partial_accu_ptr++ = acc1;
 539  
 540          pSmp = &pSamples[2];
 541  
 542          vecIn0 = vld1q(pSmp);
 543          acc2 = vrmlaldavhq(vecIn0, vecCoeffs0);
 544          vecIn0 = vld1q(pSmp + 4 * 1);
 545          acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs1);
 546          vecIn0 = vld1q(pSmp + 4 * 2);
 547          acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs2);
 548          vecIn0 = vld1q(pSmp + 4 * 3);
 549          acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs3);
 550          vecIn0 = vld1q(pSmp + 4 * 4);
 551          acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs4);
 552          vecIn0 = vld1q(pSmp + 4 * 5);
 553          acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5);
 554          vecIn0 = vld1q(pSmp + 4 * 6);
 555          acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs6);
 556          *arm_fir_partial_accu_ptr++ = acc2;
 557  
 558          pSmp = &pSamples[3];
 559  
 560          vecIn0 = vld1q(pSmp);
 561          acc3 = vrmlaldavhq(vecIn0, vecCoeffs0);
 562          vecIn0 = vld1q(pSmp + 4 * 1);
 563          acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs1);
 564          vecIn0 = vld1q(pSmp + 4 * 2);
 565          acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs2);
 566          vecIn0 = vld1q(pSmp + 4 * 3);
 567          acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs3);
 568          vecIn0 = vld1q(pSmp + 4 * 4);
 569          acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs4);
 570          vecIn0 = vld1q(pSmp + 4 * 5);
 571          acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs5);
 572          vecIn0 = vld1q(pSmp + 4 * 6);
 573          acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs6);
 574  
 575          *arm_fir_partial_accu_ptr++ = acc3;
 576  
 577          pSamples += 4;
 578          /*
 579           * Decrement the sample block loop counter
 580           */
 581          blkCnt--;
 582      }
 583  
 584  
 585      /* reminder */
 586  
 587      /* load last 4 coef */
 588      vecCoeffs0 = vld1q(pCoeffs + 4 * MAX_VECT_BATCH);
 589      arm_fir_partial_accu_ptr = (q63_t*)S->pState;
 590      pOutput = pDst;
 591      pSamples = pState + (MAX_VECT_BATCH * 4);
 592  
 593  
 594      blkCnt = blockSize >> 2;
 595      while (blkCnt > 0) {
 596          q31x4_t         vecIn0;
 597  
 598          /* reload intermediate MAC */
 599          acc0 = *arm_fir_partial_accu_ptr++;
 600          acc1 = *arm_fir_partial_accu_ptr++;
 601          acc2 = *arm_fir_partial_accu_ptr++;
 602          acc3 = *arm_fir_partial_accu_ptr++;
 603  
 604  
 605          vecIn0 = vld1q(&pSamples[0]);
 606          acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs0);
 607  
 608          vecIn0 = vld1q(&pSamples[1]);
 609          acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs0);
 610  
 611          vecIn0 = vld1q(&pSamples[2]);
 612          acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs0);
 613  
 614          vecIn0 = vld1q(&pSamples[3]);
 615          acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs0);
 616  
 617          *pOutput++ = asrl(acc0, 23);
 618          *pOutput++ = asrl(acc1, 23);
 619          *pOutput++ = asrl(acc2, 23);
 620          *pOutput++ = asrl(acc3, 23);
 621  
 622          pSamples += 4;
 623          /*
 624           * Decrement the sample block loop counter
 625           */
 626          blkCnt--;
 627      }
 628  
 629      /*
 630       * Copy the samples back into the history buffer start
 631       */
 632      pTempSrc = &pState[blockSize];
 633      pTempDest = pState;
 634  
 635      blkCnt = numTaps - 1;
 636      do {
 637          mve_pred16_t    p = vctp32q(blkCnt);
 638  
 639          vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p), p);
 640          pTempSrc += 4;
 641          pTempDest += 4;
 642          blkCnt -= 4;
 643      }
 644      while (blkCnt > 0);
 645  }
 646  
 647  
 648  
 649  void arm_fir_q31(
 650    const arm_fir_instance_q31 * S,
 651    const q31_t * pSrc,
 652          q31_t * pDst,
 653          uint32_t blockSize)
 654  {
 655      q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
 656      q31_t      *pState = pRefStatePtr; /* State pointer */
 657      const q31_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
 658      q31_t    *pStateCur;        /* Points to the current sample of the state */
 659      const q31_t    *pSamples;         /* Temporary pointer to the sample buffer */
 660      q31_t    *pOutput;          /* Temporary pointer to the output buffer */
 661      const q31_t    *pTempSrc;         /* Temporary pointer to the source data */
 662      q31_t    *pTempDest;        /* Temporary pointer to the destination buffer */
 663      uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
 664      uint32_t  blkCnt;
 665      q31x4_t vecIn0;
 666      uint32_t  tapsBlkCnt = (numTaps + 3) / 4;
 667      q63_t     acc0, acc1, acc2, acc3;
 668      q31x4_t vecCoeffs;
 669  
 670  
 671      /*
 672       * [1 to 32 taps] specialized routines
 673       */
 674      if (numTaps <= 4)
 675      {
 676          arm_fir_q31_1_4_mve(S, pSrc, pDst, blockSize);
 677          return;
 678      }
 679      else if (numTaps <= 8)
 680      {
 681          arm_fir_q31_5_8_mve(S, pSrc, pDst, blockSize);
 682          return;
 683      }
 684      else if (numTaps <= 12)
 685      {
 686          arm_fir_q31_9_12_mve(S, pSrc, pDst, blockSize);
 687          return;
 688      }
 689      else if (numTaps <= 16)
 690      {
 691          arm_fir_q31_13_16_mve(S, pSrc, pDst, blockSize);
 692          return;
 693      }
 694      else if (numTaps <= 20)
 695      {
 696          arm_fir_q31_17_20_mve(S, pSrc, pDst, blockSize);
 697          return;
 698      }
 699      else if (numTaps <= 24)
 700      {
 701          arm_fir_q31_21_24_mve(S, pSrc, pDst, blockSize);
 702          return;
 703      }
 704      else if (numTaps <= 28)
 705      {
 706          arm_fir_q31_25_28_mve(S, pSrc, pDst, blockSize);
 707          return;
 708      }
 709      else if ((numTaps <= 32)  && (blockSize >= 32))
 710      {
 711          arm_fir_q31_29_32_mve(S, pSrc, pDst, blockSize);
 712          return;
 713      }
 714  
 715      /*
 716       * pState points to state array which contains previous frame (numTaps - 1) samples
 717       * pStateCur points to the location where the new input data should be written
 718       */
 719      pStateCur   = &(pState[(numTaps - 1u)]);
 720      pSamples    = pState;
 721      pTempSrc    = pSrc;
 722      pOutput     = pDst;
 723      blkCnt      = blockSize >> 2;
 724      while (blkCnt > 0)
 725      {
 726          const q31_t    *pCoeffsTmp = pCoeffs;
 727          const q31_t    *pSamplesTmp = pSamples;
 728  
 729          acc0 = 0LL;
 730          acc1 = 0LL;
 731          acc2 = 0LL;
 732          acc3 = 0LL;
 733  
 734          /*
 735           * Save 4 input samples in the history buffer
 736           */
 737          vst1q(pStateCur, vld1q(pTempSrc));
 738          pStateCur += 4;
 739          pTempSrc += 4;
 740  
 741          int       i = tapsBlkCnt;
 742          while (i > 0)
 743          {
 744              /*
 745               * load 4 coefs
 746               */
 747              vecCoeffs = *(q31x4_t *) pCoeffsTmp;
 748  
 749              vecIn0 = vld1q(pSamplesTmp);
 750              acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
 751  
 752              vecIn0 = vld1q(&pSamplesTmp[1]);
 753              acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
 754  
 755              vecIn0 = vld1q(&pSamplesTmp[2]);
 756              acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
 757  
 758              vecIn0 = vld1q(&pSamplesTmp[3]);
 759              acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs);
 760  
 761              pSamplesTmp += 4;
 762              pCoeffsTmp += 4;
 763              /*
 764               * Decrement the taps block loop counter
 765               */
 766              i--;
 767          }
 768  
 769          /* .54-> .31 conversion and store accumulators */
 770          acc0 = asrl(acc0, 23);
 771          acc1 = asrl(acc1, 23);
 772          acc2 = asrl(acc2, 23);
 773          acc3 = asrl(acc3, 23);
 774  
 775          *pOutput++ = (q31_t) acc0;
 776          *pOutput++ = (q31_t) acc1;
 777          *pOutput++ = (q31_t) acc2;
 778          *pOutput++ = (q31_t) acc3;
 779  
 780          pSamples += 4;
 781  
 782          /*
 783           * Decrement the sample block loop counter
 784           */
 785          blkCnt--;
 786      }
 787  
 788      int32_t  residual = blockSize & 3;
 789      switch (residual)
 790      {
 791      case 3:
 792          {
 793              const q31_t    *pCoeffsTmp = pCoeffs;
 794              const q31_t    *pSamplesTmp = pSamples;
 795  
 796              acc0 = 0LL;
 797              acc1 = 0LL;
 798              acc2 = 0LL;
 799  
 800              /*
 801               * Save 4 input samples in the history buffer
 802               */
 803              *(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
 804              pStateCur += 4;
 805              pTempSrc += 4;
 806  
 807              int       i = tapsBlkCnt;
 808              while (i > 0)
 809              {
 810                  vecCoeffs = *(q31x4_t *) pCoeffsTmp;
 811  
 812                  vecIn0 = vld1q(pSamplesTmp);
 813                  acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
 814  
 815                  vecIn0 = vld1q(&pSamplesTmp[1]);
 816                  acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
 817  
 818                  vecIn0 = vld1q(&pSamplesTmp[2]);
 819                  acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
 820  
 821                  pSamplesTmp += 4;
 822                  pCoeffsTmp += 4;
 823                  i--;
 824              }
 825  
 826              acc0 = asrl(acc0, 23);
 827              acc1 = asrl(acc1, 23);
 828              acc2 = asrl(acc2, 23);
 829  
 830              *pOutput++ = (q31_t) acc0;
 831              *pOutput++ = (q31_t) acc1;
 832              *pOutput++ = (q31_t) acc2;
 833          }
 834          break;
 835  
 836      case 2:
 837          {
 838              const q31_t    *pCoeffsTmp = pCoeffs;
 839              const q31_t    *pSamplesTmp = pSamples;
 840  
 841              acc0 = 0LL;
 842              acc1 = 0LL;
 843  
 844              /*
 845               * Save 4 input samples in the history buffer
 846               */
 847              vst1q(pStateCur, vld1q(pTempSrc));
 848              pStateCur += 4;
 849              pTempSrc += 4;
 850  
 851              int       i = tapsBlkCnt;
 852              while (i > 0)
 853              {
 854                  vecCoeffs = *(q31x4_t *) pCoeffsTmp;
 855  
 856                  vecIn0 = vld1q(pSamplesTmp);
 857                  acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
 858  
 859                  vecIn0 = vld1q(&pSamplesTmp[1]);
 860                  acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
 861  
 862                  pSamplesTmp += 4;
 863                  pCoeffsTmp += 4;
 864                  i--;
 865              }
 866  
 867              acc0 = asrl(acc0, 23);
 868              acc1 = asrl(acc1, 23);
 869  
 870              *pOutput++ = (q31_t) acc0;
 871              *pOutput++ = (q31_t) acc1;
 872          }
 873          break;
 874  
 875      case 1:
 876          {
 877              const q31_t    *pCoeffsTmp = pCoeffs;
 878              const q31_t    *pSamplesTmp = pSamples;
 879  
 880              acc0 = 0LL;
 881  
 882              /*
 883               * Save 4 input samples in the history buffer
 884               */
 885              vst1q(pStateCur, vld1q(pTempSrc));
 886              pStateCur += 4;
 887              pTempSrc += 4;
 888  
 889              int       i = tapsBlkCnt;
 890              while (i > 0)
 891              {
 892                  vecCoeffs = *(q31x4_t *) pCoeffsTmp;
 893  
 894                  vecIn0 = vld1q(pSamplesTmp);
 895                  acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
 896  
 897                  pSamplesTmp += 4;
 898                  pCoeffsTmp += 4;
 899                  i--;
 900              }
 901  
 902              acc0 = asrl(acc0, 23);
 903  
 904              *pOutput++ = (q31_t) acc0;
 905          }
 906          break;
 907      }
 908  
 909      /*
 910       * Copy the samples back into the history buffer start
 911       */
 912      pTempSrc = &pState[blockSize];
 913      pTempDest = pState;
 914  
 915      blkCnt = (numTaps - 1U) >> 2;
 916      while (blkCnt > 0)
 917      {
 918          vst1q(pTempDest, vld1q(pTempSrc));
 919          pTempSrc += 4;
 920          pTempDest += 4;
 921          blkCnt--;
 922      }
 923      blkCnt = (numTaps - 1U) & 3;
 924      if (blkCnt > 0)
 925      {
 926          mve_pred16_t p0 = vctp32q(blkCnt);
 927          vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0);
 928      }
 929  }
 930  
 931  #else
 932  void arm_fir_q31(
 933    const arm_fir_instance_q31 * S,
 934    const q31_t * pSrc,
 935          q31_t * pDst,
 936          uint32_t blockSize)
 937  {
 938          q31_t *pState = S->pState;                     /* State pointer */
 939    const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
 940          q31_t *pStateCurnt;                            /* Points to the current sample of the state */
 941          q31_t *px;                                     /* Temporary pointer for state buffer */
 942    const q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
 943          q63_t acc0;                                    /* Accumulator */
 944          uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
 945          uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
 946  
 947  #if defined (ARM_MATH_LOOPUNROLL)
 948          q63_t acc1, acc2;                              /* Accumulators */
 949          q31_t x0, x1, x2;                              /* Temporary variables to hold state values */
 950          q31_t c0;                                      /* Temporary variable to hold coefficient value */
 951  #endif
 952  
 953    /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
 954    /* pStateCurnt points to the location where the new input data should be written */
 955    pStateCurnt = &(S->pState[(numTaps - 1U)]);
 956  
 957  #if defined (ARM_MATH_LOOPUNROLL)
 958  
 959    /* Loop unrolling: Compute 4 output values simultaneously.
 960     * The variables acc0 ... acc3 hold output values that are being computed:
 961     *
 962     *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
 963     *    acc1 =  b[numTaps-1] * x[n-numTaps]   + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
 964     *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps]   + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
 965     *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
 966     */
 967  
 968    blkCnt = blockSize / 3;
 969  
 970    while (blkCnt > 0U)
 971    {
 972      /* Copy 3 new input samples into the state buffer. */
 973      *pStateCurnt++ = *pSrc++;
 974      *pStateCurnt++ = *pSrc++;
 975      *pStateCurnt++ = *pSrc++;
 976  
 977      /* Set all accumulators to zero */
 978      acc0 = 0;
 979      acc1 = 0;
 980      acc2 = 0;
 981  
 982      /* Initialize state pointer */
 983      px = pState;
 984  
 985      /* Initialize coefficient pointer */
 986      pb = pCoeffs;
 987  
 988      /* Read the first 2 samples from the state buffer: x[n-numTaps], x[n-numTaps-1] */
 989      x0 = *px++;
 990      x1 = *px++;
 991  
 992      /* Loop unrolling: process 3 taps at a time. */
 993      tapCnt = numTaps / 3;
 994  
 995      while (tapCnt > 0U)
 996      {
 997        /* Read the b[numTaps] coefficient */
 998        c0 = *pb;
 999  
1000        /* Read x[n-numTaps-2] sample */
1001        x2 = *(px++);
1002  
1003        /* Perform the multiply-accumulates */
1004        acc0 += ((q63_t) x0 * c0);
1005        acc1 += ((q63_t) x1 * c0);
1006        acc2 += ((q63_t) x2 * c0);
1007  
1008        /* Read the coefficient and state */
1009        c0 = *(pb + 1U);
1010        x0 = *(px++);
1011  
1012        /* Perform the multiply-accumulates */
1013        acc0 += ((q63_t) x1 * c0);
1014        acc1 += ((q63_t) x2 * c0);
1015        acc2 += ((q63_t) x0 * c0);
1016  
1017        /* Read the coefficient and state */
1018        c0 = *(pb + 2U);
1019        x1 = *(px++);
1020  
1021        /* update coefficient pointer */
1022        pb += 3U;
1023  
1024        /* Perform the multiply-accumulates */
1025        acc0 += ((q63_t) x2 * c0);
1026        acc1 += ((q63_t) x0 * c0);
1027        acc2 += ((q63_t) x1 * c0);
1028  
1029        /* Decrement loop counter */
1030        tapCnt--;
1031      }
1032  
1033      /* Loop unrolling: Compute remaining outputs */
1034      tapCnt = numTaps % 0x3U;
1035  
1036      while (tapCnt > 0U)
1037      {
1038        /* Read coefficients */
1039        c0 = *(pb++);
1040  
1041        /* Fetch 1 state variable */
1042        x2 = *(px++);
1043  
1044        /* Perform the multiply-accumulates */
1045        acc0 += ((q63_t) x0 * c0);
1046        acc1 += ((q63_t) x1 * c0);
1047        acc2 += ((q63_t) x2 * c0);
1048  
1049        /* Reuse the present sample states for next sample */
1050        x0 = x1;
1051        x1 = x2;
1052  
1053        /* Decrement loop counter */
1054        tapCnt--;
1055      }
1056  
1057      /* Advance the state pointer by 3 to process the next group of 3 samples */
1058      pState = pState + 3;
1059  
1060      /* The result is in 2.30 format. Convert to 1.31 and store in destination buffer. */
1061      *pDst++ = (q31_t) (acc0 >> 31U);
1062      *pDst++ = (q31_t) (acc1 >> 31U);
1063      *pDst++ = (q31_t) (acc2 >> 31U);
1064  
1065      /* Decrement loop counter */
1066      blkCnt--;
1067    }
1068  
1069    /* Loop unrolling: Compute remaining output samples */
1070    blkCnt = blockSize % 0x3U;
1071  
1072  #else
1073  
1074    /* Initialize blkCnt with number of taps */
1075    blkCnt = blockSize;
1076  
1077  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1078  
1079    while (blkCnt > 0U)
1080    {
1081      /* Copy one sample at a time into state buffer */
1082      *pStateCurnt++ = *pSrc++;
1083  
1084      /* Set the accumulator to zero */
1085      acc0 = 0;
1086  
1087      /* Initialize state pointer */
1088      px = pState;
1089  
1090      /* Initialize Coefficient pointer */
1091      pb = pCoeffs;
1092  
1093      i = numTaps;
1094  
1095      /* Perform the multiply-accumulates */
1096      do
1097      {
1098        /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
1099        acc0 += (q63_t) *px++ * *pb++;
1100  
1101        i--;
1102      } while (i > 0U);
1103  
1104      /* Result is in 2.62 format. Convert to 1.31 and store in destination buffer. */
1105      *pDst++ = (q31_t) (acc0 >> 31U);
1106  
1107      /* Advance state pointer by 1 for the next sample */
1108      pState = pState + 1U;
1109  
1110      /* Decrement loop counter */
1111      blkCnt--;
1112    }
1113  
1114    /* Processing is complete.
1115       Now copy the last numTaps - 1 samples to the start of the state buffer.
1116       This prepares the state buffer for the next function call. */
1117  
1118    /* Points to the start of the state buffer */
1119    pStateCurnt = S->pState;
1120  
1121  #if defined (ARM_MATH_LOOPUNROLL)
1122  
1123    /* Loop unrolling: Compute 4 taps at a time */
1124    tapCnt = (numTaps - 1U) >> 2U;
1125  
1126    /* Copy data */
1127    while (tapCnt > 0U)
1128    {
1129      *pStateCurnt++ = *pState++;
1130      *pStateCurnt++ = *pState++;
1131      *pStateCurnt++ = *pState++;
1132      *pStateCurnt++ = *pState++;
1133  
1134      /* Decrement loop counter */
1135      tapCnt--;
1136    }
1137  
1138    /* Calculate remaining number of copies */
1139    tapCnt = (numTaps - 1U) % 0x4U;
1140  
1141  #else
1142  
1143    /* Initialize tapCnt with number of taps */
1144    tapCnt = (numTaps - 1U);
1145  
1146  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1147  
1148    /* Copy remaining data */
1149    while (tapCnt > 0U)
1150    {
1151      *pStateCurnt++ = *pState++;
1152  
1153      /* Decrement loop counter */
1154      tapCnt--;
1155    }
1156  
1157  }
1158  #endif /* defined(ARM_MATH_MVEI) */
1159  
1160  /**
1161    @} end of FIR group
1162   */