/ Drivers / CMSIS / DSP / Source / FilteringFunctions / arm_fir_q15.c
arm_fir_q15.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_fir_q15.c
  4   * Description:  Q15 FIR filter processing function
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/filtering_functions.h"
 30  
 31  /**
 32    @ingroup groupFilters
 33   */
 34  
 35  /**
 36    @addtogroup FIR
 37    @{
 38   */
 39  
 40  /**
 41    @brief         Processing function for the Q15 FIR filter.
 42    @param[in]     S          points to an instance of the Q15 FIR filter structure
 43    @param[in]     pSrc       points to the block of input data
 44    @param[out]    pDst       points to the block of output data
 45    @param[in]     blockSize  number of samples to process
 46    @return        none
 47  
 48    @par           Scaling and Overflow Behavior
 49                     The function is implemented using a 64-bit internal accumulator.
 50                     Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
 51                     The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
 52                     There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
 53                     After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
 54                     Lastly, the accumulator is saturated to yield a result in 1.15 format.
 55  
 56    @remark
 57                     Refer to \ref arm_fir_fast_q15() for a faster but less precise implementation of this function.
 58   */
 59  #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 60  
 61  #define MVE_ASRL_SAT16(acc, shift)          ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
 62  
 63  
 64  #define FIR_Q15_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs)        \
 65          for (int j = 0; j < nbAcc; j++) {                                  \
 66              const q15_t    *pSmp = &pSample[j];                            \
 67              q63_t           acc[4];                                        \
 68                                                                             \
 69              acc[j] = 0;                                                    \
 70              for (int i = 0; i < nbVecTaps; i++) {                          \
 71                  vecIn0 = vld1q(pSmp + 8 * i);                  \
 72                  acc[j] = vmlaldavaq(acc[j], vecIn0, vecCoeffs[i]);         \
 73              }                                                              \
 74              *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc[j], 15);               \
 75          }
 76  
 77  #define FIR_Q15_MAIN_CORE()                                                                  \
 78  {                                                                                            \
 79      q15_t          *pState = S->pState;     /* State pointer */                              \
 80      const q15_t    *pCoeffs = S->pCoeffs;   /* Coefficient pointer */                        \
 81      q15_t          *pStateCur;              /* Points to the current sample of the state */  \
 82      const q15_t    *pSamples;               /* Temporary pointer to the sample buffer */     \
 83      q15_t          *pOutput;                /* Temporary pointer to the output buffer */     \
 84      const q15_t    *pTempSrc;               /* Temporary pointer to the source data */       \
 85      q15_t          *pTempDest;              /* Temporary pointer to the destination buffer */\
 86      uint32_t        numTaps = S->numTaps;   /* Number of filter coefficients in the filter */\
 87      int32_t         blkCnt;                                                                  \
 88      q15x8_t         vecIn0;                                                                  \
 89                                                                                               \
 90      /*                                                                                       \
 91       * load coefs                                                                            \
 92       */                                                                                      \
 93      q15x8_t         vecCoeffs[NBVECTAPS];                                                    \
 94                                                                                               \
 95      for (int i = 0; i < NBVECTAPS; i++)                                                      \
 96          vecCoeffs[i] = vldrhq_s16(pCoeffs + 8 * i);                                          \
 97                                                                                               \
 98      /*                                                                                       \
 99       * pState points to state array which contains previous frame (numTaps - 1) samples      \
100       * pStateCur points to the location where the new input data should be written           \
101       */                                                                                      \
102      pStateCur = &(pState[(numTaps - 1u)]);                                                   \
103      pTempSrc = pSrc;                                                                         \
104      pSamples = pState;                                                                       \
105      pOutput = pDst;                                                                          \
106                                                                                               \
107      blkCnt = blockSize >> 2;                                                                 \
108      while (blkCnt > 0) {                                                                     \
109          /*                                                                                   \
110           * Save 4 input samples in the history buffer                                        \
111           */                                                                                  \
112          vstrhq_s32(pStateCur, vldrhq_s32(pTempSrc));                                         \
113          pStateCur += 4;                                                                      \
114          pTempSrc += 4;                                                                       \
115                                                                                               \
116          FIR_Q15_CORE(pOutput, 4, NBVECTAPS, pSamples, vecCoeffs);                            \
117          pSamples += 4;                                                                       \
118                                                                                               \
119          blkCnt--;                                                                            \
120      }                                                                                        \
121                                                                                               \
122      /* tail */                                                                               \
123      int32_t        residual = blockSize & 3;                                                \
124                                                                                               \
125      for (int i = 0; i < residual; i++)                                                       \
126          *pStateCur++ = *pTempSrc++;                                                          \
127                                                                                               \
128      FIR_Q15_CORE(pOutput, residual, NBVECTAPS, pSamples, vecCoeffs);                         \
129                                                                                               \
130      /*                                                                                       \
131       * Copy the samples back into the history buffer start                                   \
132       */                                                                                      \
133      pTempSrc = &pState[blockSize];                                                           \
134      pTempDest = pState;                                                                      \
135                                                                                               \
136      /* current compiler limitation */                                                        \
137      blkCnt = (numTaps - 1) >> 3;                                                             \
138      while (blkCnt > 0)                                                                       \
139      {                                                                                        \
140          vstrhq_s16(pTempDest, vldrhq_s16(pTempSrc));                                         \
141          pTempSrc += 8;                                                                       \
142          pTempDest += 8;                                                                      \
143          blkCnt--;                                                                            \
144      }                                                                                        \
145      blkCnt = (numTaps - 1) & 7;                                                              \
146      if (blkCnt > 0)                                                                          \
147      {                                                                                        \
148          mve_pred16_t p = vctp16q(blkCnt);                                                    \
149          vstrhq_p_s16(pTempDest, vldrhq_z_s16(pTempSrc, p), p);                               \
150      }                                                                                        \
151  }
152      
153  static void arm_fir_q15_25_32_mve(const arm_fir_instance_q15 * S, 
154    const q15_t * __restrict pSrc,
155    q15_t * __restrict pDst, uint32_t blockSize)
156  {
157      #define NBTAPS 32
158      #define NBVECTAPS (NBTAPS / 8)
159      FIR_Q15_MAIN_CORE();
160      #undef NBVECTAPS
161      #undef NBTAPS
162  }
163  
164  static void arm_fir_q15_17_24_mve(const arm_fir_instance_q15 * S, 
165    const q15_t * __restrict pSrc,
166    q15_t * __restrict pDst, uint32_t blockSize)
167  {
168      #define NBTAPS 24
169      #define NBVECTAPS (NBTAPS / 8)
170      FIR_Q15_MAIN_CORE();
171      #undef NBVECTAPS
172      #undef NBTAPS
173  }
174  
175  
176  static void arm_fir_q15_9_16_mve(const arm_fir_instance_q15 * S, 
177    const q15_t * __restrict pSrc,
178    q15_t * __restrict pDst, uint32_t blockSize)
179  {
180      #define NBTAPS 16
181      #define NBVECTAPS (NBTAPS / 8)
182      FIR_Q15_MAIN_CORE();
183      #undef NBVECTAPS
184      #undef NBTAPS
185  }
186  
187  static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, 
188    const q15_t * __restrict pSrc, 
189    q15_t * __restrict pDst, uint32_t blockSize)
190  {
191      #define NBTAPS 8
192      #define NBVECTAPS (NBTAPS / 8)
193      FIR_Q15_MAIN_CORE();
194      #undef NBVECTAPS
195      #undef NBTAPS
196  }
197  
198  
199  void arm_fir_q15(
200    const arm_fir_instance_q15 * S,
201    const q15_t * pSrc,
202          q15_t * pDst,
203          uint32_t blockSize)
204  {
205      q15_t    *pState = S->pState;   /* State pointer */
206      const q15_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
207      q15_t    *pStateCur;        /* Points to the current sample of the state */
208      const q15_t    *pSamples;         /* Temporary pointer to the sample buffer */
209      q15_t    *pOutput;          /* Temporary pointer to the output buffer */
210      const q15_t    *pTempSrc;         /* Temporary pointer to the source data */
211      q15_t    *pTempDest;        /* Temporary pointer to the destination buffer */
212      uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
213      uint32_t  blkCnt;
214      q15x8_t vecIn0;
215      uint32_t  tapsBlkCnt = (numTaps + 7) / 8;
216      q63_t     acc0, acc1, acc2, acc3;
217  
218  
219  int32_t nbTaps = (numTaps + 7) >> 3;
220  
221  switch(nbTaps) {
222  
223      case 1:
224          arm_fir_q15_1_8_mve(S, pSrc, pDst, blockSize);
225          return;
226      case 2:
227          arm_fir_q15_9_16_mve(S, pSrc, pDst, blockSize);
228          return;
229      case 3:
230          arm_fir_q15_17_24_mve(S, pSrc, pDst, blockSize);
231          return;
232      case 4:
233          arm_fir_q15_25_32_mve(S, pSrc, pDst, blockSize);
234          return;
235      }
236      /*
237       * pState points to state array which contains previous frame (numTaps - 1) samples
238       * pStateCur points to the location where the new input data should be written
239       */
240      pStateCur   = &(pState[(numTaps - 1u)]);
241      pTempSrc    = pSrc;
242      pSamples    = pState;
243      pOutput     = pDst;
244      blkCnt      = blockSize >> 2;
245  
246      while (blkCnt > 0U)
247      {
248          const q15_t    *pCoeffsTmp = pCoeffs;
249          const q15_t    *pSamplesTmp = pSamples;
250  
251          acc0 = 0LL;
252          acc1 = 0LL;
253          acc2 = 0LL;
254          acc3 = 0LL;
255  
256          /*
257           * Save 8 input samples in the history buffer
258           */
259          vst1q(pStateCur, vld1q(pTempSrc));
260          pStateCur += 8;
261          pTempSrc += 8;
262  
263          int       i = tapsBlkCnt;
264          while (i > 0)
265          {
266              /*
267               * load 8 coefs
268               */
269              q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
270  
271              vecIn0 = vld1q(pSamplesTmp);
272              acc0 =  vmlaldavaq(acc0, vecIn0, vecCoeffs);
273  
274              vecIn0 = vld1q(&pSamplesTmp[1]);
275              acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
276  
277              vecIn0 = vld1q(&pSamplesTmp[2]);
278              acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
279  
280              vecIn0 = vld1q(&pSamplesTmp[3]);
281              acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs);
282  
283              pSamplesTmp += 8;
284              pCoeffsTmp += 8;
285              /*
286               * Decrement the taps block loop counter
287               */
288              i--;
289          }
290  
291          *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
292          *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
293          *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
294          *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc3, 15);
295  
296          pSamples += 4;
297          /*
298           * Decrement the sample block loop counter
299           */
300          blkCnt--;
301      }
302  
303      uint32_t  residual = blockSize & 3;
304      switch (residual)
305      {
306      case 3:
307          {
308              const q15_t    *pCoeffsTmp = pCoeffs;
309              const q15_t    *pSamplesTmp = pSamples;
310  
311              acc0 = 0LL;
312              acc1 = 0LL;
313              acc2 = 0LL;
314  
315              /*
316               * Save 8 input samples in the history buffer
317               */
318              *(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc;
319              pStateCur += 8;
320              pTempSrc += 8;
321  
322              int       i = tapsBlkCnt;
323              while (i > 0)
324              {
325                  /*
326                   * load 8 coefs
327                   */
328                  q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
329  
330                  vecIn0 = vld1q(pSamplesTmp);
331                  acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
332  
333                  vecIn0 = vld1q(&pSamplesTmp[2]);
334                  acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
335  
336                  vecIn0 = vld1q(&pSamplesTmp[4]);
337                  acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
338  
339                  pSamplesTmp += 8;
340                  pCoeffsTmp += 8;
341                  /*
342                   * Decrement the taps block loop counter
343                   */
344                  i--;
345              }
346  
347              acc0 = asrl(acc0, 15);
348              acc1 = asrl(acc1, 15);
349              acc2 = asrl(acc2, 15);
350  
351              *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
352              *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
353              *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
354          }
355          break;
356  
357      case 2:
358          {
359              const q15_t    *pCoeffsTmp = pCoeffs;
360              const q15_t    *pSamplesTmp = pSamples;
361  
362              acc0 = 0LL;
363              acc1 = 0LL;
364              /*
365               * Save 8 input samples in the history buffer
366               */
367              vst1q(pStateCur, vld1q(pTempSrc));
368              pStateCur += 8;
369              pTempSrc += 8;
370  
371              int       i = tapsBlkCnt;
372              while (i > 0)
373              {
374                  /*
375                   * load 8 coefs
376                   */
377                  q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
378  
379                  vecIn0 = vld1q(pSamplesTmp);
380                  acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
381  
382                  vecIn0 = vld1q(&pSamplesTmp[2]);
383                  acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
384  
385                  pSamplesTmp += 8;
386                  pCoeffsTmp += 8;
387                  /*
388                   * Decrement the taps block loop counter
389                   */
390                  i--;
391              }
392  
393              *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
394              *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
395          }
396          break;
397  
398      case 1:
399          {
400              const q15_t    *pCoeffsTmp = pCoeffs;
401              const q15_t    *pSamplesTmp = pSamples;
402  
403              acc0 = 0LL;
404  
405              /*
406               * Save 8 input samples in the history buffer
407               */
408              vst1q(pStateCur, vld1q(pTempSrc));
409              pStateCur += 8;
410              pTempSrc += 8;
411  
412              int       i = tapsBlkCnt;
413              while (i > 0)
414              {
415                  /*
416                   * load 8 coefs
417                   */
418                  q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
419  
420                  vecIn0 = vld1q(pSamplesTmp);
421                  acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
422  
423                  pSamplesTmp += 8;
424                  pCoeffsTmp += 8;
425                  /*
426                   * Decrement the taps block loop counter
427                   */
428                  i--;
429              }
430  
431              *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
432          }
433          break;
434      }
435  
436      /*
437       * Copy the samples back into the history buffer start
438       */
439      pTempSrc = &pState[blockSize];
440      pTempDest = pState;
441  
442      blkCnt = numTaps >> 3;
443      while (blkCnt > 0U)
444      {
445          vst1q(pTempDest, vld1q(pTempSrc));
446          pTempSrc += 8;
447          pTempDest += 8;
448          blkCnt--;
449      }
450      blkCnt = numTaps & 7;
451      if (blkCnt > 0U)
452      {
453          mve_pred16_t p0 = vctp16q(blkCnt);
454          vstrhq_p_s16(pTempDest, vld1q(pTempSrc), p0);
455      }
456  }
457  
458  #else
/*
 * Scalar / DSP-extension implementation of the Q15 FIR filter, compiled when
 * the Helium (ARM_MATH_MVEI without ARM_MATH_AUTOVECTORIZE) variant is not.
 *
 * S          filter instance (numTaps, pCoeffs, pState)
 * pSrc       blockSize input samples
 * pDst       blockSize output samples
 * blockSize  number of samples to process
 *
 * New samples are appended to the state buffer after the (numTaps - 1)
 * samples kept from the previous call. Each output is the sum of
 * coefficient * state products over a sliding window, accumulated in 64 bits,
 * shifted right by 15 and saturated to 16 bits. At the end the last
 * (numTaps - 1) samples are moved to the start of the state buffer.
 *
 * Both the unrolled path and the per-sample loop consume taps in pairs, so
 * the code assumes an even numTaps: with an odd value the per-sample loop
 * leaves the last tap out.
 */
void arm_fir_q15(
  const arm_fir_instance_q15 * S,
  const q15_t * pSrc,
        q15_t * pDst,
        uint32_t blockSize)
{
        q15_t *pState = S->pState;                     /* State pointer; advances as outputs are produced */
  const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
        q15_t *pStateCurnt;                            /* Points to the current sample of the state */
        q15_t *px;                                     /* Temporary pointer for state buffer */
  const q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
        q63_t acc0;                                    /* Accumulators */
        uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
        uint32_t tapCnt, blkCnt;                       /* Loop counters */

#if defined (ARM_MATH_LOOPUNROLL)
        q63_t acc1, acc2, acc3;                        /* Accumulators */
        q31_t x0, x1, x2, c0;                          /* Temporary variables to hold state and coefficient values */
#endif

  /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
  /* pStateCurnt points to the location where the new input data should be written */
  pStateCurnt = &(S->pState[(numTaps - 1U)]);

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: Compute 4 output values simultaneously.
   * The variables acc0 ... acc3 hold output values that are being computed:
   *
   *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
   *    acc1 =  b[numTaps-1] * x[n-numTaps]   + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
   *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps]   + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
   *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
   */
  blkCnt = blockSize >> 2U;

  while (blkCnt > 0U)
  {
    /* Copy 4 new input samples into the state buffer. */
    *pStateCurnt++ = *pSrc++;
    *pStateCurnt++ = *pSrc++;
    *pStateCurnt++ = *pSrc++;
    *pStateCurnt++ = *pSrc++;

    /* Set all accumulators to zero */
    acc0 = 0;
    acc1 = 0;
    acc2 = 0;
    acc3 = 0;

    /* px walks the state buffer from the start of the current window;
       samples are fetched two at a time through read_q15x2_ia */
    px = pState;

    /* pb walks the coefficient array; coefficients are fetched two at a time
       through read_q15x2_ia */
    pb = pCoeffs;

    /* Read the first two samples from the state buffer:  x[n-N], x[n-N-1] */
    x0 = read_q15x2_ia (&px);

    /* Read the third and fourth samples from the state buffer: x[n-N-2], x[n-N-3] */
    x2 = read_q15x2_ia (&px);

    /* Loop over the number of taps.  Unroll by a factor of 4.
       Repeat until we've computed numTaps-(numTaps%4) coefficients. */
    tapCnt = numTaps >> 2U;

    while (tapCnt > 0U)
    {
      /* Read the first two coefficients using SIMD:  b[N] and b[N-1] coefficients */
      c0 = read_q15x2_ia (&pb);

      /* acc0 +=  b[N] * x[n-N] + b[N-1] * x[n-N-1] */
      acc0 = __SMLALD(x0, c0, acc0);

      /* acc2 +=  b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
      acc2 = __SMLALD(x2, c0, acc2);

      /* pack  x[n-N-1] and x[n-N-2] */
#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(x2, x0, 0);
#else
      x1 = __PKHBT(x0, x2, 0);
#endif

      /* Read state x[n-N-4], x[n-N-5] */
      x0 = read_q15x2_ia (&px);

      /* acc1 +=  b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
      acc1 = __SMLALDX(x1, c0, acc1);

      /* pack  x[n-N-3] and x[n-N-4] */
#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(x0, x2, 0);
#else
      x1 = __PKHBT(x2, x0, 0);
#endif

      /* acc3 +=  b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
      acc3 = __SMLALDX(x1, c0, acc3);

      /* Read coefficients b[N-2], b[N-3] */
      c0 = read_q15x2_ia (&pb);

      /* acc0 +=  b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
      acc0 = __SMLALD(x2, c0, acc0);

      /* Read state x[n-N-6], x[n-N-7] with offset */
      x2 = read_q15x2_ia (&px);

      /* acc2 +=  b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
      acc2 = __SMLALD(x0, c0, acc2);

      /* acc1 +=  b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
      acc1 = __SMLALDX(x1, c0, acc1);

      /* pack  x[n-N-5] and x[n-N-6] */
#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(x2, x0, 0);
#else
      x1 = __PKHBT(x0, x2, 0);
#endif

      /* acc3 +=  b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
      acc3 = __SMLALDX(x1, c0, acc3);

      /* Decrement tap count */
      tapCnt--;
    }

    /* If the filter length is not a multiple of 4, compute the remaining filter taps.
       This is always 2 taps since the filter length is assumed to be even. */
    if ((numTaps & 0x3U) != 0U)
    {
      /* Read last two coefficients */
      c0 = read_q15x2_ia (&pb);

      /* Perform the multiply-accumulates */
      acc0 = __SMLALD(x0, c0, acc0);
      acc2 = __SMLALD(x2, c0, acc2);

      /* pack state variables */
#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(x2, x0, 0);
#else
      x1 = __PKHBT(x0, x2, 0);
#endif

      /* Read last state variables */
      x0 = read_q15x2 (px);

      /* Perform the multiply-accumulates */
      acc1 = __SMLALDX(x1, c0, acc1);

      /* pack state variables */
#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(x0, x2, 0);
#else
      x1 = __PKHBT(x2, x0, 0);
#endif

      /* Perform the multiply-accumulates */
      acc3 = __SMLALDX(x1, c0, acc3);
    }

    /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
       Then store the 4 outputs in the destination buffer, two per 32-bit write. */
#ifndef ARM_MATH_BIG_ENDIAN
    write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
    write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
#else
    write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
    write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* Advance the state pointer by 4 to process the next group of 4 samples */
    pState = pState + 4U;

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: Compute remaining output samples */
  blkCnt = blockSize % 0x4U;

#else

  /* Initialize blkCnt with the number of samples to process */
  blkCnt = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* Copy one new input sample into the state buffer */
    *pStateCurnt++ = *pSrc++;

    /* Set the accumulator to zero */
    acc0 = 0;

    /* Start at the oldest sample of the current window and at the first coefficient */
    px = pState;
    pb = pCoeffs;

    /* Two taps are accumulated per iteration, so numTaps >> 1 iterations
       cover an even numTaps; an odd last tap would not be accumulated */
    tapCnt = numTaps >> 1U;

    while (tapCnt > 0U)
    {
      acc0 += (q31_t) *px++ * *pb++;
	    acc0 += (q31_t) *px++ * *pb++;

      tapCnt--;
    }
    

    /* The result is in 2.30 format. Convert to 1.15 with saturation.
       Then store the output in the destination buffer. */
    *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));

    /* Advance state pointer by 1 for the next sample */
    pState = pState + 1U;

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Processing is complete.
     Now copy the last numTaps - 1 samples to the start of the state buffer.
     This prepares the state buffer for the next function call. */

  /* Points to the start of the state buffer */
  pStateCurnt = S->pState;

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: Copy 4 samples at a time */
  tapCnt = (numTaps - 1U) >> 2U;

  /* Copy data */
  while (tapCnt > 0U)
  {
    *pStateCurnt++ = *pState++;
    *pStateCurnt++ = *pState++;
    *pStateCurnt++ = *pState++;
    *pStateCurnt++ = *pState++;

    /* Decrement loop counter */
    tapCnt--;
  }

  /* Calculate remaining number of copies */
  tapCnt = (numTaps - 1U) % 0x4U;

#else

  /* Initialize tapCnt with the number of samples to copy (numTaps - 1) */
  tapCnt = (numTaps - 1U);

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  /* Copy remaining data */
  while (tapCnt > 0U)
  {
    *pStateCurnt++ = *pState++;

    /* Decrement loop counter */
    tapCnt--;
  }

}
728  #endif /* defined(ARM_MATH_MVEI) */
729  
730  /**
731    @} end of FIR group
732   */