/ Drivers / CMSIS / DSP / Source / FilteringFunctions / arm_conv_q7.c
arm_conv_q7.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_conv_q7.c
  4   * Description:  Convolution of Q7 sequences
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/filtering_functions.h"
 30  
 31  /**
 32    @ingroup groupFilters
 33   */
 34  
 35  /**
 36    @addtogroup Conv
 37    @{
 38   */
 39  
 40  /**
 41    @brief         Convolution of Q7 sequences.
 42    @param[in]     pSrcA      points to the first input sequence
 43    @param[in]     srcALen    length of the first input sequence
 44    @param[in]     pSrcB      points to the second input sequence
 45    @param[in]     srcBLen    length of the second input sequence
 46    @param[out]    pDst       points to the location where the output result is written.  Length srcALen+srcBLen-1.
 47    @return        none
 48  
 49    @par           Scaling and Overflow Behavior
 50                     The function is implemented using a 32-bit internal accumulator.
 51                     Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
 52                     The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
 53                     This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
 54                     The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
 55    @remark
 56                     Refer to \ref arm_conv_opt_q7() for a faster implementation of this function.
 57   */
 58  #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 59  #include "arm_helium_utils.h"
 60  
 61  #include "arm_vec_filtering.h"
 62  
 63  void arm_conv_q7(
 64    const q7_t * pSrcA,
 65          uint32_t srcALen,
 66    const q7_t * pSrcB,
 67          uint32_t srcBLen,
 68          q7_t * pDst)
 69  {
 70      const q7_t     *pIn1 = pSrcA;     /* inputA pointer               */
 71      const q7_t     *pIn2 = pSrcB;     /* inputB pointer               */
 72      /*
 73       * Loop to perform MAC operations according to correlation equation
 74       */
 75      const q7_t     *pX;
 76      const q7_t     *pY;
 77      const q7_t     *pA;
 78      const q7_t     *pB;
 79      int32_t   i = 0U, j = 0;    /* loop counters */
 80      int32_t   block1, block2, block3;
 81      uint8_t   vddupStartIdx = 15;
 82      uint8x16_t decrIdxVec = vddupq_u8(vddupStartIdx, 1);
 83  
 84      if (srcALen < srcBLen)
 85      {
 86          /*
 87           * Initialization to inputB pointer
 88           */
 89          pIn1 = pSrcB;
 90          /*
 91           * Initialization to the end of inputA pointer
 92           */
 93          pIn2 = pSrcA;
 94          /*
 95           * Swapping the lengths
 96           */
 97          j = srcALen;
 98          srcALen = srcBLen;
 99          srcBLen = j;
100      }
101  
102      block1 = srcBLen - 1;
103      block2 = srcALen - srcBLen + 1;
104      block3 = srcBLen - 1;
105  
106      pA = pIn1;
107      pB = pIn2 - 15;
108  
109      for (i = 0; i <= block1 - 2; i += 2)
110      {
111          uint32_t  count = i + 1;
112          int32_t   acc0 = 0;
113          int32_t   acc1 = 0;
114  
115          pX = pA;
116          pY = pB;
117  
118          MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count);
119          *pDst++ = (q7_t) acc0;
120          *pDst++ = (q7_t) acc1;
121          pB += 2;
122      }
123      for (; i < block1; i++)
124      {
125          uint32_t  count = i + 1;
126          int32_t   acc = 0;
127  
128          pX = pA;
129          pY = pB;
130  
131          MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count);
132          *pDst++ = (q7_t) acc;
133          pB++;
134      }
135  
136      for (i = 0; i <= block2 - 4; i += 4)
137      {
138          uint32_t  count = srcBLen;
139          int32_t   acc0 = 0;
140          int32_t   acc1 = 0;
141          int32_t   acc2 = 0;
142          int32_t   acc3 = 0;
143  
144          pX = pA;
145          pY = pB;
146          /*
147           * compute 4 accumulators per loop
148           * size is fixed for all accumulators
149           * X pointer is incrementing for successive accumulators
150           */
151          MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count);
152          *pDst++ = (q7_t) acc0;
153          *pDst++ = (q7_t) acc1;
154          *pDst++ = (q7_t) acc2;
155          *pDst++ = (q7_t) acc3;
156          pA += 4;
157      }
158      for (; i <= block2 - 2; i += 2)
159      {
160          uint32_t  count = srcBLen;
161          int32_t   acc0 = 0;
162          int32_t   acc1 = 0;
163  
164          pX = pA;
165          pY = pB;
166          /*
167           * compute 2 accumulators per loop
168           * size is fixed for all accumulators
169           * X pointer is incrementing for successive accumulators
170           */
171          MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count);
172          *pDst++ = (q7_t) acc0;
173          *pDst++ = (q7_t) acc1;
174          pA += 2;
175      }
176      if (block2 & 1)
177      {
178          uint32_t  count = srcBLen;
179          int32_t   acc = 0;
180  
181          pX = pA;
182          pY = pB;
183  
184          MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count);
185          *pDst++ = (q7_t) acc;
186          pA++;
187      }
188  
189      for (i = block3; i >= 1; i -= 2)
190      {
191          uint32_t  count = i;
192          int32_t   acc0 = 0;
193          int32_t   acc1 = 0;
194  
195          pX = pA;
196          pY = pB;
197  
198          MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count);
199          *pDst++ = (q7_t) acc0;
200          *pDst++ = (q7_t) acc1;
201          pA += 2;
202      }
203      for (; i >= 1; i--)
204      {
205          uint32_t  count = i;
206          int32_t   acc = 0;
207  
208          pX = pA;
209          pY = pB;
210  
211          MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count);
212          *pDst++ = (q7_t) acc;
213          pA++;
214      }
215  }
216  
217  #else
218  void arm_conv_q7(
219    const q7_t * pSrcA,
220          uint32_t srcALen,
221    const q7_t * pSrcB,
222          uint32_t srcBLen,
223          q7_t * pDst)
224  {
225  
226  #if (1)
227  //#if !defined(ARM_MATH_CM0_FAMILY)
228  
229    const q7_t *pIn1;                                    /* InputA pointer */
230    const q7_t *pIn2;                                    /* InputB pointer */
231          q7_t *pOut = pDst;                             /* Output pointer */
232    const q7_t *px;                                      /* Intermediate inputA pointer */
233    const q7_t *py;                                      /* Intermediate inputB pointer */
234    const q7_t *pSrc1, *pSrc2;                           /* Intermediate pointers */
235          q31_t sum;                                     /* Accumulators */
236          uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
237          uint32_t j, k, count, blkCnt;                  /* Loop counters */
238  
239  #if defined (ARM_MATH_LOOPUNROLL)
240          q31_t acc0, acc1, acc2, acc3;                  /* Accumulators */
241          q31_t input1, input2;                          /* Temporary input variables */
242          q15_t in1, in2;                                /* Temporary input variables */
243          q7_t x0, x1, x2, x3, c0, c1;                   /* Temporary variables to hold state and coefficient values */
244  #endif
245  
246    /* The algorithm implementation is based on the lengths of the inputs. */
247    /* srcB is always made to slide across srcA. */
248    /* So srcBLen is always considered as shorter or equal to srcALen */
249    if (srcALen >= srcBLen)
250    {
251      /* Initialization of inputA pointer */
252      pIn1 = pSrcA;
253  
254      /* Initialization of inputB pointer */
255      pIn2 = pSrcB;
256    }
257    else
258    {
259      /* Initialization of inputA pointer */
260      pIn1 = pSrcB;
261  
262      /* Initialization of inputB pointer */
263      pIn2 = pSrcA;
264  
265      /* srcBLen is always considered as shorter or equal to srcALen */
266      j = srcBLen;
267      srcBLen = srcALen;
268      srcALen = j;
269    }
270  
271    /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
272    /* The function is internally
273     * divided into three stages according to the number of multiplications that has to be
274     * taken place between inputA samples and inputB samples. In the first stage of the
275     * algorithm, the multiplications increase by one for every iteration.
276     * In the second stage of the algorithm, srcBLen number of multiplications are done.
277     * In the third stage of the algorithm, the multiplications decrease by one
278     * for every iteration. */
279  
280    /* The algorithm is implemented in three stages.
281       The loop counters of each stage is initiated here. */
282    blockSize1 = srcBLen - 1U;
283    blockSize2 = srcALen - (srcBLen - 1U);
284    blockSize3 = blockSize1;
285  
286    /* --------------------------
287     * Initializations of stage1
288     * -------------------------*/
289  
290    /* sum = x[0] * y[0]
291     * sum = x[0] * y[1] + x[1] * y[0]
292     * ....
293     * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
294     */
295  
296    /* In this stage the MAC operations are increased by 1 for every iteration.
297       The count variable holds the number of MAC operations performed */
298    count = 1U;
299  
300    /* Working pointer of inputA */
301    px = pIn1;
302  
303    /* Working pointer of inputB */
304    py = pIn2;
305  
306  
307    /* ------------------------
308     * Stage1 process
309     * ----------------------*/
310  
311    /* The first stage starts here */
312    while (blockSize1 > 0U)
313    {
314      /* Accumulator is made zero for every iteration */
315      sum = 0;
316  
317  #if defined (ARM_MATH_LOOPUNROLL)
318  
319      /* Loop unrolling: Compute 4 outputs at a time */
320      k = count >> 2U;
321  
322      while (k > 0U)
323      {
324        /* x[0] , x[1] */
325        in1 = (q15_t) *px++;
326        in2 = (q15_t) *px++;
327        input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
328  
329        /* y[srcBLen - 1] , y[srcBLen - 2] */
330        in1 = (q15_t) *py--;
331        in2 = (q15_t) *py--;
332        input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
333  
334        /* x[0] * y[srcBLen - 1] */
335        /* x[1] * y[srcBLen - 2] */
336        sum = __SMLAD(input1, input2, sum);
337  
338        /* x[2] , x[3] */
339        in1 = (q15_t) *px++;
340        in2 = (q15_t) *px++;
341        input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
342  
343        /* y[srcBLen - 3] , y[srcBLen - 4] */
344        in1 = (q15_t) *py--;
345        in2 = (q15_t) *py--;
346        input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
347  
348        /* x[2] * y[srcBLen - 3] */
349        /* x[3] * y[srcBLen - 4] */
350        sum = __SMLAD(input1, input2, sum);
351  
352        /* Decrement loop counter */
353        k--;
354      }
355  
356      /* Loop unrolling: Compute remaining outputs */
357      k = count % 0x4U;
358  
359  #else
360  
361      /* Initialize k with number of samples */
362      k = count;
363  
364  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
365  
366      while (k > 0U)
367      {
368        /* Perform the multiply-accumulate */
369        sum += ((q15_t) *px++ * *py--);
370  
371        /* Decrement loop counter */
372        k--;
373      }
374  
375      /* Store the result in the accumulator in the destination buffer. */
376      *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
377  
378      /* Update the inputA and inputB pointers for next MAC calculation */
379      py = pIn2 + count;
380      px = pIn1;
381  
382      /* Increment MAC count */
383      count++;
384  
385      /* Decrement loop counter */
386      blockSize1--;
387    }
388  
389    /* --------------------------
390     * Initializations of stage2
391     * ------------------------*/
392  
393    /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
394     * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen]   * y[0]
395     * ....
396     * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
397     */
398  
399    /* Working pointer of inputA */
400    px = pIn1;
401  
402    /* Working pointer of inputB */
403    pSrc2 = pIn2 + (srcBLen - 1U);
404    py = pSrc2;
405  
406    /* count is index by which the pointer pIn1 to be incremented */
407    count = 0U;
408  
409    /* -------------------
410     * Stage2 process
411     * ------------------*/
412  
413    /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
414     * So, to loop unroll over blockSize2,
415     * srcBLen should be greater than or equal to 4 */
416    if (srcBLen >= 4U)
417    {
418  #if defined (ARM_MATH_LOOPUNROLL)
419  
420      /* Loop unrolling: Compute 4 outputs at a time */
421      blkCnt = blockSize2 >> 2U;
422  
423      while (blkCnt > 0U)
424      {
425        /* Set all accumulators to zero */
426        acc0 = 0;
427        acc1 = 0;
428        acc2 = 0;
429        acc3 = 0;
430  
431        /* read x[0], x[1], x[2] samples */
432        x0 = *px++;
433        x1 = *px++;
434        x2 = *px++;
435  
436        /* Apply loop unrolling and compute 4 MACs simultaneously. */
437        k = srcBLen >> 2U;
438  
439        /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
440         ** a second loop below computes MACs for the remaining 1 to 3 samples. */
441        do
442        {
443          /* Read y[srcBLen - 1] sample */
444          c0 = *py--;
445          /* Read y[srcBLen - 2] sample */
446          c1 = *py--;
447  
448          /* Read x[3] sample */
449          x3 = *px++;
450  
451          /* x[0] and x[1] are packed */
452          in1 = (q15_t) x0;
453          in2 = (q15_t) x1;
454  
455          input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
456  
457          /* y[srcBLen - 1]   and y[srcBLen - 2] are packed */
458          in1 = (q15_t) c0;
459          in2 = (q15_t) c1;
460  
461          input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
462  
463          /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2]  */
464          acc0 = __SMLAD(input1, input2, acc0);
465  
466          /* x[1] and x[2] are packed */
467          in1 = (q15_t) x1;
468          in2 = (q15_t) x2;
469  
470          input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
471  
472          /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2]  */
473          acc1 = __SMLAD(input1, input2, acc1);
474  
475          /* x[2] and x[3] are packed */
476          in1 = (q15_t) x2;
477          in2 = (q15_t) x3;
478  
479          input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
480  
481          /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2]  */
482          acc2 = __SMLAD(input1, input2, acc2);
483  
484          /* Read x[4] sample */
485          x0 = *px++;
486  
487          /* x[3] and x[4] are packed */
488          in1 = (q15_t) x3;
489          in2 = (q15_t) x0;
490  
491          input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
492  
493          /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2]  */
494          acc3 = __SMLAD(input1, input2, acc3);
495  
496          /* Read y[srcBLen - 3] sample */
497          c0 = *py--;
498          /* Read y[srcBLen - 4] sample */
499          c1 = *py--;
500  
501          /* Read x[5] sample */
502          x1 = *px++;
503  
504          /* x[2] and x[3] are packed */
505          in1 = (q15_t) x2;
506          in2 = (q15_t) x3;
507  
508          input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
509  
510          /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
511          in1 = (q15_t) c0;
512          in2 = (q15_t) c1;
513  
514          input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
515  
516          /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4]  */
517          acc0 = __SMLAD(input1, input2, acc0);
518  
519          /* x[3] and x[4] are packed */
520          in1 = (q15_t) x3;
521          in2 = (q15_t) x0;
522  
523          input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
524  
525          /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4]  */
526          acc1 = __SMLAD(input1, input2, acc1);
527  
528          /* x[4] and x[5] are packed */
529          in1 = (q15_t) x0;
530          in2 = (q15_t) x1;
531  
532          input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
533  
534          /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4]  */
535          acc2 = __SMLAD(input1, input2, acc2);
536  
537          /* Read x[6] sample */
538          x2 = *px++;
539  
540          /* x[5] and x[6] are packed */
541          in1 = (q15_t) x1;
542          in2 = (q15_t) x2;
543  
544          input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
545  
546          /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4]  */
547          acc3 = __SMLAD(input1, input2, acc3);
548  
549        } while (--k);
550  
551        /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
552         ** No loop unrolling is used. */
553        k = srcBLen % 0x4U;
554  
555        while (k > 0U)
556        {
557          /* Read y[srcBLen - 5] sample */
558          c0 = *py--;
559          /* Read x[7] sample */
560          x3 = *px++;
561  
562          /* Perform the multiply-accumulates */
563          /* acc0 +=  x[4] * y[srcBLen - 5] */
564          acc0 += ((q15_t) x0 * c0);
565          /* acc1 +=  x[5] * y[srcBLen - 5] */
566          acc1 += ((q15_t) x1 * c0);
567          /* acc2 +=  x[6] * y[srcBLen - 5] */
568          acc2 += ((q15_t) x2 * c0);
569          /* acc3 +=  x[7] * y[srcBLen - 5] */
570          acc3 += ((q15_t) x3 * c0);
571  
572          /* Reuse the present samples for the next MAC */
573          x0 = x1;
574          x1 = x2;
575          x2 = x3;
576  
577          /* Decrement loop counter */
578          k--;
579        }
580  
581        /* Store the result in the accumulator in the destination buffer. */
582        *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
583        *pOut++ = (q7_t) (__SSAT(acc1 >> 7U, 8));
584        *pOut++ = (q7_t) (__SSAT(acc2 >> 7U, 8));
585        *pOut++ = (q7_t) (__SSAT(acc3 >> 7U, 8));
586  
587        /* Increment the pointer pIn1 index, count by 4 */
588        count += 4U;
589  
590        /* Update the inputA and inputB pointers for next MAC calculation */
591        px = pIn1 + count;
592        py = pSrc2;
593  
594        /* Decrement loop counter */
595        blkCnt--;
596      }
597  
598      /* Loop unrolling: Compute remaining outputs */
599      blkCnt = blockSize2 % 0x4U;
600  
601  #else
602  
603      /* Initialize blkCnt with number of samples */
604      blkCnt = blockSize2;
605  
606  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
607  
608      while (blkCnt > 0U)
609      {
610        /* Accumulator is made zero for every iteration */
611        sum = 0;
612  
613  #if defined (ARM_MATH_LOOPUNROLL)
614  
615      /* Loop unrolling: Compute 4 outputs at a time */
616        k = srcBLen >> 2U;
617  
618        while (k > 0U)
619        {
620  
621          /* Reading two inputs of SrcA buffer and packing */
622          in1 = (q15_t) *px++;
623          in2 = (q15_t) *px++;
624          input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
625  
626          /* Reading two inputs of SrcB buffer and packing */
627          in1 = (q15_t) *py--;
628          in2 = (q15_t) *py--;
629          input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
630  
631          /* Perform the multiply-accumulate */
632          sum = __SMLAD(input1, input2, sum);
633  
634          /* Reading two inputs of SrcA buffer and packing */
635          in1 = (q15_t) *px++;
636          in2 = (q15_t) *px++;
637          input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
638  
639          /* Reading two inputs of SrcB buffer and packing */
640          in1 = (q15_t) *py--;
641          in2 = (q15_t) *py--;
642          input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
643  
644          /* Perform the multiply-accumulate */
645          sum = __SMLAD(input1, input2, sum);
646  
647          /* Decrement loop counter */
648          k--;
649        }
650  
651        /* Loop unrolling: Compute remaining outputs */
652        k = srcBLen % 0x4U;
653  
654  #else
655  
656        /* Initialize blkCnt with number of samples */
657        k = srcBLen;
658  
659  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
660  
661        while (k > 0U)
662        {
663          /* Perform the multiply-accumulate */
664          sum += ((q15_t) *px++ * *py--);
665  
666          /* Decrement the loop counter */
667          k--;
668        }
669  
670        /* Store the result in the accumulator in the destination buffer. */
671        *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
672  
673        /* Increment the pointer pIn1 index, count by 1 */
674        count++;
675  
676        /* Update the inputA and inputB pointers for next MAC calculation */
677        px = pIn1 + count;
678        py = pSrc2;
679  
680        /* Decrement the loop counter */
681        blkCnt--;
682      }
683    }
684    else
685    {
686      /* If the srcBLen is not a multiple of 4,
687       * the blockSize2 loop cannot be unrolled by 4 */
688      blkCnt = blockSize2;
689  
690      while (blkCnt > 0U)
691      {
692        /* Accumulator is made zero for every iteration */
693        sum = 0;
694  
695        /* srcBLen number of MACS should be performed */
696        k = srcBLen;
697  
698        while (k > 0U)
699        {
700          /* Perform the multiply-accumulate */
701          sum += ((q15_t) *px++ * *py--);
702  
703          /* Decrement the loop counter */
704          k--;
705        }
706  
707        /* Store the result in the accumulator in the destination buffer. */
708        *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
709  
710        /* Increment the MAC count */
711        count++;
712  
713        /* Update the inputA and inputB pointers for next MAC calculation */
714        px = pIn1 + count;
715        py = pSrc2;
716  
717        /* Decrement loop counter */
718        blkCnt--;
719      }
720    }
721  
722  
723    /* --------------------------
724     * Initializations of stage3
725     * -------------------------*/
726  
727    /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
728     * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
729     * ....
730     * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
731     * sum +=  x[srcALen-1] * y[srcBLen-1]
732     */
733  
734    /* In this stage the MAC operations are decreased by 1 for every iteration.
735       The blockSize3 variable holds the number of MAC operations performed */
736  
737    /* Working pointer of inputA */
738    pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
739    px = pSrc1;
740  
741    /* Working pointer of inputB */
742    pSrc2 = pIn2 + (srcBLen - 1U);
743    py = pSrc2;
744  
745    /* -------------------
746     * Stage3 process
747     * ------------------*/
748  
749    while (blockSize3 > 0U)
750    {
751      /* Accumulator is made zero for every iteration */
752      sum = 0;
753  
754  #if defined (ARM_MATH_LOOPUNROLL)
755  
756      /* Loop unrolling: Compute 4 outputs at a time */
757      k = blockSize3 >> 2U;
758  
759      while (k > 0U)
760      {
761        /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
762        in1 = (q15_t) *px++;
763        in2 = (q15_t) *px++;
764        input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
765  
766        /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
767        in1 = (q15_t) *py--;
768        in2 = (q15_t) *py--;
769        input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
770  
771        /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
772        /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
773        sum = __SMLAD(input1, input2, sum);
774  
775        /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
776        in1 = (q15_t) *px++;
777        in2 = (q15_t) *px++;
778        input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
779  
780        /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
781        in1 = (q15_t) *py--;
782        in2 = (q15_t) *py--;
783        input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
784  
785        /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
786        /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
787        sum = __SMLAD(input1, input2, sum);
788  
789        /* Decrement loop counter */
790        k--;
791      }
792  
793      /* Loop unrolling: Compute remaining outputs */
794      k = blockSize3 % 0x4U;
795  
796  #else
797  
798      /* Initialize blkCnt with number of samples */
799      k = blockSize3;
800  
801  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
802  
803      while (k > 0U)
804      {
805        /* Perform the multiply-accumulate */
806        /* sum +=  x[srcALen-1] * y[srcBLen-1] */
807        sum += ((q15_t) *px++ * *py--);
808  
809        /* Decrement loop counter */
810        k--;
811      }
812  
813      /* Store the result in the accumulator in the destination buffer. */
814      *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
815  
816      /* Update the inputA and inputB pointers for next MAC calculation */
817      px = ++pSrc1;
818      py = pSrc2;
819  
820      /* Decrement loop counter */
821      blockSize3--;
822    }
823  
824  #else
825  /* alternate version for CM0_FAMILY */
826  
827    const q7_t *pIn1 = pSrcA;                            /* InputA pointer */
828    const q7_t *pIn2 = pSrcB;                            /* InputB pointer */
829          q31_t sum;                                     /* Accumulator */
830          uint32_t i, j;                                 /* Loop counters */
831  
832    /* Loop to calculate convolution for output length number of times */
833    for (i = 0U; i < (srcALen + srcBLen - 1U); i++)
834    {
835      /* Initialize sum with zero to carry out MAC operations */
836      sum = 0;
837  
838      /* Loop to perform MAC operations according to convolution equation */
839      for (j = 0U; j <= i; j++)
840      {
841        /* Check the array limitations */
842        if (((i - j) < srcBLen) && (j < srcALen))
843        {
844          /* z[i] += x[i-j] * y[j] */
845          sum += ((q15_t) pIn1[j] * pIn2[i - j]);
846        }
847      }
848  
849      /* Store the output in the destination buffer */
850      pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U);
851    }
852  
853  #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
854  
855  }
856  #endif /* defined(ARM_MATH_MVEI) */
857  
858  /**
859    @} end of Conv group
860   */