/ Drivers / CMSIS / DSP / Source / FilteringFunctions / arm_conv_q31.c
arm_conv_q31.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_conv_q31.c
  4   * Description:  Convolution of Q31 sequences
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/filtering_functions.h"
 30  
 31  /**
 32    @ingroup groupFilters
 33   */
 34  
 35  /**
 36    @addtogroup Conv
 37    @{
 38   */
 39  
 40  /**
 41    @brief         Convolution of Q31 sequences.
 42    @param[in]     pSrcA      points to the first input sequence
 43    @param[in]     srcALen    length of the first input sequence
 44    @param[in]     pSrcB      points to the second input sequence
 45    @param[in]     srcBLen    length of the second input sequence
 46    @param[out]    pDst       points to the location where the output result is written.  Length srcALen+srcBLen-1.
 47    @return        none
 48  
 49    @par           Scaling and Overflow Behavior
 50                     The function is implemented using an internal 64-bit accumulator.
 51                     The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
 52                     There is no saturation on intermediate additions.
 53                     Thus, if the accumulator overflows it wraps around and distorts the result.
 54                     The input signals should be scaled down to avoid intermediate overflows.
 55                     Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
 56                     as maximum of min(srcALen, srcBLen) number of additions are carried internally.
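 57                     The 2.62 accumulator is right shifted by 31 bits and converted to 1.31 format to yield the final result. Note that the non-MVE implementation in this file narrows the shifted value with a plain cast, i.e. without saturation.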
 58  
 59    @remark
 60                     Refer to \ref arm_conv_fast_q31() for a faster but less precise implementation of this function.
 61   */
 62  #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 63  #include "arm_helium_utils.h"
 64  #include "arm_vec_filtering.h"
 65  
 66  void arm_conv_q31(
 67    const q31_t * pSrcA,
 68          uint32_t srcALen,
 69    const q31_t * pSrcB,
 70          uint32_t srcBLen,
 71          q31_t * pDst)
 72  {
 73      const q31_t    *pIn1 = pSrcA;     /* inputA pointer               */
 74      const q31_t    *pIn2 = pSrcB;     /* inputB pointer               */
 75      /*
 76       * Loop to perform MAC operations according to correlation equation
 77       */
 78      const q31_t    *pX;
 79      const q31_t    *pY;
 80      const q31_t    *pA;
 81      const q31_t    *pB;
 82      int32_t   i = 0U, j = 0;    /* loop counters */
 83      int32_t   block1, block2, block3;
 84      uint32_t  vddupStartIdx = 3;
 85      uint32x4_t decrIdxVec = vddupq_u32(vddupStartIdx, 1);
 86  
 87      if (srcALen < srcBLen)
 88      {
 89          /*
 90           * Initialization to inputB pointer
 91           */
 92          pIn1 = pSrcB;
 93          /*
 94           * Initialization to the end of inputA pointer
 95           */
 96          pIn2 = pSrcA;
 97          /*
 98           * Swapping the lengths
 99           */
100          j = srcALen;
101          srcALen = srcBLen;
102          srcBLen = j;
103      }
104  
105      block1 = srcBLen - 1;
106      block2 = srcALen - srcBLen + 1;
107      block3 = srcBLen - 1;
108  
109      pA = pIn1;
110      pB = pIn2 - 3;
111  
112      for (i = 0; i <= block1 - 2; i += 2)
113      {
114          uint32_t  count = i + 1;
115          int64_t   acc0 = 0LL;
116          int64_t   acc1 = 0LL;
117  
118          pX = pA;
119          pY = pB;
120          MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count);
121  
122          *pDst++ = (q31_t) acc0;
123          *pDst++ = (q31_t) acc1;
124          pB += 2;
125      }
126      for (; i < block1; i++)
127      {
128          uint32_t  count = i + 1;
129          int64_t   acc = 0LL;
130  
131          pX = pA;
132          pY = pB;
133          MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count);
134  
135          *pDst++ = (q31_t) acc;
136          pB++;
137      }
138  
139      for (i = 0; i <= block2 - 4; i += 4)
140      {
141          uint32_t  count = srcBLen;
142          int64_t   acc0 = 0LL;
143          int64_t   acc1 = 0LL;
144          int64_t   acc2 = 0LL;
145          int64_t   acc3 = 0LL;
146  
147          pX = pA;
148          pY = pB;
149          /*
150           * compute 4 accumulators per loop
151           * size is fixed for all accumulators
152           * X pointer is incrementing for successive accumulators
153           */
154          MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count);
155          *pDst++ = (q31_t) acc0;
156          *pDst++ = (q31_t) acc1;
157          *pDst++ = (q31_t) acc2;
158          *pDst++ = (q31_t) acc3;
159  
160          pA += 4;
161      }
162  
163      for (; i <= block2 - 2; i += 2)
164      {
165          uint32_t  count = srcBLen;
166          int64_t   acc0 = 0LL;
167          int64_t   acc1 = 0LL;
168  
169          pX = pA;
170          pY = pB;
171          /*
172           * compute 2 accumulators per loop
173           * size is fixed for all accumulators
174           * X pointer is incrementing for successive accumulators
175           */
176          MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count);
177          *pDst++ = (q31_t) acc0;
178          *pDst++ = (q31_t) acc1;
179  
180          pA += 2;
181      }
182      if (block2 & 1)
183      {
184          uint32_t  count = srcBLen;
185          int64_t   acc = 0LL;
186  
187          pX = pA;
188          pY = pB;
189  
190          MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count);
191          *pDst++ = (q31_t) acc;
192          pA++;
193      }
194  
195      for (i = block3; i >= 2; i -= 2)
196      {
197          uint32_t  count = i;
198          int64_t   acc0 = 0LL;
199          int64_t   acc1 = 0LL;
200  
201          pX = pA;
202          pY = pB;
203  
204          MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count);
205          *pDst++ = (q31_t) acc0;
206          *pDst++ = (q31_t) acc1;
207          pA += 2;
208      }
209  
210      for (; i >= 1; i--)
211      {
212          uint32_t  count = i;
213          int64_t   acc = 0LL;
214  
215          pX = pA;
216          pY = pB;
217  
218          MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count);
219          *pDst++ = (q31_t) acc;
220          pA++;
221      }
222  
223  }
224  
225  #else
226  void arm_conv_q31(
227    const q31_t * pSrcA,
228          uint32_t srcALen,
229    const q31_t * pSrcB,
230          uint32_t srcBLen,
231          q31_t * pDst)
232  {
233  
234  #if (1)
235  //#if !defined(ARM_MATH_CM0_FAMILY)
236  
237    const q31_t *pIn1;                                   /* InputA pointer */
238    const q31_t *pIn2;                                   /* InputB pointer */
239          q31_t *pOut = pDst;                            /* Output pointer */
240    const q31_t *px;                                     /* Intermediate inputA pointer */
241    const q31_t *py;                                     /* Intermediate inputB pointer */
242    const q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
243          q63_t sum;                                     /* Accumulators */
244          uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
245          uint32_t j, k, count, blkCnt;                  /* Loop counters */
246  
247  #if defined (ARM_MATH_LOOPUNROLL)
248          q63_t acc0, acc1, acc2;                        /* Accumulators */
249          q31_t x0, x1, x2, c0;                          /* Temporary variables to hold state and coefficient values */
250  #endif
251  
252    /* The algorithm implementation is based on the lengths of the inputs. */
253    /* srcB is always made to slide across srcA. */
254    /* So srcBLen is always considered as shorter or equal to srcALen */
255    if (srcALen >= srcBLen)
256    {
257      /* Initialization of inputA pointer */
258      pIn1 = pSrcA;
259  
260      /* Initialization of inputB pointer */
261      pIn2 = pSrcB;
262    }
263    else
264    {
265      /* Initialization of inputA pointer */
266      pIn1 = pSrcB;
267  
268      /* Initialization of inputB pointer */
269      pIn2 = pSrcA;
270  
271      /* srcBLen is always considered as shorter or equal to srcALen */
272      j = srcBLen;
273      srcBLen = srcALen;
274      srcALen = j;
275    }
276  
277    /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
278    /* The function is internally
279     * divided into three stages according to the number of multiplications that has to be
280     * taken place between inputA samples and inputB samples. In the first stage of the
281     * algorithm, the multiplications increase by one for every iteration.
282     * In the second stage of the algorithm, srcBLen number of multiplications are done.
283     * In the third stage of the algorithm, the multiplications decrease by one
284     * for every iteration. */
285  
286    /* The algorithm is implemented in three stages.
287       The loop counters of each stage is initiated here. */
288    blockSize1 = srcBLen - 1U;
289    blockSize2 = srcALen - (srcBLen - 1U);
290    blockSize3 = blockSize1;
291  
292    /* --------------------------
293     * Initializations of stage1
294     * -------------------------*/
295  
296    /* sum = x[0] * y[0]
297     * sum = x[0] * y[1] + x[1] * y[0]
298     * ....
299     * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
300     */
301  
302    /* In this stage the MAC operations are increased by 1 for every iteration.
303       The count variable holds the number of MAC operations performed */
304    count = 1U;
305  
306    /* Working pointer of inputA */
307    px = pIn1;
308  
309    /* Working pointer of inputB */
310    py = pIn2;
311  
312  
313    /* ------------------------
314     * Stage1 process
315     * ----------------------*/
316  
317    /* The first stage starts here */
318    while (blockSize1 > 0U)
319    {
320      /* Accumulator is made zero for every iteration */
321      sum = 0;
322  
323  #if defined (ARM_MATH_LOOPUNROLL)
324  
325      /* Loop unrolling: Compute 4 outputs at a time */
326      k = count >> 2U;
327  
328      while (k > 0U)
329      {
330        /* x[0] * y[srcBLen - 1] */
331        sum += (q63_t) *px++ * (*py--);
332  
333        /* x[1] * y[srcBLen - 2] */
334        sum += (q63_t) *px++ * (*py--);
335  
336        /* x[2] * y[srcBLen - 3] */
337        sum += (q63_t) *px++ * (*py--);
338  
339        /* x[3] * y[srcBLen - 4] */
340        sum += (q63_t) *px++ * (*py--);
341  
342        /* Decrement loop counter */
343        k--;
344      }
345  
346      /* Loop unrolling: Compute remaining outputs */
347      k = count % 0x4U;
348  
349  #else
350  
351      /* Initialize k with number of samples */
352      k = count;
353  
354  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
355  
356      while (k > 0U)
357      {
358        /* Perform the multiply-accumulate */
359        sum += (q63_t) *px++ * *py--;
360  
361        /* Decrement loop counter */
362        k--;
363      }
364  
365      /* Store the result in the accumulator in the destination buffer. */
366      *pOut++ = (q31_t) (sum >> 31);
367  
368      /* Update the inputA and inputB pointers for next MAC calculation */
369      py = pIn2 + count;
370      px = pIn1;
371  
372      /* Increment MAC count */
373      count++;
374  
375      /* Decrement loop counter */
376      blockSize1--;
377    }
378  
379    /* --------------------------
380     * Initializations of stage2
381     * ------------------------*/
382  
383    /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
384     * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen]   * y[0]
385     * ....
386     * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
387     */
388  
389    /* Working pointer of inputA */
390    px = pIn1;
391  
392    /* Working pointer of inputB */
393    pSrc2 = pIn2 + (srcBLen - 1U);
394    py = pSrc2;
395  
396    /* count is index by which the pointer pIn1 to be incremented */
397    count = 0U;
398  
399    /* -------------------
400     * Stage2 process
401     * ------------------*/
402  
403    /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
404     * So, to loop unroll over blockSize2,
405     * srcBLen should be greater than or equal to 4 */
406    if (srcBLen >= 4U)
407    {
408  #if defined (ARM_MATH_LOOPUNROLL)
409  
410      /* Loop unroll by 3 */
411      blkCnt = blockSize2 / 3;
412  
413      while (blkCnt > 0U)
414      {
415        /* Set all accumulators to zero */
416        acc0 = 0;
417        acc1 = 0;
418        acc2 = 0;
419  
420        /* read x[0], x[1], x[2] samples */
421        x0 = *px++;
422        x1 = *px++;
423  
424        /* Apply loop unrolling and compute 3 MACs simultaneously. */
425        k = srcBLen / 3;
426  
427        /* First part of the processing with loop unrolling.  Compute 3 MACs at a time.
428         ** a second loop below computes MACs for the remaining 1 to 2 samples. */
429        do
430        {
431          /* Read y[srcBLen - 1] sample */
432          c0 = *(py);
433          /* Read x[3] sample */
434          x2 = *(px);
435  
436          /* Perform the multiply-accumulate */
437          /* acc0 +=  x[0] * y[srcBLen - 1] */
438          acc0 += ((q63_t) x0 * c0);
439          /* acc1 +=  x[1] * y[srcBLen - 1] */
440          acc1 += ((q63_t) x1 * c0);
441          /* acc2 +=  x[2] * y[srcBLen - 1] */
442          acc2 += ((q63_t) x2 * c0);
443  
444          /* Read y[srcBLen - 2] sample */
445          c0 = *(py - 1U);
446          /* Read x[4] sample */
447          x0 = *(px + 1U);
448  
449          /* Perform the multiply-accumulate */
450          /* acc0 +=  x[1] * y[srcBLen - 2] */
451          acc0 += ((q63_t) x1 * c0);
452          /* acc1 +=  x[2] * y[srcBLen - 2] */
453          acc1 += ((q63_t) x2 * c0);
454          /* acc2 +=  x[3] * y[srcBLen - 2] */
455          acc2 += ((q63_t) x0 * c0);
456  
457          /* Read y[srcBLen - 3] sample */
458          c0 = *(py - 2U);
459          /* Read x[5] sample */
460          x1 = *(px + 2U);
461  
462          /* Perform the multiply-accumulate */
463          /* acc0 +=  x[2] * y[srcBLen - 3] */
464          acc0 += ((q63_t) x2 * c0);
465          /* acc1 +=  x[3] * y[srcBLen - 2] */
466          acc1 += ((q63_t) x0 * c0);
467          /* acc2 +=  x[4] * y[srcBLen - 2] */
468          acc2 += ((q63_t) x1 * c0);
469  
470          /* update scratch pointers */
471          px += 3U;
472          py -= 3U;
473  
474        } while (--k);
475  
476        /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
477         ** No loop unrolling is used. */
478        k = srcBLen - (3 * (srcBLen / 3));
479  
480        while (k > 0U)
481        {
482          /* Read y[srcBLen - 5] sample */
483          c0 = *py--;
484          /* Read x[7] sample */
485          x2 = *px++;
486  
487          /* Perform the multiply-accumulates */
488          /* acc0 +=  x[4] * y[srcBLen - 5] */
489          acc0 += ((q63_t) x0 * c0);
490          /* acc1 +=  x[5] * y[srcBLen - 5] */
491          acc1 += ((q63_t) x1 * c0);
492          /* acc2 +=  x[6] * y[srcBLen - 5] */
493          acc2 += ((q63_t) x2 * c0);
494  
495          /* Reuse the present samples for the next MAC */
496          x0 = x1;
497          x1 = x2;
498  
499          /* Decrement loop counter */
500          k--;
501        }
502  
503        /* Store the result in the accumulator in the destination buffer. */
504        *pOut++ = (q31_t) (acc0 >> 31);
505        *pOut++ = (q31_t) (acc1 >> 31);
506        *pOut++ = (q31_t) (acc2 >> 31);
507  
508        /* Increment the pointer pIn1 index, count by 3 */
509        count += 3U;
510  
511        /* Update the inputA and inputB pointers for next MAC calculation */
512        px = pIn1 + count;
513        py = pSrc2;
514  
515        /* Decrement loop counter */
516        blkCnt--;
517      }
518  
519      /* Loop unrolling: Compute remaining outputs */
520      blkCnt = blockSize2 - 3 * (blockSize2 / 3);
521  
522  #else
523  
524      /* Initialize blkCnt with number of samples */
525      blkCnt = blockSize2;
526  
527  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
528  
529      while (blkCnt > 0U)
530      {
531        /* Accumulator is made zero for every iteration */
532        sum = 0;
533  
534  #if defined (ARM_MATH_LOOPUNROLL)
535  
536      /* Loop unrolling: Compute 4 outputs at a time */
537        k = srcBLen >> 2U;
538  
539        while (k > 0U)
540        {
541          /* Perform the multiply-accumulates */
542          sum += (q63_t) *px++ * *py--;
543          sum += (q63_t) *px++ * *py--;
544          sum += (q63_t) *px++ * *py--;
545          sum += (q63_t) *px++ * *py--;
546  
547          /* Decrement loop counter */
548          k--;
549        }
550  
551        /* Loop unrolling: Compute remaining outputs */
552        k = srcBLen % 0x4U;
553  
554  #else
555  
556        /* Initialize blkCnt with number of samples */
557        k = srcBLen;
558  
559  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
560  
561        while (k > 0U)
562        {
563          /* Perform the multiply-accumulate */
564          sum += (q63_t) *px++ * *py--;
565  
566          /* Decrement the loop counter */
567          k--;
568        }
569  
570        /* Store the result in the accumulator in the destination buffer. */
571        *pOut++ = (q31_t) (sum >> 31);
572  
573        /* Increment MAC count */
574        count++;
575  
576        /* Update the inputA and inputB pointers for next MAC calculation */
577        px = pIn1 + count;
578        py = pSrc2;
579  
580        /* Decrement loop counter */
581        blkCnt--;
582      }
583    }
584    else
585    {
586      /* If the srcBLen is not a multiple of 4,
587       * the blockSize2 loop cannot be unrolled by 4 */
588      blkCnt = blockSize2;
589  
590      while (blkCnt > 0U)
591      {
592        /* Accumulator is made zero for every iteration */
593        sum = 0;
594  
595        /* srcBLen number of MACS should be performed */
596        k = srcBLen;
597  
598        while (k > 0U)
599        {
600          /* Perform the multiply-accumulate */
601          sum += (q63_t) *px++ * *py--;
602  
603          /* Decrement the loop counter */
604          k--;
605        }
606  
607        /* Store the result in the accumulator in the destination buffer. */
608        *pOut++ = (q31_t) (sum >> 31);
609  
610        /* Increment MAC count */
611        count++;
612  
613        /* Update the inputA and inputB pointers for next MAC calculation */
614        px = pIn1 + count;
615        py = pSrc2;
616  
617        /* Decrement loop counter */
618        blkCnt--;
619      }
620    }
621  
622  
623    /* --------------------------
624     * Initializations of stage3
625     * -------------------------*/
626  
627    /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
628     * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
629     * ....
630     * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
631     * sum +=  x[srcALen-1] * y[srcBLen-1]
632     */
633  
634    /* In this stage the MAC operations are decreased by 1 for every iteration.
635       The blockSize3 variable holds the number of MAC operations performed */
636  
637    /* Working pointer of inputA */
638    pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
639    px = pSrc1;
640  
641    /* Working pointer of inputB */
642    pSrc2 = pIn2 + (srcBLen - 1U);
643    py = pSrc2;
644  
645    /* -------------------
646     * Stage3 process
647     * ------------------*/
648  
649    while (blockSize3 > 0U)
650    {
651      /* Accumulator is made zero for every iteration */
652      sum = 0;
653  
654  #if defined (ARM_MATH_LOOPUNROLL)
655  
656      /* Loop unrolling: Compute 4 outputs at a time */
657      k = blockSize3 >> 2U;
658  
659      while (k > 0U)
660      {
661        /* Perform the multiply-accumulate */
662        /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
663        sum += (q63_t) *px++ * *py--;
664  
665        /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
666        sum += (q63_t) *px++ * *py--;
667  
668        /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
669        sum += (q63_t) *px++ * *py--;
670  
671        /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
672        sum += (q63_t) *px++ * *py--;
673  
674        /* Decrement loop counter */
675        k--;
676      }
677  
678      /* Loop unrolling: Compute remaining outputs */
679      k = blockSize3 % 0x4U;
680  
681  #else
682  
683      /* Initialize blkCnt with number of samples */
684      k = blockSize3;
685  
686  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
687  
688      while (k > 0U)
689      {
690        /* Perform the multiply-accumulate */
691        /* sum +=  x[srcALen-1] * y[srcBLen-1] */
692        sum += (q63_t) *px++ * *py--;
693  
694        /* Decrement loop counter */
695        k--;
696      }
697  
698      /* Store the result in the accumulator in the destination buffer. */
699      *pOut++ = (q31_t) (sum >> 31);
700  
701      /* Update the inputA and inputB pointers for next MAC calculation */
702      px = ++pSrc1;
703      py = pSrc2;
704  
705      /* Decrement loop counter */
706      blockSize3--;
707    }
708  
709  #else
710  /* alternate version for CM0_FAMILY */
711  
712    const q31_t *pIn1 = pSrcA;                           /* InputA pointer */
713    const q31_t *pIn2 = pSrcB;                           /* InputB pointer */
714          q63_t sum;                                     /* Accumulators */
715          uint32_t i, j;                                 /* Loop counters */
716  
717    /* Loop to calculate convolution for output length number of times */
718    for (i = 0U; i < (srcALen + srcBLen - 1U); i++)
719    {
720      /* Initialize sum with zero to carry out MAC operations */
721      sum = 0;
722  
723      /* Loop to perform MAC operations according to convolution equation */
724      for (j = 0U; j <= i; j++)
725      {
726        /* Check the array limitations */
727        if (((i - j) < srcBLen) && (j < srcALen))
728        {
729          /* z[i] += x[i-j] * y[j] */
730          sum += ((q63_t) pIn1[j] * pIn2[i - j]);
731        }
732      }
733  
734      /* Store the output in the destination buffer */
735      pDst[i] = (q31_t) (sum >> 31U);
736    }
737  
738  #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
739  
740  }
741  #endif /* defined(ARM_MATH_MVEI) */
742  
743  /**
744    @} end of Conv group
745   */