/ Drivers / CMSIS / DSP / Source / FilteringFunctions / arm_conv_f32.c
arm_conv_f32.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_conv_f32.c
  4   * Description:  Convolution of floating-point sequences
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/filtering_functions.h"
 30  
 31  /**
 32    @ingroup groupFilters
 33   */
 34  
 35  /**
 36    @defgroup Conv Convolution
 37  
 38    Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.
 39    Convolution is similar to correlation and is frequently used in filtering and data analysis.
 40    The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.
 41    The library also provides fast versions of the Q15 and Q31 functions.
 42  
 43   @par            Algorithm
 44                     Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and
 45                     <code>srcBLen</code> samples respectively. Then the convolution
 46    <pre>
 47       c[n] = a[n] * b[n]
 48    </pre>
 49    @par
 50                     is defined as
 51                     \image html ConvolutionEquation.gif
 52    @par
 53                     Note that <code>c[n]</code> is of length <code>srcALen + srcBLen - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., srcALen + srcBLen - 2</code>.
 54                     <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and
 55                     <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
 56                     The output result is written to <code>pDst</code> and the calling function must allocate <code>srcALen+srcBLen-1</code> words for the result.
 57    @par
 58                     Conceptually, when two signals <code>a[n]</code> and <code>b[n]</code> are convolved,
 59                     the signal <code>b[n]</code> slides over <code>a[n]</code>.
 60                     For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
 61    @par
 62                     Note that convolution is a commutative operation:
 63    <pre>
 64       a[n] * b[n] = b[n] * a[n].
 65    </pre>
 66    @par
 67                     This means that switching the A and B arguments to the convolution functions has no effect.
 68  
 69    @par           Fixed-Point Behavior
 70                     Convolution requires summing up a large number of intermediate products.
 71                     As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
 72                     Refer to the function specific documentation below for further details of the particular algorithm used.
 73  
 74    @par           Fast Versions
 75                   Fast versions are supported for Q31 and Q15. They take fewer cycles than the standard Q31 and Q15 convolution functions, but they require
 76                   the input signals to be scaled down to avoid intermediate overflows.
 77  
 78    @par           Opt Versions
 79                   Opt versions are supported for Q15 and Q7. They use an internal scratch buffer to enable better optimisation.
 80                   These versions are optimised for cycle count and consume more memory (scratch memory) than the standard Q15 and Q7 versions.
 81   */
 82  
 83  /**
 84    @addtogroup Conv
 85    @{
 86   */
 87  
 88  /**
 89    @brief         Convolution of floating-point sequences.
 90    @param[in]     pSrcA      points to the first input sequence
 91    @param[in]     srcALen    length of the first input sequence
 92    @param[in]     pSrcB      points to the second input sequence
 93    @param[in]     srcBLen    length of the second input sequence
 94    @param[out]    pDst       points to the location where the output result is written.  Length srcALen+srcBLen-1.
 95    @return        none
 96   */
 97  #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 98  
 99  #include "arm_helium_utils.h"
100  #include "arm_vec_filtering.h"
101  
102  
103  void arm_conv_f32(
104    const float32_t * pSrcA,
105          uint32_t srcALen,
106    const float32_t * pSrcB,
107          uint32_t srcBLen,
108          float32_t * pDst)
109  {
110      const float32_t *pIn1 = pSrcA;    /* inputA pointer               */
111      const float32_t *pIn2 = pSrcB;    /* inputB pointer               */
112      /*
113       * Loop to perform MAC operations according to correlation equation
114       */
115      const float32_t *pX;
116      const float32_t *pY;
117      const float32_t *pA;
118      const float32_t *pB;
119      int32_t   i = 0U, j = 0;    /* loop counters */
120      int32_t   block1, block2, block3;
121      uint32_t  vddupStartIdx = 3;
122      uint32x4_t decrIdxVec = vddupq_u32(vddupStartIdx, 1);
123  
124      if (srcALen < srcBLen)
125      {
126          /*
127           * Initialization to inputB pointer
128           */
129          pIn1 = pSrcB;
130          /*
131           * Initialization to the end of inputA pointer
132           */
133          pIn2 = pSrcA;
134          /*
135           * Swapping the lengths
136           */
137          j = srcALen;
138          srcALen = srcBLen;
139          srcBLen = j;
140      }
141  
142      block1 = srcBLen - 1;
143      block2 = srcALen - srcBLen + 1;
144      block3 = srcBLen - 1;
145  
146      pA = pIn1;
147      pB = pIn2 - 3;
148  
149      for (i = 0; i <= block1 - 2; i += 2)
150      {
151          uint32_t  count = i + 1;
152          float32_t acc0;
153          float32_t acc1;
154  
155          pX = pA;
156          pY = pB;
157          /*
158           * compute 2 accumulators per loop
159           * size is incrementing for successive accumulators
160           * Y pointer is incrementing for successive accumulators
161           */
162          MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count);
163  
164          *pDst++ = acc0;
165          *pDst++ = acc1;
166          pB += 2;
167      }
168  
169      for (; i < block1; i++)
170      {
171          uint32_t  count = i + 1;
172          float32_t acc;
173  
174          pX = pA;
175          pY = pB;
176          MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count);
177  
178          *pDst++ = acc;
179          pB++;
180      }
181  
182      for (i = 0; i <= block2 - 2; i += 2)
183      {
184          uint32_t  count = srcBLen;
185          float32_t acc0 = 0;
186          float32_t acc1 = 0;
187  
188          pX = pA;
189          pY = pB;
190          /*
191           * compute 2 accumulators per loop
192           * size is fixed for all accumulators
193           * X pointer is incrementing for successive accumulators
194           */
195          MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count);
196          *pDst++ = acc0;
197          *pDst++ = acc1;
198          pA += 2;
199      }
200      if (block2 & 1)
201      {
202          uint32_t  count = srcBLen;
203          float32_t acc = 0;
204  
205          pX = pA;
206          pY = pB;
207          MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count);
208  
209          *pDst++ = acc;
210          pA++;
211      }
212  
213      for (i = block3; i >= 2; i -= 2)
214      {
215          int32_t   count = i;
216          float32_t acc0;
217          float32_t acc1;
218  
219          pX = pA;
220          pY = pB;
221          /*
222           * compute 2 accumulators per loop
223           * size is decrementing for successive accumulators
224           * X pointer is incrementing for successive accumulators
225           */
226          MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count);
227  
228          *pDst++ = acc0;
229          *pDst++ = acc1;
230          pA += 2;
231      }
232      for (; i >= 1; i--)
233      {
234          int32_t   count = i;
235          float32_t acc;
236  
237          pX = pA;
238          pY = pB;
239          MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count);
240  
241          *pDst++ = acc;
242          pA++;
243      }
244  }
245  #else
/*
 * Scalar / loop-unrolled / Neon implementation of the floating-point
 * convolution (see the Doxygen block above for the parameter contract).
 *
 * With ARM_MATH_DSP defined the work is split into three stages (growing,
 * full and shrinking overlap); otherwise a straightforward reference double
 * loop is used.
 *
 * If either input is empty there is nothing to multiply: the function writes
 * (srcALen + srcBLen - 1) zeros when exactly one input is empty, nothing when
 * both are empty, and returns.  Without this guard the staged code computes
 * srcBLen - 1U == 0xFFFFFFFF as a loop count, and the reference loop bound
 * srcALen + srcBLen - 1U wraps to UINT32_MAX when both lengths are zero.
 */
void arm_conv_f32(
  const float32_t * pSrcA,
        uint32_t srcALen,
  const float32_t * pSrcB,
        uint32_t srcBLen,
        float32_t * pDst)
{

#if defined(ARM_MATH_DSP)

  const float32_t *pIn1;                               /* Longer input sequence, called x below */
  const float32_t *pIn2;                               /* Shorter input sequence, called y below */
        float32_t *pOut = pDst;                        /* Output pointer */
  const float32_t *px;                                 /* Working pointer into x */
  const float32_t *py;                                 /* Working pointer into y */
  const float32_t *pSrc1, *pSrc2;                      /* Per-stage start positions in x and y */
        float32_t sum;                                 /* Accumulator */
        uint32_t blockSize1, blockSize2, blockSize3;   /* Number of outputs of each stage */
        uint32_t j, k, count, blkCnt;                  /* Loop counters */


#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
        float32_t acc0, acc1, acc2, acc3, c0;              /* Accumulators */
#if !defined(ARM_MATH_NEON)
        float32_t x0, x1, x2, x3;                  /* Temporary variables to hold state and coefficient values */
#endif
#endif

  /* Empty-input guard: see the function header comment. */
  if ((srcALen == 0U) || (srcBLen == 0U))
  {
    uint32_t outLen = srcALen + srcBLen;   /* at least one term is zero */
    uint32_t n;

    /* writes outLen - 1 zeros; nothing at all when both inputs are empty */
    for (n = 1U; n < outLen; n++)
    {
      *pOut++ = 0.0f;
    }
    return;
  }

  /* The algorithm implementation is based on the lengths of the inputs. */
  /* srcB is always made to slide across srcA. */
  /* So srcBLen is always considered as shorter or equal to srcALen */
  if (srcALen >= srcBLen)
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcA;

    /* Initialization of inputB pointer */
    pIn2 = pSrcB;
  }
  else
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcB;

    /* Initialization of inputB pointer */
    pIn2 = pSrcA;

    /* Swap the lengths so that srcBLen is the shorter one */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;
  }

  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
  /* The function is internally
   * divided into three stages according to the number of multiplications that has to be
   * taken place between inputA samples and inputB samples. In the first stage of the
   * algorithm, the multiplications increase by one for every iteration.
   * In the second stage of the algorithm, srcBLen number of multiplications are done.
   * In the third stage of the algorithm, the multiplications decrease by one
   * for every iteration. */

  /* The algorithm is implemented in three stages.
     The loop counters of each stage is initiated here. */
  blockSize1 = srcBLen - 1U;
  blockSize2 = srcALen - (srcBLen - 1U);
  blockSize3 = blockSize1;

  /* --------------------------
   * Initializations of stage1
   * -------------------------*/

  /* sum = x[0] * y[0]
   * sum = x[0] * y[1] + x[1] * y[0]
   * ....
   * sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 3] +...+ x[srcBLen - 2] * y[0]
   */

  /* In this stage the MAC operations are increased by 1 for every iteration.
     The count variable holds the number of MAC operations performed */
  count = 1U;

  /* Working pointer of inputA */
  px = pIn1;

  /* Working pointer of inputB */
  py = pIn2;


  /* ------------------------
   * Stage1 process
   * ----------------------*/
#if defined(ARM_MATH_NEON)
    float32x4_t vec1;
    float32x4_t vec2;
    float32x4_t res = vdupq_n_f32(0) ;
    float32x2_t accum = vdup_n_f32(0);
#endif /* #if defined(ARM_MATH_NEON) */

  /* The first stage starts here */
  while (blockSize1 > 0U)
  {
    /* Accumulator is made zero for every iteration */
    sum = 0.0f;

#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
    /* Loop unrolling: Compute 4 MACs at a time */
    k = count >> 2U;

#if defined(ARM_MATH_NEON)
    res = vdupq_n_f32(0) ;
    accum = vdup_n_f32(0);

    /* Compute 4 MACs simultaneously. */
    k = count >> 2U;

    /* First part of the processing.  Compute 4 MACs at a time.
     ** a second loop below computes MACs for the remaining 1 to 3 samples. */

    while (k > 0U)
    {
      /* Load 4 x samples forwards and 4 y samples ending at py, then
       * reverse the y lanes so that lane i pairs x[i] with y[-i] */
      vec1 = vld1q_f32(px);
      vec2 = vld1q_f32(py-3);
      vec2 = vrev64q_f32(vec2);
      vec2 = vcombine_f32(vget_high_f32(vec2), vget_low_f32(vec2));

      res = vmlaq_f32(res,vec1, vec2);

      /* Increment pointers */
      px += 4;
      py -= 4;

      /* Decrement the loop counter */
      k--;
    }

    /* Horizontal add of the 4 partial sums */
    accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
    sum += accum[0] + accum[1];

    /* If the count is not a multiple of 4, compute any remaining MACs here.
     ** No loop unrolling is used. */
    k = count & 3;
#else
    while (k > 0U)
    {
      /* x[0] * y[count - 1] */
      sum += *px++ * *py--;

      /* x[1] * y[count - 2] */
      sum += *px++ * *py--;

      /* x[2] * y[count - 3] */
      sum += *px++ * *py--;

      /* x[3] * y[count - 4] */
      sum += *px++ * *py--;

      /* Decrement loop counter */
      k--;
    }

    /* Loop unrolling: Compute remaining MACs */
    k = count % 0x4U;

#endif /* #if defined(ARM_MATH_NEON) */

#else /* defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
    /* Initialize k with number of samples */
    k = count;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */

    while (k > 0U)
    {
      /* Perform the multiply-accumulate */
      sum += *px++ * *py--;

      /* Decrement loop counter */
      k--;
    }

    /* Store the result in the accumulator in the destination buffer. */
    *pOut++ = sum;

    /* Update the inputA and inputB pointers for next MAC calculation:
     * y restarts one sample further along, x restarts at its beginning */
    py = pIn2 + count;
    px = pIn1;

    /* Increment MAC count */
    count++;

    /* Decrement loop counter */
    blockSize1--;
  }

  /* --------------------------
   * Initializations of stage2
   * ------------------------*/

  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
   * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen]   * y[0]
   * ....
   * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
   */

  /* Working pointer of inputA */
  px = pIn1;

  /* Working pointer of inputB */
  pSrc2 = pIn2 + (srcBLen - 1U);
  py = pSrc2;

  /* count is index by which the pointer pIn1 to be incremented */
  count = 0U;

  /* -------------------
   * Stage2 process
   * ------------------*/

  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
   * So, to loop unroll over blockSize2,
   * srcBLen should be greater than or equal to 4 */
  if (srcBLen >= 4U)
  {

#if defined(ARM_MATH_NEON)
      float32x4_t c;
      float32x4_t x1v;
      float32x4_t x2v;
      float32x4_t x;
      float32x4_t res = vdupq_n_f32(0) ;
#endif /* #if defined(ARM_MATH_NEON) */

#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)

    /* Loop unrolling: Compute 4 outputs at a time */
    blkCnt = blockSize2 >> 2U;

    while (blkCnt > 0U)
    {
      /* Set all accumulators to zero */
      acc0 = 0.0f;
      acc1 = 0.0f;
      acc2 = 0.0f;
      acc3 = 0.0f;

       /* Apply loop unrolling and compute 4 MACs simultaneously. */
      k = srcBLen >> 2U;

#if defined(ARM_MATH_NEON)
      res = vdupq_n_f32(0) ;

      /* x1v / x2v form a sliding 8-sample window over x */
      x1v = vld1q_f32(px);
      x2v = vld1q_f32(px+4);

      do
      {
        /* c[3]..c[0] hold the 4 y samples ending at py, newest first */
        c = vld1q_f32(py-3);

        px += 4;
        x = x1v;
        res = vmlaq_n_f32(res,x,c[3]);

        x = vextq_f32(x1v,x2v,1);

        res = vmlaq_n_f32(res,x,c[2]);

        x = vextq_f32(x1v,x2v,2);

        res = vmlaq_n_f32(res,x,c[1]);

        x = vextq_f32(x1v,x2v,3);

        res = vmlaq_n_f32(res,x,c[0]);

        py -= 4;

        x1v = x2v ;
        /* NOTE(review): this look-ahead load is issued even on the final
         * iteration, so it may read past the end of the input A buffer --
         * confirm that callers' buffers tolerate the over-read. */
        x2v = vld1q_f32(px+4);

      } while (--k);


      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
       ** No loop unrolling is used. */
      k = srcBLen & 0x3;

      x1v = vld1q_f32(px);
      px += 4;

      while (k > 0U)
      {
        /* Read the next (lower-index) y sample */
        c0 = *(py--);

        res = vmlaq_n_f32(res,x1v,c0);

        /* Reuse the present samples for the next MAC */
        x1v[0] = x1v[1];
        x1v[1] = x1v[2];
        x1v[2] = x1v[3];

        x1v[3] = *(px++);

        /* Decrement the loop counter */
        k--;
      }

      acc0 = res[0];
      acc1 = res[1];
      acc2 = res[2];
      acc3 = res[3];

#else
      /* read x[0], x[1], x[2] samples */
      x0 = *px++;
      x1 = *px++;
      x2 = *px++;

      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
      do
      {
        /* Read y[srcBLen - 1] sample */
        c0 = *py--;
        /* Read x[3] sample */
        x3 = *(px);

        /* Perform the multiply-accumulate */
        /* acc0 +=  x[0] * y[srcBLen - 1] */
        acc0 += x0 * c0;
        /* acc1 +=  x[1] * y[srcBLen - 1] */
        acc1 += x1 * c0;
        /* acc2 +=  x[2] * y[srcBLen - 1] */
        acc2 += x2 * c0;
        /* acc3 +=  x[3] * y[srcBLen - 1] */
        acc3 += x3 * c0;

        /* Read y[srcBLen - 2] sample */
        c0 = *py--;
        /* Read x[4] sample */
        x0 = *(px + 1U);

        /* Perform the multiply-accumulate */
        /* acc0 +=  x[1] * y[srcBLen - 2] */
        acc0 += x1 * c0;
        /* acc1 +=  x[2] * y[srcBLen - 2] */
        acc1 += x2 * c0;
        /* acc2 +=  x[3] * y[srcBLen - 2] */
        acc2 += x3 * c0;
        /* acc3 +=  x[4] * y[srcBLen - 2] */
        acc3 += x0 * c0;

        /* Read y[srcBLen - 3] sample */
        c0 = *py--;
        /* Read x[5] sample */
        x1 = *(px + 2U);

        /* Perform the multiply-accumulate */
        /* acc0 +=  x[2] * y[srcBLen - 3] */
        acc0 += x2 * c0;
        /* acc1 +=  x[3] * y[srcBLen - 3] */
        acc1 += x3 * c0;
        /* acc2 +=  x[4] * y[srcBLen - 3] */
        acc2 += x0 * c0;
        /* acc3 +=  x[5] * y[srcBLen - 3] */
        acc3 += x1 * c0;

        /* Read y[srcBLen - 4] sample */
        c0 = *py--;
        /* Read x[6] sample */
        x2 = *(px + 3U);
        px += 4U;

        /* Perform the multiply-accumulate */
        /* acc0 +=  x[3] * y[srcBLen - 4] */
        acc0 += x3 * c0;
        /* acc1 +=  x[4] * y[srcBLen - 4] */
        acc1 += x0 * c0;
        /* acc2 +=  x[5] * y[srcBLen - 4] */
        acc2 += x1 * c0;
        /* acc3 +=  x[6] * y[srcBLen - 4] */
        acc3 += x2 * c0;

      } while (--k);

      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
       ** No loop unrolling is used. */
      k = srcBLen % 0x4U;

      while (k > 0U)
      {
        /* Read y[srcBLen - 5] sample */
        c0 = *py--;
        /* Read x[7] sample */
        x3 = *px++;

        /* Perform the multiply-accumulate */
        /* acc0 +=  x[4] * y[srcBLen - 5] */
        acc0 += x0 * c0;
        /* acc1 +=  x[5] * y[srcBLen - 5] */
        acc1 += x1 * c0;
        /* acc2 +=  x[6] * y[srcBLen - 5] */
        acc2 += x2 * c0;
        /* acc3 +=  x[7] * y[srcBLen - 5] */
        acc3 += x3 * c0;

        /* Reuse the present samples for the next MAC */
        x0 = x1;
        x1 = x2;
        x2 = x3;

        /* Decrement the loop counter */
        k--;
      }
#endif /* #if defined(ARM_MATH_NEON) */

      /* Store the result in the accumulator in the destination buffer. */
      *pOut++ = acc0;
      *pOut++ = acc1;
      *pOut++ = acc2;
      *pOut++ = acc3;

      /* Increment the pointer pIn1 index, count by 4 */
      count += 4U;

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pSrc2;

      /* Decrement the loop counter */
      blkCnt--;
    }

    /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
     ** No loop unrolling is used. */
    blkCnt = blockSize2 % 0x4U;

#else

    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize2;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined (ARM_MATH_NEON)*/

    while (blkCnt > 0U)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0.0f;

#if defined(ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL)
      /* Loop unrolling: Compute 4 MACs at a time */
      k = srcBLen >> 2U;

#if defined (ARM_MATH_NEON)
      float32x4_t res = vdupq_n_f32(0) ;
      float32x4_t x = vdupq_n_f32(0) ;
      float32x4_t y = vdupq_n_f32(0) ;
      float32x2_t accum = vdup_n_f32(0) ;

      /* First part of the processing.  Compute 4 MACs at a time.
       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
      while (k > 0U)
      {
        x = vld1q_f32(px);
        y = vld1q_f32(py-3);

        /* Reverse the 4 y lanes so that lane i pairs x[i] with y[-i] */
        y = vrev64q_f32(y);
        y = vcombine_f32(vget_high_f32(y), vget_low_f32(y));

        res = vmlaq_f32(res,x,y);

        px += 4 ;
        py -= 4 ;

        /* Decrement the loop counter */
        k--;
      }

      /* Horizontal add of the 4 partial sums */
      accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
      sum += accum[0] + accum[1];

      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
       ** No loop unrolling is used. */
      k = srcBLen & 0x3U;

#else
      while (k > 0U)
      {
        /* Perform the multiply-accumulate */
        sum += *px++ * *py--;
        sum += *px++ * *py--;
        sum += *px++ * *py--;
        sum += *px++ * *py--;

        /* Decrement loop counter */
        k--;
      }

      /* Loop unrolling: Compute remaining MACs */
      k = srcBLen % 0x4U;

#endif /* if defined (ARM_MATH_NEON) */
#else
      /* Initialize k with number of samples */
      k = srcBLen;

#endif /* #if defined(ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL) */

      while (k > 0U)
      {
        /* Perform the multiply-accumulate */
        sum += *px++ * *py--;

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the accumulator in the destination buffer. */
      *pOut++ = sum;

      /* Increment the MAC count */
      count++;

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pSrc2;

      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  else
  {
    /* srcBLen is smaller than 4, so the blockSize2 loop is not unrolled:
     * each output is computed with a plain srcBLen-tap MAC loop */
    blkCnt = blockSize2;

    while (blkCnt > 0U)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0.0f;

      /* srcBLen number of MACS should be performed */
      k = srcBLen;

      while (k > 0U)
      {
        /* Perform the multiply-accumulate */
        sum += *px++ * *py--;

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the accumulator in the destination buffer. */
      *pOut++ = sum;

      /* Increment the MAC count */
      count++;

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pSrc2;

      /* Decrement the loop counter */
      blkCnt--;
    }
  }


  /* --------------------------
   * Initializations of stage3
   * -------------------------*/

  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
   * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
   * ....
   * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
   * sum +=  x[srcALen-1] * y[srcBLen-1]
   */

  /* In this stage the MAC operations are decreased by 1 for every iteration.
     The blockSize3 variable holds the number of MAC operations performed */

  /* Working pointer of inputA */
  pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
  px = pSrc1;

  /* Working pointer of inputB */
  pSrc2 = pIn2 + (srcBLen - 1U);
  py = pSrc2;

  /* -------------------
   * Stage3 process
   * ------------------*/
  while (blockSize3 > 0U)
  {
    /* Accumulator is made zero for every iteration */
    sum = 0.0f;

#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
    /* Loop unrolling: Compute 4 MACs at a time */
    k = blockSize3 >> 2U;

#if defined(ARM_MATH_NEON)
    float32x4_t res = vdupq_n_f32(0) ;
    float32x4_t x = vdupq_n_f32(0) ;
    float32x4_t y = vdupq_n_f32(0) ;
    float32x2_t accum = vdup_n_f32(0) ;

    while (k > 0U)
    {
      x = vld1q_f32(px);
      y = vld1q_f32(py-3);

      /* Reverse the 4 y lanes so that lane i pairs x[i] with y[-i] */
      y = vrev64q_f32(y);
      y = vcombine_f32(vget_high_f32(y), vget_low_f32(y));

      res = vmlaq_f32(res,x,y);

      px += 4 ;
      py -= 4 ;

      /* Decrement the loop counter */
      k--;
    }

    /* Horizontal add of the 4 partial sums */
    accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
    sum += accum[0] + accum[1];

#else
    while (k > 0U)
    {
      /* Perform the multiply-accumulate */
      /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
      sum += *px++ * *py--;

      /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
      sum += *px++ * *py--;

      /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
      sum += *px++ * *py--;

      /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
      sum += *px++ * *py--;

      /* Decrement loop counter */
      k--;
    }
#endif /* #if defined (ARM_MATH_NEON) */

    /* Loop unrolling: Compute remaining MACs */
    k = blockSize3 % 0x4U;
#else

    /* Initialize k with number of samples */
    k = blockSize3;

#endif /* #if defined (ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL)*/

    while (k > 0U)
    {
      /* Perform the multiply-accumulate */
      /* sum +=  x[srcALen-1] * y[srcBLen-1] */
      sum += *px++ * *py--;

      /* Decrement loop counter */
      k--;
    }

    /* Store the result in the accumulator in the destination buffer. */
    *pOut++ = sum;

    /* Update the inputA and inputB pointers for next MAC calculation:
     * x starts one sample later, y restarts at its last sample */
    px = ++pSrc1;
    py = pSrc2;

    /* Decrement the loop counter */
    blockSize3--;
  }

#else
/* alternate version for CM0_FAMILY */

  const float32_t *pIn1 = pSrcA;                       /* InputA pointer */
  const float32_t *pIn2 = pSrcB;                       /* InputB pointer */
        float32_t sum;                                 /* Accumulator */
        uint32_t i, j;                                 /* Loop counters */

  /* Empty-input guard: see the function header comment. */
  if ((srcALen == 0U) || (srcBLen == 0U))
  {
    uint32_t outLen = srcALen + srcBLen;   /* at least one term is zero */

    /* writes outLen - 1 zeros; nothing at all when both inputs are empty */
    for (i = 1U; i < outLen; i++)
    {
      pDst[i - 1U] = 0.0f;
    }
    return;
  }

  /* Loop to calculate convolution for output length number of times */
  for (i = 0U; i < (srcALen + srcBLen - 1U); i++)
  {
    /* Initialize sum with zero to carry out MAC operations */
    sum = 0.0f;

    /* Loop to perform MAC operations according to convolution equation */
    for (j = 0U; j <= i; j++)
    {
      /* Only index pairs that fall inside both sequences contribute */
      if (((i - j) < srcBLen) && (j < srcALen))
      {
        /* z[i] += x[j] * y[i-j] */
        sum += ( pIn1[j] * pIn2[i - j]);
      }
    }

    /* Store the output in the destination buffer */
    pDst[i] = sum;
  }

#endif /* #if defined(ARM_MATH_DSP) */

}
960  #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
961  
962  /**
963    @} end of Conv group
964   */