/* arm_conv_f32.c */
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_conv_f32.c 4 * Description: Convolution of floating-point sequences 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/filtering_functions.h" 30 31 /** 32 @ingroup groupFilters 33 */ 34 35 /** 36 @defgroup Conv Convolution 37 38 Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector. 39 Convolution is similar to correlation and is frequently used in filtering and data analysis. 40 The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types. 41 The library also provides fast versions of the Q15 and Q31 functions. 42 43 @par Algorithm 44 Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and 45 <code>srcBLen</code> samples respectively. 
Then the convolution 46 <pre> 47 c[n] = a[n] * b[n] 48 </pre> 49 @par 50 is defined as 51 \image html ConvolutionEquation.gif 52 @par 53 Note that <code>c[n]</code> is of length <code>srcALen + srcBLen - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., srcALen + srcBLen - 2</code>. 54 <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and 55 <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>. 56 The output result is written to <code>pDst</code> and the calling function must allocate <code>srcALen+srcBLen-1</code> words for the result. 57 @par 58 Conceptually, when two signals <code>a[n]</code> and <code>b[n]</code> are convolved, 59 the signal <code>b[n]</code> slides over <code>a[n]</code>. 60 For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together. 61 @par 62 Note that convolution is a commutative operation: 63 <pre> 64 a[n] * b[n] = b[n] * a[n]. 65 </pre> 66 @par 67 This means that switching the A and B arguments to the convolution functions has no effect. 68 69 @par Fixed-Point Behavior 70 Convolution requires summing up a large number of intermediate products. 71 As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation. 72 Refer to the function specific documentation below for further details of the particular algorithm used. 73 74 @par Fast Versions 75 Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of conv and the design requires 76 the input signals should be scaled down to avoid intermediate overflows. 77 78 @par Opt Versions 79 Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation. 80 These versions are optimised in cycles and consumes more memory (Scratch memory) compared to Q15 and Q7 versions 81 */ 82 83 /** 84 @addtogroup Conv 85 @{ 86 */ 87 88 /** 89 @brief Convolution of floating-point sequences. 
90 @param[in] pSrcA points to the first input sequence 91 @param[in] srcALen length of the first input sequence 92 @param[in] pSrcB points to the second input sequence 93 @param[in] srcBLen length of the second input sequence 94 @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 95 @return none 96 */ 97 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) 98 99 #include "arm_helium_utils.h" 100 #include "arm_vec_filtering.h" 101 102 103 void arm_conv_f32( 104 const float32_t * pSrcA, 105 uint32_t srcALen, 106 const float32_t * pSrcB, 107 uint32_t srcBLen, 108 float32_t * pDst) 109 { 110 const float32_t *pIn1 = pSrcA; /* inputA pointer */ 111 const float32_t *pIn2 = pSrcB; /* inputB pointer */ 112 /* 113 * Loop to perform MAC operations according to correlation equation 114 */ 115 const float32_t *pX; 116 const float32_t *pY; 117 const float32_t *pA; 118 const float32_t *pB; 119 int32_t i = 0U, j = 0; /* loop counters */ 120 int32_t block1, block2, block3; 121 uint32_t vddupStartIdx = 3; 122 uint32x4_t decrIdxVec = vddupq_u32(vddupStartIdx, 1); 123 124 if (srcALen < srcBLen) 125 { 126 /* 127 * Initialization to inputB pointer 128 */ 129 pIn1 = pSrcB; 130 /* 131 * Initialization to the end of inputA pointer 132 */ 133 pIn2 = pSrcA; 134 /* 135 * Swapping the lengths 136 */ 137 j = srcALen; 138 srcALen = srcBLen; 139 srcBLen = j; 140 } 141 142 block1 = srcBLen - 1; 143 block2 = srcALen - srcBLen + 1; 144 block3 = srcBLen - 1; 145 146 pA = pIn1; 147 pB = pIn2 - 3; 148 149 for (i = 0; i <= block1 - 2; i += 2) 150 { 151 uint32_t count = i + 1; 152 float32_t acc0; 153 float32_t acc1; 154 155 pX = pA; 156 pY = pB; 157 /* 158 * compute 2 accumulators per loop 159 * size is incrementing for successive accumulators 160 * Y pointer is incrementing for successive accumulators 161 */ 162 MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count); 163 164 *pDst++ = acc0; 165 *pDst++ = acc1; 166 pB += 2; 167 } 168 
169 for (; i < block1; i++) 170 { 171 uint32_t count = i + 1; 172 float32_t acc; 173 174 pX = pA; 175 pY = pB; 176 MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count); 177 178 *pDst++ = acc; 179 pB++; 180 } 181 182 for (i = 0; i <= block2 - 2; i += 2) 183 { 184 uint32_t count = srcBLen; 185 float32_t acc0 = 0; 186 float32_t acc1 = 0; 187 188 pX = pA; 189 pY = pB; 190 /* 191 * compute 2 accumulators per loop 192 * size is fixed for all accumulators 193 * X pointer is incrementing for successive accumulators 194 */ 195 MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count); 196 *pDst++ = acc0; 197 *pDst++ = acc1; 198 pA += 2; 199 } 200 if (block2 & 1) 201 { 202 uint32_t count = srcBLen; 203 float32_t acc = 0; 204 205 pX = pA; 206 pY = pB; 207 MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count); 208 209 *pDst++ = acc; 210 pA++; 211 } 212 213 for (i = block3; i >= 2; i -= 2) 214 { 215 int32_t count = i; 216 float32_t acc0; 217 float32_t acc1; 218 219 pX = pA; 220 pY = pB; 221 /* 222 * compute 2 accumulators per loop 223 * size is decrementing for successive accumulators 224 * X pointer is incrementing for successive accumulators 225 */ 226 MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count); 227 228 *pDst++ = acc0; 229 *pDst++ = acc1; 230 pA += 2; 231 } 232 for (; i >= 1; i--) 233 { 234 int32_t count = i; 235 float32_t acc; 236 237 pX = pA; 238 pY = pB; 239 MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count); 240 241 *pDst++ = acc; 242 pA++; 243 } 244 } 245 #else 246 void arm_conv_f32( 247 const float32_t * pSrcA, 248 uint32_t srcALen, 249 const float32_t * pSrcB, 250 uint32_t srcBLen, 251 float32_t * pDst) 252 { 253 254 #if defined(ARM_MATH_DSP) 255 256 const float32_t *pIn1; /* InputA pointer */ 257 const float32_t *pIn2; /* InputB pointer */ 258 float32_t *pOut = pDst; /* Output pointer */ 259 const float32_t *px; /* Intermediate inputA pointer */ 260 const float32_t *py; /* Intermediate inputB pointer */ 261 const float32_t *pSrc1, *pSrc2; /* Intermediate 
pointers */ 262 float32_t sum; /* Accumulators */ 263 uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */ 264 uint32_t j, k, count, blkCnt; /* Loop counters */ 265 266 267 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) 268 float32_t acc0, acc1, acc2, acc3, c0; /* Accumulators */ 269 #if !defined(ARM_MATH_NEON) 270 float32_t x0, x1, x2, x3; /* Temporary variables to hold state and coefficient values */ 271 #endif 272 #endif 273 274 /* The algorithm implementation is based on the lengths of the inputs. */ 275 /* srcB is always made to slide across srcA. */ 276 /* So srcBLen is always considered as shorter or equal to srcALen */ 277 if (srcALen >= srcBLen) 278 { 279 /* Initialization of inputA pointer */ 280 pIn1 = pSrcA; 281 282 /* Initialization of inputB pointer */ 283 pIn2 = pSrcB; 284 } 285 else 286 { 287 /* Initialization of inputA pointer */ 288 pIn1 = pSrcB; 289 290 /* Initialization of inputB pointer */ 291 pIn2 = pSrcA; 292 293 /* srcBLen is always considered as shorter or equal to srcALen */ 294 j = srcBLen; 295 srcBLen = srcALen; 296 srcALen = j; 297 } 298 299 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 300 /* The function is internally 301 * divided into three stages according to the number of multiplications that has to be 302 * taken place between inputA samples and inputB samples. In the first stage of the 303 * algorithm, the multiplications increase by one for every iteration. 304 * In the second stage of the algorithm, srcBLen number of multiplications are done. 305 * In the third stage of the algorithm, the multiplications decrease by one 306 * for every iteration. */ 307 308 /* The algorithm is implemented in three stages. 309 The loop counters of each stage is initiated here. 
*/ 310 blockSize1 = srcBLen - 1U; 311 blockSize2 = srcALen - (srcBLen - 1U); 312 blockSize3 = blockSize1; 313 314 /* -------------------------- 315 * Initializations of stage1 316 * -------------------------*/ 317 318 /* sum = x[0] * y[0] 319 * sum = x[0] * y[1] + x[1] * y[0] 320 * .... 321 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 322 */ 323 324 /* In this stage the MAC operations are increased by 1 for every iteration. 325 The count variable holds the number of MAC operations performed */ 326 count = 1U; 327 328 /* Working pointer of inputA */ 329 px = pIn1; 330 331 /* Working pointer of inputB */ 332 py = pIn2; 333 334 335 /* ------------------------ 336 * Stage1 process 337 * ----------------------*/ 338 #if defined(ARM_MATH_NEON) 339 float32x4_t vec1; 340 float32x4_t vec2; 341 float32x4_t res = vdupq_n_f32(0) ; 342 float32x2_t accum = vdup_n_f32(0); 343 #endif /* #if defined(ARM_MATH_NEON) */ 344 345 /* The first stage starts here */ 346 while (blockSize1 > 0U) 347 { 348 /* Accumulator is made zero for every iteration */ 349 sum = 0.0f; 350 351 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) 352 /* Loop unrolling: Compute 4 outputs at a time */ 353 k = count >> 2U; 354 355 #if defined(ARM_MATH_NEON) 356 res = vdupq_n_f32(0) ; 357 accum = vdup_n_f32(0); 358 359 /* Compute 4 MACs simultaneously. */ 360 k = count >> 2U; 361 362 /* First part of the processing. Compute 4 MACs at a time. 363 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 364 365 while (k > 0U) 366 { 367 vec1 = vld1q_f32(px); 368 vec2 = vld1q_f32(py-3); 369 vec2 = vrev64q_f32(vec2); 370 vec2 = vcombine_f32(vget_high_f32(vec2), vget_low_f32(vec2)); 371 372 res = vmlaq_f32(res,vec1, vec2); 373 374 /* Increment pointers */ 375 px += 4; 376 py -= 4; 377 378 /* Decrement the loop counter */ 379 k--; 380 } 381 382 accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res)); 383 sum += accum[0] + accum[1]; 384 385 /* If the count is not a multiple of 4, compute any remaining MACs here. 386 ** No loop unrolling is used. */ 387 k = count & 3; 388 #else 389 while (k > 0U) 390 { 391 /* x[0] * y[srcBLen - 1] */ 392 sum += *px++ * *py--; 393 394 /* x[1] * y[srcBLen - 2] */ 395 sum += *px++ * *py--; 396 397 /* x[2] * y[srcBLen - 3] */ 398 sum += *px++ * *py--; 399 400 /* x[3] * y[srcBLen - 4] */ 401 sum += *px++ * *py--; 402 403 /* Decrement loop counter */ 404 k--; 405 } 406 407 /* Loop unrolling: Compute remaining outputs */ 408 k = count % 0x4U; 409 410 #endif /* #if defined(ARM_MATH_NEON) */ 411 412 #else /* defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */ 413 /* Initialize k with number of samples */ 414 k = count; 415 416 #endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */ 417 418 while (k > 0U) 419 { 420 /* Perform the multiply-accumulate */ 421 sum += *px++ * *py--; 422 423 /* Decrement loop counter */ 424 k--; 425 } 426 427 /* Store the result in the accumulator in the destination buffer. */ 428 *pOut++ = sum; 429 430 /* Update the inputA and inputB pointers for next MAC calculation */ 431 py = pIn2 + count; 432 px = pIn1; 433 434 /* Increment MAC count */ 435 count++; 436 437 /* Decrement loop counter */ 438 blockSize1--; 439 } 440 441 /* -------------------------- 442 * Initializations of stage2 443 * ------------------------*/ 444 445 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 446 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 447 * .... 
448 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 449 */ 450 451 /* Working pointer of inputA */ 452 px = pIn1; 453 454 /* Working pointer of inputB */ 455 pSrc2 = pIn2 + (srcBLen - 1U); 456 py = pSrc2; 457 458 /* count is index by which the pointer pIn1 to be incremented */ 459 count = 0U; 460 461 /* ------------------- 462 * Stage2 process 463 * ------------------*/ 464 465 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 466 * So, to loop unroll over blockSize2, 467 * srcBLen should be greater than or equal to 4 */ 468 if (srcBLen >= 4U) 469 { 470 471 #if defined(ARM_MATH_NEON) 472 float32x4_t c; 473 float32x4_t x1v; 474 float32x4_t x2v; 475 float32x4_t x; 476 float32x4_t res = vdupq_n_f32(0) ; 477 #endif /* #if defined(ARM_MATH_NEON) */ 478 479 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) 480 481 /* Loop unrolling: Compute 4 outputs at a time */ 482 blkCnt = blockSize2 >> 2U; 483 484 while (blkCnt > 0U) 485 { 486 /* Set all accumulators to zero */ 487 acc0 = 0.0f; 488 acc1 = 0.0f; 489 acc2 = 0.0f; 490 acc3 = 0.0f; 491 492 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 493 k = srcBLen >> 2U; 494 495 #if defined(ARM_MATH_NEON) 496 res = vdupq_n_f32(0) ; 497 498 x1v = vld1q_f32(px); 499 x2v = vld1q_f32(px+4); 500 501 do 502 { 503 c = vld1q_f32(py-3); 504 505 px += 4; 506 x = x1v; 507 res = vmlaq_n_f32(res,x,c[3]); 508 509 x = vextq_f32(x1v,x2v,1); 510 511 res = vmlaq_n_f32(res,x,c[2]); 512 513 x = vextq_f32(x1v,x2v,2); 514 515 res = vmlaq_n_f32(res,x,c[1]); 516 517 x = vextq_f32(x1v,x2v,3); 518 519 res = vmlaq_n_f32(res,x,c[0]); 520 521 py -= 4; 522 523 x1v = x2v ; 524 x2v = vld1q_f32(px+4); 525 526 } while (--k); 527 528 529 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 530 ** No loop unrolling is used. 
*/ 531 k = srcBLen & 0x3; 532 533 x1v = vld1q_f32(px); 534 px += 4; 535 536 while (k > 0U) 537 { 538 /* Read y[srcBLen - 5] sample */ 539 c0 = *(py--); 540 541 res = vmlaq_n_f32(res,x1v,c0); 542 543 /* Reuse the present samples for the next MAC */ 544 x1v[0] = x1v[1]; 545 x1v[1] = x1v[2]; 546 x1v[2] = x1v[3]; 547 548 x1v[3] = *(px++); 549 550 /* Decrement the loop counter */ 551 k--; 552 } 553 554 acc0 = res[0]; 555 acc1 = res[1]; 556 acc2 = res[2]; 557 acc3 = res[3]; 558 559 #else 560 /* read x[0], x[1], x[2] samples */ 561 x0 = *px++; 562 x1 = *px++; 563 x2 = *px++; 564 565 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 566 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 567 do 568 { 569 /* Read y[srcBLen - 1] sample */ 570 c0 = *py--; 571 /* Read x[3] sample */ 572 x3 = *(px); 573 574 /* Perform the multiply-accumulate */ 575 /* acc0 += x[0] * y[srcBLen - 1] */ 576 acc0 += x0 * c0; 577 /* acc1 += x[1] * y[srcBLen - 1] */ 578 acc1 += x1 * c0; 579 /* acc2 += x[2] * y[srcBLen - 1] */ 580 acc2 += x2 * c0; 581 /* acc3 += x[3] * y[srcBLen - 1] */ 582 acc3 += x3 * c0; 583 584 /* Read y[srcBLen - 2] sample */ 585 c0 = *py--; 586 /* Read x[4] sample */ 587 x0 = *(px + 1U); 588 589 /* Perform the multiply-accumulate */ 590 /* acc0 += x[1] * y[srcBLen - 2] */ 591 acc0 += x1 * c0; 592 /* acc1 += x[2] * y[srcBLen - 2] */ 593 acc1 += x2 * c0; 594 /* acc2 += x[3] * y[srcBLen - 2] */ 595 acc2 += x3 * c0; 596 /* acc3 += x[4] * y[srcBLen - 2] */ 597 acc3 += x0 * c0; 598 599 /* Read y[srcBLen - 3] sample */ 600 c0 = *py--; 601 /* Read x[5] sample */ 602 x1 = *(px + 2U); 603 604 /* Perform the multiply-accumulate */ 605 /* acc0 += x[2] * y[srcBLen - 3] */ 606 acc0 += x2 * c0; 607 /* acc1 += x[3] * y[srcBLen - 2] */ 608 acc1 += x3 * c0; 609 /* acc2 += x[4] * y[srcBLen - 2] */ 610 acc2 += x0 * c0; 611 /* acc3 += x[5] * y[srcBLen - 2] */ 612 acc3 += x1 * c0; 613 614 /* Read y[srcBLen - 4] sample */ 615 c0 = *py--; 616 /* Read 
x[6] sample */ 617 x2 = *(px + 3U); 618 px += 4U; 619 620 /* Perform the multiply-accumulate */ 621 /* acc0 += x[3] * y[srcBLen - 4] */ 622 acc0 += x3 * c0; 623 /* acc1 += x[4] * y[srcBLen - 4] */ 624 acc1 += x0 * c0; 625 /* acc2 += x[5] * y[srcBLen - 4] */ 626 acc2 += x1 * c0; 627 /* acc3 += x[6] * y[srcBLen - 4] */ 628 acc3 += x2 * c0; 629 630 } while (--k); 631 632 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 633 ** No loop unrolling is used. */ 634 k = srcBLen % 0x4U; 635 636 while (k > 0U) 637 { 638 /* Read y[srcBLen - 5] sample */ 639 c0 = *py--; 640 /* Read x[7] sample */ 641 x3 = *px++; 642 643 /* Perform the multiply-accumulate */ 644 /* acc0 += x[4] * y[srcBLen - 5] */ 645 acc0 += x0 * c0; 646 /* acc1 += x[5] * y[srcBLen - 5] */ 647 acc1 += x1 * c0; 648 /* acc2 += x[6] * y[srcBLen - 5] */ 649 acc2 += x2 * c0; 650 /* acc3 += x[7] * y[srcBLen - 5] */ 651 acc3 += x3 * c0; 652 653 /* Reuse the present samples for the next MAC */ 654 x0 = x1; 655 x1 = x2; 656 x2 = x3; 657 658 /* Decrement the loop counter */ 659 k--; 660 } 661 #endif /* #if defined(ARM_MATH_NEON) */ 662 663 /* Store the result in the accumulator in the destination buffer. */ 664 *pOut++ = acc0; 665 *pOut++ = acc1; 666 *pOut++ = acc2; 667 *pOut++ = acc3; 668 669 /* Increment the pointer pIn1 index, count by 4 */ 670 count += 4U; 671 672 /* Update the inputA and inputB pointers for next MAC calculation */ 673 px = pIn1 + count; 674 py = pSrc2; 675 676 /* Decrement the loop counter */ 677 blkCnt--; 678 } 679 680 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 681 ** No loop unrolling is used. 
*/ 682 blkCnt = blockSize2 % 0x4U; 683 684 #else 685 686 /* Initialize blkCnt with number of samples */ 687 blkCnt = blockSize2; 688 689 #endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined (ARM_MATH_NEON)*/ 690 691 while (blkCnt > 0U) 692 { 693 /* Accumulator is made zero for every iteration */ 694 sum = 0.0f; 695 696 #if defined(ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL) 697 /* Loop unrolling: Compute 4 outputs at a time */ 698 k = srcBLen >> 2U; 699 700 #if defined (ARM_MATH_NEON) 701 float32x4_t res = vdupq_n_f32(0) ; 702 float32x4_t x = vdupq_n_f32(0) ; 703 float32x4_t y = vdupq_n_f32(0) ; 704 float32x2_t accum = vdup_n_f32(0) ; 705 706 /* First part of the processing. Compute 4 MACs at a time. 707 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 708 while (k > 0U) 709 { 710 x = vld1q_f32(px); 711 y = vld1q_f32(py-3); 712 713 y = vrev64q_f32(y); 714 y = vcombine_f32(vget_high_f32(y), vget_low_f32(y)); 715 716 res = vmlaq_f32(res,x,y); 717 718 px += 4 ; 719 py -= 4 ; 720 721 /* Decrement the loop counter */ 722 k--; 723 } 724 725 accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res)); 726 sum += accum[0] + accum[1]; 727 728 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 729 ** No loop unrolling is used. 
*/ 730 k = srcBLen & 0x3U; 731 732 #else 733 while (k > 0U) 734 { 735 /* Perform the multiply-accumulate */ 736 sum += *px++ * *py--; 737 sum += *px++ * *py--; 738 sum += *px++ * *py--; 739 sum += *px++ * *py--; 740 741 /* Decrement loop counter */ 742 k--; 743 } 744 745 /* Loop unrolling: Compute remaining outputs */ 746 k = srcBLen % 0x4U; 747 748 #endif /* if defined (ARM_MATH_NEON) */ 749 #else 750 /* Initialize blkCnt with number of samples */ 751 k = srcBLen; 752 753 #endif /* #if defined(ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL) */ 754 755 while (k > 0U) 756 { 757 /* Perform the multiply-accumulate */ 758 sum += *px++ * *py--; 759 760 /* Decrement the loop counter */ 761 k--; 762 } 763 764 /* Store the result in the accumulator in the destination buffer. */ 765 *pOut++ = sum; 766 767 /* Increment the MAC count */ 768 count++; 769 770 /* Update the inputA and inputB pointers for next MAC calculation */ 771 px = pIn1 + count; 772 py = pSrc2; 773 774 /* Decrement the loop counter */ 775 blkCnt--; 776 } 777 } 778 else 779 { 780 /* If the srcBLen is not a multiple of 4, 781 * the blockSize2 loop cannot be unrolled by 4 */ 782 blkCnt = blockSize2; 783 784 while (blkCnt > 0U) 785 { 786 /* Accumulator is made zero for every iteration */ 787 sum = 0.0f; 788 789 /* srcBLen number of MACS should be performed */ 790 k = srcBLen; 791 792 while (k > 0U) 793 { 794 /* Perform the multiply-accumulate */ 795 sum += *px++ * *py--; 796 797 /* Decrement the loop counter */ 798 k--; 799 } 800 801 /* Store the result in the accumulator in the destination buffer. 
*/ 802 *pOut++ = sum; 803 804 /* Increment the MAC count */ 805 count++; 806 807 /* Update the inputA and inputB pointers for next MAC calculation */ 808 px = pIn1 + count; 809 py = pSrc2; 810 811 /* Decrement the loop counter */ 812 blkCnt--; 813 } 814 } 815 816 817 /* -------------------------- 818 * Initializations of stage3 819 * -------------------------*/ 820 821 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 822 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 823 * .... 824 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 825 * sum += x[srcALen-1] * y[srcBLen-1] 826 */ 827 828 /* In this stage the MAC operations are decreased by 1 for every iteration. 829 The blockSize3 variable holds the number of MAC operations performed */ 830 831 /* Working pointer of inputA */ 832 pSrc1 = pIn1 + (srcALen - (srcBLen - 1U)); 833 px = pSrc1; 834 835 /* Working pointer of inputB */ 836 pSrc2 = pIn2 + (srcBLen - 1U); 837 py = pSrc2; 838 839 /* ------------------- 840 * Stage3 process 841 * ------------------*/ 842 while (blockSize3 > 0U) 843 { 844 /* Accumulator is made zero for every iteration */ 845 sum = 0.0f; 846 847 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) 848 /* Loop unrolling: Compute 4 outputs at a time */ 849 k = blockSize3 >> 2U; 850 851 #if defined(ARM_MATH_NEON) 852 float32x4_t res = vdupq_n_f32(0) ; 853 float32x4_t x = vdupq_n_f32(0) ; 854 float32x4_t y = vdupq_n_f32(0) ; 855 float32x2_t accum = vdup_n_f32(0) ; 856 857 while (k > 0U) 858 { 859 x = vld1q_f32(px); 860 y = vld1q_f32(py-3); 861 862 y = vrev64q_f32(y); 863 y = vcombine_f32(vget_high_f32(y), vget_low_f32(y)); 864 865 res = vmlaq_f32(res,x,y); 866 867 px += 4 ; 868 py -= 4 ; 869 870 /* Decrement the loop counter */ 871 k--; 872 } 873 874 accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res)); 875 sum += accum[0] + accum[1]; 876 877 #else 878 
while (k > 0U) 879 { 880 /* Perform the multiply-accumulate */ 881 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 882 sum += *px++ * *py--; 883 884 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 885 sum += *px++ * *py--; 886 887 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 888 sum += *px++ * *py--; 889 890 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 891 sum += *px++ * *py--; 892 893 /* Decrement loop counter */ 894 k--; 895 } 896 #endif /* #if defined (ARM_MATH_NEON) */ 897 898 /* Loop unrolling: Compute remaining outputs */ 899 k = blockSize3 % 0x4U; 900 #else 901 902 /* Initialize blkCnt with number of samples */ 903 k = blockSize3; 904 905 #endif /* #if defined (ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL)*/ 906 907 while (k > 0U) 908 { 909 /* Perform the multiply-accumulate */ 910 /* sum += x[srcALen-1] * y[srcBLen-1] */ 911 sum += *px++ * *py--; 912 913 /* Decrement loop counter */ 914 k--; 915 } 916 917 /* Store the result in the accumulator in the destination buffer. 
*/ 918 *pOut++ = sum; 919 920 /* Update the inputA and inputB pointers for next MAC calculation */ 921 px = ++pSrc1; 922 py = pSrc2; 923 924 /* Decrement the loop counter */ 925 blockSize3--; 926 } 927 928 #else 929 /* alternate version for CM0_FAMILY */ 930 931 const float32_t *pIn1 = pSrcA; /* InputA pointer */ 932 const float32_t *pIn2 = pSrcB; /* InputB pointer */ 933 float32_t sum; /* Accumulator */ 934 uint32_t i, j; /* Loop counters */ 935 936 /* Loop to calculate convolution for output length number of times */ 937 for (i = 0U; i < (srcALen + srcBLen - 1U); i++) 938 { 939 /* Initialize sum with zero to carry out MAC operations */ 940 sum = 0.0f; 941 942 /* Loop to perform MAC operations according to convolution equation */ 943 for (j = 0U; j <= i; j++) 944 { 945 /* Check the array limitations */ 946 if (((i - j) < srcBLen) && (j < srcALen)) 947 { 948 /* z[i] += x[i-j] * y[j] */ 949 sum += ( pIn1[j] * pIn2[i - j]); 950 } 951 } 952 953 /* Store the output in the destination buffer */ 954 pDst[i] = sum; 955 } 956 957 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */ 958 959 } 960 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ 961 962 /** 963 @} end of Conv group 964 */