arm_conv_q7.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_conv_q7.c 4 * Description: Convolution of Q7 sequences 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/filtering_functions.h" 30 31 /** 32 @ingroup groupFilters 33 */ 34 35 /** 36 @addtogroup Conv 37 @{ 38 */ 39 40 /** 41 @brief Convolution of Q7 sequences. 42 @param[in] pSrcA points to the first input sequence 43 @param[in] srcALen length of the first input sequence 44 @param[in] pSrcB points to the second input sequence 45 @param[in] srcBLen length of the second input sequence 46 @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 47 @return none 48 49 @par Scaling and Overflow Behavior 50 The function is implemented using a 32-bit internal accumulator. 51 Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. 52 The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. 53 This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>. 54 The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format. 55 @remark 56 Refer to \ref arm_conv_opt_q7() for a faster implementation of this function. 57 */ 58 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) 59 #include "arm_helium_utils.h" 60 61 #include "arm_vec_filtering.h" 62 63 void arm_conv_q7( 64 const q7_t * pSrcA, 65 uint32_t srcALen, 66 const q7_t * pSrcB, 67 uint32_t srcBLen, 68 q7_t * pDst) 69 { 70 const q7_t *pIn1 = pSrcA; /* inputA pointer */ 71 const q7_t *pIn2 = pSrcB; /* inputB pointer */ 72 /* 73 * Loop to perform MAC operations according to correlation equation 74 */ 75 const q7_t *pX; 76 const q7_t *pY; 77 const q7_t *pA; 78 const q7_t *pB; 79 int32_t i = 0U, j = 0; /* loop counters */ 80 int32_t block1, block2, block3; 81 uint8_t vddupStartIdx = 15; 82 uint8x16_t decrIdxVec = vddupq_u8(vddupStartIdx, 1); 83 84 if (srcALen < srcBLen) 85 { 86 /* 87 * Initialization to inputB pointer 88 */ 89 pIn1 = pSrcB; 90 /* 91 * Initialization to the end of inputA pointer 92 */ 93 pIn2 = pSrcA; 94 /* 95 * Swapping the lengths 96 */ 97 j = srcALen; 98 srcALen = srcBLen; 99 srcBLen = j; 100 } 101 102 block1 = srcBLen - 1; 103 block2 = srcALen - srcBLen + 1; 104 block3 = srcBLen - 1; 105 106 pA = pIn1; 107 pB = pIn2 - 15; 108 109 for (i = 0; i <= block1 - 2; i += 2) 110 { 111 uint32_t count = i + 1; 112 int32_t acc0 = 0; 113 int32_t acc1 = 0; 114 115 pX = pA; 116 pY = pB; 117 118 MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count); 119 *pDst++ = (q7_t) acc0; 120 *pDst++ = (q7_t) acc1; 121 pB += 2; 122 } 123 for (; i < block1; i++) 124 { 125 uint32_t count = i + 1; 126 int32_t acc = 0; 127 128 pX = pA; 129 pY = pB; 130 131 MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count); 132 *pDst++ = (q7_t) acc; 133 pB++; 134 } 135 136 for (i = 0; i <= block2 - 4; i += 4) 137 { 138 uint32_t count = srcBLen; 139 int32_t acc0 = 0; 140 int32_t acc1 = 0; 141 int32_t acc2 = 0; 142 int32_t acc3 = 0; 143 144 pX = pA; 145 pY = pB; 146 /* 147 * compute 4 accumulators per loop 148 * size is fixed for all accumulators 149 * X pointer is incrementing for successive accumulators 150 */ 151 MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count); 152 *pDst++ = (q7_t) acc0; 153 *pDst++ = (q7_t) acc1; 154 *pDst++ = (q7_t) acc2; 155 *pDst++ = (q7_t) acc3; 156 pA += 4; 157 } 158 for (; i <= block2 - 2; i += 2) 159 { 160 uint32_t count = srcBLen; 161 int32_t acc0 = 0; 162 int32_t acc1 = 0; 163 164 pX = pA; 165 pY = pB; 166 /* 167 * compute 2 accumulators per loop 168 * size is fixed for all accumulators 169 * X pointer is incrementing for successive accumulators 170 */ 171 MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count); 172 *pDst++ = (q7_t) acc0; 173 *pDst++ = (q7_t) acc1; 174 pA += 2; 175 } 176 if (block2 & 1) 177 { 178 uint32_t count = srcBLen; 179 int32_t acc = 0; 180 181 pX = pA; 182 pY = pB; 183 184 MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count); 185 *pDst++ = (q7_t) acc; 186 pA++; 187 } 188 189 for (i = block3; i >= 1; i -= 2) 190 { 191 uint32_t count = i; 192 int32_t acc0 = 0; 193 int32_t acc1 = 0; 194 195 pX = pA; 196 pY = pB; 197 198 MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count); 199 *pDst++ = (q7_t) acc0; 200 *pDst++ = (q7_t) acc1; 201 pA += 2; 202 } 203 for (; i >= 1; i--) 204 { 205 uint32_t count = i; 206 int32_t acc = 0; 207 208 pX = pA; 209 pY = pB; 210 211 MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count); 212 *pDst++ = (q7_t) acc; 213 pA++; 214 } 215 } 216 217 #else 218 void arm_conv_q7( 219 const q7_t * pSrcA, 220 uint32_t srcALen, 221 const q7_t * pSrcB, 222 uint32_t srcBLen, 223 q7_t * pDst) 224 { 225 226 #if (1) 227 //#if !defined(ARM_MATH_CM0_FAMILY) 228 229 const q7_t *pIn1; /* InputA pointer */ 230 const q7_t *pIn2; /* InputB pointer */ 231 q7_t *pOut = pDst; /* Output pointer */ 232 const q7_t *px; /* Intermediate inputA pointer */ 233 const q7_t *py; /* Intermediate inputB pointer */ 234 const q7_t *pSrc1, *pSrc2; /* Intermediate pointers */ 235 q31_t sum; /* Accumulators */ 236 uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */ 237 uint32_t j, k, count, blkCnt; /* Loop counters */ 238 239 #if defined (ARM_MATH_LOOPUNROLL) 240 q31_t acc0, acc1, acc2, acc3; /* Accumulators */ 241 q31_t input1, input2; /* Temporary input variables */ 242 q15_t in1, in2; /* Temporary input variables */ 243 q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */ 244 #endif 245 246 /* The algorithm implementation is based on the lengths of the inputs. */ 247 /* srcB is always made to slide across srcA. */ 248 /* So srcBLen is always considered as shorter or equal to srcALen */ 249 if (srcALen >= srcBLen) 250 { 251 /* Initialization of inputA pointer */ 252 pIn1 = pSrcA; 253 254 /* Initialization of inputB pointer */ 255 pIn2 = pSrcB; 256 } 257 else 258 { 259 /* Initialization of inputA pointer */ 260 pIn1 = pSrcB; 261 262 /* Initialization of inputB pointer */ 263 pIn2 = pSrcA; 264 265 /* srcBLen is always considered as shorter or equal to srcALen */ 266 j = srcBLen; 267 srcBLen = srcALen; 268 srcALen = j; 269 } 270 271 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 272 /* The function is internally 273 * divided into three stages according to the number of multiplications that has to be 274 * taken place between inputA samples and inputB samples. In the first stage of the 275 * algorithm, the multiplications increase by one for every iteration. 276 * In the second stage of the algorithm, srcBLen number of multiplications are done. 277 * In the third stage of the algorithm, the multiplications decrease by one 278 * for every iteration. */ 279 280 /* The algorithm is implemented in three stages. 281 The loop counters of each stage is initiated here. */ 282 blockSize1 = srcBLen - 1U; 283 blockSize2 = srcALen - (srcBLen - 1U); 284 blockSize3 = blockSize1; 285 286 /* -------------------------- 287 * Initializations of stage1 288 * -------------------------*/ 289 290 /* sum = x[0] * y[0] 291 * sum = x[0] * y[1] + x[1] * y[0] 292 * .... 293 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 294 */ 295 296 /* In this stage the MAC operations are increased by 1 for every iteration. 297 The count variable holds the number of MAC operations performed */ 298 count = 1U; 299 300 /* Working pointer of inputA */ 301 px = pIn1; 302 303 /* Working pointer of inputB */ 304 py = pIn2; 305 306 307 /* ------------------------ 308 * Stage1 process 309 * ----------------------*/ 310 311 /* The first stage starts here */ 312 while (blockSize1 > 0U) 313 { 314 /* Accumulator is made zero for every iteration */ 315 sum = 0; 316 317 #if defined (ARM_MATH_LOOPUNROLL) 318 319 /* Loop unrolling: Compute 4 outputs at a time */ 320 k = count >> 2U; 321 322 while (k > 0U) 323 { 324 /* x[0] , x[1] */ 325 in1 = (q15_t) *px++; 326 in2 = (q15_t) *px++; 327 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 328 329 /* y[srcBLen - 1] , y[srcBLen - 2] */ 330 in1 = (q15_t) *py--; 331 in2 = (q15_t) *py--; 332 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 333 334 /* x[0] * y[srcBLen - 1] */ 335 /* x[1] * y[srcBLen - 2] */ 336 sum = __SMLAD(input1, input2, sum); 337 338 /* x[2] , x[3] */ 339 in1 = (q15_t) *px++; 340 in2 = (q15_t) *px++; 341 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 342 343 /* y[srcBLen - 3] , y[srcBLen - 4] */ 344 in1 = (q15_t) *py--; 345 in2 = (q15_t) *py--; 346 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 347 348 /* x[2] * y[srcBLen - 3] */ 349 /* x[3] * y[srcBLen - 4] */ 350 sum = __SMLAD(input1, input2, sum); 351 352 /* Decrement loop counter */ 353 k--; 354 } 355 356 /* Loop unrolling: Compute remaining outputs */ 357 k = count % 0x4U; 358 359 #else 360 361 /* Initialize k with number of samples */ 362 k = count; 363 364 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 365 366 while (k > 0U) 367 { 368 /* Perform the multiply-accumulate */ 369 sum += ((q15_t) *px++ * *py--); 370 371 /* Decrement loop counter */ 372 k--; 373 } 374 375 /* Store the result in the accumulator in the destination buffer. */ 376 *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8)); 377 378 /* Update the inputA and inputB pointers for next MAC calculation */ 379 py = pIn2 + count; 380 px = pIn1; 381 382 /* Increment MAC count */ 383 count++; 384 385 /* Decrement loop counter */ 386 blockSize1--; 387 } 388 389 /* -------------------------- 390 * Initializations of stage2 391 * ------------------------*/ 392 393 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 394 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 395 * .... 396 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 397 */ 398 399 /* Working pointer of inputA */ 400 px = pIn1; 401 402 /* Working pointer of inputB */ 403 pSrc2 = pIn2 + (srcBLen - 1U); 404 py = pSrc2; 405 406 /* count is index by which the pointer pIn1 to be incremented */ 407 count = 0U; 408 409 /* ------------------- 410 * Stage2 process 411 * ------------------*/ 412 413 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 414 * So, to loop unroll over blockSize2, 415 * srcBLen should be greater than or equal to 4 */ 416 if (srcBLen >= 4U) 417 { 418 #if defined (ARM_MATH_LOOPUNROLL) 419 420 /* Loop unrolling: Compute 4 outputs at a time */ 421 blkCnt = blockSize2 >> 2U; 422 423 while (blkCnt > 0U) 424 { 425 /* Set all accumulators to zero */ 426 acc0 = 0; 427 acc1 = 0; 428 acc2 = 0; 429 acc3 = 0; 430 431 /* read x[0], x[1], x[2] samples */ 432 x0 = *px++; 433 x1 = *px++; 434 x2 = *px++; 435 436 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 437 k = srcBLen >> 2U; 438 439 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 440 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 441 do 442 { 443 /* Read y[srcBLen - 1] sample */ 444 c0 = *py--; 445 /* Read y[srcBLen - 2] sample */ 446 c1 = *py--; 447 448 /* Read x[3] sample */ 449 x3 = *px++; 450 451 /* x[0] and x[1] are packed */ 452 in1 = (q15_t) x0; 453 in2 = (q15_t) x1; 454 455 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 456 457 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */ 458 in1 = (q15_t) c0; 459 in2 = (q15_t) c1; 460 461 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 462 463 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 464 acc0 = __SMLAD(input1, input2, acc0); 465 466 /* x[1] and x[2] are packed */ 467 in1 = (q15_t) x1; 468 in2 = (q15_t) x2; 469 470 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 471 472 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 473 acc1 = __SMLAD(input1, input2, acc1); 474 475 /* x[2] and x[3] are packed */ 476 in1 = (q15_t) x2; 477 in2 = (q15_t) x3; 478 479 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 480 481 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 482 acc2 = __SMLAD(input1, input2, acc2); 483 484 /* Read x[4] sample */ 485 x0 = *px++; 486 487 /* x[3] and x[4] are packed */ 488 in1 = (q15_t) x3; 489 in2 = (q15_t) x0; 490 491 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 492 493 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 494 acc3 = __SMLAD(input1, input2, acc3); 495 496 /* Read y[srcBLen - 3] sample */ 497 c0 = *py--; 498 /* Read y[srcBLen - 4] sample */ 499 c1 = *py--; 500 501 /* Read x[5] sample */ 502 x1 = *px++; 503 504 /* x[2] and x[3] are packed */ 505 in1 = (q15_t) x2; 506 in2 = (q15_t) x3; 507 508 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 509 510 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ 511 in1 = (q15_t) c0; 512 in2 = (q15_t) c1; 513 514 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 515 516 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 517 acc0 = __SMLAD(input1, input2, acc0); 518 519 /* x[3] and x[4] are packed */ 520 in1 = (q15_t) x3; 521 in2 = (q15_t) x0; 522 523 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 524 525 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 526 acc1 = __SMLAD(input1, input2, acc1); 527 528 /* x[4] and x[5] are packed */ 529 in1 = (q15_t) x0; 530 in2 = (q15_t) x1; 531 532 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 533 534 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 535 acc2 = __SMLAD(input1, input2, acc2); 536 537 /* Read x[6] sample */ 538 x2 = *px++; 539 540 /* x[5] and x[6] are packed */ 541 in1 = (q15_t) x1; 542 in2 = (q15_t) x2; 543 544 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 545 546 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 547 acc3 = __SMLAD(input1, input2, acc3); 548 549 } while (--k); 550 551 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 552 ** No loop unrolling is used. */ 553 k = srcBLen % 0x4U; 554 555 while (k > 0U) 556 { 557 /* Read y[srcBLen - 5] sample */ 558 c0 = *py--; 559 /* Read x[7] sample */ 560 x3 = *px++; 561 562 /* Perform the multiply-accumulates */ 563 /* acc0 += x[4] * y[srcBLen - 5] */ 564 acc0 += ((q15_t) x0 * c0); 565 /* acc1 += x[5] * y[srcBLen - 5] */ 566 acc1 += ((q15_t) x1 * c0); 567 /* acc2 += x[6] * y[srcBLen - 5] */ 568 acc2 += ((q15_t) x2 * c0); 569 /* acc3 += x[7] * y[srcBLen - 5] */ 570 acc3 += ((q15_t) x3 * c0); 571 572 /* Reuse the present samples for the next MAC */ 573 x0 = x1; 574 x1 = x2; 575 x2 = x3; 576 577 /* Decrement loop counter */ 578 k--; 579 } 580 581 /* Store the result in the accumulator in the destination buffer. */ 582 *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8)); 583 *pOut++ = (q7_t) (__SSAT(acc1 >> 7U, 8)); 584 *pOut++ = (q7_t) (__SSAT(acc2 >> 7U, 8)); 585 *pOut++ = (q7_t) (__SSAT(acc3 >> 7U, 8)); 586 587 /* Increment the pointer pIn1 index, count by 4 */ 588 count += 4U; 589 590 /* Update the inputA and inputB pointers for next MAC calculation */ 591 px = pIn1 + count; 592 py = pSrc2; 593 594 /* Decrement loop counter */ 595 blkCnt--; 596 } 597 598 /* Loop unrolling: Compute remaining outputs */ 599 blkCnt = blockSize2 % 0x4U; 600 601 #else 602 603 /* Initialize blkCnt with number of samples */ 604 blkCnt = blockSize2; 605 606 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 607 608 while (blkCnt > 0U) 609 { 610 /* Accumulator is made zero for every iteration */ 611 sum = 0; 612 613 #if defined (ARM_MATH_LOOPUNROLL) 614 615 /* Loop unrolling: Compute 4 outputs at a time */ 616 k = srcBLen >> 2U; 617 618 while (k > 0U) 619 { 620 621 /* Reading two inputs of SrcA buffer and packing */ 622 in1 = (q15_t) *px++; 623 in2 = (q15_t) *px++; 624 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 625 626 /* Reading two inputs of SrcB buffer and packing */ 627 in1 = (q15_t) *py--; 628 in2 = (q15_t) *py--; 629 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 630 631 /* Perform the multiply-accumulate */ 632 sum = __SMLAD(input1, input2, sum); 633 634 /* Reading two inputs of SrcA buffer and packing */ 635 in1 = (q15_t) *px++; 636 in2 = (q15_t) *px++; 637 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 638 639 /* Reading two inputs of SrcB buffer and packing */ 640 in1 = (q15_t) *py--; 641 in2 = (q15_t) *py--; 642 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 643 644 /* Perform the multiply-accumulate */ 645 sum = __SMLAD(input1, input2, sum); 646 647 /* Decrement loop counter */ 648 k--; 649 } 650 651 /* Loop unrolling: Compute remaining outputs */ 652 k = srcBLen % 0x4U; 653 654 #else 655 656 /* Initialize blkCnt with number of samples */ 657 k = srcBLen; 658 659 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 660 661 while (k > 0U) 662 { 663 /* Perform the multiply-accumulate */ 664 sum += ((q15_t) *px++ * *py--); 665 666 /* Decrement the loop counter */ 667 k--; 668 } 669 670 /* Store the result in the accumulator in the destination buffer. */ 671 *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8)); 672 673 /* Increment the pointer pIn1 index, count by 1 */ 674 count++; 675 676 /* Update the inputA and inputB pointers for next MAC calculation */ 677 px = pIn1 + count; 678 py = pSrc2; 679 680 /* Decrement the loop counter */ 681 blkCnt--; 682 } 683 } 684 else 685 { 686 /* If the srcBLen is not a multiple of 4, 687 * the blockSize2 loop cannot be unrolled by 4 */ 688 blkCnt = blockSize2; 689 690 while (blkCnt > 0U) 691 { 692 /* Accumulator is made zero for every iteration */ 693 sum = 0; 694 695 /* srcBLen number of MACS should be performed */ 696 k = srcBLen; 697 698 while (k > 0U) 699 { 700 /* Perform the multiply-accumulate */ 701 sum += ((q15_t) *px++ * *py--); 702 703 /* Decrement the loop counter */ 704 k--; 705 } 706 707 /* Store the result in the accumulator in the destination buffer. */ 708 *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8)); 709 710 /* Increment the MAC count */ 711 count++; 712 713 /* Update the inputA and inputB pointers for next MAC calculation */ 714 px = pIn1 + count; 715 py = pSrc2; 716 717 /* Decrement loop counter */ 718 blkCnt--; 719 } 720 } 721 722 723 /* -------------------------- 724 * Initializations of stage3 725 * -------------------------*/ 726 727 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 728 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 729 * .... 730 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 731 * sum += x[srcALen-1] * y[srcBLen-1] 732 */ 733 734 /* In this stage the MAC operations are decreased by 1 for every iteration. 735 The blockSize3 variable holds the number of MAC operations performed */ 736 737 /* Working pointer of inputA */ 738 pSrc1 = pIn1 + (srcALen - (srcBLen - 1U)); 739 px = pSrc1; 740 741 /* Working pointer of inputB */ 742 pSrc2 = pIn2 + (srcBLen - 1U); 743 py = pSrc2; 744 745 /* ------------------- 746 * Stage3 process 747 * ------------------*/ 748 749 while (blockSize3 > 0U) 750 { 751 /* Accumulator is made zero for every iteration */ 752 sum = 0; 753 754 #if defined (ARM_MATH_LOOPUNROLL) 755 756 /* Loop unrolling: Compute 4 outputs at a time */ 757 k = blockSize3 >> 2U; 758 759 while (k > 0U) 760 { 761 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ 762 in1 = (q15_t) *px++; 763 in2 = (q15_t) *px++; 764 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 765 766 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ 767 in1 = (q15_t) *py--; 768 in2 = (q15_t) *py--; 769 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 770 771 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 772 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 773 sum = __SMLAD(input1, input2, sum); 774 775 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ 776 in1 = (q15_t) *px++; 777 in2 = (q15_t) *px++; 778 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 779 780 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ 781 in1 = (q15_t) *py--; 782 in2 = (q15_t) *py--; 783 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 784 785 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 786 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 787 sum = __SMLAD(input1, input2, sum); 788 789 /* Decrement loop counter */ 790 k--; 791 } 792 793 /* Loop unrolling: Compute remaining outputs */ 794 k = blockSize3 % 0x4U; 795 796 #else 797 798 /* Initialize blkCnt with number of samples */ 799 k = blockSize3; 800 801 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 802 803 while (k > 0U) 804 { 805 /* Perform the multiply-accumulate */ 806 /* sum += x[srcALen-1] * y[srcBLen-1] */ 807 sum += ((q15_t) *px++ * *py--); 808 809 /* Decrement loop counter */ 810 k--; 811 } 812 813 /* Store the result in the accumulator in the destination buffer. */ 814 *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8)); 815 816 /* Update the inputA and inputB pointers for next MAC calculation */ 817 px = ++pSrc1; 818 py = pSrc2; 819 820 /* Decrement loop counter */ 821 blockSize3--; 822 } 823 824 #else 825 /* alternate version for CM0_FAMILY */ 826 827 const q7_t *pIn1 = pSrcA; /* InputA pointer */ 828 const q7_t *pIn2 = pSrcB; /* InputB pointer */ 829 q31_t sum; /* Accumulator */ 830 uint32_t i, j; /* Loop counters */ 831 832 /* Loop to calculate convolution for output length number of times */ 833 for (i = 0U; i < (srcALen + srcBLen - 1U); i++) 834 { 835 /* Initialize sum with zero to carry out MAC operations */ 836 sum = 0; 837 838 /* Loop to perform MAC operations according to convolution equation */ 839 for (j = 0U; j <= i; j++) 840 { 841 /* Check the array limitations */ 842 if (((i - j) < srcBLen) && (j < srcALen)) 843 { 844 /* z[i] += x[i-j] * y[j] */ 845 sum += ((q15_t) pIn1[j] * pIn2[i - j]); 846 } 847 } 848 849 /* Store the output in the destination buffer */ 850 pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U); 851 } 852 853 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */ 854 855 } 856 #endif /* defined(ARM_MATH_MVEI) */ 857 858 /** 859 @} end of Conv group 860 */