arm_conv_q31.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_conv_q31.c 4 * Description: Convolution of Q31 sequences 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/filtering_functions.h" 30 31 /** 32 @ingroup groupFilters 33 */ 34 35 /** 36 @addtogroup Conv 37 @{ 38 */ 39 40 /** 41 @brief Convolution of Q31 sequences. 42 @param[in] pSrcA points to the first input sequence 43 @param[in] srcALen length of the first input sequence 44 @param[in] pSrcB points to the second input sequence 45 @param[in] srcBLen length of the second input sequence 46 @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 47 @return none 48 49 @par Scaling and Overflow Behavior 50 The function is implemented using an internal 64-bit accumulator. 51 The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. 52 There is no saturation on intermediate additions. 53 Thus, if the accumulator overflows it wraps around and distorts the result. 54 The input signals should be scaled down to avoid intermediate overflows. 55 Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, 56 as maximum of min(srcALen, srcBLen) number of additions are carried internally. 57 The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result. 58 59 @remark 60 Refer to \ref arm_conv_fast_q31() for a faster but less precise implementation of this function. 61 */ 62 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) 63 #include "arm_helium_utils.h" 64 #include "arm_vec_filtering.h" 65 66 void arm_conv_q31( 67 const q31_t * pSrcA, 68 uint32_t srcALen, 69 const q31_t * pSrcB, 70 uint32_t srcBLen, 71 q31_t * pDst) 72 { 73 const q31_t *pIn1 = pSrcA; /* inputA pointer */ 74 const q31_t *pIn2 = pSrcB; /* inputB pointer */ 75 /* 76 * Loop to perform MAC operations according to correlation equation 77 */ 78 const q31_t *pX; 79 const q31_t *pY; 80 const q31_t *pA; 81 const q31_t *pB; 82 int32_t i = 0U, j = 0; /* loop counters */ 83 int32_t block1, block2, block3; 84 uint32_t vddupStartIdx = 3; 85 uint32x4_t decrIdxVec = vddupq_u32(vddupStartIdx, 1); 86 87 if (srcALen < srcBLen) 88 { 89 /* 90 * Initialization to inputB pointer 91 */ 92 pIn1 = pSrcB; 93 /* 94 * Initialization to the end of inputA pointer 95 */ 96 pIn2 = pSrcA; 97 /* 98 * Swapping the lengths 99 */ 100 j = srcALen; 101 srcALen = srcBLen; 102 srcBLen = j; 103 } 104 105 block1 = srcBLen - 1; 106 block2 = srcALen - srcBLen + 1; 107 block3 = srcBLen - 1; 108 109 pA = pIn1; 110 pB = pIn2 - 3; 111 112 for (i = 0; i <= block1 - 2; i += 2) 113 { 114 uint32_t count = i + 1; 115 int64_t acc0 = 0LL; 116 int64_t acc1 = 0LL; 117 118 pX = pA; 119 pY = pB; 120 MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count); 121 122 *pDst++ = (q31_t) acc0; 123 *pDst++ = (q31_t) acc1; 124 pB += 2; 125 } 126 for (; i < block1; i++) 127 { 128 uint32_t count = i + 1; 129 int64_t acc = 0LL; 130 131 pX = pA; 132 pY = pB; 133 MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count); 134 135 *pDst++ = (q31_t) acc; 136 pB++; 137 } 138 139 for (i = 0; i <= block2 - 4; i += 4) 140 { 141 uint32_t count = srcBLen; 142 int64_t acc0 = 0LL; 143 int64_t acc1 = 0LL; 144 int64_t acc2 = 0LL; 145 int64_t acc3 = 0LL; 146 147 pX = pA; 148 pY = pB; 149 /* 150 * compute 4 accumulators per loop 151 * size is fixed for all accumulators 152 * X pointer is incrementing for successive accumulators 153 */ 154 MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count); 155 *pDst++ = (q31_t) acc0; 156 *pDst++ = (q31_t) acc1; 157 *pDst++ = (q31_t) acc2; 158 *pDst++ = (q31_t) acc3; 159 160 pA += 4; 161 } 162 163 for (; i <= block2 - 2; i += 2) 164 { 165 uint32_t count = srcBLen; 166 int64_t acc0 = 0LL; 167 int64_t acc1 = 0LL; 168 169 pX = pA; 170 pY = pB; 171 /* 172 * compute 2 accumulators per loop 173 * size is fixed for all accumulators 174 * X pointer is incrementing for successive accumulators 175 */ 176 MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count); 177 *pDst++ = (q31_t) acc0; 178 *pDst++ = (q31_t) acc1; 179 180 pA += 2; 181 } 182 if (block2 & 1) 183 { 184 uint32_t count = srcBLen; 185 int64_t acc = 0LL; 186 187 pX = pA; 188 pY = pB; 189 190 MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count); 191 *pDst++ = (q31_t) acc; 192 pA++; 193 } 194 195 for (i = block3; i >= 2; i -= 2) 196 { 197 uint32_t count = i; 198 int64_t acc0 = 0LL; 199 int64_t acc1 = 0LL; 200 201 pX = pA; 202 pY = pB; 203 204 MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count); 205 *pDst++ = (q31_t) acc0; 206 *pDst++ = (q31_t) acc1; 207 pA += 2; 208 } 209 210 for (; i >= 1; i--) 211 { 212 uint32_t count = i; 213 int64_t acc = 0LL; 214 215 pX = pA; 216 pY = pB; 217 218 MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count); 219 *pDst++ = (q31_t) acc; 220 pA++; 221 } 222 223 } 224 225 #else 226 void arm_conv_q31( 227 const q31_t * pSrcA, 228 uint32_t srcALen, 229 const q31_t * pSrcB, 230 uint32_t srcBLen, 231 q31_t * pDst) 232 { 233 234 #if (1) 235 //#if !defined(ARM_MATH_CM0_FAMILY) 236 237 const q31_t *pIn1; /* InputA pointer */ 238 const q31_t *pIn2; /* InputB pointer */ 239 q31_t *pOut = pDst; /* Output pointer */ 240 const q31_t *px; /* Intermediate inputA pointer */ 241 const q31_t *py; /* Intermediate inputB pointer */ 242 const q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 243 q63_t sum; /* Accumulators */ 244 uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */ 245 uint32_t j, k, count, blkCnt; /* Loop counters */ 246 247 #if defined (ARM_MATH_LOOPUNROLL) 248 q63_t acc0, acc1, acc2; /* Accumulators */ 249 q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */ 250 #endif 251 252 /* The algorithm implementation is based on the lengths of the inputs. */ 253 /* srcB is always made to slide across srcA. */ 254 /* So srcBLen is always considered as shorter or equal to srcALen */ 255 if (srcALen >= srcBLen) 256 { 257 /* Initialization of inputA pointer */ 258 pIn1 = pSrcA; 259 260 /* Initialization of inputB pointer */ 261 pIn2 = pSrcB; 262 } 263 else 264 { 265 /* Initialization of inputA pointer */ 266 pIn1 = pSrcB; 267 268 /* Initialization of inputB pointer */ 269 pIn2 = pSrcA; 270 271 /* srcBLen is always considered as shorter or equal to srcALen */ 272 j = srcBLen; 273 srcBLen = srcALen; 274 srcALen = j; 275 } 276 277 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 278 /* The function is internally 279 * divided into three stages according to the number of multiplications that has to be 280 * taken place between inputA samples and inputB samples. In the first stage of the 281 * algorithm, the multiplications increase by one for every iteration. 282 * In the second stage of the algorithm, srcBLen number of multiplications are done. 283 * In the third stage of the algorithm, the multiplications decrease by one 284 * for every iteration. */ 285 286 /* The algorithm is implemented in three stages. 287 The loop counters of each stage is initiated here. */ 288 blockSize1 = srcBLen - 1U; 289 blockSize2 = srcALen - (srcBLen - 1U); 290 blockSize3 = blockSize1; 291 292 /* -------------------------- 293 * Initializations of stage1 294 * -------------------------*/ 295 296 /* sum = x[0] * y[0] 297 * sum = x[0] * y[1] + x[1] * y[0] 298 * .... 299 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 300 */ 301 302 /* In this stage the MAC operations are increased by 1 for every iteration. 303 The count variable holds the number of MAC operations performed */ 304 count = 1U; 305 306 /* Working pointer of inputA */ 307 px = pIn1; 308 309 /* Working pointer of inputB */ 310 py = pIn2; 311 312 313 /* ------------------------ 314 * Stage1 process 315 * ----------------------*/ 316 317 /* The first stage starts here */ 318 while (blockSize1 > 0U) 319 { 320 /* Accumulator is made zero for every iteration */ 321 sum = 0; 322 323 #if defined (ARM_MATH_LOOPUNROLL) 324 325 /* Loop unrolling: Compute 4 outputs at a time */ 326 k = count >> 2U; 327 328 while (k > 0U) 329 { 330 /* x[0] * y[srcBLen - 1] */ 331 sum += (q63_t) *px++ * (*py--); 332 333 /* x[1] * y[srcBLen - 2] */ 334 sum += (q63_t) *px++ * (*py--); 335 336 /* x[2] * y[srcBLen - 3] */ 337 sum += (q63_t) *px++ * (*py--); 338 339 /* x[3] * y[srcBLen - 4] */ 340 sum += (q63_t) *px++ * (*py--); 341 342 /* Decrement loop counter */ 343 k--; 344 } 345 346 /* Loop unrolling: Compute remaining outputs */ 347 k = count % 0x4U; 348 349 #else 350 351 /* Initialize k with number of samples */ 352 k = count; 353 354 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 355 356 while (k > 0U) 357 { 358 /* Perform the multiply-accumulate */ 359 sum += (q63_t) *px++ * *py--; 360 361 /* Decrement loop counter */ 362 k--; 363 } 364 365 /* Store the result in the accumulator in the destination buffer. */ 366 *pOut++ = (q31_t) (sum >> 31); 367 368 /* Update the inputA and inputB pointers for next MAC calculation */ 369 py = pIn2 + count; 370 px = pIn1; 371 372 /* Increment MAC count */ 373 count++; 374 375 /* Decrement loop counter */ 376 blockSize1--; 377 } 378 379 /* -------------------------- 380 * Initializations of stage2 381 * ------------------------*/ 382 383 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 384 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 385 * .... 386 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 387 */ 388 389 /* Working pointer of inputA */ 390 px = pIn1; 391 392 /* Working pointer of inputB */ 393 pSrc2 = pIn2 + (srcBLen - 1U); 394 py = pSrc2; 395 396 /* count is index by which the pointer pIn1 to be incremented */ 397 count = 0U; 398 399 /* ------------------- 400 * Stage2 process 401 * ------------------*/ 402 403 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 404 * So, to loop unroll over blockSize2, 405 * srcBLen should be greater than or equal to 4 */ 406 if (srcBLen >= 4U) 407 { 408 #if defined (ARM_MATH_LOOPUNROLL) 409 410 /* Loop unroll by 3 */ 411 blkCnt = blockSize2 / 3; 412 413 while (blkCnt > 0U) 414 { 415 /* Set all accumulators to zero */ 416 acc0 = 0; 417 acc1 = 0; 418 acc2 = 0; 419 420 /* read x[0], x[1], x[2] samples */ 421 x0 = *px++; 422 x1 = *px++; 423 424 /* Apply loop unrolling and compute 3 MACs simultaneously. */ 425 k = srcBLen / 3; 426 427 /* First part of the processing with loop unrolling. Compute 3 MACs at a time. 428 ** a second loop below computes MACs for the remaining 1 to 2 samples. */ 429 do 430 { 431 /* Read y[srcBLen - 1] sample */ 432 c0 = *(py); 433 /* Read x[3] sample */ 434 x2 = *(px); 435 436 /* Perform the multiply-accumulate */ 437 /* acc0 += x[0] * y[srcBLen - 1] */ 438 acc0 += ((q63_t) x0 * c0); 439 /* acc1 += x[1] * y[srcBLen - 1] */ 440 acc1 += ((q63_t) x1 * c0); 441 /* acc2 += x[2] * y[srcBLen - 1] */ 442 acc2 += ((q63_t) x2 * c0); 443 444 /* Read y[srcBLen - 2] sample */ 445 c0 = *(py - 1U); 446 /* Read x[4] sample */ 447 x0 = *(px + 1U); 448 449 /* Perform the multiply-accumulate */ 450 /* acc0 += x[1] * y[srcBLen - 2] */ 451 acc0 += ((q63_t) x1 * c0); 452 /* acc1 += x[2] * y[srcBLen - 2] */ 453 acc1 += ((q63_t) x2 * c0); 454 /* acc2 += x[3] * y[srcBLen - 2] */ 455 acc2 += ((q63_t) x0 * c0); 456 457 /* Read y[srcBLen - 3] sample */ 458 c0 = *(py - 2U); 459 /* Read x[5] sample */ 460 x1 = *(px + 2U); 461 462 /* Perform the multiply-accumulate */ 463 /* acc0 += x[2] * y[srcBLen - 3] */ 464 acc0 += ((q63_t) x2 * c0); 465 /* acc1 += x[3] * y[srcBLen - 2] */ 466 acc1 += ((q63_t) x0 * c0); 467 /* acc2 += x[4] * y[srcBLen - 2] */ 468 acc2 += ((q63_t) x1 * c0); 469 470 /* update scratch pointers */ 471 px += 3U; 472 py -= 3U; 473 474 } while (--k); 475 476 /* If the srcBLen is not a multiple of 3, compute any remaining MACs here. 477 ** No loop unrolling is used. */ 478 k = srcBLen - (3 * (srcBLen / 3)); 479 480 while (k > 0U) 481 { 482 /* Read y[srcBLen - 5] sample */ 483 c0 = *py--; 484 /* Read x[7] sample */ 485 x2 = *px++; 486 487 /* Perform the multiply-accumulates */ 488 /* acc0 += x[4] * y[srcBLen - 5] */ 489 acc0 += ((q63_t) x0 * c0); 490 /* acc1 += x[5] * y[srcBLen - 5] */ 491 acc1 += ((q63_t) x1 * c0); 492 /* acc2 += x[6] * y[srcBLen - 5] */ 493 acc2 += ((q63_t) x2 * c0); 494 495 /* Reuse the present samples for the next MAC */ 496 x0 = x1; 497 x1 = x2; 498 499 /* Decrement loop counter */ 500 k--; 501 } 502 503 /* Store the result in the accumulator in the destination buffer. */ 504 *pOut++ = (q31_t) (acc0 >> 31); 505 *pOut++ = (q31_t) (acc1 >> 31); 506 *pOut++ = (q31_t) (acc2 >> 31); 507 508 /* Increment the pointer pIn1 index, count by 3 */ 509 count += 3U; 510 511 /* Update the inputA and inputB pointers for next MAC calculation */ 512 px = pIn1 + count; 513 py = pSrc2; 514 515 /* Decrement loop counter */ 516 blkCnt--; 517 } 518 519 /* Loop unrolling: Compute remaining outputs */ 520 blkCnt = blockSize2 - 3 * (blockSize2 / 3); 521 522 #else 523 524 /* Initialize blkCnt with number of samples */ 525 blkCnt = blockSize2; 526 527 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 528 529 while (blkCnt > 0U) 530 { 531 /* Accumulator is made zero for every iteration */ 532 sum = 0; 533 534 #if defined (ARM_MATH_LOOPUNROLL) 535 536 /* Loop unrolling: Compute 4 outputs at a time */ 537 k = srcBLen >> 2U; 538 539 while (k > 0U) 540 { 541 /* Perform the multiply-accumulates */ 542 sum += (q63_t) *px++ * *py--; 543 sum += (q63_t) *px++ * *py--; 544 sum += (q63_t) *px++ * *py--; 545 sum += (q63_t) *px++ * *py--; 546 547 /* Decrement loop counter */ 548 k--; 549 } 550 551 /* Loop unrolling: Compute remaining outputs */ 552 k = srcBLen % 0x4U; 553 554 #else 555 556 /* Initialize blkCnt with number of samples */ 557 k = srcBLen; 558 559 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 560 561 while (k > 0U) 562 { 563 /* Perform the multiply-accumulate */ 564 sum += (q63_t) *px++ * *py--; 565 566 /* Decrement the loop counter */ 567 k--; 568 } 569 570 /* Store the result in the accumulator in the destination buffer. */ 571 *pOut++ = (q31_t) (sum >> 31); 572 573 /* Increment MAC count */ 574 count++; 575 576 /* Update the inputA and inputB pointers for next MAC calculation */ 577 px = pIn1 + count; 578 py = pSrc2; 579 580 /* Decrement loop counter */ 581 blkCnt--; 582 } 583 } 584 else 585 { 586 /* If the srcBLen is not a multiple of 4, 587 * the blockSize2 loop cannot be unrolled by 4 */ 588 blkCnt = blockSize2; 589 590 while (blkCnt > 0U) 591 { 592 /* Accumulator is made zero for every iteration */ 593 sum = 0; 594 595 /* srcBLen number of MACS should be performed */ 596 k = srcBLen; 597 598 while (k > 0U) 599 { 600 /* Perform the multiply-accumulate */ 601 sum += (q63_t) *px++ * *py--; 602 603 /* Decrement the loop counter */ 604 k--; 605 } 606 607 /* Store the result in the accumulator in the destination buffer. */ 608 *pOut++ = (q31_t) (sum >> 31); 609 610 /* Increment MAC count */ 611 count++; 612 613 /* Update the inputA and inputB pointers for next MAC calculation */ 614 px = pIn1 + count; 615 py = pSrc2; 616 617 /* Decrement loop counter */ 618 blkCnt--; 619 } 620 } 621 622 623 /* -------------------------- 624 * Initializations of stage3 625 * -------------------------*/ 626 627 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 628 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 629 * .... 630 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 631 * sum += x[srcALen-1] * y[srcBLen-1] 632 */ 633 634 /* In this stage the MAC operations are decreased by 1 for every iteration. 635 The blockSize3 variable holds the number of MAC operations performed */ 636 637 /* Working pointer of inputA */ 638 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); 639 px = pSrc1; 640 641 /* Working pointer of inputB */ 642 pSrc2 = pIn2 + (srcBLen - 1U); 643 py = pSrc2; 644 645 /* ------------------- 646 * Stage3 process 647 * ------------------*/ 648 649 while (blockSize3 > 0U) 650 { 651 /* Accumulator is made zero for every iteration */ 652 sum = 0; 653 654 #if defined (ARM_MATH_LOOPUNROLL) 655 656 /* Loop unrolling: Compute 4 outputs at a time */ 657 k = blockSize3 >> 2U; 658 659 while (k > 0U) 660 { 661 /* Perform the multiply-accumulate */ 662 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 663 sum += (q63_t) *px++ * *py--; 664 665 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 666 sum += (q63_t) *px++ * *py--; 667 668 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 669 sum += (q63_t) *px++ * *py--; 670 671 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 672 sum += (q63_t) *px++ * *py--; 673 674 /* Decrement loop counter */ 675 k--; 676 } 677 678 /* Loop unrolling: Compute remaining outputs */ 679 k = blockSize3 % 0x4U; 680 681 #else 682 683 /* Initialize blkCnt with number of samples */ 684 k = blockSize3; 685 686 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 687 688 while (k > 0U) 689 { 690 /* Perform the multiply-accumulate */ 691 /* sum += x[srcALen-1] * y[srcBLen-1] */ 692 sum += (q63_t) *px++ * *py--; 693 694 /* Decrement loop counter */ 695 k--; 696 } 697 698 /* Store the result in the accumulator in the destination buffer. */ 699 *pOut++ = (q31_t) (sum >> 31); 700 701 /* Update the inputA and inputB pointers for next MAC calculation */ 702 px = ++pSrc1; 703 py = pSrc2; 704 705 /* Decrement loop counter */ 706 blockSize3--; 707 } 708 709 #else 710 /* alternate version for CM0_FAMILY */ 711 712 const q31_t *pIn1 = pSrcA; /* InputA pointer */ 713 const q31_t *pIn2 = pSrcB; /* InputB pointer */ 714 q63_t sum; /* Accumulators */ 715 uint32_t i, j; /* Loop counters */ 716 717 /* Loop to calculate convolution for output length number of times */ 718 for (i = 0U; i < (srcALen + srcBLen - 1U); i++) 719 { 720 /* Initialize sum with zero to carry out MAC operations */ 721 sum = 0; 722 723 /* Loop to perform MAC operations according to convolution equation */ 724 for (j = 0U; j <= i; j++) 725 { 726 /* Check the array limitations */ 727 if (((i - j) < srcBLen) && (j < srcALen)) 728 { 729 /* z[i] += x[i-j] * y[j] */ 730 sum += ((q63_t) pIn1[j] * pIn2[i - j]); 731 } 732 } 733 734 /* Store the output in the destination buffer */ 735 pDst[i] = (q31_t) (sum >> 31U); 736 } 737 738 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */ 739 740 } 741 #endif /* defined(ARM_MATH_MVEI) */ 742 743 /** 744 @} end of Conv group 745 */