arm_fir_q15.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_fir_q15.c 4 * Description: Q15 FIR filter processing function 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/filtering_functions.h" 30 31 /** 32 @ingroup groupFilters 33 */ 34 35 /** 36 @addtogroup FIR 37 @{ 38 */ 39 40 /** 41 @brief Processing function for the Q15 FIR filter. 42 @param[in] S points to an instance of the Q15 FIR filter structure 43 @param[in] pSrc points to the block of input data 44 @param[out] pDst points to the block of output data 45 @param[in] blockSize number of samples to process 46 @return none 47 48 @par Scaling and Overflow Behavior 49 The function is implemented using a 64-bit internal accumulator. 50 Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result. 51 The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. 52 There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. 53 After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits. 54 Lastly, the accumulator is saturated to yield a result in 1.15 format. 55 56 @remark 57 Refer to \ref arm_fir_fast_q15() for a faster but less precise implementation of this function. 58 */ 59 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) 60 61 #define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff) 62 63 64 #define FIR_Q15_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs) \ 65 for (int j = 0; j < nbAcc; j++) { \ 66 const q15_t *pSmp = &pSample[j]; \ 67 q63_t acc[4]; \ 68 \ 69 acc[j] = 0; \ 70 for (int i = 0; i < nbVecTaps; i++) { \ 71 vecIn0 = vld1q(pSmp + 8 * i); \ 72 acc[j] = vmlaldavaq(acc[j], vecIn0, vecCoeffs[i]); \ 73 } \ 74 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc[j], 15); \ 75 } 76 77 #define FIR_Q15_MAIN_CORE() \ 78 { \ 79 q15_t *pState = S->pState; /* State pointer */ \ 80 const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ \ 81 q15_t *pStateCur; /* Points to the current sample of the state */ \ 82 const q15_t *pSamples; /* Temporary pointer to the sample buffer */ \ 83 q15_t *pOutput; /* Temporary pointer to the output buffer */ \ 84 const q15_t *pTempSrc; /* Temporary pointer to the source data */ \ 85 q15_t *pTempDest; /* Temporary pointer to the destination buffer */\ 86 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */\ 87 int32_t blkCnt; \ 88 q15x8_t vecIn0; \ 89 \ 90 /* \ 91 * load coefs \ 92 */ \ 93 q15x8_t vecCoeffs[NBVECTAPS]; \ 94 \ 95 for (int i = 0; i < NBVECTAPS; i++) \ 96 vecCoeffs[i] = vldrhq_s16(pCoeffs + 8 * i); \ 97 \ 98 /* \ 99 * pState points to state array which contains previous frame (numTaps - 1) samples \ 100 * pStateCur points to the location where the new input data should be written \ 101 */ \ 102 pStateCur = &(pState[(numTaps - 1u)]); \ 103 pTempSrc = pSrc; \ 104 pSamples = pState; \ 105 pOutput = pDst; \ 106 \ 107 blkCnt = blockSize >> 2; \ 108 while (blkCnt > 0) { \ 109 /* \ 110 * Save 4 input samples in the history buffer \ 111 */ \ 112 vstrhq_s32(pStateCur, vldrhq_s32(pTempSrc)); \ 113 pStateCur += 4; \ 114 pTempSrc += 4; \ 115 \ 116 FIR_Q15_CORE(pOutput, 4, NBVECTAPS, pSamples, vecCoeffs); \ 117 pSamples += 4; \ 118 \ 119 blkCnt--; \ 120 } \ 121 \ 122 /* tail */ \ 123 int32_t residual = blockSize & 3; \ 124 \ 125 for (int i = 0; i < residual; i++) \ 126 *pStateCur++ = *pTempSrc++; \ 127 \ 128 FIR_Q15_CORE(pOutput, residual, NBVECTAPS, pSamples, vecCoeffs); \ 129 \ 130 /* \ 131 * Copy the samples back into the history buffer start \ 132 */ \ 133 pTempSrc = &pState[blockSize]; \ 134 pTempDest = pState; \ 135 \ 136 /* current compiler limitation */ \ 137 blkCnt = (numTaps - 1) >> 3; \ 138 while (blkCnt > 0) \ 139 { \ 140 vstrhq_s16(pTempDest, vldrhq_s16(pTempSrc)); \ 141 pTempSrc += 8; \ 142 pTempDest += 8; \ 143 blkCnt--; \ 144 } \ 145 blkCnt = (numTaps - 1) & 7; \ 146 if (blkCnt > 0) \ 147 { \ 148 mve_pred16_t p = vctp16q(blkCnt); \ 149 vstrhq_p_s16(pTempDest, vldrhq_z_s16(pTempSrc, p), p); \ 150 } \ 151 } 152 153 static void arm_fir_q15_25_32_mve(const arm_fir_instance_q15 * S, 154 const q15_t * __restrict pSrc, 155 q15_t * __restrict pDst, uint32_t blockSize) 156 { 157 #define NBTAPS 32 158 #define NBVECTAPS (NBTAPS / 8) 159 FIR_Q15_MAIN_CORE(); 160 #undef NBVECTAPS 161 #undef NBTAPS 162 } 163 164 static void arm_fir_q15_17_24_mve(const arm_fir_instance_q15 * S, 165 const q15_t * __restrict pSrc, 166 q15_t * __restrict pDst, uint32_t blockSize) 167 { 168 #define NBTAPS 24 169 #define NBVECTAPS (NBTAPS / 8) 170 FIR_Q15_MAIN_CORE(); 171 #undef NBVECTAPS 172 #undef NBTAPS 173 } 174 175 176 static void arm_fir_q15_9_16_mve(const arm_fir_instance_q15 * S, 177 const q15_t * __restrict pSrc, 178 q15_t * __restrict pDst, uint32_t blockSize) 179 { 180 #define NBTAPS 16 181 #define NBVECTAPS (NBTAPS / 8) 182 FIR_Q15_MAIN_CORE(); 183 #undef NBVECTAPS 184 #undef NBTAPS 185 } 186 187 static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, 188 const q15_t * __restrict pSrc, 189 q15_t * __restrict pDst, uint32_t blockSize) 190 { 191 #define NBTAPS 8 192 #define NBVECTAPS (NBTAPS / 8) 193 FIR_Q15_MAIN_CORE(); 194 #undef NBVECTAPS 195 #undef NBTAPS 196 } 197 198 199 void arm_fir_q15( 200 const arm_fir_instance_q15 * S, 201 const q15_t * pSrc, 202 q15_t * pDst, 203 uint32_t blockSize) 204 { 205 q15_t *pState = S->pState; /* State pointer */ 206 const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 207 q15_t *pStateCur; /* Points to the current sample of the state */ 208 const q15_t *pSamples; /* Temporary pointer to the sample buffer */ 209 q15_t *pOutput; /* Temporary pointer to the output buffer */ 210 const q15_t *pTempSrc; /* Temporary pointer to the source data */ 211 q15_t *pTempDest; /* Temporary pointer to the destination buffer */ 212 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ 213 uint32_t blkCnt; 214 q15x8_t vecIn0; 215 uint32_t tapsBlkCnt = (numTaps + 7) / 8; 216 q63_t acc0, acc1, acc2, acc3; 217 218 219 int32_t nbTaps = (numTaps + 7) >> 3; 220 221 switch(nbTaps) { 222 223 case 1: 224 arm_fir_q15_1_8_mve(S, pSrc, pDst, blockSize); 225 return; 226 case 2: 227 arm_fir_q15_9_16_mve(S, pSrc, pDst, blockSize); 228 return; 229 case 3: 230 arm_fir_q15_17_24_mve(S, pSrc, pDst, blockSize); 231 return; 232 case 4: 233 arm_fir_q15_25_32_mve(S, pSrc, pDst, blockSize); 234 return; 235 } 236 /* 237 * pState points to state array which contains previous frame (numTaps - 1) samples 238 * pStateCur points to the location where the new input data should be written 239 */ 240 pStateCur = &(pState[(numTaps - 1u)]); 241 pTempSrc = pSrc; 242 pSamples = pState; 243 pOutput = pDst; 244 blkCnt = blockSize >> 2; 245 246 while (blkCnt > 0U) 247 { 248 const q15_t *pCoeffsTmp = pCoeffs; 249 const q15_t *pSamplesTmp = pSamples; 250 251 acc0 = 0LL; 252 acc1 = 0LL; 253 acc2 = 0LL; 254 acc3 = 0LL; 255 256 /* 257 * Save 8 input samples in the history buffer 258 */ 259 vst1q(pStateCur, vld1q(pTempSrc)); 260 pStateCur += 8; 261 pTempSrc += 8; 262 263 int i = tapsBlkCnt; 264 while (i > 0) 265 { 266 /* 267 * load 8 coefs 268 */ 269 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; 270 271 vecIn0 = vld1q(pSamplesTmp); 272 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); 273 274 vecIn0 = vld1q(&pSamplesTmp[1]); 275 acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs); 276 277 vecIn0 = vld1q(&pSamplesTmp[2]); 278 acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs); 279 280 vecIn0 = vld1q(&pSamplesTmp[3]); 281 acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs); 282 283 pSamplesTmp += 8; 284 pCoeffsTmp += 8; 285 /* 286 * Decrement the taps block loop counter 287 */ 288 i--; 289 } 290 291 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); 292 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15); 293 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15); 294 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc3, 15); 295 296 pSamples += 4; 297 /* 298 * Decrement the sample block loop counter 299 */ 300 blkCnt--; 301 } 302 303 uint32_t residual = blockSize & 3; 304 switch (residual) 305 { 306 case 3: 307 { 308 const q15_t *pCoeffsTmp = pCoeffs; 309 const q15_t *pSamplesTmp = pSamples; 310 311 acc0 = 0LL; 312 acc1 = 0LL; 313 acc2 = 0LL; 314 315 /* 316 * Save 8 input samples in the history buffer 317 */ 318 *(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc; 319 pStateCur += 8; 320 pTempSrc += 8; 321 322 int i = tapsBlkCnt; 323 while (i > 0) 324 { 325 /* 326 * load 8 coefs 327 */ 328 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; 329 330 vecIn0 = vld1q(pSamplesTmp); 331 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); 332 333 vecIn0 = vld1q(&pSamplesTmp[2]); 334 acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs); 335 336 vecIn0 = vld1q(&pSamplesTmp[4]); 337 acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs); 338 339 pSamplesTmp += 8; 340 pCoeffsTmp += 8; 341 /* 342 * Decrement the taps block loop counter 343 */ 344 i--; 345 } 346 347 acc0 = asrl(acc0, 15); 348 acc1 = asrl(acc1, 15); 349 acc2 = asrl(acc2, 15); 350 351 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); 352 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15); 353 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15); 354 } 355 break; 356 357 case 2: 358 { 359 const q15_t *pCoeffsTmp = pCoeffs; 360 const q15_t *pSamplesTmp = pSamples; 361 362 acc0 = 0LL; 363 acc1 = 0LL; 364 /* 365 * Save 8 input samples in the history buffer 366 */ 367 vst1q(pStateCur, vld1q(pTempSrc)); 368 pStateCur += 8; 369 pTempSrc += 8; 370 371 int i = tapsBlkCnt; 372 while (i > 0) 373 { 374 /* 375 * load 8 coefs 376 */ 377 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; 378 379 vecIn0 = vld1q(pSamplesTmp); 380 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); 381 382 vecIn0 = vld1q(&pSamplesTmp[2]); 383 acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs); 384 385 pSamplesTmp += 8; 386 pCoeffsTmp += 8; 387 /* 388 * Decrement the taps block loop counter 389 */ 390 i--; 391 } 392 393 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); 394 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15); 395 } 396 break; 397 398 case 1: 399 { 400 const q15_t *pCoeffsTmp = pCoeffs; 401 const q15_t *pSamplesTmp = pSamples; 402 403 acc0 = 0LL; 404 405 /* 406 * Save 8 input samples in the history buffer 407 */ 408 vst1q(pStateCur, vld1q(pTempSrc)); 409 pStateCur += 8; 410 pTempSrc += 8; 411 412 int i = tapsBlkCnt; 413 while (i > 0) 414 { 415 /* 416 * load 8 coefs 417 */ 418 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; 419 420 vecIn0 = vld1q(pSamplesTmp); 421 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); 422 423 pSamplesTmp += 8; 424 pCoeffsTmp += 8; 425 /* 426 * Decrement the taps block loop counter 427 */ 428 i--; 429 } 430 431 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); 432 } 433 break; 434 } 435 436 /* 437 * Copy the samples back into the history buffer start 438 */ 439 pTempSrc = &pState[blockSize]; 440 pTempDest = pState; 441 442 blkCnt = numTaps >> 3; 443 while (blkCnt > 0U) 444 { 445 vst1q(pTempDest, vld1q(pTempSrc)); 446 pTempSrc += 8; 447 pTempDest += 8; 448 blkCnt--; 449 } 450 blkCnt = numTaps & 7; 451 if (blkCnt > 0U) 452 { 453 mve_pred16_t p0 = vctp16q(blkCnt); 454 vstrhq_p_s16(pTempDest, vld1q(pTempSrc), p0); 455 } 456 } 457 458 #else 459 void arm_fir_q15( 460 const arm_fir_instance_q15 * S, 461 const q15_t * pSrc, 462 q15_t * pDst, 463 uint32_t blockSize) 464 { 465 q15_t *pState = S->pState; /* State pointer */ 466 const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 467 q15_t *pStateCurnt; /* Points to the current sample of the state */ 468 q15_t *px; /* Temporary pointer for state buffer */ 469 const q15_t *pb; /* Temporary pointer for coefficient buffer */ 470 q63_t acc0; /* Accumulators */ 471 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ 472 uint32_t tapCnt, blkCnt; /* Loop counters */ 473 474 #if defined (ARM_MATH_LOOPUNROLL) 475 q63_t acc1, acc2, acc3; /* Accumulators */ 476 q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */ 477 #endif 478 479 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ 480 /* pStateCurnt points to the location where the new input data should be written */ 481 pStateCurnt = &(S->pState[(numTaps - 1U)]); 482 483 #if defined (ARM_MATH_LOOPUNROLL) 484 485 /* Loop unrolling: Compute 4 output values simultaneously. 486 * The variables acc0 ... acc3 hold output values that are being computed: 487 * 488 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] 489 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] 490 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] 491 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] 492 */ 493 blkCnt = blockSize >> 2U; 494 495 while (blkCnt > 0U) 496 { 497 /* Copy 4 new input samples into the state buffer. */ 498 *pStateCurnt++ = *pSrc++; 499 *pStateCurnt++ = *pSrc++; 500 *pStateCurnt++ = *pSrc++; 501 *pStateCurnt++ = *pSrc++; 502 503 /* Set all accumulators to zero */ 504 acc0 = 0; 505 acc1 = 0; 506 acc2 = 0; 507 acc3 = 0; 508 509 /* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */ 510 px = pState; 511 512 /* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */ 513 pb = pCoeffs; 514 515 /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */ 516 x0 = read_q15x2_ia (&px); 517 518 /* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */ 519 x2 = read_q15x2_ia (&px); 520 521 /* Loop over the number of taps. Unroll by a factor of 4. 522 Repeat until we've computed numTaps-(numTaps%4) coefficients. */ 523 tapCnt = numTaps >> 2U; 524 525 while (tapCnt > 0U) 526 { 527 /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */ 528 c0 = read_q15x2_ia (&pb); 529 530 /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */ 531 acc0 = __SMLALD(x0, c0, acc0); 532 533 /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */ 534 acc2 = __SMLALD(x2, c0, acc2); 535 536 /* pack x[n-N-1] and x[n-N-2] */ 537 #ifndef ARM_MATH_BIG_ENDIAN 538 x1 = __PKHBT(x2, x0, 0); 539 #else 540 x1 = __PKHBT(x0, x2, 0); 541 #endif 542 543 /* Read state x[n-N-4], x[n-N-5] */ 544 x0 = read_q15x2_ia (&px); 545 546 /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */ 547 acc1 = __SMLALDX(x1, c0, acc1); 548 549 /* pack x[n-N-3] and x[n-N-4] */ 550 #ifndef ARM_MATH_BIG_ENDIAN 551 x1 = __PKHBT(x0, x2, 0); 552 #else 553 x1 = __PKHBT(x2, x0, 0); 554 #endif 555 556 /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */ 557 acc3 = __SMLALDX(x1, c0, acc3); 558 559 /* Read coefficients b[N-2], b[N-3] */ 560 c0 = read_q15x2_ia (&pb); 561 562 /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */ 563 acc0 = __SMLALD(x2, c0, acc0); 564 565 /* Read state x[n-N-6], x[n-N-7] with offset */ 566 x2 = read_q15x2_ia (&px); 567 568 /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */ 569 acc2 = __SMLALD(x0, c0, acc2); 570 571 /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */ 572 acc1 = __SMLALDX(x1, c0, acc1); 573 574 /* pack x[n-N-5] and x[n-N-6] */ 575 #ifndef ARM_MATH_BIG_ENDIAN 576 x1 = __PKHBT(x2, x0, 0); 577 #else 578 x1 = __PKHBT(x0, x2, 0); 579 #endif 580 581 /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */ 582 acc3 = __SMLALDX(x1, c0, acc3); 583 584 /* Decrement tap count */ 585 tapCnt--; 586 } 587 588 /* If the filter length is not a multiple of 4, compute the remaining filter taps. 589 This is always be 2 taps since the filter length is even. */ 590 if ((numTaps & 0x3U) != 0U) 591 { 592 /* Read last two coefficients */ 593 c0 = read_q15x2_ia (&pb); 594 595 /* Perform the multiply-accumulates */ 596 acc0 = __SMLALD(x0, c0, acc0); 597 acc2 = __SMLALD(x2, c0, acc2); 598 599 /* pack state variables */ 600 #ifndef ARM_MATH_BIG_ENDIAN 601 x1 = __PKHBT(x2, x0, 0); 602 #else 603 x1 = __PKHBT(x0, x2, 0); 604 #endif 605 606 /* Read last state variables */ 607 x0 = read_q15x2 (px); 608 609 /* Perform the multiply-accumulates */ 610 acc1 = __SMLALDX(x1, c0, acc1); 611 612 /* pack state variables */ 613 #ifndef ARM_MATH_BIG_ENDIAN 614 x1 = __PKHBT(x0, x2, 0); 615 #else 616 x1 = __PKHBT(x2, x0, 0); 617 #endif 618 619 /* Perform the multiply-accumulates */ 620 acc3 = __SMLALDX(x1, c0, acc3); 621 } 622 623 /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation. 624 Then store the 4 outputs in the destination buffer. */ 625 #ifndef ARM_MATH_BIG_ENDIAN 626 write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16)); 627 write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16)); 628 #else 629 write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16)); 630 write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16)); 631 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 632 633 /* Advance the state pointer by 4 to process the next group of 4 samples */ 634 pState = pState + 4U; 635 636 /* Decrement loop counter */ 637 blkCnt--; 638 } 639 640 /* Loop unrolling: Compute remaining output samples */ 641 blkCnt = blockSize % 0x4U; 642 643 #else 644 645 /* Initialize blkCnt with number of taps */ 646 blkCnt = blockSize; 647 648 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 649 650 while (blkCnt > 0U) 651 { 652 /* Copy two samples into state buffer */ 653 *pStateCurnt++ = *pSrc++; 654 655 /* Set the accumulator to zero */ 656 acc0 = 0; 657 658 /* Use SIMD to hold states and coefficients */ 659 px = pState; 660 pb = pCoeffs; 661 662 tapCnt = numTaps >> 1U; 663 664 while (tapCnt > 0U) 665 { 666 acc0 += (q31_t) *px++ * *pb++; 667 acc0 += (q31_t) *px++ * *pb++; 668 669 tapCnt--; 670 } 671 672 673 /* The result is in 2.30 format. Convert to 1.15 with saturation. 674 Then store the output in the destination buffer. */ 675 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 676 677 /* Advance state pointer by 1 for the next sample */ 678 pState = pState + 1U; 679 680 /* Decrement loop counter */ 681 blkCnt--; 682 } 683 684 /* Processing is complete. 685 Now copy the last numTaps - 1 samples to the start of the state buffer. 686 This prepares the state buffer for the next function call. */ 687 688 /* Points to the start of the state buffer */ 689 pStateCurnt = S->pState; 690 691 #if defined (ARM_MATH_LOOPUNROLL) 692 693 /* Loop unrolling: Compute 4 taps at a time */ 694 tapCnt = (numTaps - 1U) >> 2U; 695 696 /* Copy data */ 697 while (tapCnt > 0U) 698 { 699 *pStateCurnt++ = *pState++; 700 *pStateCurnt++ = *pState++; 701 *pStateCurnt++ = *pState++; 702 *pStateCurnt++ = *pState++; 703 704 /* Decrement loop counter */ 705 tapCnt--; 706 } 707 708 /* Calculate remaining number of copies */ 709 tapCnt = (numTaps - 1U) % 0x4U; 710 711 #else 712 713 /* Initialize tapCnt with number of taps */ 714 tapCnt = (numTaps - 1U); 715 716 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 717 718 /* Copy remaining data */ 719 while (tapCnt > 0U) 720 { 721 *pStateCurnt++ = *pState++; 722 723 /* Decrement loop counter */ 724 tapCnt--; 725 } 726 727 } 728 #endif /* defined(ARM_MATH_MVEI) */ 729 730 /** 731 @} end of FIR group 732 */