arm_fir_q31.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_fir_q31.c 4 * Description: Q31 FIR filter processing function 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/filtering_functions.h" 30 31 32 /** 33 @ingroup groupFilters 34 */ 35 36 /** 37 @addtogroup FIR 38 @{ 39 */ 40 41 /** 42 @brief Processing function for Q31 FIR filter. 43 @param[in] S points to an instance of the Q31 FIR filter structure 44 @param[in] pSrc points to the block of input data 45 @param[out] pDst points to the block of output data 46 @param[in] blockSize number of samples to process 47 @return none 48 49 @par Scaling and Overflow Behavior 50 The function is implemented using an internal 64-bit accumulator. 51 The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. 52 Thus, if the accumulator result overflows it wraps around rather than clip. 53 In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits. 54 After all multiply-accumulates are performed, the 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result. 55 56 @remark 57 Refer to \ref arm_fir_fast_q31() for a faster but less precise implementation of this filter. 58 */ 59 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) 60 61 #include "arm_helium_utils.h" 62 63 64 #define FIR_Q31_CORE(nbAcc, nbVecTaps, pSample, vecCoeffs) \ 65 for (int j = 0; j < nbAcc; j++) { \ 66 const q31_t *pSmp = &pSamples[j]; \ 67 q31x4_t vecIn0; \ 68 q63_t acc[4]; \ 69 \ 70 acc[j] = 0; \ 71 for (int i = 0; i < nbVecTaps; i++) { \ 72 vecIn0 = vld1q(pSmp + 4 * i); \ 73 acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]); \ 74 } \ 75 *pOutput++ = (q31_t)asrl(acc[j], 23); \ 76 } 77 78 79 #define FIR_Q31_CORE_STR_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs) \ 80 for (int j = 0; j < nbAcc; j++) { \ 81 const q31_t *pSmp = &pSamples[j]; \ 82 q31x4_t vecIn0; \ 83 \ 84 acc[j] = 0; \ 85 for (int i = 0; i < nbVecTaps; i++) { \ 86 vecIn0 = vld1q(pSmp + 4 * i); \ 87 acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]); \ 88 } \ 89 *arm_fir_partial_accu_ptr++ = acc[j]; \ 90 } 91 92 93 #define FIR_Q31_CORE_LD_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs) \ 94 for (int j = 0; j < nbAcc; j++) { \ 95 const q31_t *pSmp = &pSamples[j]; \ 96 q31x4_t vecIn0; \ 97 \ 98 acc[j] = *arm_fir_partial_accu_ptr++; \ 99 \ 100 for (int i = 0; i < nbVecTaps; i++) { \ 101 vecIn0 = vld1q(pSmp + 4 * i); \ 102 acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]); \ 103 } \ 104 *pOutput++ = (q31_t)asrl(acc[j], 23); \ 105 } 106 107 108 #define FIR_Q31_MAIN_CORE() \ 109 { \ 110 q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4); \ 111 q31_t *pState = pRefStatePtr; /* State pointer */ \ 112 const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ \ 113 q31_t *pStateCur; /* Points to the current sample of the state */ \ 114 const q31_t *pSamples; /* Temporary pointer to the sample buffer */ \ 115 q31_t *pOutput; /* Temporary pointer to the output buffer */ \ 116 const q31_t *pTempSrc; /* Temporary pointer to the source data */ \ 117 q31_t *pTempDest; /* Temporary pointer to the destination buffer */\ 118 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */\ 119 int32_t blkCnt; \ 120 \ 121 /* \ 122 * load coefs \ 123 */ \ 124 q31x4_t vecCoeffs[NBVECTAPS]; \ 125 \ 126 for (int i = 0; i < NBVECTAPS; i++) \ 127 vecCoeffs[i] = vld1q(pCoeffs + 4 * i); \ 128 \ 129 /* \ 130 * pState points to state array which contains previous frame (numTaps - 1) samples \ 131 * pStateCur points to the location where the new input data should be written \ 132 */ \ 133 pStateCur = &(pState[(numTaps - 1u)]); \ 134 pTempSrc = pSrc; \ 135 pSamples = pState; \ 136 pOutput = pDst; \ 137 \ 138 blkCnt = blockSize >> 2; \ 139 while (blkCnt > 0) { \ 140 /* \ 141 * Save 4 input samples in the history buffer \ 142 */ \ 143 vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc)); \ 144 pStateCur += 4; \ 145 pTempSrc += 4; \ 146 \ 147 FIR_Q31_CORE(4, NBVECTAPS, pSamples, vecCoeffs); \ 148 \ 149 pSamples += 4; \ 150 /* \ 151 * Decrement the sample block loop counter \ 152 */ \ 153 blkCnt--; \ 154 } \ 155 \ 156 /* tail */ \ 157 int32_t residual = blockSize & 3; \ 158 switch (residual) { \ 159 case 3: \ 160 { \ 161 for (int i = 0; i < residual; i++) \ 162 *pStateCur++ = *pTempSrc++; \ 163 \ 164 FIR_Q31_CORE(3, NBVECTAPS, pSamples, vecCoeffs); \ 165 } \ 166 break; \ 167 \ 168 case 2: \ 169 { \ 170 for (int i = 0; i < residual; i++) \ 171 *pStateCur++ = *pTempSrc++; \ 172 \ 173 FIR_Q31_CORE(2, NBVECTAPS, pSamples, vecCoeffs); \ 174 } \ 175 break; \ 176 \ 177 case 1: \ 178 { \ 179 for (int i = 0; i < residual; i++) \ 180 *pStateCur++ = *pTempSrc++; \ 181 \ 182 FIR_Q31_CORE(1, NBVECTAPS, pSamples, vecCoeffs); \ 183 } \ 184 break; \ 185 } \ 186 \ 187 /* \ 188 * Copy the samples back into the history buffer start \ 189 */ \ 190 pTempSrc = &pState[blockSize]; \ 191 pTempDest = pState; \ 192 \ 193 blkCnt =(numTaps - 1) >> 2; \ 194 while (blkCnt > 0) \ 195 { \ 196 vstrwq_s32(pTempDest, vldrwq_s32(pTempSrc)); \ 197 pTempSrc += 4; \ 198 pTempDest += 4; \ 199 blkCnt--; \ 200 } \ 201 blkCnt = (numTaps - 1) & 3; \ 202 if (blkCnt > 0) \ 203 { \ 204 mve_pred16_t p0 = vctp32q(blkCnt); \ 205 vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p0), p0); \ 206 } \ 207 } 208 209 static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, 210 const q31_t * __restrict pSrc, 211 q31_t * __restrict pDst, uint32_t blockSize) 212 { 213 q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4); 214 q31_t *pState = pRefStatePtr; /* State pointer */ 215 const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 216 q31_t *pStateCur; /* Points to the current sample of the state */ 217 const q31_t *pSamples; /* Temporary pointer to the sample buffer */ 218 q31_t *pOutput; /* Temporary pointer to the output buffer */ 219 const q31_t *pTempSrc; /* Temporary pointer to the source data */ 220 q31_t *pTempDest; /* Temporary pointer to the destination buffer */ 221 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ 222 uint32_t blkCnt; 223 q31x4_t vecIn0; 224 225 226 /* 227 * pState points to state array which contains previous frame (numTaps - 1) samples 228 * pStateCur points to the location where the new input data should be written 229 */ 230 pStateCur = &(pState[(numTaps - 1u)]); 231 pTempSrc = pSrc; 232 pSamples = pState; 233 pOutput = pDst; 234 235 q63_t acc0=0, acc1=0, acc2=0, acc3=0; 236 /* 237 * load 4 coefs 238 */ 239 q31x4_t vecCoeffs = *(q31x4_t *) pCoeffs; 240 241 blkCnt = blockSize >> 2; 242 while (blkCnt > 0U) 243 { 244 const q31_t *pSamplesTmp = pSamples; 245 246 /* 247 * Save 4 input samples in the history buffer 248 */ 249 vst1q(pStateCur, vld1q(pTempSrc)); 250 pStateCur += 4; 251 pTempSrc += 4; 252 253 vecIn0 = vld1q(pSamplesTmp); 254 acc0 = vrmlaldavhq(vecIn0, vecCoeffs); 255 256 vecIn0 = vld1q(&pSamplesTmp[1]); 257 acc1 = vrmlaldavhq(vecIn0, vecCoeffs); 258 259 vecIn0 = vld1q(&pSamplesTmp[2]); 260 acc2 = vrmlaldavhq(vecIn0, vecCoeffs); 261 262 vecIn0 = vld1q(&pSamplesTmp[3]); 263 acc3 = vrmlaldavhq(vecIn0, vecCoeffs); 264 265 acc0 = asrl(acc0, 23); 266 acc1 = asrl(acc1, 23); 267 acc2 = asrl(acc2, 23); 268 acc3 = asrl(acc3, 23); 269 270 *pOutput++ = (q31_t) acc0; 271 *pOutput++ = (q31_t) acc1; 272 *pOutput++ = (q31_t) acc2; 273 *pOutput++ = (q31_t) acc3; 274 275 pSamples += 4; 276 /* 277 * Decrement the sample block loop counter 278 */ 279 blkCnt--; 280 } 281 282 uint32_t residual = blockSize & 3; 283 switch (residual) 284 { 285 case 3: 286 { 287 /* 288 * Save 4 input samples in the history buffer 289 */ 290 *(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc; 291 pStateCur += 4; 292 pTempSrc += 4; 293 294 vecIn0 = vld1q(pSamples); 295 acc0 = vrmlaldavhq(vecIn0, vecCoeffs); 296 297 vecIn0 = vld1q(&pSamples[1]); 298 acc1 = vrmlaldavhq(vecIn0, vecCoeffs); 299 300 vecIn0 = vld1q(&pSamples[2]); 301 acc2 = vrmlaldavhq(vecIn0, vecCoeffs); 302 303 acc0 = asrl(acc0, 23); 304 acc1 = asrl(acc1, 23); 305 acc2 = asrl(acc2, 23); 306 307 *pOutput++ = (q31_t) acc0; 308 *pOutput++ = (q31_t) acc1; 309 *pOutput++ = (q31_t) acc2; 310 } 311 break; 312 313 case 2: 314 { 315 /* 316 * Save 4 input samples in the history buffer 317 */ 318 vst1q(pStateCur, vld1q(pTempSrc)); 319 pStateCur += 4; 320 pTempSrc += 4; 321 322 vecIn0 = vld1q(pSamples); 323 acc0 = vrmlaldavhq(vecIn0, vecCoeffs); 324 325 vecIn0 = vld1q(&pSamples[1]); 326 acc1 = vrmlaldavhq(vecIn0, vecCoeffs); 327 328 acc0 = asrl(acc0, 23); 329 acc1 = asrl(acc1, 23); 330 331 *pOutput++ = (q31_t) acc0; 332 *pOutput++ = (q31_t) acc1; 333 } 334 break; 335 336 case 1: 337 { 338 /* 339 * Save 4 input samples in the history buffer 340 */ 341 vst1q(pStateCur, vld1q(pTempSrc)); 342 pStateCur += 4; 343 pTempSrc += 4; 344 345 vecIn0 = vld1q(pSamples); 346 acc0 = vrmlaldavhq(vecIn0, vecCoeffs); 347 348 acc0 = asrl(acc0, 23); 349 350 *pOutput++ = (q31_t) acc0; 351 } 352 break; 353 } 354 355 /* 356 * Copy the samples back into the history buffer start 357 */ 358 pTempSrc = &pState[blockSize]; 359 pTempDest = pState; 360 361 blkCnt = (numTaps-1) >> 2; 362 while (blkCnt > 0U) 363 { 364 vst1q(pTempDest, vld1q(pTempSrc)); 365 pTempSrc += 4; 366 pTempDest += 4; 367 blkCnt--; 368 } 369 blkCnt = (numTaps-1) & 3; 370 if (blkCnt > 0U) 371 { 372 mve_pred16_t p0 = vctp32q(blkCnt); 373 vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0); 374 } 375 } 376 377 378 379 static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, 380 const q31_t * __restrict pSrc, 381 q31_t * __restrict pDst, uint32_t blockSize) 382 { 383 #define NBTAPS 8 384 #define NBVECTAPS (NBTAPS / 4) 385 FIR_Q31_MAIN_CORE(); 386 #undef NBVECTAPS 387 #undef NBTAPS 388 } 389 390 391 static void arm_fir_q31_9_12_mve(const arm_fir_instance_q31 * S, 392 const q31_t * __restrict pSrc, 393 q31_t * __restrict pDst, uint32_t blockSize) 394 { 395 #define NBTAPS 12 396 #define NBVECTAPS (NBTAPS / 4) 397 FIR_Q31_MAIN_CORE(); 398 #undef NBVECTAPS 399 #undef NBTAPS 400 } 401 402 403 static void arm_fir_q31_13_16_mve(const arm_fir_instance_q31 * S, 404 const q31_t * __restrict pSrc, 405 q31_t * __restrict pDst, uint32_t blockSize) 406 { 407 #define NBTAPS 16 408 #define NBVECTAPS (NBTAPS / 4) 409 FIR_Q31_MAIN_CORE(); 410 #undef NBVECTAPS 411 #undef NBTAPS 412 } 413 414 415 static void arm_fir_q31_17_20_mve(const arm_fir_instance_q31 * S, 416 const q31_t * __restrict pSrc, 417 q31_t * __restrict pDst, uint32_t blockSize) 418 { 419 #define NBTAPS 20 420 #define NBVECTAPS (NBTAPS / 4) 421 FIR_Q31_MAIN_CORE(); 422 #undef NBVECTAPS 423 #undef NBTAPS 424 } 425 426 427 static void arm_fir_q31_21_24_mve(const arm_fir_instance_q31 * S, 428 const q31_t * __restrict pSrc, 429 q31_t * __restrict pDst, uint32_t blockSize) 430 { 431 #define NBTAPS 24 432 #define NBVECTAPS (NBTAPS / 4) 433 FIR_Q31_MAIN_CORE(); 434 #undef NBVECTAPS 435 #undef NBTAPS 436 } 437 438 439 static void arm_fir_q31_25_28_mve(const arm_fir_instance_q31 * S, 440 const q31_t * __restrict pSrc, 441 q31_t * __restrict pDst, uint32_t blockSize) 442 { 443 #define NBTAPS 28 444 #define NBVECTAPS (NBTAPS / 4) 445 FIR_Q31_MAIN_CORE(); 446 #undef NBVECTAPS 447 #undef NBTAPS 448 } 449 450 static void arm_fir_q31_29_32_mve(const arm_fir_instance_q31 * S, 451 const q31_t * __restrict pSrc, 452 q31_t * __restrict pDst, 453 uint32_t blockSize) 454 { 455 q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4); 456 q31_t *pState = pRefStatePtr; /* State pointer */ 457 const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 458 q31_t *pStateCur; /* Points to the current sample of the state */ 459 const q31_t *pSamples; /* Temporary pointer to the sample buffer */ 460 q31_t *pOutput; /* Temporary pointer to the output buffer */ 461 const q31_t *pTempSrc; /* Temporary pointer to the source data */ 462 q31_t *pTempDest; /* Temporary pointer to the destination buffer */ 463 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ 464 int32_t blkCnt; 465 q63_t acc0, acc1, acc2, acc3; 466 467 #define MAX_VECT_BATCH 7 468 469 /* 470 * pre-load 28 1st coefs 471 */ 472 q31x4_t vecCoeffs0 = vld1q(pCoeffs + 4 * 0); 473 q31x4_t vecCoeffs1 = vld1q(pCoeffs + 4 * 1); 474 q31x4_t vecCoeffs2 = vld1q(pCoeffs + 4 * 2); 475 q31x4_t vecCoeffs3 = vld1q(pCoeffs + 4 * 3); 476 q31x4_t vecCoeffs4 = vld1q(pCoeffs + 4 * 4); 477 q31x4_t vecCoeffs5 = vld1q(pCoeffs + 4 * 5); 478 q31x4_t vecCoeffs6 = vld1q(pCoeffs + 4 * 6); 479 480 /* 481 * pState points to state array which contains previous frame (numTaps - 1) samples 482 * pStateCur points to the location where the new input data should be written 483 */ 484 pStateCur = &(pState[(numTaps - 1u)]); 485 pTempSrc = pSrc; 486 pSamples = pState; 487 488 q63_t *arm_fir_partial_accu_ptr = (q63_t*)S->pState; 489 490 blkCnt = blockSize >> 2; 491 while (blkCnt > 0) { 492 /* 493 * Save 4 input samples in the history buffer 494 */ 495 vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc)); 496 pStateCur += 4; 497 pTempSrc += 4; 498 499 const q31_t *pSmp; 500 q31x4_t vecIn0; 501 502 pSmp = &pSamples[0]; 503 504 vecIn0 = vld1q(pSmp); 505 acc0 = vrmlaldavhq(vecIn0, vecCoeffs0); 506 vecIn0 = vld1q(pSmp + 4 * 1); 507 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs1); 508 vecIn0 = vld1q(pSmp + 4 * 2); 509 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs2); 510 vecIn0 = vld1q(pSmp + 4 * 3); 511 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs3); 512 vecIn0 = vld1q(pSmp + 4 * 4); 513 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs4); 514 vecIn0 = vld1q(pSmp + 4 * 5); 515 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5); 516 vecIn0 = vld1q(pSmp + 4 * 6); 517 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs6); 518 519 *arm_fir_partial_accu_ptr++ = acc0; 520 521 pSmp = &pSamples[1]; 522 523 vecIn0 = vld1q(pSmp); 524 acc1 = vrmlaldavhq(vecIn0, vecCoeffs0); 525 vecIn0 = vld1q(pSmp + 4 * 1); 526 acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs1); 527 vecIn0 = vld1q(pSmp + 4 * 2); 528 acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs2); 529 vecIn0 = vld1q(pSmp + 4 * 3); 530 acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs3); 531 vecIn0 = vld1q(pSmp + 4 * 4); 532 acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs4); 533 vecIn0 = vld1q(pSmp + 4 * 5); 534 acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5); 535 vecIn0 = vld1q(pSmp + 4 * 6); 536 acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs6); 537 538 *arm_fir_partial_accu_ptr++ = acc1; 539 540 pSmp = &pSamples[2]; 541 542 vecIn0 = vld1q(pSmp); 543 acc2 = vrmlaldavhq(vecIn0, vecCoeffs0); 544 vecIn0 = vld1q(pSmp + 4 * 1); 545 acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs1); 546 vecIn0 = vld1q(pSmp + 4 * 2); 547 acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs2); 548 vecIn0 = vld1q(pSmp + 4 * 3); 549 acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs3); 550 vecIn0 = vld1q(pSmp + 4 * 4); 551 acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs4); 552 vecIn0 = vld1q(pSmp + 4 * 5); 553 acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5); 554 vecIn0 = vld1q(pSmp + 4 * 6); 555 acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs6); 556 *arm_fir_partial_accu_ptr++ = acc2; 557 558 pSmp = &pSamples[3]; 559 560 vecIn0 = vld1q(pSmp); 561 acc3 = vrmlaldavhq(vecIn0, vecCoeffs0); 562 vecIn0 = vld1q(pSmp + 4 * 1); 563 acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs1); 564 vecIn0 = vld1q(pSmp + 4 * 2); 565 acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs2); 566 vecIn0 = vld1q(pSmp + 4 * 3); 567 acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs3); 568 vecIn0 = vld1q(pSmp + 4 * 4); 569 acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs4); 570 vecIn0 = vld1q(pSmp + 4 * 5); 571 acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs5); 572 vecIn0 = vld1q(pSmp + 4 * 6); 573 acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs6); 574 575 *arm_fir_partial_accu_ptr++ = acc3; 576 577 pSamples += 4; 578 /* 579 * Decrement the sample block loop counter 580 */ 581 blkCnt--; 582 } 583 584 585 /* reminder */ 586 587 /* load last 4 coef */ 588 vecCoeffs0 = vld1q(pCoeffs + 4 * MAX_VECT_BATCH); 589 arm_fir_partial_accu_ptr = (q63_t*)S->pState; 590 pOutput = pDst; 591 pSamples = pState + (MAX_VECT_BATCH * 4); 592 593 594 blkCnt = blockSize >> 2; 595 while (blkCnt > 0) { 596 q31x4_t vecIn0; 597 598 /* reload intermediate MAC */ 599 acc0 = *arm_fir_partial_accu_ptr++; 600 acc1 = *arm_fir_partial_accu_ptr++; 601 acc2 = *arm_fir_partial_accu_ptr++; 602 acc3 = *arm_fir_partial_accu_ptr++; 603 604 605 vecIn0 = vld1q(&pSamples[0]); 606 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs0); 607 608 vecIn0 = vld1q(&pSamples[1]); 609 acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs0); 610 611 vecIn0 = vld1q(&pSamples[2]); 612 acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs0); 613 614 vecIn0 = vld1q(&pSamples[3]); 615 acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs0); 616 617 *pOutput++ = asrl(acc0, 23); 618 *pOutput++ = asrl(acc1, 23); 619 *pOutput++ = asrl(acc2, 23); 620 *pOutput++ = asrl(acc3, 23); 621 622 pSamples += 4; 623 /* 624 * Decrement the sample block loop counter 625 */ 626 blkCnt--; 627 } 628 629 /* 630 * Copy the samples back into the history buffer start 631 */ 632 pTempSrc = &pState[blockSize]; 633 pTempDest = pState; 634 635 blkCnt = numTaps - 1; 636 do { 637 mve_pred16_t p = vctp32q(blkCnt); 638 639 vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p), p); 640 pTempSrc += 4; 641 pTempDest += 4; 642 blkCnt -= 4; 643 } 644 while (blkCnt > 0); 645 } 646 647 648 649 void arm_fir_q31( 650 const arm_fir_instance_q31 * S, 651 const q31_t * pSrc, 652 q31_t * pDst, 653 uint32_t blockSize) 654 { 655 q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4); 656 q31_t *pState = pRefStatePtr; /* State pointer */ 657 const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 658 q31_t *pStateCur; /* Points to the current sample of the state */ 659 const q31_t *pSamples; /* Temporary pointer to the sample buffer */ 660 q31_t *pOutput; /* Temporary pointer to the output buffer */ 661 const q31_t *pTempSrc; /* Temporary pointer to the source data */ 662 q31_t *pTempDest; /* Temporary pointer to the destination buffer */ 663 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ 664 uint32_t blkCnt; 665 q31x4_t vecIn0; 666 uint32_t tapsBlkCnt = (numTaps + 3) / 4; 667 q63_t acc0, acc1, acc2, acc3; 668 q31x4_t vecCoeffs; 669 670 671 /* 672 * [1 to 32 taps] specialized routines 673 */ 674 if (numTaps <= 4) 675 { 676 arm_fir_q31_1_4_mve(S, pSrc, pDst, blockSize); 677 return; 678 } 679 else if (numTaps <= 8) 680 { 681 arm_fir_q31_5_8_mve(S, pSrc, pDst, blockSize); 682 return; 683 } 684 else if (numTaps <= 12) 685 { 686 arm_fir_q31_9_12_mve(S, pSrc, pDst, blockSize); 687 return; 688 } 689 else if (numTaps <= 16) 690 { 691 arm_fir_q31_13_16_mve(S, pSrc, pDst, blockSize); 692 return; 693 } 694 else if (numTaps <= 20) 695 { 696 arm_fir_q31_17_20_mve(S, pSrc, pDst, blockSize); 697 return; 698 } 699 else if (numTaps <= 24) 700 { 701 arm_fir_q31_21_24_mve(S, pSrc, pDst, blockSize); 702 return; 703 } 704 else if (numTaps <= 28) 705 { 706 arm_fir_q31_25_28_mve(S, pSrc, pDst, blockSize); 707 return; 708 } 709 else if ((numTaps <= 32) && (blockSize >= 32)) 710 { 711 arm_fir_q31_29_32_mve(S, pSrc, pDst, blockSize); 712 return; 713 } 714 715 /* 716 * pState points to state array which contains previous frame (numTaps - 1) samples 717 * pStateCur points to the location where the new input data should be written 718 */ 719 pStateCur = &(pState[(numTaps - 1u)]); 720 pSamples = pState; 721 pTempSrc = pSrc; 722 pOutput = pDst; 723 blkCnt = blockSize >> 2; 724 while (blkCnt > 0) 725 { 726 const q31_t *pCoeffsTmp = pCoeffs; 727 const q31_t *pSamplesTmp = pSamples; 728 729 acc0 = 0LL; 730 acc1 = 0LL; 731 acc2 = 0LL; 732 acc3 = 0LL; 733 734 /* 735 * Save 4 input samples in the history buffer 736 */ 737 vst1q(pStateCur, vld1q(pTempSrc)); 738 pStateCur += 4; 739 pTempSrc += 4; 740 741 int i = tapsBlkCnt; 742 while (i > 0) 743 { 744 /* 745 * load 4 coefs 746 */ 747 vecCoeffs = *(q31x4_t *) pCoeffsTmp; 748 749 vecIn0 = vld1q(pSamplesTmp); 750 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs); 751 752 vecIn0 = vld1q(&pSamplesTmp[1]); 753 acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs); 754 755 vecIn0 = vld1q(&pSamplesTmp[2]); 756 acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs); 757 758 vecIn0 = vld1q(&pSamplesTmp[3]); 759 acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs); 760 761 pSamplesTmp += 4; 762 pCoeffsTmp += 4; 763 /* 764 * Decrement the taps block loop counter 765 */ 766 i--; 767 } 768 769 /* .54-> .31 conversion and store accumulators */ 770 acc0 = asrl(acc0, 23); 771 acc1 = asrl(acc1, 23); 772 acc2 = asrl(acc2, 23); 773 acc3 = asrl(acc3, 23); 774 775 *pOutput++ = (q31_t) acc0; 776 *pOutput++ = (q31_t) acc1; 777 *pOutput++ = (q31_t) acc2; 778 *pOutput++ = (q31_t) acc3; 779 780 pSamples += 4; 781 782 /* 783 * Decrement the sample block loop counter 784 */ 785 blkCnt--; 786 } 787 788 int32_t residual = blockSize & 3; 789 switch (residual) 790 { 791 case 3: 792 { 793 const q31_t *pCoeffsTmp = pCoeffs; 794 const q31_t *pSamplesTmp = pSamples; 795 796 acc0 = 0LL; 797 acc1 = 0LL; 798 acc2 = 0LL; 799 800 /* 801 * Save 4 input samples in the history buffer 802 */ 803 *(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc; 804 pStateCur += 4; 805 pTempSrc += 4; 806 807 int i = tapsBlkCnt; 808 while (i > 0) 809 { 810 vecCoeffs = *(q31x4_t *) pCoeffsTmp; 811 812 vecIn0 = vld1q(pSamplesTmp); 813 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs); 814 815 vecIn0 = vld1q(&pSamplesTmp[1]); 816 acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs); 817 818 vecIn0 = vld1q(&pSamplesTmp[2]); 819 acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs); 820 821 pSamplesTmp += 4; 822 pCoeffsTmp += 4; 823 i--; 824 } 825 826 acc0 = asrl(acc0, 23); 827 acc1 = asrl(acc1, 23); 828 acc2 = asrl(acc2, 23); 829 830 *pOutput++ = (q31_t) acc0; 831 *pOutput++ = (q31_t) acc1; 832 *pOutput++ = (q31_t) acc2; 833 } 834 break; 835 836 case 2: 837 { 838 const q31_t *pCoeffsTmp = pCoeffs; 839 const q31_t *pSamplesTmp = pSamples; 840 841 acc0 = 0LL; 842 acc1 = 0LL; 843 844 /* 845 * Save 4 input samples in the history buffer 846 */ 847 vst1q(pStateCur, vld1q(pTempSrc)); 848 pStateCur += 4; 849 pTempSrc += 4; 850 851 int i = tapsBlkCnt; 852 while (i > 0) 853 { 854 vecCoeffs = *(q31x4_t *) pCoeffsTmp; 855 856 vecIn0 = vld1q(pSamplesTmp); 857 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs); 858 859 vecIn0 = vld1q(&pSamplesTmp[1]); 860 acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs); 861 862 pSamplesTmp += 4; 863 pCoeffsTmp += 4; 864 i--; 865 } 866 867 acc0 = asrl(acc0, 23); 868 acc1 = asrl(acc1, 23); 869 870 *pOutput++ = (q31_t) acc0; 871 *pOutput++ = (q31_t) acc1; 872 } 873 break; 874 875 case 1: 876 { 877 const q31_t *pCoeffsTmp = pCoeffs; 878 const q31_t *pSamplesTmp = pSamples; 879 880 acc0 = 0LL; 881 882 /* 883 * Save 4 input samples in the history buffer 884 */ 885 vst1q(pStateCur, vld1q(pTempSrc)); 886 pStateCur += 4; 887 pTempSrc += 4; 888 889 int i = tapsBlkCnt; 890 while (i > 0) 891 { 892 vecCoeffs = *(q31x4_t *) pCoeffsTmp; 893 894 vecIn0 = vld1q(pSamplesTmp); 895 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs); 896 897 pSamplesTmp += 4; 898 pCoeffsTmp += 4; 899 i--; 900 } 901 902 acc0 = asrl(acc0, 23); 903 904 *pOutput++ = (q31_t) acc0; 905 } 906 break; 907 } 908 909 /* 910 * Copy the samples back into the history buffer start 911 */ 912 pTempSrc = &pState[blockSize]; 913 pTempDest = pState; 914 915 blkCnt = (numTaps - 1U) >> 2; 916 while (blkCnt > 0) 917 { 918 vst1q(pTempDest, vld1q(pTempSrc)); 919 pTempSrc += 4; 920 pTempDest += 4; 921 blkCnt--; 922 } 923 blkCnt = (numTaps - 1U) & 3; 924 if (blkCnt > 0) 925 { 926 mve_pred16_t p0 = vctp32q(blkCnt); 927 vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0); 928 } 929 } 930 931 #else 932 void arm_fir_q31( 933 const arm_fir_instance_q31 * S, 934 const q31_t * pSrc, 935 q31_t * pDst, 936 uint32_t blockSize) 937 { 938 q31_t *pState = S->pState; /* State pointer */ 939 const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 940 q31_t *pStateCurnt; /* Points to the current sample of the state */ 941 q31_t *px; /* Temporary pointer for state buffer */ 942 const q31_t *pb; /* Temporary pointer for coefficient buffer */ 943 q63_t acc0; /* Accumulator */ 944 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ 945 uint32_t i, tapCnt, blkCnt; /* Loop counters */ 946 947 #if defined (ARM_MATH_LOOPUNROLL) 948 q63_t acc1, acc2; /* Accumulators */ 949 q31_t x0, x1, x2; /* Temporary variables to hold state values */ 950 q31_t c0; /* Temporary variable to hold coefficient value */ 951 #endif 952 953 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ 954 /* pStateCurnt points to the location where the new input data should be written */ 955 pStateCurnt = &(S->pState[(numTaps - 1U)]); 956 957 #if defined (ARM_MATH_LOOPUNROLL) 958 959 /* Loop unrolling: Compute 4 output values simultaneously. 960 * The variables acc0 ... acc3 hold output values that are being computed: 961 * 962 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] 963 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] 964 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] 965 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] 966 */ 967 968 blkCnt = blockSize / 3; 969 970 while (blkCnt > 0U) 971 { 972 /* Copy 3 new input samples into the state buffer. */ 973 *pStateCurnt++ = *pSrc++; 974 *pStateCurnt++ = *pSrc++; 975 *pStateCurnt++ = *pSrc++; 976 977 /* Set all accumulators to zero */ 978 acc0 = 0; 979 acc1 = 0; 980 acc2 = 0; 981 982 /* Initialize state pointer */ 983 px = pState; 984 985 /* Initialize coefficient pointer */ 986 pb = pCoeffs; 987 988 /* Read the first 2 samples from the state buffer: x[n-numTaps], x[n-numTaps-1] */ 989 x0 = *px++; 990 x1 = *px++; 991 992 /* Loop unrolling: process 3 taps at a time. */ 993 tapCnt = numTaps / 3; 994 995 while (tapCnt > 0U) 996 { 997 /* Read the b[numTaps] coefficient */ 998 c0 = *pb; 999 1000 /* Read x[n-numTaps-2] sample */ 1001 x2 = *(px++); 1002 1003 /* Perform the multiply-accumulates */ 1004 acc0 += ((q63_t) x0 * c0); 1005 acc1 += ((q63_t) x1 * c0); 1006 acc2 += ((q63_t) x2 * c0); 1007 1008 /* Read the coefficient and state */ 1009 c0 = *(pb + 1U); 1010 x0 = *(px++); 1011 1012 /* Perform the multiply-accumulates */ 1013 acc0 += ((q63_t) x1 * c0); 1014 acc1 += ((q63_t) x2 * c0); 1015 acc2 += ((q63_t) x0 * c0); 1016 1017 /* Read the coefficient and state */ 1018 c0 = *(pb + 2U); 1019 x1 = *(px++); 1020 1021 /* update coefficient pointer */ 1022 pb += 3U; 1023 1024 /* Perform the multiply-accumulates */ 1025 acc0 += ((q63_t) x2 * c0); 1026 acc1 += ((q63_t) x0 * c0); 1027 acc2 += ((q63_t) x1 * c0); 1028 1029 /* Decrement loop counter */ 1030 tapCnt--; 1031 } 1032 1033 /* Loop unrolling: Compute remaining outputs */ 1034 tapCnt = numTaps % 0x3U; 1035 1036 while (tapCnt > 0U) 1037 { 1038 /* Read coefficients */ 1039 c0 = *(pb++); 1040 1041 /* Fetch 1 state variable */ 1042 x2 = *(px++); 1043 1044 /* Perform the multiply-accumulates */ 1045 acc0 += ((q63_t) x0 * c0); 1046 acc1 += ((q63_t) x1 * c0); 1047 acc2 += ((q63_t) x2 * c0); 1048 1049 /* Reuse the present sample states for next sample */ 1050 x0 = x1; 1051 x1 = x2; 1052 1053 /* Decrement loop counter */ 1054 tapCnt--; 1055 } 1056 1057 /* Advance the state pointer by 3 to process the next group of 3 samples */ 1058 pState = pState + 3; 1059 1060 /* The result is in 2.30 format. Convert to 1.31 and store in destination buffer. */ 1061 *pDst++ = (q31_t) (acc0 >> 31U); 1062 *pDst++ = (q31_t) (acc1 >> 31U); 1063 *pDst++ = (q31_t) (acc2 >> 31U); 1064 1065 /* Decrement loop counter */ 1066 blkCnt--; 1067 } 1068 1069 /* Loop unrolling: Compute remaining output samples */ 1070 blkCnt = blockSize % 0x3U; 1071 1072 #else 1073 1074 /* Initialize blkCnt with number of taps */ 1075 blkCnt = blockSize; 1076 1077 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 1078 1079 while (blkCnt > 0U) 1080 { 1081 /* Copy one sample at a time into state buffer */ 1082 *pStateCurnt++ = *pSrc++; 1083 1084 /* Set the accumulator to zero */ 1085 acc0 = 0; 1086 1087 /* Initialize state pointer */ 1088 px = pState; 1089 1090 /* Initialize Coefficient pointer */ 1091 pb = pCoeffs; 1092 1093 i = numTaps; 1094 1095 /* Perform the multiply-accumulates */ 1096 do 1097 { 1098 /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ 1099 acc0 += (q63_t) *px++ * *pb++; 1100 1101 i--; 1102 } while (i > 0U); 1103 1104 /* Result is in 2.62 format. Convert to 1.31 and store in destination buffer. */ 1105 *pDst++ = (q31_t) (acc0 >> 31U); 1106 1107 /* Advance state pointer by 1 for the next sample */ 1108 pState = pState + 1U; 1109 1110 /* Decrement loop counter */ 1111 blkCnt--; 1112 } 1113 1114 /* Processing is complete. 1115 Now copy the last numTaps - 1 samples to the start of the state buffer. 1116 This prepares the state buffer for the next function call. */ 1117 1118 /* Points to the start of the state buffer */ 1119 pStateCurnt = S->pState; 1120 1121 #if defined (ARM_MATH_LOOPUNROLL) 1122 1123 /* Loop unrolling: Compute 4 taps at a time */ 1124 tapCnt = (numTaps - 1U) >> 2U; 1125 1126 /* Copy data */ 1127 while (tapCnt > 0U) 1128 { 1129 *pStateCurnt++ = *pState++; 1130 *pStateCurnt++ = *pState++; 1131 *pStateCurnt++ = *pState++; 1132 *pStateCurnt++ = *pState++; 1133 1134 /* Decrement loop counter */ 1135 tapCnt--; 1136 } 1137 1138 /* Calculate remaining number of copies */ 1139 tapCnt = (numTaps - 1U) % 0x4U; 1140 1141 #else 1142 1143 /* Initialize tapCnt with number of taps */ 1144 tapCnt = (numTaps - 1U); 1145 1146 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 1147 1148 /* Copy remaining data */ 1149 while (tapCnt > 0U) 1150 { 1151 *pStateCurnt++ = *pState++; 1152 1153 /* Decrement loop counter */ 1154 tapCnt--; 1155 } 1156 1157 } 1158 #endif /* defined(ARM_MATH_MVEI) */ 1159 1160 /** 1161 @} end of FIR group 1162 */