arm_fir_f32.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_fir_f32.c 4 * Description: Floating-point FIR filter processing function 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/filtering_functions.h" 30 31 /** 32 @ingroup groupFilters 33 */ 34 35 /** 36 @defgroup FIR Finite Impulse Response (FIR) Filters 37 38 This set of functions implements Finite Impulse Response (FIR) filters 39 for Q7, Q15, Q31, and floating-point data types. Fast versions of Q15 and Q31 are also provided. 40 The functions operate on blocks of input and output data and each call to the function processes 41 <code>blockSize</code> samples through the filter. <code>pSrc</code> and 42 <code>pDst</code> points to input and output arrays containing <code>blockSize</code> values. 43 44 @par Algorithm 45 The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations. 46 Each filter coefficient <code>b[n]</code> is multiplied by a state variable which equals a previous input sample <code>x[n]</code>. 47 <pre> 48 y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1] 49 </pre> 50 @par 51 \image html FIR.GIF "Finite Impulse Response filter" 52 @par 53 <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>. 54 Coefficients are stored in time reversed order. 55 @par 56 <pre> 57 {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]} 58 </pre> 59 @par 60 <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>. 61 Samples in the state buffer are stored in the following order. 62 @par 63 <pre> 64 {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[n](==pSrc[0]), x[n+1](==pSrc[1]), ..., x[n+blockSize-1](==pSrc[blockSize-1])} 65 </pre> 66 @par 67 Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code>. 68 The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters, 69 to be avoided and yields a significant speed improvement. 70 The state variables are updated after each block of data is processed; the coefficients are untouched. 71 72 @par Instance Structure 73 The coefficients and state variables for a filter are stored together in an instance data structure. 74 A separate instance structure must be defined for each filter. 75 Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. 76 There are separate instance structure declarations for each of the 4 supported data types. 77 78 @par Initialization Functions 79 There is also an associated initialization function for each data type. 80 The initialization function performs the following operations: 81 - Sets the values of the internal structure fields. 82 - Zeros out the values in the state buffer. 83 To do this manually without calling the init function, assign the follow subfields of the instance structure: 84 numTaps, pCoeffs, pState. Also set all of the values in pState to zero. 85 @par 86 Use of the initialization function is optional. 87 However, if the initialization function is used, then the instance structure cannot be placed into a const data section. 88 To place an instance structure into a const data section, the instance structure must be manually initialized. 89 Set the values in the state buffer to zeros before static initialization. 90 The code below statically initializes each of the 4 different data type filter instance structures 91 <pre> 92 arm_fir_instance_f32 S = {numTaps, pState, pCoeffs}; 93 arm_fir_instance_q31 S = {numTaps, pState, pCoeffs}; 94 arm_fir_instance_q15 S = {numTaps, pState, pCoeffs}; 95 arm_fir_instance_q7 S = {numTaps, pState, pCoeffs}; 96 </pre> 97 where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer; 98 <code>pCoeffs</code> is the address of the coefficient buffer. 99 @par Initialization of Helium version 100 For Helium version the array of coefficients must be padded with zero to contain 101 a full number of lanes. 102 103 The array length L must be a multiple of x. L = x * a : 104 - x is 4 for f32 105 - x is 4 for q31 106 - x is 4 for f16 (so managed like the f32 version and not like the q15 one) 107 - x is 8 for q15 108 - x is 16 for q7 109 110 The additional coefficients 111 (x * a - numTaps) must be set to 0. 112 numTaps is still set to its right value in the init function. It means that 113 the implementation may require to read more coefficients due to the vectorization and 114 to avoid having to manage too many different cases in the code. 115 116 117 @par Helium state buffer 118 The state buffer must contain some additional temporary data 119 used during the computation but which is not the state of the FIR. 120 The first A samples are temporary data. 121 The remaining samples are the state of the FIR filter. 122 @par 123 So the state buffer has size <code> numTaps + A + blockSize - 1 </code> : 124 - A is blockSize for f32 125 - A is 8*ceil(blockSize/8) for f16 126 - A is 8*ceil(blockSize/4) for q31 127 - A is 0 for other datatypes (q15 and q7) 128 129 130 @par Fixed-Point Behavior 131 Care must be taken when using the fixed-point versions of the FIR filter functions. 132 In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. 133 Refer to the function specific documentation below for usage guidelines. 134 135 */ 136 137 /** 138 @addtogroup FIR 139 @{ 140 */ 141 142 /** 143 @brief Processing function for floating-point FIR filter. 144 @param[in] S points to an instance of the floating-point FIR filter structure 145 @param[in] pSrc points to the block of input data 146 @param[out] pDst points to the block of output data 147 @param[in] blockSize number of samples to process 148 @return none 149 */ 150 151 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) 152 153 #define FIR_F32_MAX_COEF_BLK 8 154 155 #define FIR_F32_CORE(pSamples, c, NB_TAPS) \ 156 vecAcc0 = vdupq_n_f32(0.0f); \ 157 for (int i = 0; i < NB_TAPS; i++) { \ 158 vecIn0 = vld1q(&pSamples[i]); \ 159 vecAcc0 = vfmaq(vecAcc0, vecIn0, c[i]); \ 160 } 161 162 163 #define NB_TAPS 4 164 __STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, 165 const float32_t * __restrict pSrc, 166 float32_t * __restrict pDst, uint32_t blockSize) 167 { 168 float32_t *pRefStatePtr = S->pState + blockSize; 169 float32_t *pState = pRefStatePtr; /* State pointer */ 170 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 171 float32_t *pStateCur; /* Points to the current sample of the state */ 172 const float32_t *pSamples; /* Temporary pointer to the sample buffer */ 173 float32_t *pOutput; /* Temporary pointer to the output buffer */ 174 const float32_t *pTempSrc; /* Temporary pointer to the source data */ 175 float32_t *pTempDest; /* Temporary pointer to the destination buffer */ 176 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ 177 int32_t blkCnt; 178 float32x4_t vecIn0; 179 float32x4_t vecAcc0; 180 float32_t c[NB_TAPS]; 181 const float32_t *pCoeffsCur = pCoeffs; 182 183 /* 184 * pState points to state array which contains previous frame (numTaps - 1) samples 185 * pStateCur points to the location where the new input data should be written 186 */ 187 pStateCur = &(pState[(numTaps - 1u)]); 188 pTempSrc = pSrc; 189 190 pSamples = pState; 191 pOutput = pDst; 192 193 for (int i = 0; i < NB_TAPS; i++) 194 c[i] = *pCoeffsCur++; 195 196 blkCnt = blockSize >> 2; 197 while (blkCnt > 0) { 198 /* 199 * Save 4 input samples in the history buffer 200 */ 201 vst1q(pStateCur, vld1q(pTempSrc)); 202 pStateCur += 4; 203 pTempSrc += 4; 204 205 FIR_F32_CORE(pSamples, c, NB_TAPS); 206 207 vst1q(pOutput, vecAcc0); 208 209 pOutput += 4; 210 pSamples += 4; 211 212 blkCnt--; 213 } 214 215 blkCnt = blockSize & 3; 216 if (blkCnt) 217 { 218 mve_pred16_t p0 = vctp32q(blkCnt); 219 220 vst1q(pStateCur, vld1q(pTempSrc)); 221 pStateCur += 4; 222 pTempSrc += 4; 223 224 FIR_F32_CORE(pSamples, c, NB_TAPS); 225 226 vstrwq_p_f32(pOutput, vecAcc0, p0); 227 } 228 229 /* 230 * Copy the samples back into the history buffer start 231 */ 232 pTempSrc = &pState[blockSize]; 233 pTempDest = pState; 234 235 blkCnt = numTaps - 1; 236 do { 237 mve_pred16_t p = vctp32q(blkCnt); 238 239 vstrwq_p_f32(pTempDest, vldrwq_z_f32(pTempSrc, p), p); 240 pTempSrc += 4; 241 pTempDest += 4; 242 blkCnt -= 4; 243 } 244 while (blkCnt > 0); 245 } 246 #undef NB_TAPS 247 248 __STATIC_INLINE void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, 249 const float32_t * __restrict pSrc, 250 float32_t * __restrict pDst, uint32_t blockSize) 251 { 252 float32_t *pRefStatePtr = S->pState + blockSize; 253 float32_t *pState = pRefStatePtr; /* State pointer */ 254 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 255 const float32_t *pSamples; /* Temporary pointer to the sample buffer */ 256 const float32_t *pTempSrc; /* Temporary pointer to the source data */ 257 float32_t *pTempDest; /* Temporary pointer to the destination buffer */ 258 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ 259 int32_t blkCnt; 260 float32_t c0, c1, c2, c3; 261 float32_t c4, c5, c6, c7; 262 263 264 pTempSrc = pSrc; 265 pTempDest = &(pState[(numTaps - 1u)]); 266 int cnt = blockSize; 267 do { 268 mve_pred16_t p0 = vctp32q(cnt); 269 vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0); 270 pTempDest += 4; 271 pTempSrc += 4; 272 cnt -= 4; 273 } while(cnt > 0); 274 275 276 277 pSamples = pState; 278 c0 = *pCoeffs++; 279 c1 = *pCoeffs++; 280 c2 = *pCoeffs++; 281 c3 = *pCoeffs++; 282 c4 = *pCoeffs++; 283 c5 = *pCoeffs++; 284 c6 = *pCoeffs++; 285 c7 = *pCoeffs++; 286 287 cnt = blockSize >> 2; 288 while(cnt > 0) 289 { 290 float32x4_t vecAcc0; 291 float32x4_t vecIn0; 292 293 vecIn0 = vld1q(pSamples); 294 vecAcc0 = vmulq(vecIn0, c0); 295 vecIn0 = vld1q(&pSamples[1]); 296 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); 297 vecIn0 = vld1q(&pSamples[2]); 298 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); 299 vecIn0 = vld1q(&pSamples[3]); 300 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); 301 vecIn0 = vld1q(&pSamples[4]); 302 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); 303 vecIn0 = vld1q(&pSamples[5]); 304 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); 305 vecIn0 = vld1q(&pSamples[6]); 306 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); 307 vecIn0 = vld1q(&pSamples[7]); 308 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); 309 pSamples += 4; 310 vst1q(pDst, vecAcc0); 311 cnt--; 312 pDst += 4; 313 } 314 315 cnt = blockSize & 3; 316 if (cnt > 0) 317 { 318 float32x4_t vecAcc0; 319 float32x4_t vecIn0; 320 321 mve_pred16_t p0 = vctp32q(cnt); 322 323 vecIn0 = vld1q(pSamples); 324 vecAcc0 = vmulq(vecIn0, c0); 325 vecIn0 = vld1q(&pSamples[1]); 326 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); 327 vecIn0 = vld1q(&pSamples[2]); 328 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); 329 vecIn0 = vld1q(&pSamples[3]); 330 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); 331 vecIn0 = vld1q(&pSamples[4]); 332 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); 333 vecIn0 = vld1q(&pSamples[5]); 334 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); 335 vecIn0 = vld1q(&pSamples[6]); 336 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); 337 vecIn0 = vld1q(&pSamples[7]); 338 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); 339 vstrwq_p_f32(pDst, vecAcc0,p0); 340 } 341 342 343 /* 344 * Copy the samples back into the history buffer start 345 */ 346 pTempSrc = &pState[blockSize]; 347 pTempDest = pState; 348 blkCnt = numTaps; 349 while (blkCnt > 0) 350 { 351 *pTempDest++ = *pTempSrc++; 352 blkCnt--; 353 } 354 } 355 356 357 358 void arm_fir_f32( 359 const arm_fir_instance_f32 * S, 360 const float32_t * pSrc, 361 float32_t * pDst, 362 uint32_t blockSize) 363 { 364 /* 365 S->pState is the arm_fir_partial_accu 366 S->pState + blockSize is the FIR state 367 */ 368 float32_t *pRefStatePtr = S->pState + blockSize; 369 float32_t *pState = pRefStatePtr ; /* State pointer */ 370 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 371 const float32_t *pSamples; /* Temporary pointer to the sample buffer */ 372 float32_t *pOutput; /* Temporary pointer to the output buffer */ 373 const float32_t *pTempSrc; /* Temporary pointer to the source data */ 374 float32_t *pTempDest; /* Temporary pointer to the destination buffer */ 375 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ 376 uint32_t blkCnt; 377 float32_t c0, c1, c2, c3; 378 float32_t c4, c5, c6, c7; 379 380 /* 381 * [1 to 8 taps] specialized routines 382 */ 383 if (numTaps <= 4) 384 { 385 arm_fir_f32_1_4_mve(S, pSrc, pDst, blockSize); 386 return; 387 } 388 else if (numTaps <= 8) 389 { 390 arm_fir_f32_5_8_mve(S, pSrc, pDst, blockSize); 391 return; 392 } 393 394 pTempSrc = pSrc; 395 pTempDest = &(pState[(numTaps - 1u)]); 396 int cnt = blockSize; 397 do { 398 mve_pred16_t p0 = vctp32q(cnt); 399 vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0); 400 pTempDest += 4; 401 pTempSrc += 4; 402 cnt -= 4; 403 } while(cnt > 0); 404 405 float32_t *partial_accu_ptr = S->pState; 406 407 pSamples = pState; 408 c0 = *pCoeffs++; 409 c1 = *pCoeffs++; 410 c2 = *pCoeffs++; 411 c3 = *pCoeffs++; 412 c4 = *pCoeffs++; 413 c5 = *pCoeffs++; 414 c6 = *pCoeffs++; 415 c7 = *pCoeffs++; 416 417 cnt = blockSize >> 2; 418 while(cnt > 0) { 419 float32x4_t vecAcc0; 420 float32x4_t vecIn0; 421 422 vecIn0 = vld1q(pSamples); 423 vecAcc0 = vmulq(vecIn0, c0); 424 vecIn0 = vld1q(&pSamples[1]); 425 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); 426 vecIn0 = vld1q(&pSamples[2]); 427 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); 428 vecIn0 = vld1q(&pSamples[3]); 429 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); 430 vecIn0 = vld1q(&pSamples[4]); 431 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); 432 vecIn0 = vld1q(&pSamples[5]); 433 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); 434 vecIn0 = vld1q(&pSamples[6]); 435 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); 436 vecIn0 = vld1q(&pSamples[7]); 437 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); 438 pSamples += 4; 439 vst1q(partial_accu_ptr, vecAcc0); 440 cnt--; 441 partial_accu_ptr += 4; 442 } 443 444 cnt = blockSize & 3; 445 if (cnt > 0) 446 { 447 float32x4_t vecAcc0; 448 float32x4_t vecIn0; 449 450 mve_pred16_t p0 = vctp32q(cnt); 451 452 vecIn0 = vld1q(pSamples); 453 vecAcc0 = vmulq(vecIn0, c0); 454 vecIn0 = vld1q(&pSamples[1]); 455 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); 456 vecIn0 = vld1q(&pSamples[2]); 457 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); 458 vecIn0 = vld1q(&pSamples[3]); 459 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); 460 vecIn0 = vld1q(&pSamples[4]); 461 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); 462 vecIn0 = vld1q(&pSamples[5]); 463 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); 464 vecIn0 = vld1q(&pSamples[6]); 465 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); 466 vecIn0 = vld1q(&pSamples[7]); 467 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); 468 vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0); 469 } 470 471 int localTaps = numTaps - FIR_F32_MAX_COEF_BLK; 472 int sample_offset = FIR_F32_MAX_COEF_BLK; 473 while (localTaps > FIR_F32_MAX_COEF_BLK) { 474 c0 = *pCoeffs++; 475 c1 = *pCoeffs++; 476 c2 = *pCoeffs++; 477 c3 = *pCoeffs++; 478 c4 = *pCoeffs++; 479 c5 = *pCoeffs++; 480 c6 = *pCoeffs++; 481 c7 = *pCoeffs++; 482 483 partial_accu_ptr = S->pState; 484 pSamples = pState + sample_offset; 485 int cnt = blockSize >> 2; 486 while(cnt > 0) { 487 float32x4_t vecAcc0; 488 float32x4_t vecIn0; 489 490 vecIn0 = vld1q(pSamples); 491 vecAcc0 = vmulq(vecIn0, c0); 492 vecIn0 = vld1q(&pSamples[1]); 493 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); 494 vecIn0 = vld1q(&pSamples[2]); 495 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); 496 vecIn0 = vld1q(&pSamples[3]); 497 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); 498 vecIn0 = vld1q(&pSamples[4]); 499 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); 500 vecIn0 = vld1q(&pSamples[5]); 501 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); 502 vecIn0 = vld1q(&pSamples[6]); 503 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); 504 vecIn0 = vld1q(&pSamples[7]); 505 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); 506 pSamples += 4; 507 vecAcc0 += vld1q_f32(partial_accu_ptr); 508 vst1q(partial_accu_ptr, vecAcc0); 509 cnt--; 510 partial_accu_ptr += 4; 511 } 512 513 cnt = blockSize & 3; 514 if (cnt > 0) { 515 float32x4_t vecAcc0; 516 float32x4_t vecIn0; 517 518 mve_pred16_t p0 = vctp32q(cnt); 519 520 vecIn0 = vld1q(pSamples); 521 vecAcc0 = vmulq(vecIn0, c0); 522 vecIn0 = vld1q(&pSamples[1]); 523 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); 524 vecIn0 = vld1q(&pSamples[2]); 525 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); 526 vecIn0 = vld1q(&pSamples[3]); 527 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); 528 vecIn0 = vld1q(&pSamples[4]); 529 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); 530 vecIn0 = vld1q(&pSamples[5]); 531 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); 532 vecIn0 = vld1q(&pSamples[6]); 533 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); 534 vecIn0 = vld1q(&pSamples[7]); 535 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); 536 vecAcc0 += vld1q_f32(partial_accu_ptr); 537 vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0); 538 } 539 540 localTaps -= FIR_F32_MAX_COEF_BLK; 541 sample_offset += FIR_F32_MAX_COEF_BLK; 542 } 543 544 pSamples = pState + sample_offset; 545 546 if (localTaps > 4) { 547 c0 = *pCoeffs++; 548 c1 = *pCoeffs++; 549 c2 = *pCoeffs++; 550 c3 = *pCoeffs++; 551 c4 = *pCoeffs++; 552 c5 = *pCoeffs++; 553 c6 = *pCoeffs++; 554 c7 = *pCoeffs++; 555 pOutput = pDst; 556 557 partial_accu_ptr = S->pState; 558 cnt = blockSize >> 2; 559 while(cnt > 0) { 560 float32x4_t vecAcc0; 561 float32x4_t vecIn0; 562 563 vecIn0 = vld1q(pSamples); 564 vecAcc0 = vmulq(vecIn0, c0); 565 vecIn0 = vld1q(&pSamples[1]); 566 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); 567 vecIn0 = vld1q(&pSamples[2]); 568 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); 569 vecIn0 = vld1q(&pSamples[3]); 570 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); 571 vecIn0 = vld1q(&pSamples[4]); 572 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); 573 vecIn0 = vld1q(&pSamples[5]); 574 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); 575 vecIn0 = vld1q(&pSamples[6]); 576 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); 577 vecIn0 = vld1q(&pSamples[7]); 578 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); 579 pSamples += 4; 580 float32x4_t pap = vld1q_f32(partial_accu_ptr); 581 vst1q(pOutput, vecAcc0+pap); 582 cnt--; 583 partial_accu_ptr += 4; 584 pOutput += 4; 585 } 586 587 cnt = blockSize & 3; 588 if (cnt > 0) { 589 float32x4_t vecAcc0; 590 float32x4_t vecIn0; 591 592 mve_pred16_t p0 = vctp32q(cnt); 593 594 vecIn0 = vld1q(pSamples); 595 vecAcc0 = vmulq(vecIn0, c0); 596 vecIn0 = vld1q(&pSamples[1]); 597 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); 598 vecIn0 = vld1q(&pSamples[2]); 599 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); 600 vecIn0 = vld1q(&pSamples[3]); 601 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); 602 vecIn0 = vld1q(&pSamples[4]); 603 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); 604 vecIn0 = vld1q(&pSamples[5]); 605 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); 606 vecIn0 = vld1q(&pSamples[6]); 607 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); 608 vecIn0 = vld1q(&pSamples[7]); 609 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); 610 float32x4_t pap = vld1q_f32(partial_accu_ptr); 611 vstrwq_p_f32(pOutput, vecAcc0+pap,p0); 612 pOutput += cnt; 613 } 614 } 615 else { 616 c0 = *pCoeffs++; 617 c1 = *pCoeffs++; 618 c2 = *pCoeffs++; 619 c3 = *pCoeffs++; 620 pOutput = pDst; 621 622 partial_accu_ptr = S->pState; 623 cnt = blockSize >> 2; 624 while(cnt > 0) { 625 float32x4_t vecAcc0; 626 float32x4_t vecIn0; 627 628 vecIn0 = vld1q(pSamples); 629 vecAcc0 = vmulq(vecIn0, c0); 630 vecIn0 = vld1q(&pSamples[1]); 631 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); 632 vecIn0 = vld1q(&pSamples[2]); 633 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); 634 vecIn0 = vld1q(&pSamples[3]); 635 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); 636 pSamples += 4; 637 float32x4_t pap = vld1q_f32(partial_accu_ptr); 638 vst1q(pOutput, vecAcc0+pap); 639 cnt--; 640 partial_accu_ptr += 4; 641 pOutput += 4; 642 } 643 644 cnt = blockSize & 3; 645 if (cnt > 0) { 646 float32x4_t vecAcc0; 647 float32x4_t vecIn0; 648 649 mve_pred16_t p0 = vctp32q(cnt); 650 651 vecIn0 = vld1q(pSamples); 652 vecAcc0 = vmulq(vecIn0, c0); 653 vecIn0 = vld1q(&pSamples[1]); 654 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); 655 vecIn0 = vld1q(&pSamples[2]); 656 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); 657 vecIn0 = vld1q(&pSamples[3]); 658 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); 659 float32x4_t pap = vld1q_f32(partial_accu_ptr); 660 vstrwq_p_f32(pOutput, vecAcc0+pap,p0); 661 pOutput += cnt; 662 } 663 } 664 665 /* 666 * Copy the samples back into the history buffer start 667 */ 668 pTempSrc = &pRefStatePtr[blockSize]; 669 pTempDest = pRefStatePtr; 670 671 blkCnt = numTaps >> 2; 672 while (blkCnt > 0) 673 { 674 vst1q(pTempDest, vld1q(pTempSrc)); 675 pTempSrc += 4; 676 pTempDest += 4; 677 blkCnt--; 678 } 679 blkCnt = numTaps & 3; 680 if (blkCnt > 0) 681 { 682 mve_pred16_t p0 = vctp32q(blkCnt); 683 vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0); 684 } 685 } 686 687 #else 688 #if defined(ARM_MATH_NEON) 689 690 void arm_fir_f32( 691 const arm_fir_instance_f32 * S, 692 const float32_t * pSrc, 693 float32_t * pDst, 694 uint32_t blockSize) 695 { 696 float32_t *pState = S->pState; /* State pointer */ 697 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 698 float32_t *pStateCurnt; /* Points to the current sample of the state */ 699 float32_t *px; /* Temporary pointers for state buffer */ 700 const float32_t *pb; /* Temporary pointers for coefficient buffer */ 701 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ 702 uint32_t i, tapCnt, blkCnt; /* Loop counters */ 703 704 float32x4_t accv0,accv1,samples0,samples1,x0,x1,x2,xa,xb,b; 705 float32_t acc; 706 707 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ 708 /* pStateCurnt points to the location where the new input data should be written */ 709 pStateCurnt = &(S->pState[(numTaps - 1U)]); 710 711 /* Loop unrolling */ 712 blkCnt = blockSize >> 3; 713 714 while (blkCnt > 0U) 715 { 716 /* Copy 8 samples at a time into state buffers */ 717 samples0 = vld1q_f32(pSrc); 718 vst1q_f32(pStateCurnt,samples0); 719 720 pStateCurnt += 4; 721 pSrc += 4 ; 722 723 samples1 = vld1q_f32(pSrc); 724 vst1q_f32(pStateCurnt,samples1); 725 726 pStateCurnt += 4; 727 pSrc += 4 ; 728 729 /* Set the accumulators to zero */ 730 accv0 = vdupq_n_f32(0); 731 accv1 = vdupq_n_f32(0); 732 733 /* Initialize state pointer */ 734 px = pState; 735 736 /* Initialize coefficient pointer */ 737 pb = pCoeffs; 738 739 /* Loop unroling */ 740 i = numTaps >> 2; 741 742 /* Perform the multiply-accumulates */ 743 x0 = vld1q_f32(px); 744 x1 = vld1q_f32(px + 4); 745 746 while(i > 0) 747 { 748 /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ 749 x2 = vld1q_f32(px + 8); 750 b = vld1q_f32(pb); 751 xa = x0; 752 xb = x1; 753 accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 0)); 754 accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 0)); 755 756 xa = vextq_f32(x0,x1,1); 757 xb = vextq_f32(x1,x2,1); 758 759 accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 1)); 760 accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 1)); 761 762 xa = vextq_f32(x0,x1,2); 763 xb = vextq_f32(x1,x2,2); 764 765 accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 2)); 766 accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 2)); 767 768 xa = vextq_f32(x0,x1,3); 769 xb = vextq_f32(x1,x2,3); 770 771 accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 3)); 772 accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 3)); 773 774 pb += 4; 775 x0 = x1; 776 x1 = x2; 777 px += 4; 778 i--; 779 780 } 781 782 /* Tail */ 783 i = numTaps & 3; 784 x2 = vld1q_f32(px + 8); 785 786 /* Perform the multiply-accumulates */ 787 switch(i) 788 { 789 case 3: 790 { 791 accv0 = vmlaq_n_f32(accv0,x0,*pb); 792 accv1 = vmlaq_n_f32(accv1,x1,*pb); 793 794 pb++; 795 796 xa = vextq_f32(x0,x1,1); 797 xb = vextq_f32(x1,x2,1); 798 799 accv0 = vmlaq_n_f32(accv0,xa,*pb); 800 accv1 = vmlaq_n_f32(accv1,xb,*pb); 801 802 pb++; 803 804 xa = vextq_f32(x0,x1,2); 805 xb = vextq_f32(x1,x2,2); 806 807 accv0 = vmlaq_n_f32(accv0,xa,*pb); 808 accv1 = vmlaq_n_f32(accv1,xb,*pb); 809 810 } 811 break; 812 case 2: 813 { 814 accv0 = vmlaq_n_f32(accv0,x0,*pb); 815 accv1 = vmlaq_n_f32(accv1,x1,*pb); 816 817 pb++; 818 819 xa = vextq_f32(x0,x1,1); 820 xb = vextq_f32(x1,x2,1); 821 822 accv0 = vmlaq_n_f32(accv0,xa,*pb); 823 accv1 = vmlaq_n_f32(accv1,xb,*pb); 824 825 } 826 break; 827 case 1: 828 { 829 830 accv0 = vmlaq_n_f32(accv0,x0,*pb); 831 accv1 = vmlaq_n_f32(accv1,x1,*pb); 832 833 } 834 break; 835 default: 836 break; 837 } 838 839 /* The result is stored in the destination buffer. */ 840 vst1q_f32(pDst,accv0); 841 pDst += 4; 842 vst1q_f32(pDst,accv1); 843 pDst += 4; 844 845 /* Advance state pointer by 8 for the next 8 samples */ 846 pState = pState + 8; 847 848 blkCnt--; 849 } 850 851 /* Tail */ 852 blkCnt = blockSize & 0x7; 853 854 while (blkCnt > 0U) 855 { 856 /* Copy one sample at a time into state buffer */ 857 *pStateCurnt++ = *pSrc++; 858 859 /* Set the accumulator to zero */ 860 acc = 0.0f; 861 862 /* Initialize state pointer */ 863 px = pState; 864 865 /* Initialize Coefficient pointer */ 866 pb = pCoeffs; 867 868 i = numTaps; 869 870 /* Perform the multiply-accumulates */ 871 do 872 { 873 /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ 874 acc += *px++ * *pb++; 875 i--; 876 877 } while (i > 0U); 878 879 /* The result is stored in the destination buffer. */ 880 *pDst++ = acc; 881 882 /* Advance state pointer by 1 for the next sample */ 883 pState = pState + 1; 884 885 blkCnt--; 886 } 887 888 /* Processing is complete. 889 ** Now copy the last numTaps - 1 samples to the starting of the state buffer. 890 ** This prepares the state buffer for the next function call. */ 891 892 /* Points to the start of the state buffer */ 893 pStateCurnt = S->pState; 894 895 /* Copy numTaps number of values */ 896 tapCnt = numTaps - 1U; 897 898 /* Copy data */ 899 while (tapCnt > 0U) 900 { 901 *pStateCurnt++ = *pState++; 902 903 /* Decrement the loop counter */ 904 tapCnt--; 905 } 906 907 } 908 #else 909 void arm_fir_f32( 910 const arm_fir_instance_f32 * S, 911 const float32_t * pSrc, 912 float32_t * pDst, 913 uint32_t blockSize) 914 { 915 float32_t *pState = S->pState; /* State pointer */ 916 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 917 float32_t *pStateCurnt; /* Points to the current sample of the state */ 918 float32_t *px; /* Temporary pointer for state buffer */ 919 const float32_t *pb; /* Temporary pointer for coefficient buffer */ 920 float32_t acc0; /* Accumulator */ 921 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ 922 uint32_t i, tapCnt, blkCnt; /* Loop counters */ 923 924 #if defined (ARM_MATH_LOOPUNROLL) 925 float32_t acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* Accumulators */ 926 float32_t x0, x1, x2, x3, x4, x5, x6, x7; /* Temporary variables to hold state values */ 927 float32_t c0; /* Temporary variable to hold coefficient value */ 928 #endif 929 930 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ 931 /* pStateCurnt points to the location where the new input data should be written */ 932 pStateCurnt = &(S->pState[(numTaps - 1U)]); 933 934 #if defined (ARM_MATH_LOOPUNROLL) 935 936 /* Loop unrolling: Compute 8 output values simultaneously. 937 * The variables acc0 ... acc7 hold output values that are being computed: 938 * 939 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] 940 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] 941 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] 942 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] 943 */ 944 945 blkCnt = blockSize >> 3U; 946 947 while (blkCnt > 0U) 948 { 949 /* Copy 4 new input samples into the state buffer. */ 950 *pStateCurnt++ = *pSrc++; 951 *pStateCurnt++ = *pSrc++; 952 *pStateCurnt++ = *pSrc++; 953 *pStateCurnt++ = *pSrc++; 954 955 /* Set all accumulators to zero */ 956 acc0 = 0.0f; 957 acc1 = 0.0f; 958 acc2 = 0.0f; 959 acc3 = 0.0f; 960 acc4 = 0.0f; 961 acc5 = 0.0f; 962 acc6 = 0.0f; 963 acc7 = 0.0f; 964 965 /* Initialize state pointer */ 966 px = pState; 967 968 /* Initialize coefficient pointer */ 969 pb = pCoeffs; 970 971 /* This is separated from the others to avoid 972 * a call to __aeabi_memmove which would be slower 973 */ 974 *pStateCurnt++ = *pSrc++; 975 *pStateCurnt++ = *pSrc++; 976 *pStateCurnt++ = *pSrc++; 977 *pStateCurnt++ = *pSrc++; 978 979 /* Read the first 7 samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */ 980 x0 = *px++; 981 x1 = *px++; 982 x2 = *px++; 983 x3 = *px++; 984 x4 = *px++; 985 x5 = *px++; 986 x6 = *px++; 987 988 /* Loop unrolling: process 8 taps at a time. */ 989 tapCnt = numTaps >> 3U; 990 991 while (tapCnt > 0U) 992 { 993 /* Read the b[numTaps-1] coefficient */ 994 c0 = *(pb++); 995 996 /* Read x[n-numTaps-3] sample */ 997 x7 = *(px++); 998 999 /* acc0 += b[numTaps-1] * x[n-numTaps] */ 1000 acc0 += x0 * c0; 1001 1002 /* acc1 += b[numTaps-1] * x[n-numTaps-1] */ 1003 acc1 += x1 * c0; 1004 1005 /* acc2 += b[numTaps-1] * x[n-numTaps-2] */ 1006 acc2 += x2 * c0; 1007 1008 /* acc3 += b[numTaps-1] * x[n-numTaps-3] */ 1009 acc3 += x3 * c0; 1010 1011 /* acc4 += b[numTaps-1] * x[n-numTaps-4] */ 1012 acc4 += x4 * c0; 1013 1014 /* acc1 += b[numTaps-1] * x[n-numTaps-5] */ 1015 acc5 += x5 * c0; 1016 1017 /* acc2 += b[numTaps-1] * x[n-numTaps-6] */ 1018 acc6 += x6 * c0; 1019 1020 /* acc3 += b[numTaps-1] * x[n-numTaps-7] */ 1021 acc7 += x7 * c0; 1022 1023 /* Read the b[numTaps-2] coefficient */ 1024 c0 = *(pb++); 1025 1026 /* Read x[n-numTaps-4] sample */ 1027 x0 = *(px++); 1028 1029 /* Perform the multiply-accumulate */ 1030 acc0 += x1 * c0; 1031 acc1 += x2 * c0; 1032 acc2 += x3 * c0; 1033 acc3 += x4 * c0; 1034 acc4 += x5 * c0; 1035 acc5 += x6 * c0; 1036 acc6 += x7 * c0; 1037 acc7 += x0 * c0; 1038 1039 /* Read the b[numTaps-3] coefficient */ 1040 c0 = *(pb++); 1041 1042 /* Read x[n-numTaps-5] sample */ 1043 x1 = *(px++); 1044 1045 /* Perform the multiply-accumulates */ 1046 acc0 += x2 * c0; 1047 acc1 += x3 * c0; 1048 acc2 += x4 * c0; 1049 acc3 += x5 * c0; 1050 acc4 += x6 * c0; 1051 acc5 += x7 * c0; 1052 acc6 += x0 * c0; 1053 acc7 += x1 * c0; 1054 1055 /* Read the b[numTaps-4] coefficient */ 1056 c0 = *(pb++); 1057 1058 /* Read x[n-numTaps-6] sample */ 1059 x2 = *(px++); 1060 1061 /* Perform the multiply-accumulates */ 1062 acc0 += x3 * c0; 1063 acc1 += x4 * c0; 1064 acc2 += x5 * c0; 1065 acc3 += x6 * c0; 1066 acc4 += x7 * c0; 1067 acc5 += x0 * c0; 1068 acc6 += x1 * c0; 1069 acc7 += x2 * c0; 1070 1071 /* Read the b[numTaps-4] coefficient */ 1072 c0 = *(pb++); 1073 1074 /* Read x[n-numTaps-6] sample */ 1075 x3 = *(px++); 1076 /* Perform the multiply-accumulates */ 1077 acc0 += x4 * c0; 1078 acc1 += x5 * c0; 1079 acc2 += x6 * c0; 1080 acc3 += x7 * c0; 1081 acc4 += x0 * c0; 1082 acc5 += x1 * c0; 1083 acc6 += x2 * c0; 1084 acc7 += x3 * c0; 1085 1086 /* Read the b[numTaps-4] coefficient */ 1087 c0 = *(pb++); 1088 1089 /* Read x[n-numTaps-6] sample */ 1090 x4 = *(px++); 1091 1092 /* Perform the multiply-accumulates */ 1093 acc0 += x5 * c0; 1094 acc1 += x6 * c0; 1095 acc2 += x7 * c0; 1096 acc3 += x0 * c0; 1097 acc4 += x1 * c0; 1098 acc5 += x2 * c0; 1099 acc6 += x3 * c0; 1100 acc7 += x4 * c0; 1101 1102 /* Read the b[numTaps-4] coefficient */ 1103 c0 = *(pb++); 1104 1105 /* Read x[n-numTaps-6] sample */ 1106 x5 = *(px++); 1107 1108 /* Perform the multiply-accumulates */ 1109 acc0 += x6 * c0; 1110 acc1 += x7 * c0; 1111 acc2 += x0 * c0; 1112 acc3 += x1 * c0; 1113 acc4 += x2 * c0; 1114 acc5 += x3 * c0; 1115 acc6 += x4 * c0; 1116 acc7 += x5 * c0; 1117 1118 /* Read the b[numTaps-4] coefficient */ 1119 c0 = *(pb++); 1120 1121 /* Read x[n-numTaps-6] sample */ 1122 x6 = *(px++); 1123 1124 /* Perform the multiply-accumulates */ 1125 acc0 += x7 * c0; 1126 acc1 += x0 * c0; 1127 acc2 += x1 * c0; 1128 acc3 += x2 * c0; 1129 acc4 += x3 * c0; 1130 acc5 += x4 * c0; 1131 acc6 += x5 * c0; 1132 acc7 += x6 * c0; 1133 1134 /* Decrement loop counter */ 1135 tapCnt--; 1136 } 1137 1138 /* Loop unrolling: Compute remaining outputs */ 1139 tapCnt = numTaps % 0x8U; 1140 1141 while (tapCnt > 0U) 1142 { 1143 /* Read coefficients */ 1144 c0 = *(pb++); 1145 1146 /* Fetch 1 state variable */ 1147 x7 = *(px++); 1148 1149 /* Perform the multiply-accumulates */ 1150 acc0 += x0 * c0; 1151 acc1 += x1 * c0; 1152 acc2 += x2 * c0; 1153 acc3 += x3 * c0; 1154 acc4 += x4 * c0; 1155 acc5 += x5 * c0; 1156 acc6 += x6 * c0; 1157 acc7 += x7 * c0; 1158 1159 /* Reuse the present sample states for next sample */ 1160 x0 = x1; 1161 x1 = x2; 1162 x2 = x3; 1163 x3 = x4; 1164 x4 = x5; 1165 x5 = x6; 1166 x6 = x7; 1167 1168 /* Decrement loop counter */ 1169 tapCnt--; 1170 } 1171 1172 /* Advance the state pointer by 8 to process the next group of 8 samples */ 1173 pState = pState + 8; 1174 1175 /* The results in the 8 accumulators, store in the destination buffer. */ 1176 *pDst++ = acc0; 1177 *pDst++ = acc1; 1178 *pDst++ = acc2; 1179 *pDst++ = acc3; 1180 *pDst++ = acc4; 1181 *pDst++ = acc5; 1182 *pDst++ = acc6; 1183 *pDst++ = acc7; 1184 1185 1186 /* Decrement loop counter */ 1187 blkCnt--; 1188 } 1189 1190 /* Loop unrolling: Compute remaining output samples */ 1191 blkCnt = blockSize % 0x8U; 1192 1193 #else 1194 1195 /* Initialize blkCnt with number of taps */ 1196 blkCnt = blockSize; 1197 1198 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 1199 1200 while (blkCnt > 0U) 1201 { 1202 /* Copy one sample at a time into state buffer */ 1203 *pStateCurnt++ = *pSrc++; 1204 1205 /* Set the accumulator to zero */ 1206 acc0 = 0.0f; 1207 1208 /* Initialize state pointer */ 1209 px = pState; 1210 1211 /* Initialize Coefficient pointer */ 1212 pb = pCoeffs; 1213 1214 i = numTaps; 1215 1216 /* Perform the multiply-accumulates */ 1217 while (i > 0U) 1218 { 1219 /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ 1220 acc0 += *px++ * *pb++; 1221 1222 i--; 1223 } 1224 1225 /* Store result in destination buffer. */ 1226 *pDst++ = acc0; 1227 1228 /* Advance state pointer by 1 for the next sample */ 1229 pState = pState + 1U; 1230 1231 /* Decrement loop counter */ 1232 blkCnt--; 1233 } 1234 1235 /* Processing is complete. 1236 Now copy the last numTaps - 1 samples to the start of the state buffer. 1237 This prepares the state buffer for the next function call. */ 1238 1239 /* Points to the start of the state buffer */ 1240 pStateCurnt = S->pState; 1241 1242 #if defined (ARM_MATH_LOOPUNROLL) 1243 1244 /* Loop unrolling: Compute 4 taps at a time */ 1245 tapCnt = (numTaps - 1U) >> 2U; 1246 1247 /* Copy data */ 1248 while (tapCnt > 0U) 1249 { 1250 *pStateCurnt++ = *pState++; 1251 *pStateCurnt++ = *pState++; 1252 *pStateCurnt++ = *pState++; 1253 *pStateCurnt++ = *pState++; 1254 1255 /* Decrement loop counter */ 1256 tapCnt--; 1257 } 1258 1259 /* Calculate remaining number of copies */ 1260 tapCnt = (numTaps - 1U) % 0x4U; 1261 1262 #else 1263 1264 /* Initialize tapCnt with number of taps */ 1265 tapCnt = (numTaps - 1U); 1266 1267 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 1268 1269 /* Copy remaining data */ 1270 while (tapCnt > 0U) 1271 { 1272 *pStateCurnt++ = *pState++; 1273 1274 /* Decrement loop counter */ 1275 tapCnt--; 1276 } 1277 1278 } 1279 1280 #endif /* #if defined(ARM_MATH_NEON) */ 1281 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ 1282 1283 /** 1284 * @} end of FIR group 1285 */