arm_lms_f32.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_lms_f32.c 4 * Description: Processing function for the floating-point LMS filter 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/filtering_functions.h" 30 31 /** 32 @ingroup groupFilters 33 */ 34 35 /** 36 @defgroup LMS Least Mean Square (LMS) Filters 37 38 LMS filters are a class of adaptive filters that are able to "learn" an unknown transfer functions. 39 LMS filters use a gradient descent method in which the filter coefficients are updated based on the instantaneous error signal. 40 Adaptive filters are often used in communication systems, equalizers, and noise removal. 41 The CMSIS DSP Library contains LMS filter functions that operate on Q15, Q31, and floating-point data types. 42 The library also contains normalized LMS filters in which the filter coefficient adaptation is indepedent of the level of the input signal. 43 44 An LMS filter consists of two components as shown below. 45 The first component is a standard transversal or FIR filter. 46 The second component is a coefficient update mechanism. 47 The LMS filter has two input signals. 48 The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter. 49 That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input. 50 The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input. 51 This "error signal" tends towards zero as the filter adapts. 52 The LMS processing functions accept the input and reference input signals and generate the filter output and error signal. 53 \image html LMS.gif "Internal structure of the Least Mean Square filter" 54 55 The functions operate on blocks of data and each call to the function processes 56 <code>blockSize</code> samples through the filter. 57 <code>pSrc</code> points to input signal, <code>pRef</code> points to reference signal, 58 <code>pOut</code> points to output signal and <code>pErr</code> points to error signal. 59 All arrays contain <code>blockSize</code> values. 60 61 The functions operate on a block-by-block basis. 62 Internally, the filter coefficients <code>b[n]</code> are updated on a sample-by-sample basis. 63 The convergence of the LMS filter is slower compared to the normalized LMS algorithm. 64 65 @par Algorithm 66 The output signal <code>y[n]</code> is computed by a standard FIR filter: 67 <pre> 68 y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1] 69 </pre> 70 71 @par 72 The error signal equals the difference between the reference signal <code>d[n]</code> and the filter output: 73 <pre> 74 e[n] = d[n] - y[n]. 75 </pre> 76 77 @par 78 After each sample of the error signal is computed, the filter coefficients <code>b[k]</code> are updated on a sample-by-sample basis: 79 <pre> 80 b[k] = b[k] + e[n] * mu * x[n-k], for k=0, 1, ..., numTaps-1 81 </pre> 82 where <code>mu</code> is the step size and controls the rate of coefficient convergence. 83 @par 84 In the APIs, <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>. 85 Coefficients are stored in time reversed order. 86 @par 87 <pre> 88 {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]} 89 </pre> 90 @par 91 <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>. 92 Samples in the state buffer are stored in the order: 93 @par 94 <pre> 95 {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]} 96 </pre> 97 @par 98 Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code> samples. 99 The increased state buffer length allows circular addressing, which is traditionally used in FIR filters, 100 to be avoided and yields a significant speed improvement. 101 The state variables are updated after each block of data is processed. 102 @par Instance Structure 103 The coefficients and state variables for a filter are stored together in an instance data structure. 104 A separate instance structure must be defined for each filter and 105 coefficient and state arrays cannot be shared among instances. 106 There are separate instance structure declarations for each of the 3 supported data types. 107 108 @par Initialization Functions 109 There is also an associated initialization function for each data type. 110 The initialization function performs the following operations: 111 - Sets the values of the internal structure fields. 112 - Zeros out the values in the state buffer. 113 To do this manually without calling the init function, assign the follow subfields of the instance structure: 114 numTaps, pCoeffs, mu, postShift (not for f32), pState. Also set all of the values in pState to zero. 115 116 @par 117 Use of the initialization function is optional. 118 However, if the initialization function is used, then the instance structure cannot be placed into a const data section. 119 To place an instance structure into a const data section, the instance structure must be manually initialized. 120 Set the values in the state buffer to zeros before static initialization. 121 The code below statically initializes each of the 3 different data type filter instance structures 122 <pre> 123 arm_lms_instance_f32 S = {numTaps, pState, pCoeffs, mu}; 124 arm_lms_instance_q31 S = {numTaps, pState, pCoeffs, mu, postShift}; 125 arm_lms_instance_q15 S = {numTaps, pState, pCoeffs, mu, postShift}; 126 </pre> 127 where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer; 128 <code>pCoeffs</code> is the address of the coefficient buffer; <code>mu</code> is the step size parameter; and <code>postShift</code> is the shift applied to coefficients. 129 130 @par Fixed-Point Behavior 131 Care must be taken when using the Q15 and Q31 versions of the LMS filter. 132 The following issues must be considered: 133 - Scaling of coefficients 134 - Overflow and saturation 135 136 @par Scaling of Coefficients 137 Filter coefficients are represented as fractional values and 138 coefficients are restricted to lie in the range <code>[-1 +1)</code>. 139 The fixed-point functions have an additional scaling parameter <code>postShift</code>. 140 At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits. 141 This essentially scales the filter coefficients by <code>2^postShift</code> and 142 allows the filter coefficients to exceed the range <code>[+1 -1)</code>. 143 The value of <code>postShift</code> is set by the user based on the expected gain through the system being modeled. 144 145 @par Overflow and Saturation 146 Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are 147 described separately as part of the function specific documentation below. 148 */ 149 150 /** 151 @addtogroup LMS 152 @{ 153 */ 154 155 /** 156 @brief Processing function for floating-point LMS filter. 157 @param[in] S points to an instance of the floating-point LMS filter structure 158 @param[in] pSrc points to the block of input data 159 @param[in] pRef points to the block of reference data 160 @param[out] pOut points to the block of output data 161 @param[out] pErr points to the block of error data 162 @param[in] blockSize number of samples to process 163 @return none 164 */ 165 #if defined(ARM_MATH_NEON) 166 void arm_lms_f32( 167 const arm_lms_instance_f32 * S, 168 const float32_t * pSrc, 169 float32_t * pRef, 170 float32_t * pOut, 171 float32_t * pErr, 172 uint32_t blockSize) 173 { 174 float32_t *pState = S->pState; /* State pointer */ 175 float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 176 float32_t *pStateCurnt; /* Points to the current sample of the state */ 177 float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ 178 float32_t mu = S->mu; /* Adaptive factor */ 179 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ 180 uint32_t tapCnt, blkCnt; /* Loop counters */ 181 float32_t sum, e, d; /* accumulator, error, reference data sample */ 182 float32_t w = 0.0f; /* weight factor */ 183 184 float32x4_t tempV, sumV, xV, bV; 185 float32x2_t tempV2; 186 187 e = 0.0f; 188 d = 0.0f; 189 190 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ 191 /* pStateCurnt points to the location where the new input data should be written */ 192 pStateCurnt = &(S->pState[(numTaps - 1U)]); 193 194 blkCnt = blockSize; 195 196 while (blkCnt > 0U) 197 { 198 /* Copy the new input sample into the state buffer */ 199 *pStateCurnt++ = *pSrc++; 200 201 /* Initialize pState pointer */ 202 px = pState; 203 204 /* Initialize coeff pointer */ 205 pb = (pCoeffs); 206 207 /* Set the accumulator to zero */ 208 sum = 0.0f; 209 sumV = vdupq_n_f32(0.0); 210 211 /* Process 4 taps at a time. */ 212 tapCnt = numTaps >> 2; 213 214 while (tapCnt > 0U) 215 { 216 /* Perform the multiply-accumulate */ 217 xV = vld1q_f32(px); 218 bV = vld1q_f32(pb); 219 sumV = vmlaq_f32(sumV, xV, bV); 220 221 px += 4; 222 pb += 4; 223 224 /* Decrement the loop counter */ 225 tapCnt--; 226 } 227 tempV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV)); 228 sum = vget_lane_f32(tempV2, 0) + vget_lane_f32(tempV2, 1); 229 230 231 /* If the filter length is not a multiple of 4, compute the remaining filter taps */ 232 tapCnt = numTaps % 0x4U; 233 234 while (tapCnt > 0U) 235 { 236 /* Perform the multiply-accumulate */ 237 sum += (*px++) * (*pb++); 238 239 /* Decrement the loop counter */ 240 tapCnt--; 241 } 242 243 /* The result in the accumulator, store in the destination buffer. */ 244 *pOut++ = sum; 245 246 /* Compute and store error */ 247 d = (float32_t) (*pRef++); 248 e = d - sum; 249 *pErr++ = e; 250 251 /* Calculation of Weighting factor for the updating filter coefficients */ 252 w = e * mu; 253 254 /* Initialize pState pointer */ 255 px = pState; 256 257 /* Initialize coeff pointer */ 258 pb = (pCoeffs); 259 260 /* Process 4 taps at a time. */ 261 tapCnt = numTaps >> 2; 262 263 /* Update filter coefficients */ 264 while (tapCnt > 0U) 265 { 266 /* Perform the multiply-accumulate */ 267 xV = vld1q_f32(px); 268 bV = vld1q_f32(pb); 269 px += 4; 270 bV = vmlaq_n_f32(bV,xV,w); 271 272 vst1q_f32(pb,bV); 273 pb += 4; 274 275 276 /* Decrement the loop counter */ 277 tapCnt--; 278 } 279 280 /* If the filter length is not a multiple of 4, compute the remaining filter taps */ 281 tapCnt = numTaps % 0x4U; 282 283 while (tapCnt > 0U) 284 { 285 /* Perform the multiply-accumulate */ 286 *pb = *pb + (w * (*px++)); 287 pb++; 288 289 /* Decrement the loop counter */ 290 tapCnt--; 291 } 292 293 /* Advance state pointer by 1 for the next sample */ 294 pState = pState + 1; 295 296 /* Decrement the loop counter */ 297 blkCnt--; 298 } 299 300 301 /* Processing is complete. Now copy the last numTaps - 1 samples to the 302 satrt of the state buffer. This prepares the state buffer for the 303 next function call. */ 304 305 /* Points to the start of the pState buffer */ 306 pStateCurnt = S->pState; 307 308 /* Process 4 taps at a time for (numTaps - 1U) samples copy */ 309 tapCnt = (numTaps - 1U) >> 2U; 310 311 /* copy data */ 312 while (tapCnt > 0U) 313 { 314 tempV = vld1q_f32(pState); 315 vst1q_f32(pStateCurnt,tempV); 316 pState += 4; 317 pStateCurnt += 4; 318 319 /* Decrement the loop counter */ 320 tapCnt--; 321 } 322 323 /* Calculate remaining number of copies */ 324 tapCnt = (numTaps - 1U) % 0x4U; 325 326 /* Copy the remaining q31_t data */ 327 while (tapCnt > 0U) 328 { 329 *pStateCurnt++ = *pState++; 330 331 /* Decrement the loop counter */ 332 tapCnt--; 333 } 334 335 336 } 337 #else 338 void arm_lms_f32( 339 const arm_lms_instance_f32 * S, 340 const float32_t * pSrc, 341 float32_t * pRef, 342 float32_t * pOut, 343 float32_t * pErr, 344 uint32_t blockSize) 345 { 346 float32_t *pState = S->pState; /* State pointer */ 347 float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 348 float32_t *pStateCurnt; /* Points to the current sample of the state */ 349 float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ 350 float32_t mu = S->mu; /* Adaptive factor */ 351 float32_t acc, e; /* Accumulator, error */ 352 float32_t w; /* Weight factor */ 353 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ 354 uint32_t tapCnt, blkCnt; /* Loop counters */ 355 356 /* Initializations of error, difference, Coefficient update */ 357 e = 0.0f; 358 w = 0.0f; 359 360 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ 361 /* pStateCurnt points to the location where the new input data should be written */ 362 pStateCurnt = &(S->pState[(numTaps - 1U)]); 363 364 /* initialise loop count */ 365 blkCnt = blockSize; 366 367 while (blkCnt > 0U) 368 { 369 /* Copy the new input sample into the state buffer */ 370 *pStateCurnt++ = *pSrc++; 371 372 /* Initialize pState pointer */ 373 px = pState; 374 375 /* Initialize coefficient pointer */ 376 pb = pCoeffs; 377 378 /* Set the accumulator to zero */ 379 acc = 0.0f; 380 381 #if defined (ARM_MATH_LOOPUNROLL) 382 383 /* Loop unrolling: Compute 4 taps at a time. */ 384 tapCnt = numTaps >> 2U; 385 386 while (tapCnt > 0U) 387 { 388 /* Perform the multiply-accumulate */ 389 acc += (*px++) * (*pb++); 390 391 acc += (*px++) * (*pb++); 392 393 acc += (*px++) * (*pb++); 394 395 acc += (*px++) * (*pb++); 396 397 /* Decrement loop counter */ 398 tapCnt--; 399 } 400 401 /* Loop unrolling: Compute remaining taps */ 402 tapCnt = numTaps % 0x4U; 403 404 #else 405 406 /* Initialize tapCnt with number of samples */ 407 tapCnt = numTaps; 408 409 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 410 411 while (tapCnt > 0U) 412 { 413 /* Perform the multiply-accumulate */ 414 acc += (*px++) * (*pb++); 415 416 /* Decrement the loop counter */ 417 tapCnt--; 418 } 419 420 /* Store the result from accumulator into the destination buffer. */ 421 *pOut++ = acc; 422 423 /* Compute and store error */ 424 e = (float32_t) *pRef++ - acc; 425 *pErr++ = e; 426 427 /* Calculation of Weighting factor for updating filter coefficients */ 428 w = e * mu; 429 430 /* Initialize pState pointer */ 431 /* Advance state pointer by 1 for the next sample */ 432 px = pState++; 433 434 /* Initialize coefficient pointer */ 435 pb = pCoeffs; 436 437 #if defined (ARM_MATH_LOOPUNROLL) 438 439 /* Loop unrolling: Compute 4 taps at a time. */ 440 tapCnt = numTaps >> 2U; 441 442 /* Update filter coefficients */ 443 while (tapCnt > 0U) 444 { 445 /* Perform the multiply-accumulate */ 446 *pb += w * (*px++); 447 pb++; 448 449 *pb += w * (*px++); 450 pb++; 451 452 *pb += w * (*px++); 453 pb++; 454 455 *pb += w * (*px++); 456 pb++; 457 458 /* Decrement loop counter */ 459 tapCnt--; 460 } 461 462 /* Loop unrolling: Compute remaining taps */ 463 tapCnt = numTaps % 0x4U; 464 465 #else 466 467 /* Initialize tapCnt with number of samples */ 468 tapCnt = numTaps; 469 470 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 471 472 while (tapCnt > 0U) 473 { 474 /* Perform the multiply-accumulate */ 475 *pb += w * (*px++); 476 pb++; 477 478 /* Decrement loop counter */ 479 tapCnt--; 480 } 481 482 /* Decrement loop counter */ 483 blkCnt--; 484 } 485 486 /* Processing is complete. 487 Now copy the last numTaps - 1 samples to the start of the state buffer. 488 This prepares the state buffer for the next function call. */ 489 490 /* Points to the start of the pState buffer */ 491 pStateCurnt = S->pState; 492 493 /* copy data */ 494 #if defined (ARM_MATH_LOOPUNROLL) 495 496 /* Loop unrolling: Compute 4 taps at a time. */ 497 tapCnt = (numTaps - 1U) >> 2U; 498 499 while (tapCnt > 0U) 500 { 501 *pStateCurnt++ = *pState++; 502 *pStateCurnt++ = *pState++; 503 *pStateCurnt++ = *pState++; 504 *pStateCurnt++ = *pState++; 505 506 /* Decrement loop counter */ 507 tapCnt--; 508 } 509 510 /* Loop unrolling: Compute remaining taps */ 511 tapCnt = (numTaps - 1U) % 0x4U; 512 513 #else 514 515 /* Initialize tapCnt with number of samples */ 516 tapCnt = (numTaps - 1U); 517 518 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 519 520 while (tapCnt > 0U) 521 { 522 *pStateCurnt++ = *pState++; 523 524 /* Decrement loop counter */ 525 tapCnt--; 526 } 527 528 } 529 #endif /* #if defined(ARM_MATH_NEON) */ 530 531 /** 532 @} end of LMS group 533 */