/ Drivers / CMSIS / DSP / Source / FilteringFunctions / arm_lms_f32.c
arm_lms_f32.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_lms_f32.c
  4   * Description:  Processing function for the floating-point LMS filter
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/filtering_functions.h"
 30  
 31  /**
 32    @ingroup groupFilters
 33   */
 34  
 35  /**
 36    @defgroup LMS Least Mean Square (LMS) Filters
 37  
 38    LMS filters are a class of adaptive filters that are able to "learn" an unknown transfer functions.
 39    LMS filters use a gradient descent method in which the filter coefficients are updated based on the instantaneous error signal.
 40    Adaptive filters are often used in communication systems, equalizers, and noise removal.
 41    The CMSIS DSP Library contains LMS filter functions that operate on Q15, Q31, and floating-point data types.
 42    The library also contains normalized LMS filters in which the filter coefficient adaptation is indepedent of the level of the input signal.
 43  
 44    An LMS filter consists of two components as shown below.
 45    The first component is a standard transversal or FIR filter.
 46    The second component is a coefficient update mechanism.
 47    The LMS filter has two input signals.
 48    The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter.
 49    That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input.
 50    The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input.
 51    This "error signal" tends towards zero as the filter adapts.
 52    The LMS processing functions accept the input and reference input signals and generate the filter output and error signal.
 53    \image html LMS.gif "Internal structure of the Least Mean Square filter"
 54  
 55    The functions operate on blocks of data and each call to the function processes
 56    <code>blockSize</code> samples through the filter.
 57    <code>pSrc</code> points to input signal, <code>pRef</code> points to reference signal,
 58    <code>pOut</code> points to output signal and <code>pErr</code> points to error signal.
 59    All arrays contain <code>blockSize</code> values.
 60  
 61    The functions operate on a block-by-block basis.
 62    Internally, the filter coefficients <code>b[n]</code> are updated on a sample-by-sample basis.
 63    The convergence of the LMS filter is slower compared to the normalized LMS algorithm.
 64  
 65    @par           Algorithm
 66                     The output signal <code>y[n]</code> is computed by a standard FIR filter:
 67    <pre>
 68        y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
 69    </pre>
 70  
 71    @par
 72                     The error signal equals the difference between the reference signal <code>d[n]</code> and the filter output:
 73    <pre>
 74        e[n] = d[n] - y[n].
 75    </pre>
 76  
 77    @par
 78                     After each sample of the error signal is computed, the filter coefficients <code>b[k]</code> are updated on a sample-by-sample basis:
 79    <pre>
 80        b[k] = b[k] + e[n] * mu * x[n-k],  for k=0, 1, ..., numTaps-1
 81    </pre>
 82                     where <code>mu</code> is the step size and controls the rate of coefficient convergence.
 83    @par
 84                     In the APIs, <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
 85                     Coefficients are stored in time reversed order.
 86    @par
 87    <pre>
 88       {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
 89    </pre>
 90    @par
 91                     <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
 92                     Samples in the state buffer are stored in the order:
 93    @par
 94    <pre>
 95       {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
 96    </pre>
 97    @par
 98                     Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code> samples.
 99                     The increased state buffer length allows circular addressing, which is traditionally used in FIR filters,
100                     to be avoided and yields a significant speed improvement.
101                     The state variables are updated after each block of data is processed.
102    @par           Instance Structure
103                     The coefficients and state variables for a filter are stored together in an instance data structure.
104                     A separate instance structure must be defined for each filter and
105                     coefficient and state arrays cannot be shared among instances.
106                     There are separate instance structure declarations for each of the 3 supported data types.
107  
108    @par           Initialization Functions
109                     There is also an associated initialization function for each data type.
110                     The initialization function performs the following operations:
111                     - Sets the values of the internal structure fields.
112                     - Zeros out the values in the state buffer.
113                     To do this manually without calling the init function, assign the follow subfields of the instance structure:
114                     numTaps, pCoeffs, mu, postShift (not for f32), pState. Also set all of the values in pState to zero.
115  
116    @par
117                   Use of the initialization function is optional.
118                   However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
119                   To place an instance structure into a const data section, the instance structure must be manually initialized.
120                   Set the values in the state buffer to zeros before static initialization.
121                   The code below statically initializes each of the 3 different data type filter instance structures
122    <pre>
123       arm_lms_instance_f32 S = {numTaps, pState, pCoeffs, mu};
124       arm_lms_instance_q31 S = {numTaps, pState, pCoeffs, mu, postShift};
125       arm_lms_instance_q15 S = {numTaps, pState, pCoeffs, mu, postShift};
126    </pre>
127                   where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer;
128                   <code>pCoeffs</code> is the address of the coefficient buffer; <code>mu</code> is the step size parameter; and <code>postShift</code> is the shift applied to coefficients.
129  
130    @par           Fixed-Point Behavior
131                     Care must be taken when using the Q15 and Q31 versions of the LMS filter.
132                     The following issues must be considered:
133                     - Scaling of coefficients
134                     - Overflow and saturation
135  
136    @par           Scaling of Coefficients
137                     Filter coefficients are represented as fractional values and
138                     coefficients are restricted to lie in the range <code>[-1 +1)</code>.
139                     The fixed-point functions have an additional scaling parameter <code>postShift</code>.
140                     At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
141                     This essentially scales the filter coefficients by <code>2^postShift</code> and
142                     allows the filter coefficients to exceed the range <code>[+1 -1)</code>.
143                     The value of <code>postShift</code> is set by the user based on the expected gain through the system being modeled.
144  
145    @par           Overflow and Saturation
146                     Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are
147                     described separately as part of the function specific documentation below.
148   */
149  
150  /**
151    @addtogroup LMS
152    @{
153   */
154  
155  /**
156    @brief         Processing function for floating-point LMS filter.
157    @param[in]     S          points to an instance of the floating-point LMS filter structure
158    @param[in]     pSrc       points to the block of input data
159    @param[in]     pRef       points to the block of reference data
160    @param[out]    pOut       points to the block of output data
161    @param[out]    pErr       points to the block of error data
162    @param[in]     blockSize  number of samples to process
163    @return        none
164   */
165  #if defined(ARM_MATH_NEON)
166  void arm_lms_f32(
167    const arm_lms_instance_f32 * S,
168    const float32_t * pSrc,
169    float32_t * pRef,
170    float32_t * pOut,
171    float32_t * pErr,
172    uint32_t blockSize)
173  {
174    float32_t *pState = S->pState;                 /* State pointer */
175    float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
176    float32_t *pStateCurnt;                        /* Points to the current sample of the state */
177    float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
178    float32_t mu = S->mu;                          /* Adaptive factor */
179    uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
180    uint32_t tapCnt, blkCnt;                       /* Loop counters */
181    float32_t sum, e, d;                           /* accumulator, error, reference data sample */
182    float32_t w = 0.0f;                            /* weight factor */
183  
184    float32x4_t tempV, sumV, xV, bV;
185    float32x2_t tempV2;
186  
187    e = 0.0f;
188    d = 0.0f;
189  
190    /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
191    /* pStateCurnt points to the location where the new input data should be written */
192    pStateCurnt = &(S->pState[(numTaps - 1U)]);
193  
194    blkCnt = blockSize;
195  
196    while (blkCnt > 0U)
197    {
198      /* Copy the new input sample into the state buffer */
199      *pStateCurnt++ = *pSrc++;
200  
201      /* Initialize pState pointer */
202      px = pState;
203  
204      /* Initialize coeff pointer */
205      pb = (pCoeffs);
206  
207      /* Set the accumulator to zero */
208      sum = 0.0f;
209      sumV = vdupq_n_f32(0.0);
210  
211      /* Process 4 taps at a time. */
212      tapCnt = numTaps >> 2;
213  
214      while (tapCnt > 0U)
215      {
216        /* Perform the multiply-accumulate */
217        xV = vld1q_f32(px);
218        bV = vld1q_f32(pb);
219        sumV = vmlaq_f32(sumV, xV, bV);
220  
221        px += 4; 
222        pb += 4;
223  
224        /* Decrement the loop counter */
225        tapCnt--;
226      }
227      tempV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
228      sum = vget_lane_f32(tempV2, 0) + vget_lane_f32(tempV2, 1);
229  
230  
231      /* If the filter length is not a multiple of 4, compute the remaining filter taps */
232      tapCnt = numTaps % 0x4U;
233  
234      while (tapCnt > 0U)
235      {
236        /* Perform the multiply-accumulate */
237        sum += (*px++) * (*pb++);
238  
239        /* Decrement the loop counter */
240        tapCnt--;
241      }
242  
243      /* The result in the accumulator, store in the destination buffer. */
244      *pOut++ = sum;
245  
246      /* Compute and store error */
247      d = (float32_t) (*pRef++);
248      e = d - sum;
249      *pErr++ = e;
250  
251      /* Calculation of Weighting factor for the updating filter coefficients */
252      w = e * mu;
253  
254      /* Initialize pState pointer */
255      px = pState;
256  
257      /* Initialize coeff pointer */
258      pb = (pCoeffs);
259  
260      /* Process 4 taps at a time. */
261      tapCnt = numTaps >> 2;
262  
263      /* Update filter coefficients */
264      while (tapCnt > 0U)
265      {
266        /* Perform the multiply-accumulate */
267        xV = vld1q_f32(px);
268        bV = vld1q_f32(pb);
269        px += 4;
270        bV = vmlaq_n_f32(bV,xV,w);
271  
272        vst1q_f32(pb,bV); 
273        pb += 4;
274  
275  
276        /* Decrement the loop counter */
277        tapCnt--;
278      }
279  
280      /* If the filter length is not a multiple of 4, compute the remaining filter taps */
281      tapCnt = numTaps % 0x4U;
282  
283      while (tapCnt > 0U)
284      {
285        /* Perform the multiply-accumulate */
286        *pb = *pb + (w * (*px++));
287        pb++;
288  
289        /* Decrement the loop counter */
290        tapCnt--;
291      }
292  
293      /* Advance state pointer by 1 for the next sample */
294      pState = pState + 1;
295  
296      /* Decrement the loop counter */
297      blkCnt--;
298    }
299  
300  
301    /* Processing is complete. Now copy the last numTaps - 1 samples to the
302       satrt of the state buffer. This prepares the state buffer for the
303       next function call. */
304  
305    /* Points to the start of the pState buffer */
306    pStateCurnt = S->pState;
307  
308    /* Process 4 taps at a time for (numTaps - 1U) samples copy */
309    tapCnt = (numTaps - 1U) >> 2U;
310  
311    /* copy data */
312    while (tapCnt > 0U)
313    {
314      tempV = vld1q_f32(pState);
315      vst1q_f32(pStateCurnt,tempV); 
316      pState += 4;
317      pStateCurnt += 4;
318  
319      /* Decrement the loop counter */
320      tapCnt--;
321    }
322  
323    /* Calculate remaining number of copies */
324    tapCnt = (numTaps - 1U) % 0x4U;
325  
326    /* Copy the remaining q31_t data */
327    while (tapCnt > 0U)
328    {
329      *pStateCurnt++ = *pState++;
330  
331      /* Decrement the loop counter */
332      tapCnt--;
333    }
334  
335  
336  }
337  #else
338  void arm_lms_f32(
339    const arm_lms_instance_f32 * S,
340    const float32_t * pSrc,
341          float32_t * pRef,
342          float32_t * pOut,
343          float32_t * pErr,
344          uint32_t blockSize)
345  {       
346          float32_t *pState = S->pState;                 /* State pointer */
347          float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
348          float32_t *pStateCurnt;                        /* Points to the current sample of the state */
349          float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
350          float32_t mu = S->mu;                          /* Adaptive factor */
351          float32_t acc, e;                              /* Accumulator, error */
352          float32_t w;                                   /* Weight factor */
353          uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
354          uint32_t tapCnt, blkCnt;                       /* Loop counters */
355  
356    /* Initializations of error,  difference, Coefficient update */
357    e = 0.0f;
358    w = 0.0f;
359  
360    /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
361    /* pStateCurnt points to the location where the new input data should be written */
362    pStateCurnt = &(S->pState[(numTaps - 1U)]);
363  
364    /* initialise loop count */
365    blkCnt = blockSize;
366  
367    while (blkCnt > 0U)
368    {
369      /* Copy the new input sample into the state buffer */
370      *pStateCurnt++ = *pSrc++;
371  
372      /* Initialize pState pointer */
373      px = pState;
374  
375      /* Initialize coefficient pointer */
376      pb = pCoeffs;
377  
378      /* Set the accumulator to zero */
379      acc = 0.0f;
380  
381  #if defined (ARM_MATH_LOOPUNROLL)
382  
383      /* Loop unrolling: Compute 4 taps at a time. */
384      tapCnt = numTaps >> 2U;
385  
386      while (tapCnt > 0U)
387      {
388        /* Perform the multiply-accumulate */
389        acc += (*px++) * (*pb++);
390  
391        acc += (*px++) * (*pb++);
392  
393        acc += (*px++) * (*pb++);
394  
395        acc += (*px++) * (*pb++);
396  
397        /* Decrement loop counter */
398        tapCnt--;
399      }
400  
401      /* Loop unrolling: Compute remaining taps */
402      tapCnt = numTaps % 0x4U;
403  
404  #else
405  
406      /* Initialize tapCnt with number of samples */
407      tapCnt = numTaps;
408  
409  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
410  
411      while (tapCnt > 0U)
412      {
413        /* Perform the multiply-accumulate */
414        acc += (*px++) * (*pb++);
415  
416        /* Decrement the loop counter */
417        tapCnt--;
418      }
419  
420      /* Store the result from accumulator into the destination buffer. */
421      *pOut++ = acc;
422  
423      /* Compute and store error */
424      e = (float32_t) *pRef++ - acc;
425      *pErr++ = e;
426  
427      /* Calculation of Weighting factor for updating filter coefficients */
428      w = e * mu;
429  
430      /* Initialize pState pointer */
431      /* Advance state pointer by 1 for the next sample */
432      px = pState++;
433  
434      /* Initialize coefficient pointer */
435      pb = pCoeffs;
436  
437  #if defined (ARM_MATH_LOOPUNROLL)
438  
439      /* Loop unrolling: Compute 4 taps at a time. */
440      tapCnt = numTaps >> 2U;
441  
442      /* Update filter coefficients */
443      while (tapCnt > 0U)
444      {
445        /* Perform the multiply-accumulate */
446        *pb += w * (*px++);
447        pb++;
448  
449        *pb += w * (*px++);
450        pb++;
451  
452        *pb += w * (*px++);
453        pb++;
454  
455        *pb += w * (*px++);
456        pb++;
457  
458        /* Decrement loop counter */
459        tapCnt--;
460      }
461  
462      /* Loop unrolling: Compute remaining taps */
463      tapCnt = numTaps % 0x4U;
464  
465  #else
466  
467      /* Initialize tapCnt with number of samples */
468      tapCnt = numTaps;
469  
470  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
471  
472      while (tapCnt > 0U)
473      {
474        /* Perform the multiply-accumulate */
475        *pb += w * (*px++);
476        pb++;
477  
478        /* Decrement loop counter */
479        tapCnt--;
480      }
481  
482      /* Decrement loop counter */
483      blkCnt--;
484    }
485  
486    /* Processing is complete.
487       Now copy the last numTaps - 1 samples to the start of the state buffer.
488       This prepares the state buffer for the next function call. */
489  
490    /* Points to the start of the pState buffer */
491    pStateCurnt = S->pState;
492  
493    /* copy data */
494  #if defined (ARM_MATH_LOOPUNROLL)
495  
496    /* Loop unrolling: Compute 4 taps at a time. */
497    tapCnt = (numTaps - 1U) >> 2U;
498  
499    while (tapCnt > 0U)
500    {
501      *pStateCurnt++ = *pState++;
502      *pStateCurnt++ = *pState++;
503      *pStateCurnt++ = *pState++;
504      *pStateCurnt++ = *pState++;
505  
506      /* Decrement loop counter */
507      tapCnt--;
508    }
509  
510    /* Loop unrolling: Compute remaining taps */
511    tapCnt = (numTaps - 1U) % 0x4U;
512  
513  #else
514  
515    /* Initialize tapCnt with number of samples */
516    tapCnt = (numTaps - 1U);
517  
518  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
519  
520    while (tapCnt > 0U)
521    {
522      *pStateCurnt++ = *pState++;
523  
524      /* Decrement loop counter */
525      tapCnt--;
526    }
527  
528  }
529  #endif /* #if defined(ARM_MATH_NEON) */
530  
531  /**
532    @} end of LMS group
533   */