/ Drivers / CMSIS / DSP / Source / SVMFunctions / arm_svm_linear_predict_f32.c
arm_svm_linear_predict_f32.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_svm_linear_predict_f32.c
  4   * Description:  SVM Linear Classifier
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/svm_functions.h"
 30  #include <limits.h>
 31  #include <math.h>
 32  
 33  
 34  /**
 35   * @addtogroup linearsvm
 36   * @{
 37   */
 38  
 39  
 40  /**
 41   * @brief SVM linear prediction
 42   * @param[in]    S          Pointer to an instance of the linear SVM structure.
 43   * @param[in]    in         Pointer to input vector
 44   * @param[out]   pResult    Predicted class: the entry of S->classes selected from the decision value
 45   * @return none.
 46   *
 47   */
 48  #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 49  
 50  #include "arm_helium_utils.h"
 51  
/*
 * Helium (MVE) variant of the linear SVM prediction, compiled when
 * ARM_MATH_MVEF is defined and ARM_MATH_AUTOVECTORIZE is not.
 *
 * The support vectors are read as a row-major matrix with
 * S->nbOfSupportVectors rows of S->vectorDimension floats each.
 * For every row the code accumulates the lane-wise products of the row with
 * the input vector, scales the accumulator by the matching dual coefficient
 * and adds the result to `sum`, which starts at S->intercept.
 * Rows are consumed 4 at a time, then 2, then 1.
 * The class written to *pResult is S->classes[STEP(sum)].
 *
 * NOTE(review): STEP and vecAddAcrossF32Mve are defined outside this file;
 * vecAddAcrossF32Mve presumably returns the sum of the four lanes - confirm
 * in arm_helium_utils.h.
 */
void arm_svm_linear_predict_f32(
    const arm_svm_linear_instance_f32 *S,
    const float32_t * in,
    int32_t * pResult)
{
        /* inlined Matrix x Vector function interleaved with dot prod */
    uint32_t        numRows = S->nbOfSupportVectors;
    uint32_t        numCols = S->vectorDimension;
    const float32_t *pSupport = S->supportVectors;
    const float32_t *pSrcA = pSupport;   /* start of the next unprocessed row */
    const float32_t *pInA0;
    const float32_t *pInA1;
    uint32_t         row;                /* rows still to process */
    uint32_t         blkCnt;     /* loop counters */
    const float32_t *pDualCoef = S->dualCoefficients; /* one coefficient consumed per row */
    float32_t       sum = S->intercept;  /* running decision value */
    row = numRows;

    /*
     * compute 4 rows in parallel
     */
    while (row >= 4) 
    {
        const float32_t *pInA2, *pInA3;
        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
        f32x4_t         vecIn, acc0, acc1, acc2, acc3;
        float32_t const *pSrcVecPtr = in;

        /*
         * Initialize the pointers to 4 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        pInA2 = pInA1 + numCols;
        pInA3 = pInA2 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f32(0.0f);
        acc1 = vdupq_n_f32(0.0f);
        acc2 = vdupq_n_f32(0.0f);
        acc3 = vdupq_n_f32(0.0f);

        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;
        pSrcA2Vec = pInA2;
        pSrcA3Vec = pInA3;

        /* Full groups of 4 columns: one input load is shared by the 4 rows */
        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t         vecA;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 4;
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vld1q(pSrcA2Vec);
            pSrcA2Vec += 4;
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vld1q(pSrcA3Vec);
            pSrcA3Vec += 4;
            acc3 = vfmaq(acc3, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            /* Predicate enabling only the first blkCnt lanes; the zeroing
               loads below leave the remaining lanes at 0 */
            mve_pred16_t    p0 = vctp32q(blkCnt);
            f32x4_t         vecA;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA2Vec, p0);
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA3Vec, p0);
            acc3 = vfmaq(acc3, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */

        /* Weight each row accumulator by its dual coefficient and fold the
           four of them into acc0 before the reduction across lanes */
        acc0 = vmulq_n_f32(acc0,*pDualCoef++);
        acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++);
        acc0 = vfmaq_n_f32(acc0,acc2,*pDualCoef++);
        acc0 = vfmaq_n_f32(acc0,acc3,*pDualCoef++);

        sum += vecAddAcrossF32Mve(acc0);

        /* Advance past the 4 rows just processed */
        pSrcA += numCols * 4;
        /*
         * Decrement the row loop counter
         */
        row -= 4;
    }

    /*
     * compute 2 rows in parallel
     */
    if (row >= 2) {
        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
        f32x4_t         vecIn, acc0, acc1;
        float32_t const *pSrcVecPtr = in;

        /*
         * Initialize the pointers to 2 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f32(0.0f);
        acc1 = vdupq_n_f32(0.0f);
        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;

        /* Full groups of 4 columns */
        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t         vecA;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 4;
            acc1 = vfmaq(acc1, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            /* Only the first blkCnt lanes are loaded; the others read as 0 */
            mve_pred16_t    p0 = vctp32q(blkCnt);
            f32x4_t         vecA;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */
        /* Weight both row accumulators by their dual coefficients */
        acc0 = vmulq_n_f32(acc0,*pDualCoef++);
        acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++);

        sum += vecAddAcrossF32Mve(acc0);


        /* Advance past the 2 rows just processed */
        pSrcA += numCols * 2;
        row -= 2;
    }

    /* At most one row is left at this point */
    if (row >= 1) {
        f32x4_t         vecIn, acc0;
        float32_t const *pSrcA0Vec, *pInVec;
        float32_t const *pSrcVecPtr = in;
        /*
         * Initialize the pointers to last MatrixA row
         */
        pInA0 = pSrcA;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f32(0.0f);

        pSrcA0Vec = pInA0;

        /* Full groups of 4 columns */
        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t         vecA;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            acc0 = vfmaq(acc0, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            /* Only the first blkCnt lanes are loaded; the others read as 0 */
            mve_pred16_t    p0 = vctp32q(blkCnt);
            f32x4_t         vecA;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */
        /* Here the dual coefficient is applied after the lane reduction */
        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0);

    }

    /* Select the class entry from the final decision value */
    *pResult = S->classes[STEP(sum)];
}
283  
284  #else
285  #if defined(ARM_MATH_NEON)
286  void arm_svm_linear_predict_f32(
287      const arm_svm_linear_instance_f32 *S,
288      const float32_t * in,
289      int32_t * pResult)
290  {
291      float32_t sum = S->intercept;
292     
293      float32_t dot;
294      float32x4_t dotV; 
295  
296      float32x4_t accuma,accumb,accumc,accumd,accum;
297      float32x2_t accum2;
298      float32x4_t vec1;
299  
300      float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
301  
302      uint32_t blkCnt;   
303      uint32_t vectorBlkCnt;   
304  
305      const float32_t *pIn = in;
306  
307      const float32_t *pSupport = S->supportVectors;
308  
309      const float32_t *pSupporta = S->supportVectors;
310      const float32_t *pSupportb;
311      const float32_t *pSupportc;
312      const float32_t *pSupportd;
313  
314      pSupportb = pSupporta + S->vectorDimension;
315      pSupportc = pSupportb + S->vectorDimension;
316      pSupportd = pSupportc + S->vectorDimension;
317  
318      const float32_t *pDualCoefs = S->dualCoefficients;
319  
320      vectorBlkCnt = S->nbOfSupportVectors >> 2;
321  
322      while (vectorBlkCnt > 0U)
323      {
324          accuma = vdupq_n_f32(0);
325          accumb = vdupq_n_f32(0);
326          accumc = vdupq_n_f32(0);
327          accumd = vdupq_n_f32(0);
328  
329          pIn = in;
330  
331          blkCnt = S->vectorDimension >> 2;
332          while (blkCnt > 0U)
333          {
334          
335              vec1 = vld1q_f32(pIn);
336              vec2a = vld1q_f32(pSupporta);
337              vec2b = vld1q_f32(pSupportb);
338              vec2c = vld1q_f32(pSupportc);
339              vec2d = vld1q_f32(pSupportd);
340  
341              pIn += 4;
342              pSupporta += 4;
343              pSupportb += 4;
344              pSupportc += 4;
345              pSupportd += 4;
346  
347              accuma = vmlaq_f32(accuma, vec1,vec2a);
348              accumb = vmlaq_f32(accumb, vec1,vec2b);
349              accumc = vmlaq_f32(accumc, vec1,vec2c);
350              accumd = vmlaq_f32(accumd, vec1,vec2d);
351  
352              blkCnt -- ;
353          }
354          accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
355          dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
356  
357          accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
358          dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
359  
360          accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
361          dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
362  
363          accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
364          dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
365  
366  
367          blkCnt = S->vectorDimension & 3;
368          while (blkCnt > 0U)
369          {
370              dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
371              dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
372              dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
373              dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
374  
375              pIn++;
376  
377              blkCnt -- ;
378          }
379  
380          vec1 = vld1q_f32(pDualCoefs);
381          pDualCoefs += 4; 
382  
383          accum = vmulq_f32(vec1,dotV);
384          accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
385          sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
386  
387          pSupporta += 3*S->vectorDimension;
388          pSupportb += 3*S->vectorDimension;
389          pSupportc += 3*S->vectorDimension;
390          pSupportd += 3*S->vectorDimension;
391  
392          vectorBlkCnt -- ;
393      }
394  
395      pSupport = pSupporta;
396      vectorBlkCnt = S->nbOfSupportVectors & 3;
397      while (vectorBlkCnt > 0U)
398      {
399          accum = vdupq_n_f32(0);
400          dot = 0.0f;
401          pIn = in;
402  
403          blkCnt = S->vectorDimension >> 2;
404          while (blkCnt > 0U)
405          {
406          
407              vec1 = vld1q_f32(pIn);
408              vec2 = vld1q_f32(pSupport);
409              pIn += 4;
410              pSupport += 4;
411  
412              accum = vmlaq_f32(accum, vec1,vec2);
413  
414              blkCnt -- ;
415          }
416          accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
417          dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
418  
419  
420          blkCnt = S->vectorDimension & 3;
421          while (blkCnt > 0U)
422          {
423              dot = dot + *pIn++ * *pSupport++;
424  
425              blkCnt -- ;
426          }
427  
428          sum += *pDualCoefs++ * dot;
429          vectorBlkCnt -- ;
430      }
431  
432      *pResult=S->classes[STEP(sum)];
433  }
434  #else
435  void arm_svm_linear_predict_f32(
436      const arm_svm_linear_instance_f32 *S,
437      const float32_t * in,
438      int32_t * pResult)
439  {
440      float32_t sum=S->intercept;
441      float32_t dot=0;
442      uint32_t i,j;
443      const float32_t *pSupport = S->supportVectors;
444  
445      for(i=0; i < S->nbOfSupportVectors; i++)
446      {
447          dot=0;
448          for(j=0; j < S->vectorDimension; j++)
449          {
450              dot = dot + in[j]* *pSupport++;
451          }
452          sum += S->dualCoefficients[i] * dot;
453      }
454      *pResult=S->classes[STEP(sum)];
455  }
456  #endif
457  #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
458  
459  /**
460   * @} end of linearsvm group
461   */