/ Drivers / CMSIS / DSP / Source / SVMFunctions / arm_svm_polynomial_predict_f32.c
arm_svm_polynomial_predict_f32.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_svm_polynomial_predict_f32.c
  4   * Description:  SVM Polynomial Classifier
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/svm_functions.h"
 30  #include <limits.h>
 31  #include <math.h>
 32  
 33  #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
 34  #include "arm_vec_math.h"
 35  #endif
 36  
 37  /**
 38   * @addtogroup polysvm
 39   * @{
 40   */
 41  
 42  
 43  /**
 44   * @brief SVM polynomial prediction
 45   * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
 46   * @param[in]    in         Pointer to input vector
 47   * @param[out]   pResult    Decision value
 48   * @return none.
 49   *
 50   */
 51  
 52  #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 53  
 54  #include "arm_helium_utils.h"
 55  #include "arm_vec_math.h"
 56  
 57  void arm_svm_polynomial_predict_f32(
 58      const arm_svm_polynomial_instance_f32 *S,
 59      const float32_t * in,
 60      int32_t * pResult)
 61  {
 62          /* inlined Matrix x Vector function interleaved with dot prod */
 63      uint32_t        numRows = S->nbOfSupportVectors;
 64      uint32_t        numCols = S->vectorDimension;
 65      const float32_t *pSupport = S->supportVectors;
 66      const float32_t *pSrcA = pSupport;
 67      const float32_t *pInA0;
 68      const float32_t *pInA1;
 69      uint32_t         row;
 70      uint32_t         blkCnt;     /* loop counters */
 71      const float32_t *pDualCoef = S->dualCoefficients;
 72      float32_t       sum = S->intercept;
 73      f32x4_t         vSum = vdupq_n_f32(0.0f);
 74  
 75      row = numRows;
 76  
 77      /*
 78       * compute 4 rows in parrallel
 79       */
 80      while (row >= 4) {
 81          const float32_t *pInA2, *pInA3;
 82          float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
 83          f32x4_t         vecIn, acc0, acc1, acc2, acc3;
 84          float32_t const *pSrcVecPtr = in;
 85  
 86          /*
 87           * Initialize the pointers to 4 consecutive MatrixA rows
 88           */
 89          pInA0 = pSrcA;
 90          pInA1 = pInA0 + numCols;
 91          pInA2 = pInA1 + numCols;
 92          pInA3 = pInA2 + numCols;
 93          /*
 94           * Initialize the vector pointer
 95           */
 96          pInVec = pSrcVecPtr;
 97          /*
 98           * reset accumulators
 99           */
100          acc0 = vdupq_n_f32(0.0f);
101          acc1 = vdupq_n_f32(0.0f);
102          acc2 = vdupq_n_f32(0.0f);
103          acc3 = vdupq_n_f32(0.0f);
104  
105          pSrcA0Vec = pInA0;
106          pSrcA1Vec = pInA1;
107          pSrcA2Vec = pInA2;
108          pSrcA3Vec = pInA3;
109  
110          blkCnt = numCols >> 2;
111          while (blkCnt > 0U) {
112              f32x4_t         vecA;
113  
114              vecIn = vld1q(pInVec);
115              pInVec += 4;
116              vecA = vld1q(pSrcA0Vec);
117              pSrcA0Vec += 4;
118              acc0 = vfmaq(acc0, vecIn, vecA);
119              vecA = vld1q(pSrcA1Vec);
120              pSrcA1Vec += 4;
121              acc1 = vfmaq(acc1, vecIn, vecA);
122              vecA = vld1q(pSrcA2Vec);
123              pSrcA2Vec += 4;
124              acc2 = vfmaq(acc2, vecIn, vecA);
125              vecA = vld1q(pSrcA3Vec);
126              pSrcA3Vec += 4;
127              acc3 = vfmaq(acc3, vecIn, vecA);
128  
129              blkCnt--;
130          }
131          /*
132           * tail
133           * (will be merged thru tail predication)
134           */
135          blkCnt = numCols & 3;
136          if (blkCnt > 0U) {
137              mve_pred16_t    p0 = vctp32q(blkCnt);
138              f32x4_t         vecA;
139  
140              vecIn = vldrwq_z_f32(pInVec, p0);
141              vecA = vldrwq_z_f32(pSrcA0Vec, p0);
142              acc0 = vfmaq(acc0, vecIn, vecA);
143              vecA = vldrwq_z_f32(pSrcA1Vec, p0);
144              acc1 = vfmaq(acc1, vecIn, vecA);
145              vecA = vldrwq_z_f32(pSrcA2Vec, p0);
146              acc2 = vfmaq(acc2, vecIn, vecA);
147              vecA = vldrwq_z_f32(pSrcA3Vec, p0);
148              acc3 = vfmaq(acc3, vecIn, vecA);
149          }
150          /*
151           * Sum the partial parts
152           */
153          f32x4_t         vtmp = vuninitializedq_f32();
154          vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
155          vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
156          vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
157          vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
158  
159          vSum = vfmaq_f32(vSum, vld1q(pDualCoef),
160                               arm_vec_exponent_f32
161                               (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree));
162          
163          pDualCoef += 4;
164  
165          pSrcA += numCols * 4;
166          /*
167           * Decrement the row loop counter
168           */
169          row -= 4;
170      }
171  
172      /*
173       * compute 2 rows in parrallel
174       */
175      if (row >= 2) {
176          float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
177          f32x4_t         vecIn, acc0, acc1;
178          float32_t const *pSrcVecPtr = in;
179  
180          /*
181           * Initialize the pointers to 2 consecutive MatrixA rows
182           */
183          pInA0 = pSrcA;
184          pInA1 = pInA0 + numCols;
185          /*
186           * Initialize the vector pointer
187           */
188          pInVec = pSrcVecPtr;
189          /*
190           * reset accumulators
191           */
192          acc0 = vdupq_n_f32(0.0f);
193          acc1 = vdupq_n_f32(0.0f);
194          pSrcA0Vec = pInA0;
195          pSrcA1Vec = pInA1;
196  
197          blkCnt = numCols >> 2;
198          while (blkCnt > 0U) {
199              f32x4_t         vecA;
200  
201              vecIn = vld1q(pInVec);
202              pInVec += 4;
203              vecA = vld1q(pSrcA0Vec);
204              pSrcA0Vec += 4;
205              acc0 = vfmaq(acc0, vecIn, vecA);
206              vecA = vld1q(pSrcA1Vec);
207              pSrcA1Vec += 4;
208              acc1 = vfmaq(acc1, vecIn, vecA);
209  
210              blkCnt--;
211          }
212          /*
213           * tail
214           * (will be merged thru tail predication)
215           */
216          blkCnt = numCols & 3;
217          if (blkCnt > 0U) {
218              mve_pred16_t    p0 = vctp32q(blkCnt);
219              f32x4_t         vecA;
220  
221              vecIn = vldrwq_z_f32(pInVec, p0);
222              vecA = vldrwq_z_f32(pSrcA0Vec, p0);
223              acc0 = vfmaq(acc0, vecIn, vecA);
224              vecA = vldrwq_z_f32(pSrcA1Vec, p0);
225              acc1 = vfmaq(acc1, vecIn, vecA);
226          }
227          /*
228           * Sum the partial parts
229           */
230          f32x4_t         vtmp = vuninitializedq_f32();
231          vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
232          vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
233  
234          vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
235                               arm_vec_exponent_f32
236                               (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree), 
237                               vctp32q(2));
238          
239          pDualCoef += 2;
240          pSrcA += numCols * 2;
241          row -= 2;
242      }
243  
244      if (row >= 1) {
245          f32x4_t         vecIn, acc0;
246          float32_t const *pSrcA0Vec, *pInVec;
247          float32_t const *pSrcVecPtr = in;
248          /*
249           * Initialize the pointers to last MatrixA row
250           */
251          pInA0 = pSrcA;
252          /*
253           * Initialize the vector pointer
254           */
255          pInVec = pSrcVecPtr;
256          /*
257           * reset accumulators
258           */
259          acc0 = vdupq_n_f32(0.0f);
260  
261          pSrcA0Vec = pInA0;
262  
263          blkCnt = numCols >> 2;
264          while (blkCnt > 0U) {
265              f32x4_t         vecA;
266  
267              vecIn = vld1q(pInVec);
268              pInVec += 4;
269              vecA = vld1q(pSrcA0Vec);
270              pSrcA0Vec += 4;
271              acc0 = vfmaq(acc0, vecIn, vecA);
272  
273              blkCnt--;
274          }
275          /*
276           * tail
277           * (will be merged thru tail predication)
278           */
279          blkCnt = numCols & 3;
280          if (blkCnt > 0U) {
281              mve_pred16_t    p0 = vctp32q(blkCnt);
282              f32x4_t         vecA;
283  
284              vecIn = vldrwq_z_f32(pInVec, p0);
285              vecA = vldrwq_z_f32(pSrcA0Vec, p0);
286              acc0 = vfmaq(acc0, vecIn, vecA);
287          }
288          /*
289           * Sum the partial parts
290           */
291          f32x4_t         vtmp = vuninitializedq_f32();
292          vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
293          vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
294                               arm_vec_exponent_f32
295                               (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree), 
296                               vctp32q(1));
297      }
298      sum += vecAddAcrossF32Mve(vSum);
299  
300      
301      *pResult = S->classes[STEP(sum)];
302  }
303  
304  #else
305  #if defined(ARM_MATH_NEON)
306  void arm_svm_polynomial_predict_f32(
307      const arm_svm_polynomial_instance_f32 *S,
308      const float32_t * in,
309      int32_t * pResult)
310  {
311      float32_t sum = S->intercept;
312     
313      float32_t dot;
314      float32x4_t dotV; 
315  
316      float32x4_t accuma,accumb,accumc,accumd,accum;
317      float32x2_t accum2;
318      float32x4_t vec1;
319      float32x4_t coef0 = vdupq_n_f32(S->coef0);
320  
321      float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
322  
323      uint32_t blkCnt;   
324      uint32_t vectorBlkCnt;   
325  
326      const float32_t *pIn = in;
327  
328      const float32_t *pSupport = S->supportVectors;
329  
330      const float32_t *pSupporta = S->supportVectors;
331      const float32_t *pSupportb;
332      const float32_t *pSupportc;
333      const float32_t *pSupportd;
334  
335      pSupportb = pSupporta + S->vectorDimension;
336      pSupportc = pSupportb + S->vectorDimension;
337      pSupportd = pSupportc + S->vectorDimension;
338  
339      const float32_t *pDualCoefs = S->dualCoefficients;
340  
341      vectorBlkCnt = S->nbOfSupportVectors >> 2;
342      while (vectorBlkCnt > 0U)
343      {
344          accuma = vdupq_n_f32(0);
345          accumb = vdupq_n_f32(0);
346          accumc = vdupq_n_f32(0);
347          accumd = vdupq_n_f32(0);
348  
349          pIn = in;
350  
351          blkCnt = S->vectorDimension >> 2;
352          while (blkCnt > 0U)
353          {
354          
355              vec1 = vld1q_f32(pIn);
356              vec2a = vld1q_f32(pSupporta);
357              vec2b = vld1q_f32(pSupportb);
358              vec2c = vld1q_f32(pSupportc);
359              vec2d = vld1q_f32(pSupportd);
360  
361              pIn += 4;
362              pSupporta += 4;
363              pSupportb += 4;
364              pSupportc += 4;
365              pSupportd += 4;
366  
367              accuma = vmlaq_f32(accuma, vec1,vec2a);
368              accumb = vmlaq_f32(accumb, vec1,vec2b);
369              accumc = vmlaq_f32(accumc, vec1,vec2c);
370              accumd = vmlaq_f32(accumd, vec1,vec2d);
371  
372              blkCnt -- ;
373          }
374          accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
375          dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
376  
377          accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
378          dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
379  
380          accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
381          dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
382  
383          accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
384          dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
385  
386  
387          blkCnt = S->vectorDimension & 3;
388          while (blkCnt > 0U)
389          {
390              dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
391              dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
392              dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
393              dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
394  
395              pIn++;
396  
397              blkCnt -- ;
398          }
399  
400          vec1 = vld1q_f32(pDualCoefs);
401          pDualCoefs += 4; 
402  
403          // To vectorize later
404          dotV = vmulq_n_f32(dotV, S->gamma);
405          dotV = vaddq_f32(dotV, coef0);
406  
407          dotV = arm_vec_exponent_f32(dotV,S->degree);
408  
409          accum = vmulq_f32(vec1,dotV);
410          accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
411          sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
412  
413          pSupporta += 3*S->vectorDimension;
414          pSupportb += 3*S->vectorDimension;
415          pSupportc += 3*S->vectorDimension;
416          pSupportd += 3*S->vectorDimension;
417  
418          vectorBlkCnt -- ;
419      }
420  
421      pSupport = pSupporta;
422      vectorBlkCnt = S->nbOfSupportVectors & 3;
423  
424      while (vectorBlkCnt > 0U)
425      {
426          accum = vdupq_n_f32(0);
427          dot = 0.0f;
428          pIn = in;
429  
430          blkCnt = S->vectorDimension >> 2;
431          while (blkCnt > 0U)
432          {
433          
434              vec1 = vld1q_f32(pIn);
435              vec2 = vld1q_f32(pSupport);
436              pIn += 4;
437              pSupport += 4;
438  
439              accum = vmlaq_f32(accum, vec1,vec2);
440  
441              blkCnt -- ;
442          }
443          accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
444          dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
445  
446  
447          blkCnt = S->vectorDimension & 3;
448          while (blkCnt > 0U)
449          {
450              dot = dot + *pIn++ * *pSupport++;
451  
452              blkCnt -- ;
453          }
454  
455          sum += *pDualCoefs++ * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree);
456          vectorBlkCnt -- ;
457      }
458  
459      *pResult=S->classes[STEP(sum)];
460  }
461  #else
462  void arm_svm_polynomial_predict_f32(
463      const arm_svm_polynomial_instance_f32 *S,
464      const float32_t * in,
465      int32_t * pResult)
466  {
467      float32_t sum=S->intercept;
468      float32_t dot=0;
469      uint32_t i,j;
470      const float32_t *pSupport = S->supportVectors;
471  
472      for(i=0; i < S->nbOfSupportVectors; i++)
473      {
474          dot=0;
475          for(j=0; j < S->vectorDimension; j++)
476          {
477              dot = dot + in[j]* *pSupport++;
478          }
479          sum += S->dualCoefficients[i] * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree);
480      }
481  
482      *pResult=S->classes[STEP(sum)];
483  }
484  #endif
485  #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
486  
487  
488  /**
489   * @} end of polysvm group
490   */