/ Drivers / CMSIS / DSP / Source / SVMFunctions / arm_svm_polynomial_predict_f16.c
arm_svm_polynomial_predict_f16.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_svm_polynomial_predict_f16.c
  4   * Description:  SVM Polynomial Classifier
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/svm_functions_f16.h"
 30  
 31  #if defined(ARM_FLOAT16_SUPPORTED)
 32  
 33  #include <limits.h>
 34  #include <math.h>
 35  
 36  
 37  /**
 38   * @addtogroup polysvm
 39   * @{
 40   */
 41  
 42  
 43  /**
 44   * @brief SVM polynomial prediction
 45   * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
 46   * @param[in]    in         Pointer to input vector
 47   * @param[out]   pResult    Decision value
 48   * @return none.
 49   *
 50   */
 51  
 52  #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
 53  
 54  #include "arm_helium_utils.h"
 55  #include "arm_vec_math_f16.h"
 56  
 57  void arm_svm_polynomial_predict_f16(
 58      const arm_svm_polynomial_instance_f16 *S,
 59      const float16_t * in,
 60      int32_t * pResult)
 61  {
 62          /* inlined Matrix x Vector function interleaved with dot prod */
 63      uint32_t        numRows = S->nbOfSupportVectors;
 64      uint32_t        numCols = S->vectorDimension;
 65      const float16_t *pSupport = S->supportVectors;
 66      const float16_t *pSrcA = pSupport;
 67      const float16_t *pInA0;
 68      const float16_t *pInA1;
 69      uint32_t         row;
 70      uint32_t         blkCnt;     /* loop counters */
 71      const float16_t *pDualCoef = S->dualCoefficients;
 72      _Float16       sum = S->intercept;
 73      f16x8_t         vSum = vdupq_n_f16(0.0f);
 74  
 75      row = numRows;
 76  
 77      /*
 78       * compute 4 rows in parrallel
 79       */
 80      while (row >= 4) {
 81          const float16_t *pInA2, *pInA3;
 82          float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
 83          f16x8_t         vecIn, acc0, acc1, acc2, acc3;
 84          float16_t const *pSrcVecPtr = in;
 85  
 86          /*
 87           * Initialize the pointers to 4 consecutive MatrixA rows
 88           */
 89          pInA0 = pSrcA;
 90          pInA1 = pInA0 + numCols;
 91          pInA2 = pInA1 + numCols;
 92          pInA3 = pInA2 + numCols;
 93          /*
 94           * Initialize the vector pointer
 95           */
 96          pInVec = pSrcVecPtr;
 97          /*
 98           * reset accumulators
 99           */
100          acc0 = vdupq_n_f16(0.0f);
101          acc1 = vdupq_n_f16(0.0f);
102          acc2 = vdupq_n_f16(0.0f);
103          acc3 = vdupq_n_f16(0.0f);
104  
105          pSrcA0Vec = pInA0;
106          pSrcA1Vec = pInA1;
107          pSrcA2Vec = pInA2;
108          pSrcA3Vec = pInA3;
109  
110          blkCnt = numCols >> 3;
111          while (blkCnt > 0U) {
112              f16x8_t         vecA;
113  
114              vecIn = vld1q(pInVec);
115              pInVec += 8;
116              vecA = vld1q(pSrcA0Vec);
117              pSrcA0Vec += 8;
118              acc0 = vfmaq(acc0, vecIn, vecA);
119              vecA = vld1q(pSrcA1Vec);
120              pSrcA1Vec += 8;
121              acc1 = vfmaq(acc1, vecIn, vecA);
122              vecA = vld1q(pSrcA2Vec);
123              pSrcA2Vec += 8;
124              acc2 = vfmaq(acc2, vecIn, vecA);
125              vecA = vld1q(pSrcA3Vec);
126              pSrcA3Vec += 8;
127              acc3 = vfmaq(acc3, vecIn, vecA);
128  
129              blkCnt--;
130          }
131          /*
132           * tail
133           * (will be merged thru tail predication)
134           */
135          blkCnt = numCols & 7;
136          if (blkCnt > 0U) {
137              mve_pred16_t    p0 = vctp16q(blkCnt);
138              f16x8_t         vecA;
139  
140              vecIn = vldrhq_z_f16(pInVec, p0);
141              vecA = vldrhq_z_f16(pSrcA0Vec, p0);
142              acc0 = vfmaq(acc0, vecIn, vecA);
143              vecA = vldrhq_z_f16(pSrcA1Vec, p0);
144              acc1 = vfmaq(acc1, vecIn, vecA);
145              vecA = vldrhq_z_f16(pSrcA2Vec, p0);
146              acc2 = vfmaq(acc2, vecIn, vecA);
147              vecA = vldrhq_z_f16(pSrcA3Vec, p0);
148              acc3 = vfmaq(acc3, vecIn, vecA);
149          }
150          /*
151           * Sum the partial parts
152           */
153          f16x8_t         vtmp = vuninitializedq_f16();
154          vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
155          vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
156          vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
157          vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
158  
159          vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
160                               arm_vec_exponent_f16
161                               (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), 
162                                  S->degree),vctp16q(4));
163          
164          pDualCoef += 4;
165  
166          pSrcA += numCols * 4;
167          /*
168           * Decrement the row loop counter
169           */
170          row -= 4;
171      }
172  
173      /*
174       * compute 2 rows in parrallel
175       */
176      if (row >= 2) {
177          float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
178          f16x8_t         vecIn, acc0, acc1;
179          float16_t const *pSrcVecPtr = in;
180  
181          /*
182           * Initialize the pointers to 2 consecutive MatrixA rows
183           */
184          pInA0 = pSrcA;
185          pInA1 = pInA0 + numCols;
186          /*
187           * Initialize the vector pointer
188           */
189          pInVec = pSrcVecPtr;
190          /*
191           * reset accumulators
192           */
193          acc0 = vdupq_n_f16(0.0f);
194          acc1 = vdupq_n_f16(0.0f);
195          pSrcA0Vec = pInA0;
196          pSrcA1Vec = pInA1;
197  
198          blkCnt = numCols >> 3;
199          while (blkCnt > 0U) {
200              f16x8_t         vecA;
201  
202              vecIn = vld1q(pInVec);
203              pInVec += 8;
204              vecA = vld1q(pSrcA0Vec);
205              pSrcA0Vec += 8;
206              acc0 = vfmaq(acc0, vecIn, vecA);
207              vecA = vld1q(pSrcA1Vec);
208              pSrcA1Vec += 8;
209              acc1 = vfmaq(acc1, vecIn, vecA);
210  
211              blkCnt--;
212          }
213          /*
214           * tail
215           * (will be merged thru tail predication)
216           */
217          blkCnt = numCols & 7;
218          if (blkCnt > 0U) {
219              mve_pred16_t    p0 = vctp16q(blkCnt);
220              f16x8_t         vecA;
221  
222              vecIn = vldrhq_z_f16(pInVec, p0);
223              vecA = vldrhq_z_f16(pSrcA0Vec, p0);
224              acc0 = vfmaq(acc0, vecIn, vecA);
225              vecA = vldrhq_z_f16(pSrcA1Vec, p0);
226              acc1 = vfmaq(acc1, vecIn, vecA);
227          }
228          /*
229           * Sum the partial parts
230           */
231          f16x8_t         vtmp = vuninitializedq_f16();
232          vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
233          vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
234  
235          vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
236                               arm_vec_exponent_f16
237                               (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree), 
238                               vctp16q(2));
239          
240          pDualCoef += 2;
241          pSrcA += numCols * 2;
242          row -= 2;
243      }
244  
245      if (row >= 1) {
246          f16x8_t         vecIn, acc0;
247          float16_t const *pSrcA0Vec, *pInVec;
248          float16_t const *pSrcVecPtr = in;
249          /*
250           * Initialize the pointers to last MatrixA row
251           */
252          pInA0 = pSrcA;
253          /*
254           * Initialize the vector pointer
255           */
256          pInVec = pSrcVecPtr;
257          /*
258           * reset accumulators
259           */
260          acc0 = vdupq_n_f16(0.0f);
261  
262          pSrcA0Vec = pInA0;
263  
264          blkCnt = numCols >> 3;
265          while (blkCnt > 0U) {
266              f16x8_t         vecA;
267  
268              vecIn = vld1q(pInVec);
269              pInVec += 8;
270              vecA = vld1q(pSrcA0Vec);
271              pSrcA0Vec += 8;
272              acc0 = vfmaq(acc0, vecIn, vecA);
273  
274              blkCnt--;
275          }
276          /*
277           * tail
278           * (will be merged thru tail predication)
279           */
280          blkCnt = numCols & 7;
281          if (blkCnt > 0U) {
282              mve_pred16_t    p0 = vctp16q(blkCnt);
283              f16x8_t         vecA;
284  
285              vecIn = vldrhq_z_f16(pInVec, p0);
286              vecA = vldrhq_z_f16(pSrcA0Vec, p0);
287              acc0 = vfmaq(acc0, vecIn, vecA);
288          }
289          /*
290           * Sum the partial parts
291           */
292          f16x8_t         vtmp = vuninitializedq_f16();
293          vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
294          vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
295                               arm_vec_exponent_f16
296                               (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree), 
297                               vctp16q(1));
298      }
299      sum += (_Float16)vecAddAcrossF16Mve(vSum);
300  
301      
302      *pResult = S->classes[STEP(sum)];
303  }
304  
305  #else
306  void arm_svm_polynomial_predict_f16(
307      const arm_svm_polynomial_instance_f16 *S,
308      const float16_t * in,
309      int32_t * pResult)
310  {
311      _Float16 sum=S->intercept;
312      _Float16 dot=0;
313      uint32_t i,j;
314      const float16_t *pSupport = S->supportVectors;
315  
316      for(i=0; i < S->nbOfSupportVectors; i++)
317      {
318          dot=0;
319          for(j=0; j < S->vectorDimension; j++)
320          {
321              dot = dot + (_Float16)in[j]* (_Float16)*pSupport++;
322          }
323          sum += S->dualCoefficients[i] * (_Float16)arm_exponent_f16(S->gamma * dot + S->coef0, S->degree);
324      }
325  
326      *pResult=S->classes[STEP(sum)];
327  }
 328  #endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
329  
330  
331  /**
332   * @} end of polysvm group
333   */
334  
335  #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
336