/ Drivers / CMSIS / DSP / Source / SVMFunctions / arm_svm_linear_predict_f16.c
arm_svm_linear_predict_f16.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_svm_linear_predict_f16.c
  4   * Description:  SVM Linear Classifier
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/svm_functions_f16.h"
 30  
 31  #if defined(ARM_FLOAT16_SUPPORTED)
 32  
 33  #include <limits.h>
 34  #include <math.h>
 35  
 36  
 37  /**
 38   * @addtogroup linearsvm
 39   * @{
 40   */
 41  
 42  
 43  /**
 44   * @brief SVM linear prediction
 45   * @param[in]    S          Pointer to an instance of the linear SVM structure.
 46   * @param[in]    in         Pointer to input vector
 47   * @param[out]   pResult    Predicted class (the entry of S->classes selected by the decision value)
 48   * @return none.
 49   *
 50   */
 51  #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
 52  
 53  #include "arm_helium_utils.h"
 54  
 55  void arm_svm_linear_predict_f16(
 56      const arm_svm_linear_instance_f16 *S,
 57      const float16_t * in,
 58      int32_t * pResult)
 59  {
 60          /* inlined Matrix x Vector function interleaved with dot prod */
 61      uint32_t        numRows = S->nbOfSupportVectors;
 62      uint32_t        numCols = S->vectorDimension;
 63      const float16_t *pSupport = S->supportVectors;
 64      const float16_t *pSrcA = pSupport;
 65      const float16_t *pInA0;
 66      const float16_t *pInA1;
 67      uint32_t         row;
 68      uint32_t         blkCnt;     /* loop counters */
 69      const float16_t *pDualCoef = S->dualCoefficients;
 70      _Float16       sum = S->intercept;
 71      row = numRows;
 72  
 73      /*
 74       * compute 4 rows in parrallel
 75       */
 76      while (row >= 4) 
 77      {
 78          const float16_t *pInA2, *pInA3;
 79          float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
 80          f16x8_t         vecIn, acc0, acc1, acc2, acc3;
 81          float16_t const *pSrcVecPtr = in;
 82  
 83          /*
 84           * Initialize the pointers to 4 consecutive MatrixA rows
 85           */
 86          pInA0 = pSrcA;
 87          pInA1 = pInA0 + numCols;
 88          pInA2 = pInA1 + numCols;
 89          pInA3 = pInA2 + numCols;
 90          /*
 91           * Initialize the vector pointer
 92           */
 93          pInVec = pSrcVecPtr;
 94          /*
 95           * reset accumulators
 96           */
 97          acc0 = vdupq_n_f16(0.0f);
 98          acc1 = vdupq_n_f16(0.0f);
 99          acc2 = vdupq_n_f16(0.0f);
100          acc3 = vdupq_n_f16(0.0f);
101  
102          pSrcA0Vec = pInA0;
103          pSrcA1Vec = pInA1;
104          pSrcA2Vec = pInA2;
105          pSrcA3Vec = pInA3;
106  
107          blkCnt = numCols >> 3;
108          while (blkCnt > 0U) {
109              f16x8_t         vecA;
110  
111              vecIn = vld1q(pInVec);
112              pInVec += 8;
113              vecA = vld1q(pSrcA0Vec);
114              pSrcA0Vec += 8;
115              acc0 = vfmaq(acc0, vecIn, vecA);
116              vecA = vld1q(pSrcA1Vec);
117              pSrcA1Vec += 8;
118              acc1 = vfmaq(acc1, vecIn, vecA);
119              vecA = vld1q(pSrcA2Vec);
120              pSrcA2Vec += 8;
121              acc2 = vfmaq(acc2, vecIn, vecA);
122              vecA = vld1q(pSrcA3Vec);
123              pSrcA3Vec += 8;
124              acc3 = vfmaq(acc3, vecIn, vecA);
125  
126              blkCnt--;
127          }
128          /*
129           * tail
130           * (will be merged thru tail predication)
131           */
132          blkCnt = numCols & 7;
133          if (blkCnt > 0U) {
134              mve_pred16_t    p0 = vctp16q(blkCnt);
135              f16x8_t         vecA;
136  
137              vecIn = vldrhq_z_f16(pInVec, p0);
138              vecA = vldrhq_z_f16(pSrcA0Vec, p0);
139              acc0 = vfmaq(acc0, vecIn, vecA);
140              vecA = vldrhq_z_f16(pSrcA1Vec, p0);
141              acc1 = vfmaq(acc1, vecIn, vecA);
142              vecA = vldrhq_z_f16(pSrcA2Vec, p0);
143              acc2 = vfmaq(acc2, vecIn, vecA);
144              vecA = vldrhq_z_f16(pSrcA3Vec, p0);
145              acc3 = vfmaq(acc3, vecIn, vecA);
146          }
147          /*
148           * Sum the partial parts
149           */
150          acc0 = vmulq_n_f16(acc0,*pDualCoef++);
151          acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
152          acc0 = vfmaq_n_f16(acc0,acc2,*pDualCoef++);
153          acc0 = vfmaq_n_f16(acc0,acc3,*pDualCoef++);
154  
155          sum += (_Float16)vecAddAcrossF16Mve(acc0);
156  
157          pSrcA += numCols * 4;
158          /*
159           * Decrement the row loop counter
160           */
161          row -= 4;
162      }
163  
164      /*
165       * compute 2 rows in parallel
166       */
167      if (row >= 2) {
168          float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
169          f16x8_t         vecIn, acc0, acc1;
170          float16_t const *pSrcVecPtr = in;
171  
172          /*
173           * Initialize the pointers to 2 consecutive MatrixA rows
174           */
175          pInA0 = pSrcA;
176          pInA1 = pInA0 + numCols;
177          /*
178           * Initialize the vector pointer
179           */
180          pInVec = pSrcVecPtr;
181          /*
182           * reset accumulators
183           */
184          acc0 = vdupq_n_f16(0.0f);
185          acc1 = vdupq_n_f16(0.0f);
186          pSrcA0Vec = pInA0;
187          pSrcA1Vec = pInA1;
188  
189          blkCnt = numCols >> 3;
190          while (blkCnt > 0U) {
191              f16x8_t         vecA;
192  
193              vecIn = vld1q(pInVec);
194              pInVec += 8;
195              vecA = vld1q(pSrcA0Vec);
196              pSrcA0Vec += 8;
197              acc0 = vfmaq(acc0, vecIn, vecA);
198              vecA = vld1q(pSrcA1Vec);
199              pSrcA1Vec += 8;
200              acc1 = vfmaq(acc1, vecIn, vecA);
201  
202              blkCnt--;
203          }
204          /*
205           * tail
206           * (will be merged thru tail predication)
207           */
208          blkCnt = numCols & 7;
209          if (blkCnt > 0U) {
210              mve_pred16_t    p0 = vctp16q(blkCnt);
211              f16x8_t         vecA;
212  
213              vecIn = vldrhq_z_f16(pInVec, p0);
214              vecA = vldrhq_z_f16(pSrcA0Vec, p0);
215              acc0 = vfmaq(acc0, vecIn, vecA);
216              vecA = vldrhq_z_f16(pSrcA1Vec, p0);
217              acc1 = vfmaq(acc1, vecIn, vecA);
218          }
219          /*
220           * Sum the partial parts
221           */
222          acc0 = vmulq_n_f16(acc0,*pDualCoef++);
223          acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
224  
225          sum += (_Float16)vecAddAcrossF16Mve(acc0);
226  
227          pSrcA += numCols * 2;
228          row -= 2;
229      }
230  
231      if (row >= 1) {
232          f16x8_t         vecIn, acc0;
233          float16_t const *pSrcA0Vec, *pInVec;
234          float16_t const *pSrcVecPtr = in;
235          /*
236           * Initialize the pointers to last MatrixA row
237           */
238          pInA0 = pSrcA;
239          /*
240           * Initialize the vector pointer
241           */
242          pInVec = pSrcVecPtr;
243          /*
244           * reset accumulators
245           */
246          acc0 = vdupq_n_f16(0.0f);
247  
248          pSrcA0Vec = pInA0;
249  
250          blkCnt = numCols >> 3;
251          while (blkCnt > 0U) {
252              f16x8_t         vecA;
253  
254              vecIn = vld1q(pInVec);
255              pInVec += 8;
256              vecA = vld1q(pSrcA0Vec);
257              pSrcA0Vec += 8;
258              acc0 = vfmaq(acc0, vecIn, vecA);
259  
260              blkCnt--;
261          }
262          /*
263           * tail
264           * (will be merged thru tail predication)
265           */
266          blkCnt = numCols & 7;
267          if (blkCnt > 0U) {
268              mve_pred16_t    p0 = vctp16q(blkCnt);
269              f16x8_t         vecA;
270  
271              vecIn = vldrhq_z_f16(pInVec, p0);
272              vecA = vldrhq_z_f16(pSrcA0Vec, p0);
273              acc0 = vfmaq(acc0, vecIn, vecA);
274          }
275          /*
276           * Sum the partial parts
277           */
278          sum += (_Float16)*pDualCoef++ * (_Float16)vecAddAcrossF16Mve(acc0);
279  
280      }
281  
282      *pResult = S->classes[STEP(sum)];
283  }
284  
285  #else
286  void arm_svm_linear_predict_f16(
287      const arm_svm_linear_instance_f16 *S,
288      const float16_t * in,
289      int32_t * pResult)
290  {
291      _Float16 sum=S->intercept;
292      _Float16 dot=0;
293      uint32_t i,j;
294      const float16_t *pSupport = S->supportVectors;
295  
296      for(i=0; i < S->nbOfSupportVectors; i++)
297      {
298          dot=0;
299          for(j=0; j < S->vectorDimension; j++)
300          {
301              dot = dot + in[j]* *pSupport++;
302          }
303          sum += S->dualCoefficients[i] * dot;
304      }
305      *pResult=S->classes[STEP(sum)];
306  }
307  #endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
308  
309  /**
310   * @} end of linearsvm group
311   */
312  
313  #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
314