/* arm_svm_linear_predict_f16.c */
/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_svm_linear_predict_f16.c
 * Description:  SVM Linear Classifier
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/svm_functions_f16.h"

#if defined(ARM_FLOAT16_SUPPORTED)

#include <limits.h>
#include <math.h>


/**
 * @addtogroup linearsvm
 * @{
 */


/**
 * @brief SVM linear prediction
 *
 * Accumulates sum = intercept + sum_i dualCoefficients[i] * dot(supportVectors[i], in)
 * over all support vectors, then writes S->classes[STEP(sum)] to *pResult
 * (STEP() maps the sign of the decision value to a class index).
 *
 * @param[in]  S        Pointer to an instance of the linear SVM structure.
 * @param[in]  in       Pointer to input vector (length S->vectorDimension).
 * @param[out] pResult  Decision value (one of the two labels in S->classes).
 * @return none.
 *
 */
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "arm_helium_utils.h"

void arm_svm_linear_predict_f16(
    const arm_svm_linear_instance_f16 *S,
    const float16_t * in,
    int32_t * pResult)
{
    /* inlined Matrix x Vector function interleaved with dot prod:
     * the support-vector matrix (numRows x numCols) is multiplied by the
     * input vector, and each row's dot product is immediately weighted by
     * its dual coefficient and folded into the running decision value. */
    uint32_t         numRows = S->nbOfSupportVectors;
    uint32_t         numCols = S->vectorDimension;
    const float16_t *pSupport = S->supportVectors;
    const float16_t *pSrcA = pSupport;           /* current block of support-vector rows */
    const float16_t *pInA0;
    const float16_t *pInA1;
    uint32_t         row;
    uint32_t         blkCnt;                     /* loop counters */
    const float16_t *pDualCoef = S->dualCoefficients;
    _Float16         sum = S->intercept;         /* decision value accumulator */
    row = numRows;

    /*
     * compute 4 rows in parallel
     */
    while (row >= 4)
    {
        const float16_t *pInA2, *pInA3;
        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
        f16x8_t          vecIn, acc0, acc1, acc2, acc3;
        float16_t const *pSrcVecPtr = in;

        /*
         * Initialize the pointers to 4 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        pInA2 = pInA1 + numCols;
        pInA3 = pInA2 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);
        acc1 = vdupq_n_f16(0.0f);
        acc2 = vdupq_n_f16(0.0f);
        acc3 = vdupq_n_f16(0.0f);

        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;
        pSrcA2Vec = pInA2;
        pSrcA3Vec = pInA3;

        /* main loop: 8 f16 lanes per iteration, one shared load of the
         * input vector re-used against all 4 support-vector rows */
        blkCnt = numCols >> 3;
        while (blkCnt > 0U) {
            f16x8_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 8;
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vld1q(pSrcA2Vec);
            pSrcA2Vec += 8;
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vld1q(pSrcA3Vec);
            pSrcA3Vec += 8;
            acc3 = vfmaq(acc3, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail: remaining (numCols % 8) lanes handled with a zeroing
         * predicate so inactive lanes contribute 0 to the accumulators
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp16q(blkCnt);
            f16x8_t      vecA;

            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA2Vec, p0);
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA3Vec, p0);
            acc3 = vfmaq(acc3, vecIn, vecA);
        }
        /*
         * Sum the partial parts: weight each row's partial dot product by
         * its dual coefficient while folding the 4 accumulators into acc0
         */
        acc0 = vmulq_n_f16(acc0, *pDualCoef++);
        acc0 = vfmaq_n_f16(acc0, acc1, *pDualCoef++);
        acc0 = vfmaq_n_f16(acc0, acc2, *pDualCoef++);
        acc0 = vfmaq_n_f16(acc0, acc3, *pDualCoef++);

        sum += (_Float16)vecAddAcrossF16Mve(acc0);

        pSrcA += numCols * 4;   /* advance to the next block of 4 rows */
        /*
         * Decrement the row loop counter
         */
        row -= 4;
    }

    /*
     * compute 2 rows in parallel (at most one leftover pair after the
     * 4-row loop, hence 'if' rather than 'while')
     */
    if (row >= 2) {
        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
        f16x8_t          vecIn, acc0, acc1;
        float16_t const *pSrcVecPtr = in;

        /*
         * Initialize the pointers to 2 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);
        acc1 = vdupq_n_f16(0.0f);
        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;

        blkCnt = numCols >> 3;
        while (blkCnt > 0U) {
            f16x8_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 8;
            acc1 = vfmaq(acc1, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp16q(blkCnt);
            f16x8_t      vecA;

            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */
        acc0 = vmulq_n_f16(acc0, *pDualCoef++);
        acc0 = vfmaq_n_f16(acc0, acc1, *pDualCoef++);

        sum += (_Float16)vecAddAcrossF16Mve(acc0);

        pSrcA += numCols * 2;
        row -= 2;
    }

    /* last leftover row, if the number of support vectors is odd */
    if (row >= 1) {
        f16x8_t          vecIn, acc0;
        float16_t const *pSrcA0Vec, *pInVec;
        float16_t const *pSrcVecPtr = in;
        /*
         * Initialize the pointers to last MatrixA row
         */
        pInA0 = pSrcA;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);

        pSrcA0Vec = pInA0;

        blkCnt = numCols >> 3;
        while (blkCnt > 0U) {
            f16x8_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp16q(blkCnt);
            f16x8_t      vecA;

            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */
        sum += (_Float16)*pDualCoef++ * (_Float16)vecAddAcrossF16Mve(acc0);

    }

    /* STEP(sum) turns the decision value into a 0/1 index into classes[] */
    *pResult = S->classes[STEP(sum)];
}

#else
/* Portable scalar fallback: same contract as the Helium version above. */
void arm_svm_linear_predict_f16(
    const arm_svm_linear_instance_f16 *S,
    const float16_t * in,
    int32_t * pResult)
{
    _Float16 sum = S->intercept;     /* decision value accumulator */
    _Float16 dot = 0;                /* dot(support vector, input) */
    uint32_t i, j;
    const float16_t *pSupport = S->supportVectors;  /* rows stored contiguously */

    for (i = 0; i < S->nbOfSupportVectors; i++)
    {
        dot = 0;
        for (j = 0; j < S->vectorDimension; j++)
        {
            dot = dot + in[j] * *pSupport++;
        }
        sum += S->dualCoefficients[i] * dot;
    }
    *pResult = S->classes[STEP(sum)];
}
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */

/**
 * @} end of linearsvm group
 */

#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */