arm_svm_linear_predict_f32.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_svm_linear_predict_f32.c 4 * Description: SVM Linear Classifier 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/svm_functions.h" 30 #include <limits.h> 31 #include <math.h> 32 33 34 /** 35 * @addtogroup linearsvm 36 * @{ 37 */ 38 39 40 /** 41 * @brief SVM linear prediction 42 * @param[in] S Pointer to an instance of the linear SVM structure. 43 * @param[in] in Pointer to input vector 44 * @param[out] pResult Decision value 45 * @return none. 46 * 47 */ 48 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) 49 50 #include "arm_helium_utils.h" 51 52 void arm_svm_linear_predict_f32( 53 const arm_svm_linear_instance_f32 *S, 54 const float32_t * in, 55 int32_t * pResult) 56 { 57 /* inlined Matrix x Vector function interleaved with dot prod */ 58 uint32_t numRows = S->nbOfSupportVectors; 59 uint32_t numCols = S->vectorDimension; 60 const float32_t *pSupport = S->supportVectors; 61 const float32_t *pSrcA = pSupport; 62 const float32_t *pInA0; 63 const float32_t *pInA1; 64 uint32_t row; 65 uint32_t blkCnt; /* loop counters */ 66 const float32_t *pDualCoef = S->dualCoefficients; 67 float32_t sum = S->intercept; 68 row = numRows; 69 70 /* 71 * compute 4 rows in parrallel 72 */ 73 while (row >= 4) 74 { 75 const float32_t *pInA2, *pInA3; 76 float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec; 77 f32x4_t vecIn, acc0, acc1, acc2, acc3; 78 float32_t const *pSrcVecPtr = in; 79 80 /* 81 * Initialize the pointers to 4 consecutive MatrixA rows 82 */ 83 pInA0 = pSrcA; 84 pInA1 = pInA0 + numCols; 85 pInA2 = pInA1 + numCols; 86 pInA3 = pInA2 + numCols; 87 /* 88 * Initialize the vector pointer 89 */ 90 pInVec = pSrcVecPtr; 91 /* 92 * reset accumulators 93 */ 94 acc0 = vdupq_n_f32(0.0f); 95 acc1 = vdupq_n_f32(0.0f); 96 acc2 = vdupq_n_f32(0.0f); 97 acc3 = vdupq_n_f32(0.0f); 98 99 pSrcA0Vec = pInA0; 100 pSrcA1Vec = pInA1; 101 pSrcA2Vec = pInA2; 102 pSrcA3Vec = pInA3; 103 104 blkCnt = numCols >> 2; 105 while (blkCnt > 0U) { 106 f32x4_t vecA; 107 108 vecIn = vld1q(pInVec); 109 pInVec += 4; 110 vecA = vld1q(pSrcA0Vec); 111 pSrcA0Vec += 4; 112 acc0 = vfmaq(acc0, vecIn, vecA); 113 vecA = vld1q(pSrcA1Vec); 114 pSrcA1Vec += 4; 115 acc1 = vfmaq(acc1, vecIn, vecA); 116 vecA = vld1q(pSrcA2Vec); 117 pSrcA2Vec += 4; 118 acc2 = vfmaq(acc2, vecIn, vecA); 119 vecA = vld1q(pSrcA3Vec); 120 pSrcA3Vec += 4; 121 acc3 = vfmaq(acc3, vecIn, vecA); 122 123 blkCnt--; 124 } 125 /* 126 * tail 127 * (will be merged thru tail predication) 128 */ 129 blkCnt = numCols & 3; 130 if (blkCnt > 0U) { 131 mve_pred16_t p0 = vctp32q(blkCnt); 132 f32x4_t vecA; 133 134 vecIn = vldrwq_z_f32(pInVec, p0); 135 vecA = vldrwq_z_f32(pSrcA0Vec, p0); 136 acc0 = vfmaq(acc0, vecIn, vecA); 137 vecA = vldrwq_z_f32(pSrcA1Vec, p0); 138 acc1 = vfmaq(acc1, vecIn, vecA); 139 vecA = vldrwq_z_f32(pSrcA2Vec, p0); 140 acc2 = vfmaq(acc2, vecIn, vecA); 141 vecA = vldrwq_z_f32(pSrcA3Vec, p0); 142 acc3 = vfmaq(acc3, vecIn, vecA); 143 } 144 /* 145 * Sum the partial parts 146 */ 147 148 acc0 = vmulq_n_f32(acc0,*pDualCoef++); 149 acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++); 150 acc0 = vfmaq_n_f32(acc0,acc2,*pDualCoef++); 151 acc0 = vfmaq_n_f32(acc0,acc3,*pDualCoef++); 152 153 sum += vecAddAcrossF32Mve(acc0); 154 155 pSrcA += numCols * 4; 156 /* 157 * Decrement the row loop counter 158 */ 159 row -= 4; 160 } 161 162 /* 163 * compute 2 rows in parallel 164 */ 165 if (row >= 2) { 166 float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec; 167 f32x4_t vecIn, acc0, acc1; 168 float32_t const *pSrcVecPtr = in; 169 170 /* 171 * Initialize the pointers to 2 consecutive MatrixA rows 172 */ 173 pInA0 = pSrcA; 174 pInA1 = pInA0 + numCols; 175 /* 176 * Initialize the vector pointer 177 */ 178 pInVec = pSrcVecPtr; 179 /* 180 * reset accumulators 181 */ 182 acc0 = vdupq_n_f32(0.0f); 183 acc1 = vdupq_n_f32(0.0f); 184 pSrcA0Vec = pInA0; 185 pSrcA1Vec = pInA1; 186 187 blkCnt = numCols >> 2; 188 while (blkCnt > 0U) { 189 f32x4_t vecA; 190 191 vecIn = vld1q(pInVec); 192 pInVec += 4; 193 vecA = vld1q(pSrcA0Vec); 194 pSrcA0Vec += 4; 195 acc0 = vfmaq(acc0, vecIn, vecA); 196 vecA = vld1q(pSrcA1Vec); 197 pSrcA1Vec += 4; 198 acc1 = vfmaq(acc1, vecIn, vecA); 199 200 blkCnt--; 201 } 202 /* 203 * tail 204 * (will be merged thru tail predication) 205 */ 206 blkCnt = numCols & 3; 207 if (blkCnt > 0U) { 208 mve_pred16_t p0 = vctp32q(blkCnt); 209 f32x4_t vecA; 210 211 vecIn = vldrwq_z_f32(pInVec, p0); 212 vecA = vldrwq_z_f32(pSrcA0Vec, p0); 213 acc0 = vfmaq(acc0, vecIn, vecA); 214 vecA = vldrwq_z_f32(pSrcA1Vec, p0); 215 acc1 = vfmaq(acc1, vecIn, vecA); 216 } 217 /* 218 * Sum the partial parts 219 */ 220 acc0 = vmulq_n_f32(acc0,*pDualCoef++); 221 acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++); 222 223 sum += vecAddAcrossF32Mve(acc0); 224 225 226 pSrcA += numCols * 2; 227 row -= 2; 228 } 229 230 if (row >= 1) { 231 f32x4_t vecIn, acc0; 232 float32_t const *pSrcA0Vec, *pInVec; 233 float32_t const *pSrcVecPtr = in; 234 /* 235 * Initialize the pointers to last MatrixA row 236 */ 237 pInA0 = pSrcA; 238 /* 239 * Initialize the vector pointer 240 */ 241 pInVec = pSrcVecPtr; 242 /* 243 * reset accumulators 244 */ 245 acc0 = vdupq_n_f32(0.0f); 246 247 pSrcA0Vec = pInA0; 248 249 blkCnt = numCols >> 2; 250 while (blkCnt > 0U) { 251 f32x4_t vecA; 252 253 vecIn = vld1q(pInVec); 254 pInVec += 4; 255 vecA = vld1q(pSrcA0Vec); 256 pSrcA0Vec += 4; 257 acc0 = vfmaq(acc0, vecIn, vecA); 258 259 blkCnt--; 260 } 261 /* 262 * tail 263 * (will be merged thru tail predication) 264 */ 265 blkCnt = numCols & 3; 266 if (blkCnt > 0U) { 267 mve_pred16_t p0 = vctp32q(blkCnt); 268 f32x4_t vecA; 269 270 vecIn = vldrwq_z_f32(pInVec, p0); 271 vecA = vldrwq_z_f32(pSrcA0Vec, p0); 272 acc0 = vfmaq(acc0, vecIn, vecA); 273 } 274 /* 275 * Sum the partial parts 276 */ 277 sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0); 278 279 } 280 281 *pResult = S->classes[STEP(sum)]; 282 } 283 284 #else 285 #if defined(ARM_MATH_NEON) 286 void arm_svm_linear_predict_f32( 287 const arm_svm_linear_instance_f32 *S, 288 const float32_t * in, 289 int32_t * pResult) 290 { 291 float32_t sum = S->intercept; 292 293 float32_t dot; 294 float32x4_t dotV; 295 296 float32x4_t accuma,accumb,accumc,accumd,accum; 297 float32x2_t accum2; 298 float32x4_t vec1; 299 300 float32x4_t vec2,vec2a,vec2b,vec2c,vec2d; 301 302 uint32_t blkCnt; 303 uint32_t vectorBlkCnt; 304 305 const float32_t *pIn = in; 306 307 const float32_t *pSupport = S->supportVectors; 308 309 const float32_t *pSupporta = S->supportVectors; 310 const float32_t *pSupportb; 311 const float32_t *pSupportc; 312 const float32_t *pSupportd; 313 314 pSupportb = pSupporta + S->vectorDimension; 315 pSupportc = pSupportb + S->vectorDimension; 316 pSupportd = pSupportc + S->vectorDimension; 317 318 const float32_t *pDualCoefs = S->dualCoefficients; 319 320 vectorBlkCnt = S->nbOfSupportVectors >> 2; 321 322 while (vectorBlkCnt > 0U) 323 { 324 accuma = vdupq_n_f32(0); 325 accumb = vdupq_n_f32(0); 326 accumc = vdupq_n_f32(0); 327 accumd = vdupq_n_f32(0); 328 329 pIn = in; 330 331 blkCnt = S->vectorDimension >> 2; 332 while (blkCnt > 0U) 333 { 334 335 vec1 = vld1q_f32(pIn); 336 vec2a = vld1q_f32(pSupporta); 337 vec2b = vld1q_f32(pSupportb); 338 vec2c = vld1q_f32(pSupportc); 339 vec2d = vld1q_f32(pSupportd); 340 341 pIn += 4; 342 pSupporta += 4; 343 pSupportb += 4; 344 pSupportc += 4; 345 pSupportd += 4; 346 347 accuma = vmlaq_f32(accuma, vec1,vec2a); 348 accumb = vmlaq_f32(accumb, vec1,vec2b); 349 accumc = vmlaq_f32(accumc, vec1,vec2c); 350 accumd = vmlaq_f32(accumd, vec1,vec2d); 351 352 blkCnt -- ; 353 } 354 accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma)); 355 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0); 356 357 accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb)); 358 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1); 359 360 accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc)); 361 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2); 362 363 accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd)); 364 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3); 365 366 367 blkCnt = S->vectorDimension & 3; 368 while (blkCnt > 0U) 369 { 370 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0); 371 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1); 372 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2); 373 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3); 374 375 pIn++; 376 377 blkCnt -- ; 378 } 379 380 vec1 = vld1q_f32(pDualCoefs); 381 pDualCoefs += 4; 382 383 accum = vmulq_f32(vec1,dotV); 384 accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); 385 sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); 386 387 pSupporta += 3*S->vectorDimension; 388 pSupportb += 3*S->vectorDimension; 389 pSupportc += 3*S->vectorDimension; 390 pSupportd += 3*S->vectorDimension; 391 392 vectorBlkCnt -- ; 393 } 394 395 pSupport = pSupporta; 396 vectorBlkCnt = S->nbOfSupportVectors & 3; 397 while (vectorBlkCnt > 0U) 398 { 399 accum = vdupq_n_f32(0); 400 dot = 0.0f; 401 pIn = in; 402 403 blkCnt = S->vectorDimension >> 2; 404 while (blkCnt > 0U) 405 { 406 407 vec1 = vld1q_f32(pIn); 408 vec2 = vld1q_f32(pSupport); 409 pIn += 4; 410 pSupport += 4; 411 412 accum = vmlaq_f32(accum, vec1,vec2); 413 414 blkCnt -- ; 415 } 416 accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); 417 dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); 418 419 420 blkCnt = S->vectorDimension & 3; 421 while (blkCnt > 0U) 422 { 423 dot = dot + *pIn++ * *pSupport++; 424 425 blkCnt -- ; 426 } 427 428 sum += *pDualCoefs++ * dot; 429 vectorBlkCnt -- ; 430 } 431 432 *pResult=S->classes[STEP(sum)]; 433 } 434 #else 435 void arm_svm_linear_predict_f32( 436 const arm_svm_linear_instance_f32 *S, 437 const float32_t * in, 438 int32_t * pResult) 439 { 440 float32_t sum=S->intercept; 441 float32_t dot=0; 442 uint32_t i,j; 443 const float32_t *pSupport = S->supportVectors; 444 445 for(i=0; i < S->nbOfSupportVectors; i++) 446 { 447 dot=0; 448 for(j=0; j < S->vectorDimension; j++) 449 { 450 dot = dot + in[j]* *pSupport++; 451 } 452 sum += S->dualCoefficients[i] * dot; 453 } 454 *pResult=S->classes[STEP(sum)]; 455 } 456 #endif 457 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ 458 459 /** 460 * @} end of linearsvm group 461 */