arm_svm_polynomial_predict_f32.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_svm_polynomial_predict_f32.c 4 * Description: SVM Polynomial Classifier 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/svm_functions.h" 30 #include <limits.h> 31 #include <math.h> 32 33 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) 34 #include "arm_vec_math.h" 35 #endif 36 37 /** 38 * @addtogroup polysvm 39 * @{ 40 */ 41 42 43 /** 44 * @brief SVM polynomial prediction 45 * @param[in] S Pointer to an instance of the polynomial SVM structure. 46 * @param[in] in Pointer to input vector 47 * @param[out] pResult Decision value 48 * @return none. 49 * 50 */ 51 52 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) 53 54 #include "arm_helium_utils.h" 55 #include "arm_vec_math.h" 56 57 void arm_svm_polynomial_predict_f32( 58 const arm_svm_polynomial_instance_f32 *S, 59 const float32_t * in, 60 int32_t * pResult) 61 { 62 /* inlined Matrix x Vector function interleaved with dot prod */ 63 uint32_t numRows = S->nbOfSupportVectors; 64 uint32_t numCols = S->vectorDimension; 65 const float32_t *pSupport = S->supportVectors; 66 const float32_t *pSrcA = pSupport; 67 const float32_t *pInA0; 68 const float32_t *pInA1; 69 uint32_t row; 70 uint32_t blkCnt; /* loop counters */ 71 const float32_t *pDualCoef = S->dualCoefficients; 72 float32_t sum = S->intercept; 73 f32x4_t vSum = vdupq_n_f32(0.0f); 74 75 row = numRows; 76 77 /* 78 * compute 4 rows in parrallel 79 */ 80 while (row >= 4) { 81 const float32_t *pInA2, *pInA3; 82 float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec; 83 f32x4_t vecIn, acc0, acc1, acc2, acc3; 84 float32_t const *pSrcVecPtr = in; 85 86 /* 87 * Initialize the pointers to 4 consecutive MatrixA rows 88 */ 89 pInA0 = pSrcA; 90 pInA1 = pInA0 + numCols; 91 pInA2 = pInA1 + numCols; 92 pInA3 = pInA2 + numCols; 93 /* 94 * Initialize the vector pointer 95 */ 96 pInVec = pSrcVecPtr; 97 /* 98 * reset accumulators 99 */ 100 acc0 = vdupq_n_f32(0.0f); 101 acc1 = vdupq_n_f32(0.0f); 102 acc2 = vdupq_n_f32(0.0f); 103 acc3 = vdupq_n_f32(0.0f); 104 105 pSrcA0Vec = pInA0; 106 pSrcA1Vec = pInA1; 107 pSrcA2Vec = pInA2; 108 pSrcA3Vec = pInA3; 109 110 blkCnt = numCols >> 2; 111 while (blkCnt > 0U) { 112 f32x4_t vecA; 113 114 vecIn = vld1q(pInVec); 115 pInVec += 4; 116 vecA = vld1q(pSrcA0Vec); 117 pSrcA0Vec += 4; 118 acc0 = vfmaq(acc0, vecIn, vecA); 119 vecA = vld1q(pSrcA1Vec); 120 pSrcA1Vec += 4; 121 acc1 = vfmaq(acc1, vecIn, vecA); 122 vecA = vld1q(pSrcA2Vec); 123 pSrcA2Vec += 4; 124 acc2 = vfmaq(acc2, vecIn, vecA); 125 vecA = vld1q(pSrcA3Vec); 126 pSrcA3Vec += 4; 127 acc3 = vfmaq(acc3, vecIn, vecA); 128 129 blkCnt--; 130 } 131 /* 132 * tail 133 * (will be merged thru tail predication) 134 */ 135 blkCnt = numCols & 3; 136 if (blkCnt > 0U) { 137 mve_pred16_t p0 = vctp32q(blkCnt); 138 f32x4_t vecA; 139 140 vecIn = vldrwq_z_f32(pInVec, p0); 141 vecA = vldrwq_z_f32(pSrcA0Vec, p0); 142 acc0 = vfmaq(acc0, vecIn, vecA); 143 vecA = vldrwq_z_f32(pSrcA1Vec, p0); 144 acc1 = vfmaq(acc1, vecIn, vecA); 145 vecA = vldrwq_z_f32(pSrcA2Vec, p0); 146 acc2 = vfmaq(acc2, vecIn, vecA); 147 vecA = vldrwq_z_f32(pSrcA3Vec, p0); 148 acc3 = vfmaq(acc3, vecIn, vecA); 149 } 150 /* 151 * Sum the partial parts 152 */ 153 f32x4_t vtmp = vuninitializedq_f32(); 154 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0); 155 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1); 156 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2); 157 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3); 158 159 vSum = vfmaq_f32(vSum, vld1q(pDualCoef), 160 arm_vec_exponent_f32 161 (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree)); 162 163 pDualCoef += 4; 164 165 pSrcA += numCols * 4; 166 /* 167 * Decrement the row loop counter 168 */ 169 row -= 4; 170 } 171 172 /* 173 * compute 2 rows in parrallel 174 */ 175 if (row >= 2) { 176 float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec; 177 f32x4_t vecIn, acc0, acc1; 178 float32_t const *pSrcVecPtr = in; 179 180 /* 181 * Initialize the pointers to 2 consecutive MatrixA rows 182 */ 183 pInA0 = pSrcA; 184 pInA1 = pInA0 + numCols; 185 /* 186 * Initialize the vector pointer 187 */ 188 pInVec = pSrcVecPtr; 189 /* 190 * reset accumulators 191 */ 192 acc0 = vdupq_n_f32(0.0f); 193 acc1 = vdupq_n_f32(0.0f); 194 pSrcA0Vec = pInA0; 195 pSrcA1Vec = pInA1; 196 197 blkCnt = numCols >> 2; 198 while (blkCnt > 0U) { 199 f32x4_t vecA; 200 201 vecIn = vld1q(pInVec); 202 pInVec += 4; 203 vecA = vld1q(pSrcA0Vec); 204 pSrcA0Vec += 4; 205 acc0 = vfmaq(acc0, vecIn, vecA); 206 vecA = vld1q(pSrcA1Vec); 207 pSrcA1Vec += 4; 208 acc1 = vfmaq(acc1, vecIn, vecA); 209 210 blkCnt--; 211 } 212 /* 213 * tail 214 * (will be merged thru tail predication) 215 */ 216 blkCnt = numCols & 3; 217 if (blkCnt > 0U) { 218 mve_pred16_t p0 = vctp32q(blkCnt); 219 f32x4_t vecA; 220 221 vecIn = vldrwq_z_f32(pInVec, p0); 222 vecA = vldrwq_z_f32(pSrcA0Vec, p0); 223 acc0 = vfmaq(acc0, vecIn, vecA); 224 vecA = vldrwq_z_f32(pSrcA1Vec, p0); 225 acc1 = vfmaq(acc1, vecIn, vecA); 226 } 227 /* 228 * Sum the partial parts 229 */ 230 f32x4_t vtmp = vuninitializedq_f32(); 231 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0); 232 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1); 233 234 vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef), 235 arm_vec_exponent_f32 236 (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree), 237 vctp32q(2)); 238 239 pDualCoef += 2; 240 pSrcA += numCols * 2; 241 row -= 2; 242 } 243 244 if (row >= 1) { 245 f32x4_t vecIn, acc0; 246 float32_t const *pSrcA0Vec, *pInVec; 247 float32_t const *pSrcVecPtr = in; 248 /* 249 * Initialize the pointers to last MatrixA row 250 */ 251 pInA0 = pSrcA; 252 /* 253 * Initialize the vector pointer 254 */ 255 pInVec = pSrcVecPtr; 256 /* 257 * reset accumulators 258 */ 259 acc0 = vdupq_n_f32(0.0f); 260 261 pSrcA0Vec = pInA0; 262 263 blkCnt = numCols >> 2; 264 while (blkCnt > 0U) { 265 f32x4_t vecA; 266 267 vecIn = vld1q(pInVec); 268 pInVec += 4; 269 vecA = vld1q(pSrcA0Vec); 270 pSrcA0Vec += 4; 271 acc0 = vfmaq(acc0, vecIn, vecA); 272 273 blkCnt--; 274 } 275 /* 276 * tail 277 * (will be merged thru tail predication) 278 */ 279 blkCnt = numCols & 3; 280 if (blkCnt > 0U) { 281 mve_pred16_t p0 = vctp32q(blkCnt); 282 f32x4_t vecA; 283 284 vecIn = vldrwq_z_f32(pInVec, p0); 285 vecA = vldrwq_z_f32(pSrcA0Vec, p0); 286 acc0 = vfmaq(acc0, vecIn, vecA); 287 } 288 /* 289 * Sum the partial parts 290 */ 291 f32x4_t vtmp = vuninitializedq_f32(); 292 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0); 293 vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef), 294 arm_vec_exponent_f32 295 (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree), 296 vctp32q(1)); 297 } 298 sum += vecAddAcrossF32Mve(vSum); 299 300 301 *pResult = S->classes[STEP(sum)]; 302 } 303 304 #else 305 #if defined(ARM_MATH_NEON) 306 void arm_svm_polynomial_predict_f32( 307 const arm_svm_polynomial_instance_f32 *S, 308 const float32_t * in, 309 int32_t * pResult) 310 { 311 float32_t sum = S->intercept; 312 313 float32_t dot; 314 float32x4_t dotV; 315 316 float32x4_t accuma,accumb,accumc,accumd,accum; 317 float32x2_t accum2; 318 float32x4_t vec1; 319 float32x4_t coef0 = vdupq_n_f32(S->coef0); 320 321 float32x4_t vec2,vec2a,vec2b,vec2c,vec2d; 322 323 uint32_t blkCnt; 324 uint32_t vectorBlkCnt; 325 326 const float32_t *pIn = in; 327 328 const float32_t *pSupport = S->supportVectors; 329 330 const float32_t *pSupporta = S->supportVectors; 331 const float32_t *pSupportb; 332 const float32_t *pSupportc; 333 const float32_t *pSupportd; 334 335 pSupportb = pSupporta + S->vectorDimension; 336 pSupportc = pSupportb + S->vectorDimension; 337 pSupportd = pSupportc + S->vectorDimension; 338 339 const float32_t *pDualCoefs = S->dualCoefficients; 340 341 vectorBlkCnt = S->nbOfSupportVectors >> 2; 342 while (vectorBlkCnt > 0U) 343 { 344 accuma = vdupq_n_f32(0); 345 accumb = vdupq_n_f32(0); 346 accumc = vdupq_n_f32(0); 347 accumd = vdupq_n_f32(0); 348 349 pIn = in; 350 351 blkCnt = S->vectorDimension >> 2; 352 while (blkCnt > 0U) 353 { 354 355 vec1 = vld1q_f32(pIn); 356 vec2a = vld1q_f32(pSupporta); 357 vec2b = vld1q_f32(pSupportb); 358 vec2c = vld1q_f32(pSupportc); 359 vec2d = vld1q_f32(pSupportd); 360 361 pIn += 4; 362 pSupporta += 4; 363 pSupportb += 4; 364 pSupportc += 4; 365 pSupportd += 4; 366 367 accuma = vmlaq_f32(accuma, vec1,vec2a); 368 accumb = vmlaq_f32(accumb, vec1,vec2b); 369 accumc = vmlaq_f32(accumc, vec1,vec2c); 370 accumd = vmlaq_f32(accumd, vec1,vec2d); 371 372 blkCnt -- ; 373 } 374 accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma)); 375 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0); 376 377 accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb)); 378 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1); 379 380 accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc)); 381 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2); 382 383 accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd)); 384 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3); 385 386 387 blkCnt = S->vectorDimension & 3; 388 while (blkCnt > 0U) 389 { 390 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0); 391 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1); 392 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2); 393 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3); 394 395 pIn++; 396 397 blkCnt -- ; 398 } 399 400 vec1 = vld1q_f32(pDualCoefs); 401 pDualCoefs += 4; 402 403 // To vectorize later 404 dotV = vmulq_n_f32(dotV, S->gamma); 405 dotV = vaddq_f32(dotV, coef0); 406 407 dotV = arm_vec_exponent_f32(dotV,S->degree); 408 409 accum = vmulq_f32(vec1,dotV); 410 accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); 411 sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); 412 413 pSupporta += 3*S->vectorDimension; 414 pSupportb += 3*S->vectorDimension; 415 pSupportc += 3*S->vectorDimension; 416 pSupportd += 3*S->vectorDimension; 417 418 vectorBlkCnt -- ; 419 } 420 421 pSupport = pSupporta; 422 vectorBlkCnt = S->nbOfSupportVectors & 3; 423 424 while (vectorBlkCnt > 0U) 425 { 426 accum = vdupq_n_f32(0); 427 dot = 0.0f; 428 pIn = in; 429 430 blkCnt = S->vectorDimension >> 2; 431 while (blkCnt > 0U) 432 { 433 434 vec1 = vld1q_f32(pIn); 435 vec2 = vld1q_f32(pSupport); 436 pIn += 4; 437 pSupport += 4; 438 439 accum = vmlaq_f32(accum, vec1,vec2); 440 441 blkCnt -- ; 442 } 443 accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); 444 dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); 445 446 447 blkCnt = S->vectorDimension & 3; 448 while (blkCnt > 0U) 449 { 450 dot = dot + *pIn++ * *pSupport++; 451 452 blkCnt -- ; 453 } 454 455 sum += *pDualCoefs++ * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree); 456 vectorBlkCnt -- ; 457 } 458 459 *pResult=S->classes[STEP(sum)]; 460 } 461 #else 462 void arm_svm_polynomial_predict_f32( 463 const arm_svm_polynomial_instance_f32 *S, 464 const float32_t * in, 465 int32_t * pResult) 466 { 467 float32_t sum=S->intercept; 468 float32_t dot=0; 469 uint32_t i,j; 470 const float32_t *pSupport = S->supportVectors; 471 472 for(i=0; i < S->nbOfSupportVectors; i++) 473 { 474 dot=0; 475 for(j=0; j < S->vectorDimension; j++) 476 { 477 dot = dot + in[j]* *pSupport++; 478 } 479 sum += S->dualCoefficients[i] * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree); 480 } 481 482 *pResult=S->classes[STEP(sum)]; 483 } 484 #endif 485 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ 486 487 488 /** 489 * @} end of polysvm group 490 */