arm_svm_polynomial_predict_f16.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_svm_polynomial_predict_f16.c 4 * Description: SVM Polynomial Classifier 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/svm_functions_f16.h" 30 31 #if defined(ARM_FLOAT16_SUPPORTED) 32 33 #include <limits.h> 34 #include <math.h> 35 36 37 /** 38 * @addtogroup polysvm 39 * @{ 40 */ 41 42 43 /** 44 * @brief SVM polynomial prediction 45 * @param[in] S Pointer to an instance of the polynomial SVM structure. 46 * @param[in] in Pointer to input vector 47 * @param[out] pResult Decision value 48 * @return none. 49 * 50 */ 51 52 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) 53 54 #include "arm_helium_utils.h" 55 #include "arm_vec_math_f16.h" 56 57 void arm_svm_polynomial_predict_f16( 58 const arm_svm_polynomial_instance_f16 *S, 59 const float16_t * in, 60 int32_t * pResult) 61 { 62 /* inlined Matrix x Vector function interleaved with dot prod */ 63 uint32_t numRows = S->nbOfSupportVectors; 64 uint32_t numCols = S->vectorDimension; 65 const float16_t *pSupport = S->supportVectors; 66 const float16_t *pSrcA = pSupport; 67 const float16_t *pInA0; 68 const float16_t *pInA1; 69 uint32_t row; 70 uint32_t blkCnt; /* loop counters */ 71 const float16_t *pDualCoef = S->dualCoefficients; 72 _Float16 sum = S->intercept; 73 f16x8_t vSum = vdupq_n_f16(0.0f); 74 75 row = numRows; 76 77 /* 78 * compute 4 rows in parrallel 79 */ 80 while (row >= 4) { 81 const float16_t *pInA2, *pInA3; 82 float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec; 83 f16x8_t vecIn, acc0, acc1, acc2, acc3; 84 float16_t const *pSrcVecPtr = in; 85 86 /* 87 * Initialize the pointers to 4 consecutive MatrixA rows 88 */ 89 pInA0 = pSrcA; 90 pInA1 = pInA0 + numCols; 91 pInA2 = pInA1 + numCols; 92 pInA3 = pInA2 + numCols; 93 /* 94 * Initialize the vector pointer 95 */ 96 pInVec = pSrcVecPtr; 97 /* 98 * reset accumulators 99 */ 100 acc0 = vdupq_n_f16(0.0f); 101 acc1 = vdupq_n_f16(0.0f); 102 acc2 = vdupq_n_f16(0.0f); 103 acc3 = vdupq_n_f16(0.0f); 104 105 pSrcA0Vec = pInA0; 106 pSrcA1Vec = pInA1; 107 pSrcA2Vec = pInA2; 108 pSrcA3Vec = pInA3; 109 110 blkCnt = numCols >> 3; 111 while (blkCnt > 0U) { 112 f16x8_t vecA; 113 114 vecIn = vld1q(pInVec); 115 pInVec += 8; 116 vecA = vld1q(pSrcA0Vec); 117 pSrcA0Vec += 8; 118 acc0 = vfmaq(acc0, vecIn, vecA); 119 vecA = vld1q(pSrcA1Vec); 120 pSrcA1Vec += 8; 121 acc1 = vfmaq(acc1, vecIn, vecA); 122 vecA = vld1q(pSrcA2Vec); 123 pSrcA2Vec += 8; 124 acc2 = vfmaq(acc2, vecIn, vecA); 125 vecA = vld1q(pSrcA3Vec); 126 pSrcA3Vec += 8; 127 acc3 = vfmaq(acc3, vecIn, vecA); 128 129 blkCnt--; 130 } 131 /* 132 * tail 133 * (will be merged thru tail predication) 134 */ 135 blkCnt = numCols & 7; 136 if (blkCnt > 0U) { 137 mve_pred16_t p0 = vctp16q(blkCnt); 138 f16x8_t vecA; 139 140 vecIn = vldrhq_z_f16(pInVec, p0); 141 vecA = vldrhq_z_f16(pSrcA0Vec, p0); 142 acc0 = vfmaq(acc0, vecIn, vecA); 143 vecA = vldrhq_z_f16(pSrcA1Vec, p0); 144 acc1 = vfmaq(acc1, vecIn, vecA); 145 vecA = vldrhq_z_f16(pSrcA2Vec, p0); 146 acc2 = vfmaq(acc2, vecIn, vecA); 147 vecA = vldrhq_z_f16(pSrcA3Vec, p0); 148 acc3 = vfmaq(acc3, vecIn, vecA); 149 } 150 /* 151 * Sum the partial parts 152 */ 153 f16x8_t vtmp = vuninitializedq_f16(); 154 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0); 155 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1); 156 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2); 157 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3); 158 159 vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef), 160 arm_vec_exponent_f16 161 (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), 162 S->degree),vctp16q(4)); 163 164 pDualCoef += 4; 165 166 pSrcA += numCols * 4; 167 /* 168 * Decrement the row loop counter 169 */ 170 row -= 4; 171 } 172 173 /* 174 * compute 2 rows in parrallel 175 */ 176 if (row >= 2) { 177 float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec; 178 f16x8_t vecIn, acc0, acc1; 179 float16_t const *pSrcVecPtr = in; 180 181 /* 182 * Initialize the pointers to 2 consecutive MatrixA rows 183 */ 184 pInA0 = pSrcA; 185 pInA1 = pInA0 + numCols; 186 /* 187 * Initialize the vector pointer 188 */ 189 pInVec = pSrcVecPtr; 190 /* 191 * reset accumulators 192 */ 193 acc0 = vdupq_n_f16(0.0f); 194 acc1 = vdupq_n_f16(0.0f); 195 pSrcA0Vec = pInA0; 196 pSrcA1Vec = pInA1; 197 198 blkCnt = numCols >> 3; 199 while (blkCnt > 0U) { 200 f16x8_t vecA; 201 202 vecIn = vld1q(pInVec); 203 pInVec += 8; 204 vecA = vld1q(pSrcA0Vec); 205 pSrcA0Vec += 8; 206 acc0 = vfmaq(acc0, vecIn, vecA); 207 vecA = vld1q(pSrcA1Vec); 208 pSrcA1Vec += 8; 209 acc1 = vfmaq(acc1, vecIn, vecA); 210 211 blkCnt--; 212 } 213 /* 214 * tail 215 * (will be merged thru tail predication) 216 */ 217 blkCnt = numCols & 7; 218 if (blkCnt > 0U) { 219 mve_pred16_t p0 = vctp16q(blkCnt); 220 f16x8_t vecA; 221 222 vecIn = vldrhq_z_f16(pInVec, p0); 223 vecA = vldrhq_z_f16(pSrcA0Vec, p0); 224 acc0 = vfmaq(acc0, vecIn, vecA); 225 vecA = vldrhq_z_f16(pSrcA1Vec, p0); 226 acc1 = vfmaq(acc1, vecIn, vecA); 227 } 228 /* 229 * Sum the partial parts 230 */ 231 f16x8_t vtmp = vuninitializedq_f16(); 232 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0); 233 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1); 234 235 vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef), 236 arm_vec_exponent_f16 237 (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree), 238 vctp16q(2)); 239 240 pDualCoef += 2; 241 pSrcA += numCols * 2; 242 row -= 2; 243 } 244 245 if (row >= 1) { 246 f16x8_t vecIn, acc0; 247 float16_t const *pSrcA0Vec, *pInVec; 248 float16_t const *pSrcVecPtr = in; 249 /* 250 * Initialize the pointers to last MatrixA row 251 */ 252 pInA0 = pSrcA; 253 /* 254 * Initialize the vector pointer 255 */ 256 pInVec = pSrcVecPtr; 257 /* 258 * reset accumulators 259 */ 260 acc0 = vdupq_n_f16(0.0f); 261 262 pSrcA0Vec = pInA0; 263 264 blkCnt = numCols >> 3; 265 while (blkCnt > 0U) { 266 f16x8_t vecA; 267 268 vecIn = vld1q(pInVec); 269 pInVec += 8; 270 vecA = vld1q(pSrcA0Vec); 271 pSrcA0Vec += 8; 272 acc0 = vfmaq(acc0, vecIn, vecA); 273 274 blkCnt--; 275 } 276 /* 277 * tail 278 * (will be merged thru tail predication) 279 */ 280 blkCnt = numCols & 7; 281 if (blkCnt > 0U) { 282 mve_pred16_t p0 = vctp16q(blkCnt); 283 f16x8_t vecA; 284 285 vecIn = vldrhq_z_f16(pInVec, p0); 286 vecA = vldrhq_z_f16(pSrcA0Vec, p0); 287 acc0 = vfmaq(acc0, vecIn, vecA); 288 } 289 /* 290 * Sum the partial parts 291 */ 292 f16x8_t vtmp = vuninitializedq_f16(); 293 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0); 294 vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef), 295 arm_vec_exponent_f16 296 (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree), 297 vctp16q(1)); 298 } 299 sum += (_Float16)vecAddAcrossF16Mve(vSum); 300 301 302 *pResult = S->classes[STEP(sum)]; 303 } 304 305 #else 306 void arm_svm_polynomial_predict_f16( 307 const arm_svm_polynomial_instance_f16 *S, 308 const float16_t * in, 309 int32_t * pResult) 310 { 311 _Float16 sum=S->intercept; 312 _Float16 dot=0; 313 uint32_t i,j; 314 const float16_t *pSupport = S->supportVectors; 315 316 for(i=0; i < S->nbOfSupportVectors; i++) 317 { 318 dot=0; 319 for(j=0; j < S->vectorDimension; j++) 320 { 321 dot = dot + (_Float16)in[j]* (_Float16)*pSupport++; 322 } 323 sum += S->dualCoefficients[i] * (_Float16)arm_exponent_f16(S->gamma * dot + S->coef0, S->degree); 324 } 325 326 *pResult=S->classes[STEP(sum)]; 327 } 328 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ 329 330 331 /** 332 * @} end of polysvm group 333 */ 334 335 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 336