arm_max_f32.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_max_f32.c 4 * Description: Maximum value of a floating-point vector 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/statistics_functions.h" 30 #if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE) 31 #include <limits.h> 32 #endif 33 34 /** 35 @ingroup groupStats 36 */ 37 38 /** 39 @defgroup Max Maximum 40 41 Computes the maximum value of an array of data. 42 The function returns both the maximum value and its position within the array. 43 There are separate functions for floating-point, Q31, Q15, and Q7 data types. 44 */ 45 46 /** 47 @addtogroup Max 48 @{ 49 */ 50 51 /** 52 @brief Maximum value of a floating-point vector. 53 @param[in] pSrc points to the input vector 54 @param[in] blockSize number of samples in input vector 55 @param[out] pResult maximum value returned here 56 @param[out] pIndex index of maximum value returned here 57 @return none 58 */ 59 60 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) 61 void arm_max_f32( 62 const float32_t * pSrc, 63 uint32_t blockSize, 64 float32_t * pResult, 65 uint32_t * pIndex) 66 { 67 uint32_t blkCnt; 68 f32x4_t vecSrc; 69 f32x4_t curExtremValVec = vdupq_n_f32(F32_MIN); 70 float32_t maxValue = F32_MIN; 71 uint32_t idx = blockSize; 72 uint32x4_t indexVec; 73 uint32x4_t curExtremIdxVec; 74 uint32_t curIdx = 0; 75 mve_pred16_t p0; 76 float32_t tmp; 77 78 79 indexVec = vidupq_wb_u32(&curIdx, 1); 80 curExtremIdxVec = vdupq_n_u32(0); 81 82 /* Compute 4 outputs at a time */ 83 blkCnt = blockSize >> 2U; 84 while (blkCnt > 0U) 85 { 86 vecSrc = vldrwq_f32(pSrc); 87 /* 88 * Get current max per lane and current index per lane 89 * when a max is selected 90 */ 91 p0 = vcmpgeq(vecSrc, curExtremValVec); 92 curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); 93 curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); 94 95 indexVec = vidupq_wb_u32(&curIdx, 1); 96 97 pSrc += 4; 98 /* Decrement the loop counter */ 99 blkCnt--; 100 } 101 102 103 /* 104 * Get max value across the vector 105 */ 106 maxValue = vmaxnmvq(maxValue, curExtremValVec); 107 /* 108 * set index for lower values to max possible index 109 */ 110 p0 = vcmpgeq(curExtremValVec, maxValue); 111 indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0); 112 /* 113 * Get min index which is thus for a max value 114 */ 115 idx = vminvq(idx, indexVec); 116 117 /* Tail */ 118 blkCnt = blockSize & 0x3; 119 120 while (blkCnt > 0U) 121 { 122 /* Initialize tmp to the next consecutive values one by one */ 123 tmp = *pSrc++; 124 125 /* compare for the maximum value */ 126 if (maxValue < tmp) 127 { 128 /* Update the maximum value and it's index */ 129 maxValue = tmp; 130 idx = blockSize - blkCnt; 131 } 132 133 /* Decrement loop counter */ 134 blkCnt--; 135 } 136 137 /* 138 * Save result 139 */ 140 *pIndex = idx; 141 *pResult = maxValue; 142 } 143 144 #else 145 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) 146 void arm_max_f32( 147 const float32_t * pSrc, 148 uint32_t blockSize, 149 float32_t * pResult, 150 uint32_t * pIndex) 151 { 152 float32_t maxVal1, out; /* Temporary variables to store the output value. */ 153 uint32_t blkCnt, outIndex; /* loop counter */ 154 155 float32x4_t outV, srcV; 156 float32x2_t outV2; 157 158 uint32x4_t idxV; 159 uint32x4_t maxIdx; 160 static const uint32_t indexInit[4]={4,5,6,7}; 161 static const uint32_t countVInit[4]={0,1,2,3}; 162 163 uint32x4_t index; 164 uint32x4_t delta; 165 uint32x4_t countV; 166 uint32x2_t countV2; 167 168 maxIdx = vdupq_n_u32(ULONG_MAX); 169 delta = vdupq_n_u32(4); 170 index = vld1q_u32(indexInit); 171 countV = vld1q_u32(countVInit); 172 173 174 /* Initialise the index value to zero. */ 175 outIndex = 0U; 176 177 /* Load first input value that act as reference value for comparison */ 178 if (blockSize <= 3) 179 { 180 out = *pSrc++; 181 182 blkCnt = blockSize - 1; 183 184 while (blkCnt > 0U) 185 { 186 /* Initialize maxVal to the next consecutive values one by one */ 187 maxVal1 = *pSrc++; 188 189 /* compare for the maximum value */ 190 if (out < maxVal1) 191 { 192 /* Update the maximum value and it's index */ 193 out = maxVal1; 194 outIndex = blockSize - blkCnt; 195 } 196 197 /* Decrement the loop counter */ 198 blkCnt--; 199 } 200 } 201 else 202 { 203 outV = vld1q_f32(pSrc); 204 pSrc += 4; 205 206 /* Compute 4 outputs at a time */ 207 blkCnt = (blockSize - 4 ) >> 2U; 208 209 while (blkCnt > 0U) 210 { 211 srcV = vld1q_f32(pSrc); 212 pSrc += 4; 213 214 idxV = vcgtq_f32(srcV, outV); 215 outV = vbslq_f32(idxV, srcV, outV ); 216 countV = vbslq_u32(idxV, index,countV ); 217 218 index = vaddq_u32(index,delta); 219 220 /* Decrement the loop counter */ 221 blkCnt--; 222 } 223 224 outV2 = vpmax_f32(vget_low_f32(outV),vget_high_f32(outV)); 225 outV2 = vpmax_f32(outV2,outV2); 226 out = vget_lane_f32(outV2, 0); 227 228 idxV = vceqq_f32(outV, vdupq_n_f32(out)); 229 countV = vbslq_u32(idxV, countV,maxIdx); 230 231 countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV)); 232 countV2 = vpmin_u32(countV2,countV2); 233 outIndex = vget_lane_u32(countV2,0); 234 235 /* if (blockSize - 1U) is not multiple of 4 */ 236 blkCnt = (blockSize - 4 ) % 4U; 237 238 while (blkCnt > 0U) 239 { 240 /* Initialize maxVal to the next consecutive values one by one */ 241 maxVal1 = *pSrc++; 242 243 /* compare for the maximum value */ 244 if (out < maxVal1) 245 { 246 /* Update the maximum value and it's index */ 247 out = maxVal1; 248 outIndex = blockSize - blkCnt ; 249 } 250 251 /* Decrement the loop counter */ 252 blkCnt--; 253 } 254 255 256 } 257 258 /* Store the maximum value and it's index into destination pointers */ 259 *pResult = out; 260 *pIndex = outIndex; 261 } 262 #else 263 void arm_max_f32( 264 const float32_t * pSrc, 265 uint32_t blockSize, 266 float32_t * pResult, 267 uint32_t * pIndex) 268 { 269 float32_t maxVal, out; /* Temporary variables to store the output value. */ 270 uint32_t blkCnt, outIndex; /* Loop counter */ 271 272 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) 273 uint32_t index; /* index of maximum value */ 274 #endif 275 276 /* Initialise index value to zero. */ 277 outIndex = 0U; 278 279 /* Load first input value that act as reference value for comparision */ 280 out = *pSrc++; 281 282 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) 283 /* Initialise index of maximum value. */ 284 index = 0U; 285 286 /* Loop unrolling: Compute 4 outputs at a time */ 287 blkCnt = (blockSize - 1U) >> 2U; 288 289 while (blkCnt > 0U) 290 { 291 /* Initialize maxVal to next consecutive values one by one */ 292 maxVal = *pSrc++; 293 294 /* compare for the maximum value */ 295 if (out < maxVal) 296 { 297 /* Update the maximum value and it's index */ 298 out = maxVal; 299 outIndex = index + 1U; 300 } 301 302 maxVal = *pSrc++; 303 if (out < maxVal) 304 { 305 out = maxVal; 306 outIndex = index + 2U; 307 } 308 309 maxVal = *pSrc++; 310 if (out < maxVal) 311 { 312 out = maxVal; 313 outIndex = index + 3U; 314 } 315 316 maxVal = *pSrc++; 317 if (out < maxVal) 318 { 319 out = maxVal; 320 outIndex = index + 4U; 321 } 322 323 index += 4U; 324 325 /* Decrement loop counter */ 326 blkCnt--; 327 } 328 329 /* Loop unrolling: Compute remaining outputs */ 330 blkCnt = (blockSize - 1U) % 4U; 331 332 #else 333 334 /* Initialize blkCnt with number of samples */ 335 blkCnt = (blockSize - 1U); 336 337 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 338 339 while (blkCnt > 0U) 340 { 341 /* Initialize maxVal to the next consecutive values one by one */ 342 maxVal = *pSrc++; 343 344 /* compare for the maximum value */ 345 if (out < maxVal) 346 { 347 /* Update the maximum value and it's index */ 348 out = maxVal; 349 outIndex = blockSize - blkCnt; 350 } 351 352 /* Decrement loop counter */ 353 blkCnt--; 354 } 355 356 /* Store the maximum value and it's index into destination pointers */ 357 *pResult = out; 358 *pIndex = outIndex; 359 } 360 #endif /* #if defined(ARM_MATH_NEON) */ 361 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ 362 363 /** 364 @} end of Max group 365 */