arm_vlog_f16.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_vlog_f16.c 4 * Description: Fast vectorized log 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/fast_math_functions_f16.h" 30 #include "dsp/support_functions_f16.h" 31 32 #if defined(ARM_FLOAT16_SUPPORTED) 33 34 /* Degree of the polynomial approximation */ 35 #define NB_DEG_LOGF16 3 36 37 /* 38 Related to the Log2 of the number of approximations. 39 For instance, with 3 there are 1 + 2^3 polynomials 40 */ 41 #define NB_DIV_LOGF16 3 42 43 /* Length of the LUT table */ 44 #define NB_LUT_LOGF16 (NB_DEG_LOGF16+1)*(1 + (1<<NB_DIV_LOGF16)) 45 46 47 /* 48 49 LUT of polynomial approximations. 50 51 Could be generated with: 52 53 ClearAll[lut, coefs, nb, deg]; 54 nb = 3; 55 deg = 3; 56 lut = Table[ 57 MiniMaxApproximation[ 58 Log[x/2^nb + i], {x, {10^-6, 1.0/2^nb}, deg, 0}, 59 MaxIterations -> 1000][[2, 1]], {i, 1, 2, (1.0/2^nb)}]; 60 coefs = Chop@Flatten[CoefficientList[lut, x]]; 61 62 */ 63 static float16_t lut_logf16[NB_LUT_LOGF16]={ 64 0,0.125,-0.00781197,0.00063974,0.117783, 65 0.111111,-0.00617212,0.000447935,0.223144, 66 0.1,-0.00499952,0.000327193,0.318454,0.0909091, 67 -0.00413191,0.000246234,0.405465,0.0833333, 68 -0.00347199,0.000189928,0.485508,0.0769231, 69 -0.00295841,0.00014956,0.559616,0.0714286, 70 -0.0025509,0.000119868,0.628609,0.0666667, 71 -0.00222213,0.0000975436,0.693147, 72 0.0625,-0.00195305,0.0000804357}; 73 74 75 float16_t logf16_scalar(float16_t x) 76 { 77 int16_t i = arm_typecast_s16_f16(x); 78 79 int32_t vecExpUnBiased = (i >> 10) - 15; 80 i = i - (vecExpUnBiased << 10); 81 float16_t vecTmpFlt1 = arm_typecast_f16_s16(i); 82 83 float16_t *lut; 84 int n; 85 float16_t tmp,v; 86 87 tmp = ((_Float16)vecTmpFlt1 - 1.0f16) * (1 << NB_DIV_LOGF16); 88 n = (int)floor((double)tmp); 89 v = (_Float16)tmp - (_Float16)n; 90 91 lut = lut_logf16 + n * (1+NB_DEG_LOGF16); 92 93 float16_t res = lut[NB_DEG_LOGF16-1]; 94 for(int j=NB_DEG_LOGF16-2; j >=0 ; j--) 95 { 96 res = (_Float16)lut[j] + (_Float16)v * (_Float16)res; 97 } 98 99 res = (_Float16)res + 0.693147f16 * (_Float16)vecExpUnBiased; 100 101 102 return(res); 103 } 104 105 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) 106 107 #include "arm_common_tables.h" 108 #include "arm_vec_math_f16.h" 109 110 111 float16x8_t vlogq_lut_f16(float16x8_t vecIn) 112 { 113 int16x8_t i = vreinterpretq_s16_f16(vecIn); 114 115 int16x8_t vecExpUnBiased = vsubq_n_s16(vshrq_n_s16(i,10), 15); 116 i = vsubq_s16(i,vshlq_n_s16(vecExpUnBiased,10)); 117 float16x8_t vecTmpFlt1 = vreinterpretq_f16_s16(i); 118 119 120 float16x8_t lutV; 121 int16x8_t n; 122 int16x8_t offset; 123 124 float16x8_t tmp,v,res; 125 126 tmp = vmulq_n_f16(vsubq_n_f16(vecTmpFlt1,1.0f16),(_Float16)(1 << NB_DIV_LOGF16)); 127 128 n = vcvtq_s16_f16(tmp); 129 v = vsubq_f16(tmp,vcvtq_f16_s16(n)); 130 131 132 offset = vmulq_n_s16(n,(1+NB_DEG_LOGF16)); 133 offset = vaddq_n_s16(offset,NB_DEG_LOGF16-1); 134 135 res = vldrhq_gather_shifted_offset_f16(lut_logf16,(uint16x8_t)offset); 136 offset = vsubq_n_s16(offset,1); 137 138 for(int j=NB_DEG_LOGF16-2; j >=0 ; j--) 139 { 140 lutV = vldrhq_gather_shifted_offset_f16(lut_logf16,(uint16x8_t)offset); 141 res = vfmaq_f16(lutV,v,res); 142 offset = vsubq_n_s16(offset,1); 143 144 } 145 146 res = vfmaq_n_f16(res,vcvtq_f16_s16(vecExpUnBiased),0.693147f16); 147 148 149 return(res); 150 151 } 152 153 #endif 154 155 /** 156 @ingroup groupFastMath 157 */ 158 159 /** 160 @addtogroup vlog 161 @{ 162 */ 163 164 /** 165 @brief Floating-point vector of log values. 166 @param[in] pSrc points to the input vector 167 @param[out] pDst points to the output vector 168 @param[in] blockSize number of samples in each vector 169 @return none 170 */ 171 172 173 void arm_vlog_f16( 174 const float16_t * pSrc, 175 float16_t * pDst, 176 uint32_t blockSize) 177 { 178 uint32_t blkCnt; 179 180 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) 181 f16x8_t src; 182 f16x8_t dst; 183 184 blkCnt = blockSize >> 3; 185 186 while (blkCnt > 0U) 187 { 188 src = vld1q(pSrc); 189 dst = vlogq_lut_f16(src); 190 vst1q(pDst, dst); 191 192 pSrc += 8; 193 pDst += 8; 194 /* Decrement loop counter */ 195 blkCnt--; 196 } 197 198 blkCnt = blockSize & 7; 199 #else 200 blkCnt = blockSize; 201 #endif 202 203 while (blkCnt > 0U) 204 { 205 /* C = log(A) */ 206 207 /* Calculate log and store result in destination buffer. */ 208 *pDst++ = logf16_scalar(*pSrc++); 209 210 /* Decrement loop counter */ 211 blkCnt--; 212 } 213 } 214 215 216 217 /** 218 @} end of vlog group 219 */ 220 221 222 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */