/ Drivers / CMSIS / DSP / Source / FastMathFunctions / arm_vlog_f16.c
arm_vlog_f16.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_vlog_f16.c
  4   * Description:  Fast vectorized log
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/fast_math_functions_f16.h"
 30  #include "dsp/support_functions_f16.h"
 31  
 32  #if defined(ARM_FLOAT16_SUPPORTED)
 33  
 34  /* Degree of the polynomial approximation */
 35  #define NB_DEG_LOGF16 3
 36  
 37  /*
 38  Related to the Log2 of the number of approximations.
 39  For instance, with 3 there are 1 + 2^3 polynomials
 40  */
 41  #define NB_DIV_LOGF16 3
 42  
 43  /* Length of the LUT table */
 44  #define NB_LUT_LOGF16 (NB_DEG_LOGF16+1)*(1 + (1<<NB_DIV_LOGF16))
 45  
 46  
 47  /*
 48  
 49  LUT of polynomial approximations.
 50  
 51  Could be generated with:
 52  
 53  ClearAll[lut, coefs, nb, deg];
 54  nb = 3;
 55  deg = 3;
 56  lut = Table[
 57     MiniMaxApproximation[
 58       Log[x/2^nb + i], {x, {10^-6, 1.0/2^nb}, deg, 0},
 59       MaxIterations -> 1000][[2, 1]], {i, 1, 2, (1.0/2^nb)}];
 60  coefs = Chop@Flatten[CoefficientList[lut, x]];
 61  
 62  */
 63  static float16_t lut_logf16[NB_LUT_LOGF16]={
 64     0,0.125,-0.00781197,0.00063974,0.117783,
 65     0.111111,-0.00617212,0.000447935,0.223144,
 66     0.1,-0.00499952,0.000327193,0.318454,0.0909091,
 67     -0.00413191,0.000246234,0.405465,0.0833333,
 68     -0.00347199,0.000189928,0.485508,0.0769231,
 69     -0.00295841,0.00014956,0.559616,0.0714286,
 70     -0.0025509,0.000119868,0.628609,0.0666667,
 71     -0.00222213,0.0000975436,0.693147,
 72     0.0625,-0.00195305,0.0000804357};
 73  
 74  
 75  float16_t logf16_scalar(float16_t x)
 76  {
 77      int16_t i =  arm_typecast_s16_f16(x);
 78  
 79      int32_t vecExpUnBiased = (i >> 10) - 15;
 80      i = i - (vecExpUnBiased << 10);
 81      float16_t vecTmpFlt1 = arm_typecast_f16_s16(i);
 82  
 83      float16_t *lut;
 84      int n;
 85      float16_t tmp,v;
 86  
 87      tmp = ((_Float16)vecTmpFlt1 - 1.0f16) * (1 << NB_DIV_LOGF16);
 88      n = (int)floor((double)tmp);
 89      v = (_Float16)tmp - (_Float16)n;
 90  
 91      lut = lut_logf16 + n * (1+NB_DEG_LOGF16);
 92  
 93      float16_t res = lut[NB_DEG_LOGF16-1];
 94      for(int j=NB_DEG_LOGF16-2; j >=0 ; j--)
 95      {
 96         res = (_Float16)lut[j] + (_Float16)v * (_Float16)res;
 97      }
 98  
 99      res = (_Float16)res + 0.693147f16 * (_Float16)vecExpUnBiased;
100  
101  
102      return(res);
103  }
104  
105  #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
106  
107  #include "arm_common_tables.h"
108  #include "arm_vec_math_f16.h"
109  
110  
111  float16x8_t vlogq_lut_f16(float16x8_t vecIn)
112  {
113      int16x8_t i =  vreinterpretq_s16_f16(vecIn);
114  
115      int16x8_t vecExpUnBiased = vsubq_n_s16(vshrq_n_s16(i,10), 15);
116      i = vsubq_s16(i,vshlq_n_s16(vecExpUnBiased,10));
117      float16x8_t vecTmpFlt1 = vreinterpretq_f16_s16(i);
118  
119  
120      float16x8_t lutV;
121      int16x8_t n;
122      int16x8_t offset;
123  
124      float16x8_t tmp,v,res;
125  
126      tmp = vmulq_n_f16(vsubq_n_f16(vecTmpFlt1,1.0f16),(_Float16)(1 << NB_DIV_LOGF16));
127  
128      n = vcvtq_s16_f16(tmp);
129      v = vsubq_f16(tmp,vcvtq_f16_s16(n));
130  
131  
132      offset = vmulq_n_s16(n,(1+NB_DEG_LOGF16));
133      offset = vaddq_n_s16(offset,NB_DEG_LOGF16-1);
134  
135      res = vldrhq_gather_shifted_offset_f16(lut_logf16,(uint16x8_t)offset);
136      offset = vsubq_n_s16(offset,1);
137  
138      for(int j=NB_DEG_LOGF16-2; j >=0 ; j--)
139      {
140         lutV = vldrhq_gather_shifted_offset_f16(lut_logf16,(uint16x8_t)offset);
141         res = vfmaq_f16(lutV,v,res);
142         offset = vsubq_n_s16(offset,1);
143  
144      }
145  
146      res = vfmaq_n_f16(res,vcvtq_f16_s16(vecExpUnBiased),0.693147f16);
147  
148  
149      return(res);
150  
151  }
152  
153  #endif
154  
155  /**
156    @ingroup groupFastMath
157   */
158  
159  /**
160    @addtogroup vlog
161    @{
162   */
163  
164  /**
165    @brief         Floating-point vector of log values.
166    @param[in]     pSrc       points to the input vector
167    @param[out]    pDst       points to the output vector
168    @param[in]     blockSize  number of samples in each vector
169    @return        none
170   */
171  
172  
173  void arm_vlog_f16(
174    const float16_t * pSrc,
175          float16_t * pDst,
176          uint32_t blockSize)
177  {
178     uint32_t blkCnt;
179  
180  #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
181     f16x8_t src;
182     f16x8_t dst;
183  
184     blkCnt = blockSize >> 3;
185  
186     while (blkCnt > 0U)
187     {
188        src = vld1q(pSrc);
189        dst = vlogq_lut_f16(src);
190        vst1q(pDst, dst);
191  
192        pSrc += 8;
193        pDst += 8;
194        /* Decrement loop counter */
195        blkCnt--;
196     }
197  
198     blkCnt = blockSize & 7;
199  #else
200     blkCnt = blockSize;
201  #endif
202  
203     while (blkCnt > 0U)
204     {
205        /* C = log(A) */
206  
207        /* Calculate log and store result in destination buffer. */
208        *pDst++ = logf16_scalar(*pSrc++);
209  
210        /* Decrement loop counter */
211        blkCnt--;
212     }
213  }
214  
215  
216  
217  /**
218    @} end of vlog group
219   */
220  
221  
222  #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */