/ src / isa / avx / log1pf.c
log1pf.c
  1  /*
  2   * Copyright (C) 2008-2020 Advanced Micro Devices, Inc. All rights reserved.
  3   *
  4   * Redistribution and use in source and binary forms, with or without modification,
  5   * are permitted provided that the following conditions are met:
  6   * 1. Redistributions of source code must retain the above copyright notice,
  7   *    this list of conditions and the following disclaimer.
  8   * 2. Redistributions in binary form must reproduce the above copyright notice,
  9   *    this list of conditions and the following disclaimer in the documentation
 10   *    and/or other materials provided with the distribution.
 11   * 3. Neither the name of the copyright holder nor the names of its contributors
 12   *    may be used to endorse or promote products derived from this software without
 13   *    specific prior written permission.
 14   *
 15   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 18   * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 19   * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 20   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 21   * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 22   * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 23   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 24   * POSSIBILITY OF SUCH DAMAGE.
 25   *
 26   */
 27  
 28  #include "libm_amd.h"
 29  #include "libm_util_amd.h"
 30  #include "libm_special.h"
 31  
 32  #undef _FUNCNAME
 33  #define _FUNCNAME "log1pf"
 34  
 35  float FN_PROTOTYPE_BAS64(log1pf)(float x)
 36  {
 37  
 38    int xexp;
 39    double dx, r, f, f1, f2, q, u, v, z1, z2, poly, m2;
 40    int index;
 41    unsigned int ux, ax;
 42    unsigned long long lux;
 43  
 44    /*
 45      Computes natural log(1+x) for float arguments. Algorithm is
 46      basically a promotion of the arguments to double followed
 47      by an inlined version of the double algorithm, simplified
 48      for efficiency (see log1p_amd.c). Simplifications include:
 49      * Special algorithm for arguments near 0.0 not required
 50      * Scaling of denormalised arguments not required
 51      * Shorter core series approximations used
 52      Note that we use a lookup table of size 64 rather than 128,
 53      and compensate by having extra terms in the minimax polynomial
 54      for the kernel approximation.
 55    */
 56  
 57  /* Arrays ln_lead_table and ln_tail_table contain
 58     leading and trailing parts respectively of precomputed
 59     values of natural log(1+i/64), for i = 0, 1, ..., 64.
 60     ln_lead_table contains the first 24 bits of precision,
 61     and ln_tail_table contains a further 53 bits precision. */
 62  
 63    static const double ln_lead_table[65] = {
 64      0.00000000000000000000e+00,   /* 0x0000000000000000 */
 65      1.55041813850402832031e-02,   /* 0x3f8fc0a800000000 */
 66      3.07716131210327148438e-02,   /* 0x3f9f829800000000 */
 67      4.58095073699951171875e-02,   /* 0x3fa7745800000000 */
 68      6.06245994567871093750e-02,   /* 0x3faf0a3000000000 */
 69      7.52233862876892089844e-02,   /* 0x3fb341d700000000 */
 70      8.96121263504028320312e-02,   /* 0x3fb6f0d200000000 */
 71      1.03796780109405517578e-01,   /* 0x3fba926d00000000 */
 72      1.17783010005950927734e-01,   /* 0x3fbe270700000000 */
 73      1.31576299667358398438e-01,   /* 0x3fc0d77e00000000 */
 74      1.45181953907012939453e-01,   /* 0x3fc2955280000000 */
 75      1.58604979515075683594e-01,   /* 0x3fc44d2b00000000 */
 76      1.71850204467773437500e-01,   /* 0x3fc5ff3000000000 */
 77      1.84922337532043457031e-01,   /* 0x3fc7ab8900000000 */
 78      1.97825729846954345703e-01,   /* 0x3fc9525a80000000 */
 79      2.10564732551574707031e-01,   /* 0x3fcaf3c900000000 */
 80      2.23143517971038818359e-01,   /* 0x3fcc8ff780000000 */
 81      2.35566020011901855469e-01,   /* 0x3fce270700000000 */
 82      2.47836112976074218750e-01,   /* 0x3fcfb91800000000 */
 83      2.59957492351531982422e-01,   /* 0x3fd0a324c0000000 */
 84      2.71933674812316894531e-01,   /* 0x3fd1675c80000000 */
 85      2.83768117427825927734e-01,   /* 0x3fd22941c0000000 */
 86      2.95464158058166503906e-01,   /* 0x3fd2e8e280000000 */
 87      3.07025015354156494141e-01,   /* 0x3fd3a64c40000000 */
 88      3.18453729152679443359e-01,   /* 0x3fd4618bc0000000 */
 89      3.29753279685974121094e-01,   /* 0x3fd51aad80000000 */
 90      3.40926527976989746094e-01,   /* 0x3fd5d1bd80000000 */
 91      3.51976394653320312500e-01,   /* 0x3fd686c800000000 */
 92      3.62905442714691162109e-01,   /* 0x3fd739d7c0000000 */
 93      3.73716354370117187500e-01,   /* 0x3fd7eaf800000000 */
 94      3.84411692619323730469e-01,   /* 0x3fd89a3380000000 */
 95      3.94993782043457031250e-01,   /* 0x3fd9479400000000 */
 96      4.05465066432952880859e-01,   /* 0x3fd9f323c0000000 */
 97      4.15827870368957519531e-01,   /* 0x3fda9cec80000000 */
 98      4.26084339618682861328e-01,   /* 0x3fdb44f740000000 */
 99      4.36236739158630371094e-01,   /* 0x3fdbeb4d80000000 */
100      4.46287095546722412109e-01,   /* 0x3fdc8ff7c0000000 */
101      4.56237375736236572266e-01,   /* 0x3fdd32fe40000000 */
102      4.66089725494384765625e-01,   /* 0x3fddd46a00000000 */
103      4.75845873355865478516e-01,   /* 0x3fde744240000000 */
104      4.85507786273956298828e-01,   /* 0x3fdf128f40000000 */
105      4.95077252388000488281e-01,   /* 0x3fdfaf5880000000 */
106      5.04556000232696533203e-01,   /* 0x3fe02552a0000000 */
107      5.13945698738098144531e-01,   /* 0x3fe0723e40000000 */
108      5.23248136043548583984e-01,   /* 0x3fe0be72e0000000 */
109      5.32464742660522460938e-01,   /* 0x3fe109f380000000 */
110      5.41597247123718261719e-01,   /* 0x3fe154c3c0000000 */
111      5.50647079944610595703e-01,   /* 0x3fe19ee6a0000000 */
112      5.59615731239318847656e-01,   /* 0x3fe1e85f40000000 */
113      5.68504691123962402344e-01,   /* 0x3fe23130c0000000 */
114      5.77315330505371093750e-01,   /* 0x3fe2795e00000000 */
115      5.86049020290374755859e-01,   /* 0x3fe2c0e9e0000000 */
116      5.94707071781158447266e-01,   /* 0x3fe307d720000000 */
117      6.03290796279907226562e-01,   /* 0x3fe34e2880000000 */
118      6.11801505088806152344e-01,   /* 0x3fe393e0c0000000 */
119      6.20240390300750732422e-01,   /* 0x3fe3d90260000000 */
120      6.28608644008636474609e-01,   /* 0x3fe41d8fe0000000 */
121      6.36907458305358886719e-01,   /* 0x3fe4618bc0000000 */
122      6.45137906074523925781e-01,   /* 0x3fe4a4f840000000 */
123      6.53301239013671875000e-01,   /* 0x3fe4e7d800000000 */
124      6.61398470401763916016e-01,   /* 0x3fe52a2d20000000 */
125      6.69430613517761230469e-01,   /* 0x3fe56bf9c0000000 */
126      6.77398800849914550781e-01,   /* 0x3fe5ad4040000000 */
127      6.85303986072540283203e-01,   /* 0x3fe5ee02a0000000 */
128      6.93147122859954833984e-01};  /* 0x3fe62e42e0000000 */
129  
130    static const double ln_tail_table[65] = {
131      0.00000000000000000000e+00,   /* 0x0000000000000000 */
132      5.15092497094772879206e-09,   /* 0x3e361f807c79f3db */
133      4.55457209735272790188e-08,   /* 0x3e6873c1980267c8 */
134      2.86612990859791781788e-08,   /* 0x3e5ec65b9f88c69e */
135      2.23596477332056055352e-08,   /* 0x3e58022c54cc2f99 */
136      3.49498983167142274770e-08,   /* 0x3e62c37a3a125330 */
137      3.23392843005887000414e-08,   /* 0x3e615cad69737c93 */
138      1.35722380472479366661e-08,   /* 0x3e4d256ab1b285e9 */
139      2.56504325268044191098e-08,   /* 0x3e5b8abcb97a7aa2 */
140      5.81213608741512136843e-08,   /* 0x3e6f34239659a5dc */
141      5.59374849578288093334e-08,   /* 0x3e6e07fd48d30177 */
142      5.06615629004996189970e-08,   /* 0x3e6b32df4799f4f6 */
143      5.24588857848400955725e-08,   /* 0x3e6c29e4f4f21cf8 */
144      9.61968535632653505972e-10,   /* 0x3e1086c848df1b59 */
145      1.34829655346594463137e-08,   /* 0x3e4cf456b4764130 */
146      3.65557749306383026498e-08,   /* 0x3e63a02ffcb63398 */
147      3.33431709374069198903e-08,   /* 0x3e61e6a6886b0976 */
148      5.13008650536088382197e-08,   /* 0x3e6b8abcb97a7aa2 */
149      5.09285070380306053751e-08,   /* 0x3e6b578f8aa35552 */
150      3.20853940845502057341e-08,   /* 0x3e6139c871afb9fc */
151      4.06713248643004200446e-08,   /* 0x3e65d5d30701ce64 */
152      5.57028186706125221168e-08,   /* 0x3e6de7bcb2d12142 */
153      5.48356693724804282546e-08,   /* 0x3e6d708e984e1664 */
154      1.99407553679345001938e-08,   /* 0x3e556945e9c72f36 */
155      1.96585517245087232086e-09,   /* 0x3e20e2f613e85bda */
156      6.68649386072067321503e-09,   /* 0x3e3cb7e0b42724f6 */
157      5.89936034642113390002e-08,   /* 0x3e6fac04e52846c7 */
158      2.85038578721554472484e-08,   /* 0x3e5e9b14aec442be */
159      5.09746772910284482606e-08,   /* 0x3e6b5de8034e7126 */
160      5.54234668933210171467e-08,   /* 0x3e6dc157e1b259d3 */
161      6.29100830926604004874e-09,   /* 0x3e3b05096ad69c62 */
162      2.61974119468563937716e-08,   /* 0x3e5c2116faba4cdd */
163      4.16752115011186398935e-08,   /* 0x3e665fcc25f95b47 */
164      2.47747534460820790327e-08,   /* 0x3e5a9a08498d4850 */
165      5.56922172017964209793e-08,   /* 0x3e6de647b1465f77 */
166      2.76162876992552906035e-08,   /* 0x3e5da71b7bf7861d */
167      7.08169709942321478061e-09,   /* 0x3e3e6a6886b09760 */
168      5.77453510221151779025e-08,   /* 0x3e6f0075eab0ef64 */
169      4.43021445893361960146e-09,   /* 0x3e33071282fb989b */
170      3.15140984357495864573e-08,   /* 0x3e60eb43c3f1bed2 */
171      2.95077445089736670973e-08,   /* 0x3e5faf06ecb35c84 */
172      1.44098510263167149349e-08,   /* 0x3e4ef1e63db35f68 */
173      1.05196987538551827693e-08,   /* 0x3e469743fb1a71a5 */
174      5.23641361722697546261e-08,   /* 0x3e6c1cdf404e5796 */
175      7.72099925253243069458e-09,   /* 0x3e4094aa0ada625e */
176      5.62089493829364197156e-08,   /* 0x3e6e2d4c96fde3ec */
177      3.53090261098577946927e-08,   /* 0x3e62f4d5e9a98f34 */
178      3.80080516835568242269e-08,   /* 0x3e6467c96ecc5cbe */
179      5.66961038386146408282e-08,   /* 0x3e6e7040d03dec5a */
180      4.42287063097349852717e-08,   /* 0x3e67bebf4282de36 */
181      3.45294525105681104660e-08,   /* 0x3e6289b11aeb783f */
182      2.47132034530447431509e-08,   /* 0x3e5a891d1772f538 */
183      3.59655343422487209774e-08,   /* 0x3e634f10be1fb591 */
184      5.51581770357780862071e-08,   /* 0x3e6d9ce1d316eb93 */
185      3.60171867511861372793e-08,   /* 0x3e63562a19a9c442 */
186      1.94511067964296180547e-08,   /* 0x3e54e2adf548084c */
187      1.54137376631349347838e-08,   /* 0x3e508ce55cc8c97a */
188      3.93171034490174464173e-09,   /* 0x3e30e2f613e85bda */
189      5.52990607758839766440e-08,   /* 0x3e6db03ebb0227bf */
190      3.29990737637586136511e-08,   /* 0x3e61b75bb09cb098 */
191      1.18436010922446096216e-08,   /* 0x3e496f16abb9df22 */
192      4.04248680368301346709e-08,   /* 0x3e65b3f399411c62 */
193      2.27418915900284316293e-08,   /* 0x3e586b3e59f65355 */
194      1.70263791333409206020e-08,   /* 0x3e52482ceae1ac12 */
195      5.76999904754328540596e-08};  /* 0x3e6efa39ef35793c */
196  
197    static const double
198      log2 = 6.931471805599453e-01,       /* 0x3fe62e42fefa39ef */
199  
200    /* Approximating polynomial coefficients */
201      cb_1 = 8.33333333333333593622e-02,  /* 0x3fb5555555555557 */
202      cb_2 = 1.24999999978138668903e-02;  /* 0x3f89999999865ede */
203  
204    GET_BITS_SP32(x, ux);
205    ax = ux & ~SIGNBIT_SP32;
206  
207    if ((ux & EXPBITS_SP32) == EXPBITS_SP32)
208      {
209        /* x is either NaN or infinity */
210        if (ux & MANTBITS_SP32)
211          {
212            /* x is NaN */
213  #ifdef WINDOWS
214            return __amd_handle_errorf(_FUNCNAME, __amd_log1p, ux|0x00400000, _DOMAIN, AMD_F_NONE, EDOM, x, 0.0, 1);
215  #else
216            return x + x; /* Raise invalid if it is a signalling NaN */
217  #endif
218          }
219        else
220          {
221            /* x is infinity */
222            if (ux & SIGNBIT_SP32)
223              {
224                /* x is negative infinity. Return a NaN. */
225                return __amd_handle_errorf(_FUNCNAME, __amd_log1p, INDEFBITPATT_SP32, _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1);
226              }
227            else
228              return x;
229          }
230      }
231    else if (ux >= 0xbf800000)
232      {
233        /* x <= -1.0 */
234        if (ux > 0xbf800000)
235          {
236            /* x is less than -1.0. Return a NaN. */
237            return __amd_handle_errorf(_FUNCNAME, __amd_log1p, INDEFBITPATT_SP32, _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1);
238          }
239        else
240          {
241            /* x is exactly -1.0. Return -infinity with div-by-zero flag. */
242            return  __amd_handle_errorf(_FUNCNAME, __amd_log1p, NINFBITPATT_SP32, _SING, AMD_F_DIVBYZERO, ERANGE, x, 0.0, 1);
243  
244          }
245      }
246    else if (ax < 0x33800000)
247      {
248        if (ax == 0x00000000)
249          {
250            /* x is +/-zero. Return the same zero. */
251            return x;
252          }
253        else
254          /* abs(x) is less than float epsilon. Return x with inexact. */
255            return x;
256      }
257  
258    dx = x;
259    /*
260      First, we decompose the argument dx to the form
261      1 + dx  =  2**M  *  (F1  +  F2),
262      where  1 <= F1+F2 < 2, M has the value of an integer,
263      F1 = 1 + j/64, j ranges from 0 to 64, and |F2| <= 1/128.
264  
265      Second, we approximate log( 1 + F2/F1 ) by an odd polynomial
266      in U, where U  =  2 F2 / (2 F2 + F1).
267      Note that log( 1 + F2/F1 ) = log( 1 + U/2 ) - log( 1 - U/2 ).
268      The core approximation calculates
269      Poly = [log( 1 + U/2 ) - log( 1 - U/2 )]/U   -   1.
270      Note that  log(1 + U/2) - log(1 - U/2) = 2 arctanh ( U/2 ),
271      thus, Poly =  2 arctanh( U/2 ) / U  -  1.
272  
273      It is not hard to see that
274      log(dx) = M*log(2) + log(F1) + log( 1 + F2/F1 ).
275      Hence, we return Z1 = log(F1), and  Z2 = log( 1 + F2/F1).
276      The values of log(F1) are calculated beforehand and stored
277      in the program.
278    */
279  
280    f = 1.0 + dx;
281    GET_BITS_DP64(f, lux);
282  
283    /* Store the exponent of f = 1 + dx in xexp and put
284       f into the range [1.0,2.0) */
285    xexp = (int)((lux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
286    PUT_BITS_DP64((lux & MANTBITS_DP64) | ONEEXPBITS_DP64, f);
287  
288    /* Now  (1+dx) = 2**(xexp)  * f,  1 <= f < 2. */
289  
290    /* Set index to be the nearest integer to 64*f */
291    /* 64 <= index <= 128 */
292    /*
293      r = 64.0 * f;
294      index = (int)(r + 0.5);
295    */
296    /* This code instead of the above can save several cycles.
297       It only works because 64 <= r < 128, so
298       the nearest integer is always contained in exactly
299       7 bits, and the right shift is always the same. */
300    index = (int)((((lux & 0x000fc00000000000) | 0x0010000000000000) >> 46)
301                  + ((lux & 0x0000200000000000) >> 45));
302  
303    f1 = index * 0.015625; /* 0.015625 = 1/64 */
304    index -= 64;
305  
306    /* Now take great care to compute f2 such that f1 + f2 = f */
307    if (xexp <= -2 || xexp >= MANTLENGTH_DP64 + 8)
308        {
309          f2 = f - f1;
310        }
311      else
312        {
313          /* Create the number m2 = 2.0^(-xexp) */
314          lux = (unsigned long long)(0x3ff - xexp) << EXPSHIFTBITS_DP64;
315          PUT_BITS_DP64(lux,m2);
316          if (xexp <= MANTLENGTH_DP64 - 1)
317            {
318              f2 = (m2 - f1) + m2*dx;
319            }
320          else
321            {
322              f2 = (m2*dx - f1) + m2;
323            }
324        }
325  
326    /* At this point, dx = 2**xexp * ( f1  +  f2 ) where
327       f1 = j/64, j = 1, 2, ..., 64 and |f2| <= 1/128. */
328  
329    z1 = ln_lead_table[index];
330    q = ln_tail_table[index];
331  
332    /* Calculate u = 2 f2 / ( 2 f1 + f2 ) = f2 / ( f1 + 0.5*f2 ) */
333    u = f2 / (f1 + 0.5 * f2);
334  
335    /* Here, |u| <= 2(exp(1/16)-1) / (exp(1/16)+1).
336       The core approximation calculates
337       poly = [log(1 + u/2) - log(1 - u/2)]/u  -  1  */
338    v = u * u;
339    poly = (v * (cb_1 + v * cb_2));
340    z2 = q + (u + u * poly);
341  
342    /* Now z1,z2 is an extra-precise approximation of log(f). */
343  
344    /* Add xexp * log(2) to z1,z2 to get the result log(1+x). */
345    r = xexp * log2 + z1 + z2;
346    /* Natural log(1+x) */
347    return (float)r;
348  }
349  
350