/ include / libm_inlines_amd.h
libm_inlines_amd.h
  1  /*
  2   * Copyright (C) 2008-2020 Advanced Micro Devices, Inc. All rights reserved.
  3   *
  4   * Redistribution and use in source and binary forms, with or without modification,
  5   * are permitted provided that the following conditions are met:
  6   * 1. Redistributions of source code must retain the above copyright notice,
  7   *    this list of conditions and the following disclaimer.
  8   * 2. Redistributions in binary form must reproduce the above copyright notice,
  9   *    this list of conditions and the following disclaimer in the documentation
 10   *    and/or other materials provided with the distribution.
 11   * 3. Neither the name of the copyright holder nor the names of its contributors
 12   *    may be used to endorse or promote products derived from this software without
 13   *    specific prior written permission.
 14   *
 15   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 18   * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 19   * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 20   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 21   * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 22   * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 23   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 24   * POSSIBILITY OF SUCH DAMAGE.
 25   *
 26   */
 27  
 28  #ifndef LIBM_INLINES_AMD_H_INCLUDED
 29  #define LIBM_INLINES_AMD_H_INCLUDED 1
 30  
 31  
 32  #ifdef WINDOWS
 33  #define inline __inline
 34  #include "emmintrin.h"
 35  #endif
 36  
 37  /* Scales the double x by 2.0**n.
 38     Assumes EMIN <= n <= EMAX, though this condition is not checked. */
 39  static inline double scaleDouble_1(double x, int n)
 40  {
 41    double t;
 42    /* Construct the number t = 2.0**n */
 43    PUT_BITS_DP64(((long long)n + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t);
 44    return x*t;
 45  }
 46  
 47  
 48  /* Scales the double x by 2.0**n.
 49     Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */
 50  static inline double scaleDouble_2(double x, int n)
 51  {
 52    double t1, t2;
 53    int n1, n2;
 54    n1 = n / 2;
 55    n2 = n - n1;
 56    /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */
 57    PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1);
 58    PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2);
 59    return (x*t1)*t2;
 60  }
 61  
 62  
 63  
 64  
 65  /* Scales the double x by 2.0**n.
 66     Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */
 67  static inline double scaleDouble_3(double x, int n)
 68  {
 69    double t1, t2, t3;
 70    int n1, n2, n3;
 71    n1 = n / 3;
 72    n2 = (n - n1) / 2;
 73    n3 = n - n1 - n2;
 74    /* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */
 75    PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1);
 76    PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2);
 77    PUT_BITS_DP64(((long long)n3 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t3);
 78    return ((x*t1)*t2)*t3;
 79  }
 80  
 81  
 82  /* Scales the float x by 2.0**n.
 83     Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */
 84  static inline float scaleFloat_2(float x, int n)
 85  {
 86    float t1, t2;
 87    int n1, n2;
 88    n1 = n / 2;
 89    n2 = n - n1;
 90    /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */
 91    PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1);
 92    PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2);
 93    return (x*t1)*t2;
 94  }
 95  
 96  
 97  
 98  /* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2).
 99     Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments
100     abs(x) > large/(ln(base)) (where large is the largest representable
101     floating point number) should be handled separately instead of calling
102     this function. This function is called by exp_amd, exp2_amd, exp10_amd,
103     cosh_amd and sinh_amd. */
104  static inline void splitexp(double x, double logbase,
105                              double thirtytwo_by_logbaseof2,
106                              double logbaseof2_by_32_lead,
107                              double logbaseof2_by_32_trail,
108                              int *m, double *z1, double *z2)
109  {
110    double q, r, r1, r2, f1, f2;
111    int n, j;
112  
113  /* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain
114     leading and trailing parts respectively of precomputed
115     values of pow(2.0,j/32.0), for j = 0, 1, ..., 31.
116     two_to_jby32_lead_table contains the first 25 bits of precision,
117     and two_to_jby32_trail_table contains a further 53 bits precision. */
118  
119    static const double two_to_jby32_lead_table[32] = {
120      1.00000000000000000000e+00,   /* 0x3ff0000000000000 */
121      1.02189713716506958008e+00,   /* 0x3ff059b0d0000000 */
122      1.04427373409271240234e+00,   /* 0x3ff0b55860000000 */
123      1.06714040040969848633e+00,   /* 0x3ff11301d0000000 */
124      1.09050768613815307617e+00,   /* 0x3ff172b830000000 */
125      1.11438673734664916992e+00,   /* 0x3ff1d48730000000 */
126      1.13878858089447021484e+00,   /* 0x3ff2387a60000000 */
127      1.16372483968734741211e+00,   /* 0x3ff29e9df0000000 */
128      1.18920707702636718750e+00,   /* 0x3ff306fe00000000 */
129      1.21524733304977416992e+00,   /* 0x3ff371a730000000 */
130      1.24185776710510253906e+00,   /* 0x3ff3dea640000000 */
131      1.26905095577239990234e+00,   /* 0x3ff44e0860000000 */
132      1.29683953523635864258e+00,   /* 0x3ff4bfdad0000000 */
133      1.32523661851882934570e+00,   /* 0x3ff5342b50000000 */
134      1.35425549745559692383e+00,   /* 0x3ff5ab07d0000000 */
135      1.38390988111495971680e+00,   /* 0x3ff6247eb0000000 */
136      1.41421353816986083984e+00,   /* 0x3ff6a09e60000000 */
137      1.44518077373504638672e+00,   /* 0x3ff71f75e0000000 */
138      1.47682613134384155273e+00,   /* 0x3ff7a11470000000 */
139      1.50916439294815063477e+00,   /* 0x3ff8258990000000 */
140      1.54221081733703613281e+00,   /* 0x3ff8ace540000000 */
141      1.57598084211349487305e+00,   /* 0x3ff93737b0000000 */
142      1.61049032211303710938e+00,   /* 0x3ff9c49180000000 */
143      1.64575546979904174805e+00,   /* 0x3ffa5503b0000000 */
144      1.68179279565811157227e+00,   /* 0x3ffae89f90000000 */
145      1.71861928701400756836e+00,   /* 0x3ffb7f76f0000000 */
146      1.75625211000442504883e+00,   /* 0x3ffc199bd0000000 */
147      1.79470902681350708008e+00,   /* 0x3ffcb720d0000000 */
148      1.83400803804397583008e+00,   /* 0x3ffd5818d0000000 */
149      1.87416762113571166992e+00,   /* 0x3ffdfc9730000000 */
150      1.91520655155181884766e+00,   /* 0x3ffea4afa0000000 */
151      1.95714408159255981445e+00};  /* 0x3fff507650000000 */
152  
153    static const double two_to_jby32_trail_table[32] = {
154      0.00000000000000000000e+00,   /* 0x0000000000000000 */
155      1.14890470981563546737e-08,   /* 0x3e48ac2ba1d73e2a */
156      4.83347014379782142328e-08,   /* 0x3e69f3121ec53172 */
157      2.67125131841396124714e-10,   /* 0x3df25b50a4ebbf1b */
158      4.65271045830351350190e-08,   /* 0x3e68faa2f5b9bef9 */
159      5.24924336638693782574e-09,   /* 0x3e368b9aa7805b80 */
160      5.38622214388600821910e-08,   /* 0x3e6ceac470cd83f6 */
161      1.90902301017041969782e-08,   /* 0x3e547f7b84b09745 */
162      3.79763538792174980894e-08,   /* 0x3e64636e2a5bd1ab */
163      2.69306947081946450986e-08,   /* 0x3e5ceaa72a9c5154 */
164      4.49683815095311756138e-08,   /* 0x3e682468446b6824 */
165      1.41933332021066904914e-09,   /* 0x3e18624b40c4dbd0 */
166      1.94146510233556266402e-08,   /* 0x3e54d8a89c750e5e */
167      2.46409119489264118569e-08,   /* 0x3e5a753e077c2a0f */
168      4.94812958044698886494e-08,   /* 0x3e6a90a852b19260 */
169      8.48872238075784476136e-10,   /* 0x3e0d2ac258f87d03 */
170      2.42032342089579394887e-08,   /* 0x3e59fcef32422cbf */
171      3.32420002333182569170e-08,   /* 0x3e61d8bee7ba46e2 */
172      1.45956577586525322754e-08,   /* 0x3e4f580c36bea881 */
173      3.46452721050003920866e-08,   /* 0x3e62999c25159f11 */
174      8.07090469079979051284e-09,   /* 0x3e415506dadd3e2a */
175      2.99439161340839520436e-09,   /* 0x3e29b8bc9e8a0388 */
176      9.83621719880452147153e-09,   /* 0x3e451f8480e3e236 */
177      8.35492309647188080486e-09,   /* 0x3e41f12ae45a1224 */
178      3.48493175137966283582e-08,   /* 0x3e62b5a75abd0e6a */
179      1.11084703472699692902e-08,   /* 0x3e47daf237553d84 */
180      5.03688744342840346564e-08,   /* 0x3e6b0aa538444196 */
181      4.81896001063495806249e-08,   /* 0x3e69df20d22a0798 */
182      4.83653666334089557746e-08,   /* 0x3e69f7490e4bb40b */
183      1.29745882314081237628e-08,   /* 0x3e4bdcdaf5cb4656 */
184      9.84532844621636118964e-09,   /* 0x3e452486cc2c7b9d */
185      4.25828404545651943883e-08};  /* 0x3e66dc8a80ce9f09 */
186  
187      /*
188        Step 1. Reduce the argument.
189  
190        To perform argument reduction, we find the integer n such that
191        x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64.
192        n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and
193        remainder by x - n*logbaseof2/32. The calculation of n is
194        straightforward whereas the computation of x - n*logbaseof2/32
195        must be carried out carefully.
196        logbaseof2/32 is so represented in two pieces that
197        (1) logbaseof2/32 is known to extra precision, (2) the product
198        of n and the leading piece is a model number and is hence
199        calculated without error, and (3) the subtraction of the value
200        obtained in (2) from x is a model number and is hence again
201        obtained without error.
202      */
203  
204      r = x * thirtytwo_by_logbaseof2;
205      /* Set n = nearest integer to r */
206      /* This is faster on Hammer */
207      if (r > 0)
208        n = (int)(r + 0.5);
209      else
210        n = (int)(r - 0.5);
211  
212      r1 = x - n * logbaseof2_by_32_lead;
213      r2 =   - n * logbaseof2_by_32_trail;
214  
215      /* Set j = n mod 32:   5 mod 32 = 5,   -5 mod 32 = 27,  etc. */
216      /* j = n % 32;
217         if (j < 0) j += 32; */
218      j = n & 0x0000001f;
219  
220      f1 = two_to_jby32_lead_table[j];
221      f2 = two_to_jby32_trail_table[j];
222  
223      *m = (n - j) / 32;
224  
225      /* Step 2. The following is the core approximation. We approximate
226         exp(r1+r2)-1 by a polynomial. */
227  
228      r1 *= logbase; r2 *= logbase;
229  
230      r = r1 + r2;
231      q = r1 + (r2 +
232                r*r*( 5.00000000000000008883e-01 +
233                        r*( 1.66666666665260878863e-01 +
234                        r*( 4.16666666662260795726e-02 +
235                        r*( 8.33336798434219616221e-03 +
236                        r*( 1.38889490863777199667e-03 ))))));
237  
238      /* Step 3. Function value reconstruction.
239         We now reconstruct the exponential of the input argument
240         so that exp(x) = 2**m * (z1 + z2).
241         The order of the computation below must be strictly observed. */
242  
243      *z1 = f1;
244      *z2 = f2 + ((f1 + f2) * q);
245  }
246  
247  
248  
249  
250  /* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2).
251     Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments
252     abs(x) > large/(ln(base)) (where large is the largest representable
253     floating point number) should be handled separately instead of calling
254     this function. This function is called by exp_amd, exp2_amd, exp10_amd,
255     cosh_amd and sinh_amd. */
256  static inline void splitexpf(float x, float logbase,
257                               float thirtytwo_by_logbaseof2,
258                               float logbaseof2_by_32_lead,
259                               float logbaseof2_by_32_trail,
260                               int *m, float *z1, float *z2)
261  {
262    float q, r, r1, r2, f1, f2;
263    int n, j;
264  
265  /* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain
266     leading and trailing parts respectively of precomputed
267     values of pow(2.0,j/32.0), for j = 0, 1, ..., 31.
268     two_to_jby32_lead_table contains the first 10 bits of precision,
269     and two_to_jby32_trail_table contains a further 24 bits precision. */
270  
271    static const float two_to_jby32_lead_table[32] = {
272      1.0000000000E+00F,  /* 0x3F800000 */
273      1.0214843750E+00F,  /* 0x3F82C000 */
274      1.0429687500E+00F,  /* 0x3F858000 */
275      1.0664062500E+00F,  /* 0x3F888000 */
276      1.0898437500E+00F,  /* 0x3F8B8000 */
277      1.1132812500E+00F,  /* 0x3F8E8000 */
278      1.1386718750E+00F,  /* 0x3F91C000 */
279      1.1621093750E+00F,  /* 0x3F94C000 */
280      1.1875000000E+00F,  /* 0x3F980000 */
281      1.2148437500E+00F,  /* 0x3F9B8000 */
282      1.2402343750E+00F,  /* 0x3F9EC000 */
283      1.2675781250E+00F,  /* 0x3FA24000 */
284      1.2949218750E+00F,  /* 0x3FA5C000 */
285      1.3242187500E+00F,  /* 0x3FA98000 */
286      1.3535156250E+00F,  /* 0x3FAD4000 */
287      1.3828125000E+00F,  /* 0x3FB10000 */
288      1.4140625000E+00F,  /* 0x3FB50000 */
289      1.4433593750E+00F,  /* 0x3FB8C000 */
290      1.4765625000E+00F,  /* 0x3FBD0000 */
291      1.5078125000E+00F,  /* 0x3FC10000 */
292      1.5410156250E+00F,  /* 0x3FC54000 */
293      1.5742187500E+00F,  /* 0x3FC98000 */
294      1.6093750000E+00F,  /* 0x3FCE0000 */
295      1.6445312500E+00F,  /* 0x3FD28000 */
296      1.6816406250E+00F,  /* 0x3FD74000 */
297      1.7167968750E+00F,  /* 0x3FDBC000 */
298      1.7558593750E+00F,  /* 0x3FE0C000 */
299      1.7929687500E+00F,  /* 0x3FE58000 */
300      1.8339843750E+00F,  /* 0x3FEAC000 */
301      1.8730468750E+00F,  /* 0x3FEFC000 */
302      1.9140625000E+00F,  /* 0x3FF50000 */
303      1.9570312500E+00F}; /* 0x3FFA8000 */
304  
305    static const float two_to_jby32_trail_table[32] = {
306      0.0000000000E+00F,  /* 0x00000000 */
307      4.1277357377E-04F,  /* 0x39D86988 */
308      1.3050324051E-03F,  /* 0x3AAB0D9F */
309      7.3415064253E-04F,  /* 0x3A407404 */
310      6.6398258787E-04F,  /* 0x3A2E0F1E */
311      1.1054925853E-03F,  /* 0x3A90E62D */
312      1.1675967835E-04F,  /* 0x38F4DCE0 */
313      1.6154836630E-03F,  /* 0x3AD3BEA3 */
314      1.7071149778E-03F,  /* 0x3ADFC146 */
315      4.0360994171E-04F,  /* 0x39D39B9C */
316      1.6234370414E-03F,  /* 0x3AD4C982 */
317      1.4728321694E-03F,  /* 0x3AC10C0C */
318      1.9176795613E-03F,  /* 0x3AFB5AA6 */
319      1.0178930825E-03F,  /* 0x3A856AD3 */
320      7.3992193211E-04F,  /* 0x3A41F752 */
321      1.0973819299E-03F,  /* 0x3A8FD607 */
322      1.5106226783E-04F,  /* 0x391E6678 */
323      1.8214319134E-03F,  /* 0x3AEEBD1D */
324      2.6364589576E-04F,  /* 0x398A39F4 */
325      1.3519275235E-03F,  /* 0x3AB13329 */
326      1.1952003697E-03F,  /* 0x3A9CA845 */
327      1.7620950239E-03F,  /* 0x3AE6F619 */
328      1.1153318919E-03F,  /* 0x3A923054 */
329      1.2242280645E-03F,  /* 0x3AA07647 */
330      1.5220546629E-04F,  /* 0x391F9958 */
331      1.8224230735E-03F,  /* 0x3AEEDE5F */
332      3.9278529584E-04F,  /* 0x39CDEEC0 */
333      1.7403248930E-03F,  /* 0x3AE41B9D */
334      2.3711356334E-05F,  /* 0x37C6E7C0 */
335      1.1207590578E-03F,  /* 0x3A92E66F */
336      1.1440613307E-03F,  /* 0x3A95F454 */
337      1.1287408415E-04F}; /* 0x38ECB6D0 */
338  
339      /*
340        Step 1. Reduce the argument.
341  
342        To perform argument reduction, we find the integer n such that
343        x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64.
344        n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and
345        remainder by x - n*logbaseof2/32. The calculation of n is
346        straightforward whereas the computation of x - n*logbaseof2/32
347        must be carried out carefully.
348        logbaseof2/32 is so represented in two pieces that
349        (1) logbaseof2/32 is known to extra precision, (2) the product
350        of n and the leading piece is a model number and is hence
351        calculated without error, and (3) the subtraction of the value
352        obtained in (2) from x is a model number and is hence again
353        obtained without error.
354      */
355  
356      r = x * thirtytwo_by_logbaseof2;
357      /* Set n = nearest integer to r */
358      /* This is faster on Hammer */
359      if (r > 0)
360        n = (int)(r + 0.5F);
361      else
362        n = (int)(r - 0.5F);
363  
364      r1 = x - n * logbaseof2_by_32_lead;
365      r2 =   - n * logbaseof2_by_32_trail;
366  
367      /* Set j = n mod 32:   5 mod 32 = 5,   -5 mod 32 = 27,  etc. */
368      /* j = n % 32;
369         if (j < 0) j += 32; */
370      j = n & 0x0000001f;
371  
372      f1 = two_to_jby32_lead_table[j];
373      f2 = two_to_jby32_trail_table[j];
374  
375      *m = (n - j) / 32;
376  
377      /* Step 2. The following is the core approximation. We approximate
378         exp(r1+r2)-1 by a polynomial. */
379  
380      r1 *= logbase; r2 *= logbase;
381  
382      r = r1 + r2;
383      q = r1 + (r2 +
384                r*r*( 5.00000000000000008883e-01F +
385                        r*( 1.66666666665260878863e-01F )));
386  
387      /* Step 3. Function value reconstruction.
388         We now reconstruct the exponential of the input argument
389         so that exp(x) = 2**m * (z1 + z2).
390         The order of the computation below must be strictly observed. */
391  
392      *z1 = f1;
393      *z2 = f2 + ((f1 + f2) * q);
394  }
395  
396  
397  
398  /* Scales up a double (normal or denormal) whose bit pattern is given
399     as ux by 2**1024. There are no checks that the input number is
400     scalable by that amount. */
401  static inline void scaleUpDouble1024(unsigned long long ux, unsigned long long *ur)
402  {
403    unsigned long long uy;
404    double y;
405  
406    if ((ux & EXPBITS_DP64) == 0)
407      {
408        /* ux is denormalised */
409        PUT_BITS_DP64(ux | 0x4010000000000000, y);
410        if (ux & SIGNBIT_DP64)
411          y += 4.0;
412        else
413          y -= 4.0;
414        GET_BITS_DP64(y, uy);
415      }
416    else
417      /* ux is normal */
418      uy = ux + 0x4000000000000000;
419  
420    *ur = uy;
421    return;
422  }
423  
424  
425  
426  /* Scales down a double whose bit pattern is given as ux by 2**k.
427     There are no checks that the input number is scalable by that amount. */
428  static inline void scaleDownDouble(unsigned long long ux, int k,
429                                     unsigned long long *ur)
430  {
431    unsigned long long uy, uk, ax, xsign;
432    int n, shift;
433    xsign = ux & SIGNBIT_DP64;
434    ax = ux & ~SIGNBIT_DP64;
435    n = (int)((ax & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - k;
436    if (n > 0)
437      {
438        uk = (unsigned long long)n << EXPSHIFTBITS_DP64;
439        uy = (ax & ~EXPBITS_DP64) | uk;
440      }
441    else
442      {
443        uy = (ax & ~EXPBITS_DP64) | 0x0010000000000000;
444        shift = (1 - n);
445        if (shift > MANTLENGTH_DP64 + 1)
446          /* Sigh. Shifting works mod 64 so be careful not to shift too much */
447          uy = 0;
448        else
449          {
450            /* Make sure we round the result */
451            uy >>= shift - 1;
452            uy = (uy >> 1) + (uy & 1);
453          }
454      }
455    *ur = uy | xsign;
456  }
457  
458  
459  static inline void log_kernel_amd64(double x, unsigned long long ux, int *xexp, double *r1, double *r2)
460  {
461  
462    int expadjust;
463    double r, z1, z2, correction, f, f1, f2, q, u, v, poly;
464    int index;
465  
466    /*
467      Computes natural log(x). Algorithm based on:
468      Ping-Tak Peter Tang
469      "Table-driven implementation of the logarithm function in IEEE
470      floating-point arithmetic"
471      ACM Transactions on Mathematical Software (TOMS)
472      Volume 16, Issue 4 (December 1990)
473    */
474  
475  /* Arrays ln_lead_table and ln_tail_table contain
476     leading and trailing parts respectively of precomputed
477     values of natural log(1+i/64), for i = 0, 1, ..., 64.
478     ln_lead_table contains the first 24 bits of precision,
479     and ln_tail_table contains a further 53 bits precision. */
480  
481    static const double ln_lead_table[65] = {
482      0.00000000000000000000e+00,   /* 0x0000000000000000 */
483      1.55041813850402832031e-02,   /* 0x3f8fc0a800000000 */
484      3.07716131210327148438e-02,   /* 0x3f9f829800000000 */
485      4.58095073699951171875e-02,   /* 0x3fa7745800000000 */
486      6.06245994567871093750e-02,   /* 0x3faf0a3000000000 */
487      7.52233862876892089844e-02,   /* 0x3fb341d700000000 */
488      8.96121263504028320312e-02,   /* 0x3fb6f0d200000000 */
489      1.03796780109405517578e-01,   /* 0x3fba926d00000000 */
490      1.17783010005950927734e-01,   /* 0x3fbe270700000000 */
491      1.31576299667358398438e-01,   /* 0x3fc0d77e00000000 */
492      1.45181953907012939453e-01,   /* 0x3fc2955280000000 */
493      1.58604979515075683594e-01,   /* 0x3fc44d2b00000000 */
494      1.71850204467773437500e-01,   /* 0x3fc5ff3000000000 */
495      1.84922337532043457031e-01,   /* 0x3fc7ab8900000000 */
496      1.97825729846954345703e-01,   /* 0x3fc9525a80000000 */
497      2.10564732551574707031e-01,   /* 0x3fcaf3c900000000 */
498      2.23143517971038818359e-01,   /* 0x3fcc8ff780000000 */
499      2.35566020011901855469e-01,   /* 0x3fce270700000000 */
500      2.47836112976074218750e-01,   /* 0x3fcfb91800000000 */
501      2.59957492351531982422e-01,   /* 0x3fd0a324c0000000 */
502      2.71933674812316894531e-01,   /* 0x3fd1675c80000000 */
503      2.83768117427825927734e-01,   /* 0x3fd22941c0000000 */
504      2.95464158058166503906e-01,   /* 0x3fd2e8e280000000 */
505      3.07025015354156494141e-01,   /* 0x3fd3a64c40000000 */
506      3.18453729152679443359e-01,   /* 0x3fd4618bc0000000 */
507      3.29753279685974121094e-01,   /* 0x3fd51aad80000000 */
508      3.40926527976989746094e-01,   /* 0x3fd5d1bd80000000 */
509      3.51976394653320312500e-01,   /* 0x3fd686c800000000 */
510      3.62905442714691162109e-01,   /* 0x3fd739d7c0000000 */
511      3.73716354370117187500e-01,   /* 0x3fd7eaf800000000 */
512      3.84411692619323730469e-01,   /* 0x3fd89a3380000000 */
513      3.94993782043457031250e-01,   /* 0x3fd9479400000000 */
514      4.05465066432952880859e-01,   /* 0x3fd9f323c0000000 */
515      4.15827870368957519531e-01,   /* 0x3fda9cec80000000 */
516      4.26084339618682861328e-01,   /* 0x3fdb44f740000000 */
517      4.36236739158630371094e-01,   /* 0x3fdbeb4d80000000 */
518      4.46287095546722412109e-01,   /* 0x3fdc8ff7c0000000 */
519      4.56237375736236572266e-01,   /* 0x3fdd32fe40000000 */
520      4.66089725494384765625e-01,   /* 0x3fddd46a00000000 */
521      4.75845873355865478516e-01,   /* 0x3fde744240000000 */
522      4.85507786273956298828e-01,   /* 0x3fdf128f40000000 */
523      4.95077252388000488281e-01,   /* 0x3fdfaf5880000000 */
524      5.04556000232696533203e-01,   /* 0x3fe02552a0000000 */
525      5.13945698738098144531e-01,   /* 0x3fe0723e40000000 */
526      5.23248136043548583984e-01,   /* 0x3fe0be72e0000000 */
527      5.32464742660522460938e-01,   /* 0x3fe109f380000000 */
528      5.41597247123718261719e-01,   /* 0x3fe154c3c0000000 */
529      5.50647079944610595703e-01,   /* 0x3fe19ee6a0000000 */
530      5.59615731239318847656e-01,   /* 0x3fe1e85f40000000 */
531      5.68504691123962402344e-01,   /* 0x3fe23130c0000000 */
532      5.77315330505371093750e-01,   /* 0x3fe2795e00000000 */
533      5.86049020290374755859e-01,   /* 0x3fe2c0e9e0000000 */
534      5.94707071781158447266e-01,   /* 0x3fe307d720000000 */
535      6.03290796279907226562e-01,   /* 0x3fe34e2880000000 */
536      6.11801505088806152344e-01,   /* 0x3fe393e0c0000000 */
537      6.20240390300750732422e-01,   /* 0x3fe3d90260000000 */
538      6.28608644008636474609e-01,   /* 0x3fe41d8fe0000000 */
539      6.36907458305358886719e-01,   /* 0x3fe4618bc0000000 */
540      6.45137906074523925781e-01,   /* 0x3fe4a4f840000000 */
541      6.53301239013671875000e-01,   /* 0x3fe4e7d800000000 */
542      6.61398470401763916016e-01,   /* 0x3fe52a2d20000000 */
543      6.69430613517761230469e-01,   /* 0x3fe56bf9c0000000 */
544      6.77398800849914550781e-01,   /* 0x3fe5ad4040000000 */
545      6.85303986072540283203e-01,   /* 0x3fe5ee02a0000000 */
546      6.93147122859954833984e-01};  /* 0x3fe62e42e0000000 */
547  
548    static const double ln_tail_table[65] = {
549      0.00000000000000000000e+00,   /* 0x0000000000000000 */
550      5.15092497094772879206e-09,   /* 0x3e361f807c79f3db */
551      4.55457209735272790188e-08,   /* 0x3e6873c1980267c8 */
552      2.86612990859791781788e-08,   /* 0x3e5ec65b9f88c69e */
553      2.23596477332056055352e-08,   /* 0x3e58022c54cc2f99 */
554      3.49498983167142274770e-08,   /* 0x3e62c37a3a125330 */
555      3.23392843005887000414e-08,   /* 0x3e615cad69737c93 */
556      1.35722380472479366661e-08,   /* 0x3e4d256ab1b285e9 */
557      2.56504325268044191098e-08,   /* 0x3e5b8abcb97a7aa2 */
558      5.81213608741512136843e-08,   /* 0x3e6f34239659a5dc */
559      5.59374849578288093334e-08,   /* 0x3e6e07fd48d30177 */
560      5.06615629004996189970e-08,   /* 0x3e6b32df4799f4f6 */
561      5.24588857848400955725e-08,   /* 0x3e6c29e4f4f21cf8 */
562      9.61968535632653505972e-10,   /* 0x3e1086c848df1b59 */
563      1.34829655346594463137e-08,   /* 0x3e4cf456b4764130 */
564      3.65557749306383026498e-08,   /* 0x3e63a02ffcb63398 */
565      3.33431709374069198903e-08,   /* 0x3e61e6a6886b0976 */
566      5.13008650536088382197e-08,   /* 0x3e6b8abcb97a7aa2 */
567      5.09285070380306053751e-08,   /* 0x3e6b578f8aa35552 */
568      3.20853940845502057341e-08,   /* 0x3e6139c871afb9fc */
569      4.06713248643004200446e-08,   /* 0x3e65d5d30701ce64 */
570      5.57028186706125221168e-08,   /* 0x3e6de7bcb2d12142 */
571      5.48356693724804282546e-08,   /* 0x3e6d708e984e1664 */
572      1.99407553679345001938e-08,   /* 0x3e556945e9c72f36 */
573      1.96585517245087232086e-09,   /* 0x3e20e2f613e85bda */
574      6.68649386072067321503e-09,   /* 0x3e3cb7e0b42724f6 */
575      5.89936034642113390002e-08,   /* 0x3e6fac04e52846c7 */
576      2.85038578721554472484e-08,   /* 0x3e5e9b14aec442be */
577      5.09746772910284482606e-08,   /* 0x3e6b5de8034e7126 */
578      5.54234668933210171467e-08,   /* 0x3e6dc157e1b259d3 */
579      6.29100830926604004874e-09,   /* 0x3e3b05096ad69c62 */
580      2.61974119468563937716e-08,   /* 0x3e5c2116faba4cdd */
581      4.16752115011186398935e-08,   /* 0x3e665fcc25f95b47 */
582      2.47747534460820790327e-08,   /* 0x3e5a9a08498d4850 */
583      5.56922172017964209793e-08,   /* 0x3e6de647b1465f77 */
584      2.76162876992552906035e-08,   /* 0x3e5da71b7bf7861d */
585      7.08169709942321478061e-09,   /* 0x3e3e6a6886b09760 */
586      5.77453510221151779025e-08,   /* 0x3e6f0075eab0ef64 */
587      4.43021445893361960146e-09,   /* 0x3e33071282fb989b */
588      3.15140984357495864573e-08,   /* 0x3e60eb43c3f1bed2 */
589      2.95077445089736670973e-08,   /* 0x3e5faf06ecb35c84 */
590      1.44098510263167149349e-08,   /* 0x3e4ef1e63db35f68 */
591      1.05196987538551827693e-08,   /* 0x3e469743fb1a71a5 */
592      5.23641361722697546261e-08,   /* 0x3e6c1cdf404e5796 */
593      7.72099925253243069458e-09,   /* 0x3e4094aa0ada625e */
594      5.62089493829364197156e-08,   /* 0x3e6e2d4c96fde3ec */
595      3.53090261098577946927e-08,   /* 0x3e62f4d5e9a98f34 */
596      3.80080516835568242269e-08,   /* 0x3e6467c96ecc5cbe */
597      5.66961038386146408282e-08,   /* 0x3e6e7040d03dec5a */
598      4.42287063097349852717e-08,   /* 0x3e67bebf4282de36 */
599      3.45294525105681104660e-08,   /* 0x3e6289b11aeb783f */
600      2.47132034530447431509e-08,   /* 0x3e5a891d1772f538 */
601      3.59655343422487209774e-08,   /* 0x3e634f10be1fb591 */
602      5.51581770357780862071e-08,   /* 0x3e6d9ce1d316eb93 */
603      3.60171867511861372793e-08,   /* 0x3e63562a19a9c442 */
604      1.94511067964296180547e-08,   /* 0x3e54e2adf548084c */
605      1.54137376631349347838e-08,   /* 0x3e508ce55cc8c97a */
606      3.93171034490174464173e-09,   /* 0x3e30e2f613e85bda */
607      5.52990607758839766440e-08,   /* 0x3e6db03ebb0227bf */
608      3.29990737637586136511e-08,   /* 0x3e61b75bb09cb098 */
609      1.18436010922446096216e-08,   /* 0x3e496f16abb9df22 */
610      4.04248680368301346709e-08,   /* 0x3e65b3f399411c62 */
611      2.27418915900284316293e-08,   /* 0x3e586b3e59f65355 */
612      1.70263791333409206020e-08,   /* 0x3e52482ceae1ac12 */
613      5.76999904754328540596e-08};  /* 0x3e6efa39ef35793c */
614  
615    /* Approximating polynomial coefficients for x near 1.0 */
616    static const double
617      ca_1 = 8.33333333333317923934e-02,  /* 0x3fb55555555554e6 */
618      ca_2 = 1.25000000037717509602e-02,  /* 0x3f89999999bac6d4 */
619      ca_3 = 2.23213998791944806202e-03,  /* 0x3f62492307f1519f */
620      ca_4 = 4.34887777707614552256e-04;  /* 0x3f3c8034c85dfff0 */
621  
622    /* Approximating polynomial coefficients for other x */
623    static const double
624      cb_1 = 8.33333333333333593622e-02,  /* 0x3fb5555555555557 */
625      cb_2 = 1.24999999978138668903e-02,  /* 0x3f89999999865ede */
626      cb_3 = 2.23219810758559851206e-03;  /* 0x3f6249423bd94741 */
627  
628    static const unsigned long long
629      log_thresh1 = 0x3fee0faa00000000,
630      log_thresh2 = 0x3ff1082c00000000;
631  
632    /* log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000
633       log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000 */
634    if (ux >= log_thresh1 && ux <= log_thresh2)
635      {
636        /* Arguments close to 1.0 are handled separately to maintain
637           accuracy.
638  
639           The approximation in this region exploits the identity
640               log( 1 + r ) = log( 1 + u/2 )  /  log( 1 - u/2 ), where
641               u  = 2r / (2+r).
642           Note that the right hand side has an odd Taylor series expansion
643           which converges much faster than the Taylor series expansion of
644           log( 1 + r ) in r. Thus, we approximate log( 1 + r ) by
645               u + A1 * u^3 + A2 * u^5 + ... + An * u^(2n+1).
646  
647           One subtlety is that since u cannot be calculated from
648           r exactly, the rounding error in the first u should be
649           avoided if possible. To accomplish this, we observe that
650                         u  =  r  -  r*r/(2+r).
651           Since x (=1+r) is the input argument, and thus presumed exact,
652           the formula above approximates u accurately because
653                         u  =  r  -  correction,
654           and the magnitude of "correction" (of the order of r*r)
655           is small.
656           With these observations, we will approximate log( 1 + r ) by
657              r + (  (A1*u^3 + ... + An*u^(2n+1)) - correction ).
658  
659           We approximate log(1+r) by an odd polynomial in u, where
660                    u = 2r/(2+r) = r - r*r/(2+r).
661        */
662        r = x - 1.0;
663        u = r / (2.0 + r);
664        correction = r * u;
665        u = u + u;
666        v = u * u;
667        z1 = r;
668        z2 = (u * v * (ca_1 + v * (ca_2 + v * (ca_3 + v * ca_4))) - correction);
669        *r1 = z1;
670        *r2 = z2;
671        *xexp = 0;
672      }
673    else
674      {
675        /*
676          First, we decompose the argument x to the form
677          x  =  2**M  *  (F1  +  F2),
678          where  1 <= F1+F2 < 2, M has the value of an integer,
679          F1 = 1 + j/64, j ranges from 0 to 64, and |F2| <= 1/128.
680  
681          Second, we approximate log( 1 + F2/F1 ) by an odd polynomial
682          in U, where U  =  2 F2 / (2 F2 + F1).
683          Note that log( 1 + F2/F1 ) = log( 1 + U/2 ) - log( 1 - U/2 ).
684          The core approximation calculates
685          Poly = [log( 1 + U/2 ) - log( 1 - U/2 )]/U   -   1.
686          Note that  log(1 + U/2) - log(1 - U/2) = 2 arctanh ( U/2 ),
687          thus, Poly =  2 arctanh( U/2 ) / U  -  1.
688  
689          It is not hard to see that
690            log(x) = M*log(2) + log(F1) + log( 1 + F2/F1 ).
691          Hence, we return Z1 = log(F1), and  Z2 = log( 1 + F2/F1).
692          The values of log(F1) are calculated beforehand and stored
693          in the program.
694        */
695  
696        f = x;
697        if (ux < IMPBIT_DP64)
698          {
699            /* The input argument x is denormalized */
700            /* Normalize f by increasing the exponent by 60
701               and subtracting a correction to account for the implicit
702               bit. This replaces a slow denormalized
703               multiplication by a fast normal subtraction. */
704            static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */
705            GET_BITS_DP64(f, ux);
706            ux |= 0x03d0000000000000;
707            PUT_BITS_DP64(ux, f);
708            f -= corr;
709            GET_BITS_DP64(f, ux);
710            expadjust = 60;
711          }
712        else
713          expadjust = 0;
714  
715        /* Store the exponent of x in xexp and put
716           f into the range [0.5,1) */
717        *xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 - expadjust;
718        PUT_BITS_DP64((ux & MANTBITS_DP64) | HALFEXPBITS_DP64, f);
719  
720        /* Now  x = 2**xexp  * f,  1/2 <= f < 1. */
721  
722        /* Set index to be the nearest integer to 128*f */
723        r = 128.0 * f;
724        index = (int)(r + 0.5);
725  
726        z1 = ln_lead_table[index-64];
727        q = ln_tail_table[index-64];
728        f1 = index * 0.0078125; /* 0.0078125 = 1/128 */
729        f2 = f - f1;
730        /* At this point, x = 2**xexp * ( f1  +  f2 ) where
731           f1 = j/128, j = 64, 65, ..., 128 and |f2| <= 1/256. */
732  
733        /* Calculate u = 2 f2 / ( 2 f1 + f2 ) = f2 / ( f1 + 0.5*f2 ) */
734        /* u = f2 / (f1 + 0.5 * f2); */
735        u = f2 / (f1 + 0.5 * f2);
736  
737        /* Here, |u| <= 2(exp(1/16)-1) / (exp(1/16)+1).
738           The core approximation calculates
739           poly = [log(1 + u/2) - log(1 - u/2)]/u  -  1  */
740        v = u * u;
741        poly = (v * (cb_1 + v * (cb_2 + v * cb_3)));
742        z2 = q + (u + u * poly);
743        *r1 = z1;
744        *r2 = z2;
745      }
746    return;
747  }
748  
749  #endif /* LIBM_INLINES_AMD_H_INCLUDED */