libm_inlines_amd.h
1 /* 2 * Copyright (C) 2008-2020 Advanced Micro Devices, Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without modification, 5 * are permitted provided that the following conditions are met: 6 * 1. Redistributions of source code must retain the above copyright notice, 7 * this list of conditions and the following disclaimer. 8 * 2. Redistributions in binary form must reproduce the above copyright notice, 9 * this list of conditions and the following disclaimer in the documentation 10 * and/or other materials provided with the distribution. 11 * 3. Neither the name of the copyright holder nor the names of its contributors 12 * may be used to endorse or promote products derived from this software without 13 * specific prior written permission. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 19 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 20 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 21 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 22 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 * POSSIBILITY OF SUCH DAMAGE. 25 * 26 */ 27 28 #ifndef LIBM_INLINES_AMD_H_INCLUDED 29 #define LIBM_INLINES_AMD_H_INCLUDED 1 30 31 32 #ifdef WINDOWS 33 #define inline __inline 34 #include "emmintrin.h" 35 #endif 36 37 /* Scales the double x by 2.0**n. 38 Assumes EMIN <= n <= EMAX, though this condition is not checked. */ 39 static inline double scaleDouble_1(double x, int n) 40 { 41 double t; 42 /* Construct the number t = 2.0**n */ 43 PUT_BITS_DP64(((long long)n + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t); 44 return x*t; 45 } 46 47 48 /* Scales the double x by 2.0**n. 49 Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */ 50 static inline double scaleDouble_2(double x, int n) 51 { 52 double t1, t2; 53 int n1, n2; 54 n1 = n / 2; 55 n2 = n - n1; 56 /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */ 57 PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1); 58 PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2); 59 return (x*t1)*t2; 60 } 61 62 63 64 65 /* Scales the double x by 2.0**n. 66 Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */ 67 static inline double scaleDouble_3(double x, int n) 68 { 69 double t1, t2, t3; 70 int n1, n2, n3; 71 n1 = n / 3; 72 n2 = (n - n1) / 2; 73 n3 = n - n1 - n2; 74 /* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */ 75 PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1); 76 PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2); 77 PUT_BITS_DP64(((long long)n3 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t3); 78 return ((x*t1)*t2)*t3; 79 } 80 81 82 /* Scales the float x by 2.0**n. 83 Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */ 84 static inline float scaleFloat_2(float x, int n) 85 { 86 float t1, t2; 87 int n1, n2; 88 n1 = n / 2; 89 n2 = n - n1; 90 /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */ 91 PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1); 92 PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2); 93 return (x*t1)*t2; 94 } 95 96 97 98 /* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2). 99 Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments 100 abs(x) > large/(ln(base)) (where large is the largest representable 101 floating point number) should be handled separately instead of calling 102 this function. This function is called by exp_amd, exp2_amd, exp10_amd, 103 cosh_amd and sinh_amd. */ 104 static inline void splitexp(double x, double logbase, 105 double thirtytwo_by_logbaseof2, 106 double logbaseof2_by_32_lead, 107 double logbaseof2_by_32_trail, 108 int *m, double *z1, double *z2) 109 { 110 double q, r, r1, r2, f1, f2; 111 int n, j; 112 113 /* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain 114 leading and trailing parts respectively of precomputed 115 values of pow(2.0,j/32.0), for j = 0, 1, ..., 31. 116 two_to_jby32_lead_table contains the first 25 bits of precision, 117 and two_to_jby32_trail_table contains a further 53 bits precision. */ 118 119 static const double two_to_jby32_lead_table[32] = { 120 1.00000000000000000000e+00, /* 0x3ff0000000000000 */ 121 1.02189713716506958008e+00, /* 0x3ff059b0d0000000 */ 122 1.04427373409271240234e+00, /* 0x3ff0b55860000000 */ 123 1.06714040040969848633e+00, /* 0x3ff11301d0000000 */ 124 1.09050768613815307617e+00, /* 0x3ff172b830000000 */ 125 1.11438673734664916992e+00, /* 0x3ff1d48730000000 */ 126 1.13878858089447021484e+00, /* 0x3ff2387a60000000 */ 127 1.16372483968734741211e+00, /* 0x3ff29e9df0000000 */ 128 1.18920707702636718750e+00, /* 0x3ff306fe00000000 */ 129 1.21524733304977416992e+00, /* 0x3ff371a730000000 */ 130 1.24185776710510253906e+00, /* 0x3ff3dea640000000 */ 131 1.26905095577239990234e+00, /* 0x3ff44e0860000000 */ 132 1.29683953523635864258e+00, /* 0x3ff4bfdad0000000 */ 133 1.32523661851882934570e+00, /* 0x3ff5342b50000000 */ 134 1.35425549745559692383e+00, /* 0x3ff5ab07d0000000 */ 135 1.38390988111495971680e+00, /* 0x3ff6247eb0000000 */ 136 1.41421353816986083984e+00, /* 0x3ff6a09e60000000 */ 137 1.44518077373504638672e+00, /* 0x3ff71f75e0000000 */ 138 1.47682613134384155273e+00, /* 0x3ff7a11470000000 */ 139 1.50916439294815063477e+00, /* 0x3ff8258990000000 */ 140 1.54221081733703613281e+00, /* 0x3ff8ace540000000 */ 141 1.57598084211349487305e+00, /* 0x3ff93737b0000000 */ 142 1.61049032211303710938e+00, /* 0x3ff9c49180000000 */ 143 1.64575546979904174805e+00, /* 0x3ffa5503b0000000 */ 144 1.68179279565811157227e+00, /* 0x3ffae89f90000000 */ 145 1.71861928701400756836e+00, /* 0x3ffb7f76f0000000 */ 146 1.75625211000442504883e+00, /* 0x3ffc199bd0000000 */ 147 1.79470902681350708008e+00, /* 0x3ffcb720d0000000 */ 148 1.83400803804397583008e+00, /* 0x3ffd5818d0000000 */ 149 1.87416762113571166992e+00, /* 0x3ffdfc9730000000 */ 150 1.91520655155181884766e+00, /* 0x3ffea4afa0000000 */ 151 1.95714408159255981445e+00}; /* 0x3fff507650000000 */ 152 153 static const double two_to_jby32_trail_table[32] = { 154 0.00000000000000000000e+00, /* 0x0000000000000000 */ 155 1.14890470981563546737e-08, /* 0x3e48ac2ba1d73e2a */ 156 4.83347014379782142328e-08, /* 0x3e69f3121ec53172 */ 157 2.67125131841396124714e-10, /* 0x3df25b50a4ebbf1b */ 158 4.65271045830351350190e-08, /* 0x3e68faa2f5b9bef9 */ 159 5.24924336638693782574e-09, /* 0x3e368b9aa7805b80 */ 160 5.38622214388600821910e-08, /* 0x3e6ceac470cd83f6 */ 161 1.90902301017041969782e-08, /* 0x3e547f7b84b09745 */ 162 3.79763538792174980894e-08, /* 0x3e64636e2a5bd1ab */ 163 2.69306947081946450986e-08, /* 0x3e5ceaa72a9c5154 */ 164 4.49683815095311756138e-08, /* 0x3e682468446b6824 */ 165 1.41933332021066904914e-09, /* 0x3e18624b40c4dbd0 */ 166 1.94146510233556266402e-08, /* 0x3e54d8a89c750e5e */ 167 2.46409119489264118569e-08, /* 0x3e5a753e077c2a0f */ 168 4.94812958044698886494e-08, /* 0x3e6a90a852b19260 */ 169 8.48872238075784476136e-10, /* 0x3e0d2ac258f87d03 */ 170 2.42032342089579394887e-08, /* 0x3e59fcef32422cbf */ 171 3.32420002333182569170e-08, /* 0x3e61d8bee7ba46e2 */ 172 1.45956577586525322754e-08, /* 0x3e4f580c36bea881 */ 173 3.46452721050003920866e-08, /* 0x3e62999c25159f11 */ 174 8.07090469079979051284e-09, /* 0x3e415506dadd3e2a */ 175 2.99439161340839520436e-09, /* 0x3e29b8bc9e8a0388 */ 176 9.83621719880452147153e-09, /* 0x3e451f8480e3e236 */ 177 8.35492309647188080486e-09, /* 0x3e41f12ae45a1224 */ 178 3.48493175137966283582e-08, /* 0x3e62b5a75abd0e6a */ 179 1.11084703472699692902e-08, /* 0x3e47daf237553d84 */ 180 5.03688744342840346564e-08, /* 0x3e6b0aa538444196 */ 181 4.81896001063495806249e-08, /* 0x3e69df20d22a0798 */ 182 4.83653666334089557746e-08, /* 0x3e69f7490e4bb40b */ 183 1.29745882314081237628e-08, /* 0x3e4bdcdaf5cb4656 */ 184 9.84532844621636118964e-09, /* 0x3e452486cc2c7b9d */ 185 4.25828404545651943883e-08}; /* 0x3e66dc8a80ce9f09 */ 186 187 /* 188 Step 1. Reduce the argument. 189 190 To perform argument reduction, we find the integer n such that 191 x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64. 192 n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and 193 remainder by x - n*logbaseof2/32. The calculation of n is 194 straightforward whereas the computation of x - n*logbaseof2/32 195 must be carried out carefully. 196 logbaseof2/32 is so represented in two pieces that 197 (1) logbaseof2/32 is known to extra precision, (2) the product 198 of n and the leading piece is a model number and is hence 199 calculated without error, and (3) the subtraction of the value 200 obtained in (2) from x is a model number and is hence again 201 obtained without error. 202 */ 203 204 r = x * thirtytwo_by_logbaseof2; 205 /* Set n = nearest integer to r */ 206 /* This is faster on Hammer */ 207 if (r > 0) 208 n = (int)(r + 0.5); 209 else 210 n = (int)(r - 0.5); 211 212 r1 = x - n * logbaseof2_by_32_lead; 213 r2 = - n * logbaseof2_by_32_trail; 214 215 /* Set j = n mod 32: 5 mod 32 = 5, -5 mod 32 = 27, etc. */ 216 /* j = n % 32; 217 if (j < 0) j += 32; */ 218 j = n & 0x0000001f; 219 220 f1 = two_to_jby32_lead_table[j]; 221 f2 = two_to_jby32_trail_table[j]; 222 223 *m = (n - j) / 32; 224 225 /* Step 2. The following is the core approximation. We approximate 226 exp(r1+r2)-1 by a polynomial. */ 227 228 r1 *= logbase; r2 *= logbase; 229 230 r = r1 + r2; 231 q = r1 + (r2 + 232 r*r*( 5.00000000000000008883e-01 + 233 r*( 1.66666666665260878863e-01 + 234 r*( 4.16666666662260795726e-02 + 235 r*( 8.33336798434219616221e-03 + 236 r*( 1.38889490863777199667e-03 )))))); 237 238 /* Step 3. Function value reconstruction. 239 We now reconstruct the exponential of the input argument 240 so that exp(x) = 2**m * (z1 + z2). 241 The order of the computation below must be strictly observed. */ 242 243 *z1 = f1; 244 *z2 = f2 + ((f1 + f2) * q); 245 } 246 247 248 249 250 /* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2). 251 Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments 252 abs(x) > large/(ln(base)) (where large is the largest representable 253 floating point number) should be handled separately instead of calling 254 this function. This function is called by exp_amd, exp2_amd, exp10_amd, 255 cosh_amd and sinh_amd. */ 256 static inline void splitexpf(float x, float logbase, 257 float thirtytwo_by_logbaseof2, 258 float logbaseof2_by_32_lead, 259 float logbaseof2_by_32_trail, 260 int *m, float *z1, float *z2) 261 { 262 float q, r, r1, r2, f1, f2; 263 int n, j; 264 265 /* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain 266 leading and trailing parts respectively of precomputed 267 values of pow(2.0,j/32.0), for j = 0, 1, ..., 31. 268 two_to_jby32_lead_table contains the first 10 bits of precision, 269 and two_to_jby32_trail_table contains a further 24 bits precision. */ 270 271 static const float two_to_jby32_lead_table[32] = { 272 1.0000000000E+00F, /* 0x3F800000 */ 273 1.0214843750E+00F, /* 0x3F82C000 */ 274 1.0429687500E+00F, /* 0x3F858000 */ 275 1.0664062500E+00F, /* 0x3F888000 */ 276 1.0898437500E+00F, /* 0x3F8B8000 */ 277 1.1132812500E+00F, /* 0x3F8E8000 */ 278 1.1386718750E+00F, /* 0x3F91C000 */ 279 1.1621093750E+00F, /* 0x3F94C000 */ 280 1.1875000000E+00F, /* 0x3F980000 */ 281 1.2148437500E+00F, /* 0x3F9B8000 */ 282 1.2402343750E+00F, /* 0x3F9EC000 */ 283 1.2675781250E+00F, /* 0x3FA24000 */ 284 1.2949218750E+00F, /* 0x3FA5C000 */ 285 1.3242187500E+00F, /* 0x3FA98000 */ 286 1.3535156250E+00F, /* 0x3FAD4000 */ 287 1.3828125000E+00F, /* 0x3FB10000 */ 288 1.4140625000E+00F, /* 0x3FB50000 */ 289 1.4433593750E+00F, /* 0x3FB8C000 */ 290 1.4765625000E+00F, /* 0x3FBD0000 */ 291 1.5078125000E+00F, /* 0x3FC10000 */ 292 1.5410156250E+00F, /* 0x3FC54000 */ 293 1.5742187500E+00F, /* 0x3FC98000 */ 294 1.6093750000E+00F, /* 0x3FCE0000 */ 295 1.6445312500E+00F, /* 0x3FD28000 */ 296 1.6816406250E+00F, /* 0x3FD74000 */ 297 1.7167968750E+00F, /* 0x3FDBC000 */ 298 1.7558593750E+00F, /* 0x3FE0C000 */ 299 1.7929687500E+00F, /* 0x3FE58000 */ 300 1.8339843750E+00F, /* 0x3FEAC000 */ 301 1.8730468750E+00F, /* 0x3FEFC000 */ 302 1.9140625000E+00F, /* 0x3FF50000 */ 303 1.9570312500E+00F}; /* 0x3FFA8000 */ 304 305 static const float two_to_jby32_trail_table[32] = { 306 0.0000000000E+00F, /* 0x00000000 */ 307 4.1277357377E-04F, /* 0x39D86988 */ 308 1.3050324051E-03F, /* 0x3AAB0D9F */ 309 7.3415064253E-04F, /* 0x3A407404 */ 310 6.6398258787E-04F, /* 0x3A2E0F1E */ 311 1.1054925853E-03F, /* 0x3A90E62D */ 312 1.1675967835E-04F, /* 0x38F4DCE0 */ 313 1.6154836630E-03F, /* 0x3AD3BEA3 */ 314 1.7071149778E-03F, /* 0x3ADFC146 */ 315 4.0360994171E-04F, /* 0x39D39B9C */ 316 1.6234370414E-03F, /* 0x3AD4C982 */ 317 1.4728321694E-03F, /* 0x3AC10C0C */ 318 1.9176795613E-03F, /* 0x3AFB5AA6 */ 319 1.0178930825E-03F, /* 0x3A856AD3 */ 320 7.3992193211E-04F, /* 0x3A41F752 */ 321 1.0973819299E-03F, /* 0x3A8FD607 */ 322 1.5106226783E-04F, /* 0x391E6678 */ 323 1.8214319134E-03F, /* 0x3AEEBD1D */ 324 2.6364589576E-04F, /* 0x398A39F4 */ 325 1.3519275235E-03F, /* 0x3AB13329 */ 326 1.1952003697E-03F, /* 0x3A9CA845 */ 327 1.7620950239E-03F, /* 0x3AE6F619 */ 328 1.1153318919E-03F, /* 0x3A923054 */ 329 1.2242280645E-03F, /* 0x3AA07647 */ 330 1.5220546629E-04F, /* 0x391F9958 */ 331 1.8224230735E-03F, /* 0x3AEEDE5F */ 332 3.9278529584E-04F, /* 0x39CDEEC0 */ 333 1.7403248930E-03F, /* 0x3AE41B9D */ 334 2.3711356334E-05F, /* 0x37C6E7C0 */ 335 1.1207590578E-03F, /* 0x3A92E66F */ 336 1.1440613307E-03F, /* 0x3A95F454 */ 337 1.1287408415E-04F}; /* 0x38ECB6D0 */ 338 339 /* 340 Step 1. Reduce the argument. 341 342 To perform argument reduction, we find the integer n such that 343 x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64. 344 n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and 345 remainder by x - n*logbaseof2/32. The calculation of n is 346 straightforward whereas the computation of x - n*logbaseof2/32 347 must be carried out carefully. 348 logbaseof2/32 is so represented in two pieces that 349 (1) logbaseof2/32 is known to extra precision, (2) the product 350 of n and the leading piece is a model number and is hence 351 calculated without error, and (3) the subtraction of the value 352 obtained in (2) from x is a model number and is hence again 353 obtained without error. 354 */ 355 356 r = x * thirtytwo_by_logbaseof2; 357 /* Set n = nearest integer to r */ 358 /* This is faster on Hammer */ 359 if (r > 0) 360 n = (int)(r + 0.5F); 361 else 362 n = (int)(r - 0.5F); 363 364 r1 = x - n * logbaseof2_by_32_lead; 365 r2 = - n * logbaseof2_by_32_trail; 366 367 /* Set j = n mod 32: 5 mod 32 = 5, -5 mod 32 = 27, etc. */ 368 /* j = n % 32; 369 if (j < 0) j += 32; */ 370 j = n & 0x0000001f; 371 372 f1 = two_to_jby32_lead_table[j]; 373 f2 = two_to_jby32_trail_table[j]; 374 375 *m = (n - j) / 32; 376 377 /* Step 2. The following is the core approximation. We approximate 378 exp(r1+r2)-1 by a polynomial. */ 379 380 r1 *= logbase; r2 *= logbase; 381 382 r = r1 + r2; 383 q = r1 + (r2 + 384 r*r*( 5.00000000000000008883e-01F + 385 r*( 1.66666666665260878863e-01F ))); 386 387 /* Step 3. Function value reconstruction. 388 We now reconstruct the exponential of the input argument 389 so that exp(x) = 2**m * (z1 + z2). 390 The order of the computation below must be strictly observed. */ 391 392 *z1 = f1; 393 *z2 = f2 + ((f1 + f2) * q); 394 } 395 396 397 398 /* Scales up a double (normal or denormal) whose bit pattern is given 399 as ux by 2**1024. There are no checks that the input number is 400 scalable by that amount. */ 401 static inline void scaleUpDouble1024(unsigned long long ux, unsigned long long *ur) 402 { 403 unsigned long long uy; 404 double y; 405 406 if ((ux & EXPBITS_DP64) == 0) 407 { 408 /* ux is denormalised */ 409 PUT_BITS_DP64(ux | 0x4010000000000000, y); 410 if (ux & SIGNBIT_DP64) 411 y += 4.0; 412 else 413 y -= 4.0; 414 GET_BITS_DP64(y, uy); 415 } 416 else 417 /* ux is normal */ 418 uy = ux + 0x4000000000000000; 419 420 *ur = uy; 421 return; 422 } 423 424 425 426 /* Scales down a double whose bit pattern is given as ux by 2**k. 427 There are no checks that the input number is scalable by that amount. */ 428 static inline void scaleDownDouble(unsigned long long ux, int k, 429 unsigned long long *ur) 430 { 431 unsigned long long uy, uk, ax, xsign; 432 int n, shift; 433 xsign = ux & SIGNBIT_DP64; 434 ax = ux & ~SIGNBIT_DP64; 435 n = (int)((ax & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - k; 436 if (n > 0) 437 { 438 uk = (unsigned long long)n << EXPSHIFTBITS_DP64; 439 uy = (ax & ~EXPBITS_DP64) | uk; 440 } 441 else 442 { 443 uy = (ax & ~EXPBITS_DP64) | 0x0010000000000000; 444 shift = (1 - n); 445 if (shift > MANTLENGTH_DP64 + 1) 446 /* Sigh. Shifting works mod 64 so be careful not to shift too much */ 447 uy = 0; 448 else 449 { 450 /* Make sure we round the result */ 451 uy >>= shift - 1; 452 uy = (uy >> 1) + (uy & 1); 453 } 454 } 455 *ur = uy | xsign; 456 } 457 458 459 static inline void log_kernel_amd64(double x, unsigned long long ux, int *xexp, double *r1, double *r2) 460 { 461 462 int expadjust; 463 double r, z1, z2, correction, f, f1, f2, q, u, v, poly; 464 int index; 465 466 /* 467 Computes natural log(x). Algorithm based on: 468 Ping-Tak Peter Tang 469 "Table-driven implementation of the logarithm function in IEEE 470 floating-point arithmetic" 471 ACM Transactions on Mathematical Software (TOMS) 472 Volume 16, Issue 4 (December 1990) 473 */ 474 475 /* Arrays ln_lead_table and ln_tail_table contain 476 leading and trailing parts respectively of precomputed 477 values of natural log(1+i/64), for i = 0, 1, ..., 64. 478 ln_lead_table contains the first 24 bits of precision, 479 and ln_tail_table contains a further 53 bits precision. */ 480 481 static const double ln_lead_table[65] = { 482 0.00000000000000000000e+00, /* 0x0000000000000000 */ 483 1.55041813850402832031e-02, /* 0x3f8fc0a800000000 */ 484 3.07716131210327148438e-02, /* 0x3f9f829800000000 */ 485 4.58095073699951171875e-02, /* 0x3fa7745800000000 */ 486 6.06245994567871093750e-02, /* 0x3faf0a3000000000 */ 487 7.52233862876892089844e-02, /* 0x3fb341d700000000 */ 488 8.96121263504028320312e-02, /* 0x3fb6f0d200000000 */ 489 1.03796780109405517578e-01, /* 0x3fba926d00000000 */ 490 1.17783010005950927734e-01, /* 0x3fbe270700000000 */ 491 1.31576299667358398438e-01, /* 0x3fc0d77e00000000 */ 492 1.45181953907012939453e-01, /* 0x3fc2955280000000 */ 493 1.58604979515075683594e-01, /* 0x3fc44d2b00000000 */ 494 1.71850204467773437500e-01, /* 0x3fc5ff3000000000 */ 495 1.84922337532043457031e-01, /* 0x3fc7ab8900000000 */ 496 1.97825729846954345703e-01, /* 0x3fc9525a80000000 */ 497 2.10564732551574707031e-01, /* 0x3fcaf3c900000000 */ 498 2.23143517971038818359e-01, /* 0x3fcc8ff780000000 */ 499 2.35566020011901855469e-01, /* 0x3fce270700000000 */ 500 2.47836112976074218750e-01, /* 0x3fcfb91800000000 */ 501 2.59957492351531982422e-01, /* 0x3fd0a324c0000000 */ 502 2.71933674812316894531e-01, /* 0x3fd1675c80000000 */ 503 2.83768117427825927734e-01, /* 0x3fd22941c0000000 */ 504 2.95464158058166503906e-01, /* 0x3fd2e8e280000000 */ 505 3.07025015354156494141e-01, /* 0x3fd3a64c40000000 */ 506 3.18453729152679443359e-01, /* 0x3fd4618bc0000000 */ 507 3.29753279685974121094e-01, /* 0x3fd51aad80000000 */ 508 3.40926527976989746094e-01, /* 0x3fd5d1bd80000000 */ 509 3.51976394653320312500e-01, /* 0x3fd686c800000000 */ 510 3.62905442714691162109e-01, /* 0x3fd739d7c0000000 */ 511 3.73716354370117187500e-01, /* 0x3fd7eaf800000000 */ 512 3.84411692619323730469e-01, /* 0x3fd89a3380000000 */ 513 3.94993782043457031250e-01, /* 0x3fd9479400000000 */ 514 4.05465066432952880859e-01, /* 0x3fd9f323c0000000 */ 515 4.15827870368957519531e-01, /* 0x3fda9cec80000000 */ 516 4.26084339618682861328e-01, /* 0x3fdb44f740000000 */ 517 4.36236739158630371094e-01, /* 0x3fdbeb4d80000000 */ 518 4.46287095546722412109e-01, /* 0x3fdc8ff7c0000000 */ 519 4.56237375736236572266e-01, /* 0x3fdd32fe40000000 */ 520 4.66089725494384765625e-01, /* 0x3fddd46a00000000 */ 521 4.75845873355865478516e-01, /* 0x3fde744240000000 */ 522 4.85507786273956298828e-01, /* 0x3fdf128f40000000 */ 523 4.95077252388000488281e-01, /* 0x3fdfaf5880000000 */ 524 5.04556000232696533203e-01, /* 0x3fe02552a0000000 */ 525 5.13945698738098144531e-01, /* 0x3fe0723e40000000 */ 526 5.23248136043548583984e-01, /* 0x3fe0be72e0000000 */ 527 5.32464742660522460938e-01, /* 0x3fe109f380000000 */ 528 5.41597247123718261719e-01, /* 0x3fe154c3c0000000 */ 529 5.50647079944610595703e-01, /* 0x3fe19ee6a0000000 */ 530 5.59615731239318847656e-01, /* 0x3fe1e85f40000000 */ 531 5.68504691123962402344e-01, /* 0x3fe23130c0000000 */ 532 5.77315330505371093750e-01, /* 0x3fe2795e00000000 */ 533 5.86049020290374755859e-01, /* 0x3fe2c0e9e0000000 */ 534 5.94707071781158447266e-01, /* 0x3fe307d720000000 */ 535 6.03290796279907226562e-01, /* 0x3fe34e2880000000 */ 536 6.11801505088806152344e-01, /* 0x3fe393e0c0000000 */ 537 6.20240390300750732422e-01, /* 0x3fe3d90260000000 */ 538 6.28608644008636474609e-01, /* 0x3fe41d8fe0000000 */ 539 6.36907458305358886719e-01, /* 0x3fe4618bc0000000 */ 540 6.45137906074523925781e-01, /* 0x3fe4a4f840000000 */ 541 6.53301239013671875000e-01, /* 0x3fe4e7d800000000 */ 542 6.61398470401763916016e-01, /* 0x3fe52a2d20000000 */ 543 6.69430613517761230469e-01, /* 0x3fe56bf9c0000000 */ 544 6.77398800849914550781e-01, /* 0x3fe5ad4040000000 */ 545 6.85303986072540283203e-01, /* 0x3fe5ee02a0000000 */ 546 6.93147122859954833984e-01}; /* 0x3fe62e42e0000000 */ 547 548 static const double ln_tail_table[65] = { 549 0.00000000000000000000e+00, /* 0x0000000000000000 */ 550 5.15092497094772879206e-09, /* 0x3e361f807c79f3db */ 551 4.55457209735272790188e-08, /* 0x3e6873c1980267c8 */ 552 2.86612990859791781788e-08, /* 0x3e5ec65b9f88c69e */ 553 2.23596477332056055352e-08, /* 0x3e58022c54cc2f99 */ 554 3.49498983167142274770e-08, /* 0x3e62c37a3a125330 */ 555 3.23392843005887000414e-08, /* 0x3e615cad69737c93 */ 556 1.35722380472479366661e-08, /* 0x3e4d256ab1b285e9 */ 557 2.56504325268044191098e-08, /* 0x3e5b8abcb97a7aa2 */ 558 5.81213608741512136843e-08, /* 0x3e6f34239659a5dc */ 559 5.59374849578288093334e-08, /* 0x3e6e07fd48d30177 */ 560 5.06615629004996189970e-08, /* 0x3e6b32df4799f4f6 */ 561 5.24588857848400955725e-08, /* 0x3e6c29e4f4f21cf8 */ 562 9.61968535632653505972e-10, /* 0x3e1086c848df1b59 */ 563 1.34829655346594463137e-08, /* 0x3e4cf456b4764130 */ 564 3.65557749306383026498e-08, /* 0x3e63a02ffcb63398 */ 565 3.33431709374069198903e-08, /* 0x3e61e6a6886b0976 */ 566 5.13008650536088382197e-08, /* 0x3e6b8abcb97a7aa2 */ 567 5.09285070380306053751e-08, /* 0x3e6b578f8aa35552 */ 568 3.20853940845502057341e-08, /* 0x3e6139c871afb9fc */ 569 4.06713248643004200446e-08, /* 0x3e65d5d30701ce64 */ 570 5.57028186706125221168e-08, /* 0x3e6de7bcb2d12142 */ 571 5.48356693724804282546e-08, /* 0x3e6d708e984e1664 */ 572 1.99407553679345001938e-08, /* 0x3e556945e9c72f36 */ 573 1.96585517245087232086e-09, /* 0x3e20e2f613e85bda */ 574 6.68649386072067321503e-09, /* 0x3e3cb7e0b42724f6 */ 575 5.89936034642113390002e-08, /* 0x3e6fac04e52846c7 */ 576 2.85038578721554472484e-08, /* 0x3e5e9b14aec442be */ 577 5.09746772910284482606e-08, /* 0x3e6b5de8034e7126 */ 578 5.54234668933210171467e-08, /* 0x3e6dc157e1b259d3 */ 579 6.29100830926604004874e-09, /* 0x3e3b05096ad69c62 */ 580 2.61974119468563937716e-08, /* 0x3e5c2116faba4cdd */ 581 4.16752115011186398935e-08, /* 0x3e665fcc25f95b47 */ 582 2.47747534460820790327e-08, /* 0x3e5a9a08498d4850 */ 583 5.56922172017964209793e-08, /* 0x3e6de647b1465f77 */ 584 2.76162876992552906035e-08, /* 0x3e5da71b7bf7861d */ 585 7.08169709942321478061e-09, /* 0x3e3e6a6886b09760 */ 586 5.77453510221151779025e-08, /* 0x3e6f0075eab0ef64 */ 587 4.43021445893361960146e-09, /* 0x3e33071282fb989b */ 588 3.15140984357495864573e-08, /* 0x3e60eb43c3f1bed2 */ 589 2.95077445089736670973e-08, /* 0x3e5faf06ecb35c84 */ 590 1.44098510263167149349e-08, /* 0x3e4ef1e63db35f68 */ 591 1.05196987538551827693e-08, /* 0x3e469743fb1a71a5 */ 592 5.23641361722697546261e-08, /* 0x3e6c1cdf404e5796 */ 593 7.72099925253243069458e-09, /* 0x3e4094aa0ada625e */ 594 5.62089493829364197156e-08, /* 0x3e6e2d4c96fde3ec */ 595 3.53090261098577946927e-08, /* 0x3e62f4d5e9a98f34 */ 596 3.80080516835568242269e-08, /* 0x3e6467c96ecc5cbe */ 597 5.66961038386146408282e-08, /* 0x3e6e7040d03dec5a */ 598 4.42287063097349852717e-08, /* 0x3e67bebf4282de36 */ 599 3.45294525105681104660e-08, /* 0x3e6289b11aeb783f */ 600 2.47132034530447431509e-08, /* 0x3e5a891d1772f538 */ 601 3.59655343422487209774e-08, /* 0x3e634f10be1fb591 */ 602 5.51581770357780862071e-08, /* 0x3e6d9ce1d316eb93 */ 603 3.60171867511861372793e-08, /* 0x3e63562a19a9c442 */ 604 1.94511067964296180547e-08, /* 0x3e54e2adf548084c */ 605 1.54137376631349347838e-08, /* 0x3e508ce55cc8c97a */ 606 3.93171034490174464173e-09, /* 0x3e30e2f613e85bda */ 607 5.52990607758839766440e-08, /* 0x3e6db03ebb0227bf */ 608 3.29990737637586136511e-08, /* 0x3e61b75bb09cb098 */ 609 1.18436010922446096216e-08, /* 0x3e496f16abb9df22 */ 610 4.04248680368301346709e-08, /* 0x3e65b3f399411c62 */ 611 2.27418915900284316293e-08, /* 0x3e586b3e59f65355 */ 612 1.70263791333409206020e-08, /* 0x3e52482ceae1ac12 */ 613 5.76999904754328540596e-08}; /* 0x3e6efa39ef35793c */ 614 615 /* Approximating polynomial coefficients for x near 1.0 */ 616 static const double 617 ca_1 = 8.33333333333317923934e-02, /* 0x3fb55555555554e6 */ 618 ca_2 = 1.25000000037717509602e-02, /* 0x3f89999999bac6d4 */ 619 ca_3 = 2.23213998791944806202e-03, /* 0x3f62492307f1519f */ 620 ca_4 = 4.34887777707614552256e-04; /* 0x3f3c8034c85dfff0 */ 621 622 /* Approximating polynomial coefficients for other x */ 623 static const double 624 cb_1 = 8.33333333333333593622e-02, /* 0x3fb5555555555557 */ 625 cb_2 = 1.24999999978138668903e-02, /* 0x3f89999999865ede */ 626 cb_3 = 2.23219810758559851206e-03; /* 0x3f6249423bd94741 */ 627 628 static const unsigned long long 629 log_thresh1 = 0x3fee0faa00000000, 630 log_thresh2 = 0x3ff1082c00000000; 631 632 /* log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000 633 log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000 */ 634 if (ux >= log_thresh1 && ux <= log_thresh2) 635 { 636 /* Arguments close to 1.0 are handled separately to maintain 637 accuracy. 638 639 The approximation in this region exploits the identity 640 log( 1 + r ) = log( 1 + u/2 ) / log( 1 - u/2 ), where 641 u = 2r / (2+r). 642 Note that the right hand side has an odd Taylor series expansion 643 which converges much faster than the Taylor series expansion of 644 log( 1 + r ) in r. Thus, we approximate log( 1 + r ) by 645 u + A1 * u^3 + A2 * u^5 + ... + An * u^(2n+1). 646 647 One subtlety is that since u cannot be calculated from 648 r exactly, the rounding error in the first u should be 649 avoided if possible. To accomplish this, we observe that 650 u = r - r*r/(2+r). 651 Since x (=1+r) is the input argument, and thus presumed exact, 652 the formula above approximates u accurately because 653 u = r - correction, 654 and the magnitude of "correction" (of the order of r*r) 655 is small. 656 With these observations, we will approximate log( 1 + r ) by 657 r + ( (A1*u^3 + ... + An*u^(2n+1)) - correction ). 658 659 We approximate log(1+r) by an odd polynomial in u, where 660 u = 2r/(2+r) = r - r*r/(2+r). 661 */ 662 r = x - 1.0; 663 u = r / (2.0 + r); 664 correction = r * u; 665 u = u + u; 666 v = u * u; 667 z1 = r; 668 z2 = (u * v * (ca_1 + v * (ca_2 + v * (ca_3 + v * ca_4))) - correction); 669 *r1 = z1; 670 *r2 = z2; 671 *xexp = 0; 672 } 673 else 674 { 675 /* 676 First, we decompose the argument x to the form 677 x = 2**M * (F1 + F2), 678 where 1 <= F1+F2 < 2, M has the value of an integer, 679 F1 = 1 + j/64, j ranges from 0 to 64, and |F2| <= 1/128. 680 681 Second, we approximate log( 1 + F2/F1 ) by an odd polynomial 682 in U, where U = 2 F2 / (2 F2 + F1). 683 Note that log( 1 + F2/F1 ) = log( 1 + U/2 ) - log( 1 - U/2 ). 684 The core approximation calculates 685 Poly = [log( 1 + U/2 ) - log( 1 - U/2 )]/U - 1. 686 Note that log(1 + U/2) - log(1 - U/2) = 2 arctanh ( U/2 ), 687 thus, Poly = 2 arctanh( U/2 ) / U - 1. 688 689 It is not hard to see that 690 log(x) = M*log(2) + log(F1) + log( 1 + F2/F1 ). 691 Hence, we return Z1 = log(F1), and Z2 = log( 1 + F2/F1). 692 The values of log(F1) are calculated beforehand and stored 693 in the program. 694 */ 695 696 f = x; 697 if (ux < IMPBIT_DP64) 698 { 699 /* The input argument x is denormalized */ 700 /* Normalize f by increasing the exponent by 60 701 and subtracting a correction to account for the implicit 702 bit. This replaces a slow denormalized 703 multiplication by a fast normal subtraction. */ 704 static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */ 705 GET_BITS_DP64(f, ux); 706 ux |= 0x03d0000000000000; 707 PUT_BITS_DP64(ux, f); 708 f -= corr; 709 GET_BITS_DP64(f, ux); 710 expadjust = 60; 711 } 712 else 713 expadjust = 0; 714 715 /* Store the exponent of x in xexp and put 716 f into the range [0.5,1) */ 717 *xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 - expadjust; 718 PUT_BITS_DP64((ux & MANTBITS_DP64) | HALFEXPBITS_DP64, f); 719 720 /* Now x = 2**xexp * f, 1/2 <= f < 1. */ 721 722 /* Set index to be the nearest integer to 128*f */ 723 r = 128.0 * f; 724 index = (int)(r + 0.5); 725 726 z1 = ln_lead_table[index-64]; 727 q = ln_tail_table[index-64]; 728 f1 = index * 0.0078125; /* 0.0078125 = 1/128 */ 729 f2 = f - f1; 730 /* At this point, x = 2**xexp * ( f1 + f2 ) where 731 f1 = j/128, j = 64, 65, ..., 128 and |f2| <= 1/256. */ 732 733 /* Calculate u = 2 f2 / ( 2 f1 + f2 ) = f2 / ( f1 + 0.5*f2 ) */ 734 /* u = f2 / (f1 + 0.5 * f2); */ 735 u = f2 / (f1 + 0.5 * f2); 736 737 /* Here, |u| <= 2(exp(1/16)-1) / (exp(1/16)+1). 738 The core approximation calculates 739 poly = [log(1 + u/2) - log(1 - u/2)]/u - 1 */ 740 v = u * u; 741 poly = (v * (cb_1 + v * (cb_2 + v * cb_3))); 742 z2 = q + (u + u * poly); 743 *r1 = z1; 744 *r2 = z2; 745 } 746 return; 747 } 748 749 #endif /* LIBM_INLINES_AMD_H_INCLUDED */