pow.c
1 /* 2 * Copyright (C) 2008-2020 Advanced Micro Devices, Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without modification, 5 * are permitted provided that the following conditions are met: 6 * 1. Redistributions of source code must retain the above copyright notice, 7 * this list of conditions and the following disclaimer. 8 * 2. Redistributions in binary form must reproduce the above copyright notice, 9 * this list of conditions and the following disclaimer in the documentation 10 * and/or other materials provided with the distribution. 11 * 3. Neither the name of the copyright holder nor the names of its contributors 12 * may be used to endorse or promote products derived from this software without 13 * specific prior written permission. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 19 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 20 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 21 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 22 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 * POSSIBILITY OF SUCH DAMAGE. 25 * 26 */ 27 28 /* Contains implementation of double pow(double x, double y) 29 * x^y = 2^(y*log2(x)) 30 */ 31 #include <stdint.h> 32 #include <math.h> 33 #include <float.h> 34 35 #include "libm_macros.h" 36 #include "libm_util_amd.h" 37 #include "libm_special.h" 38 #include <libm/typehelper.h> 39 40 #define L__exp_bias 0x00000000000003ff /* 1023 */ 41 #define L__mant_mask 0x000fffffffffffff 42 #define N 10 43 #define TABLE_SIZE (1ULL << N) 44 #define ABSOLUTE_VALUE 0x7FFFFFFFFFFFFFFF 45 #define TOP12_EXP_MAX 0x40862E42 46 #define MANTISSA_10_BITS 0x000FFC0000000000 47 #define MANTISSA_11_BIT 0x0000020000000000 48 49 #if !defined(ENABLE_DEBUG) 50 #ifndef __clang__ 51 #pragma GCC push_options 52 #pragma GCC optimize ("O3") 53 #endif 54 #endif /* !DEBUG */ 55 56 #define likely(x) __builtin_expect (!!(x), 1) 57 #define unlikely(x) __builtin_expect (x, 0) 58 #define ALIGN(x) __attribute__((aligned ((x)))) 59 60 #define EXP_X_NAN 1 61 #define EXP_Y_ZERO 2 62 #define EXP_Y_INF 3 63 64 double _exp_special(double x, double y, uint32_t code); 65 66 struct log_data{ 67 uint64_t head; 68 uint64_t tail; 69 }; 70 71 struct exp_data{ 72 uint64_t head; 73 uint64_t tail; 74 }; 75 76 77 typedef union { 78 uint64_t value; 79 uint32_t split[2]; 80 } doubleword; 81 82 extern struct log_data log_Finv[]; 83 extern struct log_data log_f_256[]; 84 extern struct exp_data exp_lookup[]; 85 86 static struct { 87 double ALIGN(16) poly_log[8]; 88 double ALIGN(16) poly_exp[8]; 89 double_t L__real_log2_head, 90 L__real_log2_tail, 91 log2_by_N_head, 92 log2_by_N_tail, 93 N_by_log2; 94 } pow_data = { 95 .L__real_log2_head = 0x1.62e42e0000000p-1, 96 .L__real_log2_tail = 0x1.efa39ef35793cp-25, 97 /* 98 * Polynomial constants, 1/x! (reciprocal x) 99 * To make better use of cache line, 100 * we dont store 0! and 1! 101 */ 102 .poly_log = { /* skip for 0/1 and 1/1 */ 103 0x1.0000000000000p-1, /* 1/2 */ 104 0x1.5555555555555p-2, /* 1/3 */ 105 0x1.0000000000000p-2, /* 1/4 */ 106 0x1.999999999999ap-3, /* 1/5 */ 107 0x1.5555555555555p-3, /* 1/6 */ 108 0x1.2492492492492p-3, /* 1/7 */ 109 0x1.0000000000000p-3, /* 1/8 */ 110 0x1.c71c71c71c71cp-4, /* 1/9 */ 111 }, 112 .log2_by_N_head = 0x1.62e42f0000000p-11, 113 .log2_by_N_tail = 0x1.DF473DE6AF279p-36, 114 .N_by_log2 = 0x1.71547652b82fep10, 115 .poly_exp = { 116 0x1.5555555555555p-3, /* 1/3! * 3 */ 117 0x1.5555555555555p-5, /* 1/4! * 5 */ 118 0x1.1111111111111p-7, /* 1/5! * 7 */ 119 0x1.6c16c16c16c17p-10 /* 1/6! * 9 */ 120 }, 121 }; 122 123 #define C1 pow_data.poly_log[0] 124 #define C2 pow_data.poly_log[1] 125 #define C3 pow_data.poly_log[2] 126 #define C4 pow_data.poly_log[3] 127 128 #define LOG2_HEAD pow_data.L__real_log2_head 129 #define LOG2_TAIL pow_data.L__real_log2_tail 130 #define LOG2_BY_N_HEAD pow_data.log2_by_N_head 131 #define LOG2_BY_N_TAIL pow_data.log2_by_N_tail 132 #define N_BY_LOG2 pow_data.N_by_log2 133 134 #define a0 0.5 135 #define a1 pow_data.poly_exp[0] 136 #define a2 pow_data.poly_exp[1] 137 #define a3 pow_data.poly_exp[2] 138 139 #define POW_X_ONE_Y_SNAN 1 140 #define POW_X_ZERO_Z_INF 2 141 #define POW_X_NAN 3 142 #define POW_Y_NAN 4 143 #define POW_X_NAN_Y_NAN 5 144 #define POW_X_NEG_Y_NOTINT 6 145 #define POW_Z_ZERO 7 146 #define POW_Z_DENORMAL 8 147 #define POW_Z_INF 9 148 149 double _pow_special(double x, double y, double z, uint32_t code); 150 151 static inline double_t 152 compute_log(uint64_t ux, double_t* log_lo, int32_t expadjust) 153 { 154 /* 155 * Calculate log(x) in higher precision and store as log_hi and log_lo 156 * 157 * x very close to 1.0 is handled differently, for x everywhere else 158 * a brief explanation is given below 159 * 160 * x = (2 ^ m) * A 161 * x = (2 ^ m) * (G + g) with (1 <= G < 2) and (g <= 2 ^ (-9)) 162 * x = (2 ^ m) * 2 * (G/2 + g/2) 163 * x = (2 ^ m)* 2* (F + f) with (0.5 <= F < 1) and (f <= 2 ^ (-10)) 164 * 165 * Y = (2 ^ (-1)) * (2 ^ (-m)) * (2 ^ m) * A 166 * Now, range of Y is: 0.5 <= Y < 1 167 * F = 0x100 + (first 8 mantissa bits) + (9th mantissa bit) 168 * Now, range of F is: 256 <= F <= 512 169 * F = F / 512 170 * Now, range of F is: 0.5 <= F <= 1 171 * f = -(Y - F), with (f <= 2^(-10)) 172 * log(x) = m * log(2) + log(2) + log(F - f) 173 * log(x) = m * log(2) + log(2) + log(F) + log(1 -(f / F)) 174 * log(x) = m * log(2) + log(2 * F) + log(1 - r) 175 * r = (f / F), with (r <= 2^(-9)) 176 * r = f * (1 / F) with (1 / F) precomputed to avoid division 177 * log(x) = m * log(2) + log(G) - poly 178 * log(G) is precomputed 179 * poly = (r + (r ^ 2) / 2 + (r ^ 3) / 3 + (r ^ 4) / 4) + (r ^ 5) / 5) + (r ^ 6) / 6)) 180 * log(2) and log(G) need to be maintained in extra precision 181 * to avoid losing precision in the calculations 182 * Store the exponent of x in xexp and put 183 * f into the range [0.5, 1) 184 */ 185 int32_t xexp; 186 double_t r, r1, w, z, w1, resT_t, resT, resH; 187 double_t u, f, z1, q, f1, f2, poly; 188 xexp = ((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 - expadjust; 189 double_t exponent_x = (double_t)xexp; 190 f1 = asdouble((ux & MANTBITS_DP64) | HALFEXPBITS_DP64); 191 uint64_t index = (ux & MANTISSA_10_BITS); 192 index += ((ux & MANTISSA_11_BIT) << 1); 193 f = asdouble(index | HALFEXPBITS_DP64); 194 index = index >> 42; 195 f2 = f - f1; 196 /* 197 * At this point, x = 2**xexp * ( f1 + f2 ) where 198 * f1 = j/128, j = 64, 65, ..., 128 and |f2| <= 1/256. 199 * Compute 'u' from Taylor series of log2(1+f1/f2) 200 */ 201 z1 = asdouble(log_Finv[index].tail); 202 q = asdouble(log_Finv[index].head); 203 w = asdouble(log_f_256[index].head); 204 w1 = asdouble(log_f_256[index].tail); 205 r = f2 * z1; 206 r1 = f2 * q; 207 u = r + r1; 208 z = r1 - u; 209 210 /* 211 * Polynomial evaluation 212 * For N=10, 213 * poly = u * (u * (C1 + u * (C2 + u * (C3 + u * (C4))))) 214 */ 215 216 double_t A1, B0, usquare; 217 218 /* Estrin Scheme */ 219 A1 = C1 + u * C2; 220 double_t A2 = C3 + u * C4; 221 usquare = u * u; 222 B0 = usquare * A1; /* u^2 * C1 + u^3 * C2 */ 223 double ufour = usquare * usquare; 224 poly = B0 + ufour * A2; 225 poly = (z + r) + poly; 226 resT_t = (LOG2_TAIL * exponent_x - poly) + w1; 227 resT = resT_t - u; 228 resH = LOG2_HEAD * exponent_x + w; 229 double log_hi = resT + resH; 230 *log_lo = resH - log_hi + resT; 231 return log_hi; 232 } 233 234 static inline double_t 235 compute_exp(double_t v, double_t vt, uint64_t result_sign) 236 { 237 double_t A1, A2, z, r, usquare, w, w1, B0, poly; 238 const double_t EXP_HUGE = 0x1.8000000000000p52; 239 int64_t n; 240 int64_t i; 241 doubleword xword; 242 double temp = v; 243 v = v * N_BY_LOG2; 244 xword.value = asuint64(temp); 245 int32_t abs_ylogx = (xword.value & ABSOLUTE_VALUE) >> 32; 246 247 /* check if x > 1024 * ln(2) */ 248 if(unlikely(abs_ylogx >= TOP12_EXP_MAX)) 249 { 250 /* if abs(y * log(x)) > 709.7822265625 */ 251 if(xword.value >= EXP_MIN) 252 { 253 /* if y * log(x) < -745.133219101941222106688655913 */ 254 v = asdouble(0x0 | result_sign); 255 return _exp_special(xword.value, v, EXP_Y_ZERO); 256 257 } 258 if(temp > EXP_MAX_DOUBLE) 259 { 260 /* if y * log(x) > 709.7822265625 */ 261 v = asdouble(EXPBITS_DP64 | result_sign); 262 return _exp_special(xword.value, v, EXP_Y_INF); 263 264 } 265 abs_ylogx = 0xfff; 266 } 267 double_t fastconvert = v + EXP_HUGE; 268 n = asuint64(fastconvert); 269 double_t dn = fastconvert - EXP_HUGE; 270 271 /* Table size = 1024. Here N = 10 */ 272 int32_t index = n % (1 << N); 273 r = temp - (dn * LOG2_BY_N_HEAD); 274 int64_t m = ((n - index) << (EXPSHIFTBITS_DP64 - N)) + 0x3ff0000000000000; 275 r = (r - (LOG2_BY_N_TAIL * dn)) + vt; 276 /* 277 * Taylor's series to evaluate exp(r) 278 * For N = 11 279 * polynomial = r * (1.0 + r * (0.5 + r * (1/3! + r * (1/4!))) 280 * for N = 10 281 * polynomial =r * (1.0 + r * (0.5 + r * (1/3! + r * (1/4! + r * (1/5!)))) 282 */ 283 /*Estrin's approach used here for polynomial evaluation 284 * 285 * N = 10 286 * 287 */ 288 A1 = a0 + r * a1; /* A1 = 0.5 + r ^ (1/3!) */ 289 usquare = r * r; 290 A2 = a2 + r * a3; 291 B0 = r + usquare*A1; /* r + 0.5 * r ^ 2 + r ^ 3 * (1 / 3!) */ 292 poly = B0 + usquare * usquare * A2; 293 w = asdouble(exp_lookup[index].head); 294 w1 = asdouble(exp_lookup[index].tail); 295 temp = w1 + poly * w1; 296 z = poly * w; 297 double_t result= w + (z + temp); 298 299 /* Process denormals */ 300 if(unlikely(abs_ylogx == 0xfff)) 301 { 302 int32_t m2 = (n - index) >> N; 303 if(result < 1.0 || m2 < EMIN_DP64) 304 { 305 m2 = m2 + 1074; 306 i = 1ULL << m2; 307 dn = asdouble(i); 308 n = asuint64(result * dn); 309 result = asdouble(n | result_sign); 310 return result; 311 } 312 } 313 314 z = asdouble(m | result_sign); 315 return result * z; 316 } 317 318 static inline uint32_t checkint(uint64_t u) 319 { 320 int32_t u_exp = ((u & ABSOLUTE_VALUE) >> EXPSHIFTBITS_DP64); 321 /* 322 * See whether u is an integer. 323 * status = 0 means not an integer. 324 * status = 1 means odd integer. 325 * status = 2 means even integer. 326 */ 327 if (u_exp < 0x3ff) 328 return 0; 329 if (u_exp > 0x3ff + EXPSHIFTBITS_DP64) 330 return 2; 331 if (u & ((1ULL << (0x3ff + EXPSHIFTBITS_DP64 - u_exp)) - 1)) 332 return 0; 333 if (u & (1ULL << (0x3ff + EXPSHIFTBITS_DP64 - u_exp))) 334 return 1; /* odd integer */ 335 return 2; 336 } 337 338 /* Returns 1 if input is the bit representation of 0, infinity or nan. */ 339 static inline int checkzeroinfnan (uint64_t i) 340 { 341 return 2 * i - 1 >= 2 * EXPBITS_DP64 - 1; 342 } 343 344 static inline int issignaling_inline (double x) 345 { 346 uint64_t ix; 347 ix = asuint64(x); 348 return 2 * (ix ^ QNAN_MASK_64) > 2 * QNANBITPATT_DP64; 349 } 350 351 static inline double _pow_inexact(double x) 352 { 353 double_t a = 0x1.0p+0; /* a = 1.0 */ 354 double_t b = 0x1.4000000000000p+3; /* b = 10.0 */ 355 __asm __volatile ("divsd %1, %0" : "+x" (a): "x" (b)); 356 return x; 357 } 358 359 double 360 ALM_PROTO_OPT(pow)(double x, double y) 361 { 362 double_t log_lo; 363 double_t f; 364 int32_t expadjust = 0; 365 uint64_t ux, uy, result_sign; 366 uint64_t infinity = EXPBITS_DP64; 367 uint64_t one = ONEEXPBITS_DP64; 368 ux = asuint64(x); 369 uy = asuint64(y); 370 uint32_t xhigh = ux >> EXPSHIFTBITS_DP64; /* Top 12 bits of x */ 371 uint32_t yhigh = uy >> EXPSHIFTBITS_DP64; /* Top 12 bits of y */ 372 result_sign = 0; /* Hold the sign of the result */ 373 374 if (unlikely (xhigh - 0x001 >= 0x7ff - 0x001 375 || (yhigh & 0x7ff) - 0x3be >= 0x43e - 0x3be)) 376 { 377 if (unlikely (checkzeroinfnan (uy))) 378 { 379 if (2 * uy == 0) 380 return issignaling_inline (x) ? x + y : 1.0; 381 if (ux == one) 382 return issignaling_inline (y) ? x + y : 1.0; 383 if (2 * ux > 2 * infinity || 2 * uy > 2 * infinity) 384 return x + y; 385 if (2 * ux == 2 * one) 386 return 1.0; 387 if ((2 * ux < 2 * one) == !(uy >> 63)) 388 return 0.0; /* |x| < 1 && y = inf or |x| > 1 && y = -inf */ 389 return y * y; 390 } 391 if (unlikely (checkzeroinfnan (ux))) 392 { 393 394 double x2 = x * x; 395 /* x is negative , y is odd*/ 396 if (ux >> 63 && checkint (uy) == 1) 397 { 398 result_sign = SIGNBIT_DP64; 399 } 400 if ( 2 * ux == 0 && uy >> 63) 401 { 402 x2 = 1.0 / 0.0; 403 x2 = asdouble(ux | result_sign); 404 return x2; 405 } 406 x2 = asdouble(asuint64(x2) | result_sign); 407 return uy >> 63 ? (1 / x2) : x2; 408 } 409 /* Here x and y are non-zero finite. */ 410 if (ux >> 63) 411 { 412 /* Finite x < 0 */ 413 uint32_t yint = checkint (uy); 414 if (yint == 0) 415 return sqrt(x); 416 if (yint == 1) 417 result_sign = SIGNBIT_DP64; 418 ux &= ABSOLUTE_VALUE; 419 xhigh &= 0x7ff; 420 } 421 if ((yhigh & 0x7ff) - 0x3be >= 0x43e - 0x3be) { 422 /* Note: sign_bias = 0 here because y is not odd. */ 423 if (ux == one) 424 { 425 return _pow_inexact(1.0); 426 } 427 if ((yhigh & 0x7ff) < 0x3be) 428 { 429 /* |y| < 2 ^ -65, x ^ y ~= 1 + y * log(x) */ 430 return ux > one ? 1.0 + y : 1.0 - y; 431 } 432 return (ux > one) == (yhigh < 0x800) ? 433 (DBL_MAX*DBL_MAX) : 434 _pow_special(x, y, 0.0, POW_Z_ZERO); 435 } 436 437 if (xhigh == 0) 438 { 439 /* subnormal x */ 440 uint64_t mant = ux & MANTBITS_DP64; 441 f = asuint64(mant | ONEEXPBITS_DP64); 442 double_t temp = f - 1.0; 443 ux = asuint64(temp); 444 expadjust = 1022; 445 } 446 } 447 448 double_t log_hi = compute_log(ux, &log_lo, expadjust); 449 450 /* Multiplication of log_hi and log_lo with y */ 451 double_t v = log_hi * y; 452 double_t vt = y * log_lo + fma(y, log_hi, -v); 453 454 double_t result = compute_exp(v, vt, result_sign); 455 return result; 456 } 457 458 #if !defined(ENABLE_DEBUG) 459 #ifndef __clang__ 460 #pragma GCC pop_options 461 #endif 462 #endif