/ src / optmized / pow.c
pow.c
  1  /*
  2   * Copyright (C) 2008-2020 Advanced Micro Devices, Inc. All rights reserved.
  3   *
  4   * Redistribution and use in source and binary forms, with or without modification,
  5   * are permitted provided that the following conditions are met:
  6   * 1. Redistributions of source code must retain the above copyright notice,
  7   *    this list of conditions and the following disclaimer.
  8   * 2. Redistributions in binary form must reproduce the above copyright notice,
  9   *    this list of conditions and the following disclaimer in the documentation
 10   *    and/or other materials provided with the distribution.
 11   * 3. Neither the name of the copyright holder nor the names of its contributors
 12   *    may be used to endorse or promote products derived from this software without
 13   *    specific prior written permission.
 14   *
 15   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 18   * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 19   * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 20   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 21   * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 22   * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 23   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 24   * POSSIBILITY OF SUCH DAMAGE.
 25   *
 26   */
 27  
 28  /* Contains implementation of double pow(double x, double y)
 29   * x^y = 2^(y*log2(x))
 30   */
 31  #include <stdint.h>
 32  #include <math.h>
 33  #include <float.h>
 34  
 35  #include "libm_macros.h"
 36  #include "libm_util_amd.h"
 37  #include "libm_special.h"
 38  #include <libm/typehelper.h>
 39  
 40  #define L__exp_bias 0x00000000000003ff /* 1023 */
 41  #define L__mant_mask 0x000fffffffffffff
 42  #define N 10
 43  #define TABLE_SIZE (1ULL << N)
 44  #define ABSOLUTE_VALUE 0x7FFFFFFFFFFFFFFF
 45  #define TOP12_EXP_MAX 0x40862E42
 46  #define MANTISSA_10_BITS 0x000FFC0000000000
 47  #define MANTISSA_11_BIT 0x0000020000000000
 48  
 49  #if !defined(ENABLE_DEBUG)
 50  #ifndef __clang__
 51  #pragma GCC push_options
 52  #pragma GCC optimize ("O3")
 53  #endif
 54  #endif  /* !DEBUG */
 55  
 56  #define likely(x)   __builtin_expect (!!(x), 1)
 57  #define unlikely(x) __builtin_expect (x, 0)
 58  #define ALIGN(x)    __attribute__((aligned ((x))))
 59  
 60  #define EXP_X_NAN 1
 61  #define EXP_Y_ZERO 2
 62  #define EXP_Y_INF 3
 63  
 64  double _exp_special(double x, double y, uint32_t code);
 65  
 66  struct log_data{
 67      uint64_t head;
 68      uint64_t tail;
 69  };
 70  
 71  struct exp_data{
 72      uint64_t head;
 73      uint64_t tail;
 74  };
 75  
 76  
 77  typedef union {
 78      uint64_t value;
 79      uint32_t split[2];
 80  } doubleword;
 81  
 82  extern struct log_data log_Finv[];
 83  extern struct log_data log_f_256[];
 84  extern struct exp_data exp_lookup[];
 85  
 86  static struct {
 87      double ALIGN(16) poly_log[8];
 88      double ALIGN(16) poly_exp[8];
 89      double_t L__real_log2_head,
 90          L__real_log2_tail,
 91          log2_by_N_head,
 92          log2_by_N_tail,
 93          N_by_log2;
 94  } pow_data = {
 95      .L__real_log2_head  = 0x1.62e42e0000000p-1,
 96      .L__real_log2_tail  = 0x1.efa39ef35793cp-25,
 97      /*
 98       * Polynomial constants, 1/x! (reciprocal x)
 99       * To make better use of cache line,
100       * we dont store 0! and 1!
101       */
102      .poly_log = {       /* skip for 0/1 and 1/1 */
103          0x1.0000000000000p-1,   /* 1/2 */
104          0x1.5555555555555p-2,   /* 1/3 */
105          0x1.0000000000000p-2,   /* 1/4 */
106          0x1.999999999999ap-3,   /* 1/5 */
107          0x1.5555555555555p-3,   /* 1/6 */
108          0x1.2492492492492p-3,   /* 1/7 */
109          0x1.0000000000000p-3,   /* 1/8 */
110          0x1.c71c71c71c71cp-4,   /* 1/9 */
111      },
112      .log2_by_N_head = 0x1.62e42f0000000p-11,
113      .log2_by_N_tail = 0x1.DF473DE6AF279p-36,
114      .N_by_log2 = 0x1.71547652b82fep10,
115      .poly_exp = {
116          0x1.5555555555555p-3,   /* 1/3! * 3 */
117          0x1.5555555555555p-5,   /* 1/4! * 5 */
118          0x1.1111111111111p-7,   /* 1/5! * 7 */
119          0x1.6c16c16c16c17p-10   /* 1/6! * 9 */
120      },
121  };
122  
123  #define C1 pow_data.poly_log[0]
124  #define C2 pow_data.poly_log[1]
125  #define C3 pow_data.poly_log[2]
126  #define C4 pow_data.poly_log[3]
127  
128  #define LOG2_HEAD pow_data.L__real_log2_head
129  #define LOG2_TAIL pow_data.L__real_log2_tail
130  #define LOG2_BY_N_HEAD pow_data.log2_by_N_head
131  #define LOG2_BY_N_TAIL pow_data.log2_by_N_tail
132  #define N_BY_LOG2 pow_data.N_by_log2
133  
134  #define a0 0.5
135  #define a1 pow_data.poly_exp[0]
136  #define a2 pow_data.poly_exp[1]
137  #define a3 pow_data.poly_exp[2]
138  
139  #define POW_X_ONE_Y_SNAN 1
140  #define POW_X_ZERO_Z_INF 2
141  #define POW_X_NAN 3
142  #define POW_Y_NAN 4
143  #define POW_X_NAN_Y_NAN 5
144  #define POW_X_NEG_Y_NOTINT 6
145  #define POW_Z_ZERO 7
146  #define POW_Z_DENORMAL 8
147  #define POW_Z_INF 9
148  
149  double _pow_special(double x, double y, double z, uint32_t code);
150  
151  static inline double_t
152  compute_log(uint64_t ux, double_t* log_lo, int32_t expadjust)
153  {
154      /*
155       * Calculate log(x) in higher precision and store as log_hi and log_lo
156       *
157       * x very close to 1.0 is handled differently, for x everywhere else
158       * a brief explanation is given below
159       *
160       * x = (2 ^ m) * A
161       * x = (2 ^ m) * (G + g) with (1 <= G < 2) and (g <= 2 ^ (-9))
162       * x = (2 ^ m) * 2 * (G/2 + g/2)
163       * x = (2 ^ m)* 2* (F + f) with (0.5 <= F < 1) and (f <= 2 ^ (-10))
164       *
165       * Y = (2 ^ (-1)) * (2 ^ (-m)) * (2 ^ m) * A
166       * Now, range of Y is: 0.5 <= Y < 1
167       * F = 0x100 + (first 8 mantissa bits) + (9th mantissa bit)
168       * Now, range of F is: 256 <= F <= 512
169       * F = F / 512
170       * Now, range of F is: 0.5 <= F <= 1
171       * f = -(Y - F), with (f <= 2^(-10))
172       * log(x) = m * log(2) + log(2) + log(F - f)
173       * log(x) = m * log(2) + log(2) + log(F) + log(1 -(f / F))
174       * log(x) = m * log(2) + log(2 * F) + log(1 - r)
175       * r = (f / F), with (r <= 2^(-9))
176       * r = f * (1 / F) with (1 / F) precomputed to avoid division
177       * log(x) = m * log(2) + log(G) - poly
178       * log(G) is precomputed
179       * poly = (r + (r ^ 2) / 2 + (r ^ 3) / 3 + (r ^ 4) / 4) + (r ^ 5) / 5) + (r ^ 6) / 6))
180       * log(2) and log(G) need to be maintained in extra precision
181       * to avoid losing precision in the calculations
182       * Store the exponent of x in xexp and put
183       * f into the range [0.5, 1)
184      */
185      int32_t xexp;
186      double_t r, r1, w, z, w1, resT_t, resT, resH;
187      double_t u, f, z1, q, f1, f2, poly;
188      xexp = ((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 - expadjust;
189      double_t exponent_x = (double_t)xexp;
190      f1 = asdouble((ux & MANTBITS_DP64) | HALFEXPBITS_DP64);
191      uint64_t index = (ux & MANTISSA_10_BITS);
192      index += ((ux & MANTISSA_11_BIT) << 1);
193      f = asdouble(index | HALFEXPBITS_DP64);
194      index = index >> 42;
195      f2 = f - f1;
196      /*
197      * At this point, x = 2**xexp * ( f1  +  f2 ) where
198      * f1 = j/128, j = 64, 65, ..., 128 and |f2| <= 1/256.
199      * Compute 'u' from Taylor series of log2(1+f1/f2)
200      */
201      z1 = asdouble(log_Finv[index].tail);
202      q = asdouble(log_Finv[index].head);
203      w = asdouble(log_f_256[index].head);
204      w1 = asdouble(log_f_256[index].tail);
205      r = f2 * z1;
206      r1 = f2 * q;
207      u = r + r1;
208      z = r1 - u;
209  
210      /*
211       * Polynomial evaluation
212       * For N=10,
213       * poly = u * (u * (C1 + u * (C2 + u * (C3 + u * (C4)))))
214      */
215  
216      double_t A1, B0, usquare;
217  
218      /* Estrin Scheme */
219      A1 = C1 + u * C2;
220      double_t A2 = C3 + u * C4;
221      usquare = u * u;
222      B0 = usquare * A1; /* u^2 * C1 + u^3 * C2 */
223      double ufour = usquare * usquare;
224      poly = B0 + ufour * A2;
225      poly = (z + r) + poly;
226      resT_t = (LOG2_TAIL * exponent_x - poly) + w1;
227      resT = resT_t - u;
228      resH = LOG2_HEAD * exponent_x + w;
229      double log_hi = resT + resH;
230      *log_lo = resH - log_hi + resT;
231      return log_hi;
232  }
233  
234  static inline double_t
235  compute_exp(double_t v, double_t vt, uint64_t result_sign)
236  {
237      double_t A1, A2, z, r, usquare, w, w1, B0, poly;
238      const double_t EXP_HUGE = 0x1.8000000000000p52;
239      int64_t n;
240      int64_t i;
241      doubleword xword;
242      double temp = v;
243      v = v * N_BY_LOG2;
244      xword.value = asuint64(temp);
245      int32_t abs_ylogx = (xword.value & ABSOLUTE_VALUE) >> 32;
246  
247      /* check if x > 1024 * ln(2) */
248      if(unlikely(abs_ylogx >= TOP12_EXP_MAX))
249      {
250          /* if abs(y * log(x)) > 709.7822265625 */
251          if(xword.value >= EXP_MIN)
252          {
253              /* if y * log(x) < -745.133219101941222106688655913 */
254              v = asdouble(0x0 | result_sign);
255              return _exp_special(xword.value, v, EXP_Y_ZERO);
256  
257          }
258          if(temp > EXP_MAX_DOUBLE)
259          {
260              /* if y * log(x) > 709.7822265625 */
261              v = asdouble(EXPBITS_DP64 | result_sign);
262              return  _exp_special(xword.value, v,  EXP_Y_INF);
263  
264          }
265          abs_ylogx = 0xfff;
266      }
267      double_t fastconvert = v + EXP_HUGE;
268      n = asuint64(fastconvert);
269      double_t dn = fastconvert - EXP_HUGE;
270  
271      /* Table size = 1024. Here N = 10 */
272      int32_t index = n % (1 << N);
273      r = temp - (dn * LOG2_BY_N_HEAD);
274      int64_t m = ((n - index) << (EXPSHIFTBITS_DP64 - N)) + 0x3ff0000000000000;
275      r = (r - (LOG2_BY_N_TAIL * dn)) + vt;
276      /*
277       * Taylor's series to evaluate exp(r)
278       * For N = 11
279       * polynomial = r * (1.0 + r * (0.5 + r * (1/3! + r * (1/4!)))
280       * for N = 10
281       * polynomial =r * (1.0 + r * (0.5 + r * (1/3! + r * (1/4! + r * (1/5!))))
282      */
283      /*Estrin's approach used here for polynomial evaluation
284       *
285       * N = 10
286       *
287       */
288      A1 = a0 + r * a1; /* A1 = 0.5 + r ^ (1/3!) */
289      usquare = r * r;
290      A2 = a2 + r * a3;
291      B0 = r + usquare*A1; /* r + 0.5 * r ^ 2 + r ^ 3 * (1 / 3!) */
292      poly = B0 + usquare * usquare * A2;
293      w = asdouble(exp_lookup[index].head);
294      w1 = asdouble(exp_lookup[index].tail);
295      temp = w1 + poly * w1;
296      z = poly * w;
297      double_t result= w + (z + temp);
298  
299      /* Process denormals */
300      if(unlikely(abs_ylogx == 0xfff))
301      {
302          int32_t m2 = (n - index) >> N;
303          if(result < 1.0 || m2 < EMIN_DP64)
304          {
305              m2 = m2 + 1074;
306              i = 1ULL << m2;
307              dn = asdouble(i);
308              n = asuint64(result * dn);
309              result = asdouble(n | result_sign);
310              return result;
311          }
312      }
313  
314      z = asdouble(m | result_sign);
315      return result * z;
316  }
317  
318  static inline uint32_t checkint(uint64_t u)
319  {
320      int32_t u_exp = ((u & ABSOLUTE_VALUE) >> EXPSHIFTBITS_DP64);
321      /*
322       * See whether u is an integer.
323       * status = 0 means not an integer.
324       * status = 1 means odd integer.
325       * status = 2 means even integer.
326      */
327      if (u_exp < 0x3ff)
328          return 0;
329      if (u_exp > 0x3ff + EXPSHIFTBITS_DP64)
330          return 2;
331      if (u & ((1ULL << (0x3ff + EXPSHIFTBITS_DP64 - u_exp)) - 1))
332          return 0;
333      if (u & (1ULL << (0x3ff + EXPSHIFTBITS_DP64 - u_exp)))
334          return 1; /* odd integer */
335      return 2;
336  }
337  
338  /* Returns 1 if input is the bit representation of 0, infinity or nan. */
339  static inline int checkzeroinfnan (uint64_t i)
340  {
341      return 2 * i - 1 >= 2 * EXPBITS_DP64 - 1;
342  }
343  
344  static inline int issignaling_inline (double x)
345  {
346      uint64_t ix;
347      ix = asuint64(x);
348      return 2 * (ix ^ QNAN_MASK_64) > 2 * QNANBITPATT_DP64;
349  }
350  
351  static inline double _pow_inexact(double x)
352  {
353      double_t a = 0x1.0p+0; /* a = 1.0 */
354      double_t b = 0x1.4000000000000p+3; /* b = 10.0 */
355      __asm __volatile ("divsd %1, %0" :  "+x" (a): "x" (b));
356      return x;
357  }
358  
359  double
360  ALM_PROTO_OPT(pow)(double x, double y)
361  {
362      double_t log_lo;
363      double_t f;
364      int32_t expadjust = 0;
365      uint64_t ux, uy, result_sign;
366      uint64_t infinity = EXPBITS_DP64;
367      uint64_t one = ONEEXPBITS_DP64;
368      ux = asuint64(x);
369      uy = asuint64(y);
370      uint32_t xhigh = ux >> EXPSHIFTBITS_DP64; /* Top 12 bits of x */
371      uint32_t yhigh = uy >> EXPSHIFTBITS_DP64; /* Top 12 bits of y */
372      result_sign = 0; /* Hold the sign of the result */
373  
374      if (unlikely (xhigh - 0x001 >= 0x7ff - 0x001
375                    || (yhigh & 0x7ff) - 0x3be >= 0x43e - 0x3be))
376      {
377          if (unlikely (checkzeroinfnan (uy)))
378          {
379              if (2 * uy == 0)
380                  return issignaling_inline (x) ? x + y : 1.0;
381              if (ux == one)
382                  return issignaling_inline (y) ? x + y : 1.0;
383              if (2 * ux > 2 * infinity || 2 * uy > 2 * infinity)
384                  return x + y;
385              if (2 * ux == 2 * one)
386                  return 1.0;
387              if ((2 * ux < 2 * one) == !(uy >> 63))
388                  return 0.0; /* |x| < 1 && y = inf or |x| > 1 && y = -inf */
389              return y * y;
390          }
391          if (unlikely (checkzeroinfnan (ux)))
392          {
393  
394              double x2 = x * x;
395              /* x is negative , y is odd*/
396              if (ux >> 63 && checkint (uy) == 1)
397              {
398                  result_sign =  SIGNBIT_DP64;
399              }
400              if ( 2 * ux == 0 && uy >> 63)
401              {
402                  x2 = 1.0 / 0.0;
403                  x2 = asdouble(ux | result_sign);
404                  return x2;
405              }
406              x2 = asdouble(asuint64(x2) | result_sign);
407              return uy >> 63 ? (1 / x2) : x2;
408          }
409          /* Here x and y are non-zero finite. */
410          if (ux >> 63)
411          {
412              /* Finite x < 0 */
413              uint32_t yint = checkint (uy);
414              if (yint == 0)
415                  return sqrt(x);
416              if (yint == 1)
417                  result_sign = SIGNBIT_DP64;
418              ux &= ABSOLUTE_VALUE;
419              xhigh &= 0x7ff;
420          }
421          if ((yhigh & 0x7ff) - 0x3be >= 0x43e - 0x3be) {
422              /* Note: sign_bias = 0 here because y is not odd. */
423              if (ux == one)
424              {
425                  return _pow_inexact(1.0);
426              }
427              if ((yhigh & 0x7ff) < 0x3be)
428              {
429                  /* |y| < 2 ^ -65, x ^ y ~= 1 + y * log(x) */
430                  return ux > one ? 1.0 + y : 1.0 - y;
431              }
432              return (ux > one) == (yhigh < 0x800) ?
433                  (DBL_MAX*DBL_MAX) :
434                  _pow_special(x, y, 0.0, POW_Z_ZERO);
435          }
436  
437          if (xhigh == 0)
438          {
439              /* subnormal x */
440              uint64_t mant = ux & MANTBITS_DP64;
441              f = asuint64(mant | ONEEXPBITS_DP64);
442              double_t temp = f - 1.0;
443              ux = asuint64(temp);
444              expadjust = 1022;
445          }
446      }
447  
448      double_t log_hi = compute_log(ux, &log_lo, expadjust);
449  
450      /* Multiplication of log_hi and log_lo with y */
451      double_t v = log_hi * y;
452      double_t vt = y * log_lo + fma(y, log_hi, -v);
453  
454      double_t result = compute_exp(v, vt, result_sign);
455      return result;
456  }
457  
458  #if !defined(ENABLE_DEBUG)
459  #ifndef __clang__
460  #pragma GCC pop_options
461  #endif
462  #endif