/ src / optmized / vec / vrd2_exp.c
vrd2_exp.c
  1  /*
  2   * Copyright (C) 2019-2020, Advanced Micro Devices. All rights reserved.
  3   *
  4   * Redistribution and use in source and binary forms, with or without modification,
  5   * are permitted provided that the following conditions are met:
  6   * 1. Redistributions of source code must retain the above copyright notice,
  7   *    this list of conditions and the following disclaimer.
  8   * 2. Redistributions in binary form must reproduce the above copyright notice,
  9   *    this list of conditions and the following disclaimer in the documentation
 10   *    and/or other materials provided with the distribution.
 11   * 3. Neither the name of the copyright holder nor the names of its contributors
 12   *    may be used to endorse or promote products derived from this software without
 13   *    specific prior written permission.
 14   *
 15   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 18   * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 19   * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 20   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 21   * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 22   * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 23   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 24   * POSSIBILITY OF SUCH DAMAGE.
 25   *
 26   */
 27  
 28   /*
 29    * C implementation of exp double precision 128-bit vector version (v2d)
 30    *
 31    * Implementation Notes
 32    * ----------------------
 33    * 1. Argument Reduction:
 34    *      e^x = 2^(x/ln2) = 2^(x*(64/ln(2))/64)     --- (1)
 35    *
 36    *      Choose 'n' and 'f', such that
 37    *      x * 64/ln2 = n + f                        --- (2) | n is integer
 38    *                            | |f| <= 0.5
 39    *     Choose 'm' and 'j' such that,
 40    *      n = (64 * m) + j                          --- (3)
 41    *
 42    *     From (1), (2) and (3),
 43    *      e^x = 2^((64*m + j + f)/64)
 44    *          = (2^m) * (2^(j/64)) * 2^(f/64)
 45    *          = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
 46    *
 47    * 2. Table Lookup
 48    *      Values of (2^(j/64)) are precomputed, j = 0, 1, 2, 3 ... 63
 49    *
 50    * 3. Polynomial Evaluation
 51    *   From (2),
 52    *     f = x*(64/ln(2)) - n
 53    *   Let,
 54    *     r  = f*(ln(2)/64) = x - n*(ln(2)/64)
 55    *
 56    * 4. Reconstruction
 57    *      Thus,
 58    *        e^x = (2^m) * (2^(j/64)) * e^r
 59    *
 60    */
 61  
 62  
 63  #include <stdint.h>
 64  #include <emmintrin.h>
 65  
 66  #include <libm_util_amd.h>
 67  #include <libm_special.h>
 68  #include <libm_macros.h>
 69  #include <libm_amd.h>
 70  #include <libm/types.h>
 71  #include <libm/typehelper.h>
 72  #include <libm/typehelper-vec.h>
 73  #include <libm/compiler.h>
 74  #include <libm/amd_funcs_internal.h>
 75  
 76  #include <libm/poly-vec.h>
 77  
 78  #define DOUBLE_PRECISION_BIAS 1023
 79  
 80  
 81  static const struct {
 82      v_f64x2_t tblsz_ln2;
 83      v_f64x2_t ln2_tblsz_head, ln2_tblsz_tail;
 84      v_f64x2_t Huge;
 85      v_i64x2_t exp_bias;
 86      v_i64x2_t exp_max;
 87      v_f64x2_t exp_maxd;
 88      v_f64x2_t exp_mind;
 89      v_i64x2_t mask;
 90      v_i64x2_t infinity;
 91      v_f64x2_t poly[12];
 92      double exp_min_value;
 93      }exp_data = {
 94                      .tblsz_ln2      = _MM_SET1_PD2(0x1.71547652b82fep+0),
 95                      .ln2_tblsz_head = _MM_SET1_PD2(0x1.63p-1),
 96                      .ln2_tblsz_tail = _MM_SET1_PD2(-0x1.bd0105c610ca8p-13),
 97                      .Huge           = _MM_SET1_PD2(0x1.8000000000000p+52),
 98                      .exp_bias       = _MM_SET1_I64x2(DOUBLE_PRECISION_BIAS),
 99                      .exp_maxd       = _MM_SET1_PD2(0x1.62e42fefa39efp+9),
100                      .exp_mind       = _MM_SET1_PD2(-0x1.62e42fefa39efp+9),
101                      .exp_max        = _MM_SET1_I64x2(0x4086200000000000),
102                      .mask           = _MM_SET1_I64x2(0x7FFFFFFFFFFFFFFF),
103                      .infinity       = _MM_SET1_I64x2(0x7ff0000000000000),
104                      .exp_min_value  = -0x1.74910d52d3051p+9,
105                      .poly           = {
106                                          _MM_SET1_PD2(0x1p0),
107                                          _MM_SET1_PD2(0x1.000000000001p-1),
108                                          _MM_SET1_PD2(0x1.55555555554a2p-3),
109                                          _MM_SET1_PD2(0x1.555555554f37p-5),
110                                          _MM_SET1_PD2(0x1.1111111130dd6p-7),
111                                          _MM_SET1_PD2(0x1.6c16c1878111dp-10),
112                                          _MM_SET1_PD2(0x1.a01a011057479p-13),
113                                          _MM_SET1_PD2(0x1.a01992d0fe581p-16),
114                                          _MM_SET1_PD2(0x1.71df4520705a4p-19),
115                                          _MM_SET1_PD2(0x1.28b311c80e499p-22),
116                                          _MM_SET1_PD2(0x1.ad661ce7af3e3p-26),
117                                         },
118      };
119  
120  #define DP64_BIAS        exp_data.exp_bias
121  #define LN2_HEAD         exp_data.ln2_tblsz_head
122  #define LN2_TAIL         exp_data.ln2_tblsz_tail
123  #define INVLN2           exp_data.tblsz_ln2
124  #define EXP_HUGE         exp_data.Huge
125  #define ARG_MAX          exp_data.exp_max
126  #define MASK             exp_data.mask
127  #define EXP_MAX          exp_data.exp_maxd
128  #define EXP_LOW          exp_data.exp_mind
129  #define INF              exp_data.infinity
130  #define EXP_MIN_VAL      exp_data.exp_min_value
131  
132  #define C1  exp_data.poly[0]
133  #define C3  exp_data.poly[1]
134  #define C4  exp_data.poly[2]
135  #define C5  exp_data.poly[3]
136  #define C6  exp_data.poly[4]
137  #define C7  exp_data.poly[5]
138  #define C8  exp_data.poly[6]
139  #define C9  exp_data.poly[7]
140  #define C10 exp_data.poly[8]
141  #define C11 exp_data.poly[9]
142  #define C12 exp_data.poly[10]
143  
144  
145  #define SCALAR_EXP ALM_PROTO(exp)
146  
147  v_f64x2_t
148  ALM_PROTO_OPT(vrd2_exp)(v_f64x2_t x)
149  {
150  
151      v_i64x2_t vx = as_v2_u64_f64(x);
152  
153      // Get absolute value
154      vx = vx & MASK;
155  
156      // Check if -709 < vx < 709
157      v_i64x2_t cond = (vx > ARG_MAX);
158  
159      // x * (64.0/ln(2))
160      v_f64x2_t z = x * INVLN2;
161  
162      v_f64x2_t dn = z + EXP_HUGE;
163  
164      // n = int (z)
165      v_i64x2_t n = as_v2_u64_f64(dn);
166  
167      // dn = double(n)
168      dn = dn - EXP_HUGE;
169  
170      // r = x - (dn * (ln(2)/64))
171      // where ln(2)/64 is split into Head and Tail values
172      v_f64x2_t r1 = x - ( dn * LN2_HEAD);
173      v_f64x2_t r2 = dn * LN2_TAIL;
174      v_f64x2_t r = r1 - r2;
175  
176      // m = (n - j)/64
177      // Calculate 2^m
178      v_i64x2_t m = (n + DP64_BIAS) << 52;
179  
180      // Compute polynomial
181      /* poly = C1 + C2*r + C3*r^2 + C4*r^3 + C5*r^4 + C6*r^5 +
182                C7*r^6 + C8*r^7 + C9*r^8 + C10*r^9 + C11*r^10 + C12*r^11
183              = (C1 + C2*r) + r^2(C3 + C4*r) + r^4(C5 + C6*r) +
184                r^6(C7 + C8*r) + r^8(C9 + C10*r) + r^10(C11 + C12*r)
185      */
186      v_f64x2_t poly = POLY_EVAL_11(r, C1, C1, C3, C4, C5, C6,
187                                    C7, C8, C9, C10, C11, C12);
188  
189      // result = polynomial * 2^m
190      v_f64x2_t ret = poly * as_v2_f64_u64(m);
191  
192      if(unlikely(any_v2_u64_loop(cond))) {
193  
194          v_i64x2_t inf_condition = x > EXP_MAX;
195  
196          v_i64x2_t zero_condition = x < EXP_LOW;
197  
198          v_64x2 vx = {.f64x2 = ret};
199  
200          //Zero out the elements that have to be set to infinity
201          vx.i64x2 = vx.i64x2 & (~inf_condition);
202  
203          inf_condition = inf_condition & INF;
204  
205          vx.i64x2 = vx.i64x2 | inf_condition;
206  
207          ret =  vx.f64x2;
208  
209          /*To handle denormal numbers */
210          if(any_v2_u64_loop(zero_condition)) {
211                  return (v_f64x2_t) {
212                      (zero_condition[0] && (x[0] < EXP_MIN_VAL)) ? 0.0:SCALAR_EXP(x[0]),
213                      (zero_condition[1] && (x[1] < EXP_MIN_VAL)) ? 0.0:SCALAR_EXP(x[1]),
214                  };
215          }
216      }
217  
218      return ret;
219  }