/* src/optimized/vec/vrs4_powf.c */
  1  /*
  2   * Copyright (C) 2018-2020, Advanced Micro Devices. All rights reserved.
  3   *
  4   * Redistribution and use in source and binary forms, with or without modification,
  5   * are permitted provided that the following conditions are met:
  6   * 1. Redistributions of source code must retain the above copyright notice,
  7   *    this list of conditions and the following disclaimer.
  8   * 2. Redistributions in binary form must reproduce the above copyright notice,
  9   *    this list of conditions and the following disclaimer in the documentation
 10   *    and/or other materials provided with the distribution.
 11   * 3. Neither the name of the copyright holder nor the names of its contributors
 12   *    may be used to endorse or promote products derived from this software without
 13   *    specific prior written permission.
 14   *
 15   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 18   * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 19   * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 20   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 21   * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 22   * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 23   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 24   * POSSIBILITY OF SUCH DAMAGE.
 25   *
 26   */
 27  
 28  #include <libm_util_amd.h>
 29  #include <libm_special.h>
 30  
 31  #include <libm_macros.h>
 32  #include <libm_amd.h>
 33  #include <libm/amd_funcs_internal.h>
 34  #include <libm/types.h>
 35  #include <libm/typehelper.h>
 36  #include <libm/typehelper-vec.h>
 37  #include <libm/compiler.h>
 38  
 39  #define AMD_LIBM_FMA_USABLE 1           /* needed for poly.h */
 40  #include <libm/poly-vec.h>
 41  
#define VECTOR_LENGTH 4
#define N 8                                     /* table index width: tables have 2^N entries */
#define TABLE_SIZE (1ULL << N)
#define MAX_POLYDEGREE  8
/* 256-entry log tables (bit patterns of doubles), defined in the shared table data file */
extern const uint64_t log_256[];
extern const uint64_t log_f_inv_256[];
#define TAB_F_INV log_f_inv_256
#define TAB_LOG   log_256
#define MANT_MASK_N  (0x000FF00000000000ULL)    /* top N=8 mantissa bits of a double */
#define MANT_MASK_N1 (0x0000080000000000ULL)    /* the (N+1)-th mantissa bit (rounding bit) */
#define SINGLE_PRECISION_BIAS 127
#define DOUBLE_PRECISION_MANTISSA 0x000fffffffffffffULL
#define ONE_BY_TWO 0x3fe0000000000000ULL        /* bit pattern of the double 0.5 */
 55  
 56  v_i32x4_t float_bias =  _MM_SET1_I32(SINGLE_PRECISION_BIAS);
 57  v_u64x4_t mantissa_bits = _MM_SET1_I64(DOUBLE_PRECISION_MANTISSA);
 58  v_u64x4_t one_by_two = _MM_SET1_I64(ONE_BY_TWO);
 59  v_u64x4_t mant_8_bits = _MM_SET1_I64(MANT_MASK_N);
 60  
 61  
/*
 * Constants for the log(x) stage: clamp range for the fast path,
 * ln(2), and the reciprocal coefficients of the log(1-r) series.
 */
static struct {
    v_u32x4_t v_min, v_max;
    double ALIGN(16) poly[MAX_POLYDEGREE];
    v_f64x4_t ln2;
} v_log_data = {
    .ln2    = _MM_SET1_PD4(0x1.62e42fefa39efp-1), /* ln(2) */
    .v_min  = _MM_SET1_I32(0x00800000),           /* smallest normal float (bit pattern) */
    .v_max  = _MM_SET1_I32(0x7f800000),           /* +inf (bit pattern); excludes inf/nan */
    /*
    * Polynomial constants, 1/x (reciprocals of 2..9).
    * Only the first two (1/2, 1/3) are consumed here, via C2/C3.
    */
    .poly = { /* skip for 0/1 and 1/1 */
        0x1.0000000000000p-1,    /* 1/2 */
        0x1.5555555555555p-2,    /* 1/3 */
        0x1.0000000000000p-2,    /* 1/4 */
        0x1.999999999999ap-3,    /* 1/5 */
        0x1.5555555555555p-3,    /* 1/6 */
        0x1.2492492492492p-3,    /* 1/7 */
        0x1.0000000000000p-3,    /* 1/8 */
        0x1.c71c71c71c71cp-4,    /* 1/9 */
    },
};
 84  
 85  
 86  static struct {
 87      v_f64x4_t ln2by_tblsz, tblsz_byln2, Huge;
 88      double_t ALIGN(16) poly[MAX_POLYDEGREE];
 89      v_u64x4_t expf_max;
 90      v_f32x4_t   expf_maxf, expf_minf;
 91      v_i32x4_t   infinity;
 92  } expf_v4_data  = {
 93      .ln2by_tblsz = _MM_SET1_PD4(0x1.62e42fefa39efp-7),
 94      .tblsz_byln2 = _MM_SET1_PD4(0x1.71547652b82fep+0),
 95      .Huge = _MM_SET1_PD4(0x1.8000000000000p+52),
 96      .expf_max = _MM_SET1_I64(0x4056000000000000),
 97      .infinity    =  _MM_SET1_I32(0x7f800000),
 98      .expf_minf    =  _MM_SET1_PS4(-0x1.9fe368p6f),
 99      .expf_maxf    =  _MM_SET1_PS4(88.7228393f),
100      .poly = {
101          0x1.0000014439a91p0,
102  		0x1.62e43170e3344p-1,
103  		0x1.ebf906bc4c115p-3,
104  		0x1.c6ae2bb88c0c8p-5,
105  		0x1.3d1079db4ef69p-7,
106  		0x1.5f8905cb0cc4ep-10
107      },
108  };
109  
#define SCALAR_POWF amd_powf
#define V_MIN       v_log_data.v_min
#define V_MAX       v_log_data.v_max
/* NOTE(review): v_log_data has no member named `v_mask`; this macro is
 * dead/stale and would fail to compile if ever used -- confirm and remove. */
#define V_MASK      v_log_data.v_mask
#define LN2         v_log_data.ln2
#define INVLN2      expf_v4_data.tblsz_byln2
#define EXPF_HUGE   expf_v4_data.Huge
#define EXPF_MAX    expf_v4_data.expf_max
#define EXPF_MAXF   expf_v4_data.expf_maxf
#define EXPF_MINF   expf_v4_data.expf_minf
#define INF         expf_v4_data.infinity
/*
 * Short names for polynomial coefficients
 */

/* D1..D6: coefficients of the degree-5 polynomial approximating 2^r */
#define D1  _MM_SET1_PD4(expf_v4_data.poly[0])
#define D2  _MM_SET1_PD4(expf_v4_data.poly[1])
#define D3  _MM_SET1_PD4(expf_v4_data.poly[2])
#define D4  _MM_SET1_PD4(expf_v4_data.poly[3])
#define D5  _MM_SET1_PD4(expf_v4_data.poly[4])
#define D6  _MM_SET1_PD4(expf_v4_data.poly[5])

/* C2, C3: 1/2 and 1/3 for the log(1-r) series */
#define C2  _MM_SET1_PD4(v_log_data.poly[0])
#define C3  _MM_SET1_PD4(v_log_data.poly[1])
134  
135  
136  /*
137   *   __m128 ALM_PROTO_OPT(vrs4_powf)(__m128, __m128);
138   *
139   * Spec:
140   *   - A slightly relaxed version of the scalar powf.
141   *   - Maximum ULP is expected to be less than 2.
 *   - Performance is expected to be double that of scalar powf.
143   *
144   *
145   * Implementation Notes:
146   * pow(x,y) = e^(y * log(x))
147   *  1. Calculation of log(x):
148   *      x = 2^n*(1+f)                                          .... (1)
149   *         where n is exponent and is an integer
150   *             (1+f) is mantissa ∈ [1,2). i.e., 1 ≤ 1+f < 2    .... (2)
151   *
152   *      From (1), taking log on both sides
153   *      log(x) = log(2^n * (1+f))
154   *             = log(2^n) + log(1+f)
155   *             = n*log(2) + log(1+f)                           .... (3)
156   *
157   *      let z = 1 + f
158   *             log(x) = n*log(2) + log(z)
159   *      z = G + g
160   *      log(x) = n*log(2) + log(G + g)
161   *
162   *      log(x) = n*log(2) + log(2*(G/2 + g/2))
163   *
164   *      log(x) = n*log(2) + log(2) + log(F + f)  with (0.5 <= F < 1) and (f <= 2 ^ (-10))
165   *
166   *      log(x) = m * log(2) + log(2) + log(F) + log(1 -(f / F))
167   *
168   *      Y = (2 ^ (-1)) * (2 ^ (-m)) * (2 ^ m) * A
169   *
170   *      Now, range of Y is: 0.5 <= Y < 1
171   *
172   *      F = 0x100 + (first 8 mantissa bits) + (9th mantissa bit)
173   *
174   *      Now, range of F is: 256 <= F <= 512
175   *
176   *      F = F / 512
177   *
178   *      Now, range of F is: 0.5 <= F <= 1
179   *
180   *      f = -(Y - F), with (f <= 2^(-10))
181   *
182   *      log(x) = m * log(2) + log(2) + log(F) + log(1 -(f / F))
183   *
184   *      log(x) = m * log(2) + log(2 * F) + log(1 - r)
185   *
186   *      r = (f / F), with (r <= 2^(-9))
187   *
188   *      r = f * (1 / F) with (1 / F) precomputed to avoid division
189   *
190   *      log(x) = m * log(2) + log(G) - poly
191   *
192   *      log(G) is precomputed
193   *
 *      poly = r + (r ^ 2) / 2 + (r ^ 3) / 3
195   *
196   *  2. Computation of e^(y * log(x))
197   *      e^x = 2 ^ (x/ln(2))
198   *
199   *      z =  x/ln(2)
200   *
201   *      z = n + r where n is the integer part of the number and r is the fraction
202   *
203   *      2^z = 2^n * 2^r
204   *
205   *      2^n can be calculated as (n + 1023) << 52
206   *
207   *      2^r is approximated by a polynomial approximated by sollya
208   *
209   *      Polynomial Approximation:
210   *      For More information refer to tools/sollya/vrs4_expf.sollya
211   *
212   *
213   */
214  
215  static inline v_f64x4_t look_table_access(const uint64_t* table,
216                                            const int vector_size,
217                                            v_u64x4_t indices)
218  {
219       uint64_t j;
220       v_f64x4_t ret;
221       for(int i = 0; i < vector_size; i++) {
222          j = indices[i];
223          ret[i] = asdouble(table[j]);
224       }
225       return ret;
226  }
227  
/*
 * Vector (4-lane) single-precision pow: computes powf(x[i], y[i]) for the
 * four lanes of _x/_y as e^(y*log(x)), carried out in double precision.
 * Lanes with special x (zero, denormal, negative, inf, nan) are delegated
 * to the scalar amd_powf.  See the algorithm notes above.
 */
__m128
ALM_PROTO_OPT(vrs4_powf)(__m128 _x,__m128 _y)
{

    v_u32x4_t u;

    v_f32x4_t ret = _x;

    u = as_v4_u32_f32(_x);

    /*
     * Single unsigned range check per lane: (u - V_MIN) >= (V_MAX - V_MIN)
     * is true iff the float bit pattern lies outside [0x00800000, 0x7f800000),
     * i.e. x is zero, denormal, negative, infinity or NaN.
     */
    v_i32x4_t condition = (u - V_MIN >= V_MAX - V_MIN);

    if(any_v4_u32_loop(condition)) {

        /* At least one special lane: defer all four to scalar powf, which
         * implements the full IEEE-754 special-case handling. */
        ret[0] = SCALAR_POWF(_x[0], _y[0]);
        ret[1] = SCALAR_POWF(_x[1], _y[1]);
        ret[2] = SCALAR_POWF(_x[2], _y[2]);
        ret[3] = SCALAR_POWF(_x[3], _y[3]);

        return ret;
    }

    /* Widen to double so y*log(x) keeps enough precision for a <2 ULP
     * float result. */
    v_f64x4_t xd = _mm256_cvtps_pd(_x);

    v_f64x4_t yd = _mm256_cvtps_pd(_y);

    v_u64x4_t ux = as_v4_u64_f64(xd);

    /* Unbiased exponent of x, extracted from the original FLOAT bits. */
    v_i32x4_t int_exponent =  (u >> 23) - float_bias;

    v_f64x4_t exponent =  _mm256_cvtepi32_pd (int_exponent);

    /* Replace the double's exponent field with that of 0.5: mant is the
     * bit pattern of Y = mantissa(x)/2, so 0.5 <= Y < 1. */
    v_u64x4_t mant  = ((ux & mantissa_bits) | one_by_two);

    /* Top N=8 mantissa bits select the table entry F. */
    v_u64x4_t index = ux & mant_8_bits;

    /* F: the leading 8 mantissa bits glued onto an exponent of 0.5, i.e.
     * the quantized head of Y, with 0.5 <= F < 1. */
    v_f64x4_t index_times_half = as_v4_f64_u64(index | one_by_two);

    index =  index >> (52 - N);

    v_f64x4_t y1  = as_v4_f64_u64(mant);

    /* Residual f = F - Y; |f/F| is small (<= 2^-9 per the notes above). */
    v_f64x4_t f = index_times_half - y1;

    v_f64x4_t F_INV, LOG_256, r;

    /* Avoiding the use of vgatherpd instruction for performance reasons */

    F_INV = look_table_access(TAB_F_INV, VECTOR_LENGTH, index);   /* 1/F */

    LOG_256 = look_table_access(TAB_LOG, VECTOR_LENGTH, index);   /* log(G), per notes above */

    r = f * F_INV;

    v_f64x4_t r2 = r * r;                /* r^2 */

    /* q = r + r^2*(1/2 + r/3), the -log(1-r) correction series. */
    v_f64x4_t temp = C2  + r * C3;

    v_f64x4_t q = r + r2 * temp;

    /* m*log(2) + log(G) - poly */

    temp  = (exponent * LN2) + LOG_256;

    temp -= q;

    v_f64x4_t ylogx = temp * yd;

    /* Calculate exp*/

    v_u64x4_t v = as_v4_u64_f64(ylogx);

    /* Unsigned compare of the raw bits against 88.0 (0x4056...): true when
     * ylogx >= 88.0, and -- because the sign bit participates -- for every
     * negative ylogx too.  Those lanes get the fixup pass below. */
    v_i64x4_t condition2 = (v >= EXPF_MAX);

    v_f64x4_t z = ylogx * INVLN2;

    /* Round-to-nearest-int trick: adding 1.5*2^52 leaves the integer part
     * of z in the low mantissa bits of dn. */
    v_f64x4_t dn = z + EXPF_HUGE;

    v_u64x4_t n = as_v4_u64_f64(dn);

    dn = dn - EXPF_HUGE;

    r = z - dn;              /* fractional part of z */

    /* Estrin-style evaluation of the degree-5 polynomial for 2^r. */
    v_f64x4_t qtmp1 = D1 + (D2 * r);

    v_f64x4_t qtmp2 = D3 + (D4 * r);

    r2 = r * r;

    v_f64x4_t qtmp3 = D5 + (D6 * r);

    q =  qtmp1 + r2 * qtmp2;

    v_f64x4_t result = q + r2 * r2  * qtmp3;

    /* Scale by 2^n: n occupies the low bits of the shifter sum, so
     * (n << 52) added to the double's bits bumps its biased exponent. */
    ret = _mm256_cvtpd_ps(as_v4_f64_u64(as_v4_u64_f64(result) + (n << 52)));

    if(any_v4_u64_loop(condition2)) {

        /* Fixup pass: force +inf where ylogx is beyond expf's overflow
         * threshold and +0 where it is below the underflow threshold;
         * lanes in between keep the computed value unchanged. */
        v_f32x4_t x = _mm256_cvtpd_ps(ylogx);

        v_i32x4_t inf_condition = x > EXPF_MAXF;

        v_i32x4_t zero_condition = x < EXPF_MINF;

        v_32x4 vx = {.f32x4 = ret};

        //Zero out the elements that have to be set to infinity
        vx.i32x4 = vx.i32x4 & (~inf_condition);

        inf_condition = inf_condition & INF;

        vx.i32x4 = vx.i32x4 | inf_condition;

        /* Underflowing lanes are cleared to +0.0f. */
        vx.i32x4 = vx.i32x4 & (~zero_condition);

        ret = vx.f32x4;
    }

    return ret;

}
351  
352