/ src / optmized / vec / vrs8_expf.c
vrs8_expf.c
  1  /*
  2   * Copyright (C) 2019-2020, Advanced Micro Devices. All rights reserved.
  3   *
  4   * Redistribution and use in source and binary forms, with or without modification,
  5   * are permitted provided that the following conditions are met:
  6   * 1. Redistributions of source code must retain the above copyright notice,
  7   *    this list of conditions and the following disclaimer.
  8   * 2. Redistributions in binary form must reproduce the above copyright notice,
  9   *    this list of conditions and the following disclaimer in the documentation
 10   *    and/or other materials provided with the distribution.
 11   * 3. Neither the name of the copyright holder nor the names of its contributors
 12   *    may be used to endorse or promote products derived from this software without
 13   *    specific prior written permission.
 14   *
 15   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 18   * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 19   * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 20   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 21   * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 22   * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 23   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 24   * POSSIBILITY OF SUCH DAMAGE.
 25   *
 26   */
 27  
 28  /*
 29   * C implementation of exp single precision 256-bit vector version (v8s)
 30   *
 31   * Implementation Notes
 32   * ----------------------
 33   * 1. Argument Reduction:
 34   *      e^x = 2^(x/ln2)                          --- (1)
 35   *
 36   *      Let x/ln(2) = z                          --- (2)
 37   *
 38   *      Let z = n + r , where n is an integer    --- (3)
 39   *                      |r| <= 1/2
 40   *
 41   *     From (1), (2) and (3),
 42   *      e^x = 2^z
 43   *          = 2^(N+r)
 44   *          = (2^N)*(2^r)                        --- (4)
 45   *
 46   * 2. Polynomial Evaluation
 47   *   From (4),
 48   *     r   = z - N
 49   *     2^r = C1 + C2*r + C3*r^2 + C4*r^3 + C5 *r^4 + C6*r^5
 50   *
 51   * 4. Reconstruction
 52   *      Thus,
 53   *        e^x = (2^N) * (2^r)
 54   *
 55   *
 56   */
 57  
 58  #include <stdint.h>
 59  #include <emmintrin.h>
 60  
 61  #include <libm_util_amd.h>
 62  #include <libm_special.h>
 63  #include <libm_macros.h>
 64  #include <libm_amd.h>
 65  #include <libm/types.h>
 66  #include <libm/typehelper.h>
 67  #include <libm/typehelper-vec.h>
 68  #include <libm/compiler.h>
 69  #include <libm/amd_funcs_internal.h>
 70  
 71  #include <libm/poly-vec.h>
 72  
 73  static const struct {
 74      v_f32x8_t   tblsz_byln2;
 75      v_f32x8_t   ln2_tbl_head, ln2_tbl_tail;
 76      v_f32x8_t   huge;
 77      v_i32x8_t   arg_max;
 78      v_i32x8_t   mask;
 79      v_i32x8_t   expf_bias;
 80      v_f32x8_t   poly_expf_5[5];
 81      v_f32x8_t   poly_expf_7[7];
 82  } v_expf_data ={
 83                .tblsz_byln2 =  _MM256_SET1_PS8(0x1.71547652b82fep+0),
 84                .ln2_tbl_head = _MM256_SET1_PS8(0x1.63p-1),
 85                .ln2_tbl_tail = _MM256_SET1_PS8(-0x1.bd0104p-13),
 86                .huge        =  _MM256_SET1_PS8(0x1.8p+23) ,
 87                .arg_max     =  _MM256_SET1_I32(0x42AE0000),
 88                .mask        =  _MM256_SET1_I32(0x7FFFFFFF),
 89                .expf_bias   =  _MM256_SET1_I32(127),
 90  
 91               // Polynomial coefficients obtained using Remez algorithm
 92  
 93                .poly_expf_5 = {
 94                                _MM256_SET1_PS8(0x1p0),
 95                                _MM256_SET1_PS8(0x1.fffdc4p-2),
 96                                _MM256_SET1_PS8(0x1.55543cp-3),
 97                                _MM256_SET1_PS8(0x1.573aecp-5),
 98                                _MM256_SET1_PS8(0x1.126bb6p-7),
 99                },
100  
101                .poly_expf_7 = {
102                                _MM256_SET1_PS8(0x1p0),
103                                _MM256_SET1_PS8(0x1p-1),
104                                _MM256_SET1_PS8(0x1.555554p-3),
105                                _MM256_SET1_PS8(0x1.555468p-5),
106                                _MM256_SET1_PS8(0x1.1112fap-7),
107                                _MM256_SET1_PS8(0x1.6da4acp-10),
108                                _MM256_SET1_PS8(0x1.9eb724p-13),
109                },
110  
111  };
112  
113  
114  #define TBL_LN2   v_expf_data.tblsz_byln2
115  #define LN2_TBL_H v_expf_data.ln2_tbl_head
116  #define LN2_TBL_T v_expf_data.ln2_tbl_tail
117  #define EXPF_BIAS v_expf_data.expf_bias
118  #define EXP_HUGE      v_expf_data.huge
119  #define ARG_MAX   v_expf_data.arg_max
120  #define MASK      v_expf_data.mask
121  
122  // Coefficients for 5-degree polynomial
123  #define A0 v_expf_data.poly_expf_5[0]
124  #define A1 v_expf_data.poly_expf_5[1]
125  #define A2 v_expf_data.poly_expf_5[2]
126  #define A3 v_expf_data.poly_expf_5[3]
127  #define A4 v_expf_data.poly_expf_5[4]
128  
129  // Coefficients for 7-degree polynomial
130  #define C0 v_expf_data.poly_expf_7[0]
131  #define C1 v_expf_data.poly_expf_7[1]
132  #define C2 v_expf_data.poly_expf_7[2]
133  #define C3 v_expf_data.poly_expf_7[3]
134  #define C4 v_expf_data.poly_expf_7[4]
135  #define C5 v_expf_data.poly_expf_7[5]
136  #define C6 v_expf_data.poly_expf_7[6]
137  
138  
139  #define SCALAR_EXPF ALM_PROTO_OPT(expf)
140  
141  /*
142      Implementation with 7-degree polynomial
143  
144      Performance numbers:
145      GCC - 731 MOPS
146      AOCC - 833 MOPS
147  
148      Max ULP error : 1.7
149  */
150  v_f32x8_t
151  ALM_PROTO_OPT(vrs8_expf_experimental)(v_f32x8_t _x)
152  {
153  
154      // vx = int(x)
155      v_i32x8_t vx = as_v8_u32_f32(_x);
156  
157      // Get absolute value of vx
158      vx = vx & MASK;
159  
160      // Check if -103 < vx < 88
161      v_i32x8_t cond = (vx > ARG_MAX);
162  
163      // x * (64.0/ln(2))
164      v_f32x8_t z = _x * TBL_LN2;
165  
166      v_f32x8_t dn = z + EXP_HUGE;
167  
168      // n = int(z)
169      v_u32x8_t n = as_v8_u32_f32(dn);
170  
171      // dn = double(n)
172      dn = dn - EXP_HUGE;
173  
174      // r = x - (dn * (ln(2)/64))
175      // where ln(2)/64 is split into Head and Tail values
176      v_f32x8_t r1 = _x - ( dn * LN2_TBL_H);
177      v_f32x8_t r2 = dn * LN2_TBL_T;
178      v_f32x8_t r = r1 - r2;
179  
180      // m = (n - j)/64
181      // Calculate 2^m
182      v_i32x8_t m = (n + EXPF_BIAS) << 23;
183  
184      // Compute polynomial
185      /* poly = C1 + C2*r + C3*r^2 + C4*r^3 + C5*r^4 + C6*r^5
186              = (C1 + C2*r) + r^2(C3 + C4*r) + r^4(C5 + C6*r)
187      */
188      v_f32x8_t poly = POLY_EVAL_7(r, C0, C0, C1, C2, C3, C4, C5, C6);
189  
190      // result = polynomial * 2^m
191      v_f32x8_t result = poly * as_v8_f32_u32(m);
192  
193      // If input value is outside valid range, call scalar expf(value)
194      // Else, return the above computed result
195      if(unlikely(any_v8_u32_loop(cond))) {
196      return (v_f32x8_t) {
197           cond[0] ? SCALAR_EXPF(_x[0]) : result[0],
198           cond[1] ? SCALAR_EXPF(_x[1]) : result[1],
199           cond[2] ? SCALAR_EXPF(_x[2]) : result[2],
200           cond[3] ? SCALAR_EXPF(_x[3]) : result[3],
201           cond[4] ? SCALAR_EXPF(_x[4]) : result[4],
202           cond[5] ? SCALAR_EXPF(_x[5]) : result[5],
203           cond[6] ? SCALAR_EXPF(_x[6]) : result[6],
204           cond[7] ? SCALAR_EXPF(_x[7]) : result[7],
205  
206       };
207      }
208  
209      return result;
210  
211  }
212  
213  /*
214      Implementation with 5-degree polynomial
215  
216      Performance numbers:
217      GCC - 773 MOPS
218      AOCC - 958 MOPS
219  
220      Max ULP - 3.3
221  */
222  v_f32x8_t
223  ALM_PROTO_OPT(vrs8_expf)(v_f32x8_t _x)
224  {
225  
226      // vx = int(x)
227      v_i32x8_t vx = as_v8_u32_f32(_x);
228  
229      // Get absolute value of vx
230      vx = vx & MASK;
231  
232      // Check if -103 < vx < 88
233      v_i32x8_t cond = (vx > ARG_MAX);
234  
235      // x * (64.0/ln(2))
236      v_f32x8_t z = _x * TBL_LN2;
237  
238      v_f32x8_t dn = z + EXP_HUGE;
239  
240      // n = int(z)
241      v_u32x8_t n = as_v8_u32_f32(dn);
242  
243      // dn = double(n)
244      dn = dn - EXP_HUGE;
245  
246      // r = x - (dn * (ln(2)/64))
247      // where ln(2)/64 is split into Head and Tail values
248      v_f32x8_t r1 = _x - ( dn * LN2_TBL_H);
249      v_f32x8_t r2 = dn * LN2_TBL_T;
250      v_f32x8_t r = r1 - r2;
251  
252      // m = (n - j)/64
253      // Calculate 2^m
254      v_i32x8_t m = (n + EXPF_BIAS) << 23;
255  
256      // Compute polynomial
257      /* poly = A1 + A2*r + A3*r^2 + A4*r^3 + A5*r^4 + A6*r^5
258              = (A1 + A2*r) + r^2(A3 + A4*r) + r^4(A5 + A6*r)
259      */
260      v_f32x8_t poly = POLY_EVAL_5(r, A0, A0, A1, A2, A3, A4);
261  
262      // result = polynomial * 2^m
263      v_f32x8_t result = poly * as_v8_f32_u32(m);
264  
265      // If input value is outside valid range, call scalar expf(value)
266      // Else, return the above computed result
267      if(unlikely(any_v8_u32_loop(cond))) {
268      return (v_f32x8_t) {
269           cond[0] ? SCALAR_EXPF(_x[0]) : result[0],
270           cond[1] ? SCALAR_EXPF(_x[1]) : result[1],
271           cond[2] ? SCALAR_EXPF(_x[2]) : result[2],
272           cond[3] ? SCALAR_EXPF(_x[3]) : result[3],
273           cond[4] ? SCALAR_EXPF(_x[4]) : result[4],
274           cond[5] ? SCALAR_EXPF(_x[5]) : result[5],
275           cond[6] ? SCALAR_EXPF(_x[6]) : result[6],
276           cond[7] ? SCALAR_EXPF(_x[7]) : result[7],
277  
278       };
279      }
280  
281      return result;
282  
283  }
284  
285