vrd2_exp.c
1 /* 2 * Copyright (C) 2019-2020, Advanced Micro Devices. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without modification, 5 * are permitted provided that the following conditions are met: 6 * 1. Redistributions of source code must retain the above copyright notice, 7 * this list of conditions and the following disclaimer. 8 * 2. Redistributions in binary form must reproduce the above copyright notice, 9 * this list of conditions and the following disclaimer in the documentation 10 * and/or other materials provided with the distribution. 11 * 3. Neither the name of the copyright holder nor the names of its contributors 12 * may be used to endorse or promote products derived from this software without 13 * specific prior written permission. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 19 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 20 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 21 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 22 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 * POSSIBILITY OF SUCH DAMAGE. 25 * 26 */ 27 28 /* 29 * C implementation of exp double precision 128-bit vector version (v2d) 30 * 31 * Implementation Notes 32 * ---------------------- 33 * 1. Argument Reduction: 34 * e^x = 2^(x/ln2) = 2^(x*(64/ln(2))/64) --- (1) 35 * 36 * Choose 'n' and 'f', such that 37 * x * 64/ln2 = n + f --- (2) | n is integer 38 * | |f| <= 0.5 39 * Choose 'm' and 'j' such that, 40 * n = (64 * m) + j --- (3) 41 * 42 * From (1), (2) and (3), 43 * e^x = 2^((64*m + j + f)/64) 44 * = (2^m) * (2^(j/64)) * 2^(f/64) 45 * = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) 46 * 47 * 2. Table Lookup 48 * Values of (2^(j/64)) are precomputed, j = 0, 1, 2, 3 ... 63 49 * 50 * 3. Polynomial Evaluation 51 * From (2), 52 * f = x*(64/ln(2)) - n 53 * Let, 54 * r = f*(ln(2)/64) = x - n*(ln(2)/64) 55 * 56 * 4. Reconstruction 57 * Thus, 58 * e^x = (2^m) * (2^(j/64)) * e^r 59 * 60 */ 61 62 63 #include <stdint.h> 64 #include <emmintrin.h> 65 66 #include <libm_util_amd.h> 67 #include <libm_special.h> 68 #include <libm_macros.h> 69 #include <libm_amd.h> 70 #include <libm/types.h> 71 #include <libm/typehelper.h> 72 #include <libm/typehelper-vec.h> 73 #include <libm/compiler.h> 74 #include <libm/amd_funcs_internal.h> 75 76 #include <libm/poly-vec.h> 77 78 #define DOUBLE_PRECISION_BIAS 1023 79 80 81 static const struct { 82 v_f64x2_t tblsz_ln2; 83 v_f64x2_t ln2_tblsz_head, ln2_tblsz_tail; 84 v_f64x2_t Huge; 85 v_i64x2_t exp_bias; 86 v_i64x2_t exp_max; 87 v_f64x2_t exp_maxd; 88 v_f64x2_t exp_mind; 89 v_i64x2_t mask; 90 v_i64x2_t infinity; 91 v_f64x2_t poly[12]; 92 double exp_min_value; 93 }exp_data = { 94 .tblsz_ln2 = _MM_SET1_PD2(0x1.71547652b82fep+0), 95 .ln2_tblsz_head = _MM_SET1_PD2(0x1.63p-1), 96 .ln2_tblsz_tail = _MM_SET1_PD2(-0x1.bd0105c610ca8p-13), 97 .Huge = _MM_SET1_PD2(0x1.8000000000000p+52), 98 .exp_bias = _MM_SET1_I64x2(DOUBLE_PRECISION_BIAS), 99 .exp_maxd = _MM_SET1_PD2(0x1.62e42fefa39efp+9), 100 .exp_mind = _MM_SET1_PD2(-0x1.62e42fefa39efp+9), 101 .exp_max = _MM_SET1_I64x2(0x4086200000000000), 102 .mask = _MM_SET1_I64x2(0x7FFFFFFFFFFFFFFF), 103 .infinity = _MM_SET1_I64x2(0x7ff0000000000000), 104 .exp_min_value = -0x1.74910d52d3051p+9, 105 .poly = { 106 _MM_SET1_PD2(0x1p0), 107 _MM_SET1_PD2(0x1.000000000001p-1), 108 _MM_SET1_PD2(0x1.55555555554a2p-3), 109 _MM_SET1_PD2(0x1.555555554f37p-5), 110 _MM_SET1_PD2(0x1.1111111130dd6p-7), 111 _MM_SET1_PD2(0x1.6c16c1878111dp-10), 112 _MM_SET1_PD2(0x1.a01a011057479p-13), 113 _MM_SET1_PD2(0x1.a01992d0fe581p-16), 114 _MM_SET1_PD2(0x1.71df4520705a4p-19), 115 _MM_SET1_PD2(0x1.28b311c80e499p-22), 116 _MM_SET1_PD2(0x1.ad661ce7af3e3p-26), 117 }, 118 }; 119 120 #define DP64_BIAS exp_data.exp_bias 121 #define LN2_HEAD exp_data.ln2_tblsz_head 122 #define LN2_TAIL exp_data.ln2_tblsz_tail 123 #define INVLN2 exp_data.tblsz_ln2 124 #define EXP_HUGE exp_data.Huge 125 #define ARG_MAX exp_data.exp_max 126 #define MASK exp_data.mask 127 #define EXP_MAX exp_data.exp_maxd 128 #define EXP_LOW exp_data.exp_mind 129 #define INF exp_data.infinity 130 #define EXP_MIN_VAL exp_data.exp_min_value 131 132 #define C1 exp_data.poly[0] 133 #define C3 exp_data.poly[1] 134 #define C4 exp_data.poly[2] 135 #define C5 exp_data.poly[3] 136 #define C6 exp_data.poly[4] 137 #define C7 exp_data.poly[5] 138 #define C8 exp_data.poly[6] 139 #define C9 exp_data.poly[7] 140 #define C10 exp_data.poly[8] 141 #define C11 exp_data.poly[9] 142 #define C12 exp_data.poly[10] 143 144 145 #define SCALAR_EXP ALM_PROTO(exp) 146 147 v_f64x2_t 148 ALM_PROTO_OPT(vrd2_exp)(v_f64x2_t x) 149 { 150 151 v_i64x2_t vx = as_v2_u64_f64(x); 152 153 // Get absolute value 154 vx = vx & MASK; 155 156 // Check if -709 < vx < 709 157 v_i64x2_t cond = (vx > ARG_MAX); 158 159 // x * (64.0/ln(2)) 160 v_f64x2_t z = x * INVLN2; 161 162 v_f64x2_t dn = z + EXP_HUGE; 163 164 // n = int (z) 165 v_i64x2_t n = as_v2_u64_f64(dn); 166 167 // dn = double(n) 168 dn = dn - EXP_HUGE; 169 170 // r = x - (dn * (ln(2)/64)) 171 // where ln(2)/64 is split into Head and Tail values 172 v_f64x2_t r1 = x - ( dn * LN2_HEAD); 173 v_f64x2_t r2 = dn * LN2_TAIL; 174 v_f64x2_t r = r1 - r2; 175 176 // m = (n - j)/64 177 // Calculate 2^m 178 v_i64x2_t m = (n + DP64_BIAS) << 52; 179 180 // Compute polynomial 181 /* poly = C1 + C2*r + C3*r^2 + C4*r^3 + C5*r^4 + C6*r^5 + 182 C7*r^6 + C8*r^7 + C9*r^8 + C10*r^9 + C11*r^10 + C12*r^11 183 = (C1 + C2*r) + r^2(C3 + C4*r) + r^4(C5 + C6*r) + 184 r^6(C7 + C8*r) + r^8(C9 + C10*r) + r^10(C11 + C12*r) 185 */ 186 v_f64x2_t poly = POLY_EVAL_11(r, C1, C1, C3, C4, C5, C6, 187 C7, C8, C9, C10, C11, C12); 188 189 // result = polynomial * 2^m 190 v_f64x2_t ret = poly * as_v2_f64_u64(m); 191 192 if(unlikely(any_v2_u64_loop(cond))) { 193 194 v_i64x2_t inf_condition = x > EXP_MAX; 195 196 v_i64x2_t zero_condition = x < EXP_LOW; 197 198 v_64x2 vx = {.f64x2 = ret}; 199 200 //Zero out the elements that have to be set to infinity 201 vx.i64x2 = vx.i64x2 & (~inf_condition); 202 203 inf_condition = inf_condition & INF; 204 205 vx.i64x2 = vx.i64x2 | inf_condition; 206 207 ret = vx.f64x2; 208 209 /*To handle denormal numbers */ 210 if(any_v2_u64_loop(zero_condition)) { 211 return (v_f64x2_t) { 212 (zero_condition[0] && (x[0] < EXP_MIN_VAL)) ? 0.0:SCALAR_EXP(x[0]), 213 (zero_condition[1] && (x[1] < EXP_MIN_VAL)) ? 0.0:SCALAR_EXP(x[1]), 214 }; 215 } 216 } 217 218 return ret; 219 }