vrs8_expf.c
1 /* 2 * Copyright (C) 2019-2020, Advanced Micro Devices. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without modification, 5 * are permitted provided that the following conditions are met: 6 * 1. Redistributions of source code must retain the above copyright notice, 7 * this list of conditions and the following disclaimer. 8 * 2. Redistributions in binary form must reproduce the above copyright notice, 9 * this list of conditions and the following disclaimer in the documentation 10 * and/or other materials provided with the distribution. 11 * 3. Neither the name of the copyright holder nor the names of its contributors 12 * may be used to endorse or promote products derived from this software without 13 * specific prior written permission. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 19 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 20 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 21 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 22 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 * POSSIBILITY OF SUCH DAMAGE. 25 * 26 */ 27 28 /* 29 * C implementation of exp single precision 256-bit vector version (v8s) 30 * 31 * Implementation Notes 32 * ---------------------- 33 * 1. Argument Reduction: 34 * e^x = 2^(x/ln2) --- (1) 35 * 36 * Let x/ln(2) = z --- (2) 37 * 38 * Let z = n + r , where n is an integer --- (3) 39 * |r| <= 1/2 40 * 41 * From (1), (2) and (3), 42 * e^x = 2^z 43 * = 2^(N+r) 44 * = (2^N)*(2^r) --- (4) 45 * 46 * 2. Polynomial Evaluation 47 * From (4), 48 * r = z - N 49 * 2^r = C1 + C2*r + C3*r^2 + C4*r^3 + C5 *r^4 + C6*r^5 50 * 51 * 4. Reconstruction 52 * Thus, 53 * e^x = (2^N) * (2^r) 54 * 55 * 56 */ 57 58 #include <stdint.h> 59 #include <emmintrin.h> 60 61 #include <libm_util_amd.h> 62 #include <libm_special.h> 63 #include <libm_macros.h> 64 #include <libm_amd.h> 65 #include <libm/types.h> 66 #include <libm/typehelper.h> 67 #include <libm/typehelper-vec.h> 68 #include <libm/compiler.h> 69 #include <libm/amd_funcs_internal.h> 70 71 #include <libm/poly-vec.h> 72 73 static const struct { 74 v_f32x8_t tblsz_byln2; 75 v_f32x8_t ln2_tbl_head, ln2_tbl_tail; 76 v_f32x8_t huge; 77 v_i32x8_t arg_max; 78 v_i32x8_t mask; 79 v_i32x8_t expf_bias; 80 v_f32x8_t poly_expf_5[5]; 81 v_f32x8_t poly_expf_7[7]; 82 } v_expf_data ={ 83 .tblsz_byln2 = _MM256_SET1_PS8(0x1.71547652b82fep+0), 84 .ln2_tbl_head = _MM256_SET1_PS8(0x1.63p-1), 85 .ln2_tbl_tail = _MM256_SET1_PS8(-0x1.bd0104p-13), 86 .huge = _MM256_SET1_PS8(0x1.8p+23) , 87 .arg_max = _MM256_SET1_I32(0x42AE0000), 88 .mask = _MM256_SET1_I32(0x7FFFFFFF), 89 .expf_bias = _MM256_SET1_I32(127), 90 91 // Polynomial coefficients obtained using Remez algorithm 92 93 .poly_expf_5 = { 94 _MM256_SET1_PS8(0x1p0), 95 _MM256_SET1_PS8(0x1.fffdc4p-2), 96 _MM256_SET1_PS8(0x1.55543cp-3), 97 _MM256_SET1_PS8(0x1.573aecp-5), 98 _MM256_SET1_PS8(0x1.126bb6p-7), 99 }, 100 101 .poly_expf_7 = { 102 _MM256_SET1_PS8(0x1p0), 103 _MM256_SET1_PS8(0x1p-1), 104 _MM256_SET1_PS8(0x1.555554p-3), 105 _MM256_SET1_PS8(0x1.555468p-5), 106 _MM256_SET1_PS8(0x1.1112fap-7), 107 _MM256_SET1_PS8(0x1.6da4acp-10), 108 _MM256_SET1_PS8(0x1.9eb724p-13), 109 }, 110 111 }; 112 113 114 #define TBL_LN2 v_expf_data.tblsz_byln2 115 #define LN2_TBL_H v_expf_data.ln2_tbl_head 116 #define LN2_TBL_T v_expf_data.ln2_tbl_tail 117 #define EXPF_BIAS v_expf_data.expf_bias 118 #define EXP_HUGE v_expf_data.huge 119 #define ARG_MAX v_expf_data.arg_max 120 #define MASK v_expf_data.mask 121 122 // Coefficients for 5-degree polynomial 123 #define A0 v_expf_data.poly_expf_5[0] 124 #define A1 v_expf_data.poly_expf_5[1] 125 #define A2 v_expf_data.poly_expf_5[2] 126 #define A3 v_expf_data.poly_expf_5[3] 127 #define A4 v_expf_data.poly_expf_5[4] 128 129 // Coefficients for 7-degree polynomial 130 #define C0 v_expf_data.poly_expf_7[0] 131 #define C1 v_expf_data.poly_expf_7[1] 132 #define C2 v_expf_data.poly_expf_7[2] 133 #define C3 v_expf_data.poly_expf_7[3] 134 #define C4 v_expf_data.poly_expf_7[4] 135 #define C5 v_expf_data.poly_expf_7[5] 136 #define C6 v_expf_data.poly_expf_7[6] 137 138 139 #define SCALAR_EXPF ALM_PROTO_OPT(expf) 140 141 /* 142 Implementation with 7-degree polynomial 143 144 Performance numbers: 145 GCC - 731 MOPS 146 AOCC - 833 MOPS 147 148 Max ULP error : 1.7 149 */ 150 v_f32x8_t 151 ALM_PROTO_OPT(vrs8_expf_experimental)(v_f32x8_t _x) 152 { 153 154 // vx = int(x) 155 v_i32x8_t vx = as_v8_u32_f32(_x); 156 157 // Get absolute value of vx 158 vx = vx & MASK; 159 160 // Check if -103 < vx < 88 161 v_i32x8_t cond = (vx > ARG_MAX); 162 163 // x * (64.0/ln(2)) 164 v_f32x8_t z = _x * TBL_LN2; 165 166 v_f32x8_t dn = z + EXP_HUGE; 167 168 // n = int(z) 169 v_u32x8_t n = as_v8_u32_f32(dn); 170 171 // dn = double(n) 172 dn = dn - EXP_HUGE; 173 174 // r = x - (dn * (ln(2)/64)) 175 // where ln(2)/64 is split into Head and Tail values 176 v_f32x8_t r1 = _x - ( dn * LN2_TBL_H); 177 v_f32x8_t r2 = dn * LN2_TBL_T; 178 v_f32x8_t r = r1 - r2; 179 180 // m = (n - j)/64 181 // Calculate 2^m 182 v_i32x8_t m = (n + EXPF_BIAS) << 23; 183 184 // Compute polynomial 185 /* poly = C1 + C2*r + C3*r^2 + C4*r^3 + C5*r^4 + C6*r^5 186 = (C1 + C2*r) + r^2(C3 + C4*r) + r^4(C5 + C6*r) 187 */ 188 v_f32x8_t poly = POLY_EVAL_7(r, C0, C0, C1, C2, C3, C4, C5, C6); 189 190 // result = polynomial * 2^m 191 v_f32x8_t result = poly * as_v8_f32_u32(m); 192 193 // If input value is outside valid range, call scalar expf(value) 194 // Else, return the above computed result 195 if(unlikely(any_v8_u32_loop(cond))) { 196 return (v_f32x8_t) { 197 cond[0] ? SCALAR_EXPF(_x[0]) : result[0], 198 cond[1] ? SCALAR_EXPF(_x[1]) : result[1], 199 cond[2] ? SCALAR_EXPF(_x[2]) : result[2], 200 cond[3] ? SCALAR_EXPF(_x[3]) : result[3], 201 cond[4] ? SCALAR_EXPF(_x[4]) : result[4], 202 cond[5] ? SCALAR_EXPF(_x[5]) : result[5], 203 cond[6] ? SCALAR_EXPF(_x[6]) : result[6], 204 cond[7] ? SCALAR_EXPF(_x[7]) : result[7], 205 206 }; 207 } 208 209 return result; 210 211 } 212 213 /* 214 Implementation with 5-degree polynomial 215 216 Performance numbers: 217 GCC - 773 MOPS 218 AOCC - 958 MOPS 219 220 Max ULP - 3.3 221 */ 222 v_f32x8_t 223 ALM_PROTO_OPT(vrs8_expf)(v_f32x8_t _x) 224 { 225 226 // vx = int(x) 227 v_i32x8_t vx = as_v8_u32_f32(_x); 228 229 // Get absolute value of vx 230 vx = vx & MASK; 231 232 // Check if -103 < vx < 88 233 v_i32x8_t cond = (vx > ARG_MAX); 234 235 // x * (64.0/ln(2)) 236 v_f32x8_t z = _x * TBL_LN2; 237 238 v_f32x8_t dn = z + EXP_HUGE; 239 240 // n = int(z) 241 v_u32x8_t n = as_v8_u32_f32(dn); 242 243 // dn = double(n) 244 dn = dn - EXP_HUGE; 245 246 // r = x - (dn * (ln(2)/64)) 247 // where ln(2)/64 is split into Head and Tail values 248 v_f32x8_t r1 = _x - ( dn * LN2_TBL_H); 249 v_f32x8_t r2 = dn * LN2_TBL_T; 250 v_f32x8_t r = r1 - r2; 251 252 // m = (n - j)/64 253 // Calculate 2^m 254 v_i32x8_t m = (n + EXPF_BIAS) << 23; 255 256 // Compute polynomial 257 /* poly = A1 + A2*r + A3*r^2 + A4*r^3 + A5*r^4 + A6*r^5 258 = (A1 + A2*r) + r^2(A3 + A4*r) + r^4(A5 + A6*r) 259 */ 260 v_f32x8_t poly = POLY_EVAL_5(r, A0, A0, A1, A2, A3, A4); 261 262 // result = polynomial * 2^m 263 v_f32x8_t result = poly * as_v8_f32_u32(m); 264 265 // If input value is outside valid range, call scalar expf(value) 266 // Else, return the above computed result 267 if(unlikely(any_v8_u32_loop(cond))) { 268 return (v_f32x8_t) { 269 cond[0] ? SCALAR_EXPF(_x[0]) : result[0], 270 cond[1] ? SCALAR_EXPF(_x[1]) : result[1], 271 cond[2] ? SCALAR_EXPF(_x[2]) : result[2], 272 cond[3] ? SCALAR_EXPF(_x[3]) : result[3], 273 cond[4] ? SCALAR_EXPF(_x[4]) : result[4], 274 cond[5] ? SCALAR_EXPF(_x[5]) : result[5], 275 cond[6] ? SCALAR_EXPF(_x[6]) : result[6], 276 cond[7] ? SCALAR_EXPF(_x[7]) : result[7], 277 278 }; 279 } 280 281 return result; 282 283 } 284 285