/* vrs4_powf.c */
/*
 * Copyright (C) 2018-2020, Advanced Micro Devices. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of the copyright holder nor the names of its contributors
 *    may be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
25 * 26 */ 27 28 #include <libm_util_amd.h> 29 #include <libm_special.h> 30 31 #include <libm_macros.h> 32 #include <libm_amd.h> 33 #include <libm/amd_funcs_internal.h> 34 #include <libm/types.h> 35 #include <libm/typehelper.h> 36 #include <libm/typehelper-vec.h> 37 #include <libm/compiler.h> 38 39 #define AMD_LIBM_FMA_USABLE 1 /* needed for poly.h */ 40 #include <libm/poly-vec.h> 41 42 #define VECTOR_LENGTH 4 43 #define N 8 44 #define TABLE_SIZE (1ULL << N) 45 #define MAX_POLYDEGREE 8 46 extern const uint64_t log_256[]; 47 extern const uint64_t log_f_inv_256[]; 48 #define TAB_F_INV log_f_inv_256 49 #define TAB_LOG log_256 50 #define MANT_MASK_N (0x000FF00000000000ULL) 51 #define MANT_MASK_N1 (0x0000080000000000ULL) 52 #define SINGLE_PRECISION_BIAS 127 53 #define DOUBLE_PRECISION_MANTISSA 0x000fffffffffffffULL 54 #define ONE_BY_TWO 0x3fe0000000000000ULL 55 56 v_i32x4_t float_bias = _MM_SET1_I32(SINGLE_PRECISION_BIAS); 57 v_u64x4_t mantissa_bits = _MM_SET1_I64(DOUBLE_PRECISION_MANTISSA); 58 v_u64x4_t one_by_two = _MM_SET1_I64(ONE_BY_TWO); 59 v_u64x4_t mant_8_bits = _MM_SET1_I64(MANT_MASK_N); 60 61 62 static struct { 63 v_u32x4_t v_min, v_max; 64 double ALIGN(16) poly[MAX_POLYDEGREE]; 65 v_f64x4_t ln2; 66 } v_log_data = { 67 .ln2 = _MM_SET1_PD4(0x1.62e42fefa39efp-1), /* ln(2) */ 68 .v_min = _MM_SET1_I32(0x00800000), 69 .v_max = _MM_SET1_I32(0x7f800000), 70 /* 71 * Polynomial constants, 1/x! 
(reciprocal x) 72 */ 73 .poly = { /* skip for 0/1 and 1/1 */ 74 0x1.0000000000000p-1, /* 1/2 */ 75 0x1.5555555555555p-2, /* 1/3 */ 76 0x1.0000000000000p-2, /* 1/4 */ 77 0x1.999999999999ap-3, /* 1/5 */ 78 0x1.5555555555555p-3, /* 1/6 */ 79 0x1.2492492492492p-3, /* 1/7 */ 80 0x1.0000000000000p-3, /* 1/8 */ 81 0x1.c71c71c71c71cp-4, /* 1/9 */ 82 }, 83 }; 84 85 86 static struct { 87 v_f64x4_t ln2by_tblsz, tblsz_byln2, Huge; 88 double_t ALIGN(16) poly[MAX_POLYDEGREE]; 89 v_u64x4_t expf_max; 90 v_f32x4_t expf_maxf, expf_minf; 91 v_i32x4_t infinity; 92 } expf_v4_data = { 93 .ln2by_tblsz = _MM_SET1_PD4(0x1.62e42fefa39efp-7), 94 .tblsz_byln2 = _MM_SET1_PD4(0x1.71547652b82fep+0), 95 .Huge = _MM_SET1_PD4(0x1.8000000000000p+52), 96 .expf_max = _MM_SET1_I64(0x4056000000000000), 97 .infinity = _MM_SET1_I32(0x7f800000), 98 .expf_minf = _MM_SET1_PS4(-0x1.9fe368p6f), 99 .expf_maxf = _MM_SET1_PS4(88.7228393f), 100 .poly = { 101 0x1.0000014439a91p0, 102 0x1.62e43170e3344p-1, 103 0x1.ebf906bc4c115p-3, 104 0x1.c6ae2bb88c0c8p-5, 105 0x1.3d1079db4ef69p-7, 106 0x1.5f8905cb0cc4ep-10 107 }, 108 }; 109 110 #define SCALAR_POWF amd_powf 111 #define V_MIN v_log_data.v_min 112 #define V_MAX v_log_data.v_max 113 #define V_MASK v_log_data.v_mask 114 #define LN2 v_log_data.ln2 115 #define INVLN2 expf_v4_data.tblsz_byln2 116 #define EXPF_HUGE expf_v4_data.Huge 117 #define EXPF_MAX expf_v4_data.expf_max 118 #define EXPF_MAXF expf_v4_data.expf_maxf 119 #define EXPF_MINF expf_v4_data.expf_minf 120 #define INF expf_v4_data.infinity 121 /* 122 * Short names for polynomial coefficients 123 */ 124 125 #define D1 _MM_SET1_PD4(expf_v4_data.poly[0]) 126 #define D2 _MM_SET1_PD4(expf_v4_data.poly[1]) 127 #define D3 _MM_SET1_PD4(expf_v4_data.poly[2]) 128 #define D4 _MM_SET1_PD4(expf_v4_data.poly[3]) 129 #define D5 _MM_SET1_PD4(expf_v4_data.poly[4]) 130 #define D6 _MM_SET1_PD4(expf_v4_data.poly[5]) 131 132 #define C2 _MM_SET1_PD4(v_log_data.poly[0]) 133 #define C3 _MM_SET1_PD4(v_log_data.poly[1]) 134 135 136 /* 
137 * __m128 ALM_PROTO_OPT(vrs4_powf)(__m128, __m128); 138 * 139 * Spec: 140 * - A slightly relaxed version of the scalar powf. 141 * - Maximum ULP is expected to be less than 2. 142 * - Performance is expected to be double of scalar powf 143 * 144 * 145 * Implementation Notes: 146 * pow(x,y) = e^(y * log(x)) 147 * 1. Calculation of log(x): 148 * x = 2^n*(1+f) .... (1) 149 * where n is exponent and is an integer 150 * (1+f) is mantissa ∈ [1,2). i.e., 1 ≤ 1+f < 2 .... (2) 151 * 152 * From (1), taking log on both sides 153 * log(x) = log(2^n * (1+f)) 154 * = log(2^n) + log(1+f) 155 * = n*log(2) + log(1+f) .... (3) 156 * 157 * let z = 1 + f 158 * log(x) = n*log(2) + log(z) 159 * z = G + g 160 * log(x) = n*log(2) + log(G + g) 161 * 162 * log(x) = n*log(2) + log(2*(G/2 + g/2)) 163 * 164 * log(x) = n*log(2) + log(2) + log(F + f) with (0.5 <= F < 1) and (f <= 2 ^ (-10)) 165 * 166 * log(x) = m * log(2) + log(2) + log(F) + log(1 -(f / F)) 167 * 168 * Y = (2 ^ (-1)) * (2 ^ (-m)) * (2 ^ m) * A 169 * 170 * Now, range of Y is: 0.5 <= Y < 1 171 * 172 * F = 0x100 + (first 8 mantissa bits) + (9th mantissa bit) 173 * 174 * Now, range of F is: 256 <= F <= 512 175 * 176 * F = F / 512 177 * 178 * Now, range of F is: 0.5 <= F <= 1 179 * 180 * f = -(Y - F), with (f <= 2^(-10)) 181 * 182 * log(x) = m * log(2) + log(2) + log(F) + log(1 -(f / F)) 183 * 184 * log(x) = m * log(2) + log(2 * F) + log(1 - r) 185 * 186 * r = (f / F), with (r <= 2^(-9)) 187 * 188 * r = f * (1 / F) with (1 / F) precomputed to avoid division 189 * 190 * log(x) = m * log(2) + log(G) - poly 191 * 192 * log(G) is precomputed 193 * 194 * poly = (r + (r ^ 2) / 2 + (r ^ 3) / 3 195 * 196 * 2. 
Computation of e^(y * log(x)) 197 * e^x = 2 ^ (x/ln(2)) 198 * 199 * z = x/ln(2) 200 * 201 * z = n + r where n is the integer part of the number and r is the fraction 202 * 203 * 2^z = 2^n * 2^r 204 * 205 * 2^n can be calculated as (n + 1023) << 52 206 * 207 * 2^r is approximated by a polynomial approximated by sollya 208 * 209 * Polynomial Approximation: 210 * For More information refer to tools/sollya/vrs4_expf.sollya 211 * 212 * 213 */ 214 215 static inline v_f64x4_t look_table_access(const uint64_t* table, 216 const int vector_size, 217 v_u64x4_t indices) 218 { 219 uint64_t j; 220 v_f64x4_t ret; 221 for(int i = 0; i < vector_size; i++) { 222 j = indices[i]; 223 ret[i] = asdouble(table[j]); 224 } 225 return ret; 226 } 227 228 __m128 229 ALM_PROTO_OPT(vrs4_powf)(__m128 _x,__m128 _y) 230 { 231 232 v_u32x4_t u; 233 234 v_f32x4_t ret = _x; 235 236 u = as_v4_u32_f32(_x); 237 238 v_i32x4_t condition = (u - V_MIN >= V_MAX - V_MIN); 239 240 if(any_v4_u32_loop(condition)) { 241 242 ret[0] = SCALAR_POWF(_x[0], _y[0]); 243 ret[1] = SCALAR_POWF(_x[1], _y[1]); 244 ret[2] = SCALAR_POWF(_x[2], _y[2]); 245 ret[3] = SCALAR_POWF(_x[3], _y[3]); 246 247 return ret; 248 } 249 250 v_f64x4_t xd = _mm256_cvtps_pd(_x); 251 252 v_f64x4_t yd = _mm256_cvtps_pd(_y); 253 254 v_u64x4_t ux = as_v4_u64_f64(xd); 255 256 v_i32x4_t int_exponent = (u >> 23) - float_bias; 257 258 v_f64x4_t exponent = _mm256_cvtepi32_pd (int_exponent); 259 260 v_u64x4_t mant = ((ux & mantissa_bits) | one_by_two); 261 262 v_u64x4_t index = ux & mant_8_bits; 263 264 v_f64x4_t index_times_half = as_v4_f64_u64(index | one_by_two); 265 266 index = index >> (52 - N); 267 268 v_f64x4_t y1 = as_v4_f64_u64(mant); 269 270 v_f64x4_t f = index_times_half - y1; 271 272 v_f64x4_t F_INV, LOG_256, r; 273 274 /* Avoiding the use of vgatherpd instruction for performance reasons */ 275 276 F_INV = look_table_access(TAB_F_INV, VECTOR_LENGTH, index); 277 278 LOG_256 = look_table_access(TAB_LOG, VECTOR_LENGTH, index); 279 280 r = f * 
F_INV; 281 282 v_f64x4_t r2 = r * r; /* r^2 */ 283 284 v_f64x4_t temp = C2 + r * C3; 285 286 v_f64x4_t q = r + r2 * temp; 287 288 /* m*log(2) + log(G) - poly */ 289 290 temp = (exponent * LN2) + LOG_256; 291 292 temp -= q; 293 294 v_f64x4_t ylogx = temp * yd; 295 296 /* Calculate exp*/ 297 298 v_u64x4_t v = as_v4_u64_f64(ylogx); 299 300 v_i64x4_t condition2 = (v >= EXPF_MAX); 301 302 v_f64x4_t z = ylogx * INVLN2; 303 304 v_f64x4_t dn = z + EXPF_HUGE; 305 306 v_u64x4_t n = as_v4_u64_f64(dn); 307 308 dn = dn - EXPF_HUGE; 309 310 r = z - dn; 311 312 v_f64x4_t qtmp1 = D1 + (D2 * r); 313 314 v_f64x4_t qtmp2 = D3 + (D4 * r); 315 316 r2 = r * r; 317 318 v_f64x4_t qtmp3 = D5 + (D6 * r); 319 320 q = qtmp1 + r2 * qtmp2; 321 322 v_f64x4_t result = q + r2 * r2 * qtmp3; 323 324 ret = _mm256_cvtpd_ps(as_v4_f64_u64(as_v4_u64_f64(result) + (n << 52))); 325 326 if(any_v4_u64_loop(condition2)) { 327 328 v_f32x4_t x = _mm256_cvtpd_ps(ylogx); 329 330 v_i32x4_t inf_condition = x > EXPF_MAXF; 331 332 v_i32x4_t zero_condition = x < EXPF_MINF; 333 334 v_32x4 vx = {.f32x4 = ret}; 335 336 //Zero out the elements that have to be set to infinity 337 vx.i32x4 = vx.i32x4 & (~inf_condition); 338 339 inf_condition = inf_condition & INF; 340 341 vx.i32x4 = vx.i32x4 | inf_condition; 342 343 vx.i32x4 = vx.i32x4 & (~zero_condition); 344 345 ret = vx.f32x4; 346 } 347 348 return ret; 349 350 } 351 352