log1pf.c
1 /* 2 * Copyright (C) 2008-2020 Advanced Micro Devices, Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without modification, 5 * are permitted provided that the following conditions are met: 6 * 1. Redistributions of source code must retain the above copyright notice, 7 * this list of conditions and the following disclaimer. 8 * 2. Redistributions in binary form must reproduce the above copyright notice, 9 * this list of conditions and the following disclaimer in the documentation 10 * and/or other materials provided with the distribution. 11 * 3. Neither the name of the copyright holder nor the names of its contributors 12 * may be used to endorse or promote products derived from this software without 13 * specific prior written permission. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 19 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 20 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 21 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 22 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 * POSSIBILITY OF SUCH DAMAGE. 25 * 26 */ 27 28 #include "libm_amd.h" 29 #include "libm_util_amd.h" 30 #include "libm_special.h" 31 32 #undef _FUNCNAME 33 #define _FUNCNAME "log1pf" 34 35 float FN_PROTOTYPE_BAS64(log1pf)(float x) 36 { 37 38 int xexp; 39 double dx, r, f, f1, f2, q, u, v, z1, z2, poly, m2; 40 int index; 41 unsigned int ux, ax; 42 unsigned long long lux; 43 44 /* 45 Computes natural log(1+x) for float arguments. Algorithm is 46 basically a promotion of the arguments to double followed 47 by an inlined version of the double algorithm, simplified 48 for efficiency (see log1p_amd.c). Simplifications include: 49 * Special algorithm for arguments near 0.0 not required 50 * Scaling of denormalised arguments not required 51 * Shorter core series approximations used 52 Note that we use a lookup table of size 64 rather than 128, 53 and compensate by having extra terms in the minimax polynomial 54 for the kernel approximation. 55 */ 56 57 /* Arrays ln_lead_table and ln_tail_table contain 58 leading and trailing parts respectively of precomputed 59 values of natural log(1+i/64), for i = 0, 1, ..., 64. 60 ln_lead_table contains the first 24 bits of precision, 61 and ln_tail_table contains a further 53 bits precision. */ 62 63 static const double ln_lead_table[65] = { 64 0.00000000000000000000e+00, /* 0x0000000000000000 */ 65 1.55041813850402832031e-02, /* 0x3f8fc0a800000000 */ 66 3.07716131210327148438e-02, /* 0x3f9f829800000000 */ 67 4.58095073699951171875e-02, /* 0x3fa7745800000000 */ 68 6.06245994567871093750e-02, /* 0x3faf0a3000000000 */ 69 7.52233862876892089844e-02, /* 0x3fb341d700000000 */ 70 8.96121263504028320312e-02, /* 0x3fb6f0d200000000 */ 71 1.03796780109405517578e-01, /* 0x3fba926d00000000 */ 72 1.17783010005950927734e-01, /* 0x3fbe270700000000 */ 73 1.31576299667358398438e-01, /* 0x3fc0d77e00000000 */ 74 1.45181953907012939453e-01, /* 0x3fc2955280000000 */ 75 1.58604979515075683594e-01, /* 0x3fc44d2b00000000 */ 76 1.71850204467773437500e-01, /* 0x3fc5ff3000000000 */ 77 1.84922337532043457031e-01, /* 0x3fc7ab8900000000 */ 78 1.97825729846954345703e-01, /* 0x3fc9525a80000000 */ 79 2.10564732551574707031e-01, /* 0x3fcaf3c900000000 */ 80 2.23143517971038818359e-01, /* 0x3fcc8ff780000000 */ 81 2.35566020011901855469e-01, /* 0x3fce270700000000 */ 82 2.47836112976074218750e-01, /* 0x3fcfb91800000000 */ 83 2.59957492351531982422e-01, /* 0x3fd0a324c0000000 */ 84 2.71933674812316894531e-01, /* 0x3fd1675c80000000 */ 85 2.83768117427825927734e-01, /* 0x3fd22941c0000000 */ 86 2.95464158058166503906e-01, /* 0x3fd2e8e280000000 */ 87 3.07025015354156494141e-01, /* 0x3fd3a64c40000000 */ 88 3.18453729152679443359e-01, /* 0x3fd4618bc0000000 */ 89 3.29753279685974121094e-01, /* 0x3fd51aad80000000 */ 90 3.40926527976989746094e-01, /* 0x3fd5d1bd80000000 */ 91 3.51976394653320312500e-01, /* 0x3fd686c800000000 */ 92 3.62905442714691162109e-01, /* 0x3fd739d7c0000000 */ 93 3.73716354370117187500e-01, /* 0x3fd7eaf800000000 */ 94 3.84411692619323730469e-01, /* 0x3fd89a3380000000 */ 95 3.94993782043457031250e-01, /* 0x3fd9479400000000 */ 96 4.05465066432952880859e-01, /* 0x3fd9f323c0000000 */ 97 4.15827870368957519531e-01, /* 0x3fda9cec80000000 */ 98 4.26084339618682861328e-01, /* 0x3fdb44f740000000 */ 99 4.36236739158630371094e-01, /* 0x3fdbeb4d80000000 */ 100 4.46287095546722412109e-01, /* 0x3fdc8ff7c0000000 */ 101 4.56237375736236572266e-01, /* 0x3fdd32fe40000000 */ 102 4.66089725494384765625e-01, /* 0x3fddd46a00000000 */ 103 4.75845873355865478516e-01, /* 0x3fde744240000000 */ 104 4.85507786273956298828e-01, /* 0x3fdf128f40000000 */ 105 4.95077252388000488281e-01, /* 0x3fdfaf5880000000 */ 106 5.04556000232696533203e-01, /* 0x3fe02552a0000000 */ 107 5.13945698738098144531e-01, /* 0x3fe0723e40000000 */ 108 5.23248136043548583984e-01, /* 0x3fe0be72e0000000 */ 109 5.32464742660522460938e-01, /* 0x3fe109f380000000 */ 110 5.41597247123718261719e-01, /* 0x3fe154c3c0000000 */ 111 5.50647079944610595703e-01, /* 0x3fe19ee6a0000000 */ 112 5.59615731239318847656e-01, /* 0x3fe1e85f40000000 */ 113 5.68504691123962402344e-01, /* 0x3fe23130c0000000 */ 114 5.77315330505371093750e-01, /* 0x3fe2795e00000000 */ 115 5.86049020290374755859e-01, /* 0x3fe2c0e9e0000000 */ 116 5.94707071781158447266e-01, /* 0x3fe307d720000000 */ 117 6.03290796279907226562e-01, /* 0x3fe34e2880000000 */ 118 6.11801505088806152344e-01, /* 0x3fe393e0c0000000 */ 119 6.20240390300750732422e-01, /* 0x3fe3d90260000000 */ 120 6.28608644008636474609e-01, /* 0x3fe41d8fe0000000 */ 121 6.36907458305358886719e-01, /* 0x3fe4618bc0000000 */ 122 6.45137906074523925781e-01, /* 0x3fe4a4f840000000 */ 123 6.53301239013671875000e-01, /* 0x3fe4e7d800000000 */ 124 6.61398470401763916016e-01, /* 0x3fe52a2d20000000 */ 125 6.69430613517761230469e-01, /* 0x3fe56bf9c0000000 */ 126 6.77398800849914550781e-01, /* 0x3fe5ad4040000000 */ 127 6.85303986072540283203e-01, /* 0x3fe5ee02a0000000 */ 128 6.93147122859954833984e-01}; /* 0x3fe62e42e0000000 */ 129 130 static const double ln_tail_table[65] = { 131 0.00000000000000000000e+00, /* 0x0000000000000000 */ 132 5.15092497094772879206e-09, /* 0x3e361f807c79f3db */ 133 4.55457209735272790188e-08, /* 0x3e6873c1980267c8 */ 134 2.86612990859791781788e-08, /* 0x3e5ec65b9f88c69e */ 135 2.23596477332056055352e-08, /* 0x3e58022c54cc2f99 */ 136 3.49498983167142274770e-08, /* 0x3e62c37a3a125330 */ 137 3.23392843005887000414e-08, /* 0x3e615cad69737c93 */ 138 1.35722380472479366661e-08, /* 0x3e4d256ab1b285e9 */ 139 2.56504325268044191098e-08, /* 0x3e5b8abcb97a7aa2 */ 140 5.81213608741512136843e-08, /* 0x3e6f34239659a5dc */ 141 5.59374849578288093334e-08, /* 0x3e6e07fd48d30177 */ 142 5.06615629004996189970e-08, /* 0x3e6b32df4799f4f6 */ 143 5.24588857848400955725e-08, /* 0x3e6c29e4f4f21cf8 */ 144 9.61968535632653505972e-10, /* 0x3e1086c848df1b59 */ 145 1.34829655346594463137e-08, /* 0x3e4cf456b4764130 */ 146 3.65557749306383026498e-08, /* 0x3e63a02ffcb63398 */ 147 3.33431709374069198903e-08, /* 0x3e61e6a6886b0976 */ 148 5.13008650536088382197e-08, /* 0x3e6b8abcb97a7aa2 */ 149 5.09285070380306053751e-08, /* 0x3e6b578f8aa35552 */ 150 3.20853940845502057341e-08, /* 0x3e6139c871afb9fc */ 151 4.06713248643004200446e-08, /* 0x3e65d5d30701ce64 */ 152 5.57028186706125221168e-08, /* 0x3e6de7bcb2d12142 */ 153 5.48356693724804282546e-08, /* 0x3e6d708e984e1664 */ 154 1.99407553679345001938e-08, /* 0x3e556945e9c72f36 */ 155 1.96585517245087232086e-09, /* 0x3e20e2f613e85bda */ 156 6.68649386072067321503e-09, /* 0x3e3cb7e0b42724f6 */ 157 5.89936034642113390002e-08, /* 0x3e6fac04e52846c7 */ 158 2.85038578721554472484e-08, /* 0x3e5e9b14aec442be */ 159 5.09746772910284482606e-08, /* 0x3e6b5de8034e7126 */ 160 5.54234668933210171467e-08, /* 0x3e6dc157e1b259d3 */ 161 6.29100830926604004874e-09, /* 0x3e3b05096ad69c62 */ 162 2.61974119468563937716e-08, /* 0x3e5c2116faba4cdd */ 163 4.16752115011186398935e-08, /* 0x3e665fcc25f95b47 */ 164 2.47747534460820790327e-08, /* 0x3e5a9a08498d4850 */ 165 5.56922172017964209793e-08, /* 0x3e6de647b1465f77 */ 166 2.76162876992552906035e-08, /* 0x3e5da71b7bf7861d */ 167 7.08169709942321478061e-09, /* 0x3e3e6a6886b09760 */ 168 5.77453510221151779025e-08, /* 0x3e6f0075eab0ef64 */ 169 4.43021445893361960146e-09, /* 0x3e33071282fb989b */ 170 3.15140984357495864573e-08, /* 0x3e60eb43c3f1bed2 */ 171 2.95077445089736670973e-08, /* 0x3e5faf06ecb35c84 */ 172 1.44098510263167149349e-08, /* 0x3e4ef1e63db35f68 */ 173 1.05196987538551827693e-08, /* 0x3e469743fb1a71a5 */ 174 5.23641361722697546261e-08, /* 0x3e6c1cdf404e5796 */ 175 7.72099925253243069458e-09, /* 0x3e4094aa0ada625e */ 176 5.62089493829364197156e-08, /* 0x3e6e2d4c96fde3ec */ 177 3.53090261098577946927e-08, /* 0x3e62f4d5e9a98f34 */ 178 3.80080516835568242269e-08, /* 0x3e6467c96ecc5cbe */ 179 5.66961038386146408282e-08, /* 0x3e6e7040d03dec5a */ 180 4.42287063097349852717e-08, /* 0x3e67bebf4282de36 */ 181 3.45294525105681104660e-08, /* 0x3e6289b11aeb783f */ 182 2.47132034530447431509e-08, /* 0x3e5a891d1772f538 */ 183 3.59655343422487209774e-08, /* 0x3e634f10be1fb591 */ 184 5.51581770357780862071e-08, /* 0x3e6d9ce1d316eb93 */ 185 3.60171867511861372793e-08, /* 0x3e63562a19a9c442 */ 186 1.94511067964296180547e-08, /* 0x3e54e2adf548084c */ 187 1.54137376631349347838e-08, /* 0x3e508ce55cc8c97a */ 188 3.93171034490174464173e-09, /* 0x3e30e2f613e85bda */ 189 5.52990607758839766440e-08, /* 0x3e6db03ebb0227bf */ 190 3.29990737637586136511e-08, /* 0x3e61b75bb09cb098 */ 191 1.18436010922446096216e-08, /* 0x3e496f16abb9df22 */ 192 4.04248680368301346709e-08, /* 0x3e65b3f399411c62 */ 193 2.27418915900284316293e-08, /* 0x3e586b3e59f65355 */ 194 1.70263791333409206020e-08, /* 0x3e52482ceae1ac12 */ 195 5.76999904754328540596e-08}; /* 0x3e6efa39ef35793c */ 196 197 static const double 198 log2 = 6.931471805599453e-01, /* 0x3fe62e42fefa39ef */ 199 200 /* Approximating polynomial coefficients */ 201 cb_1 = 8.33333333333333593622e-02, /* 0x3fb5555555555557 */ 202 cb_2 = 1.24999999978138668903e-02; /* 0x3f89999999865ede */ 203 204 GET_BITS_SP32(x, ux); 205 ax = ux & ~SIGNBIT_SP32; 206 207 if ((ux & EXPBITS_SP32) == EXPBITS_SP32) 208 { 209 /* x is either NaN or infinity */ 210 if (ux & MANTBITS_SP32) 211 { 212 /* x is NaN */ 213 #ifdef WINDOWS 214 return __amd_handle_errorf(_FUNCNAME, __amd_log1p, ux|0x00400000, _DOMAIN, AMD_F_NONE, EDOM, x, 0.0, 1); 215 #else 216 return x + x; /* Raise invalid if it is a signalling NaN */ 217 #endif 218 } 219 else 220 { 221 /* x is infinity */ 222 if (ux & SIGNBIT_SP32) 223 { 224 /* x is negative infinity. Return a NaN. */ 225 return __amd_handle_errorf(_FUNCNAME, __amd_log1p, INDEFBITPATT_SP32, _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1); 226 } 227 else 228 return x; 229 } 230 } 231 else if (ux >= 0xbf800000) 232 { 233 /* x <= -1.0 */ 234 if (ux > 0xbf800000) 235 { 236 /* x is less than -1.0. Return a NaN. */ 237 return __amd_handle_errorf(_FUNCNAME, __amd_log1p, INDEFBITPATT_SP32, _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1); 238 } 239 else 240 { 241 /* x is exactly -1.0. Return -infinity with div-by-zero flag. */ 242 return __amd_handle_errorf(_FUNCNAME, __amd_log1p, NINFBITPATT_SP32, _SING, AMD_F_DIVBYZERO, ERANGE, x, 0.0, 1); 243 244 } 245 } 246 else if (ax < 0x33800000) 247 { 248 if (ax == 0x00000000) 249 { 250 /* x is +/-zero. Return the same zero. */ 251 return x; 252 } 253 else 254 /* abs(x) is less than float epsilon. Return x with inexact. */ 255 return x; 256 } 257 258 dx = x; 259 /* 260 First, we decompose the argument dx to the form 261 1 + dx = 2**M * (F1 + F2), 262 where 1 <= F1+F2 < 2, M has the value of an integer, 263 F1 = 1 + j/64, j ranges from 0 to 64, and |F2| <= 1/128. 264 265 Second, we approximate log( 1 + F2/F1 ) by an odd polynomial 266 in U, where U = 2 F2 / (2 F2 + F1). 267 Note that log( 1 + F2/F1 ) = log( 1 + U/2 ) - log( 1 - U/2 ). 268 The core approximation calculates 269 Poly = [log( 1 + U/2 ) - log( 1 - U/2 )]/U - 1. 270 Note that log(1 + U/2) - log(1 - U/2) = 2 arctanh ( U/2 ), 271 thus, Poly = 2 arctanh( U/2 ) / U - 1. 272 273 It is not hard to see that 274 log(dx) = M*log(2) + log(F1) + log( 1 + F2/F1 ). 275 Hence, we return Z1 = log(F1), and Z2 = log( 1 + F2/F1). 276 The values of log(F1) are calculated beforehand and stored 277 in the program. 278 */ 279 280 f = 1.0 + dx; 281 GET_BITS_DP64(f, lux); 282 283 /* Store the exponent of f = 1 + dx in xexp and put 284 f into the range [1.0,2.0) */ 285 xexp = (int)((lux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64; 286 PUT_BITS_DP64((lux & MANTBITS_DP64) | ONEEXPBITS_DP64, f); 287 288 /* Now (1+dx) = 2**(xexp) * f, 1 <= f < 2. */ 289 290 /* Set index to be the nearest integer to 64*f */ 291 /* 64 <= index <= 128 */ 292 /* 293 r = 64.0 * f; 294 index = (int)(r + 0.5); 295 */ 296 /* This code instead of the above can save several cycles. 297 It only works because 64 <= r < 128, so 298 the nearest integer is always contained in exactly 299 7 bits, and the right shift is always the same. */ 300 index = (int)((((lux & 0x000fc00000000000) | 0x0010000000000000) >> 46) 301 + ((lux & 0x0000200000000000) >> 45)); 302 303 f1 = index * 0.015625; /* 0.015625 = 1/64 */ 304 index -= 64; 305 306 /* Now take great care to compute f2 such that f1 + f2 = f */ 307 if (xexp <= -2 || xexp >= MANTLENGTH_DP64 + 8) 308 { 309 f2 = f - f1; 310 } 311 else 312 { 313 /* Create the number m2 = 2.0^(-xexp) */ 314 lux = (unsigned long long)(0x3ff - xexp) << EXPSHIFTBITS_DP64; 315 PUT_BITS_DP64(lux,m2); 316 if (xexp <= MANTLENGTH_DP64 - 1) 317 { 318 f2 = (m2 - f1) + m2*dx; 319 } 320 else 321 { 322 f2 = (m2*dx - f1) + m2; 323 } 324 } 325 326 /* At this point, dx = 2**xexp * ( f1 + f2 ) where 327 f1 = j/64, j = 1, 2, ..., 64 and |f2| <= 1/128. */ 328 329 z1 = ln_lead_table[index]; 330 q = ln_tail_table[index]; 331 332 /* Calculate u = 2 f2 / ( 2 f1 + f2 ) = f2 / ( f1 + 0.5*f2 ) */ 333 u = f2 / (f1 + 0.5 * f2); 334 335 /* Here, |u| <= 2(exp(1/16)-1) / (exp(1/16)+1). 336 The core approximation calculates 337 poly = [log(1 + u/2) - log(1 - u/2)]/u - 1 */ 338 v = u * u; 339 poly = (v * (cb_1 + v * cb_2)); 340 z2 = q + (u + u * poly); 341 342 /* Now z1,z2 is an extra-precise approximation of log(f). */ 343 344 /* Add xexp * log(2) to z1,z2 to get the result log(1+x). */ 345 r = xexp * log2 + z1 + z2; 346 /* Natural log(1+x) */ 347 return (float)r; 348 } 349 350