vrs4_tanf.c
1 /* 2 * Copyright (C) 2008-2020 Advanced Micro Devices, Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without modification, 5 * are permitted provided that the following conditions are met: 6 * 1. Redistributions of source code must retain the above copyright notice, 7 * this list of conditions and the following disclaimer. 8 * 2. Redistributions in binary form must reproduce the above copyright notice, 9 * this list of conditions and the following disclaimer in the documentation 10 * and/or other materials provided with the distribution. 11 * 3. Neither the name of the copyright holder nor the names of its contributors 12 * may be used to endorse or promote products derived from this software without 13 * specific prior written permission. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 19 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 20 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 21 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 22 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 * POSSIBILITY OF SUCH DAMAGE. 25 * 26 */ 27 28 #include <stdint.h> 29 30 #include <libm_macros.h> 31 #include <libm/amd_funcs_internal.h> 32 #include <libm/types.h> 33 #include <libm/constants.h> 34 #include <libm/typehelper-vec.h> 35 #include <libm/compiler.h> 36 #include <libm/poly-vec.h> 37 38 extern float _tanf_special(float); 39 40 /* 41 * ISO-IEC-10967-2: Elementary Numerical Functions 42 * Signature: 43 * float tanf(float x) 44 * 45 * Spec: 46 * tanf(n· 2π + π/4) = 1 if n∈Z and |n· 2π + π/4| <= big_angle_F 47 * tanf(n· 2π + 3π/4) = −1 if n∈Z and |n· 2π + 3π/4| <= big angle rF 48 * tanf(x) = x if x ∈ F^(2·π) and tanf(x) != tan(x) 49 * and |x| < sqrt(epsilonF/rF) 50 * tanf(−x) = −tanf(x) if x ∈ F^(2·π) 51 */ 52 53 static const struct { 54 v_u32x4_t arg_max; 55 v_u32x4_t sign_mask; 56 v_f32x4_t huge; 57 v_f32x4_t invhalfpi, halfpi1, halfpi2, halfpi3; 58 v_f32x4_t poly_tanf[7]; 59 } v4_tanf_data = { 60 .sign_mask = _MM_SET1_I32(1U<<31), 61 .arg_max = _MM_SET1_I32(0x49742400), /* 10^6 */ 62 .huge = _MM_SET1_PS4(0x1.80000000p23f), 63 .invhalfpi = _MM_SET1_PS4(0x1.45f306p-1f), 64 .halfpi1 = _MM_SET1_PS4(-0x1.921fb6p0f), 65 .halfpi2 = _MM_SET1_PS4(0x1.777a5cp-25f), 66 .halfpi3 = _MM_SET1_PS4(0x1.ee59dap-50f), 67 68 // Polynomial coefficients obtained using Remez algorithm from Sollya 69 .poly_tanf = { 70 _MM_SET1_PS4(0x1.555566p-2f), 71 _MM_SET1_PS4(0x1.110cdp-3f), 72 _MM_SET1_PS4(0x1.baf34p-5f), 73 _MM_SET1_PS4(0x1.5bf38ep-6f), 74 _MM_SET1_PS4(0x1.663acap-7f), 75 _MM_SET1_PS4(-0x1.07c6f4p-16f), 76 _MM_SET1_PS4(0x1.21cedap-8f), 77 }, 78 79 }; 80 81 #define V4_SIMD_WIDTH 4 82 #define ALM_TANF_HUGE_VAL v4_tanf_data.huge 83 #define ALM_TANF_HALFPI v4_tanf_data.halfpi 84 #define ALM_TANF_PI_HIGH v4_tanf_data.pihi 85 #define ALM_TANF_PI_LOW v4_tanf_data.pilow 86 #define ALM_TANF_INVHALFPI v4_tanf_data.invhalfpi 87 #define ALM_TANF_ARG_MAX v4_tanf_data.arg_max 88 #define ALM_TANF_SIGN_MASK32 v4_tanf_data.sign_mask 89 90 #define ALM_TANF_HALFPI1 v4_tanf_data.halfpi1 91 #define ALM_TANF_HALFPI2 v4_tanf_data.halfpi2 92 #define ALM_TANF_HALFPI3 v4_tanf_data.halfpi3 93 94 #define C1 v4_tanf_data.poly_tanf[0] 95 #define C2 v4_tanf_data.poly_tanf[1] 96 #define C3 v4_tanf_data.poly_tanf[2] 97 #define C4 v4_tanf_data.poly_tanf[3] 98 #define C5 v4_tanf_data.poly_tanf[4] 99 #define C6 v4_tanf_data.poly_tanf[5] 100 #define C7 v4_tanf_data.poly_tanf[6] 101 102 extern float tanf_specialcase(float); 103 #if 0 104 extern float tanf_oddcase(float); 105 106 static inline v_f32x4_t 107 vrs4_tanf_oddcase(v_f32x4_t _x, v_f32x4_t result, v_i32x4_t odd) 108 { 109 /* We send the result in this case as we have already computed them */ 110 return call_v4_f32(tanf_oddcase, result, _x, odd); 111 } 112 #endif 113 114 static inline v_f32x4_t 115 vrs4_tanf_specialcase(v_f32x4_t _x, v_f32x4_t result, v_i32x4_t cond) 116 { 117 return call_v4_f32(tanf_specialcase, _x, result, cond); 118 } 119 120 /* 121 * Implementation Notes: 122 * 123 * float tanf(float x) 124 * A given x is reduced into the form: 125 * 126 * |x| = (N * π/2) + F 127 * 128 * Where N is an integer obtained using: 129 * N = round(x * 2/π) 130 * And F is a fraction part lying in the interval 131 * [-π/4, +π/4]; 132 * 133 * obtained as F = |x| - (N * π/2) 134 * 135 * Thus tan(x) is given by 136 * 137 * tan(x) = tan((N * π/2) + F) = tan(F) 138 * when N is even, 139 * = -cot(F) = -1/tan(F) 140 * when N is odd, tan(F) is approximated using a polynomial 141 * obtained from Remez approximation from Sollya. 142 * 143 */ 144 145 __m128 146 ALM_PROTO_OPT(vrs4_tanf)(__m128 xf32x4) 147 { 148 v_f32x4_t F, xx; 149 v_f32x4_t poly; 150 int32_t i; 151 v_u32x4_t sign = {0}, n; 152 v_u32x4_t ux = as_v4_u32_f32(xf32x4); 153 154 v_i32x4_t cond = (ux & ~ALM_TANF_SIGN_MASK32) > ALM_TANF_ARG_MAX; 155 156 sign = ux & ALM_TANF_SIGN_MASK32; 157 158 /* fabs(x) */ 159 xx = as_v4_f32_u32(ux & ~ALM_TANF_SIGN_MASK32); 160 161 /* 162 * dn = x * (2/π) 163 * would turn to fma 164 */ 165 166 v_f32x4_t nn = xx * ALM_TANF_INVHALFPI + ALM_TANF_HUGE_VAL; 167 168 // n = (int)dn / 169 n = as_v4_u32_f32(nn); 170 171 nn -= ALM_TANF_HUGE_VAL; 172 /* 173 v_f32x4_t nn = xx * ALM_TANF_INVHALFPI; 174 for(int i =0;i<4;i++) { 175 n[i] = (nn[i] < 0)?(nn[i] - 0.5):(nn[i] + 0.5); 176 nn[i] = (float)n[i]; 177 } 178 */ 179 /* F = x - (n * π/2) */ 180 F = xx + nn * ALM_TANF_HALFPI1; 181 F = F + nn * ALM_TANF_HALFPI2; 182 F = F + nn * ALM_TANF_HALFPI3; 183 184 v_u32x4_t odd = n << 31; 185 186 /* 187 * Calculate the polynomial approximation 188 * x * (C1 + C2*x^2 + C3*x^4 + C4*x^6 + \ 189 * C5*x^8 + C6*x^10 + C7*x^12 + C8*x^14) 190 * polynomial is approximated as x*P(x^2) 191 */ 192 poly = POLY_EVAL_ODD_15(F, C1, C2, C3, C4, C5, C6, C7); 193 194 v_f32x4_t result = as_v4_f32_u32(as_v4_u32_f32(poly) ^ sign); 195 196 /* if n is odd, result = -1.0/result */ 197 for(i = 0; i < V4_SIMD_WIDTH; i++) { 198 199 result[i] = odd[i] ? (-1.0f / result[i]) : result[i]; 200 201 } 202 203 if (any_v4_u32_loop(cond)) { 204 result = vrs4_tanf_specialcase(xf32x4, result, cond); 205 } 206 207 return result; 208 }