/ src / optmized / vec / vrs4_tanf.c
vrs4_tanf.c
  1  /*
  2   * Copyright (C) 2008-2020 Advanced Micro Devices, Inc. All rights reserved.
  3   *
  4   * Redistribution and use in source and binary forms, with or without modification,
  5   * are permitted provided that the following conditions are met:
  6   * 1. Redistributions of source code must retain the above copyright notice,
  7   *    this list of conditions and the following disclaimer.
  8   * 2. Redistributions in binary form must reproduce the above copyright notice,
  9   *    this list of conditions and the following disclaimer in the documentation
 10   *    and/or other materials provided with the distribution.
 11   * 3. Neither the name of the copyright holder nor the names of its contributors
 12   *    may be used to endorse or promote products derived from this software without
 13   *    specific prior written permission.
 14   *
 15   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 18   * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 19   * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 20   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 21   * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 22   * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 23   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 24   * POSSIBILITY OF SUCH DAMAGE.
 25   *
 26   */
 27  
 28  #include <stdint.h>
 29  
 30  #include <libm_macros.h>
 31  #include <libm/amd_funcs_internal.h>
 32  #include <libm/types.h>
 33  #include <libm/constants.h>
 34  #include <libm/typehelper-vec.h>
 35  #include <libm/compiler.h>
 36  #include <libm/poly-vec.h>
 37  
 38  extern float _tanf_special(float);
 39  
 40  /*
 41   * ISO-IEC-10967-2: Elementary Numerical Functions
 42   * Signature:
 43   *   float tanf(float x)
 44   *
 45   * Spec:
 46   *   tanf(n· 2π + π/4)  = 1       if n∈Z and |n· 2π + π/4|   <= big_angle_F
 47   *   tanf(n· 2π + 3π/4) = −1      if n∈Z and |n· 2π + 3π/4|  <= big angle rF
 48   *   tanf(x) = x                  if x ∈ F^(2·π) and tanf(x) != tan(x)
 49   *                                               and |x| < sqrt(epsilonF/rF)
 50   *   tanf(−x) = −tanf(x)          if x ∈ F^(2·π)
 51   */
 52  
 53  static const struct {
 54      v_u32x4_t    arg_max;
 55      v_u32x4_t    sign_mask;
 56      v_f32x4_t    huge;
 57      v_f32x4_t    invhalfpi, halfpi1, halfpi2, halfpi3;
 58      v_f32x4_t    poly_tanf[7];
 59  } v4_tanf_data = {
 60      .sign_mask = _MM_SET1_I32(1U<<31),
 61      .arg_max   = _MM_SET1_I32(0x49742400), /* 10^6 */
 62      .huge      = _MM_SET1_PS4(0x1.80000000p23f),
 63      .invhalfpi = _MM_SET1_PS4(0x1.45f306p-1f),
 64      .halfpi1   = _MM_SET1_PS4(-0x1.921fb6p0f),
 65      .halfpi2   = _MM_SET1_PS4(0x1.777a5cp-25f),
 66      .halfpi3   = _MM_SET1_PS4(0x1.ee59dap-50f),
 67  
 68      // Polynomial coefficients obtained using Remez algorithm from Sollya
 69      .poly_tanf = {
 70          _MM_SET1_PS4(0x1.555566p-2f),
 71          _MM_SET1_PS4(0x1.110cdp-3f),
 72          _MM_SET1_PS4(0x1.baf34p-5f),
 73          _MM_SET1_PS4(0x1.5bf38ep-6f),
 74          _MM_SET1_PS4(0x1.663acap-7f),
 75          _MM_SET1_PS4(-0x1.07c6f4p-16f),
 76          _MM_SET1_PS4(0x1.21cedap-8f),
 77      },
 78  
 79  };
 80  
 81  #define V4_SIMD_WIDTH        4
 82  #define ALM_TANF_HUGE_VAL    v4_tanf_data.huge
 83  #define ALM_TANF_HALFPI      v4_tanf_data.halfpi
 84  #define ALM_TANF_PI_HIGH     v4_tanf_data.pihi
 85  #define ALM_TANF_PI_LOW      v4_tanf_data.pilow
 86  #define ALM_TANF_INVHALFPI   v4_tanf_data.invhalfpi
 87  #define ALM_TANF_ARG_MAX     v4_tanf_data.arg_max
 88  #define ALM_TANF_SIGN_MASK32 v4_tanf_data.sign_mask
 89  
 90  #define  ALM_TANF_HALFPI1 v4_tanf_data.halfpi1
 91  #define  ALM_TANF_HALFPI2 v4_tanf_data.halfpi2
 92  #define  ALM_TANF_HALFPI3 v4_tanf_data.halfpi3
 93  
 94  #define C1 v4_tanf_data.poly_tanf[0]
 95  #define C2 v4_tanf_data.poly_tanf[1]
 96  #define C3 v4_tanf_data.poly_tanf[2]
 97  #define C4 v4_tanf_data.poly_tanf[3]
 98  #define C5 v4_tanf_data.poly_tanf[4]
 99  #define C6 v4_tanf_data.poly_tanf[5]
100  #define C7 v4_tanf_data.poly_tanf[6]
101  
102  extern float tanf_specialcase(float);
103  #if 0
104  extern float tanf_oddcase(float);
105  
106  static inline v_f32x4_t
107  vrs4_tanf_oddcase(v_f32x4_t _x, v_f32x4_t result, v_i32x4_t odd)
108  {
109      /* We send the result in this case as we have already computed them */
110      return call_v4_f32(tanf_oddcase, result, _x, odd);
111  }
112  #endif
113  
114  static inline v_f32x4_t
115  vrs4_tanf_specialcase(v_f32x4_t _x, v_f32x4_t result, v_i32x4_t cond)
116  {
117      return call_v4_f32(tanf_specialcase, _x, result, cond);
118  }
119  
120  /*
121   * Implementation Notes:
122   *
123   * float tanf(float x)
124   *      A given x is reduced into the form:
125   *
126   *               |x| = (N * π/2) + F
127   *
128   *      Where N is an integer obtained using:
129   *              N = round(x * 2/π)
130   *      And F is a fraction part lying in the interval
131   *              [-π/4, +π/4];
132   *
133   *      obtained as F = |x| - (N * π/2)
134   *
135   *      Thus tan(x) is given by
136   *
137   *              tan(x) = tan((N * π/2) + F) = tan(F)
138   *              when N is even,
139   *                     = -cot(F) = -1/tan(F)
140   *              when N is odd, tan(F) is approximated using a polynomial
141   *                      obtained from Remez approximation from Sollya.
142   *
143   */
144  
145  __m128
146  ALM_PROTO_OPT(vrs4_tanf)(__m128 xf32x4)
147  {
148      v_f32x4_t   F, xx;
149      v_f32x4_t   poly;
150      int32_t i;
151      v_u32x4_t   sign = {0}, n;
152      v_u32x4_t   ux = as_v4_u32_f32(xf32x4);
153  
154      v_i32x4_t  cond = (ux  & ~ALM_TANF_SIGN_MASK32) > ALM_TANF_ARG_MAX;
155  
156  	sign = ux & ALM_TANF_SIGN_MASK32;
157  
158      /* fabs(x) */
159      xx = as_v4_f32_u32(ux & ~ALM_TANF_SIGN_MASK32);
160  
161      /*
162       * dn = x * (2/π)
163       * would turn to fma
164       */
165  
166      v_f32x4_t nn =  xx * ALM_TANF_INVHALFPI + ALM_TANF_HUGE_VAL;
167  
168      // n = (int)dn /
169      n   = as_v4_u32_f32(nn);
170  
171      nn -= ALM_TANF_HUGE_VAL;
172  /*
173       v_f32x4_t nn =  xx * ALM_TANF_INVHALFPI;
174      for(int i =0;i<4;i++) {
175          n[i] = (nn[i] < 0)?(nn[i] - 0.5):(nn[i] + 0.5);
176          nn[i] = (float)n[i];
177      }
178  */
179      /* F = x - (n * π/2) */
180      F = xx + nn * ALM_TANF_HALFPI1;
181      F = F + nn * ALM_TANF_HALFPI2;
182      F = F + nn * ALM_TANF_HALFPI3;
183  
184      v_u32x4_t odd = n << 31;
185  
186      /*
187       * Calculate the polynomial approximation
188       *					x * (C1 + C2*x^2 + C3*x^4 + C4*x^6 + \
189       *									C5*x^8 + C6*x^10 + C7*x^12 + C8*x^14)
190       * polynomial is approximated as x*P(x^2)
191       */
192      poly = POLY_EVAL_ODD_15(F, C1, C2, C3, C4, C5, C6, C7);
193  
194      v_f32x4_t result = as_v4_f32_u32(as_v4_u32_f32(poly) ^ sign);
195  
196      /* if n is odd, result = -1.0/result */
197      for(i = 0; i < V4_SIMD_WIDTH; i++) {
198  
199          result[i] = odd[i] ? (-1.0f / result[i]) : result[i];
200  
201      }
202  
203      if (any_v4_u32_loop(cond)) {
204          result = vrs4_tanf_specialcase(xf32x4, result, cond);
205      }
206  
207      return result;
208  }