/ include / libm / poly-vec.h
poly-vec.h
  1  /*
  2   * Copyright (C) 2018-2020, Advanced Micro Devices, Inc. All rights reserved.
  3   *
  4   * Redistribution and use in source and binary forms, with or without modification,
  5   * are permitted provided that the following conditions are met:
  6   * 1. Redistributions of source code must retain the above copyright notice,
  7   *    this list of conditions and the following disclaimer.
  8   * 2. Redistributions in binary form must reproduce the above copyright notice,
  9   *    this list of conditions and the following disclaimer in the documentation
 10   *    and/or other materials provided with the distribution.
 11   * 3. Neither the name of the copyright holder nor the names of its contributors
 12   *    may be used to endorse or promote products derived from this software without
 13   *    specific prior written permission.
 14   *
 15   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 18   * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 19   * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 20   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 21   * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 22   * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 23   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 24   * POSSIBILITY OF SUCH DAMAGE.
 25   *
 26   */
 27  
 28  
 29  #ifndef __LIBM_POLY_VEC_H__
 30  #define __LIBM_POLY_VEC_H__
 31  
 32  #if defined(AMD_LIBM_FMA_USABLE)
 33  
 34  #define mul_add(x, y, z)                                        \
 35          _Generic((x),                                           \
 36                   float  : _mm_fmadd_ss,                         \
 37                   double : _mm_fmadd_sd,                         \
 38                   __m128 : _mm_fmadd_ps,                         \
 39                   __m128d: _mm_fmadd_pd,                         \
 40                   __m256 : _mm256_fmadd_ps,                      \
 41                   __m256d: _mm256_fmadd_pd,                      \
 42                   __m512 : _mm512_fmadd_ps,                      \
 43                   __m512d: _mm512_fmadd_pd)((x), (y), (z))
 44  
 45  #else /* ! FMA_USABLE */
 46  
 47  #define no_fma_mul(a, b)                                                \
 48          _Generic((a),                                                   \
 49                   float  : _mm_mul_ss,                                   \
 50                   double : _mm_mul_sd,                                   \
 51                   __m128 : _mm_mul_ps,                                   \
 52                   __m128d: _mm_mul_pd,                                   \
 53                   __m256 : _mm256_mul_ps,                                \
 54                   __m256d: _mm256_mul_pd,                                \
 55                   __m512 : _mm512_mul_ps,                                \
 56                   __m512d: _mm512_mul_pd)((a), (b))
 57  
 58  #define mul_add(x, y, z)                                                \
 59          _Generic((x),                                                   \
 60                   float  : _mm_add_ss,                                   \
 61                   double : _mm_add_sd,                                   \
 62                   __m128 : _mm_add_ps,                                   \
 63                   __m128d: _mm_add_pd,                                   \
 64                   __m256 : _mm256_add_ps,                                \
 65                   __m256d: _mm256_add_pd,                                \
 66                   __m512 : _mm512_add_ps,                                \
 67                   __m512d: _mm512_add_pd)(no_fma_mul((x), (y)), (z))
 68  
 69  #endif  /* FMA_USABLE */
 70  
 71  /*
 72   *  Estrins Scheme Polynomial evaluation
 73   */
 74  
 75  /*
 76   * p20(x),
 77   * for special polynomial, assumes c0=0.0 and c1=1.0, rename to POLY_EVAL_20_1
 78   *								\
 79   * p1 = c2 + c3 * x;
 80   * p2 = c4 + c5 * x;
 81   * p3 = c6 + c7 * x;
 82   * p4 = c8 + c9 * x;
 83   * p5 = c10 + c11* x;
 84   * p6 = c12 + c13 * x;
 85   * p7 = c14 + c15 * x;
 86   * p8 = c16 + c17 * x;
 87   * p9 = c18 + c19 * x;
 88   *
 89   */
 90  #define POLY_EVAL_20(x, c0, c1, c2, c3, c4, c5, c6, c7,                 \
 91                       c8, c9, c10,                                       \
 92                       c11, c12, c13, c14, c15, c16, c17,                 \
 93                       c18, c19, c20) ({                                  \
 94      __typeof(x) x2 = x  * x;                                            \
 95      __typeof(x) x4 = x2 * x2;                                           \
 96      __typeof(x) x8 = x4 * x4;                                           \
 97      __typeof(x) x16= x8 * x8;                                           \
 98      __typeof(x) q;                                                      \
 99      __typeof(x) q1, q2, q3, q4, q5;                                     \
100      __typeof(x) r1, r2, r3;                                             \
101      __typeof(x) p10 = c20 * x4;                                         \
102                                                                          \
103      /*q1 = x + x2 * p1; */                                              \
104      q1 = mul_add(mul_add(c3, x, c2),                                    \
105                   x2,                                                    \
106                   mul_add(c1, x, c0));                                   \
107                                                                          \
108      /* q2 = p2 + x2 * p3; */                                            \
109      q2 = mul_add(mul_add(c7, x, c6),                                    \
110                   x2,                                                    \
111                   mul_add(c5, x, c4));                                   \
112                                                                          \
113      /*q3 = p4 + x2 * p5; */                                             \
114      q3 = mul_add(mul_add(c11, x, c10),                                  \
115                   x2,                                                    \
116                   mul_add(c9, x, c8));                                   \
117                                                                          \
118      /* q4 = p6 + x2 * p7; */                                            \
119      q4 = mul_add(mul_add(c15, x, c14),                                  \
120                   x2,                                                    \
121                   mul_add(c13, x, c12));                                 \
122                                                                          \
123      /* q5 = p8 + x2 * p9; */                                            \
124      q5 = mul_add(mul_add(c19, x, c18),                                  \
125                   x2,                                                    \
126                   mul_add(c17, x, c16));                                 \
127                                                                          \
128      r1 = q1 + x4 * q2;                                                  \
129      r2 = x8 * (q3 + x4 * q4);                                           \
130      r3 = x16 * (q5 + p10);                                              \
131                                                                          \
132      q = r1 + r2 + r3;                                                   \
133      q;                                                                  \
134      })
135  
136  /*
137   * p(x) = C1 + C2*r + C3*r^2 + C4*r^3 + C5*r^4 + C6*r^5 +
138   *          C7*r^6 + C8*r^7 + C9*r^8 + C10*r^9 + C11*r^10 + C12*r^11
139   *      = (C1 + C2*r) + r^2(C3 + C4*r) + r^4(C5 + C6*r) +
140   *           r^6(C7 + C8*r) + r^8(C9 + C10*r) + r^10(C11 + C12*r)
141   */
142  #define POLY_EVAL_11(x, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11) ({ \
143          __typeof(x) x2 = x * x;                                              \
144          __typeof(x) x4 = x2 * x2;                                            \
145          __typeof(x) x8 = x4 * x4;                                            \
146          __typeof(x) q = mul_add( mul_add ( mul_add(c11, x, c10), x2,         \
147                                             mul_add(c9 ,x ,c8)), x8,          \
148                                   mul_add ( mul_add(mul_add(c7, x, c6), x2,   \
149                                             mul_add(c5, x, c4)) ,x4,          \
150                                             mul_add( mul_add(c3, x, c2),x2,   \
151                                             mul_add(c0, x, c1)) ));           \
152           q;                                                                  \
153           })
154  
155  
156  
157  /*
158   * p(x) = c10*x^10 + c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + \
159   *                      c3*x^3 + c2*x^2 + c1*x + c0
160   *      = (((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8+c9*x+c10*x2)*x8) + \
161   *                      ((c2+c3*x)*x2 + (c0+c1*x));
162   */
163  #define POLY_EVAL_10(x, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10) ({ \
164          __typeof(x) x2 = x  * x;                                        \
165          __typeof(x) x4 = x2 * x2;                                       \
166          __typeof(x) x8 = x4 * x4;                                       \
167          __typeof(x) q =  mul_add(mul_add(x2, c10, mul_add(c9, x, c8)),  \
168                                  x8,                                     \
169                                  mul_add(mul_add(mul_add(c7, x, c6),     \
170                                                  x2,                     \
171                                                  mul_add(c5, x, c4)),    \
172                                          x4,                             \
173                                          mul_add(mul_add(c3, x, c2),     \
174                                                  x2,                     \
175                                                  mul_add(c1, x, c0))));	\
176          q;                                                              \
177          })
178  
179  /*
180   * p(x) = c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 +
181   *                      c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
182   *      = ((c6+c7*x)*x2 + (c4+c5*x))*x4 +
183   *                      (c8+c9*x)*x8) +
184   *                      ((c2+c3*x)*x2 + (c0+c1*x));
185   */
186  #define POLY_EVAL_9(x, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) ({       \
187      __typeof(x) x2 = x  * x;                                            \
188      __typeof(x) x4 = x2 * x2;                                           \
189      __typeof(x) x8 = x4 * x4;                                           \
190      __typeof(x) q = mul_add(mul_add(c9, x, c8),                         \
191                              x8,                                         \
192                              mul_add(mul_add(mul_add(c7, x, c6),         \
193                                              x2,                         \
194                                              mul_add(c5, x, c4)),        \
195                                      x4,                                 \
196                                      mul_add(mul_add(c3, x, c2),         \
197                                              x2,                         \
198                                              mul_add(c1, x, c0))));      \
199          q;                                                              \
200      })
201  
202  /*
203   * p(x) = c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
204   *      = ((c6+c7*x)*x2 + (c4+c5*x))*x4 +
205   *                 (c8*x8 + (c2+c3*x)*x2 +  (c0+c1*x))
206   */
207  #define POLY_EVAL_8(x, c0, c1, c2, c3, c4, c5, c6, c7, c8) ({           \
208          __typeof(x) x2 = x * x;                                         \
209          __typeof(x) x4 = x2 * x2;                                       \
210          __typeof(x) x8 = x4 * x4;                                       \
211          __typeof(x) q = mul_add(mul_add(mul_add(c7, x, c6),             \
212                                          x2,                             \
213                                          mul_add(c5, x, c4)),            \
214                                  x4,                                     \
215                                  mul_add(mul_add(c3, x, c2),             \
216                                          x2,                             \
217                                          mul_add(c1, x, c0) + c8*x8));   \
218          q;                                                              \
219          })
220  
221  
222  /*
223   * p(x) = c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
224   *      = ((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + (c0+c1*x))
225   */
226  
227  #define POLY_EVAL_7(x, c0, c1, c2, c3, c4, c5, c6, c7) ({               \
228              __typeof(x) x2 = x * x;                                     \
229              __typeof(x) x4 = x2 * x2;                                   \
230              __typeof(x) q = mul_add(mul_add(mul_add(c7, x, c6),         \
231                                              x2,                         \
232                                              mul_add(c5, x, c4)),        \
233                                      x4,                                 \
234                                      mul_add(mul_add(c3, x, c2),         \
235                                              x2,                         \
236                                              mul_add(c1, x, c0)));       \
237              q;                                                          \
238          })
239  
240  /*
241   * p(x) = 1*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
242   *      = (c4+c5*x+x2)*x4 + ((c2+c3*x)*x2 + (c0+c1*x));
243   */
244  #define POLY_EVAL_6(x, c0, c1, c2, c3, c4, c5, c6) ({                   \
245              __typeof(x) x2 = x * x;                                     \
246              __typeof(x) x4 = x2 * x2;                                   \
247              __typeof(x) q = mul_add(mul_add(c6,                         \
248                                              x2,                         \
249                                              mul_add(c5, x, c4)),        \
250                                      x4,                                 \
251                                      mul_add(mul_add(c3, x, c2),         \
252                                              x2,                         \
253                                              mul_add(c1, x, c0)));       \
254              q;                                                          \
255          })
256  
257  /*
258   * p(x) = c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
259   *      = (c2+c3*x)*x2 + ((c4+c5*x)*x4 + (c0+c1*x))
260   */
261  #define POLY_EVAL_5(x, c0, c1, c2, c3, c4, c5) ({                       \
262              __typeof(x) x2 = x * x;                                     \
263              __typeof(x) x4 = x2 * x2;                                   \
264              __typeof(x) q = mul_add(mul_add(c3, x, c2),                 \
265                                      x2,                                 \
266                                      mul_add(mul_add(c5, x, c4),         \
267                                              x4,                         \
268                                              mul_add(c1, x, c0)));       \
269              q;                                                          \
270          })
271  
272  /*
273   * p(x) = c1*x^3 + c0*x^2 + x
274   *      = (c0+c1*x)*x^2 + x
275   */
276  
277  #define POLY_EVAL_1(x, c0, c1) ({                                       \
278              __typeof(x) x2 = x * x;                                     \
279              __typeof(x) q = mul_add(mul_add(c1, x, c0),                 \
280                                      x2,                                 \
281                                      x);                                 \
282              q;                                                          \
283          })
284  
285  /*
286   * Polynomial of degree 15,
287   *      - uses only odd terms and
288   *      - C0 = 0
289   * p(x) = (c7*x^15 + c6*x^13 + c5*x^11 + c4*x^9 + c3*x^7 + c2*x^5 + c1*x^3) + x
290   *      = (c7*x^12 + c6*x^10 + c5*x^8 + c4*x^6 + c3*x^4 + c2*x^2 + c1) * \
291   *                      x^2 *x + x
292   */
293  #define POLY_EVAL_ODD_15(x, c1, c2, c3, c4, c5, c6, c7) ({          \
294              __typeof(x) x2  = x * x;                                \
295              __typeof(x) x4  = x2 * x2;                              \
296              __typeof(x) x8  = x4 * x4;                              \
297              __typeof(x) q = mul_add(mul_add(c7,                     \
298                                              x4,                     \
299                                              mul_add(c6, x2, c5)),   \
300                                      x8,                             \
301                                      mul_add(mul_add(c4, x2, c3),    \
302                                              x4,                     \
303                                              mul_add(c2, x2, c1)));  \
304              x + ((q * x2) * x) ;                                  \
305          })
306  
307  /*
308   * Polynomial of degree 15,
309   *      - uses only even terms and
310   *      - C0 = 0
311   * p(x) = (c8*x^14 + c7*x^12 + c6*x^10 + c5*x^8 + c4*x^6 + c3*x^4 + c2*x^2 + c1)*x
312   *      = ((c7+c8*x2)*x4 + (c5+c6*x2))*x8 +
313   *                 (c3+c4*x2)*x4 + (c1+c2*x2))*x
314   */
315  #define POLY_EVAL_EVEN_15(x, c1, c2, c3, c4, c5, c6, c7, c8) ({         \
316              __typeof(x) x2  = x * x;                                    \
317              __typeof(x) x4  = x2 * x2;                                  \
318              __typeof(x) x8  = x4 * x4;                                  \
319              __typeof(x) q = mul_add(mul_add(mul_add(c8, x2, c7),        \
320                                              x4,                         \
321                                              mul_add(c6, x2, c5)),       \
322                                      x8,                                 \
323                                      mul_add(mul_add(c4, x2, c3),        \
324                                              x4,                         \
325                                              mul_add(c2, x2, c1)));      \
326              x * q;                                                      \
327      })
328  
329  
330  #endif  /* LIBM_POLY_VEC_H */
331