poly-vec.h
1 /* 2 * Copyright (C) 2018-2020, Advanced Micro Devices, Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without modification, 5 * are permitted provided that the following conditions are met: 6 * 1. Redistributions of source code must retain the above copyright notice, 7 * this list of conditions and the following disclaimer. 8 * 2. Redistributions in binary form must reproduce the above copyright notice, 9 * this list of conditions and the following disclaimer in the documentation 10 * and/or other materials provided with the distribution. 11 * 3. Neither the name of the copyright holder nor the names of its contributors 12 * may be used to endorse or promote products derived from this software without 13 * specific prior written permission. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 19 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 20 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 21 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 22 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 * POSSIBILITY OF SUCH DAMAGE. 25 * 26 */ 27 28 29 #ifndef __LIBM_POLY_VEC_H__ 30 #define __LIBM_POLY_VEC_H__ 31 32 #if defined(AMD_LIBM_FMA_USABLE) 33 34 #define mul_add(x, y, z) \ 35 _Generic((x), \ 36 float : _mm_fmadd_ss, \ 37 double : _mm_fmadd_sd, \ 38 __m128 : _mm_fmadd_ps, \ 39 __m128d: _mm_fmadd_pd, \ 40 __m256 : _mm256_fmadd_ps, \ 41 __m256d: _mm256_fmadd_pd, \ 42 __m512 : _mm512_fmadd_ps, \ 43 __m512d: _mm512_fmadd_pd)((x), (y), (z)) 44 45 #else /* ! FMA_USABLE */ 46 47 #define no_fma_mul(a, b) \ 48 _Generic((a), \ 49 float : _mm_mul_ss, \ 50 double : _mm_mul_sd, \ 51 __m128 : _mm_mul_ps, \ 52 __m128d: _mm_mul_pd, \ 53 __m256 : _mm256_mul_ps, \ 54 __m256d: _mm256_mul_pd, \ 55 __m512 : _mm512_mul_ps, \ 56 __m512d: _mm512_mul_pd)((a), (b)) 57 58 #define mul_add(x, y, z) \ 59 _Generic((x), \ 60 float : _mm_add_ss, \ 61 double : _mm_add_sd, \ 62 __m128 : _mm_add_ps, \ 63 __m128d: _mm_add_pd, \ 64 __m256 : _mm256_add_ps, \ 65 __m256d: _mm256_add_pd, \ 66 __m512 : _mm512_add_ps, \ 67 __m512d: _mm512_add_pd)(no_fma_mul((x), (y)), (z)) 68 69 #endif /* FMA_USABLE */ 70 71 /* 72 * Estrins Scheme Polynomial evaluation 73 */ 74 75 /* 76 * p20(x), 77 * for special polynomial, assumes c0=0.0 and c1=1.0, rename to POLY_EVAL_20_1 78 * \ 79 * p1 = c2 + c3 * x; 80 * p2 = c4 + c5 * x; 81 * p3 = c6 + c7 * x; 82 * p4 = c8 + c9 * x; 83 * p5 = c10 + c11* x; 84 * p6 = c12 + c13 * x; 85 * p7 = c14 + c15 * x; 86 * p8 = c16 + c17 * x; 87 * p9 = c18 + c19 * x; 88 * 89 */ 90 #define POLY_EVAL_20(x, c0, c1, c2, c3, c4, c5, c6, c7, \ 91 c8, c9, c10, \ 92 c11, c12, c13, c14, c15, c16, c17, \ 93 c18, c19, c20) ({ \ 94 __typeof(x) x2 = x * x; \ 95 __typeof(x) x4 = x2 * x2; \ 96 __typeof(x) x8 = x4 * x4; \ 97 __typeof(x) x16= x8 * x8; \ 98 __typeof(x) q; \ 99 __typeof(x) q1, q2, q3, q4, q5; \ 100 __typeof(x) r1, r2, r3; \ 101 __typeof(x) p10 = c20 * x4; \ 102 \ 103 /*q1 = x + x2 * p1; */ \ 104 q1 = mul_add(mul_add(c3, x, c2), \ 105 x2, \ 106 mul_add(c1, x, c0)); \ 107 \ 108 /* q2 = p2 + x2 * p3; */ \ 109 q2 = mul_add(mul_add(c7, x, c6), \ 110 x2, \ 111 mul_add(c5, x, c4)); \ 112 \ 113 /*q3 = p4 + x2 * p5; */ \ 114 q3 = mul_add(mul_add(c11, x, c10), \ 115 x2, \ 116 mul_add(c9, x, c8)); \ 117 \ 118 /* q4 = p6 + x2 * p7; */ \ 119 q4 = mul_add(mul_add(c15, x, c14), \ 120 x2, \ 121 mul_add(c13, x, c12)); \ 122 \ 123 /* q5 = p8 + x2 * p9; */ \ 124 q5 = mul_add(mul_add(c19, x, c18), \ 125 x2, \ 126 mul_add(c17, x, c16)); \ 127 \ 128 r1 = q1 + x4 * q2; \ 129 r2 = x8 * (q3 + x4 * q4); \ 130 r3 = x16 * (q5 + p10); \ 131 \ 132 q = r1 + r2 + r3; \ 133 q; \ 134 }) 135 136 /* 137 * p(x) = C1 + C2*r + C3*r^2 + C4*r^3 + C5*r^4 + C6*r^5 + 138 * C7*r^6 + C8*r^7 + C9*r^8 + C10*r^9 + C11*r^10 + C12*r^11 139 * = (C1 + C2*r) + r^2(C3 + C4*r) + r^4(C5 + C6*r) + 140 * r^6(C7 + C8*r) + r^8(C9 + C10*r) + r^10(C11 + C12*r) 141 */ 142 #define POLY_EVAL_11(x, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11) ({ \ 143 __typeof(x) x2 = x * x; \ 144 __typeof(x) x4 = x2 * x2; \ 145 __typeof(x) x8 = x4 * x4; \ 146 __typeof(x) q = mul_add( mul_add ( mul_add(c11, x, c10), x2, \ 147 mul_add(c9 ,x ,c8)), x8, \ 148 mul_add ( mul_add(mul_add(c7, x, c6), x2, \ 149 mul_add(c5, x, c4)) ,x4, \ 150 mul_add( mul_add(c3, x, c2),x2, \ 151 mul_add(c0, x, c1)) )); \ 152 q; \ 153 }) 154 155 156 157 /* 158 * p(x) = c10*x^10 + c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + \ 159 * c3*x^3 + c2*x^2 + c1*x + c0 160 * = (((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8+c9*x+c10*x2)*x8) + \ 161 * ((c2+c3*x)*x2 + (c0+c1*x)); 162 */ 163 #define POLY_EVAL_10(x, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10) ({ \ 164 __typeof(x) x2 = x * x; \ 165 __typeof(x) x4 = x2 * x2; \ 166 __typeof(x) x8 = x4 * x4; \ 167 __typeof(x) q = mul_add(mul_add(x2, c10, mul_add(c9, x, c8)), \ 168 x8, \ 169 mul_add(mul_add(mul_add(c7, x, c6), \ 170 x2, \ 171 mul_add(c5, x, c4)), \ 172 x4, \ 173 mul_add(mul_add(c3, x, c2), \ 174 x2, \ 175 mul_add(c1, x, c0)))); \ 176 q; \ 177 }) 178 179 /* 180 * p(x) = c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + 181 * c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 182 * = ((c6+c7*x)*x2 + (c4+c5*x))*x4 + 183 * (c8+c9*x)*x8) + 184 * ((c2+c3*x)*x2 + (c0+c1*x)); 185 */ 186 #define POLY_EVAL_9(x, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) ({ \ 187 __typeof(x) x2 = x * x; \ 188 __typeof(x) x4 = x2 * x2; \ 189 __typeof(x) x8 = x4 * x4; \ 190 __typeof(x) q = mul_add(mul_add(c9, x, c8), \ 191 x8, \ 192 mul_add(mul_add(mul_add(c7, x, c6), \ 193 x2, \ 194 mul_add(c5, x, c4)), \ 195 x4, \ 196 mul_add(mul_add(c3, x, c2), \ 197 x2, \ 198 mul_add(c1, x, c0)))); \ 199 q; \ 200 }) 201 202 /* 203 * p(x) = c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 204 * = ((c6+c7*x)*x2 + (c4+c5*x))*x4 + 205 * (c8*x8 + (c2+c3*x)*x2 + (c0+c1*x)) 206 */ 207 #define POLY_EVAL_8(x, c0, c1, c2, c3, c4, c5, c6, c7, c8) ({ \ 208 __typeof(x) x2 = x * x; \ 209 __typeof(x) x4 = x2 * x2; \ 210 __typeof(x) x8 = x4 * x4; \ 211 __typeof(x) q = mul_add(mul_add(mul_add(c7, x, c6), \ 212 x2, \ 213 mul_add(c5, x, c4)), \ 214 x4, \ 215 mul_add(mul_add(c3, x, c2), \ 216 x2, \ 217 mul_add(c1, x, c0) + c8*x8)); \ 218 q; \ 219 }) 220 221 222 /* 223 * p(x) = c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 224 * = ((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + (c0+c1*x)) 225 */ 226 227 #define POLY_EVAL_7(x, c0, c1, c2, c3, c4, c5, c6, c7) ({ \ 228 __typeof(x) x2 = x * x; \ 229 __typeof(x) x4 = x2 * x2; \ 230 __typeof(x) q = mul_add(mul_add(mul_add(c7, x, c6), \ 231 x2, \ 232 mul_add(c5, x, c4)), \ 233 x4, \ 234 mul_add(mul_add(c3, x, c2), \ 235 x2, \ 236 mul_add(c1, x, c0))); \ 237 q; \ 238 }) 239 240 /* 241 * p(x) = 1*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 242 * = (c4+c5*x+x2)*x4 + ((c2+c3*x)*x2 + (c0+c1*x)); 243 */ 244 #define POLY_EVAL_6(x, c0, c1, c2, c3, c4, c5, c6) ({ \ 245 __typeof(x) x2 = x * x; \ 246 __typeof(x) x4 = x2 * x2; \ 247 __typeof(x) q = mul_add(mul_add(c6, \ 248 x2, \ 249 mul_add(c5, x, c4)), \ 250 x4, \ 251 mul_add(mul_add(c3, x, c2), \ 252 x2, \ 253 mul_add(c1, x, c0))); \ 254 q; \ 255 }) 256 257 /* 258 * p(x) = c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 259 * = (c2+c3*x)*x2 + ((c4+c5*x)*x4 + (c0+c1*x)) 260 */ 261 #define POLY_EVAL_5(x, c0, c1, c2, c3, c4, c5) ({ \ 262 __typeof(x) x2 = x * x; \ 263 __typeof(x) x4 = x2 * x2; \ 264 __typeof(x) q = mul_add(mul_add(c3, x, c2), \ 265 x2, \ 266 mul_add(mul_add(c5, x, c4), \ 267 x4, \ 268 mul_add(c1, x, c0))); \ 269 q; \ 270 }) 271 272 /* 273 * p(x) = c1*x^3 + c0*x^2 + x 274 * = (c0+c1*x)*x^2 + x 275 */ 276 277 #define POLY_EVAL_1(x, c0, c1) ({ \ 278 __typeof(x) x2 = x * x; \ 279 __typeof(x) q = mul_add(mul_add(c1, x, c0), \ 280 x2, \ 281 x); \ 282 q; \ 283 }) 284 285 /* 286 * Polynomial of degree 15, 287 * - uses only odd terms and 288 * - C0 = 0 289 * p(x) = (c7*x^15 + c6*x^13 + c5*x^11 + c4*x^9 + c3*x^7 + c2*x^5 + c1*x^3) + x 290 * = (c7*x^12 + c6*x^10 + c5*x^8 + c4*x^6 + c3*x^4 + c2*x^2 + c1) * \ 291 * x^2 *x + x 292 */ 293 #define POLY_EVAL_ODD_15(x, c1, c2, c3, c4, c5, c6, c7) ({ \ 294 __typeof(x) x2 = x * x; \ 295 __typeof(x) x4 = x2 * x2; \ 296 __typeof(x) x8 = x4 * x4; \ 297 __typeof(x) q = mul_add(mul_add(c7, \ 298 x4, \ 299 mul_add(c6, x2, c5)), \ 300 x8, \ 301 mul_add(mul_add(c4, x2, c3), \ 302 x4, \ 303 mul_add(c2, x2, c1))); \ 304 x + ((q * x2) * x) ; \ 305 }) 306 307 /* 308 * Polynomial of degree 15, 309 * - uses only even terms and 310 * - C0 = 0 311 * p(x) = (c8*x^14 + c7*x^12 + c6*x^10 + c5*x^8 + c4*x^6 + c3*x^4 + c2*x^2 + c1)*x 312 * = ((c7+c8*x2)*x4 + (c5+c6*x2))*x8 + 313 * (c3+c4*x2)*x4 + (c1+c2*x2))*x 314 */ 315 #define POLY_EVAL_EVEN_15(x, c1, c2, c3, c4, c5, c6, c7, c8) ({ \ 316 __typeof(x) x2 = x * x; \ 317 __typeof(x) x4 = x2 * x2; \ 318 __typeof(x) x8 = x4 * x4; \ 319 __typeof(x) q = mul_add(mul_add(mul_add(c8, x2, c7), \ 320 x4, \ 321 mul_add(c6, x2, c5)), \ 322 x8, \ 323 mul_add(mul_add(c4, x2, c3), \ 324 x4, \ 325 mul_add(c2, x2, c1))); \ 326 x * q; \ 327 }) 328 329 330 #endif /* LIBM_POLY_VEC_H */ 331