ffx_a.h
   1  //==============================================================================================================================
   2  //
   3  //                                               [A] SHADER PORTABILITY 1.20210629
   4  //
   5  //==============================================================================================================================
   6  // FidelityFX Super Resolution Sample
   7  //
   8  // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
   9  // Permission is hereby granted, free of charge, to any person obtaining a copy
  10  // of this software and associated documentation files(the "Software"), to deal
  11  // in the Software without restriction, including without limitation the rights
  12  // to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
  13  // copies of the Software, and to permit persons to whom the Software is
  14  // furnished to do so, subject to the following conditions :
  15  // The above copyright notice and this permission notice shall be included in
  16  // all copies or substantial portions of the Software.
  17  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
  20  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23  // THE SOFTWARE.
  24  //------------------------------------------------------------------------------------------------------------------------------
  25  // MIT LICENSE
  26  // ===========
  27  // Copyright (c) 2014 Michal Drobot (for concepts used in "FLOAT APPROXIMATIONS").
  28  // -----------
  29  // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
  30  // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
  31  // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
  32  // Software is furnished to do so, subject to the following conditions:
  33  // -----------
  34  // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
  35  // Software.
  36  // -----------
  37  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
  38  // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR
  39  // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  40  // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  41  //------------------------------------------------------------------------------------------------------------------------------
  42  // ABOUT
  43  // =====
  44  // Common central point for high-level shading language and C portability for various shader headers.
  45  //------------------------------------------------------------------------------------------------------------------------------
  46  // DEFINES
  47  // =======
  48  // A_CPU ..... Include the CPU related code.
  49  // A_GPU ..... Include the GPU related code.
  50  // A_GLSL .... Using GLSL.
  51  // A_HLSL .... Using HLSL.
  52  // A_HLSL_6_2  Using HLSL 6.2 with new 'uint16_t' and related types (requires '-enable-16bit-types').
   53  // A_NO_16_BIT_CAST Don't use instructions that are not available in SPIR-V (needed for running A_HLSL_6_2 on Vulkan)
  54  // A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default).
  55  // =======
  56  // A_BYTE .... Support 8-bit integer.
  57  // A_HALF .... Support 16-bit integer and floating point.
  58  // A_LONG .... Support 64-bit integer.
  59  // A_DUBL .... Support 64-bit floating point.
  60  // =======
  61  // A_WAVE .... Support wave-wide operations.
  62  //------------------------------------------------------------------------------------------------------------------------------
  63  // To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'.
  64  //------------------------------------------------------------------------------------------------------------------------------
  65  // SIMPLIFIED TYPE SYSTEM
  66  // ======================
  67  //  - All ints will be unsigned with exception of when signed is required.
  68  //  - Type naming simplified and shortened "A<type><#components>",
  69  //     - H = 16-bit float (half)
  70  //     - F = 32-bit float (float)
  71  //     - D = 64-bit float (double)
  72  //     - P = 1-bit integer (predicate, not using bool because 'B' is used for byte)
  73  //     - B = 8-bit integer (byte)
  74  //     - W = 16-bit integer (word)
  75  //     - U = 32-bit integer (unsigned)
  76  //     - L = 64-bit integer (long)
  77  //  - Using "AS<type><#components>" for signed when required.
  78  //------------------------------------------------------------------------------------------------------------------------------
  79  // TODO
  80  // ====
  81  //  - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops).
  82  //------------------------------------------------------------------------------------------------------------------------------
  83  // CHANGE LOG
  84  // ==========
  85  // 20200914 - Expanded wave ops and prx code.
  86  // 20200713 - Added [ZOL] section, fixed serious bugs in sRGB and Rec.709 color conversion code, etc.
  87  //==============================================================================================================================
  88  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  89  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  90  //_____________________________________________________________/\_______________________________________________________________
  91  //==============================================================================================================================
  92  //                                                           COMMON
  93  //==============================================================================================================================
   // Decimal approximation of 2*pi (one full turn in radians), written to 12 significant digits.
   94  #define A_2PI 6.28318530718
  95  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  96  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  97  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  98  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  99  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 100  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 101  //_____________________________________________________________/\_______________________________________________________________
 102  //==============================================================================================================================
 103  //
 104  //
 105  //                                                             CPU
 106  //
 107  //
 108  //==============================================================================================================================
 109  #ifdef A_CPU
  110   // Supporting user defined overrides.
        // A_RESTRICT is the qualifier appended to every pointer-typed vector macro further down (ret*/in*/inout*/out*).
        // Define it before this point to substitute another keyword; otherwise '__restrict' is used.
  111   #ifndef A_RESTRICT
  112    #define A_RESTRICT __restrict
  113   #endif
  114  //------------------------------------------------------------------------------------------------------------------------------
        // A_STATIC is the prefix placed in front of every CPU function defined in this header.
        // Define it before this point to change the linkage/inlining keyword; otherwise 'static' is used.
  115   #ifndef A_STATIC
  116    #define A_STATIC static
  117   #endif
 118  //------------------------------------------------------------------------------------------------------------------------------
  119   // Same types across CPU and GPU.
  120   // Predicate uses 32-bit integer (C friendly bool).
        // Naming follows the "A<type><#components>" scheme described in the header comment:
        //  P = predicate, F = float, D = double, B/W/U/L = 8/16/32/64-bit unsigned integer; the trailing 1 means scalar.
        // NOTE(review): the fixed-width names come from <stdint.h>; no #include is visible in this part of the file,
        // so the including translation unit presumably provides it - confirm.
  121   typedef uint32_t AP1;
  122   typedef float AF1;
  123   typedef double AD1;
  124   typedef uint8_t AB1;
  125   typedef uint16_t AW1;
  126   typedef uint32_t AU1;
  127   typedef uint64_t AL1;
        // "AS" prefix: signed counterparts of B/W/U/L.
  128   typedef int8_t ASB1;
  129   typedef int16_t ASW1;
  130   typedef int32_t ASU1;
  131   typedef int64_t ASL1;
 132  //------------------------------------------------------------------------------------------------------------------------------
        // Value casts: each macro converts the expression 'a' to the named scalar type with a C cast.
        // The argument and the whole expansion are parenthesized, so compound expressions can be passed directly.
  133   #define AD1_(a) ((AD1)(a))
  134   #define AF1_(a) ((AF1)(a))
  135   #define AL1_(a) ((AL1)(a))
  136   #define AU1_(a) ((AU1)(a))
  137  //------------------------------------------------------------------------------------------------------------------------------
        // Signed variants, used by the *S* helpers that compare or shift unsigned storage as signed values.
  138   #define ASL1_(a) ((ASL1)(a))
  139   #define ASU1_(a) ((ASU1)(a))
 140  //------------------------------------------------------------------------------------------------------------------------------
 141   A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;}
 142  //------------------------------------------------------------------------------------------------------------------------------
        // Integer truth constants; on the CPU the predicate type AP1 is a plain 32-bit unsigned integer.
  143   #define A_TRUE 1
  144   #define A_FALSE 0
 145  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 146  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 147  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 148  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 149  //_____________________________________________________________/\_______________________________________________________________
 150  //==============================================================================================================================
 151  //
 152  //                                                       CPU/GPU PORTING
 153  //
 154  //------------------------------------------------------------------------------------------------------------------------------
 155  // Get CPU and GPU to share all setup code, without duplicate code paths.
 156  // This uses a lower-case prefix for special vector constructs.
 157  //  - In C restrict pointers are used.
 158  //  - In the shading language, in/inout/out arguments are used.
 159  // This depends on the ability to access a vector value in both languages via array syntax (aka color[2]).
 160  //==============================================================================================================================
 161  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 162  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 163  //_____________________________________________________________/\_______________________________________________________________
 164  //==============================================================================================================================
 165  //                                     VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
 166  //==============================================================================================================================
  // Return type of a vector op on the CPU: a restrict-qualified pointer to the element type.
  // The 2/3/4 variants of each element type expand to the same text; the component count is not encoded in the C type.
  167   #define retAD2 AD1 *A_RESTRICT
  168   #define retAD3 AD1 *A_RESTRICT
  169   #define retAD4 AD1 *A_RESTRICT
  170   #define retAF2 AF1 *A_RESTRICT
  171   #define retAF3 AF1 *A_RESTRICT
  172   #define retAF4 AF1 *A_RESTRICT
  173   #define retAL2 AL1 *A_RESTRICT
  174   #define retAL3 AL1 *A_RESTRICT
  175   #define retAL4 AL1 *A_RESTRICT
  176   #define retAU2 AU1 *A_RESTRICT
  177   #define retAU3 AU1 *A_RESTRICT
  178   #define retAU4 AU1 *A_RESTRICT
 179  //------------------------------------------------------------------------------------------------------------------------------
  // Parameter type for a vector passed as input: a restrict-qualified pointer to the element type.
  // The pointer is not const-qualified, so "input only" is a naming convention rather than something the compiler checks.
  180   #define inAD2 AD1 *A_RESTRICT
  181   #define inAD3 AD1 *A_RESTRICT
  182   #define inAD4 AD1 *A_RESTRICT
  183   #define inAF2 AF1 *A_RESTRICT
  184   #define inAF3 AF1 *A_RESTRICT
  185   #define inAF4 AF1 *A_RESTRICT
  186   #define inAL2 AL1 *A_RESTRICT
  187   #define inAL3 AL1 *A_RESTRICT
  188   #define inAL4 AL1 *A_RESTRICT
  189   #define inAU2 AU1 *A_RESTRICT
  190   #define inAU3 AU1 *A_RESTRICT
  191   #define inAU4 AU1 *A_RESTRICT
 192  //------------------------------------------------------------------------------------------------------------------------------
  // Parameter type for a vector that is both read and written: a restrict-qualified pointer to the element type.
  // Expands to the same text as the corresponding in*/out* macro.
  193   #define inoutAD2 AD1 *A_RESTRICT
  194   #define inoutAD3 AD1 *A_RESTRICT
  195   #define inoutAD4 AD1 *A_RESTRICT
  196   #define inoutAF2 AF1 *A_RESTRICT
  197   #define inoutAF3 AF1 *A_RESTRICT
  198   #define inoutAF4 AF1 *A_RESTRICT
  199   #define inoutAL2 AL1 *A_RESTRICT
  200   #define inoutAL3 AL1 *A_RESTRICT
  201   #define inoutAL4 AL1 *A_RESTRICT
  202   #define inoutAU2 AU1 *A_RESTRICT
  203   #define inoutAU3 AU1 *A_RESTRICT
  204   #define inoutAU4 AU1 *A_RESTRICT
 205  //------------------------------------------------------------------------------------------------------------------------------
  // Parameter type for a vector written by the callee: a restrict-qualified pointer to the element type.
  // Expands to the same text as the corresponding in*/inout* macro.
  206   #define outAD2 AD1 *A_RESTRICT
  207   #define outAD3 AD1 *A_RESTRICT
  208   #define outAD4 AD1 *A_RESTRICT
  209   #define outAF2 AF1 *A_RESTRICT
  210   #define outAF3 AF1 *A_RESTRICT
  211   #define outAF4 AF1 *A_RESTRICT
  212   #define outAL2 AL1 *A_RESTRICT
  213   #define outAL3 AL1 *A_RESTRICT
  214   #define outAL4 AL1 *A_RESTRICT
  215   #define outAU2 AU1 *A_RESTRICT
  216   #define outAU3 AU1 *A_RESTRICT
  217   #define outAU4 AU1 *A_RESTRICT
 218  //------------------------------------------------------------------------------------------------------------------------------
  // Vector variable declaration on the CPU: 'varAF3(v)' expands to 'AF1 v[3]', an array of N scalars named x.
  // Unlike the pointer macros, the component count here does set the array length.
  219   #define varAD2(x) AD1 x[2]
  220   #define varAD3(x) AD1 x[3]
  221   #define varAD4(x) AD1 x[4]
  222   #define varAF2(x) AF1 x[2]
  223   #define varAF3(x) AF1 x[3]
  224   #define varAF4(x) AF1 x[4]
  225   #define varAL2(x) AL1 x[2]
  226   #define varAL3(x) AL1 x[3]
  227   #define varAL4(x) AL1 x[4]
  228   #define varAU2(x) AU1 x[2]
  229   #define varAU3(x) AU1 x[3]
  230   #define varAU4(x) AU1 x[4]
 231  //------------------------------------------------------------------------------------------------------------------------------
  // Vector initializer on the CPU: expands to a brace-enclosed list of the given components, e.g. 'initAF2(a,b)' -> '{a,b}'.
  // The expansion carries no type, so every element-type variant of a given size produces the same text.
  // The arguments are emitted as-is, without added parentheses.
  232   #define initAD2(x,y) {x,y}
  233   #define initAD3(x,y,z) {x,y,z}
  234   #define initAD4(x,y,z,w) {x,y,z,w}
  235   #define initAF2(x,y) {x,y}
  236   #define initAF3(x,y,z) {x,y,z}
  237   #define initAF4(x,y,z,w) {x,y,z,w}
  238   #define initAL2(x,y) {x,y}
  239   #define initAL3(x,y,z) {x,y,z}
  240   #define initAL4(x,y,z,w) {x,y,z,w}
  241   #define initAU2(x,y) {x,y}
  242   #define initAU3(x,y,z) {x,y,z}
  243   #define initAU4(x,y,z,w) {x,y,z,w}
 244  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 245  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 246  //_____________________________________________________________/\_______________________________________________________________
 247  //==============================================================================================================================
 248  //                                                     SCALAR RETURN OPS
 249  //------------------------------------------------------------------------------------------------------------------------------
 250  // TODO
 251  // ====
 252  //  - Replace transcendentals with manual versions. 
 253  //==============================================================================================================================
 254   #ifdef A_GCC
 255    A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);}
 256    A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);}
 257    A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));}
 258    A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_llabs(ASL1_(a)));}
 259   #else
 260    A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);}
 261    A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);}
 262    A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));}
 263    A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(labs((long)ASL1_(a)));}
 264   #endif
 265  //------------------------------------------------------------------------------------------------------------------------------
 266   #ifdef A_GCC
 267    A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);}
 268    A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);}
 269   #else
 270    A_STATIC AD1 ACosD1(AD1 a){return cos(a);}
 271    A_STATIC AF1 ACosF1(AF1 a){return cosf(a);}
 272   #endif
 273  //------------------------------------------------------------------------------------------------------------------------------
 274   A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];}
 275   A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
 276   A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
 277   A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];}
 278   A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
 279   A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
 280  //------------------------------------------------------------------------------------------------------------------------------
 281   #ifdef A_GCC
 282    A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);}
 283    A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);}
 284   #else
 285    A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);}
 286    A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);}
 287   #endif
 288  //------------------------------------------------------------------------------------------------------------------------------
 289   #ifdef A_GCC
 290    A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);}
 291    A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);}
 292   #else
 293    A_STATIC AD1 AFloorD1(AD1 a){return floor(a);}
 294    A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);}
 295   #endif
 296  //------------------------------------------------------------------------------------------------------------------------------
 297   A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);}
 298   A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);}
 299  //------------------------------------------------------------------------------------------------------------------------------
 300   #ifdef A_GCC
 301    A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);}
 302    A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);}
 303   #else
 304    A_STATIC AD1 ALog2D1(AD1 a){return log2(a);}
 305    A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);}
 306   #endif
 307  //------------------------------------------------------------------------------------------------------------------------------
 308   A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;}
 309   A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;}
 310   A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;}
 311   A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;}
 312  //------------------------------------------------------------------------------------------------------------------------------
 313   // These follow the convention that A integer types don't have signage, until they are operated on. 
 314   A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;}
 315   A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;}
 316  //------------------------------------------------------------------------------------------------------------------------------
 317   A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a<b?a:b;}
 318   A_STATIC AF1 AMinF1(AF1 a,AF1 b){return a<b?a:b;}
 319   A_STATIC AL1 AMinL1(AL1 a,AL1 b){return a<b?a:b;}
 320   A_STATIC AU1 AMinU1(AU1 a,AU1 b){return a<b?a:b;}
 321  //------------------------------------------------------------------------------------------------------------------------------
 322   A_STATIC AL1 AMinSL1(AL1 a,AL1 b){return (ASL1_(a)<ASL1_(b))?a:b;}
 323   A_STATIC AU1 AMinSU1(AU1 a,AU1 b){return (ASU1_(a)<ASU1_(b))?a:b;}
 324  //------------------------------------------------------------------------------------------------------------------------------
 325   A_STATIC AD1 ARcpD1(AD1 a){return 1.0/a;}
 326   A_STATIC AF1 ARcpF1(AF1 a){return 1.0f/a;}
 327  //------------------------------------------------------------------------------------------------------------------------------
 328   A_STATIC AL1 AShrSL1(AL1 a,AL1 b){return AL1_(ASL1_(a)>>ASL1_(b));}
 329   A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));}
 330  //------------------------------------------------------------------------------------------------------------------------------
 331   #ifdef A_GCC
 332    A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);}
 333    A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);}
 334   #else
 335    A_STATIC AD1 ASinD1(AD1 a){return sin(a);}
 336    A_STATIC AF1 ASinF1(AF1 a){return sinf(a);}
 337   #endif
 338  //------------------------------------------------------------------------------------------------------------------------------
 339   #ifdef A_GCC
 340    A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);}
 341    A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);}
 342   #else
 343    A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);}
 344    A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);}
 345   #endif
 346  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 347  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 348  //_____________________________________________________________/\_______________________________________________________________
 349  //==============================================================================================================================
 350  //                                               SCALAR RETURN OPS - DEPENDENT
 351  //==============================================================================================================================
 352   A_STATIC AD1 AClampD1(AD1 x,AD1 n,AD1 m){return AMaxD1(n,AMinD1(x,m));}
 353   A_STATIC AF1 AClampF1(AF1 x,AF1 n,AF1 m){return AMaxF1(n,AMinF1(x,m));}
 354  //------------------------------------------------------------------------------------------------------------------------------
 355   A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);}
 356   A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);}
 357  //------------------------------------------------------------------------------------------------------------------------------
 358   A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));}
 359   A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));}
 360  //------------------------------------------------------------------------------------------------------------------------------
 361   A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));}
 362   A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));}
 363  //------------------------------------------------------------------------------------------------------------------------------
 364   A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));}
 365   A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));}
 366  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 367  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 368  //_____________________________________________________________/\_______________________________________________________________
 369  //==============================================================================================================================
 370  //                                                         VECTOR OPS
 371  //------------------------------------------------------------------------------------------------------------------------------
 372  // These are added as needed for production or prototyping, so not necessarily a complete set.
 373  // They follow a convention of taking in a destination and also returning the destination value to increase utility.
 374  //==============================================================================================================================
 375   A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;}
 376   A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;}
 377   A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;}
 378  //------------------------------------------------------------------------------------------------------------------------------
 379   A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;}
 380   A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;}
 381   A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;}
 382  //==============================================================================================================================
 383   A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
 384   A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
 385   A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
 386  //------------------------------------------------------------------------------------------------------------------------------
 387   A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
 388   A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
 389   A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
 390  //==============================================================================================================================
 391   A_STATIC retAD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;}
 392   A_STATIC retAD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;}
 393   A_STATIC retAD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;}
 394  //------------------------------------------------------------------------------------------------------------------------------
 395   A_STATIC retAF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;}
 396   A_STATIC retAF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;}
 397   A_STATIC retAF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;}
 398  //==============================================================================================================================
 399   A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;}
 400   A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;}
 401   A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;}
 402  //------------------------------------------------------------------------------------------------------------------------------
 403   A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;}
 404   A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;}
 405   A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;}
 406  //==============================================================================================================================
 407   A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;}
 408   A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;}
 409   A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;}
 410  //------------------------------------------------------------------------------------------------------------------------------
 411   A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;}
 412   A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;}
 413   A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;}
 414  //==============================================================================================================================
 415   A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;}
 416   A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;}
 417   A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;}
 418  //------------------------------------------------------------------------------------------------------------------------------
 419   A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;}
 420   A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;}
 421   A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;}
 422  //==============================================================================================================================
 423   A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){int i;for(i=0;i<2;i++){d[i]=AMaxD1(a[i],b[i]);}return d;} // d[i]=AMaxD1(a[i],b[i]) per lane; returns d.
 424   A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){int i;for(i=0;i<3;i++){d[i]=AMaxD1(a[i],b[i]);}return d;}
 425   A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){int i;for(i=0;i<4;i++){d[i]=AMaxD1(a[i],b[i]);}return d;}
 426  //------------------------------------------------------------------------------------------------------------------------------
 427   A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){int i;for(i=0;i<2;i++){d[i]=AMaxF1(a[i],b[i]);}return d;} // Float variants: d[i]=AMaxF1(a[i],b[i]).
 428   A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){int i;for(i=0;i<3;i++){d[i]=AMaxF1(a[i],b[i]);}return d;}
 429   A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){int i;for(i=0;i<4;i++){d[i]=AMaxF1(a[i],b[i]);}return d;}
 430  //==============================================================================================================================
 431   A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){int i;for(i=0;i<2;i++){d[i]=AMinD1(a[i],b[i]);}return d;} // d[i]=AMinD1(a[i],b[i]) per lane; returns d.
 432   A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){int i;for(i=0;i<3;i++){d[i]=AMinD1(a[i],b[i]);}return d;}
 433   A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){int i;for(i=0;i<4;i++){d[i]=AMinD1(a[i],b[i]);}return d;}
 434  //------------------------------------------------------------------------------------------------------------------------------
 435   A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){int i;for(i=0;i<2;i++){d[i]=AMinF1(a[i],b[i]);}return d;} // Float variants: d[i]=AMinF1(a[i],b[i]).
 436   A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){int i;for(i=0;i<3;i++){d[i]=AMinF1(a[i],b[i]);}return d;}
 437   A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){int i;for(i=0;i<4;i++){d[i]=AMinF1(a[i],b[i]);}return d;}
 438  //==============================================================================================================================
 439   A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){int i;for(i=0;i<2;i++){d[i]=a[i]*b[i];}return d;} // Component-wise product d[i]=a[i]*b[i]; returns d.
 440   A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){int i;for(i=0;i<3;i++){d[i]=a[i]*b[i];}return d;}
 441   A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){int i;for(i=0;i<4;i++){d[i]=a[i]*b[i];}return d;}
 442  //------------------------------------------------------------------------------------------------------------------------------
 443   A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){int i;for(i=0;i<2;i++){d[i]=a[i]*b[i];}return d;} // Float variants of the component-wise product.
 444   A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){int i;for(i=0;i<3;i++){d[i]=a[i]*b[i];}return d;}
 445   A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){int i;for(i=0;i<4;i++){d[i]=a[i]*b[i];}return d;}
 446  //==============================================================================================================================
 447   A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){int i;for(i=0;i<2;i++){d[i]=a[i]*b;}return d;} // Scale every lane of a by the scalar b: d[i]=a[i]*b; returns d.
 448   A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){int i;for(i=0;i<3;i++){d[i]=a[i]*b;}return d;}
 449   A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){int i;for(i=0;i<4;i++){d[i]=a[i]*b;}return d;}
 450  //------------------------------------------------------------------------------------------------------------------------------
 451   A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){int i;for(i=0;i<2;i++){d[i]=a[i]*b;}return d;} // Float variants of the scalar scale.
 452   A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){int i;for(i=0;i<3;i++){d[i]=a[i]*b;}return d;}
 453   A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){int i;for(i=0;i<4;i++){d[i]=a[i]*b;}return d;}
 454  //==============================================================================================================================
 455   A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){int i;for(i=0;i<2;i++){d[i]=-a[i];}return d;} // Negate every lane: d[i]=-a[i]; returns d.
 456   A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){int i;for(i=0;i<3;i++){d[i]=-a[i];}return d;}
 457   A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){int i;for(i=0;i<4;i++){d[i]=-a[i];}return d;}
 458  //------------------------------------------------------------------------------------------------------------------------------
 459   A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){int i;for(i=0;i<2;i++){d[i]=-a[i];}return d;} // Float variants of the per-lane negate.
 460   A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){int i;for(i=0;i<3;i++){d[i]=-a[i];}return d;}
 461   A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){int i;for(i=0;i<4;i++){d[i]=-a[i];}return d;}
 462  //==============================================================================================================================
 463   A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){int i;for(i=0;i<2;i++){d[i]=ARcpD1(a[i]);}return d;} // d[i]=ARcpD1(a[i]) per lane; returns d.
 464   A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){int i;for(i=0;i<3;i++){d[i]=ARcpD1(a[i]);}return d;}
 465   A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){int i;for(i=0;i<4;i++){d[i]=ARcpD1(a[i]);}return d;}
 466  //------------------------------------------------------------------------------------------------------------------------------
 467   A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){int i;for(i=0;i<2;i++){d[i]=ARcpF1(a[i]);}return d;} // Float variants: d[i]=ARcpF1(a[i]).
 468   A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){int i;for(i=0;i<3;i++){d[i]=ARcpF1(a[i]);}return d;}
 469   A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){int i;for(i=0;i<4;i++){d[i]=ARcpF1(a[i]);}return d;}
 470  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 471  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 472  //_____________________________________________________________/\_______________________________________________________________
 473  //==============================================================================================================================
 474  //                                                     HALF FLOAT PACKING
 475  //==============================================================================================================================
 476   // Convert a 32-bit float to a 16-bit half, returned in the lower 16-bits of the output (the upper 16-bits are zero).
 477   // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
 478   // Supports denormals. The mantissa is truncated (shifted right); inputs too small for a half denormal become signed zero.
 479   // Conversion rules are to make computations possibly "safer" on the GPU (results saturate, INF/NaN are never produced),
 480   //  -INF & -NaN & negative overflow -> -65504 (0xfbff)
 481   //  +INF & +NaN & positive overflow -> +65504 (0x7bff)
 482   A_STATIC AU1 AU1_AH1_AF1(AF1 f){
 483    static const AW1 base[512]={ // Read-only: half sign+exponent bits (or the saturated value) per float sign+exponent index.
 484     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 485     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 486     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 487     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 488     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 489     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 490     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100,
 491     0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00,
 492     0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff,
 493     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 494     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 495     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 496     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 497     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 498     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 499     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 500     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 501     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 502     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 503     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 504     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 505     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 506     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100,
 507     0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00,
 508     0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff,
 509     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 510     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 511     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 512     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 513     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 514     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 515     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff};
 516    static const AB1 shift[512]={ // Read-only: right shift applied to the 23-bit float mantissa per index (0x18 discards it entirely).
 517     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 518     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 519     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 520     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 521     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 522     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 523     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
 524     0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
 525     0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
 526     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 527     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 528     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 529     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 530     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 531     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 532     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 533     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 534     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 535     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 536     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 537     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 538     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 539     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
 540     0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
 541     0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
 542     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 543     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 544     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 545     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 546     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 547     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 548     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18};
 549    union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);} // i = the 9 sign+exponent bits; add the shifted 23-bit mantissa to the table base.
 550  //------------------------------------------------------------------------------------------------------------------------------
 551   // Used to output packed constant: a[0] converted to half lands in bits 0..15, a[1] converted to half in bits 16..31.
 552   A_STATIC AU1 AU1_AH2_AF2(inAF2 a){AU1 lo=AU1_AH1_AF1(a[0]);AU1 hi=AU1_AH1_AF1(a[1]);return lo+(hi<<16);}
 553  #endif
 554  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 555  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 556  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 557  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 558  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 559  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 560  //_____________________________________________________________/\_______________________________________________________________
 561  //==============================================================================================================================
 562  //
 563  //
 564  //                                                            GLSL
 565  //
 566  //
 567  //==============================================================================================================================
 568  #if defined(A_GLSL) && defined(A_GPU)
 569   #ifndef A_SKIP_EXT
 570    #ifdef A_HALF
 571     #extension GL_EXT_shader_16bit_storage:require
 572     #extension GL_EXT_shader_explicit_arithmetic_types:require 
 573    #endif
 574  //------------------------------------------------------------------------------------------------------------------------------
 575    #ifdef A_LONG
 576     #extension GL_ARB_gpu_shader_int64:require
 577     #extension GL_NV_shader_atomic_int64:require
 578    #endif
 579  //------------------------------------------------------------------------------------------------------------------------------
 580    #ifdef A_WAVE
 581     #extension GL_KHR_shader_subgroup_arithmetic:require
 582     #extension GL_KHR_shader_subgroup_ballot:require
 583     #extension GL_KHR_shader_subgroup_quad:require
 584     #extension GL_KHR_shader_subgroup_shuffle:require
 585    #endif
 586   #endif
 587  //==============================================================================================================================
 588   #define AP1 bool // Boolean scalar and 2/3/4-component vector aliases.
 589   #define AP2 bvec2
 590   #define AP3 bvec3
 591   #define AP4 bvec4
 592  //------------------------------------------------------------------------------------------------------------------------------
 593   #define AF1 float // 32-bit float scalar and vector aliases.
 594   #define AF2 vec2
 595   #define AF3 vec3
 596   #define AF4 vec4
 597  //------------------------------------------------------------------------------------------------------------------------------
 598   #define AU1 uint // Unsigned integer scalar and vector aliases.
 599   #define AU2 uvec2
 600   #define AU3 uvec3
 601   #define AU4 uvec4
 602  //------------------------------------------------------------------------------------------------------------------------------
 603   #define ASU1 int // Signed integer scalar and vector aliases.
 604   #define ASU2 ivec2
 605   #define ASU3 ivec3
 606   #define ASU4 ivec4
 607  //==============================================================================================================================
 608   #define AF1_AU1(x) uintBitsToFloat(AU1(x)) // Reinterpret uint bits as float (bit pattern preserved); argument is first converted with AUn().
 609   #define AF2_AU2(x) uintBitsToFloat(AU2(x))
 610   #define AF3_AU3(x) uintBitsToFloat(AU3(x))
 611   #define AF4_AU4(x) uintBitsToFloat(AU4(x))
 612  //------------------------------------------------------------------------------------------------------------------------------
 613   #define AU1_AF1(x) floatBitsToUint(AF1(x)) // Reinterpret float bits as uint (bit pattern preserved); argument is first converted with AFn().
 614   #define AU2_AF2(x) floatBitsToUint(AF2(x))
 615   #define AU3_AF3(x) floatBitsToUint(AF3(x))
 616   #define AU4_AF4(x) floatBitsToUint(AF4(x))
 617  //------------------------------------------------------------------------------------------------------------------------------
 618   AU1 AU1_AH1_AF1_x(AF1 a){AF2 pair=AF2(a,0.0);return packHalf2x16(pair);} // Pack (a,0.0) with packHalf2x16(): a is the first component, the second is 0.0.
 619   #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
 620  //------------------------------------------------------------------------------------------------------------------------------
 621   #define AU1_AH2_AF2 packHalf2x16 // Pack aliases: each name maps directly to the GLSL built-in that packs a float vector into one uint.
 622   #define AU1_AW2Unorm_AF2 packUnorm2x16
 623   #define AU1_AB4Unorm_AF4 packUnorm4x8
 624  //------------------------------------------------------------------------------------------------------------------------------
 625   #define AF2_AH2_AU1 unpackHalf2x16 // Unpack aliases: each name maps directly to the GLSL built-in that expands one uint into a float vector.
 626   #define AF2_AW2Unorm_AU1 unpackUnorm2x16
 627   #define AF4_AB4Unorm_AU1 unpackUnorm4x8
 628  //==============================================================================================================================
 629   AF1 AF1_x(AF1 a){return a;} // Splat helpers: copy one float into every component of the result.
 630   AF2 AF2_x(AF1 a){return AF2(a);}
 631   AF3 AF3_x(AF1 a){return AF3(a);}
 632   AF4 AF4_x(AF1 a){return AF4(a);}
 633   #define AF1_(a) AF1_x(AF1(a)) // Macro front-ends convert the argument with AF1() before splatting.
 634   #define AF2_(a) AF2_x(AF1(a))
 635   #define AF3_(a) AF3_x(AF1(a))
 636   #define AF4_(a) AF4_x(AF1(a))
 637  //------------------------------------------------------------------------------------------------------------------------------
 638   AU1 AU1_x(AU1 a){return a;} // Splat helpers: copy one uint into every component of the result.
 639   AU2 AU2_x(AU1 a){return AU2(a);}
 640   AU3 AU3_x(AU1 a){return AU3(a);}
 641   AU4 AU4_x(AU1 a){return AU4(a);}
 642   #define AU1_(a) AU1_x(AU1(a)) // Macro front-ends convert the argument with AU1() before splatting.
 643   #define AU2_(a) AU2_x(AU1(a))
 644   #define AU3_(a) AU3_x(AU1(a))
 645   #define AU4_(a) AU4_x(AU1(a))
 646  //==============================================================================================================================
 647   AU1 AAbsSU1(AU1 a){ASU1 s=ASU1(a);return AU1(abs(s));} // abs() of the int-converted input, converted back to uint.
 648   AU2 AAbsSU2(AU2 a){ASU2 s=ASU2(a);return AU2(abs(s));}
 649   AU3 AAbsSU3(AU3 a){ASU3 s=ASU3(a);return AU3(abs(s));}
 650   AU4 AAbsSU4(AU4 a){ASU4 s=ASU4(a);return AU4(abs(s));}
 651  //------------------------------------------------------------------------------------------------------------------------------
 652   AU1 ABfe(AU1 src,AU1 off,AU1 bits){ASU1 o=ASU1(off);ASU1 n=ASU1(bits);return bitfieldExtract(src,o,n);} // Extract 'bits' bits of src starting at bit 'off'.
 653   AU1 ABfi(AU1 src,AU1 ins,AU1 mask){AU1 put=ins&mask;AU1 keep=src&(~mask);return put|keep;} // Take ins where mask bits are 1, src where they are 0.
 654   // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
 655   AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){ASU1 n=ASU1(bits);return bitfieldInsert(src,ins,0,n);} // Insert the low 'bits' bits of ins at bit 0 of src.
 656  //------------------------------------------------------------------------------------------------------------------------------
 657   // V_MED3_F32. Thin wrappers over GLSL clamp(x,n,m), scalar and per-component vector forms.
 658   AF1 AClampF1(AF1 x,AF1 n,AF1 m){AF1 r=clamp(x,n,m);return r;}
 659   AF2 AClampF2(AF2 x,AF2 n,AF2 m){AF2 r=clamp(x,n,m);return r;}
 660   AF3 AClampF3(AF3 x,AF3 n,AF3 m){AF3 r=clamp(x,n,m);return r;}
 661   AF4 AClampF4(AF4 x,AF4 n,AF4 m){AF4 r=clamp(x,n,m);return r;}
 662  //------------------------------------------------------------------------------------------------------------------------------
 663   // V_FRACT_F32 (note DX frac() is different). Thin wrappers over GLSL fract().
 664   AF1 AFractF1(AF1 x){AF1 r=fract(x);return r;}
 665   AF2 AFractF2(AF2 x){AF2 r=fract(x);return r;}
 666   AF3 AFractF3(AF3 x){AF3 r=fract(x);return r;}
 667   AF4 AFractF4(AF4 x){AF4 r=fract(x);return r;}
 668  //------------------------------------------------------------------------------------------------------------------------------
 669   AF1 ALerpF1(AF1 x,AF1 y,AF1 a){AF1 r=mix(x,y,a);return r;} // Thin wrappers over GLSL mix(x,y,a); the vector forms take a per-component weight.
 670   AF2 ALerpF2(AF2 x,AF2 y,AF2 a){AF2 r=mix(x,y,a);return r;}
 671   AF3 ALerpF3(AF3 x,AF3 y,AF3 a){AF3 r=mix(x,y,a);return r;}
 672   AF4 ALerpF4(AF4 x,AF4 y,AF4 a){AF4 r=mix(x,y,a);return r;}
 673  //------------------------------------------------------------------------------------------------------------------------------
 674   // V_MAX3_F32. Three-input maximum built from two nested max() calls (y,z first, then x).
 675   AF1 AMax3F1(AF1 x,AF1 y,AF1 z){AF1 yz=max(y,z);return max(x,yz);}
 676   AF2 AMax3F2(AF2 x,AF2 y,AF2 z){AF2 yz=max(y,z);return max(x,yz);}
 677   AF3 AMax3F3(AF3 x,AF3 y,AF3 z){AF3 yz=max(y,z);return max(x,yz);}
 678   AF4 AMax3F4(AF4 x,AF4 y,AF4 z){AF4 yz=max(y,z);return max(x,yz);}
 679  //------------------------------------------------------------------------------------------------------------------------------
 680   AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){ASU1 yz=max(ASU1(y),ASU1(z));return AU1(max(ASU1(x),yz));} // Three-input max on the int-converted inputs; result converted back to uint.
 681   AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){ASU2 yz=max(ASU2(y),ASU2(z));return AU2(max(ASU2(x),yz));}
 682   AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){ASU3 yz=max(ASU3(y),ASU3(z));return AU3(max(ASU3(x),yz));}
 683   AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){ASU4 yz=max(ASU4(y),ASU4(z));return AU4(max(ASU4(x),yz));}
 684  //------------------------------------------------------------------------------------------------------------------------------
 685   AU1 AMax3U1(AU1 x,AU1 y,AU1 z){AU1 yz=max(y,z);return max(x,yz);} // Three-input unsigned max from two nested max() calls.
 686   AU2 AMax3U2(AU2 x,AU2 y,AU2 z){AU2 yz=max(y,z);return max(x,yz);}
 687   AU3 AMax3U3(AU3 x,AU3 y,AU3 z){AU3 yz=max(y,z);return max(x,yz);}
 688   AU4 AMax3U4(AU4 x,AU4 y,AU4 z){AU4 yz=max(y,z);return max(x,yz);}
 689  //------------------------------------------------------------------------------------------------------------------------------
 690   AU1 AMaxSU1(AU1 a,AU1 b){ASU1 sa=ASU1(a);ASU1 sb=ASU1(b);return AU1(max(sa,sb));} // max() on the int-converted inputs; result converted back to uint.
 691   AU2 AMaxSU2(AU2 a,AU2 b){ASU2 sa=ASU2(a);ASU2 sb=ASU2(b);return AU2(max(sa,sb));}
 692   AU3 AMaxSU3(AU3 a,AU3 b){ASU3 sa=ASU3(a);ASU3 sb=ASU3(b);return AU3(max(sa,sb));}
 693   AU4 AMaxSU4(AU4 a,AU4 b){ASU4 sa=ASU4(a);ASU4 sb=ASU4(b);return AU4(max(sa,sb));}
 694  //------------------------------------------------------------------------------------------------------------------------------
 695   // Clamp has an easier pattern match for med3 when some ordering is known.
 696   // V_MED3_F32. Median of three: max(min(x,y),min(max(x,y),z)).
 697   AF1 AMed3F1(AF1 x,AF1 y,AF1 z){AF1 lo=min(x,y);AF1 hi=max(x,y);return max(lo,min(hi,z));}
 698   AF2 AMed3F2(AF2 x,AF2 y,AF2 z){AF2 lo=min(x,y);AF2 hi=max(x,y);return max(lo,min(hi,z));}
 699   AF3 AMed3F3(AF3 x,AF3 y,AF3 z){AF3 lo=min(x,y);AF3 hi=max(x,y);return max(lo,min(hi,z));}
 700   AF4 AMed3F4(AF4 x,AF4 y,AF4 z){AF4 lo=min(x,y);AF4 hi=max(x,y);return max(lo,min(hi,z));}
 701  //------------------------------------------------------------------------------------------------------------------------------
 702   // V_MIN3_F32. Three-input minimum built from two nested min() calls (y,z first, then x).
 703   AF1 AMin3F1(AF1 x,AF1 y,AF1 z){AF1 yz=min(y,z);return min(x,yz);}
 704   AF2 AMin3F2(AF2 x,AF2 y,AF2 z){AF2 yz=min(y,z);return min(x,yz);}
 705   AF3 AMin3F3(AF3 x,AF3 y,AF3 z){AF3 yz=min(y,z);return min(x,yz);}
 706   AF4 AMin3F4(AF4 x,AF4 y,AF4 z){AF4 yz=min(y,z);return min(x,yz);}
 707  //------------------------------------------------------------------------------------------------------------------------------
 708   AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){ASU1 yz=min(ASU1(y),ASU1(z));return AU1(min(ASU1(x),yz));} // Three-input min on the int-converted inputs; result converted back to uint.
 709   AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){ASU2 yz=min(ASU2(y),ASU2(z));return AU2(min(ASU2(x),yz));}
 710   AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){ASU3 yz=min(ASU3(y),ASU3(z));return AU3(min(ASU3(x),yz));}
 711   AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){ASU4 yz=min(ASU4(y),ASU4(z));return AU4(min(ASU4(x),yz));}
 712  //------------------------------------------------------------------------------------------------------------------------------
 713   AU1 AMin3U1(AU1 x,AU1 y,AU1 z){AU1 yz=min(y,z);return min(x,yz);} // Three-input unsigned min from two nested min() calls.
 714   AU2 AMin3U2(AU2 x,AU2 y,AU2 z){AU2 yz=min(y,z);return min(x,yz);}
 715   AU3 AMin3U3(AU3 x,AU3 y,AU3 z){AU3 yz=min(y,z);return min(x,yz);}
 716   AU4 AMin3U4(AU4 x,AU4 y,AU4 z){AU4 yz=min(y,z);return min(x,yz);}
 717  //------------------------------------------------------------------------------------------------------------------------------
 718   AU1 AMinSU1(AU1 a,AU1 b){ASU1 sa=ASU1(a);ASU1 sb=ASU1(b);return AU1(min(sa,sb));} // min() on the int-converted inputs; result converted back to uint.
 719   AU2 AMinSU2(AU2 a,AU2 b){ASU2 sa=ASU2(a);ASU2 sb=ASU2(b);return AU2(min(sa,sb));}
 720   AU3 AMinSU3(AU3 a,AU3 b){ASU3 sa=ASU3(a);ASU3 sb=ASU3(b);return AU3(min(sa,sb));}
 721   AU4 AMinSU4(AU4 a,AU4 b){ASU4 sa=ASU4(a);ASU4 sb=ASU4(b);return AU4(min(sa,sb));}
 722  //------------------------------------------------------------------------------------------------------------------------------
 723   // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
 724   // V_COS_F32. The input is multiplied by A_2PI before being passed to cos().
 725   AF1 ANCosF1(AF1 x){AF1 t=x*AF1_(A_2PI);return cos(t);}
 726   AF2 ANCosF2(AF2 x){AF2 t=x*AF2_(A_2PI);return cos(t);}
 727   AF3 ANCosF3(AF3 x){AF3 t=x*AF3_(A_2PI);return cos(t);}
 728   AF4 ANCosF4(AF4 x){AF4 t=x*AF4_(A_2PI);return cos(t);}
 729  //------------------------------------------------------------------------------------------------------------------------------
 730   // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
 731   // V_SIN_F32. The input is multiplied by A_2PI before being passed to sin().
 732   AF1 ANSinF1(AF1 x){AF1 t=x*AF1_(A_2PI);return sin(t);}
 733   AF2 ANSinF2(AF2 x){AF2 t=x*AF2_(A_2PI);return sin(t);}
 734   AF3 ANSinF3(AF3 x){AF3 t=x*AF3_(A_2PI);return sin(t);}
 735   AF4 ANSinF4(AF4 x){AF4 t=x*AF4_(A_2PI);return sin(t);}
 736  //------------------------------------------------------------------------------------------------------------------------------
 737   AF1 ARcpF1(AF1 x){AF1 one=AF1_(1.0);return one/x;} // Reciprocal written as 1.0/x (per component for vectors).
 738   AF2 ARcpF2(AF2 x){AF2 one=AF2_(1.0);return one/x;}
 739   AF3 ARcpF3(AF3 x){AF3 one=AF3_(1.0);return one/x;}
 740   AF4 ARcpF4(AF4 x){AF4 one=AF4_(1.0);return one/x;}
 741  //------------------------------------------------------------------------------------------------------------------------------
 742   AF1 ARsqF1(AF1 x){AF1 root=sqrt(x);return AF1_(1.0)/root;} // Reciprocal square root written as 1.0/sqrt(x).
 743   AF2 ARsqF2(AF2 x){AF2 root=sqrt(x);return AF2_(1.0)/root;}
 744   AF3 ARsqF3(AF3 x){AF3 root=sqrt(x);return AF3_(1.0)/root;}
 745   AF4 ARsqF4(AF4 x){AF4 root=sqrt(x);return AF4_(1.0)/root;}
 746  //------------------------------------------------------------------------------------------------------------------------------
 747   AF1 ASatF1(AF1 x){AF1 lo=AF1_(0.0);AF1 hi=AF1_(1.0);return clamp(x,lo,hi);} // Saturate: clamp x to [0.0,1.0] (per component for vectors).
 748   AF2 ASatF2(AF2 x){AF2 lo=AF2_(0.0);AF2 hi=AF2_(1.0);return clamp(x,lo,hi);}
 749   AF3 ASatF3(AF3 x){AF3 lo=AF3_(0.0);AF3 hi=AF3_(1.0);return clamp(x,lo,hi);}
 750   AF4 ASatF4(AF4 x){AF4 lo=AF4_(0.0);AF4 hi=AF4_(1.0);return clamp(x,lo,hi);}
 751  //------------------------------------------------------------------------------------------------------------------------------
 752   AU1 AShrSU1(AU1 a,AU1 b){ASU1 s=ASU1(a)>>ASU1(b);return AU1(s);} // Right shift performed on the int-converted operands; result converted back to uint.
 753   AU2 AShrSU2(AU2 a,AU2 b){ASU2 s=ASU2(a)>>ASU2(b);return AU2(s);}
 754   AU3 AShrSU3(AU3 a,AU3 b){ASU3 s=ASU3(a)>>ASU3(b);return AU3(s);}
 755   AU4 AShrSU4(AU4 a,AU4 b){ASU4 s=ASU4(a)>>ASU4(b);return AU4(s);}
 756  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 757  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 758  //_____________________________________________________________/\_______________________________________________________________
 759  //==============================================================================================================================
 760  //                                                          GLSL BYTE
 761  //==============================================================================================================================
 762   #ifdef A_BYTE
 763    #define AB1 uint8_t // 8-bit unsigned scalar and vector aliases (only when A_BYTE is defined).
 764    #define AB2 u8vec2
 765    #define AB3 u8vec3
 766    #define AB4 u8vec4
 767  //------------------------------------------------------------------------------------------------------------------------------
 768    #define ASB1 int8_t // 8-bit signed scalar and vector aliases.
 769    #define ASB2 i8vec2
 770    #define ASB3 i8vec3
 771    #define ASB4 i8vec4
 772  //------------------------------------------------------------------------------------------------------------------------------
 773    AB1 AB1_x(AB1 a){return a;} // Splat helpers: copy one 8-bit value into every component of the result.
 774    AB2 AB2_x(AB1 a){AB2 r=AB2(a,a);return r;}
 775    AB3 AB3_x(AB1 a){AB3 r=AB3(a,a,a);return r;}
 776    AB4 AB4_x(AB1 a){AB4 r=AB4(a,a,a,a);return r;}
 777    #define AB1_(a) AB1_x(AB1(a)) // Macro front-ends convert the argument with AB1() before splatting.
 778    #define AB2_(a) AB2_x(AB1(a))
 779    #define AB3_(a) AB3_x(AB1(a))
 780    #define AB4_(a) AB4_x(AB1(a))
 781   #endif
 782  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 783  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 784  //_____________________________________________________________/\_______________________________________________________________
 785  //==============================================================================================================================
 786  //                                                          GLSL HALF
 787  //==============================================================================================================================
 788   #ifdef A_HALF
 789    #define AH1 float16_t // 16-bit float scalar and vector aliases.
 790    #define AH2 f16vec2
 791    #define AH3 f16vec3
 792    #define AH4 f16vec4
 793  //------------------------------------------------------------------------------------------------------------------------------
 794    #define AW1 uint16_t // 16-bit unsigned integer scalar and vector aliases.
 795    #define AW2 u16vec2
 796    #define AW3 u16vec3
 797    #define AW4 u16vec4
 798  //------------------------------------------------------------------------------------------------------------------------------
 799    #define ASW1 int16_t // 16-bit signed integer scalar and vector aliases.
 800    #define ASW2 i16vec2
 801    #define ASW3 i16vec3
 802    #define ASW4 i16vec4
 803  //==============================================================================================================================
 804    #define AH2_AU1(x) unpackFloat2x16(AU1(x)) // One uint -> two halfs.
 805    AH4 AH4_AU2_x(AU2 x){AH2 lo=unpackFloat2x16(x.x);AH2 hi=unpackFloat2x16(x.y);return AH4(lo,hi);} // x.x unpacks to components 0,1 and x.y to components 2,3.
 806    #define AH4_AU2(x) AH4_AU2_x(AU2(x))
 807    #define AW2_AU1(x) unpackUint2x16(AU1(x))
 808    #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x)))
 809  //------------------------------------------------------------------------------------------------------------------------------
 810    #define AU1_AH2(x) packFloat2x16(AH2(x)) // Two halfs -> one uint.
 811    AU2 AU2_AH4_x(AH4 x){AU1 lo=packFloat2x16(x.xy);AU1 hi=packFloat2x16(x.zw);return AU2(lo,hi);} // x.xy packs into component 0 and x.zw into component 1.
 812    #define AU2_AH4(x) AU2_AH4_x(AH4(x))
 813    #define AU1_AW2(x) packUint2x16(AW2(x))
 814    #define AU2_AW4(x) unpack32(packUint4x16(AW4(x)))
 815  //==============================================================================================================================
 816    #define AW1_AH1(x) halfBitsToUint16(AH1(x)) // Half -> 16-bit uint via halfBitsToUint16(); argument is first converted with AHn().
 817    #define AW2_AH2(x) halfBitsToUint16(AH2(x))
 818    #define AW3_AH3(x) halfBitsToUint16(AH3(x))
 819    #define AW4_AH4(x) halfBitsToUint16(AH4(x))
 820  //------------------------------------------------------------------------------------------------------------------------------
 821    #define AH1_AW1(x) uint16BitsToHalf(AW1(x)) // 16-bit uint -> half via uint16BitsToHalf(); argument is first converted with AWn().
 822    #define AH2_AW2(x) uint16BitsToHalf(AW2(x))
 823    #define AH3_AW3(x) uint16BitsToHalf(AW3(x))
 824    #define AH4_AW4(x) uint16BitsToHalf(AW4(x))
 825  //==============================================================================================================================
 826    AH1 AH1_x(AH1 a){AH1 r=AH1(a);return r;} // Splat helpers: copy one AH1 scalar into every component of the result.
 827    AH2 AH2_x(AH1 a){AH2 r=AH2(a,a);return r;}
 828    AH3 AH3_x(AH1 a){AH3 r=AH3(a,a,a);return r;}
 829    AH4 AH4_x(AH1 a){AH4 r=AH4(a,a,a,a);return r;}
 830    #define AH1_(a) AH1_x(AH1(a)) // The AH*_() macros cast the argument to AH1 before splatting.
 831    #define AH2_(a) AH2_x(AH1(a))
 832    #define AH3_(a) AH3_x(AH1(a))
 833    #define AH4_(a) AH4_x(AH1(a))
 834  //------------------------------------------------------------------------------------------------------------------------------
 835    AW1 AW1_x(AW1 a){AW1 r=AW1(a);return r;} // Same splat helpers for the 16-bit unsigned types.
 836    AW2 AW2_x(AW1 a){AW2 r=AW2(a,a);return r;}
 837    AW3 AW3_x(AW1 a){AW3 r=AW3(a,a,a);return r;}
 838    AW4 AW4_x(AW1 a){AW4 r=AW4(a,a,a,a);return r;}
 839    #define AW1_(a) AW1_x(AW1(a)) // The AW*_() macros cast the argument to AW1 before splatting.
 840    #define AW2_(a) AW2_x(AW1(a))
 841    #define AW3_(a) AW3_x(AW1(a))
 842    #define AW4_(a) AW4_x(AW1(a))
 843  //==============================================================================================================================
 844    AW1 AAbsSW1(AW1 a){ASW1 s=ASW1(a);return AW1(abs(s));} // abs() applied to the signed (ASW*) view of 'a'; result cast back to AW*.
 845    AW2 AAbsSW2(AW2 a){ASW2 s=ASW2(a);return AW2(abs(s));}
 846    AW3 AAbsSW3(AW3 a){ASW3 s=ASW3(a);return AW3(abs(s));}
 847    AW4 AAbsSW4(AW4 a){ASW4 s=ASW4(a);return AW4(abs(s));}
 848  //------------------------------------------------------------------------------------------------------------------------------
 849    AH1 AClampH1(AH1 x,AH1 n,AH1 m){AH1 r=clamp(x,n,m);return r;} // Clamp 'x' to [n,m] using the built-in clamp.
 850    AH2 AClampH2(AH2 x,AH2 n,AH2 m){AH2 r=clamp(x,n,m);return r;}
 851    AH3 AClampH3(AH3 x,AH3 n,AH3 m){AH3 r=clamp(x,n,m);return r;}
 852    AH4 AClampH4(AH4 x,AH4 n,AH4 m){AH4 r=clamp(x,n,m);return r;}
 853  //------------------------------------------------------------------------------------------------------------------------------
 854    AH1 AFractH1(AH1 x){AH1 f=fract(x);return f;} // Fractional part via the built-in fract.
 855    AH2 AFractH2(AH2 x){AH2 f=fract(x);return f;}
 856    AH3 AFractH3(AH3 x){AH3 f=fract(x);return f;}
 857    AH4 AFractH4(AH4 x){AH4 f=fract(x);return f;}
 858  //------------------------------------------------------------------------------------------------------------------------------
 859    AH1 ALerpH1(AH1 x,AH1 y,AH1 a){AH1 r=mix(x,y,a);return r;} // Linear interpolation from 'x' to 'y' by 'a' via mix.
 860    AH2 ALerpH2(AH2 x,AH2 y,AH2 a){AH2 r=mix(x,y,a);return r;}
 861    AH3 ALerpH3(AH3 x,AH3 y,AH3 a){AH3 r=mix(x,y,a);return r;}
 862    AH4 ALerpH4(AH4 x,AH4 y,AH4 a){AH4 r=mix(x,y,a);return r;}
 863  //------------------------------------------------------------------------------------------------------------------------------
 864    // No packed version of max3.
 865    AH1 AMax3H1(AH1 x,AH1 y,AH1 z){AH1 yz=max(y,z);return max(x,yz);} // Three-way maximum composed from two max calls.
 866    AH2 AMax3H2(AH2 x,AH2 y,AH2 z){AH2 yz=max(y,z);return max(x,yz);}
 867    AH3 AMax3H3(AH3 x,AH3 y,AH3 z){AH3 yz=max(y,z);return max(x,yz);}
 868    AH4 AMax3H4(AH4 x,AH4 y,AH4 z){AH4 yz=max(y,z);return max(x,yz);}
 869  //------------------------------------------------------------------------------------------------------------------------------
 870    AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));} // Signed maximum of values stored as AW*: operands are compared through the 16-bit signed ASW* view.
 871    AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));} // Why ASW* and not ASU*: widening uint16 to a 32-bit int zero-extends, which would make this an unsigned compare.
 872    AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
 873    AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
 874  //------------------------------------------------------------------------------------------------------------------------------
 875    // No packed version of min3.
 876    AH1 AMin3H1(AH1 x,AH1 y,AH1 z){AH1 yz=min(y,z);return min(x,yz);} // Three-way minimum composed from two min calls.
 877    AH2 AMin3H2(AH2 x,AH2 y,AH2 z){AH2 yz=min(y,z);return min(x,yz);}
 878    AH3 AMin3H3(AH3 x,AH3 y,AH3 z){AH3 yz=min(y,z);return min(x,yz);}
 879    AH4 AMin3H4(AH4 x,AH4 y,AH4 z){AH4 yz=min(y,z);return min(x,yz);}
 880  //------------------------------------------------------------------------------------------------------------------------------
 881    AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));} // Signed minimum of values stored as AW*: operands are compared through the 16-bit signed ASW* view.
 882    AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));} // Why ASW* and not ASU*: widening uint16 to a 32-bit int zero-extends, which would make this an unsigned compare.
 883    AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
 884    AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
 885  //------------------------------------------------------------------------------------------------------------------------------
 886    AH1 ARcpH1(AH1 x){AH1 one=AH1_(1.0);return one/x;} // Reciprocal written as 1.0/x in the half type.
 887    AH2 ARcpH2(AH2 x){AH2 one=AH2_(1.0);return one/x;}
 888    AH3 ARcpH3(AH3 x){AH3 one=AH3_(1.0);return one/x;}
 889    AH4 ARcpH4(AH4 x){AH4 one=AH4_(1.0);return one/x;}
 890  //------------------------------------------------------------------------------------------------------------------------------
 891    AH1 ARsqH1(AH1 x){AH1 root=sqrt(x);return AH1_(1.0)/root;} // Reciprocal square root written as 1.0/sqrt(x).
 892    AH2 ARsqH2(AH2 x){AH2 root=sqrt(x);return AH2_(1.0)/root;}
 893    AH3 ARsqH3(AH3 x){AH3 root=sqrt(x);return AH3_(1.0)/root;}
 894    AH4 ARsqH4(AH4 x){AH4 root=sqrt(x);return AH4_(1.0)/root;}
 895  //------------------------------------------------------------------------------------------------------------------------------
 896    AH1 ASatH1(AH1 x){AH1 lo=AH1_(0.0);AH1 hi=AH1_(1.0);return clamp(x,lo,hi);} // Saturate: clamp 'x' to [0.0,1.0].
 897    AH2 ASatH2(AH2 x){AH2 lo=AH2_(0.0);AH2 hi=AH2_(1.0);return clamp(x,lo,hi);}
 898    AH3 ASatH3(AH3 x){AH3 lo=AH3_(0.0);AH3 hi=AH3_(1.0);return clamp(x,lo,hi);}
 899    AH4 ASatH4(AH4 x){AH4 lo=AH4_(0.0);AH4 hi=AH4_(1.0);return clamp(x,lo,hi);}
 900  //------------------------------------------------------------------------------------------------------------------------------
 901    AW1 AShrSW1(AW1 a,AW1 b){ASW1 v=ASW1(a);ASW1 s=ASW1(b);return AW1(v>>s);} // Shift right performed on the signed (ASW*) view of both operands; result cast back to AW*.
 902    AW2 AShrSW2(AW2 a,AW2 b){ASW2 v=ASW2(a);ASW2 s=ASW2(b);return AW2(v>>s);}
 903    AW3 AShrSW3(AW3 a,AW3 b){ASW3 v=ASW3(a);ASW3 s=ASW3(b);return AW3(v>>s);}
 904    AW4 AShrSW4(AW4 a,AW4 b){ASW4 v=ASW4(a);ASW4 s=ASW4(b);return AW4(v>>s);}
 905   #endif
 906  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 907  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 908  //_____________________________________________________________/\_______________________________________________________________
 909  //==============================================================================================================================
 910  //                                                         GLSL DOUBLE
 911  //==============================================================================================================================
 912   #ifdef A_DUBL
 913    #define AD1 double // AD1..AD4: aliases of double and the dvec* types.
 914    #define AD2 dvec2
 915    #define AD3 dvec3
 916    #define AD4 dvec4
 917  //------------------------------------------------------------------------------------------------------------------------------
 918    AD1 AD1_x(AD1 a){AD1 r=AD1(a);return r;} // Splat helpers: copy one AD1 scalar into every component of the result.
 919    AD2 AD2_x(AD1 a){AD2 r=AD2(a,a);return r;}
 920    AD3 AD3_x(AD1 a){AD3 r=AD3(a,a,a);return r;}
 921    AD4 AD4_x(AD1 a){AD4 r=AD4(a,a,a,a);return r;}
 922    #define AD1_(a) AD1_x(AD1(a)) // The AD*_() macros cast the argument to AD1 before splatting.
 923    #define AD2_(a) AD2_x(AD1(a))
 924    #define AD3_(a) AD3_x(AD1(a))
 925    #define AD4_(a) AD4_x(AD1(a))
 926  //==============================================================================================================================
 927    AD1 AFractD1(AD1 x){AD1 f=fract(x);return f;} // Fractional part via the built-in fract.
 928    AD2 AFractD2(AD2 x){AD2 f=fract(x);return f;}
 929    AD3 AFractD3(AD3 x){AD3 f=fract(x);return f;}
 930    AD4 AFractD4(AD4 x){AD4 f=fract(x);return f;}
 931  //------------------------------------------------------------------------------------------------------------------------------
 932    AD1 ALerpD1(AD1 x,AD1 y,AD1 a){AD1 r=mix(x,y,a);return r;} // Linear interpolation from 'x' to 'y' by 'a' via mix.
 933    AD2 ALerpD2(AD2 x,AD2 y,AD2 a){AD2 r=mix(x,y,a);return r;}
 934    AD3 ALerpD3(AD3 x,AD3 y,AD3 a){AD3 r=mix(x,y,a);return r;}
 935    AD4 ALerpD4(AD4 x,AD4 y,AD4 a){AD4 r=mix(x,y,a);return r;}
 936  //------------------------------------------------------------------------------------------------------------------------------
 937    AD1 ARcpD1(AD1 x){AD1 one=AD1_(1.0);return one/x;} // Reciprocal written as 1.0/x in the double type.
 938    AD2 ARcpD2(AD2 x){AD2 one=AD2_(1.0);return one/x;}
 939    AD3 ARcpD3(AD3 x){AD3 one=AD3_(1.0);return one/x;}
 940    AD4 ARcpD4(AD4 x){AD4 one=AD4_(1.0);return one/x;}
 941  //------------------------------------------------------------------------------------------------------------------------------
 942    AD1 ARsqD1(AD1 x){AD1 root=sqrt(x);return AD1_(1.0)/root;} // Reciprocal square root written as 1.0/sqrt(x).
 943    AD2 ARsqD2(AD2 x){AD2 root=sqrt(x);return AD2_(1.0)/root;}
 944    AD3 ARsqD3(AD3 x){AD3 root=sqrt(x);return AD3_(1.0)/root;}
 945    AD4 ARsqD4(AD4 x){AD4 root=sqrt(x);return AD4_(1.0)/root;}
 946  //------------------------------------------------------------------------------------------------------------------------------
 947    AD1 ASatD1(AD1 x){AD1 lo=AD1_(0.0);AD1 hi=AD1_(1.0);return clamp(x,lo,hi);} // Saturate: clamp 'x' to [0.0,1.0].
 948    AD2 ASatD2(AD2 x){AD2 lo=AD2_(0.0);AD2 hi=AD2_(1.0);return clamp(x,lo,hi);}
 949    AD3 ASatD3(AD3 x){AD3 lo=AD3_(0.0);AD3 hi=AD3_(1.0);return clamp(x,lo,hi);}
 950    AD4 ASatD4(AD4 x){AD4 lo=AD4_(0.0);AD4 hi=AD4_(1.0);return clamp(x,lo,hi);}
 951   #endif
 952  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 953  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 954  //_____________________________________________________________/\_______________________________________________________________
 955  //==============================================================================================================================
 956  //                                                         GLSL LONG
 957  //==============================================================================================================================
 958   #ifdef A_LONG
 959    #define AL1 uint64_t // AL1..AL4: aliases of uint64_t and the u64vec* types.
 960    #define AL2 u64vec2
 961    #define AL3 u64vec3
 962    #define AL4 u64vec4
 963  //------------------------------------------------------------------------------------------------------------------------------
 964    #define ASL1 int64_t // ASL1..ASL4: aliases of int64_t and the i64vec* types (the signed view used by the *SL* helpers).
 965    #define ASL2 i64vec2
 966    #define ASL3 i64vec3
 967    #define ASL4 i64vec4
 968  //------------------------------------------------------------------------------------------------------------------------------
 969    #define AL1_AU2(x) packUint2x32(AU2(x)) // Cast to AU2, then join into one AL1 via packUint2x32.
 970    #define AU2_AL1(x) unpackUint2x32(AL1(x)) // Cast to AL1, then split into an AU2 via unpackUint2x32.
 971  //------------------------------------------------------------------------------------------------------------------------------
 972    AL1 AL1_x(AL1 a){AL1 r=AL1(a);return r;} // Splat helpers: copy one AL1 scalar into every component of the result.
 973    AL2 AL2_x(AL1 a){AL2 r=AL2(a,a);return r;}
 974    AL3 AL3_x(AL1 a){AL3 r=AL3(a,a,a);return r;}
 975    AL4 AL4_x(AL1 a){AL4 r=AL4(a,a,a,a);return r;}
 976    #define AL1_(a) AL1_x(AL1(a)) // The AL*_() macros cast the argument to AL1 before splatting.
 977    #define AL2_(a) AL2_x(AL1(a))
 978    #define AL3_(a) AL3_x(AL1(a))
 979    #define AL4_(a) AL4_x(AL1(a))
 980  //==============================================================================================================================
 981    AL1 AAbsSL1(AL1 a){ASL1 s=ASL1(a);return AL1(abs(s));} // abs() applied to the signed (ASL*) view of 'a'; result cast back to AL*.
 982    AL2 AAbsSL2(AL2 a){ASL2 s=ASL2(a);return AL2(abs(s));}
 983    AL3 AAbsSL3(AL3 a){ASL3 s=ASL3(a);return AL3(abs(s));}
 984    AL4 AAbsSL4(AL4 a){ASL4 s=ASL4(a);return AL4(abs(s));}
 985  //------------------------------------------------------------------------------------------------------------------------------
 986    AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASL1(a),ASL1(b)));} // Signed maximum of values stored as AL*: operands are compared through the 64-bit signed ASL* view.
 987    AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASL2(a),ASL2(b)));} // Why ASL* and not ASU*: casting to the 32-bit ASU* types would discard the upper 32 bits before the compare.
 988    AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASL3(a),ASL3(b)));}
 989    AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASL4(a),ASL4(b)));}
 990  //------------------------------------------------------------------------------------------------------------------------------
 991    AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASL1(a),ASL1(b)));} // Signed minimum of values stored as AL*: operands are compared through the 64-bit signed ASL* view.
 992    AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASL2(a),ASL2(b)));} // Why ASL* and not ASU*: casting to the 32-bit ASU* types would discard the upper 32 bits before the compare.
 993    AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASL3(a),ASL3(b)));}
 994    AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASL4(a),ASL4(b)));}
 995   #endif
 996  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 997  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 998  //_____________________________________________________________/\_______________________________________________________________
 999  //==============================================================================================================================
1000  //                                                      WAVE OPERATIONS
1001  //==============================================================================================================================
1002   #ifdef A_WAVE
1003    // Where 'x' must be a compile time literal.
1004    AF1 AWaveXorF1(AF1 v,AU1 x){AF1 r=subgroupShuffleXor(v,x);return r;} // Wrappers that forward 'v' and 'x' to subgroupShuffleXor, one per float/uint vector width.
1005    AF2 AWaveXorF2(AF2 v,AU1 x){AF2 r=subgroupShuffleXor(v,x);return r;}
1006    AF3 AWaveXorF3(AF3 v,AU1 x){AF3 r=subgroupShuffleXor(v,x);return r;}
1007    AF4 AWaveXorF4(AF4 v,AU1 x){AF4 r=subgroupShuffleXor(v,x);return r;}
1008    AU1 AWaveXorU1(AU1 v,AU1 x){AU1 r=subgroupShuffleXor(v,x);return r;}
1009    AU2 AWaveXorU2(AU2 v,AU1 x){AU2 r=subgroupShuffleXor(v,x);return r;}
1010    AU3 AWaveXorU3(AU3 v,AU1 x){AU3 r=subgroupShuffleXor(v,x);return r;}
1011    AU4 AWaveXorU4(AU4 v,AU1 x){AU4 r=subgroupShuffleXor(v,x);return r;}
1012  //------------------------------------------------------------------------------------------------------------------------------
1013    #ifdef A_HALF
1014     AH2 AWaveXorH2(AH2 v,AU1 x){AU1 p=AU1_AH2(v);return AH2_AU1(subgroupShuffleXor(p,x));} // 16-bit variants: pack into AU1/AU2, shuffle the packed words, then unpack.
1015     AH4 AWaveXorH4(AH4 v,AU1 x){AU2 p=AU2_AH4(v);return AH4_AU2(subgroupShuffleXor(p,x));}
1016     AW2 AWaveXorW2(AW2 v,AU1 x){AU1 p=AU1_AW2(v);return AW2_AU1(subgroupShuffleXor(p,x));}
1017     AW4 AWaveXorW4(AW4 v,AU1 x){AU2 p=AU2_AW4(v);return AW4_AU2(subgroupShuffleXor(p,x));}
1018    #endif
1019   #endif
1020  //==============================================================================================================================
1021  #endif
1022  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1023  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1024  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1025  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1026  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1027  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1028  //_____________________________________________________________/\_______________________________________________________________
1029  //==============================================================================================================================
1030  //
1031  //
1032  //                                                            HLSL
1033  //
1034  //
1035  //==============================================================================================================================
1036  #if defined(A_HLSL) && defined(A_GPU)
1037   #ifdef A_HLSL_6_2 // With A_HLSL_6_2 the explicit-width names (float32_t, uint32_t, int32_t) are used.
1038    #define AP1 bool // AP*: boolean scalar and vectors.
1039    #define AP2 bool2
1040    #define AP3 bool3
1041    #define AP4 bool4
1042  //------------------------------------------------------------------------------------------------------------------------------
1043    #define AF1 float32_t // AF*: float scalar and vectors.
1044    #define AF2 float32_t2
1045    #define AF3 float32_t3
1046    #define AF4 float32_t4
1047  //------------------------------------------------------------------------------------------------------------------------------
1048    #define AU1 uint32_t // AU*: unsigned integer scalar and vectors.
1049    #define AU2 uint32_t2
1050    #define AU3 uint32_t3
1051    #define AU4 uint32_t4
1052  //------------------------------------------------------------------------------------------------------------------------------
1053    #define ASU1 int32_t // ASU*: signed integer scalar and vectors.
1054    #define ASU2 int32_t2
1055    #define ASU3 int32_t3
1056    #define ASU4 int32_t4
1057   #else // Otherwise the same aliases map to the plain HLSL names (float, uint, int).
1058    #define AP1 bool
1059    #define AP2 bool2
1060    #define AP3 bool3
1061    #define AP4 bool4
1062  //------------------------------------------------------------------------------------------------------------------------------
1063    #define AF1 float
1064    #define AF2 float2
1065    #define AF3 float3
1066    #define AF4 float4
1067  //------------------------------------------------------------------------------------------------------------------------------
1068    #define AU1 uint
1069    #define AU2 uint2
1070    #define AU3 uint3
1071    #define AU4 uint4
1072  //------------------------------------------------------------------------------------------------------------------------------
1073    #define ASU1 int
1074    #define ASU2 int2
1075    #define ASU3 int3
1076    #define ASU4 int4
1077   #endif
1078  //==============================================================================================================================
1079   #define AF1_AU1(x) asfloat(AU1(x)) // AF*_AU*: cast to the AU* type, then pass through asfloat.
1080   #define AF2_AU2(x) asfloat(AU2(x))
1081   #define AF3_AU3(x) asfloat(AU3(x))
1082   #define AF4_AU4(x) asfloat(AU4(x))
1083  //------------------------------------------------------------------------------------------------------------------------------
1084   #define AU1_AF1(x) asuint(AF1(x)) // AU*_AF*: cast to the AF* type, then pass through asuint.
1085   #define AU2_AF2(x) asuint(AF2(x))
1086   #define AU3_AF3(x) asuint(AF3(x))
1087   #define AU4_AF4(x) asuint(AF4(x))
1088  //------------------------------------------------------------------------------------------------------------------------------
1089   AU1 AU1_AH1_AF1_x(AF1 a){return f32tof16(a);} // Returns the f32tof16 conversion of one float.
1090   #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a)) // Macro form casts its argument to AF1 first.
1091  //------------------------------------------------------------------------------------------------------------------------------
1092   AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);} // a.x converted into the low bits, a.y converted and shifted left by 16.
1093   #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a)) 
1094   #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x)) // NOTE(review): D3DCOLORtoUBYTE4 is documented as returning a 4-component integer vector, not a packed AU1 - confirm intended use.
1095  //------------------------------------------------------------------------------------------------------------------------------
1096   AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));} // Low 16 bits become .x, the bits above become .y, each through f16tof32.
1097   #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x)) // Macro form casts its argument to AU1 first.
1098  //==============================================================================================================================
1099   AF1 AF1_x(AF1 a){AF1 r=AF1(a);return r;} // Splat helpers: copy one AF1 scalar into every component of the result.
1100   AF2 AF2_x(AF1 a){AF2 r=AF2(a,a);return r;}
1101   AF3 AF3_x(AF1 a){AF3 r=AF3(a,a,a);return r;}
1102   AF4 AF4_x(AF1 a){AF4 r=AF4(a,a,a,a);return r;}
1103   #define AF1_(a) AF1_x(AF1(a)) // The AF*_() macros cast the argument to AF1 before splatting.
1104   #define AF2_(a) AF2_x(AF1(a))
1105   #define AF3_(a) AF3_x(AF1(a))
1106   #define AF4_(a) AF4_x(AF1(a))
1107  //------------------------------------------------------------------------------------------------------------------------------
1108   AU1 AU1_x(AU1 a){AU1 r=AU1(a);return r;} // Same splat helpers for the unsigned integer types.
1109   AU2 AU2_x(AU1 a){AU2 r=AU2(a,a);return r;}
1110   AU3 AU3_x(AU1 a){AU3 r=AU3(a,a,a);return r;}
1111   AU4 AU4_x(AU1 a){AU4 r=AU4(a,a,a,a);return r;}
1112   #define AU1_(a) AU1_x(AU1(a)) // The AU*_() macros cast the argument to AU1 before splatting.
1113   #define AU2_(a) AU2_x(AU1(a))
1114   #define AU3_(a) AU3_x(AU1(a))
1115   #define AU4_(a) AU4_x(AU1(a))
1116  //==============================================================================================================================
1117   AU1 AAbsSU1(AU1 a){ASU1 s=ASU1(a);return AU1(abs(s));} // abs() applied to the signed (ASU*) view of 'a'; result cast back to AU*.
1118   AU2 AAbsSU2(AU2 a){ASU2 s=ASU2(a);return AU2(abs(s));}
1119   AU3 AAbsSU3(AU3 a){ASU3 s=ASU3(a);return AU3(abs(s));}
1120   AU4 AAbsSU4(AU4 a){ASU4 s=ASU4(a);return AU4(abs(s));}
1121  //------------------------------------------------------------------------------------------------------------------------------
1122   AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1u<<bits)-1;AU1 field=src>>off;return field&mask;} // Bit-field extract: 'src' shifted right by 'off', masked to the low 'bits' bits. NOTE(review): bits==32 depends on how 1u<<32 evaluates - confirm callers stay below 32.
1123   AU1 ABfi(AU1 src,AU1 ins,AU1 mask){AU1 taken=ins&mask;AU1 kept=src&(~mask);return taken|kept;} // Bit-field insert: bits of 'ins' where 'mask' is set, bits of 'src' elsewhere.
1124   AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1u<<bits)-1;AU1 taken=ins&mask;AU1 kept=src&(~mask);return taken|kept;} // Same insert, with the mask built from the low 'bits' bits.
1125  //------------------------------------------------------------------------------------------------------------------------------
1126   AF1 AClampF1(AF1 x,AF1 n,AF1 m){AF1 capped=min(x,m);return max(n,capped);} // Clamp written as max(n,min(x,m)): cap at 'm' first, then raise to 'n'.
1127   AF2 AClampF2(AF2 x,AF2 n,AF2 m){AF2 capped=min(x,m);return max(n,capped);}
1128   AF3 AClampF3(AF3 x,AF3 n,AF3 m){AF3 capped=min(x,m);return max(n,capped);}
1129   AF4 AClampF4(AF4 x,AF4 n,AF4 m){AF4 capped=min(x,m);return max(n,capped);}
1130  //------------------------------------------------------------------------------------------------------------------------------
1131   AF1 AFractF1(AF1 x){AF1 whole=floor(x);return x-whole;} // Fractional part written as x-floor(x).
1132   AF2 AFractF2(AF2 x){AF2 whole=floor(x);return x-whole;}
1133   AF3 AFractF3(AF3 x){AF3 whole=floor(x);return x-whole;}
1134   AF4 AFractF4(AF4 x){AF4 whole=floor(x);return x-whole;}
1135  //------------------------------------------------------------------------------------------------------------------------------
1136   AF1 ALerpF1(AF1 x,AF1 y,AF1 a){AF1 r=lerp(x,y,a);return r;} // Linear interpolation from 'x' to 'y' by 'a' via lerp.
1137   AF2 ALerpF2(AF2 x,AF2 y,AF2 a){AF2 r=lerp(x,y,a);return r;}
1138   AF3 ALerpF3(AF3 x,AF3 y,AF3 a){AF3 r=lerp(x,y,a);return r;}
1139   AF4 ALerpF4(AF4 x,AF4 y,AF4 a){AF4 r=lerp(x,y,a);return r;}
1140  //------------------------------------------------------------------------------------------------------------------------------
1141   AF1 AMax3F1(AF1 x,AF1 y,AF1 z){AF1 yz=max(y,z);return max(x,yz);} // Three-way float maximum composed from two max calls.
1142   AF2 AMax3F2(AF2 x,AF2 y,AF2 z){AF2 yz=max(y,z);return max(x,yz);}
1143   AF3 AMax3F3(AF3 x,AF3 y,AF3 z){AF3 yz=max(y,z);return max(x,yz);}
1144   AF4 AMax3F4(AF4 x,AF4 y,AF4 z){AF4 yz=max(y,z);return max(x,yz);}
1145  //------------------------------------------------------------------------------------------------------------------------------
1146   AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){ASU1 yz=max(ASU1(y),ASU1(z));return AU1(max(ASU1(x),yz));} // Three-way maximum on the signed (ASU*) view; result cast back to AU*.
1147   AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){ASU2 yz=max(ASU2(y),ASU2(z));return AU2(max(ASU2(x),yz));}
1148   AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){ASU3 yz=max(ASU3(y),ASU3(z));return AU3(max(ASU3(x),yz));}
1149   AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){ASU4 yz=max(ASU4(y),ASU4(z));return AU4(max(ASU4(x),yz));}
1150  //------------------------------------------------------------------------------------------------------------------------------
1151   AU1 AMax3U1(AU1 x,AU1 y,AU1 z){AU1 yz=max(y,z);return max(x,yz);} // Three-way unsigned maximum.
1152   AU2 AMax3U2(AU2 x,AU2 y,AU2 z){AU2 yz=max(y,z);return max(x,yz);}
1153   AU3 AMax3U3(AU3 x,AU3 y,AU3 z){AU3 yz=max(y,z);return max(x,yz);}
1154   AU4 AMax3U4(AU4 x,AU4 y,AU4 z){AU4 yz=max(y,z);return max(x,yz);}
1155  //------------------------------------------------------------------------------------------------------------------------------
1156   AU1 AMaxSU1(AU1 a,AU1 b){ASU1 sa=ASU1(a);ASU1 sb=ASU1(b);return AU1(max(sa,sb));} // Two-way maximum on the signed (ASU*) view; result cast back to AU*.
1157   AU2 AMaxSU2(AU2 a,AU2 b){ASU2 sa=ASU2(a);ASU2 sb=ASU2(b);return AU2(max(sa,sb));}
1158   AU3 AMaxSU3(AU3 a,AU3 b){ASU3 sa=ASU3(a);ASU3 sb=ASU3(b);return AU3(max(sa,sb));}
1159   AU4 AMaxSU4(AU4 a,AU4 b){ASU4 sa=ASU4(a);ASU4 sb=ASU4(b);return AU4(max(sa,sb));}
1160  //------------------------------------------------------------------------------------------------------------------------------
1161   AF1 AMed3F1(AF1 x,AF1 y,AF1 z){AF1 lo=min(x,y);AF1 hi=max(x,y);return max(lo,min(hi,z));} // Median of three: order x and y, then limit z between them.
1162   AF2 AMed3F2(AF2 x,AF2 y,AF2 z){AF2 lo=min(x,y);AF2 hi=max(x,y);return max(lo,min(hi,z));}
1163   AF3 AMed3F3(AF3 x,AF3 y,AF3 z){AF3 lo=min(x,y);AF3 hi=max(x,y);return max(lo,min(hi,z));}
1164   AF4 AMed3F4(AF4 x,AF4 y,AF4 z){AF4 lo=min(x,y);AF4 hi=max(x,y);return max(lo,min(hi,z));}
1165  //------------------------------------------------------------------------------------------------------------------------------
1166   AF1 AMin3F1(AF1 x,AF1 y,AF1 z){AF1 yz=min(y,z);return min(x,yz);} // Three-way float minimum composed from two min calls.
1167   AF2 AMin3F2(AF2 x,AF2 y,AF2 z){AF2 yz=min(y,z);return min(x,yz);}
1168   AF3 AMin3F3(AF3 x,AF3 y,AF3 z){AF3 yz=min(y,z);return min(x,yz);}
1169   AF4 AMin3F4(AF4 x,AF4 y,AF4 z){AF4 yz=min(y,z);return min(x,yz);}
1170  //------------------------------------------------------------------------------------------------------------------------------
1171   AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){ASU1 yz=min(ASU1(y),ASU1(z));return AU1(min(ASU1(x),yz));} // Three-way minimum on the signed (ASU*) view; result cast back to AU*.
1172   AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){ASU2 yz=min(ASU2(y),ASU2(z));return AU2(min(ASU2(x),yz));}
1173   AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){ASU3 yz=min(ASU3(y),ASU3(z));return AU3(min(ASU3(x),yz));}
1174   AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){ASU4 yz=min(ASU4(y),ASU4(z));return AU4(min(ASU4(x),yz));}
1175  //------------------------------------------------------------------------------------------------------------------------------
1176   AU1 AMin3U1(AU1 x,AU1 y,AU1 z){AU1 yz=min(y,z);return min(x,yz);} // Three-way unsigned minimum.
1177   AU2 AMin3U2(AU2 x,AU2 y,AU2 z){AU2 yz=min(y,z);return min(x,yz);}
1178   AU3 AMin3U3(AU3 x,AU3 y,AU3 z){AU3 yz=min(y,z);return min(x,yz);}
1179   AU4 AMin3U4(AU4 x,AU4 y,AU4 z){AU4 yz=min(y,z);return min(x,yz);}
1180  //------------------------------------------------------------------------------------------------------------------------------
1181   AU1 AMinSU1(AU1 a,AU1 b){ASU1 sa=ASU1(a);ASU1 sb=ASU1(b);return AU1(min(sa,sb));} // Two-way minimum on the signed (ASU*) view; result cast back to AU*.
1182   AU2 AMinSU2(AU2 a,AU2 b){ASU2 sa=ASU2(a);ASU2 sb=ASU2(b);return AU2(min(sa,sb));}
1183   AU3 AMinSU3(AU3 a,AU3 b){ASU3 sa=ASU3(a);ASU3 sb=ASU3(b);return AU3(min(sa,sb));}
1184   AU4 AMinSU4(AU4 a,AU4 b){ASU4 sa=ASU4(a);ASU4 sb=ASU4(b);return AU4(min(sa,sb));}
1185  //------------------------------------------------------------------------------------------------------------------------------
1186   AF1 ANCosF1(AF1 x){AF1 t=x*AF1_(A_2PI);return cos(t);} // Cosine of 'x' after multiplying it by A_2PI.
1187   AF2 ANCosF2(AF2 x){AF2 t=x*AF2_(A_2PI);return cos(t);}
1188   AF3 ANCosF3(AF3 x){AF3 t=x*AF3_(A_2PI);return cos(t);}
1189   AF4 ANCosF4(AF4 x){AF4 t=x*AF4_(A_2PI);return cos(t);}
1190  //------------------------------------------------------------------------------------------------------------------------------
1191   AF1 ANSinF1(AF1 x){AF1 t=x*AF1_(A_2PI);return sin(t);} // Sine of 'x' after multiplying it by A_2PI.
1192   AF2 ANSinF2(AF2 x){AF2 t=x*AF2_(A_2PI);return sin(t);}
1193   AF3 ANSinF3(AF3 x){AF3 t=x*AF3_(A_2PI);return sin(t);}
1194   AF4 ANSinF4(AF4 x){AF4 t=x*AF4_(A_2PI);return sin(t);}
1195  //------------------------------------------------------------------------------------------------------------------------------
1196   AF1 ARcpF1(AF1 x){AF1 r=rcp(x);return r;} // Reciprocal via the rcp intrinsic.
1197   AF2 ARcpF2(AF2 x){AF2 r=rcp(x);return r;}
1198   AF3 ARcpF3(AF3 x){AF3 r=rcp(x);return r;}
1199   AF4 ARcpF4(AF4 x){AF4 r=rcp(x);return r;}
1200  //------------------------------------------------------------------------------------------------------------------------------
1201   AF1 ARsqF1(AF1 x){AF1 r=rsqrt(x);return r;} // Reciprocal square root via the rsqrt intrinsic.
1202   AF2 ARsqF2(AF2 x){AF2 r=rsqrt(x);return r;}
1203   AF3 ARsqF3(AF3 x){AF3 r=rsqrt(x);return r;}
1204   AF4 ARsqF4(AF4 x){AF4 r=rsqrt(x);return r;}
1205  //------------------------------------------------------------------------------------------------------------------------------
1206   AF1 ASatF1(AF1 x){AF1 r=saturate(x);return r;} // Saturate via the saturate intrinsic.
1207   AF2 ASatF2(AF2 x){AF2 r=saturate(x);return r;}
1208   AF3 ASatF3(AF3 x){AF3 r=saturate(x);return r;}
1209   AF4 ASatF4(AF4 x){AF4 r=saturate(x);return r;}
1210  //------------------------------------------------------------------------------------------------------------------------------
1211   AU1 AShrSU1(AU1 a,AU1 b){ASU1 v=ASU1(a);ASU1 s=ASU1(b);return AU1(v>>s);} // Shift right performed on the signed (ASU*) view of both operands; result cast back to AU*.
1212   AU2 AShrSU2(AU2 a,AU2 b){ASU2 v=ASU2(a);ASU2 s=ASU2(b);return AU2(v>>s);}
1213   AU3 AShrSU3(AU3 a,AU3 b){ASU3 v=ASU3(a);ASU3 s=ASU3(b);return AU3(v>>s);}
1214   AU4 AShrSU4(AU4 a,AU4 b){ASU4 v=ASU4(a);ASU4 s=ASU4(b);return AU4(v>>s);}
1215  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1216  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1217  //_____________________________________________________________/\_______________________________________________________________
1218  //==============================================================================================================================
1219  //                                                          HLSL BYTE
1220  //==============================================================================================================================
1221   #ifdef A_BYTE
1222   #endif
1223  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1224  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1225  //_____________________________________________________________/\_______________________________________________________________
1226  //==============================================================================================================================
1227  //                                                          HLSL HALF
1228  //==============================================================================================================================
1229   #ifdef A_HALF
1230    #ifdef A_HLSL_6_2 // With A_HLSL_6_2 the explicit 16-bit names (float16_t, uint16_t, int16_t) are used.
1231     #define AH1 float16_t // AH*: half scalar and vectors.
1232     #define AH2 float16_t2
1233     #define AH3 float16_t3
1234     #define AH4 float16_t4
1235  //------------------------------------------------------------------------------------------------------------------------------
1236     #define AW1 uint16_t // AW*: 16-bit unsigned scalar and vectors.
1237     #define AW2 uint16_t2
1238     #define AW3 uint16_t3
1239     #define AW4 uint16_t4
1240  //------------------------------------------------------------------------------------------------------------------------------
1241     #define ASW1 int16_t // ASW*: 16-bit signed scalar and vectors.
1242     #define ASW2 int16_t2
1243     #define ASW3 int16_t3
1244     #define ASW4 int16_t4
1245    #else // Otherwise the same aliases map to the min16float / min16uint / min16int types.
1246     #define AH1 min16float
1247     #define AH2 min16float2
1248     #define AH3 min16float3
1249     #define AH4 min16float4
1250  //------------------------------------------------------------------------------------------------------------------------------
1251     #define AW1 min16uint
1252     #define AW2 min16uint2
1253     #define AW3 min16uint3
1254     #define AW4 min16uint4
1255  //------------------------------------------------------------------------------------------------------------------------------
1256     #define ASW1 min16int
1257     #define ASW2 min16int2
1258     #define ASW3 min16int3
1259     #define ASW4 min16int4
1260    #endif
1261  //==============================================================================================================================
1262    // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly).
1263    // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/
1264    AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);} // Low 16 bits -> .x, upper bits -> .y; converted with f16tof32 into an AF2, then cast to AH2.
1265    AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));} // x.x supplies .xy, x.y supplies .zw.
1266    AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);} // Same split for integers: low 16 bits -> .x, upper bits -> .y.
1267    AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));} // x.x supplies .xy, x.y supplies .zw.
1268    #define AH2_AU1(x) AH2_AU1_x(AU1(x)) // Macro forms cast the argument to AU1/AU2 before calling the helper.
1269    #define AH4_AU2(x) AH4_AU2_x(AU2(x))
1270    #define AW2_AU1(x) AW2_AU1_x(AU1(x))
1271    #define AW4_AU2(x) AW4_AU2_x(AU2(x))
1272  //------------------------------------------------------------------------------------------------------------------------------
1273    AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);} // Pack: f32tof16(x.x) plus f32tof16(x.y) shifted left by 16.
1274    AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));} // .xy is packed into word x, .zw into word y.
1275    AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);} // Pack: x.x widened to AU1 plus x.y widened and shifted left by 16.
1276    AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));} // .xy is packed into word x, .zw into word y.
1277    #define AU1_AH2(x) AU1_AH2_x(AH2(x)) // Macro forms cast the argument to the AH*/AW* type before calling the helper.
1278    #define AU2_AH4(x) AU2_AH4_x(AH4(x))
1279    #define AU1_AW2(x) AU1_AW2_x(AW2(x))
1280    #define AU2_AW4(x) AU2_AW4_x(AW4(x))
1281  //==============================================================================================================================
1282    #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) // Direct asuint16 path, taken unless A_NO_16_BIT_CAST is defined.
1283     #define AW1_AH1(x) asuint16(x)
1284     #define AW2_AH2(x) asuint16(x)
1285     #define AW3_AH3(x) asuint16(x)
1286     #define AW4_AH4(x) asuint16(x)
1287    #else // Fallback: widen to AF1, convert with f32tof16, and build vectors one component at a time.
1288     #define AW1_AH1(a) AW1(f32tof16(AF1(a)))
1289     #define AW2_AH2(a) AW2(AW1_AH1((a).x),AW1_AH1((a).y))
1290     #define AW3_AH3(a) AW3(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z))
1291     #define AW4_AH4(a) AW4(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z),AW1_AH1((a).w))
1292    #endif
1293  //------------------------------------------------------------------------------------------------------------------------------
1294    #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) // Direct asfloat16 path, taken unless A_NO_16_BIT_CAST is defined.
1295     #define AH1_AW1(x) asfloat16(x)
1296     #define AH2_AW2(x) asfloat16(x)
1297     #define AH3_AW3(x) asfloat16(x)
1298     #define AH4_AW4(x) asfloat16(x)
1299    #else // Fallback: widen to AU1, convert with f16tof32, and build vectors one component at a time.
1300     #define AH1_AW1(a) AH1(f16tof32(AU1(a)))
1301     #define AH2_AW2(a) AH2(AH1_AW1((a).x),AH1_AW1((a).y))
1302     #define AH3_AW3(a) AH3(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z))
1303     #define AH4_AW4(a) AH4(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z),AH1_AW1((a).w))
1304    #endif
1305  //==============================================================================================================================
        // Splat helpers: replicate one AH1 scalar into every component of the returned 16-bit float vector.
1306    AH1 AH1_x(AH1 a){return AH1(a);}
1307    AH2 AH2_x(AH1 a){return AH2(a,a);}
1308    AH3 AH3_x(AH1 a){return AH3(a,a,a);}
1309    AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
        // Macro forms cast the argument to AH1 before splatting, so literals of other types can be passed.
1310    #define AH1_(a) AH1_x(AH1(a))
1311    #define AH2_(a) AH2_x(AH1(a))
1312    #define AH3_(a) AH3_x(AH1(a))
1313    #define AH4_(a) AH4_x(AH1(a))
1314  //------------------------------------------------------------------------------------------------------------------------------
        // Splat helpers: replicate one AW1 scalar into every component of the returned 16-bit unsigned vector.
1315    AW1 AW1_x(AW1 a){return AW1(a);}
1316    AW2 AW2_x(AW1 a){return AW2(a,a);}
1317    AW3 AW3_x(AW1 a){return AW3(a,a,a);}
1318    AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
        // Macro forms cast the argument to AW1 before splatting.
1319    #define AW1_(a) AW1_x(AW1(a))
1320    #define AW2_(a) AW2_x(AW1(a))
1321    #define AW3_(a) AW3_x(AW1(a))
1322    #define AW4_(a) AW4_x(AW1(a))
1323  //==============================================================================================================================
        // Absolute value of 16-bit words treated as signed: cast to ASW*, apply abs(), cast back to AW*.
1324    AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));}
1325    AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
1326    AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
1327    AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
1328  //------------------------------------------------------------------------------------------------------------------------------
        // Clamp 'x' to the range {n..m}, written as max(n,min(x,m)); the lower bound 'n' is applied last.
1329    AH1 AClampH1(AH1 x,AH1 n,AH1 m){return max(n,min(x,m));}
1330    AH2 AClampH2(AH2 x,AH2 n,AH2 m){return max(n,min(x,m));}
1331    AH3 AClampH3(AH3 x,AH3 n,AH3 m){return max(n,min(x,m));}
1332    AH4 AClampH4(AH4 x,AH4 n,AH4 m){return max(n,min(x,m));}
1333  //------------------------------------------------------------------------------------------------------------------------------
1334   // V_FRACT_F16 (note DX frac() is different).
        // Fractional part computed explicitly as x-floor(x) instead of calling frac().
1335    AH1 AFractH1(AH1 x){return x-floor(x);}
1336    AH2 AFractH2(AH2 x){return x-floor(x);}
1337    AH3 AFractH3(AH3 x){return x-floor(x);}
1338    AH4 AFractH4(AH4 x){return x-floor(x);}
1339  //------------------------------------------------------------------------------------------------------------------------------
        // Linear interpolation between 'x' and 'y' by 'a'; forwards directly to the HLSL lerp() intrinsic.
1340    AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);}
1341    AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);}
1342    AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);}
1343    AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);}
1344  //------------------------------------------------------------------------------------------------------------------------------
        // Component-wise maximum of three values, written as two nested max() calls.
1345    AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
1346    AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
1347    AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
1348    AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
1349  //------------------------------------------------------------------------------------------------------------------------------
        // Signed maximum of 16-bit words: reinterpret both operands as signed 16-bit (ASW*), take max(), cast back to AW*.
        // The cast must stay 16 bits wide: widening an unsigned 16-bit word to a 32-bit signed type keeps its unsigned
        // value (0..65535), which would turn this into an unsigned comparison. AAbsSW*/AShrSW* use ASW* the same way.
1350    AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));}
1351    AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));}
1352    AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
1353    AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
1354  //------------------------------------------------------------------------------------------------------------------------------
        // Component-wise minimum of three values, written as two nested min() calls.
1355    AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
1356    AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
1357    AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
1358    AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
1359  //------------------------------------------------------------------------------------------------------------------------------
        // Signed minimum of 16-bit words: reinterpret both operands as signed 16-bit (ASW*), take min(), cast back to AW*.
        // The cast must stay 16 bits wide: widening an unsigned 16-bit word to a 32-bit signed type keeps its unsigned
        // value (0..65535), which would turn this into an unsigned comparison. AAbsSW*/AShrSW* use ASW* the same way.
1360    AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));}
1361    AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));}
1362    AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
1363    AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
1364  //------------------------------------------------------------------------------------------------------------------------------
        // Reciprocal; forwards directly to the HLSL rcp() intrinsic.
1365    AH1 ARcpH1(AH1 x){return rcp(x);}
1366    AH2 ARcpH2(AH2 x){return rcp(x);}
1367    AH3 ARcpH3(AH3 x){return rcp(x);}
1368    AH4 ARcpH4(AH4 x){return rcp(x);}
1369  //------------------------------------------------------------------------------------------------------------------------------
        // Reciprocal square root; forwards directly to the HLSL rsqrt() intrinsic.
1370    AH1 ARsqH1(AH1 x){return rsqrt(x);}
1371    AH2 ARsqH2(AH2 x){return rsqrt(x);}
1372    AH3 ARsqH3(AH3 x){return rsqrt(x);}
1373    AH4 ARsqH4(AH4 x){return rsqrt(x);}
1374  //------------------------------------------------------------------------------------------------------------------------------
        // Saturate; forwards directly to the HLSL saturate() intrinsic.
1375    AH1 ASatH1(AH1 x){return saturate(x);}
1376    AH2 ASatH2(AH2 x){return saturate(x);}
1377    AH3 ASatH3(AH3 x){return saturate(x);}
1378    AH4 ASatH4(AH4 x){return saturate(x);}
1379  //------------------------------------------------------------------------------------------------------------------------------
        // Right shift of 16-bit words performed on the signed type: both operands are cast to ASW*, shifted with >>,
        // and the result is cast back to AW*.
1380    AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));}
1381    AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
1382    AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
1383    AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
1384   #endif
1385  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1386  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1387  //_____________________________________________________________/\_______________________________________________________________
1388  //==============================================================================================================================
1389  //                                                         HLSL DOUBLE
1390  //==============================================================================================================================
1391   #ifdef A_DUBL
        // Double-precision type aliases (only when A_DUBL is defined): float64_t* under A_HLSL_6_2, double* otherwise.
1392    #ifdef A_HLSL_6_2
1393     #define AD1 float64_t
1394     #define AD2 float64_t2
1395     #define AD3 float64_t3
1396     #define AD4 float64_t4
1397    #else
1398     #define AD1 double
1399     #define AD2 double2
1400     #define AD3 double3
1401     #define AD4 double4
1402    #endif
1403  //------------------------------------------------------------------------------------------------------------------------------
        // Splat helpers: replicate one AD1 scalar into every component of the returned double vector.
1404    AD1 AD1_x(AD1 a){return AD1(a);}
1405    AD2 AD2_x(AD1 a){return AD2(a,a);}
1406    AD3 AD3_x(AD1 a){return AD3(a,a,a);}
1407    AD4 AD4_x(AD1 a){return AD4(a,a,a,a);}
        // Macro forms cast the argument to AD1 before splatting.
1408    #define AD1_(a) AD1_x(AD1(a))
1409    #define AD2_(a) AD2_x(AD1(a))
1410    #define AD3_(a) AD3_x(AD1(a))
1411    #define AD4_(a) AD4_x(AD1(a))
1412  //==============================================================================================================================
        // Fractional part computed explicitly as a-floor(a).
1413    AD1 AFractD1(AD1 a){return a-floor(a);}
1414    AD2 AFractD2(AD2 a){return a-floor(a);}
1415    AD3 AFractD3(AD3 a){return a-floor(a);}
1416    AD4 AFractD4(AD4 a){return a-floor(a);}
1417  //------------------------------------------------------------------------------------------------------------------------------
        // Linear interpolation between 'x' and 'y' by 'a'; forwards directly to the HLSL lerp() intrinsic.
1418    AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);}
1419    AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);}
1420    AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);}
1421    AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);}
1422  //------------------------------------------------------------------------------------------------------------------------------
        // Reciprocal; forwards directly to the HLSL rcp() intrinsic.
1423    AD1 ARcpD1(AD1 x){return rcp(x);}
1424    AD2 ARcpD2(AD2 x){return rcp(x);}
1425    AD3 ARcpD3(AD3 x){return rcp(x);}
1426    AD4 ARcpD4(AD4 x){return rcp(x);}
1427  //------------------------------------------------------------------------------------------------------------------------------
        // Reciprocal square root; forwards directly to the HLSL rsqrt() intrinsic.
        // NOTE(review): confirm the target compiler accepts rsqrt() on double operands.
1428    AD1 ARsqD1(AD1 x){return rsqrt(x);}
1429    AD2 ARsqD2(AD2 x){return rsqrt(x);}
1430    AD3 ARsqD3(AD3 x){return rsqrt(x);}
1431    AD4 ARsqD4(AD4 x){return rsqrt(x);}
1432  //------------------------------------------------------------------------------------------------------------------------------
        // Saturate; forwards directly to the HLSL saturate() intrinsic.
1433    AD1 ASatD1(AD1 x){return saturate(x);}
1434    AD2 ASatD2(AD2 x){return saturate(x);}
1435    AD3 ASatD3(AD3 x){return saturate(x);}
1436    AD4 ASatD4(AD4 x){return saturate(x);}
1437   #endif
1438  //==============================================================================================================================
1439  //                                                         HLSL WAVE
1440  //==============================================================================================================================
1441   #ifdef A_WAVE
1442    // Where 'x' must be a compile time literal.
        // Each lane returns the value of 'v' held by lane (WaveGetLaneIndex()^x), read through WaveReadLaneAt().
1443    AF1 AWaveXorF1(AF1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1444    AF2 AWaveXorF2(AF2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1445    AF3 AWaveXorF3(AF3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1446    AF4 AWaveXorF4(AF4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1447    AU1 AWaveXorU1(AU1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
        // The unsigned vector variants carry their component count in the name, matching the AWaveXorF* family.
1448    AU2 AWaveXorU2(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1449    AU3 AWaveXorU3(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1450    AU4 AWaveXorU4(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
        // Legacy overloads: the vector variants used to be declared under the name AWaveXorU1.
        // They are kept so existing call sites that pass AU2/AU3/AU4 to AWaveXorU1 keep compiling.
        AU2 AWaveXorU1(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
        AU3 AWaveXorU1(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
        AU4 AWaveXorU1(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1451  //------------------------------------------------------------------------------------------------------------------------------
1452    #ifdef A_HALF
         // 16-bit variants: pack the value into 32-bit words, read those words from lane (WaveGetLaneIndex()^x),
         // then unpack back to the 16-bit vector type.
1453     AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(WaveReadLaneAt(AU1_AH2(v),WaveGetLaneIndex()^x));}
1454     AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(WaveReadLaneAt(AU2_AH4(v),WaveGetLaneIndex()^x));}
1455     AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(WaveReadLaneAt(AU1_AW2(v),WaveGetLaneIndex()^x));}
         // Four 16-bit words occupy two 32-bit words, so this goes through AU2_AW4/AW4_AU2 exactly like AWaveXorH4
         // (the pack/unpack helpers only provide AW2<->AU1 and AW4<->AU2 conversions).
1456     AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(WaveReadLaneAt(AU2_AW4(v),WaveGetLaneIndex()^x));}
1457    #endif
1458   #endif
1459  //==============================================================================================================================
1460  #endif
1461  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1462  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1463  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1464  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1465  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1466  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1467  //_____________________________________________________________/\_______________________________________________________________
1468  //==============================================================================================================================
1469  //
1470  //
1471  //                                                          GPU COMMON
1472  //
1473  //
1474  //==============================================================================================================================
1475  #ifdef A_GPU
1476   // Negative and positive infinity.
       // Built by reinterpreting the 32-bit patterns 0x7f800000 (positive) and 0xff800000 (negative) as floats.
1477   #define A_INFP_F AF1_AU1(0x7f800000u)
1478   #define A_INFN_F AF1_AU1(0xff800000u)
1479  //------------------------------------------------------------------------------------------------------------------------------
1480   // Sign transfer: isolate bit 31 of 's' and OR it into the bit pattern of 'd' ('d' is expected to be positive).
1481   AF1 ACpySgnF1(AF1 d,AF1 s){AU1 sgn=AU1_AF1(s)&AU1_(0x80000000u);return AF1_AU1(AU1_AF1(d)|sgn);}
1482   AF2 ACpySgnF2(AF2 d,AF2 s){AU2 sgn=AU2_AF2(s)&AU2_(0x80000000u);return AF2_AU2(AU2_AF2(d)|sgn);}
1483   AF3 ACpySgnF3(AF3 d,AF3 s){AU3 sgn=AU3_AF3(s)&AU3_(0x80000000u);return AF3_AU3(AU3_AF3(d)|sgn);}
1484   AF4 ACpySgnF4(AF4 d,AF4 s){AU4 sgn=AU4_AF4(s)&AU4_(0x80000000u);return AF4_AU4(AU4_AF4(d)|sgn);}
1485  //------------------------------------------------------------------------------------------------------------------------------
1486   // Single operation to return (useful to create a mask to use in lerp for branch free logic),
1487   //  m=NaN := 0
1488   //  m>=0  := 0
1489   //  m<0   := 1
1490   // Uses the following useful floating point logic,
1491   //  saturate(+a*(-INF)==-INF) := 0
1492   //  saturate( 0*(-INF)== NaN) := 0
1493   //  saturate(-a*(-INF)==+INF) := 1
       // Implementation: multiply 'm' by a splat of negative infinity (A_INFN_F) and saturate the product.
1494   AF1 ASignedF1(AF1 m){return ASatF1(m*AF1_(A_INFN_F));}
1495   AF2 ASignedF2(AF2 m){return ASatF2(m*AF2_(A_INFN_F));}
1496   AF3 ASignedF3(AF3 m){return ASatF3(m*AF3_(A_INFN_F));}
1497   AF4 ASignedF4(AF4 m){return ASatF4(m*AF4_(A_INFN_F));}
1498  //------------------------------------------------------------------------------------------------------------------------------
       // Counterpart of ASignedF*: multiply 'm' by a splat of positive infinity (A_INFP_F) and saturate the product,
       // so by the same floating point logic m>0 gives 1, while m<0, m=0 and m=NaN give 0.
1499   AF1 AGtZeroF1(AF1 m){return ASatF1(m*AF1_(A_INFP_F));}
1500   AF2 AGtZeroF2(AF2 m){return ASatF2(m*AF2_(A_INFP_F));}
1501   AF3 AGtZeroF3(AF3 m){return ASatF3(m*AF3_(A_INFP_F));}
1502   AF4 AGtZeroF4(AF4 m){return ASatF4(m*AF4_(A_INFP_F));}
1503  //==============================================================================================================================
1504   #ifdef A_HALF
        // 16-bit float infinities built from the bit patterns 0x7c00 (positive) and 0xfc00 (negative) via AH1_AW1().
        // Under A_HLSL_6_2 the literal is cast to uint16_t before being handed to AH1_AW1().
1505    #ifdef A_HLSL_6_2
1506     #define A_INFP_H AH1_AW1((uint16_t)0x7c00u)
1507     #define A_INFN_H AH1_AW1((uint16_t)0xfc00u)
1508    #else
1509     #define A_INFP_H AH1_AW1(0x7c00u)
1510     #define A_INFN_H AH1_AW1(0xfc00u)
1511    #endif
1512  
1513  //------------------------------------------------------------------------------------------------------------------------------
        // 16-bit sign transfer: isolate bit 15 of 's' and OR it into the bit pattern of 'd'.
1514    AH1 ACpySgnH1(AH1 d,AH1 s){AW1 sgn=AW1_AH1(s)&AW1_(0x8000u);return AH1_AW1(AW1_AH1(d)|sgn);}
1515    AH2 ACpySgnH2(AH2 d,AH2 s){AW2 sgn=AW2_AH2(s)&AW2_(0x8000u);return AH2_AW2(AW2_AH2(d)|sgn);}
1516    AH3 ACpySgnH3(AH3 d,AH3 s){AW3 sgn=AW3_AH3(s)&AW3_(0x8000u);return AH3_AW3(AW3_AH3(d)|sgn);}
1517    AH4 ACpySgnH4(AH4 d,AH4 s){AW4 sgn=AW4_AH4(s)&AW4_(0x8000u);return AH4_AW4(AW4_AH4(d)|sgn);}
1518  //------------------------------------------------------------------------------------------------------------------------------
        // 16-bit version of ASignedF*: multiply 'm' by a splat of negative infinity (A_INFN_H) and saturate the product.
1519    AH1 ASignedH1(AH1 m){return ASatH1(m*AH1_(A_INFN_H));}
1520    AH2 ASignedH2(AH2 m){return ASatH2(m*AH2_(A_INFN_H));}
1521    AH3 ASignedH3(AH3 m){return ASatH3(m*AH3_(A_INFN_H));}
1522    AH4 ASignedH4(AH4 m){return ASatH4(m*AH4_(A_INFN_H));}
1523  //------------------------------------------------------------------------------------------------------------------------------
        // 16-bit version of AGtZeroF*: multiply 'm' by a splat of positive infinity (A_INFP_H) and saturate the product.
1524    AH1 AGtZeroH1(AH1 m){return ASatH1(m*AH1_(A_INFP_H));}
1525    AH2 AGtZeroH2(AH2 m){return ASatH2(m*AH2_(A_INFP_H));}
1526    AH3 AGtZeroH3(AH3 m){return ASatH3(m*AH3_(A_INFP_H));}
1527    AH4 AGtZeroH4(AH4 m){return ASatH4(m*AH4_(A_INFP_H));}
1528   #endif
1529  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1530  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1531  //_____________________________________________________________/\_______________________________________________________________
1532  //==============================================================================================================================
1533  //                                                [FIS] FLOAT INTEGER SORTABLE
1534  //------------------------------------------------------------------------------------------------------------------------------
1535  // Float to integer sortable.
1536  //  - If sign bit=0, flip the sign bit (positives).
1537  //  - If sign bit=1, flip all bits     (negatives).
1538  // Integer sortable to float.
1539  //  - If sign bit=1, flip the sign bit (positives).
1540  //  - If sign bit=0, flip all bits     (negatives).
1541  // Has nice side effects.
1542  //  - Larger integers are more positive values.
1543  //  - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage).
1544  // Burns 3 ops for conversion {shift,or,xor}.
1545  //==============================================================================================================================
       // Build the XOR mask first, then apply it.
       //  To:   mask = (x>>31 arithmetic)|0x80000000 -> all ones when the sign bit is set, sign bit only otherwise.
       //  From: mask = (~(x>>31 arithmetic))|0x80000000 -> sign bit only when the sign bit is set, all ones otherwise.
1546   AU1 AFisToU1(AU1 x){AU1 m=AShrSU1(x,AU1_(31))|AU1_(0x80000000);return x^m;}
1547   AU1 AFisFromU1(AU1 x){AU1 m=(~AShrSU1(x,AU1_(31)))|AU1_(0x80000000);return x^m;}
1548  //------------------------------------------------------------------------------------------------------------------------------
1549   // Just adjust high 16-bit value (useful when upper part of 32-bit word is a 16-bit float value).
       // The arithmetic shift by 15 replicates the sign into bits 31..16, but it also drops bits 30..15 of 'x' into the
       // low 16 bits. The result is therefore masked with 0xffff0000 so the XOR only touches the high half:
       //  To:   high half sign=0 -> flip bit 31 only, sign=1 -> flip bits 31..16.
       //  From: high half sign=1 -> flip bit 31 only, sign=0 -> flip bits 31..16.
       // The low 16 bits pass through unchanged, so From(To(x))==x. This costs one extra AND compared to the 32-bit form.
1550   AU1 AFisToHiU1(AU1 x){return x^((( AShrSU1(x,AU1_(15)))&AU1_(0xffff0000u))|AU1_(0x80000000));}
1551   AU1 AFisFromHiU1(AU1 x){return x^(((~AShrSU1(x,AU1_(15)))&AU1_(0xffff0000u))|AU1_(0x80000000));}
1552  //------------------------------------------------------------------------------------------------------------------------------
1553   #ifdef A_HALF
        // 16-bit sortable conversions: same mask-then-XOR scheme as the 32-bit versions, using a signed shift by 15
        // and the 16-bit sign bit 0x8000.
1554    AW1 AFisToW1(AW1 x){AW1 m=AShrSW1(x,AW1_(15))|AW1_(0x8000);return x^m;}
1555    AW1 AFisFromW1(AW1 x){AW1 m=(~AShrSW1(x,AW1_(15)))|AW1_(0x8000);return x^m;}
1556  //------------------------------------------------------------------------------------------------------------------------------
1557    AW2 AFisToW2(AW2 x){AW2 m=AShrSW2(x,AW2_(15))|AW2_(0x8000);return x^m;}
1558    AW2 AFisFromW2(AW2 x){AW2 m=(~AShrSW2(x,AW2_(15)))|AW2_(0x8000);return x^m;}
1559   #endif
1560  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1561  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1562  //_____________________________________________________________/\_______________________________________________________________
1563  //==============================================================================================================================
1564  //                                                      [PERM] V_PERM_B32
1565  //------------------------------------------------------------------------------------------------------------------------------
1566  // Support for V_PERM_B32 started in the 3rd generation of GCN.
1567  //------------------------------------------------------------------------------------------------------------------------------
1568  // yyyyxxxx - The 'i' input.
1569  // 76543210
1570  // ========
1571  // HGFEDCBA - Naming on permutation.
1572  //------------------------------------------------------------------------------------------------------------------------------
1573  // TODO
1574  // ====
1575  //  - Make sure compiler optimizes this.
1576  //==============================================================================================================================
1577   #ifdef A_HALF
        // Two-byte gathers: one byte of i.x goes to result byte 0, one byte of i.y goes to result byte 2,
        // and result bytes 1 and 3 are zero. Bytes are named A..D for i.x (low to high) and E..H for i.y.
1578    AU1 APerm0E0A(AU2 i){AU1 lo=(i.x    )&0xffu;AU1 hi=(i.y<<16)&0xff0000u;return lo|hi;}
1579    AU1 APerm0F0B(AU2 i){AU1 lo=(i.x>> 8)&0xffu;AU1 hi=(i.y<< 8)&0xff0000u;return lo|hi;}
1580    AU1 APerm0G0C(AU2 i){AU1 lo=(i.x>>16)&0xffu;AU1 hi=(i.y    )&0xff0000u;return lo|hi;}
1581    AU1 APerm0H0D(AU2 i){AU1 lo=(i.x>>24)&0xffu;AU1 hi=(i.y>> 8)&0xff0000u;return lo|hi;}
1582  //------------------------------------------------------------------------------------------------------------------------------
        // Single-byte inserts: byte A (i.x byte 0) or byte C (i.x byte 2) replaces one byte of i.y;
        // the other three bytes of i.y are kept. 'ins' is the moved byte, 'keep' is the untouched part of i.y.
1583    AU1 APermHGFA(AU2 i){AU1 ins=(i.x    )&0x000000ffu;AU1 keep=i.y&0xffffff00u;return ins|keep;}
1584    AU1 APermHGFC(AU2 i){AU1 ins=(i.x>>16)&0x000000ffu;AU1 keep=i.y&0xffffff00u;return ins|keep;}
1585    AU1 APermHGAE(AU2 i){AU1 ins=(i.x<< 8)&0x0000ff00u;AU1 keep=i.y&0xffff00ffu;return ins|keep;}
1586    AU1 APermHGCE(AU2 i){AU1 ins=(i.x>> 8)&0x0000ff00u;AU1 keep=i.y&0xffff00ffu;return ins|keep;}
1587    AU1 APermHAFE(AU2 i){AU1 ins=(i.x<<16)&0x00ff0000u;AU1 keep=i.y&0xff00ffffu;return ins|keep;}
1588    AU1 APermHCFE(AU2 i){AU1 ins=(i.x    )&0x00ff0000u;AU1 keep=i.y&0xff00ffffu;return ins|keep;}
1589    AU1 APermAGFE(AU2 i){AU1 ins=(i.x<<24)&0xff000000u;AU1 keep=i.y&0x00ffffffu;return ins|keep;}
1590    AU1 APermCGFE(AU2 i){AU1 ins=(i.x<< 8)&0xff000000u;AU1 keep=i.y&0x00ffffffu;return ins|keep;}
1591  //------------------------------------------------------------------------------------------------------------------------------
        // Four-byte interleaves of the even bytes (A,C from i.x and E,G from i.y), in the byte order given by the name.
1592    AU1 APermGCEA(AU2 i){AU1 ca=(i.x)&0x00ff00ffu;AU1 ge=(i.y<<8)&0xff00ff00u;return ca|ge;}
1593    AU1 APermGECA(AU2 i){AU1 a=(i.x)&0xffu;AU1 c=(i.x>>8)&0xff00u;AU1 e=(i.y<<16)&0xff0000u;AU1 g=(i.y<<8)&0xff000000u;return(a|c|e|g);}
1594   #endif
1595  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1596  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1597  //_____________________________________________________________/\_______________________________________________________________
1598  //==============================================================================================================================
1599  //                                               [BUC] BYTE UNSIGNED CONVERSION
1600  //------------------------------------------------------------------------------------------------------------------------------
1601  // Designed to use the optimal conversion, enables the scaling to possibly be factored into other computation.
1602  // Works on a range of {0 to A_BUC_<32,16>}, for <32-bit, and 16-bit> respectively.
1603  //------------------------------------------------------------------------------------------------------------------------------
1604  // OPCODE NOTES
1605  // ============
1606  // GCN does not do UNORM or SNORM for bytes in opcodes.
1607  //  - V_CVT_F32_UBYTE{0,1,2,3} - Unsigned byte to float.
1608  //  - V_CVT_PKACCUM_U8_F32 - Float to unsigned byte (does bit-field insert into 32-bit integer).
1609  // V_PERM_B32 does byte packing with ability to zero fill bytes as well.
1610  //  - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo. 
1611  //------------------------------------------------------------------------------------------------------------------------------
1612  // BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U1() - Designed for V_CVT_F32_UBYTE* and V_CVT_PKACCUM_U8_F32 ops.
1613  // ====   =====
1614  //    0 : 0
1615  //    1 : 1
1616  //     ...
1617  //  255 : 255
1618  //      : 256 (just outside the encoding range)
1619  //------------------------------------------------------------------------------------------------------------------------------
1620  // BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
1621  // ====   =====
1622  //    0 : 0
1623  //    1 : 1/512
1624  //    2 : 1/256
1625  //     ...
1626  //   64 : 1/8
1627  //  128 : 1/4
1628  //  255 : 255/512
1629  //      : 1/2 (just outside the encoding range)
1630  //------------------------------------------------------------------------------------------------------------------------------
1631  // OPTIMAL IMPLEMENTATIONS ON AMD ARCHITECTURES
1632  // ============================================
1633  // r=ABuc0FromU1(i)
1634  //   V_CVT_F32_UBYTE0 r,i
1635  // --------------------------------------------
1636  // r=ABuc0ToU1(d,i)
1637  //   V_CVT_PKACCUM_U8_F32 r,i,0,d
1638  // --------------------------------------------
1639  // d=ABuc0FromU2(i)
1640  //   Where 'k0' is an SGPR with 0x0E0A
1641  //   Where 'k1' is an SGPR with {32768.0} packed into the lower 16-bits
1642  //   V_PERM_B32 d,i.x,i.y,k0
1643  //   V_PK_FMA_F16 d,d,k1.x,0
1644  // --------------------------------------------
1645  // r=ABuc0ToU2(d,i)
1646  //   Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits
1647  //   Where 'k1' is an SGPR with 0x????
1648  //   Where 'k2' is an SGPR with 0x????
1649  //   V_PK_FMA_F16 i,i,k0.x,0
1650  //   V_PERM_B32 r.x,i,i,k1
1651  //   V_PERM_B32 r.y,i,i,k2
1652  //==============================================================================================================================
1653   // Peak range for 32-bit and 16-bit operations.
       // These are the largest encodable values: 255.0 for the 32-bit path and 255.0/512.0 for the 16-bit path.
1654   #define A_BUC_32 (255.0)
1655   #define A_BUC_16 (255.0/512.0)
1656  //==============================================================================================================================
1657   #if 1
1658    // Designed to be one V_CVT_PKACCUM_U8_F32.
1659    // The extra min is required to pattern match to V_CVT_PKACCUM_U8_F32.
        // ABuc<N>ToU1: convert 'i' to unsigned, clamp it to 255, and insert it as byte N of 'd'.
        // The other three bytes of 'd' are kept. Keep the expression shape as-is (see the pattern-match note above).
1660    AU1 ABuc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i),255u)    )&(0x000000ffu));}
1661    AU1 ABuc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i),255u)<< 8)&(0x0000ff00u));}
1662    AU1 ABuc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i),255u)<<16)&(0x00ff0000u));}
1663    AU1 ABuc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i),255u)<<24)&(0xff000000u));}
1664  //------------------------------------------------------------------------------------------------------------------------------
1665    // Designed to be one V_CVT_F32_UBYTE*.
        // ABuc<N>FromU1: extract byte N of 'i' (shift, then mask with 255) and convert it to float.
1666    AF1 ABuc0FromU1(AU1 i){return AF1((i    )&255u);}
1667    AF1 ABuc1FromU1(AU1 i){return AF1((i>> 8)&255u);}
1668    AF1 ABuc2FromU1(AU1 i){return AF1((i>>16)&255u);}
1669    AF1 ABuc3FromU1(AU1 i){return AF1((i>>24)&255u);}
1670   #endif
1671  //==============================================================================================================================
1672   #ifdef A_HALF
1673    // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
        // Both inputs are scaled by 1/32768, their 16-bit float bit patterns are packed into one 32-bit word each
        // ('bx','by'), and APermGCEA interleaves the low byte of every 16-bit lane into the result.
1674    AW2 ABuc01ToW2(AH2 x,AH2 y){AH2 k=AH2_(1.0/32768.0);AU1 bx=AU1_AW2(AW2_AH2(x*k));AU1 by=AU1_AW2(AW2_AH2(y*k));
1675     return AW2_AU1(APermGCEA(AU2(bx,by)));}
1676  //------------------------------------------------------------------------------------------------------------------------------
1677    // Designed for 3 ops to do SOA to AOS and conversion.
        // ABuc<N>ToU2: scale 'i' by 1/32768 ('s'), pack the two 16-bit float bit patterns into one word ('b'),
        // then insert byte A of 'b' into byte N of d.x ('rx') and byte C of 'b' into byte N of d.y ('ry').
1678    AU2 ABuc0ToU2(AU2 d,AH2 i){AH2 s=i*AH2_(1.0/32768.0);AU1 b=AU1_AW2(AW2_AH2(s));AU1 rx=APermHGFA(AU2(d.x,b));
1679     AU1 ry=APermHGFC(AU2(d.y,b));return AU2(rx,ry);}
1680    AU2 ABuc1ToU2(AU2 d,AH2 i){AH2 s=i*AH2_(1.0/32768.0);AU1 b=AU1_AW2(AW2_AH2(s));AU1 rx=APermHGAE(AU2(d.x,b));
1681     AU1 ry=APermHGCE(AU2(d.y,b));return AU2(rx,ry);}
1682    AU2 ABuc2ToU2(AU2 d,AH2 i){AH2 s=i*AH2_(1.0/32768.0);AU1 b=AU1_AW2(AW2_AH2(s));AU1 rx=APermHAFE(AU2(d.x,b));
1683     AU1 ry=APermHCFE(AU2(d.y,b));return AU2(rx,ry);}
1684    AU2 ABuc3ToU2(AU2 d,AH2 i){AH2 s=i*AH2_(1.0/32768.0);AU1 b=AU1_AW2(AW2_AH2(s));AU1 rx=APermAGFE(AU2(d.x,b));
1685     AU1 ry=APermCGFE(AU2(d.y,b));return AU2(rx,ry);}
1685     return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));}
1686  //------------------------------------------------------------------------------------------------------------------------------
1687    // Designed for 2 ops to do both AOS to SOA, and conversion.
1688    AH2 ABuc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0);}
1689    AH2 ABuc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0);}
1690    AH2 ABuc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0);}
1691    AH2 ABuc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0);}
1692   #endif
1693  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1694  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1695  //_____________________________________________________________/\_______________________________________________________________
1696  //==============================================================================================================================
1697  //                                                 [BSC] BYTE SIGNED CONVERSION
1698  //------------------------------------------------------------------------------------------------------------------------------
1699  // Similar to [BUC].
1700  // Works on a range of {-/+ A_BSC_<32,16>}, for <32-bit, and 16-bit> respectively.
1701  //------------------------------------------------------------------------------------------------------------------------------
1702  // ENCODING (without zero-based encoding)
1703  // ========
1704  //   0 = unused (can be used to mean something else)
1705  //   1 = lowest value 
1706  // 128 = exact zero center (stored as 0 in the zero-based encoding)
1707  // 255 = highest value
1708  //------------------------------------------------------------------------------------------------------------------------------
1709  // Zero-based [Zb] flips the MSB bit of the byte (making 128 "exact zero" actually zero).
1710  // This is useful if there is a desire for cleared values to decode as zero.
1711  //------------------------------------------------------------------------------------------------------------------------------
1712  // BYTE : FLOAT - ABsc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
1713  // ====   =====
1714  //    0 : -128/512 (unused)
1715  //    1 : -127/512
1716  //    2 : -126/512
1717  //     ...
1718  //  128 : 0 
1719  //     ... 
1720  //  255 : 127/512
1721  //      : 1/4 (just outside the encoding range)
1722  //==============================================================================================================================
1723   // Peak range for 32-bit and 16-bit operations.
       // These are the largest encodable magnitudes: 127.0 for the 32-bit path and 127.0/512.0 for the 16-bit path.
1724   #define A_BSC_32 (127.0)
1725   #define A_BSC_16 (127.0/512.0)
1726  //==============================================================================================================================
1727   #if 1
        // ABsc<N>ToU1: bias 'i' by +128.0, convert to unsigned, clamp to 255, and insert the result as byte N of 'd'.
        // The other three bytes of 'd' are kept. Same expression shape as ABuc<N>ToU1 apart from the bias.
1728    AU1 ABsc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i+128.0),255u)    )&(0x000000ffu));}
1729    AU1 ABsc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i+128.0),255u)<< 8)&(0x0000ff00u));}
1730    AU1 ABsc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i+128.0),255u)<<16)&(0x00ff0000u));}
1731    AU1 ABsc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i+128.0),255u)<<24)&(0xff000000u));}
1732  //------------------------------------------------------------------------------------------------------------------------------
        // Zero-based variants: same insert as ABsc<N>ToU1, then XOR the top bit of byte N so encoded 128 is stored as 0.
        // Unlike ABsc<N>ToU1, 'i' is passed through trunc() before the +128.0 bias.
1733    AU1 ABsc0ToZbU1(AU1 d,AF1 i){return ((d&0xffffff00u)|((min(AU1(trunc(i)+128.0),255u)    )&(0x000000ffu)))^0x00000080u;}
1734    AU1 ABsc1ToZbU1(AU1 d,AF1 i){return ((d&0xffff00ffu)|((min(AU1(trunc(i)+128.0),255u)<< 8)&(0x0000ff00u)))^0x00008000u;}
1735    AU1 ABsc2ToZbU1(AU1 d,AF1 i){return ((d&0xff00ffffu)|((min(AU1(trunc(i)+128.0),255u)<<16)&(0x00ff0000u)))^0x00800000u;}
1736    AU1 ABsc3ToZbU1(AU1 d,AF1 i){return ((d&0x00ffffffu)|((min(AU1(trunc(i)+128.0),255u)<<24)&(0xff000000u)))^0x80000000u;}
1737  //------------------------------------------------------------------------------------------------------------------------------
        // ABsc<N>FromU1: extract byte N of 'i', convert it to float, and remove the 128.0 bias.
1738    AF1 ABsc0FromU1(AU1 i){return AF1((i    )&255u)-128.0;}
1739    AF1 ABsc1FromU1(AU1 i){return AF1((i>> 8)&255u)-128.0;}
1740    AF1 ABsc2FromU1(AU1 i){return AF1((i>>16)&255u)-128.0;}
1741    AF1 ABsc3FromU1(AU1 i){return AF1((i>>24)&255u)-128.0;}
1742  //------------------------------------------------------------------------------------------------------------------------------
        // Zero-based variants: extract byte N of 'i', XOR it with 0x80 to undo the zero-based flip,
        // convert to float, and remove the 128.0 bias.
1743    AF1 ABsc0FromZbU1(AU1 i){return AF1(((i    )&255u)^0x80u)-128.0;}
1744    AF1 ABsc1FromZbU1(AU1 i){return AF1(((i>> 8)&255u)^0x80u)-128.0;}
1745    AF1 ABsc2FromZbU1(AU1 i){return AF1(((i>>16)&255u)^0x80u)-128.0;}
1746    AF1 ABsc3FromZbU1(AU1 i){return AF1(((i>>24)&255u)^0x80u)-128.0;}
1747   #endif
1748  //==============================================================================================================================
1749   #ifdef A_HALF
1750    // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
        // Each input is scaled by 1/32768 and offset by 0.25/32768, its bits are reinterpreted as packed 16-bit words,
        // and the two resulting 32-bit words are combined by APermGCEA (defined outside this section).
1751    AW2 ABsc01ToW2(AH2 x,AH2 y){AH2 sx=x*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);AH2 sy=y*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
1752     AU2 p=AU2(AU1_AW2(AW2_AH2(sx)),AU1_AW2(AW2_AH2(sy)));return AW2_AU1(APermGCEA(p));}
1753  //------------------------------------------------------------------------------------------------------------------------------
        // Packed store of the half pair 'i' into byte lane 0..3 of d.x and d.y.
        // 'i' is scaled by 1/32768 and offset by 0.25/32768, its bits are taken as one 32-bit word 'b',
        // then one APerm* permute per destination word merges 'b' with d.x / d.y.
        // NOTE(review): the APerm* helpers are defined outside this chunk; the letters presumably name source bytes - confirm.
1754    AU2 ABsc0ToU2(AU2 d,AH2 i){AH2 s=i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);AU1 b=AU1_AW2(AW2_AH2(s));
1755     AU2 lo=AU2(d.x,b);AU2 hi=AU2(d.y,b);return AU2(APermHGFA(lo),APermHGFC(hi));}
1756    AU2 ABsc1ToU2(AU2 d,AH2 i){AH2 s=i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);AU1 b=AU1_AW2(AW2_AH2(s));
1757     AU2 lo=AU2(d.x,b);AU2 hi=AU2(d.y,b);return AU2(APermHGAE(lo),APermHGCE(hi));}
1758    AU2 ABsc2ToU2(AU2 d,AH2 i){AH2 s=i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);AU1 b=AU1_AW2(AW2_AH2(s));
1759     AU2 lo=AU2(d.x,b);AU2 hi=AU2(d.y,b);return AU2(APermHAFE(lo),APermHCFE(hi));}
1760    AU2 ABsc3ToU2(AU2 d,AH2 i){AH2 s=i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);AU1 b=AU1_AW2(AW2_AH2(s));
1761     AU2 lo=AU2(d.x,b);AU2 hi=AU2(d.y,b);return AU2(APermAGFE(lo),APermCGFE(hi));}
1762  //------------------------------------------------------------------------------------------------------------------------------
        // 'Zb' packed stores: identical to the plain packed stores except that 'b' is XORed with 0x00800080
        // (bit 7 of each 16-bit half) before the permutes.
        // NOTE(review): the APerm* helpers are defined outside this chunk - confirm their byte selection.
1763    AU2 ABsc0ToZbU2(AU2 d,AH2 i){AH2 s=i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);AU1 b=AU1_AW2(AW2_AH2(s))^0x00800080u;
1764     AU2 lo=AU2(d.x,b);AU2 hi=AU2(d.y,b);return AU2(APermHGFA(lo),APermHGFC(hi));}
1765    AU2 ABsc1ToZbU2(AU2 d,AH2 i){AH2 s=i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);AU1 b=AU1_AW2(AW2_AH2(s))^0x00800080u;
1766     AU2 lo=AU2(d.x,b);AU2 hi=AU2(d.y,b);return AU2(APermHGAE(lo),APermHGCE(hi));}
1767    AU2 ABsc2ToZbU2(AU2 d,AH2 i){AH2 s=i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);AU1 b=AU1_AW2(AW2_AH2(s))^0x00800080u;
1768     AU2 lo=AU2(d.x,b);AU2 hi=AU2(d.y,b);return AU2(APermHAFE(lo),APermHCFE(hi));}
1769    AU2 ABsc3ToZbU2(AU2 d,AH2 i){AH2 s=i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);AU1 b=AU1_AW2(AW2_AH2(s))^0x00800080u;
1770     AU2 lo=AU2(d.x,b);AU2 hi=AU2(d.y,b);return AU2(APermAGFE(lo),APermCGFE(hi));}
1771  //------------------------------------------------------------------------------------------------------------------------------
        // Packed loads: an APerm0?0? permute gathers one byte lane from each word of 'i' into a packed pair,
        // which is reinterpreted as halves, scaled by 32768 and offset by -0.25 (the inverse of the store transform).
1772    AH2 ABsc0FromU2(AU2 i){AU1 p=APerm0E0A(i);return AH2_AW2(AW2_AU1(p))*AH2_(32768.0)-AH2_(0.25);}
1773    AH2 ABsc1FromU2(AU2 i){AU1 p=APerm0F0B(i);return AH2_AW2(AW2_AU1(p))*AH2_(32768.0)-AH2_(0.25);}
1774    AH2 ABsc2FromU2(AU2 i){AU1 p=APerm0G0C(i);return AH2_AW2(AW2_AU1(p))*AH2_(32768.0)-AH2_(0.25);}
1775    AH2 ABsc3FromU2(AU2 i){AU1 p=APerm0H0D(i);return AH2_AW2(AW2_AU1(p))*AH2_(32768.0)-AH2_(0.25);}
1776  //------------------------------------------------------------------------------------------------------------------------------
        // 'Zb' packed loads: as the plain packed loads, with the gathered word XORed with 0x00800080 before reinterpretation.
1777    AH2 ABsc0FromZbU2(AU2 i){AU1 p=APerm0E0A(i)^0x00800080u;return AH2_AW2(AW2_AU1(p))*AH2_(32768.0)-AH2_(0.25);}
1778    AH2 ABsc1FromZbU2(AU2 i){AU1 p=APerm0F0B(i)^0x00800080u;return AH2_AW2(AW2_AU1(p))*AH2_(32768.0)-AH2_(0.25);}
1779    AH2 ABsc2FromZbU2(AU2 i){AU1 p=APerm0G0C(i)^0x00800080u;return AH2_AW2(AW2_AU1(p))*AH2_(32768.0)-AH2_(0.25);}
1780    AH2 ABsc3FromZbU2(AU2 i){AU1 p=APerm0H0D(i)^0x00800080u;return AH2_AW2(AW2_AU1(p))*AH2_(32768.0)-AH2_(0.25);}
1781   #endif
1782  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1783  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1784  //_____________________________________________________________/\_______________________________________________________________
1785  //==============================================================================================================================
1786  //                                                     HALF APPROXIMATIONS
1787  //------------------------------------------------------------------------------------------------------------------------------
1788  // These support only positive inputs.
1789  // Did not see value yet in specialization for range.
1790  // Using quick testing, ended up mostly getting the same "best" approximation for various ranges.
1791  // With hardware that can co-execute transcendentals, the value in approximations could be less than expected.
1792  // However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total.
1793  // And co-execution would require a compiler interleaving a lot of independent work for packed usage.
1794  //------------------------------------------------------------------------------------------------------------------------------
1795  // The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total).
1796  // Same with sqrt(), as this could be x*rsq() (7 ops).
1797  //==============================================================================================================================
1798   #ifdef A_HALF
1799    // Minimize squared error across full positive range, 2 ops.
1800    // With the 0x1de2 constant, inputs in {0 to 1} map to outputs < 1.
        // Works on the raw half bits: shift right by one, then add the constant.
1801    AH1 APrxLoSqrtH1(AH1 a){AW1 h=AW1_AH1(a)>>AW1_(1);return AH1_AW1(h+AW1_(0x1de2));}
1802    AH2 APrxLoSqrtH2(AH2 a){AW2 h=AW2_AH2(a)>>AW2_(1);return AH2_AW2(h+AW2_(0x1de2));}
1803    AH3 APrxLoSqrtH3(AH3 a){AW3 h=AW3_AH3(a)>>AW3_(1);return AH3_AW3(h+AW3_(0x1de2));}
1804    AH4 APrxLoSqrtH4(AH4 a){AW4 h=AW4_AH4(a)>>AW4_(1);return AH4_AW4(h+AW4_(0x1de2));}
1805  //------------------------------------------------------------------------------------------------------------------------------
1806    // Lower precision estimation, 1 op.
1807    // Minimize squared error across {smallest normal to 16384.0}.
        // Works on the raw half bits: subtract them from the constant 0x7784.
1808    AH1 APrxLoRcpH1(AH1 a){AW1 n=AW1_(0x7784)-AW1_AH1(a);return AH1_AW1(n);}
1809    AH2 APrxLoRcpH2(AH2 a){AW2 n=AW2_(0x7784)-AW2_AH2(a);return AH2_AW2(n);}
1810    AH3 APrxLoRcpH3(AH3 a){AW3 n=AW3_(0x7784)-AW3_AH3(a);return AH3_AW3(n);}
1811    AH4 APrxLoRcpH4(AH4 a){AW4 n=AW4_(0x7784)-AW4_AH4(a);return AH4_AW4(n);}
1812  //------------------------------------------------------------------------------------------------------------------------------
1813    // Medium precision estimation, one Newton Raphson iteration, 3 ops.
        // 'r' is the bit-trick seed (0x778d minus the input bits); the result is r*(2-r*a).
1814    AH1 APrxMedRcpH1(AH1 a){AH1 r=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));AH1 e=-r*a+AH1_(2.0);return r*e;}
1815    AH2 APrxMedRcpH2(AH2 a){AH2 r=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));AH2 e=-r*a+AH2_(2.0);return r*e;}
1816    AH3 APrxMedRcpH3(AH3 a){AH3 r=AH3_AW3(AW3_(0x778d)-AW3_AH3(a));AH3 e=-r*a+AH3_(2.0);return r*e;}
1817    AH4 APrxMedRcpH4(AH4 a){AH4 r=AH4_AW4(AW4_(0x778d)-AW4_AH4(a));AH4 e=-r*a+AH4_(2.0);return r*e;}
1818  //------------------------------------------------------------------------------------------------------------------------------
1819    // Minimize squared error across {smallest normal to 16384.0}, 2 ops.
        // Works on the raw half bits: constant 0x59a3 minus the bits shifted right by one.
1820    AH1 APrxLoRsqH1(AH1 a){AW1 h=AW1_AH1(a)>>AW1_(1);return AH1_AW1(AW1_(0x59a3)-h);}
1821    AH2 APrxLoRsqH2(AH2 a){AW2 h=AW2_AH2(a)>>AW2_(1);return AH2_AW2(AW2_(0x59a3)-h);}
1822    AH3 APrxLoRsqH3(AH3 a){AW3 h=AW3_AH3(a)>>AW3_(1);return AH3_AW3(AW3_(0x59a3)-h);}
1823    AH4 APrxLoRsqH4(AH4 a){AW4 h=AW4_AH4(a)>>AW4_(1);return AH4_AW4(AW4_(0x59a3)-h);}
1824   #endif
1825  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1826  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1827  //_____________________________________________________________/\_______________________________________________________________
1828  //==============================================================================================================================
1829  //                                                    FLOAT APPROXIMATIONS
1830  //------------------------------------------------------------------------------------------------------------------------------
1831  // Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN",
1832  //  - Idea dates back to SGI, then to Quake 3, etc.
1833  //  - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf
1834  //     - sqrt(x)=rsqrt(x)*x
1835  //     - rcp(x)=rsqrt(x)*rsqrt(x) for positive x
1836  //  - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h
1837  //------------------------------------------------------------------------------------------------------------------------------
1838  // These below are from perhaps less complete searching for optimal.
1839  // Used FP16 normal range for testing with +4096 32-bit step size for sampling error.
1840  // So these match up well with the half approximations.
1841  //==============================================================================================================================
       // Scalar 32-bit forms, all operating on the raw float bits:
       //  LoSqrt = (bits>>1)+k, LoRcp = k-bits, MedRcp = LoRcp-style seed r refined as r*(2-r*a), LoRsq = k-(bits>>1).
1842   AF1 APrxLoSqrtF1(AF1 a){AU1 h=AU1_AF1(a)>>AU1_(1);return AF1_AU1(h+AU1_(0x1fbc4639));}
1843   AF1 APrxLoRcpF1(AF1 a){AU1 n=AU1_(0x7ef07ebb)-AU1_AF1(a);return AF1_AU1(n);}
1844   AF1 APrxMedRcpF1(AF1 a){AF1 r=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));AF1 e=-r*a+AF1_(2.0);return r*e;}
1845   AF1 APrxLoRsqF1(AF1 a){AU1 h=AU1_AF1(a)>>AU1_(1);return AF1_AU1(AU1_(0x5f347d74)-h);}
1846  //------------------------------------------------------------------------------------------------------------------------------
       // 2-wide forms of the bit-level sqrt / rcp / refined rcp / rsq approximations (same constants as the scalar forms).
1847   AF2 APrxLoSqrtF2(AF2 a){AU2 h=AU2_AF2(a)>>AU2_(1);return AF2_AU2(h+AU2_(0x1fbc4639));}
1848   AF2 APrxLoRcpF2(AF2 a){AU2 n=AU2_(0x7ef07ebb)-AU2_AF2(a);return AF2_AU2(n);}
1849   AF2 APrxMedRcpF2(AF2 a){AF2 r=AF2_AU2(AU2_(0x7ef19fff)-AU2_AF2(a));AF2 e=-r*a+AF2_(2.0);return r*e;}
1850   AF2 APrxLoRsqF2(AF2 a){AU2 h=AU2_AF2(a)>>AU2_(1);return AF2_AU2(AU2_(0x5f347d74)-h);}
1851  //------------------------------------------------------------------------------------------------------------------------------
       // 3-wide forms of the bit-level sqrt / rcp / refined rcp / rsq approximations (same constants as the scalar forms).
1852   AF3 APrxLoSqrtF3(AF3 a){AU3 h=AU3_AF3(a)>>AU3_(1);return AF3_AU3(h+AU3_(0x1fbc4639));}
1853   AF3 APrxLoRcpF3(AF3 a){AU3 n=AU3_(0x7ef07ebb)-AU3_AF3(a);return AF3_AU3(n);}
1854   AF3 APrxMedRcpF3(AF3 a){AF3 r=AF3_AU3(AU3_(0x7ef19fff)-AU3_AF3(a));AF3 e=-r*a+AF3_(2.0);return r*e;}
1855   AF3 APrxLoRsqF3(AF3 a){AU3 h=AU3_AF3(a)>>AU3_(1);return AF3_AU3(AU3_(0x5f347d74)-h);}
1856  //------------------------------------------------------------------------------------------------------------------------------
       // 4-wide forms of the bit-level sqrt / rcp / refined rcp / rsq approximations (same constants as the scalar forms).
1857   AF4 APrxLoSqrtF4(AF4 a){AU4 h=AU4_AF4(a)>>AU4_(1);return AF4_AU4(h+AU4_(0x1fbc4639));}
1858   AF4 APrxLoRcpF4(AF4 a){AU4 n=AU4_(0x7ef07ebb)-AU4_AF4(a);return AF4_AU4(n);}
1859   AF4 APrxMedRcpF4(AF4 a){AF4 r=AF4_AU4(AU4_(0x7ef19fff)-AU4_AF4(a));AF4 e=-r*a+AF4_(2.0);return r*e;}
1860   AF4 APrxLoRsqF4(AF4 a){AU4 h=AU4_AF4(a)>>AU4_(1);return AF4_AU4(AU4_(0x5f347d74)-h);}
1861  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1862  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1863  //_____________________________________________________________/\_______________________________________________________________
1864  //==============================================================================================================================
1865  //                                                    PQ APPROXIMATIONS
1866  //------------------------------------------------------------------------------------------------------------------------------
1867  // PQ is very close to x^(1/8). The functions below use the fast float approximation method to do
1868  // PQ<~>Gamma2 (4th power and fast 4th root) and PQ<~>Linear (8th power and fast 8th root). Maximum error is ~0.2%.
1869  //==============================================================================================================================
1870  // Helpers
       // Quart(a) returns a^4 (two squarings); Oct(a) returns a^8 (three squarings). Overloaded for 1 to 4 components.
1871   AF1 Quart(AF1 a) { AF1 sq = a * a; return sq * sq; }
1872   AF1 Oct(AF1 a) { AF1 sq = a * a; AF1 qd = sq * sq; return qd * qd; }
1873   AF2 Quart(AF2 a) { AF2 sq = a * a; return sq * sq; }
1874   AF2 Oct(AF2 a) { AF2 sq = a * a; AF2 qd = sq * sq; return qd * qd; }
1875   AF3 Quart(AF3 a) { AF3 sq = a * a; return sq * sq; }
1876   AF3 Oct(AF3 a) { AF3 sq = a * a; AF3 qd = sq * sq; return qd * qd; }
1877   AF4 Quart(AF4 a) { AF4 sq = a * a; return sq * sq; }
1878   AF4 Oct(AF4 a) { AF4 sq = a * a; AF4 qd = sq * sq; return qd * qd; }
1879   //------------------------------------------------------------------------------------------------------------------------------
       // Scalar PQ conversions. PQ->Gamma2 is a 4th power and PQ->Linear an 8th power.
       // The reverse direction comes in three grades: Lo = bit-trick root (bits>>2 or bits>>3, plus a constant),
       // Med = Lo seed 'b' refined as b - b*(b^n - a)/(n*b^n), High = nested sqrt().
       // NOTE(review): for a==0 the seed's 4th/8th power is extremely small; if it flushes to zero the Med division is 0/0 - confirm.
1880   AF1 APrxPQToGamma2(AF1 a) { AF1 p = Quart(a); return p; }
1881   AF1 APrxPQToLinear(AF1 a) { AF1 p = Oct(a); return p; }
1882   AF1 APrxLoGamma2ToPQ(AF1 a) { AU1 q = AU1_AF1(a) >> AU1_(2); return AF1_AU1(q + AU1_(0x2F9A4E46)); }
1883   AF1 APrxMedGamma2ToPQ(AF1 a) { AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); AF1 b4 = Quart(b); AF1 num = b * (b4 - a); AF1 den = AF1_(4.0) * b4; return b - num / den; }
1884   AF1 APrxHighGamma2ToPQ(AF1 a) { AF1 r = sqrt(a); return sqrt(r); }
1885   AF1 APrxLoLinearToPQ(AF1 a) { AU1 q = AU1_AF1(a) >> AU1_(3); return AF1_AU1(q + AU1_(0x378D8723)); }
1886   AF1 APrxMedLinearToPQ(AF1 a) { AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); AF1 b8 = Oct(b); AF1 num = b * (b8 - a); AF1 den = AF1_(8.0) * b8; return b - num / den; }
1887   AF1 APrxHighLinearToPQ(AF1 a) { AF1 r = sqrt(sqrt(a)); return sqrt(r); }
1888   //------------------------------------------------------------------------------------------------------------------------------
       // 2-wide PQ conversions. PQ->Gamma2 is a 4th power and PQ->Linear an 8th power.
       // Reverse direction: Lo = bit-trick root, Med = Lo seed 'b' refined as b - b*(b^n - a)/(n*b^n), High = nested sqrt().
       // The Med constants use the matching-width AF2_() constructor, like every other 2-wide function in this file.
       // NOTE(review): for a==0 the seed's 4th/8th power is extremely small; if it flushes to zero the Med division is 0/0 - confirm.
1889   AF2 APrxPQToGamma2(AF2 a) { return Quart(a); }
1890   AF2 APrxPQToLinear(AF2 a) { return Oct(a); }
1891   AF2 APrxLoGamma2ToPQ(AF2 a) { return AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); }
1892   AF2 APrxMedGamma2ToPQ(AF2 a) { AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); AF2 b4 = Quart(b); return b - b * (b4 - a) / (AF2_(4.0) * b4); }
1893   AF2 APrxHighGamma2ToPQ(AF2 a) { return sqrt(sqrt(a)); }
1894   AF2 APrxLoLinearToPQ(AF2 a) { return AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); }
1895   AF2 APrxMedLinearToPQ(AF2 a) { AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); AF2 b8 = Oct(b); return b - b * (b8 - a) / (AF2_(8.0) * b8); }
1896   AF2 APrxHighLinearToPQ(AF2 a) { return sqrt(sqrt(sqrt(a))); }
1897   //------------------------------------------------------------------------------------------------------------------------------
       // 3-wide PQ conversions. PQ->Gamma2 is a 4th power and PQ->Linear an 8th power.
       // Reverse direction: Lo = bit-trick root, Med = Lo seed 'b' refined as b - b*(b^n - a)/(n*b^n), High = nested sqrt().
       // The Med constants use the matching-width AF3_() constructor, like every other 3-wide function in this file.
       // NOTE(review): for a==0 the seed's 4th/8th power is extremely small; if it flushes to zero the Med division is 0/0 - confirm.
1898   AF3 APrxPQToGamma2(AF3 a) { return Quart(a); }
1899   AF3 APrxPQToLinear(AF3 a) { return Oct(a); }
1900   AF3 APrxLoGamma2ToPQ(AF3 a) { return AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); }
1901   AF3 APrxMedGamma2ToPQ(AF3 a) { AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); AF3 b4 = Quart(b); return b - b * (b4 - a) / (AF3_(4.0) * b4); }
1902   AF3 APrxHighGamma2ToPQ(AF3 a) { return sqrt(sqrt(a)); }
1903   AF3 APrxLoLinearToPQ(AF3 a) { return AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); }
1904   AF3 APrxMedLinearToPQ(AF3 a) { AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); AF3 b8 = Oct(b); return b - b * (b8 - a) / (AF3_(8.0) * b8); }
1905   AF3 APrxHighLinearToPQ(AF3 a) { return sqrt(sqrt(sqrt(a))); }
1906   //------------------------------------------------------------------------------------------------------------------------------
       // 4-wide PQ conversions. PQ->Gamma2 is a 4th power and PQ->Linear an 8th power.
       // Reverse direction: Lo = bit-trick root, Med = Lo seed 'b' refined as b - b*(b^n - a)/(n*b^n), High = nested sqrt().
       // The Med constants use the matching-width AF4_() constructor, like every other 4-wide function in this file.
       // NOTE(review): for a==0 the seed's 4th/8th power is extremely small; if it flushes to zero the Med division is 0/0 - confirm.
1907   AF4 APrxPQToGamma2(AF4 a) { return Quart(a); }
1908   AF4 APrxPQToLinear(AF4 a) { return Oct(a); }
1909   AF4 APrxLoGamma2ToPQ(AF4 a) { return AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); }
1910   AF4 APrxMedGamma2ToPQ(AF4 a) { AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); AF4 b4 = Quart(b); return b - b * (b4 - a) / (AF4_(4.0) * b4); }
1911   AF4 APrxHighGamma2ToPQ(AF4 a) { return sqrt(sqrt(a)); }
1912   AF4 APrxLoLinearToPQ(AF4 a) { return AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); }
1913   AF4 APrxMedLinearToPQ(AF4 a) { AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); AF4 b8 = Oct(b); return b - b * (b8 - a) / (AF4_(8.0) * b8); }
1914   AF4 APrxHighLinearToPQ(AF4 a) { return sqrt(sqrt(sqrt(a))); }
1915  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1916  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1917  //_____________________________________________________________/\_______________________________________________________________
1918  //==============================================================================================================================
1919  //                                                    PARABOLIC SIN & COS
1920  //------------------------------------------------------------------------------------------------------------------------------
1921  // Approximate answers to transcendental questions.
1922  //------------------------------------------------------------------------------------------------------------------------------
1923  //==============================================================================================================================
1924   #if 1
1925    // Valid input range is {-1 to 1} representing {0 to 2 pi}.
1926    // Output range is {-1/4 to 1/4} representing {-1 to 1}.
        // APSin evaluates the parabola x*|x|-x. APCos remaps its input with fract(x*0.5+0.75)*2-1 and then evaluates APSin.
        // APSinCosF1 returns {sin,cos} by evaluating the 2-wide APSin on {x, remapped x}.
1927    AF1 APSinF1(AF1 x){AF1 m=abs(x);return x*m-x;} // MAD.
1928    AF2 APSinF2(AF2 x){AF2 m=abs(x);return x*m-x;}
1929    AF1 APCosF1(AF1 x){AF1 t=AFractF1(x*AF1_(0.5)+AF1_(0.75));t=t*AF1_(2.0)-AF1_(1.0);return APSinF1(t);} // 3x MAD, FRACT
1930    AF2 APCosF2(AF2 x){AF2 t=AFractF2(x*AF2_(0.5)+AF2_(0.75));t=t*AF2_(2.0)-AF2_(1.0);return APSinF2(t);}
1931    AF2 APSinCosF1(AF1 x){AF1 c=AFractF1(x*AF1_(0.5)+AF1_(0.75));c=c*AF1_(2.0)-AF1_(1.0);AF2 sc=AF2(x,c);return APSinF2(sc);}
1932   #endif
1933  //------------------------------------------------------------------------------------------------------------------------------
1934   #ifdef A_HALF
1935    // For a packed {sin,cos} pair,
1936    //  - Native takes 16 clocks and 4 issue slots (no packed transcendentals).
1937    //  - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed).
        // Half versions of the parabolic forms: APSin is x*|x|-x, APCos remaps with fract(x*0.5+0.75)*2-1 first.
1938    AH1 APSinH1(AH1 x){AH1 m=abs(x);return x*m-x;}
1939    AH2 APSinH2(AH2 x){AH2 m=abs(x);return x*m-x;} // AND,FMA
1940    AH1 APCosH1(AH1 x){AH1 t=AFractH1(x*AH1_(0.5)+AH1_(0.75));t=t*AH1_(2.0)-AH1_(1.0);return APSinH1(t);}
1941    AH2 APCosH2(AH2 x){AH2 t=AFractH2(x*AH2_(0.5)+AH2_(0.75));t=t*AH2_(2.0)-AH2_(1.0);return APSinH2(t);} // 3x FMA, 2xFRACT, AND
1942    AH2 APSinCosH1(AH1 x){AH1 c=AFractH1(x*AH1_(0.5)+AH1_(0.75));c=c*AH1_(2.0)-AH1_(1.0);AH2 sc=AH2(x,c);return APSinH2(sc);}
1943   #endif
1944  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1945  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1946  //_____________________________________________________________/\_______________________________________________________________
1947  //==============================================================================================================================
1948  //                                                     [ZOL] ZERO ONE LOGIC
1949  //------------------------------------------------------------------------------------------------------------------------------
1950  // Conditional free logic designed for easy 16-bit packing, and backwards porting to 32-bit.
1951  //------------------------------------------------------------------------------------------------------------------------------
1952  // 0 := false
1953  // 1 := true
1954  //------------------------------------------------------------------------------------------------------------------------------
1955  // AndNot(x,y)   -> !(x&y) .... One op.
1956  // AndOr(x,y,z)  -> (x&y)|z ... One op.
1957  // GtZero(x)     -> x>0.0 ..... One op.
1958  // Sel(x,y,z)    -> x?y:z ..... Two ops, has no precision loss.
1959  // Signed(x)     -> x<0.0 ..... One op.
1960  // ZeroPass(x,y) -> x?0:y ..... Two ops, 'y' is a pass through safe for aliasing as integer.
1961  //------------------------------------------------------------------------------------------------------------------------------
1962  // OPTIMIZATION NOTES
1963  // ==================
1964  // - On Vega to use 2 constants in a packed op, pass in as one AW2 or one AH2 'k.xy' and use as 'k.xx' and 'k.yy'.
1965  //   For example 'a.xy*k.xx+k.yy'.
1966  //==============================================================================================================================
1967   #if 1
        // Integer AND on zero/one values: the per-component minimum.
1968    AU1 AZolAndU1(AU1 x,AU1 y){AU1 r=min(x,y);return r;}
1969    AU2 AZolAndU2(AU2 x,AU2 y){AU2 r=min(x,y);return r;}
1970    AU3 AZolAndU3(AU3 x,AU3 y){AU3 r=min(x,y);return r;}
1971    AU4 AZolAndU4(AU4 x,AU4 y){AU4 r=min(x,y);return r;}
1972  //------------------------------------------------------------------------------------------------------------------------------
        // Integer NOT on zero/one values: flips bit 0.
1973    AU1 AZolNotU1(AU1 x){AU1 one=AU1_(1);return x^one;}
1974    AU2 AZolNotU2(AU2 x){AU2 one=AU2_(1);return x^one;}
1975    AU3 AZolNotU3(AU3 x){AU3 one=AU3_(1);return x^one;}
1976    AU4 AZolNotU4(AU4 x){AU4 one=AU4_(1);return x^one;}
1977  //------------------------------------------------------------------------------------------------------------------------------
        // Integer OR on zero/one values: the per-component maximum.
1978    AU1 AZolOrU1(AU1 x,AU1 y){AU1 r=max(x,y);return r;}
1979    AU2 AZolOrU2(AU2 x,AU2 y){AU2 r=max(x,y);return r;}
1980    AU3 AZolOrU3(AU3 x,AU3 y){AU3 r=max(x,y);return r;}
1981    AU4 AZolOrU4(AU4 x,AU4 y){AU4 r=max(x,y);return r;}
1982  //==============================================================================================================================
        // Float zero/one -> integer zero/one by value conversion.
1983    AU1 AZolF1ToU1(AF1 x){AU1 r=AU1(x);return r;}
1984    AU2 AZolF2ToU2(AF2 x){AU2 r=AU2(x);return r;}
1985    AU3 AZolF3ToU3(AF3 x){AU3 r=AU3(x);return r;}
1986    AU4 AZolF4ToU4(AF4 x){AU4 r=AU4(x);return r;}
1987  //------------------------------------------------------------------------------------------------------------------------------
1988    // 2 ops, denormals don't work in 32-bit on PC (and if they are enabled, OMOD is disabled).
        // Computes the float NOT (1-x) and converts the result to integer.
1989    AU1 AZolNotF1ToU1(AF1 x){AF1 n=AF1_(1.0)-x;return AU1(n);}
1990    AU2 AZolNotF2ToU2(AF2 x){AF2 n=AF2_(1.0)-x;return AU2(n);}
1991    AU3 AZolNotF3ToU3(AF3 x){AF3 n=AF3_(1.0)-x;return AU3(n);}
1992    AU4 AZolNotF4ToU4(AF4 x){AF4 n=AF4_(1.0)-x;return AU4(n);}
1993  //------------------------------------------------------------------------------------------------------------------------------
        // Integer zero/one -> float zero/one by value conversion.
1994    AF1 AZolU1ToF1(AU1 x){AF1 r=AF1(x);return r;}
1995    AF2 AZolU2ToF2(AU2 x){AF2 r=AF2(x);return r;}
1996    AF3 AZolU3ToF3(AU3 x){AF3 r=AF3(x);return r;}
1997    AF4 AZolU4ToF4(AU4 x){AF4 r=AF4(x);return r;}
1998  //==============================================================================================================================
        // Float AND on zero/one values: the per-component minimum.
1999    AF1 AZolAndF1(AF1 x,AF1 y){AF1 r=min(x,y);return r;}
2000    AF2 AZolAndF2(AF2 x,AF2 y){AF2 r=min(x,y);return r;}
2001    AF3 AZolAndF3(AF3 x,AF3 y){AF3 r=min(x,y);return r;}
2002    AF4 AZolAndF4(AF4 x,AF4 y){AF4 r=min(x,y);return r;}
2003  //------------------------------------------------------------------------------------------------------------------------------
2004    AF1 ASolAndNotF1(AF1 x,AF1 y){return (-x)*y+AF1_(1.0);}
2005    AF2 ASolAndNotF2(AF2 x,AF2 y){return (-x)*y+AF2_(1.0);}
2006    AF3 ASolAndNotF3(AF3 x,AF3 y){return (-x)*y+AF3_(1.0);}
2007    AF4 ASolAndNotF4(AF4 x,AF4 y){return (-x)*y+AF4_(1.0);}
2008  //------------------------------------------------------------------------------------------------------------------------------
        // AndOr(x,y,z) -> (x&y)|z, computed as a saturated multiply-add.
2009    AF1 AZolAndOrF1(AF1 x,AF1 y,AF1 z){AF1 t=x*y+z;return ASatF1(t);}
2010    AF2 AZolAndOrF2(AF2 x,AF2 y,AF2 z){AF2 t=x*y+z;return ASatF2(t);}
2011    AF3 AZolAndOrF3(AF3 x,AF3 y,AF3 z){AF3 t=x*y+z;return ASatF3(t);}
2012    AF4 AZolAndOrF4(AF4 x,AF4 y,AF4 z){AF4 t=x*y+z;return ASatF4(t);}
2013  //------------------------------------------------------------------------------------------------------------------------------
        // GtZero(x) -> x>0.0: multiplies by A_INFP_F and saturates the product.
2014    AF1 AZolGtZeroF1(AF1 x){AF1 t=x*AF1_(A_INFP_F);return ASatF1(t);}
2015    AF2 AZolGtZeroF2(AF2 x){AF2 t=x*AF2_(A_INFP_F);return ASatF2(t);}
2016    AF3 AZolGtZeroF3(AF3 x){AF3 t=x*AF3_(A_INFP_F);return ASatF3(t);}
2017    AF4 AZolGtZeroF4(AF4 x){AF4 t=x*AF4_(A_INFP_F);return ASatF4(t);}
2018  //------------------------------------------------------------------------------------------------------------------------------
        // Float NOT on zero/one values: 1-x.
2019    AF1 AZolNotF1(AF1 x){AF1 one=AF1_(1.0);return one-x;}
2020    AF2 AZolNotF2(AF2 x){AF2 one=AF2_(1.0);return one-x;}
2021    AF3 AZolNotF3(AF3 x){AF3 one=AF3_(1.0);return one-x;}
2022    AF4 AZolNotF4(AF4 x){AF4 one=AF4_(1.0);return one-x;}
2023  //------------------------------------------------------------------------------------------------------------------------------
        // Float OR on zero/one values: the per-component maximum.
2024    AF1 AZolOrF1(AF1 x,AF1 y){AF1 r=max(x,y);return r;}
2025    AF2 AZolOrF2(AF2 x,AF2 y){AF2 r=max(x,y);return r;}
2026    AF3 AZolOrF3(AF3 x,AF3 y){AF3 r=max(x,y);return r;}
2027    AF4 AZolOrF4(AF4 x,AF4 y){AF4 r=max(x,y);return r;}
2028  //------------------------------------------------------------------------------------------------------------------------------
        // Sel(x,y,z) -> x?y:z as two multiply-adds: first z-x*z (z when x is 0, 0 when x is 1), then x*y plus that term.
2029    AF1 AZolSelF1(AF1 x,AF1 y,AF1 z){AF1 zTerm=(-x)*z+z;return x*y+zTerm;}
2030    AF2 AZolSelF2(AF2 x,AF2 y,AF2 z){AF2 zTerm=(-x)*z+z;return x*y+zTerm;}
2031    AF3 AZolSelF3(AF3 x,AF3 y,AF3 z){AF3 zTerm=(-x)*z+z;return x*y+zTerm;}
2032    AF4 AZolSelF4(AF4 x,AF4 y,AF4 z){AF4 zTerm=(-x)*z+z;return x*y+zTerm;}
2033  //------------------------------------------------------------------------------------------------------------------------------
        // Signed(x) -> x<0.0: multiplies by A_INFN_F and saturates the product.
2034    AF1 AZolSignedF1(AF1 x){AF1 t=x*AF1_(A_INFN_F);return ASatF1(t);}
2035    AF2 AZolSignedF2(AF2 x){AF2 t=x*AF2_(A_INFN_F);return ASatF2(t);}
2036    AF3 AZolSignedF3(AF3 x){AF3 t=x*AF3_(A_INFN_F);return ASatF3(t);}
2037    AF4 AZolSignedF4(AF4 x){AF4 t=x*AF4_(A_INFN_F);return ASatF4(t);}
2038  //------------------------------------------------------------------------------------------------------------------------------
        // ZeroPass(x,y) -> x?0:y. Works on bit patterns: returns 0 when the bits of x are non-zero, otherwise y bit-exact.
        // NOTE(review): for the 2-4 wide forms the result depends on the target language's vector '!=' and '?:' rules - confirm.
2039    AF1 AZolZeroPassF1(AF1 x,AF1 y){AU1 xb=AU1_AF1(x);AU1 yb=AU1_AF1(y);return AF1_AU1((xb!=AU1_(0))?AU1_(0):yb);}
2040    AF2 AZolZeroPassF2(AF2 x,AF2 y){AU2 xb=AU2_AF2(x);AU2 yb=AU2_AF2(y);return AF2_AU2((xb!=AU2_(0))?AU2_(0):yb);}
2041    AF3 AZolZeroPassF3(AF3 x,AF3 y){AU3 xb=AU3_AF3(x);AU3 yb=AU3_AF3(y);return AF3_AU3((xb!=AU3_(0))?AU3_(0):yb);}
2042    AF4 AZolZeroPassF4(AF4 x,AF4 y){AU4 xb=AU4_AF4(x);AU4 yb=AU4_AF4(y);return AF4_AU4((xb!=AU4_(0))?AU4_(0):yb);}
2043   #endif
2044  //==============================================================================================================================
2045   #ifdef A_HALF
        // 16-bit integer AND on zero/one values: the per-component minimum.
2046    AW1 AZolAndW1(AW1 x,AW1 y){AW1 r=min(x,y);return r;}
2047    AW2 AZolAndW2(AW2 x,AW2 y){AW2 r=min(x,y);return r;}
2048    AW3 AZolAndW3(AW3 x,AW3 y){AW3 r=min(x,y);return r;}
2049    AW4 AZolAndW4(AW4 x,AW4 y){AW4 r=min(x,y);return r;}
2050  //------------------------------------------------------------------------------------------------------------------------------
        // 16-bit integer NOT on zero/one values: flips bit 0.
2051    AW1 AZolNotW1(AW1 x){AW1 one=AW1_(1);return x^one;}
2052    AW2 AZolNotW2(AW2 x){AW2 one=AW2_(1);return x^one;}
2053    AW3 AZolNotW3(AW3 x){AW3 one=AW3_(1);return x^one;}
2054    AW4 AZolNotW4(AW4 x){AW4 one=AW4_(1);return x^one;}
2055  //------------------------------------------------------------------------------------------------------------------------------
        // 16-bit integer OR on zero/one values: the per-component maximum.
2056    AW1 AZolOrW1(AW1 x,AW1 y){AW1 r=max(x,y);return r;}
2057    AW2 AZolOrW2(AW2 x,AW2 y){AW2 r=max(x,y);return r;}
2058    AW3 AZolOrW3(AW3 x,AW3 y){AW3 r=max(x,y);return r;}
2059    AW4 AZolOrW4(AW4 x,AW4 y){AW4 r=max(x,y);return r;}
2060  //==============================================================================================================================
2061    // Uses denormal trick.
        // Multiplies x by the half whose bit pattern is integer 1, then reinterprets the product's bits as an integer.
2062    AW1 AZolH1ToW1(AH1 x){AH1 lsb=AH1_AW1(AW1_(1));return AW1_AH1(x*lsb);}
2063    AW2 AZolH2ToW2(AH2 x){AH2 lsb=AH2_AW2(AW2_(1));return AW2_AH2(x*lsb);}
2064    AW3 AZolH3ToW3(AH3 x){AH3 lsb=AH3_AW3(AW3_(1));return AW3_AH3(x*lsb);}
2065    AW4 AZolH4ToW4(AH4 x){AH4 lsb=AH4_AW4(AW4_(1));return AW4_AH4(x*lsb);}
2066  //------------------------------------------------------------------------------------------------------------------------------
2067    // AMD arch lacks a packed conversion opcode.
2068    AH1 AZolW1ToH1(AW1 x){return AH1_AW1(x*AW1_AH1(AH1_(1.0)));}
2069    AH2 AZolW2ToH2(AW2 x){return AH2_AW2(x*AW2_AH2(AH2_(1.0)));}
2070    AH3 AZolW1ToH3(AW3 x){return AH3_AW3(x*AW3_AH3(AH3_(1.0)));}
2071    AH4 AZolW2ToH4(AW4 x){return AH4_AW4(x*AW4_AH4(AH4_(1.0)));}
2072  //==============================================================================================================================
        // Half AND on zero/one values: the per-component minimum.
2073    AH1 AZolAndH1(AH1 x,AH1 y){AH1 r=min(x,y);return r;}
2074    AH2 AZolAndH2(AH2 x,AH2 y){AH2 r=min(x,y);return r;}
2075    AH3 AZolAndH3(AH3 x,AH3 y){AH3 r=min(x,y);return r;}
2076    AH4 AZolAndH4(AH4 x,AH4 y){AH4 r=min(x,y);return r;}
2077  //------------------------------------------------------------------------------------------------------------------------------
2078    AH1 ASolAndNotH1(AH1 x,AH1 y){return (-x)*y+AH1_(1.0);}
2079    AH2 ASolAndNotH2(AH2 x,AH2 y){return (-x)*y+AH2_(1.0);}
2080    AH3 ASolAndNotH3(AH3 x,AH3 y){return (-x)*y+AH3_(1.0);}
2081    AH4 ASolAndNotH4(AH4 x,AH4 y){return (-x)*y+AH4_(1.0);}
2082  //------------------------------------------------------------------------------------------------------------------------------
        // AndOr(x,y,z) -> (x&y)|z, computed as a saturated multiply-add.
2083    AH1 AZolAndOrH1(AH1 x,AH1 y,AH1 z){AH1 t=x*y+z;return ASatH1(t);}
2084    AH2 AZolAndOrH2(AH2 x,AH2 y,AH2 z){AH2 t=x*y+z;return ASatH2(t);}
2085    AH3 AZolAndOrH3(AH3 x,AH3 y,AH3 z){AH3 t=x*y+z;return ASatH3(t);}
2086    AH4 AZolAndOrH4(AH4 x,AH4 y,AH4 z){AH4 t=x*y+z;return ASatH4(t);}
2087  //------------------------------------------------------------------------------------------------------------------------------
        // GtZero(x) -> x>0.0: multiplies by A_INFP_H and saturates the product.
2088    AH1 AZolGtZeroH1(AH1 x){AH1 t=x*AH1_(A_INFP_H);return ASatH1(t);}
2089    AH2 AZolGtZeroH2(AH2 x){AH2 t=x*AH2_(A_INFP_H);return ASatH2(t);}
2090    AH3 AZolGtZeroH3(AH3 x){AH3 t=x*AH3_(A_INFP_H);return ASatH3(t);}
2091    AH4 AZolGtZeroH4(AH4 x){AH4 t=x*AH4_(A_INFP_H);return ASatH4(t);}
2092  //------------------------------------------------------------------------------------------------------------------------------
        // Half NOT on zero/one values: 1-x.
2093    AH1 AZolNotH1(AH1 x){AH1 one=AH1_(1.0);return one-x;}
2094    AH2 AZolNotH2(AH2 x){AH2 one=AH2_(1.0);return one-x;}
2095    AH3 AZolNotH3(AH3 x){AH3 one=AH3_(1.0);return one-x;}
2096    AH4 AZolNotH4(AH4 x){AH4 one=AH4_(1.0);return one-x;}
2097  //------------------------------------------------------------------------------------------------------------------------------
        // Half OR on zero/one values: the per-component maximum.
2098    AH1 AZolOrH1(AH1 x,AH1 y){AH1 r=max(x,y);return r;}
2099    AH2 AZolOrH2(AH2 x,AH2 y){AH2 r=max(x,y);return r;}
2100    AH3 AZolOrH3(AH3 x,AH3 y){AH3 r=max(x,y);return r;}
2101    AH4 AZolOrH4(AH4 x,AH4 y){AH4 r=max(x,y);return r;}
2102  //------------------------------------------------------------------------------------------------------------------------------
        // Sel(x,y,z) -> x?y:z as two multiply-adds: first z-x*z (z when x is 0, 0 when x is 1), then x*y plus that term.
2103    AH1 AZolSelH1(AH1 x,AH1 y,AH1 z){AH1 zTerm=(-x)*z+z;return x*y+zTerm;}
2104    AH2 AZolSelH2(AH2 x,AH2 y,AH2 z){AH2 zTerm=(-x)*z+z;return x*y+zTerm;}
2105    AH3 AZolSelH3(AH3 x,AH3 y,AH3 z){AH3 zTerm=(-x)*z+z;return x*y+zTerm;}
2106    AH4 AZolSelH4(AH4 x,AH4 y,AH4 z){AH4 zTerm=(-x)*z+z;return x*y+zTerm;}
2107  //------------------------------------------------------------------------------------------------------------------------------
        // Signed(x) -> x<0.0: multiplies by A_INFN_H and saturates the product.
2108    AH1 AZolSignedH1(AH1 x){AH1 t=x*AH1_(A_INFN_H);return ASatH1(t);}
2109    AH2 AZolSignedH2(AH2 x){AH2 t=x*AH2_(A_INFN_H);return ASatH2(t);}
2110    AH3 AZolSignedH3(AH3 x){AH3 t=x*AH3_(A_INFN_H);return ASatH3(t);}
2111    AH4 AZolSignedH4(AH4 x){AH4 t=x*AH4_(A_INFN_H);return ASatH4(t);}
2112   #endif
2113  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2114  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2115  //_____________________________________________________________/\_______________________________________________________________
2116  //==============================================================================================================================
2117  //                                                      COLOR CONVERSIONS
2118  //------------------------------------------------------------------------------------------------------------------------------
2119  // These are all linear to/from some other space (where 'linear' has been shortened out of the function name).
2120  // So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'.
2121  // These are branch free implementations.
2122  // The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion.
2123  //------------------------------------------------------------------------------------------------------------------------------
2124  // TRANSFER FUNCTIONS
2125  // ==================
2126  // 709 ..... Rec709 used for some HDTVs
2127  // Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native
2128  // Pq ...... PQ native for HDR10
2129  // Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type
2130  // Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations)
2131  // Three ... Gamma 3.0, less fast, but good for HDR.
2132  //------------------------------------------------------------------------------------------------------------------------------
2133  // KEEPING TO SPEC
2134  // ===============
2135  // Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times.
2136  //  (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range).
2137  //  (b.) For 8-bit  709, steps {0 to 20.7} are in the linear region (8% of the encoding range).
2138  // Also there is a slight step in the transition regions.
2139  // Precision of the coefficients in the spec being the likely cause.
2140  // Main usage case of the sRGB code is to do the linear->sRGB conversion in a compute shader before store.
2141  // This is to work around lack of hardware (typically only ROP does the conversion for free).
2142  // To "correct" the linear segment, would be to introduce error, because hardware decode of sRGB->linear is fixed (and free).
2143  // So this header keeps with the spec.
2144  // For linear->sRGB transforms, the linear segment in some respects reduces error, because rounding in that region is linear.
2145  // Rounding in the curved region in hardware (and fast software code) introduces error due to rounding in non-linear.
2146  //------------------------------------------------------------------------------------------------------------------------------
2147  // FOR PQ
2148  // ======
2149  // Both input and output are {0.0-1.0}, where output 1.0 represents 10000.0 cd/m^2.
2150  // All constants are only specified to FP32 precision.
2151  // External PQ source reference,
2152  //  - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl
2153  //------------------------------------------------------------------------------------------------------------------------------
2154  // PACKED VERSIONS
2155  // ===============
2156  // These are the A*H2() functions.
2157  // There are no PQ functions as FP16 seemed to not have enough precision for the conversion.
2158  // The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors.
2159  // Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least).
2160  //------------------------------------------------------------------------------------------------------------------------------
2161  // NOTES
2162  // =====
2163  // Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case.
2164  //==============================================================================================================================
2165   #if 1
        // Linear -> Rec.709. Returns the median of {j.x, c*j.y, pow(c,j.z)*k.x+k.y}: the linear segment below the 0.018
        // knee (encoded 0.081) and the power segment above it.
        // The median is written with min()/max() instead of clamp(): the linear term exceeds the power term over most of
        // the input range, and clamp() with minVal>maxVal is undefined in GLSL (min(max()) semantics return the wrong segment).
2166    AF1 ATo709F1(AF1 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
2167     AF1 l=c*j.y  ;AF1 p=pow(c,j.z  )*k.x  +k.y  ;return max(min(j.x  ,l),min(max(j.x  ,l),p));}
2168    AF2 ATo709F2(AF2 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
2169     AF2 l=c*j.yy ;AF2 p=pow(c,j.zz )*k.xx +k.yy ;return max(min(j.xx ,l),min(max(j.xx ,l),p));}
2170    AF3 ATo709F3(AF3 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
2171     AF3 l=c*j.yyy;AF3 p=pow(c,j.zzz)*k.xxx+k.yyy;return max(min(j.xxx,l),min(max(j.xxx,l),p));}
2172  //------------------------------------------------------------------------------------------------------------------------------
2173    // Linear -> gamma: raises c to 'rcpX', which is '1/x' for the 'x' that would be passed to AFromGamma().
2174    AF1 AToGammaF1(AF1 c,AF1 rcpX){AF1 e=AF1_(rcpX);return pow(c,e);}
2175    AF2 AToGammaF2(AF2 c,AF1 rcpX){AF2 e=AF2_(rcpX);return pow(c,e);}
2176    AF3 AToGammaF3(AF3 c,AF1 rcpX){AF3 e=AF3_(rcpX);return pow(c,e);}
2177  //------------------------------------------------------------------------------------------------------------------------------
        // Linear -> PQ (see "FOR PQ" in the section header for the input/output range).
        // The 2- and 3-component versions are named AToPqF2()/AToPqF3() like every other family in this section.
2178    AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302));
2179     return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));}
2180    AF2 AToPqF2(AF2 x){AF2 p=pow(x,AF2_(0.159302));
2181     return pow((AF2_(0.835938)+AF2_(18.8516)*p)/(AF2_(1.0)+AF2_(18.6875)*p),AF2_(78.8438));}
2182    AF3 AToPqF3(AF3 x){AF3 p=pow(x,AF3_(0.159302));
2183     return pow((AF3_(0.835938)+AF3_(18.8516)*p)/(AF3_(1.0)+AF3_(18.6875)*p),AF3_(78.8438));}
        // Backward compatibility: the vector versions were previously only reachable as overloads of AToPqF1().
        AF2 AToPqF1(AF2 x){return AToPqF2(x);}
        AF3 AToPqF1(AF3 x){return AToPqF3(x);}
2184  //------------------------------------------------------------------------------------------------------------------------------
        // Linear -> sRGB. Returns the median of {j.x, c*j.y, pow(c,j.z)*k.x+k.y}: the linear segment below the 0.0031308
        // knee and the power segment above it.
        // The median is written with min()/max() instead of clamp(): the linear term exceeds the power term over most of
        // the input range, and clamp() with minVal>maxVal is undefined in GLSL (min(max()) semantics return the wrong segment).
2185    AF1 AToSrgbF1(AF1 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
2186     AF1 l=c*j.y  ;AF1 p=pow(c,j.z  )*k.x  +k.y  ;return max(min(j.x  ,l),min(max(j.x  ,l),p));}
2187    AF2 AToSrgbF2(AF2 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
2188     AF2 l=c*j.yy ;AF2 p=pow(c,j.zz )*k.xx +k.yy ;return max(min(j.xx ,l),min(max(j.xx ,l),p));}
2189    AF3 AToSrgbF3(AF3 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
2190     AF3 l=c*j.yyy;AF3 p=pow(c,j.zzz)*k.xxx+k.yyy;return max(min(j.xxx,l),min(max(j.xxx,l),p));}
2191  //------------------------------------------------------------------------------------------------------------------------------
        // Linear -> gamma 2.0: square root of each component.
2192    AF1 AToTwoF1(AF1 c){AF1 r=sqrt(c);return r;}
2193    AF2 AToTwoF2(AF2 c){AF2 r=sqrt(c);return r;}
2194    AF3 AToTwoF3(AF3 c){AF3 r=sqrt(c);return r;}
2195  //------------------------------------------------------------------------------------------------------------------------------
        // Linear -> gamma 3.0: each component raised to the power 1/3.
2196    AF1 AToThreeF1(AF1 c){AF1 e=AF1_(1.0/3.0);return pow(c,e);}
2197    AF2 AToThreeF2(AF2 c){AF2 e=AF2_(1.0/3.0);return pow(c,e);}
2198    AF3 AToThreeF3(AF3 c){AF3 e=AF3_(1.0/3.0);return pow(c,e);}
2199   #endif
2200  //==============================================================================================================================
2201   #if 1
2202    // Rec.709 -> linear. Unfortunately median won't work here, so the segment is picked with a zero/one select:
        // the first argument of AZolSelF1() is built from the sign of (c-j.x), the second is the linear segment c*j.y,
        // and the third is the power segment.
        // j.x is the knee in the ENCODED domain (0.081) because it is compared against the encoded input 'c';
        // using the linear-domain knee (0.081/4.5) sent encoded values in [0.018,0.081) down the wrong segment.
2203    AF1 AFrom709F1(AF1 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
2204     return AZolSelF1(AZolSignedF1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
2205    AF2 AFrom709F2(AF2 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
2206     return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
2207    AF3 AFrom709F3(AF3 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
2208     return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
2209  //------------------------------------------------------------------------------------------------------------------------------
        // Gamma -> linear: raises each component of c to the power 'x'.
2210    AF1 AFromGammaF1(AF1 c,AF1 x){AF1 e=AF1_(x);return pow(c,e);}
2211    AF2 AFromGammaF2(AF2 c,AF1 x){AF2 e=AF2_(x);return pow(c,e);}
2212    AF3 AFromGammaF3(AF3 c,AF1 x){AF3 e=AF3_(x);return pow(c,e);}
2213  //------------------------------------------------------------------------------------------------------------------------------
        // PQ -> linear (see "FOR PQ" in the section header for the input/output range).
        // The 2- and 3-component versions are named AFromPqF2()/AFromPqF3() like every other family in this section.
2214    AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833));
2215     return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));}
2216    AF2 AFromPqF2(AF2 x){AF2 p=pow(x,AF2_(0.0126833));
2217     return pow(ASatF2(p-AF2_(0.835938))/(AF2_(18.8516)-AF2_(18.6875)*p),AF2_(6.27739));}
2218    AF3 AFromPqF3(AF3 x){AF3 p=pow(x,AF3_(0.0126833));
2219     return pow(ASatF3(p-AF3_(0.835938))/(AF3_(18.8516)-AF3_(18.6875)*p),AF3_(6.27739));}
        // Backward compatibility: the vector versions were previously only reachable as overloads of AFromPqF1().
        AF2 AFromPqF1(AF2 x){return AFromPqF2(x);}
        AF3 AFromPqF1(AF3 x){return AFromPqF3(x);}
2220  //------------------------------------------------------------------------------------------------------------------------------
2221    // sRGB -> linear. Unfortunately median won't work here, so the segment is picked with a zero/one select:
        // the first argument of AZolSelF1() is built from the sign of (c-j.x), the second is the linear segment c*j.y,
        // and the third is the power segment.
        // j.x is the knee in the ENCODED domain (0.04045) because it is compared against the encoded input 'c';
        // using the linear-domain knee (0.04045/12.92) sent encoded values in [0.0031308,0.04045) down the wrong segment.
2222    AF1 AFromSrgbF1(AF1 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
2223     return AZolSelF1(AZolSignedF1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
2224    AF2 AFromSrgbF2(AF2 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
2225     return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
2226    AF3 AFromSrgbF3(AF3 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
2227     return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
2228  //------------------------------------------------------------------------------------------------------------------------------
        // Gamma 2.0 -> linear: each component squared.
2229    AF1 AFromTwoF1(AF1 c){AF1 sq=c*c;return sq;}
2230    AF2 AFromTwoF2(AF2 c){AF2 sq=c*c;return sq;}
2231    AF3 AFromTwoF3(AF3 c){AF3 sq=c*c;return sq;}
2232  //------------------------------------------------------------------------------------------------------------------------------
        // Gamma 3.0 -> linear: each component cubed, evaluated as (c*c)*c.
2233    AF1 AFromThreeF1(AF1 c){AF1 sq=c*c;return sq*c;}
2234    AF2 AFromThreeF2(AF2 c){AF2 sq=c*c;return sq*c;}
2235    AF3 AFromThreeF3(AF3 c){AF3 sq=c*c;return sq*c;}
2236   #endif
2237  //==============================================================================================================================
2238   #ifdef A_HALF
        // Linear -> Rec.709, half versions. Returns the median of {j.x, c*j.y, pow(c,j.z)*k.x+k.y}: the linear segment
        // below the 0.018 knee (encoded 0.081) and the power segment above it.
        // The median is written with min()/max() instead of clamp(): the linear term exceeds the power term over most of
        // the input range, and clamp() with minVal>maxVal is undefined in GLSL (min(max()) semantics return the wrong segment).
2239    AH1 ATo709H1(AH1 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
2240     AH1 l=c*j.y  ;AH1 p=pow(c,j.z  )*k.x  +k.y  ;return max(min(j.x  ,l),min(max(j.x  ,l),p));}
2241    AH2 ATo709H2(AH2 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
2242     AH2 l=c*j.yy ;AH2 p=pow(c,j.zz )*k.xx +k.yy ;return max(min(j.xx ,l),min(max(j.xx ,l),p));}
2243    AH3 ATo709H3(AH3 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
2244     AH3 l=c*j.yyy;AH3 p=pow(c,j.zzz)*k.xxx+k.yyy;return max(min(j.xxx,l),min(max(j.xxx,l),p));}
2245  //------------------------------------------------------------------------------------------------------------------------------
        // Linear -> gamma, half versions: raises each component of c to the power 'rcpX'.
2246    AH1 AToGammaH1(AH1 c,AH1 rcpX){AH1 e=AH1_(rcpX);return pow(c,e);}
2247    AH2 AToGammaH2(AH2 c,AH1 rcpX){AH2 e=AH2_(rcpX);return pow(c,e);}
2248    AH3 AToGammaH3(AH3 c,AH1 rcpX){AH3 e=AH3_(rcpX);return pow(c,e);}
2249  //------------------------------------------------------------------------------------------------------------------------------
        // Linear -> sRGB, half versions. Returns the median of {j.x, c*j.y, pow(c,j.z)*k.x+k.y}: the linear segment
        // below the 0.0031308 knee and the power segment above it.
        // The median is written with min()/max() instead of clamp(): the linear term exceeds the power term over most of
        // the input range, and clamp() with minVal>maxVal is undefined in GLSL (min(max()) semantics return the wrong segment).
2250    AH1 AToSrgbH1(AH1 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
2251     AH1 l=c*j.y  ;AH1 p=pow(c,j.z  )*k.x  +k.y  ;return max(min(j.x  ,l),min(max(j.x  ,l),p));}
2252    AH2 AToSrgbH2(AH2 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
2253     AH2 l=c*j.yy ;AH2 p=pow(c,j.zz )*k.xx +k.yy ;return max(min(j.xx ,l),min(max(j.xx ,l),p));}
2254    AH3 AToSrgbH3(AH3 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
2255     AH3 l=c*j.yyy;AH3 p=pow(c,j.zzz)*k.xxx+k.yyy;return max(min(j.xxx,l),min(max(j.xxx,l),p));}
2256  //------------------------------------------------------------------------------------------------------------------------------
        // Linear -> gamma 2.0, half versions: square root of each component.
2257    AH1 AToTwoH1(AH1 c){AH1 r=sqrt(c);return r;}
2258    AH2 AToTwoH2(AH2 c){AH2 r=sqrt(c);return r;}
2259    AH3 AToTwoH3(AH3 c){AH3 r=sqrt(c);return r;}
2260  //------------------------------------------------------------------------------------------------------------------------------
        // Linear -> gamma 3.0, half versions: each component raised to the power 1/3.
        // Named with the 'H' suffix like the rest of the A_HALF families.
2261    AH1 AToThreeH1(AH1 c){return pow(c,AH1_(1.0/3.0));}
2262    AH2 AToThreeH2(AH2 c){return pow(c,AH2_(1.0/3.0));}
2263    AH3 AToThreeH3(AH3 c){return pow(c,AH3_(1.0/3.0));}
        // Backward compatibility: the half versions were previously only reachable as overloads carrying the 'F' suffix.
        AH1 AToThreeF1(AH1 c){return AToThreeH1(c);}
        AH2 AToThreeF2(AH2 c){return AToThreeH2(c);}
        AH3 AToThreeF3(AH3 c){return AToThreeH3(c);}
2264   #endif
2265  //==============================================================================================================================
2266   #ifdef A_HALF
        // Rec.709 -> linear, half versions. AZolSignedH1(c-j.x) is 1 when c is below j.x and 0 otherwise, so AZolSelH1()
        // returns the linear segment c*j.y below the knee and the power segment at or above it.
        // j.x is the knee in the ENCODED domain (0.081) because it is compared against the encoded input 'c';
        // using the linear-domain knee (0.081/4.5) sent encoded values in [0.018,0.081) down the wrong segment.
2267    AH1 AFrom709H1(AH1 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
2268     return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
2269    AH2 AFrom709H2(AH2 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
2270     return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
2271    AH3 AFrom709H3(AH3 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
2272     return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
2273  //------------------------------------------------------------------------------------------------------------------------------
        // Gamma -> linear, half versions: raises each component of c to the power 'x'.
2274    AH1 AFromGammaH1(AH1 c,AH1 x){AH1 e=AH1_(x);return pow(c,e);}
2275    AH2 AFromGammaH2(AH2 c,AH1 x){AH2 e=AH2_(x);return pow(c,e);}
2276    AH3 AFromGammaH3(AH3 c,AH1 x){AH3 e=AH3_(x);return pow(c,e);}
2277  //------------------------------------------------------------------------------------------------------------------------------
        // sRGB -> linear, half versions. AZolSignedH1(c-j.x) is 1 when c is below j.x and 0 otherwise, so AZolSelH1()
        // returns the linear segment c*j.y below the knee and the power segment at or above it.
        // j.x is the knee in the ENCODED domain (0.04045) because it is compared against the encoded input 'c';
        // using the linear-domain knee (0.04045/12.92) sent encoded values in [0.0031308,0.04045) down the wrong segment.
2278    AH1 AFromSrgbH1(AH1 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
2279     return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
2280    AH2 AFromSrgbH2(AH2 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
2281     return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
2282    AH3 AFromSrgbH3(AH3 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
2283     return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
        // Backward compatibility: these functions were previously published under the misspelled names below.
        AH1 AHromSrgbF1(AH1 c){return AFromSrgbH1(c);}
        AH2 AHromSrgbF2(AH2 c){return AFromSrgbH2(c);}
        AH3 AHromSrgbF3(AH3 c){return AFromSrgbH3(c);}
2284  //------------------------------------------------------------------------------------------------------------------------------
        // Gamma 2.0 -> linear, half versions: each component squared.
2285    AH1 AFromTwoH1(AH1 c){AH1 sq=c*c;return sq;}
2286    AH2 AFromTwoH2(AH2 c){AH2 sq=c*c;return sq;}
2287    AH3 AFromTwoH3(AH3 c){AH3 sq=c*c;return sq;}
2288  //------------------------------------------------------------------------------------------------------------------------------
        // Gamma 3.0 -> linear, half versions: each component cubed, evaluated as (c*c)*c.
2289    AH1 AFromThreeH1(AH1 c){AH1 sq=c*c;return sq*c;}
2290    AH2 AFromThreeH2(AH2 c){AH2 sq=c*c;return sq*c;}
2291    AH3 AFromThreeH3(AH3 c){AH3 sq=c*c;return sq*c;}
2292   #endif
2293  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2294  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2295  //_____________________________________________________________/\_______________________________________________________________
2296  //==============================================================================================================================
2297  //                                                          CS REMAP
2298  //==============================================================================================================================
2299   // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear.
2300   //  543210
2301   //  ======
2302   //  ..xxx.
2303   //  yy...y
2304   AU2 ARmp8x8(AU1 a){AU1 x=ABfe(a,1u,3u);AU1 y=ABfiM(ABfe(a,3u,3u),a,1u);return AU2(x,y);} // x/y per the bit diagram above.
2305  //==============================================================================================================================
2306   // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions.
2307   //  543210
2308   //  ======
2309   //  .xx..x
2310   //  y..yy.
2311   // Details,
2312   //  LANE TO 8x8 MAPPING
2313   //  ===================
2314   //  00 01 08 09 10 11 18 19 
2315   //  02 03 0a 0b 12 13 1a 1b
2316   //  04 05 0c 0d 14 15 1c 1d
2317   //  06 07 0e 0f 16 17 1e 1f 
2318   //  20 21 28 29 30 31 38 39 
2319   //  22 23 2a 2b 32 33 3a 3b
2320   //  24 25 2c 2d 34 35 3c 3d
2321   //  26 27 2e 2f 36 37 3e 3f 
2322   AU2 ARmpRed8x8(AU1 a){AU1 x=ABfiM(ABfe(a,2u,3u),a,1u);AU1 y=ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u);return AU2(x,y);}
2323  //==============================================================================================================================
2324   #ifdef A_HALF
        // AW2-returning versions of ARmp8x8() and ARmpRed8x8(): the same bit-field expressions, packed into an AW2.
2325    AW2 ARmp8x8H(AU1 a){AU1 x=ABfe(a,1u,3u);AU1 y=ABfiM(ABfe(a,3u,3u),a,1u);return AW2(x,y);}
2326    AW2 ARmpRed8x8H(AU1 a){AU1 x=ABfiM(ABfe(a,2u,3u),a,1u);AU1 y=ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u);return AW2(x,y);}
2327   #endif
2328  #endif
2329  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2330  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2331  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2332  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2333  //_____________________________________________________________/\_______________________________________________________________
2334  //==============================================================================================================================
2335  //
2336  //                                                          REFERENCE
2337  //
2338  //------------------------------------------------------------------------------------------------------------------------------
2339  // IEEE FLOAT RULES
2340  // ================
2341  //  - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1
2342  //  - {+/-}0 * {+/-}INF = NaN
2343  //  - -INF + (+INF) = NaN
2344  //  - {+/-}0 / {+/-}0 = NaN
2345  //  - {+/-}INF / {+/-}INF = NaN
2346  //  - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN)
2347  //  - 0 == -0
2348  //  - 4/0 = +INF
2349  //  - 4/-0 = -INF
2350  //  - 4+INF = +INF
2351  //  - 4-INF = -INF
2352  //  - 4*(+INF) = +INF
2353  //  - 4*(-INF) = -INF
2354  //  - -4*(+INF) = -INF
2355  //  - sqrt(+INF) = +INF
2356  //------------------------------------------------------------------------------------------------------------------------------
2357  // FP16 ENCODING
2358  // =============
2359  // fedcba9876543210
2360  // ----------------
2361  // ......mmmmmmmmmm  10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals)
2362  // .eeeee..........  5-bit exponent
2363  // .00000..........  denormals
2364  // .00001..........  -14 exponent
2365  // .11110..........   15 exponent
2366  // .111110000000000  infinity
2367  // .11111nnnnnnnnnn  NaN with n!=0
2368  // s...............  sign
2369  //------------------------------------------------------------------------------------------------------------------------------
2370  // FP16/INT16 ALIASING DENORMAL
2371  // ============================
2372  // 11-bit unsigned integers alias with half float denormal/normal values,
2373  //     1 = 2^(-24) = 1/16777216 ....................... first denormal value
2374  //     2 = 2^(-23)
2375  //   ...
2376  //  1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value
2377  //  1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers
2378  //  2047 .............................................. last normal value that still maps to integers 
2379  // Scaling limits,
2380  //  2^15 = 32768 ...................................... largest power of 2 scaling
2381  // Largest pow2 conversion mapping is at *32768,
2382  //     1 : 2^(-9) = 1/512
2383  //     2 : 1/256
2384  //     4 : 1/128
2385  //     8 : 1/64
2386  //    16 : 1/32
2387  //    32 : 1/16
2388  //    64 : 1/8
2389  //   128 : 1/4
2390  //   256 : 1/2
2391  //   512 : 1
2392  //  1024 : 2
2393  //  2047 : a little less than 4
2394  //==============================================================================================================================
2395  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2396  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2397  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2398  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2399  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2400  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2401  //_____________________________________________________________/\_______________________________________________________________
2402  //==============================================================================================================================
2403  //
2404  //
2405  //                                                     GPU/CPU PORTABILITY
2406  //
2407  //
2408  //------------------------------------------------------------------------------------------------------------------------------
2409  // This is the GPU implementation.
2410  // See the CPU implementation for docs.
2411  //==============================================================================================================================
2412  #ifdef A_GPU
       // On the GPU path the boolean constants map directly to the shading language's 'true'/'false',
       // and A_STATIC expands to nothing.
2413   #define A_TRUE true
2414   #define A_FALSE false
2415   #define A_STATIC
2416  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2417  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2418  //_____________________________________________________________/\_______________________________________________________________
2419  //==============================================================================================================================
2420  //                                     VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
2421  //==============================================================================================================================
       // ret*: spelling of a vector return type; on the GPU path it is the vector type itself.
2422   #define retAD2 AD2
2423   #define retAD3 AD3
2424   #define retAD4 AD4
2425   #define retAF2 AF2
2426   #define retAF3 AF3
2427   #define retAF4 AF4
2428   #define retAL2 AL2
2429   #define retAL3 AL3
2430   #define retAL4 AL4
2431   #define retAU2 AU2
2432   #define retAU3 AU3
2433   #define retAU4 AU4
2434  //------------------------------------------------------------------------------------------------------------------------------
       // in*: vector parameter declared with the 'in' qualifier.
2435   #define inAD2 in AD2
2436   #define inAD3 in AD3
2437   #define inAD4 in AD4
2438   #define inAF2 in AF2
2439   #define inAF3 in AF3
2440   #define inAF4 in AF4
2441   #define inAL2 in AL2
2442   #define inAL3 in AL3
2443   #define inAL4 in AL4
2444   #define inAU2 in AU2
2445   #define inAU3 in AU3
2446   #define inAU4 in AU4
2447  //------------------------------------------------------------------------------------------------------------------------------
       // inout*: vector parameter declared with the 'inout' qualifier.
2448   #define inoutAD2 inout AD2
2449   #define inoutAD3 inout AD3
2450   #define inoutAD4 inout AD4
2451   #define inoutAF2 inout AF2
2452   #define inoutAF3 inout AF3
2453   #define inoutAF4 inout AF4
2454   #define inoutAL2 inout AL2
2455   #define inoutAL3 inout AL3
2456   #define inoutAL4 inout AL4
2457   #define inoutAU2 inout AU2
2458   #define inoutAU3 inout AU3
2459   #define inoutAU4 inout AU4
2460  //------------------------------------------------------------------------------------------------------------------------------
       // out*: vector parameter declared with the 'out' qualifier.
2461   #define outAD2 out AD2
2462   #define outAD3 out AD3
2463   #define outAD4 out AD4
2464   #define outAF2 out AF2
2465   #define outAF3 out AF3
2466   #define outAF4 out AF4
2467   #define outAL2 out AL2
2468   #define outAL3 out AL3
2469   #define outAL4 out AL4
2470   #define outAU2 out AU2
2471   #define outAU3 out AU3
2472   #define outAU4 out AU4
2473  //------------------------------------------------------------------------------------------------------------------------------
       // var*(x): declares a vector variable named 'x' of the given type.
2474   #define varAD2(x) AD2 x
2475   #define varAD3(x) AD3 x
2476   #define varAD4(x) AD4 x
2477   #define varAF2(x) AF2 x
2478   #define varAF3(x) AF3 x
2479   #define varAF4(x) AF4 x
2480   #define varAL2(x) AL2 x
2481   #define varAL3(x) AL3 x
2482   #define varAL4(x) AL4 x
2483   #define varAU2(x) AU2 x
2484   #define varAU3(x) AU3 x
2485   #define varAU4(x) AU4 x
2486  //------------------------------------------------------------------------------------------------------------------------------
       // init*(...): builds a vector value from its components using the type's constructor.
2487   #define initAD2(x,y) AD2(x,y)
2488   #define initAD3(x,y,z) AD3(x,y,z)
2489   #define initAD4(x,y,z,w) AD4(x,y,z,w)
2490   #define initAF2(x,y) AF2(x,y)
2491   #define initAF3(x,y,z) AF3(x,y,z)
2492   #define initAF4(x,y,z,w) AF4(x,y,z,w)
2493   #define initAL2(x,y) AL2(x,y)
2494   #define initAL3(x,y,z) AL3(x,y,z)
2495   #define initAL4(x,y,z,w) AL4(x,y,z,w)
2496   #define initAU2(x,y) AU2(x,y)
2497   #define initAU3(x,y,z) AU3(x,y,z)
2498   #define initAU4(x,y,z,w) AU4(x,y,z,w)
2499  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2500  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2501  //_____________________________________________________________/\_______________________________________________________________
2502  //==============================================================================================================================
2503  //                                                     SCALAR RETURN OPS
2504  //==============================================================================================================================
       // Each macro forwards to the shading-language built-in of the same meaning.
       // Most cast their arguments to the scalar (or vector, for ADot*) type named in the suffix first.
2505   #define AAbsD1(a) abs(AD1(a))
2506   #define AAbsF1(a) abs(AF1(a))
2507  //------------------------------------------------------------------------------------------------------------------------------
2508   #define ACosD1(a) cos(AD1(a))
2509   #define ACosF1(a) cos(AF1(a))
2510  //------------------------------------------------------------------------------------------------------------------------------
       // Dot products: both operands are converted to the vector type in the macro name.
2511   #define ADotD2(a,b) dot(AD2(a),AD2(b))
2512   #define ADotD3(a,b) dot(AD3(a),AD3(b))
2513   #define ADotD4(a,b) dot(AD4(a),AD4(b))
2514   #define ADotF2(a,b) dot(AF2(a),AF2(b))
2515   #define ADotF3(a,b) dot(AF3(a),AF3(b))
2516   #define ADotF4(a,b) dot(AF4(a),AF4(b))
2517  //------------------------------------------------------------------------------------------------------------------------------
2518   #define AExp2D1(a) exp2(AD1(a))
2519   #define AExp2F1(a) exp2(AF1(a))
2520  //------------------------------------------------------------------------------------------------------------------------------
2521   #define AFloorD1(a) floor(AD1(a))
2522   #define AFloorF1(a) floor(AF1(a))
2523  //------------------------------------------------------------------------------------------------------------------------------
2524   #define ALog2D1(a) log2(AD1(a))
2525   #define ALog2F1(a) log2(AF1(a))
2526  //------------------------------------------------------------------------------------------------------------------------------
       // Unlike the macros above, AMax*/AMin* pass their arguments through without a cast.
2527   #define AMaxD1(a,b) max(a,b)
2528   #define AMaxF1(a,b) max(a,b)
2529   #define AMaxL1(a,b) max(a,b)
2530   #define AMaxU1(a,b) max(a,b)
2531  //------------------------------------------------------------------------------------------------------------------------------
2532   #define AMinD1(a,b) min(a,b)
2533   #define AMinF1(a,b) min(a,b)
2534   #define AMinL1(a,b) min(a,b)
2535   #define AMinU1(a,b) min(a,b)
2536  //------------------------------------------------------------------------------------------------------------------------------
2537   #define ASinD1(a) sin(AD1(a))
2538   #define ASinF1(a) sin(AF1(a))
2539  //------------------------------------------------------------------------------------------------------------------------------
2540   #define ASqrtD1(a) sqrt(AD1(a))
2541   #define ASqrtF1(a) sqrt(AF1(a))
2542  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2543  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2544  //_____________________________________________________________/\_______________________________________________________________
2545  //==============================================================================================================================
2546  //                                               SCALAR RETURN OPS - DEPENDENT
2547  //==============================================================================================================================
       // pow() wrappers: both the base and the exponent are cast to the scalar type named in the suffix,
       // matching the other *D1/*F1 scalar macros (APowD1 previously cast its exponent to AF1).
2548   #define APowD1(a,b) pow(AD1(a),AD1(b))
2549   #define APowF1(a,b) pow(AF1(a),AF1(b))
2550  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2551  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2552  //_____________________________________________________________/\_______________________________________________________________
2553  //==============================================================================================================================
2554  //                                                         VECTOR OPS
2555  //------------------------------------------------------------------------------------------------------------------------------
2556  // These are added as needed for production or prototyping, so not necessarily a complete set.
2557  // They follow a convention of taking in a destination and also returning the destination value to increase utility.
2558  //==============================================================================================================================
2559   #ifdef A_DUBL
2560    AD2 opAAbsD2(outAD2 d,inAD2 a){d=abs(a);return d;}
2561    AD3 opAAbsD3(outAD3 d,inAD3 a){d=abs(a);return d;}
2562    AD4 opAAbsD4(outAD4 d,inAD4 a){d=abs(a);return d;}
2563  //------------------------------------------------------------------------------------------------------------------------------
2564    AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d=a+b;return d;}
2565    AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d=a+b;return d;}
2566    AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d=a+b;return d;}
2567  //------------------------------------------------------------------------------------------------------------------------------
2568    AD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d=a+AD2_(b);return d;}
2569    AD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d=a+AD3_(b);return d;}
2570    AD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d=a+AD4_(b);return d;}
2571  //------------------------------------------------------------------------------------------------------------------------------
2572    AD2 opACpyD2(outAD2 d,inAD2 a){d=a;return d;}
2573    AD3 opACpyD3(outAD3 d,inAD3 a){d=a;return d;}
2574    AD4 opACpyD4(outAD4 d,inAD4 a){d=a;return d;}
2575  //------------------------------------------------------------------------------------------------------------------------------
2576    AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d=ALerpD2(a,b,c);return d;}
2577    AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d=ALerpD3(a,b,c);return d;}
2578    AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d=ALerpD4(a,b,c);return d;}
2579  //------------------------------------------------------------------------------------------------------------------------------
2580    AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d=ALerpD2(a,b,AD2_(c));return d;}
2581    AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d=ALerpD3(a,b,AD3_(c));return d;}
2582    AD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d=ALerpD4(a,b,AD4_(c));return d;}
2583  //------------------------------------------------------------------------------------------------------------------------------
2584    AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d=max(a,b);return d;}
2585    AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d=max(a,b);return d;}
2586    AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d=max(a,b);return d;}
2587  //------------------------------------------------------------------------------------------------------------------------------
2588    AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){AD2 lo=min(a,b);d=lo;return lo;} // d receives min(a,b); the same value is returned.
2589    AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){AD3 lo=min(a,b);d=lo;return lo;} // d receives min(a,b); the same value is returned.
2590    AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){AD4 lo=min(a,b);d=lo;return lo;} // d receives min(a,b); the same value is returned.
2591  //------------------------------------------------------------------------------------------------------------------------------
2592    AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){AD2 prod=a*b;d=prod;return prod;} // d receives a*b; the same value is returned.
2593    AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){AD3 prod=a*b;d=prod;return prod;} // d receives a*b; the same value is returned.
2594    AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){AD4 prod=a*b;d=prod;return prod;} // d receives a*b; the same value is returned.
2595  //------------------------------------------------------------------------------------------------------------------------------
2596    AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){AD2 k=AD2_(b);d=a*k;return d;} // b is expanded through AD2_() and multiplied with a; result goes to d and is returned.
2597    AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){AD3 k=AD3_(b);d=a*k;return d;} // b is expanded through AD3_() and multiplied with a; result goes to d and is returned.
2598    AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){AD4 k=AD4_(b);d=a*k;return d;} // b is expanded through AD4_() and multiplied with a; result goes to d and is returned.
2599  //------------------------------------------------------------------------------------------------------------------------------
2600    AD2 opANegD2(outAD2 d,inAD2 a){AD2 neg=-a;d=neg;return neg;} // d receives -a; the same value is returned.
2601    AD3 opANegD3(outAD3 d,inAD3 a){AD3 neg=-a;d=neg;return neg;} // d receives -a; the same value is returned.
2602    AD4 opANegD4(outAD4 d,inAD4 a){AD4 neg=-a;d=neg;return neg;} // d receives -a; the same value is returned.
2603  //------------------------------------------------------------------------------------------------------------------------------
2604    AD2 opARcpD2(outAD2 d,inAD2 a){AD2 inv=ARcpD2(a);d=inv;return inv;} // d receives ARcpD2(a); the same value is returned.
2605    AD3 opARcpD3(outAD3 d,inAD3 a){AD3 inv=ARcpD3(a);d=inv;return inv;} // d receives ARcpD3(a); the same value is returned.
2606    AD4 opARcpD4(outAD4 d,inAD4 a){AD4 inv=ARcpD4(a);d=inv;return inv;} // d receives ARcpD4(a); the same value is returned.
2607   #endif
2608  //==============================================================================================================================
2609   AF2 opAAbsF2(outAF2 d,inAF2 a){AF2 mag=abs(a);d=mag;return mag;} // d receives abs(a); the same value is returned.
2610   AF3 opAAbsF3(outAF3 d,inAF3 a){AF3 mag=abs(a);d=mag;return mag;} // d receives abs(a); the same value is returned.
2611   AF4 opAAbsF4(outAF4 d,inAF4 a){AF4 mag=abs(a);d=mag;return mag;} // d receives abs(a); the same value is returned.
2612  //------------------------------------------------------------------------------------------------------------------------------
2613   AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){AF2 sum=a+b;d=sum;return sum;} // d receives a+b; the same value is returned.
2614   AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){AF3 sum=a+b;d=sum;return sum;} // d receives a+b; the same value is returned.
2615   AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){AF4 sum=a+b;d=sum;return sum;} // d receives a+b; the same value is returned.
2616  //------------------------------------------------------------------------------------------------------------------------------
2617   AF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){AF2 k=AF2_(b);d=a+k;return d;} // b is expanded through AF2_() and added to a; result goes to d and is returned.
2618   AF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){AF3 k=AF3_(b);d=a+k;return d;} // b is expanded through AF3_() and added to a; result goes to d and is returned.
2619   AF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){AF4 k=AF4_(b);d=a+k;return d;} // b is expanded through AF4_() and added to a; result goes to d and is returned.
2620  //------------------------------------------------------------------------------------------------------------------------------
2621   AF2 opACpyF2(outAF2 d,inAF2 a){d=a;return a;} // Copies a into d; the copied value is also the return value.
2622   AF3 opACpyF3(outAF3 d,inAF3 a){d=a;return a;} // Copies a into d; the copied value is also the return value.
2623   AF4 opACpyF4(outAF4 d,inAF4 a){d=a;return a;} // Copies a into d; the copied value is also the return value.
2624  //------------------------------------------------------------------------------------------------------------------------------
2625   AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){AF2 mixed=ALerpF2(a,b,c);d=mixed;return mixed;} // d receives ALerpF2(a,b,c); same value returned.
2626   AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){AF3 mixed=ALerpF3(a,b,c);d=mixed;return mixed;} // d receives ALerpF3(a,b,c); same value returned.
2627   AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){AF4 mixed=ALerpF4(a,b,c);d=mixed;return mixed;} // d receives ALerpF4(a,b,c); same value returned.
2628  //------------------------------------------------------------------------------------------------------------------------------
2629   AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){AF2 t=AF2_(c);d=ALerpF2(a,b,t);return d;} // c is expanded through AF2_() and used as the third ALerpF2 argument.
2630   AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){AF3 t=AF3_(c);d=ALerpF3(a,b,t);return d;} // c is expanded through AF3_() and used as the third ALerpF3 argument.
2631   AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){AF4 t=AF4_(c);d=ALerpF4(a,b,t);return d;} // c is expanded through AF4_() and used as the third ALerpF4 argument.
2632  //------------------------------------------------------------------------------------------------------------------------------
2633   AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){AF2 hi=max(a,b);d=hi;return hi;} // d receives max(a,b); the same value is returned.
2634   AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){AF3 hi=max(a,b);d=hi;return hi;} // d receives max(a,b); the same value is returned.
2635   AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){AF4 hi=max(a,b);d=hi;return hi;} // d receives max(a,b); the same value is returned.
2636  //------------------------------------------------------------------------------------------------------------------------------
2637   AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){AF2 lo=min(a,b);d=lo;return lo;} // d receives min(a,b); the same value is returned.
2638   AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){AF3 lo=min(a,b);d=lo;return lo;} // d receives min(a,b); the same value is returned.
2639   AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){AF4 lo=min(a,b);d=lo;return lo;} // d receives min(a,b); the same value is returned.
2640  //------------------------------------------------------------------------------------------------------------------------------
2641   AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){AF2 prod=a*b;d=prod;return prod;} // d receives a*b; the same value is returned.
2642   AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){AF3 prod=a*b;d=prod;return prod;} // d receives a*b; the same value is returned.
2643   AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){AF4 prod=a*b;d=prod;return prod;} // d receives a*b; the same value is returned.
2644  //------------------------------------------------------------------------------------------------------------------------------
2645   AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){AF2 k=AF2_(b);d=a*k;return d;} // b is expanded through AF2_() and multiplied with a; result goes to d and is returned.
2646   AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){AF3 k=AF3_(b);d=a*k;return d;} // b is expanded through AF3_() and multiplied with a; result goes to d and is returned.
2647   AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){AF4 k=AF4_(b);d=a*k;return d;} // b is expanded through AF4_() and multiplied with a; result goes to d and is returned.
2648  //------------------------------------------------------------------------------------------------------------------------------
2649   AF2 opANegF2(outAF2 d,inAF2 a){AF2 neg=-a;d=neg;return neg;} // d receives -a; the same value is returned.
2650   AF3 opANegF3(outAF3 d,inAF3 a){AF3 neg=-a;d=neg;return neg;} // d receives -a; the same value is returned.
2651   AF4 opANegF4(outAF4 d,inAF4 a){AF4 neg=-a;d=neg;return neg;} // d receives -a; the same value is returned.
2652  //------------------------------------------------------------------------------------------------------------------------------
2653   AF2 opARcpF2(outAF2 d,inAF2 a){AF2 inv=ARcpF2(a);d=inv;return inv;} // d receives ARcpF2(a); the same value is returned.
2654   AF3 opARcpF3(outAF3 d,inAF3 a){AF3 inv=ARcpF3(a);d=inv;return inv;} // d receives ARcpF3(a); the same value is returned.
2655   AF4 opARcpF4(outAF4 d,inAF4 a){AF4 inv=ARcpF4(a);d=inv;return inv;} // d receives ARcpF4(a); the same value is returned.
2656  #endif