/ src / Ryujinx.Graphics.Vulkan / Effects / Shaders / FsrSharpening.glsl
FsrSharpening.glsl
   1  // Sharpening
   2  #version 430 core
   3  layout (local_size_x = 64) in;
   4  layout( rgba8, binding = 0, set = 3) uniform image2D imgOutput;
   5  layout( binding = 2 ) uniform invResolution
   6  {
   7      vec2 invResolution_data;
   8  };
   9  layout( binding = 3 ) uniform outvResolution
  10  {
  11      vec2 outvResolution_data;
  12  }; 
  13  layout( binding = 1, set = 2) uniform sampler2D source;
  14  layout( binding = 4 ) uniform sharpening
  15  {
  16      float sharpening_data;
  17  };
  18  
  19  #define A_GPU 1
  20  #define A_GLSL 1
  21  //==============================================================================================================================
  22  //
  23  //                                               [A] SHADER PORTABILITY 1.20210629
  24  //
  25  //==============================================================================================================================
  26  // FidelityFX Super Resolution Sample
  27  //
  28  // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
  29  // Permission is hereby granted, free of charge, to any person obtaining a copy
  30  // of this software and associated documentation files(the "Software"), to deal
  31  // in the Software without restriction, including without limitation the rights
  32  // to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
  33  // copies of the Software, and to permit persons to whom the Software is
  34  // furnished to do so, subject to the following conditions :
  35  // The above copyright notice and this permission notice shall be included in
  36  // all copies or substantial portions of the Software.
  37  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  38  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  39  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
  40  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  41  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  42  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  43  // THE SOFTWARE.
  44  //------------------------------------------------------------------------------------------------------------------------------
  45  // MIT LICENSE
  46  // ===========
  47  // Copyright (c) 2014 Michal Drobot (for concepts used in "FLOAT APPROXIMATIONS").
  48  // -----------
  49  // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
  50  // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
  51  // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
  52  // Software is furnished to do so, subject to the following conditions:
  53  // -----------
  54  // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
  55  // Software.
  56  // -----------
  57  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
  58  // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR
  59  // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  60  // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  61  //------------------------------------------------------------------------------------------------------------------------------
  62  // ABOUT
  63  // =====
  64  // Common central point for high-level shading language and C portability for various shader headers.
  65  //------------------------------------------------------------------------------------------------------------------------------
  66  // DEFINES
  67  // =======
  68  // A_CPU ..... Include the CPU related code.
  69  // A_GPU ..... Include the GPU related code.
  70  // A_GLSL .... Using GLSL.
  71  // A_HLSL .... Using HLSL.
  72  // A_HLSL_6_2  Using HLSL 6.2 with new 'uint16_t' and related types (requires '-enable-16bit-types').
// A_NO_16_BIT_CAST Don't use instructions that are not available in SPIR-V (needed for running A_HLSL_6_2 on Vulkan)
  74  // A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default).
  75  // =======
  76  // A_BYTE .... Support 8-bit integer.
  77  // A_HALF .... Support 16-bit integer and floating point.
  78  // A_LONG .... Support 64-bit integer.
  79  // A_DUBL .... Support 64-bit floating point.
  80  // =======
  81  // A_WAVE .... Support wave-wide operations.
  82  //------------------------------------------------------------------------------------------------------------------------------
  83  // To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'.
  84  //------------------------------------------------------------------------------------------------------------------------------
  85  // SIMPLIFIED TYPE SYSTEM
  86  // ======================
  87  //  - All ints will be unsigned with exception of when signed is required.
  88  //  - Type naming simplified and shortened "A<type><#components>",
  89  //     - H = 16-bit float (half)
  90  //     - F = 32-bit float (float)
  91  //     - D = 64-bit float (double)
  92  //     - P = 1-bit integer (predicate, not using bool because 'B' is used for byte)
  93  //     - B = 8-bit integer (byte)
  94  //     - W = 16-bit integer (word)
  95  //     - U = 32-bit integer (unsigned)
  96  //     - L = 64-bit integer (long)
  97  //  - Using "AS<type><#components>" for signed when required.
  98  //------------------------------------------------------------------------------------------------------------------------------
  99  // TODO
 100  // ====
 101  //  - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops).
 102  //------------------------------------------------------------------------------------------------------------------------------
 103  // CHANGE LOG
 104  // ==========
 105  // 20200914 - Expanded wave ops and prx code.
 106  // 20200713 - Added [ZOL] section, fixed serious bugs in sRGB and Rec.709 color conversion code, etc.
 107  //==============================================================================================================================
 108  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 109  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 110  //_____________________________________________________________/\_______________________________________________________________
 111  //==============================================================================================================================
 112  //                                                           COMMON
 113  //==============================================================================================================================
 114  #define A_2PI 6.28318530718
 115  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 116  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 117  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 118  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 119  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 120  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 121  //_____________________________________________________________/\_______________________________________________________________
 122  //==============================================================================================================================
 123  //
 124  //
 125  //                                                             CPU
 126  //
 127  //
 128  //==============================================================================================================================
 129  #ifdef A_CPU
 // User-overridable qualifiers: define A_RESTRICT and/or A_STATIC before this point to replace the defaults below.
 // A_RESTRICT qualifies the pointers used for vector arguments; the default is '__restrict'.
 #if !defined(A_RESTRICT)
  #define A_RESTRICT __restrict
 #endif
//------------------------------------------------------------------------------------------------------------------------------
 // A_STATIC prefixes every function defined in the CPU section; the default is 'static'.
 #if !defined(A_STATIC)
  #define A_STATIC static
 #endif
 138  //------------------------------------------------------------------------------------------------------------------------------
 // Scalar type names shared by the CPU and GPU code paths, grouped here by storage width.
 // Floating point.
 typedef float AF1;
 typedef double AD1;
 // Unsigned integers. The predicate type AP1 is a 32-bit integer (C friendly bool).
 typedef uint8_t AB1;
 typedef uint16_t AW1;
 typedef uint32_t AP1,AU1;
 typedef uint64_t AL1;
 // Signed integers ("AS" prefix), for the places where signed behaviour is required.
 typedef int8_t ASB1;
 typedef int16_t ASW1;
 typedef int32_t ASU1;
 typedef int64_t ASL1;
 152  //------------------------------------------------------------------------------------------------------------------------------
 // Function-style casts: X_(a) converts the expression 'a' to scalar type X.
 // Floating point and unsigned integer targets.
 #define AF1_(a) ((AF1)(a))
 #define AD1_(a) ((AD1)(a))
 #define AU1_(a) ((AU1)(a))
 #define AL1_(a) ((AL1)(a))
//------------------------------------------------------------------------------------------------------------------------------
 // Signed integer targets.
 #define ASU1_(a) ((ASU1)(a))
 #define ASL1_(a) ((ASL1)(a))
 160  //------------------------------------------------------------------------------------------------------------------------------
 161   A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;}
 162  //------------------------------------------------------------------------------------------------------------------------------
 // Integer truth values for C code.
 #define A_FALSE 0
 #define A_TRUE 1
 165  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 166  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 167  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 168  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 169  //_____________________________________________________________/\_______________________________________________________________
 170  //==============================================================================================================================
 171  //
 172  //                                                       CPU/GPU PORTING
 173  //
 174  //------------------------------------------------------------------------------------------------------------------------------
 175  // Get CPU and GPU to share all setup code, without duplicate code paths.
 176  // This uses a lower-case prefix for special vector constructs.
 177  //  - In C restrict pointers are used.
 178  //  - In the shading language, in/inout/out arguments are used.
 179  // This depends on the ability to access a vector value in both languages via array syntax (aka color[2]).
 180  //==============================================================================================================================
 181  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 182  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 183  //_____________________________________________________________/\_______________________________________________________________
 184  //==============================================================================================================================
 185  //                                     VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
 186  //==============================================================================================================================
 // Vector return/argument spellings for C.
 // Every ret*/in*/inout*/out* name expands to a restrict-qualified pointer to the element type, whatever the
 // component count (2, 3 or 4); the shading languages use in/inout/out arguments for the same roles.
 // The table is grouped by element type.
//------------------------------------------------------------------------------------------------------------------------------
 // D = 64-bit float elements.
 #define retAD2 AD1 *A_RESTRICT
 #define retAD3 AD1 *A_RESTRICT
 #define retAD4 AD1 *A_RESTRICT
 #define inAD2 AD1 *A_RESTRICT
 #define inAD3 AD1 *A_RESTRICT
 #define inAD4 AD1 *A_RESTRICT
 #define inoutAD2 AD1 *A_RESTRICT
 #define inoutAD3 AD1 *A_RESTRICT
 #define inoutAD4 AD1 *A_RESTRICT
 #define outAD2 AD1 *A_RESTRICT
 #define outAD3 AD1 *A_RESTRICT
 #define outAD4 AD1 *A_RESTRICT
//------------------------------------------------------------------------------------------------------------------------------
 // F = 32-bit float elements.
 #define retAF2 AF1 *A_RESTRICT
 #define retAF3 AF1 *A_RESTRICT
 #define retAF4 AF1 *A_RESTRICT
 #define inAF2 AF1 *A_RESTRICT
 #define inAF3 AF1 *A_RESTRICT
 #define inAF4 AF1 *A_RESTRICT
 #define inoutAF2 AF1 *A_RESTRICT
 #define inoutAF3 AF1 *A_RESTRICT
 #define inoutAF4 AF1 *A_RESTRICT
 #define outAF2 AF1 *A_RESTRICT
 #define outAF3 AF1 *A_RESTRICT
 #define outAF4 AF1 *A_RESTRICT
//------------------------------------------------------------------------------------------------------------------------------
 // L = 64-bit integer elements.
 #define retAL2 AL1 *A_RESTRICT
 #define retAL3 AL1 *A_RESTRICT
 #define retAL4 AL1 *A_RESTRICT
 #define inAL2 AL1 *A_RESTRICT
 #define inAL3 AL1 *A_RESTRICT
 #define inAL4 AL1 *A_RESTRICT
 #define inoutAL2 AL1 *A_RESTRICT
 #define inoutAL3 AL1 *A_RESTRICT
 #define inoutAL4 AL1 *A_RESTRICT
 #define outAL2 AL1 *A_RESTRICT
 #define outAL3 AL1 *A_RESTRICT
 #define outAL4 AL1 *A_RESTRICT
//------------------------------------------------------------------------------------------------------------------------------
 // U = 32-bit integer elements.
 #define retAU2 AU1 *A_RESTRICT
 #define retAU3 AU1 *A_RESTRICT
 #define retAU4 AU1 *A_RESTRICT
 #define inAU2 AU1 *A_RESTRICT
 #define inAU3 AU1 *A_RESTRICT
 #define inAU4 AU1 *A_RESTRICT
 #define inoutAU2 AU1 *A_RESTRICT
 #define inoutAU3 AU1 *A_RESTRICT
 #define inoutAU4 AU1 *A_RESTRICT
 #define outAU2 AU1 *A_RESTRICT
 #define outAU3 AU1 *A_RESTRICT
 #define outAU4 AU1 *A_RESTRICT
 238  //------------------------------------------------------------------------------------------------------------------------------
 // Vector variable declarations and initializers for C.
 // var<T><N>(x) declares 'x' as an array of N scalars of type T; init<T><N>(...) is the matching brace initializer.
 // Each declaration macro is listed next to its initializer.
//------------------------------------------------------------------------------------------------------------------------------
 // D = 64-bit float elements.
 #define varAD2(x) AD1 x[2]
 #define initAD2(x,y) {x,y}
 #define varAD3(x) AD1 x[3]
 #define initAD3(x,y,z) {x,y,z}
 #define varAD4(x) AD1 x[4]
 #define initAD4(x,y,z,w) {x,y,z,w}
//------------------------------------------------------------------------------------------------------------------------------
 // F = 32-bit float elements.
 #define varAF2(x) AF1 x[2]
 #define initAF2(x,y) {x,y}
 #define varAF3(x) AF1 x[3]
 #define initAF3(x,y,z) {x,y,z}
 #define varAF4(x) AF1 x[4]
 #define initAF4(x,y,z,w) {x,y,z,w}
//------------------------------------------------------------------------------------------------------------------------------
 // L = 64-bit integer elements.
 #define varAL2(x) AL1 x[2]
 #define initAL2(x,y) {x,y}
 #define varAL3(x) AL1 x[3]
 #define initAL3(x,y,z) {x,y,z}
 #define varAL4(x) AL1 x[4]
 #define initAL4(x,y,z,w) {x,y,z,w}
//------------------------------------------------------------------------------------------------------------------------------
 // U = 32-bit integer elements.
 #define varAU2(x) AU1 x[2]
 #define initAU2(x,y) {x,y}
 #define varAU3(x) AU1 x[3]
 #define initAU3(x,y,z) {x,y,z}
 #define varAU4(x) AU1 x[4]
 #define initAU4(x,y,z,w) {x,y,z,w}
 264  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 265  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 266  //_____________________________________________________________/\_______________________________________________________________
 267  //==============================================================================================================================
 268  //                                                     SCALAR RETURN OPS
 269  //------------------------------------------------------------------------------------------------------------------------------
 270  // TODO
 271  // ====
 272  //  - Replace transcendentals with manual versions. 
 273  //==============================================================================================================================
 274   #ifdef A_GCC
 275    A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);}
 276    A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);}
 277    A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));}
 278    A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_llabs(ASL1_(a)));}
 279   #else
 280    A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);}
 281    A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);}
 282    A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));}
 283    A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(labs((long)ASL1_(a)));}
 284   #endif
 285  //------------------------------------------------------------------------------------------------------------------------------
 286   #ifdef A_GCC
 287    A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);}
 288    A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);}
 289   #else
 290    A_STATIC AD1 ACosD1(AD1 a){return cos(a);}
 291    A_STATIC AF1 ACosF1(AF1 a){return cosf(a);}
 292   #endif
 293  //------------------------------------------------------------------------------------------------------------------------------
 294   A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];}
 295   A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
 296   A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
 297   A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];}
 298   A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
 299   A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
 300  //------------------------------------------------------------------------------------------------------------------------------
 301   #ifdef A_GCC
 302    A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);}
 303    A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);}
 304   #else
 305    A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);}
 306    A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);}
 307   #endif
 308  //------------------------------------------------------------------------------------------------------------------------------
 309   #ifdef A_GCC
 310    A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);}
 311    A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);}
 312   #else
 313    A_STATIC AD1 AFloorD1(AD1 a){return floor(a);}
 314    A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);}
 315   #endif
 316  //------------------------------------------------------------------------------------------------------------------------------
 317   A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);}
 318   A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);}
 319  //------------------------------------------------------------------------------------------------------------------------------
 320   #ifdef A_GCC
 321    A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);}
 322    A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);}
 323   #else
 324    A_STATIC AD1 ALog2D1(AD1 a){return log2(a);}
 325    A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);}
 326   #endif
 327  //------------------------------------------------------------------------------------------------------------------------------
 328   A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;}
 329   A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;}
 330   A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;}
 331   A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;}
 332  //------------------------------------------------------------------------------------------------------------------------------
 333   // These follow the convention that A integer types don't have signage, until they are operated on. 
 334   A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;}
 335   A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;}
 336  //------------------------------------------------------------------------------------------------------------------------------
 337   A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a<b?a:b;}
 338   A_STATIC AF1 AMinF1(AF1 a,AF1 b){return a<b?a:b;}
 339   A_STATIC AL1 AMinL1(AL1 a,AL1 b){return a<b?a:b;}
 340   A_STATIC AU1 AMinU1(AU1 a,AU1 b){return a<b?a:b;}
 341  //------------------------------------------------------------------------------------------------------------------------------
 342   A_STATIC AL1 AMinSL1(AL1 a,AL1 b){return (ASL1_(a)<ASL1_(b))?a:b;}
 343   A_STATIC AU1 AMinSU1(AU1 a,AU1 b){return (ASU1_(a)<ASU1_(b))?a:b;}
 344  //------------------------------------------------------------------------------------------------------------------------------
 345   A_STATIC AD1 ARcpD1(AD1 a){return 1.0/a;}
 346   A_STATIC AF1 ARcpF1(AF1 a){return 1.0f/a;}
 347  //------------------------------------------------------------------------------------------------------------------------------
 348   A_STATIC AL1 AShrSL1(AL1 a,AL1 b){return AL1_(ASL1_(a)>>ASL1_(b));}
 349   A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));}
 350  //------------------------------------------------------------------------------------------------------------------------------
 351   #ifdef A_GCC
 352    A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);}
 353    A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);}
 354   #else
 355    A_STATIC AD1 ASinD1(AD1 a){return sin(a);}
 356    A_STATIC AF1 ASinF1(AF1 a){return sinf(a);}
 357   #endif
 358  //------------------------------------------------------------------------------------------------------------------------------
 359   #ifdef A_GCC
 360    A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);}
 361    A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);}
 362   #else
 363    A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);}
 364    A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);}
 365   #endif
 366  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 367  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 368  //_____________________________________________________________/\_______________________________________________________________
 369  //==============================================================================================================================
 370  //                                               SCALAR RETURN OPS - DEPENDENT
 371  //==============================================================================================================================
 372   A_STATIC AD1 AClampD1(AD1 x,AD1 n,AD1 m){return AMaxD1(n,AMinD1(x,m));}
 373   A_STATIC AF1 AClampF1(AF1 x,AF1 n,AF1 m){return AMaxF1(n,AMinF1(x,m));}
 374  //------------------------------------------------------------------------------------------------------------------------------
 375   A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);}
 376   A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);}
 377  //------------------------------------------------------------------------------------------------------------------------------
 378   A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));}
 379   A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));}
 380  //------------------------------------------------------------------------------------------------------------------------------
 381   A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));}
 382   A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));}
 383  //------------------------------------------------------------------------------------------------------------------------------
 384   A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));}
 385   A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));}
 386  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 387  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 388  //_____________________________________________________________/\_______________________________________________________________
 389  //==============================================================================================================================
 390  //                                                         VECTOR OPS
 391  //------------------------------------------------------------------------------------------------------------------------------
 392  // These are added as needed for production or prototyping, so not necessarily a complete set.
 393  // They follow a convention of taking in a destination and also returning the destination value to increase utility.
 394  //==============================================================================================================================
 395   A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;}
 396   A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;}
 397   A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;}
 398  //------------------------------------------------------------------------------------------------------------------------------
 399   A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;}
 400   A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;}
 401   A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;}
 402  //==============================================================================================================================
 403   A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
 404   A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
 405   A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
 406  //------------------------------------------------------------------------------------------------------------------------------
 407   A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
 408   A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
 409   A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
 410  //==============================================================================================================================
 411   A_STATIC retAD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;}
 412   A_STATIC retAD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;}
 413   A_STATIC retAD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;}
 414  //------------------------------------------------------------------------------------------------------------------------------
 415   A_STATIC retAF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;}
 416   A_STATIC retAF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;}
 417   A_STATIC retAF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;}
 418  //==============================================================================================================================
 419   A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;}
 420   A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;}
 421   A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;}
 422  //------------------------------------------------------------------------------------------------------------------------------
 423   A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;}
 424   A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;}
 425   A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;}
 426  //==============================================================================================================================
 427   A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;}
 428   A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;}
 429   A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;}
 430  //------------------------------------------------------------------------------------------------------------------------------
 431   A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;}
 432   A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;}
 433   A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;}
 434  //==============================================================================================================================
 435   A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;}
 436   A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;}
 437   A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;}
 438  //------------------------------------------------------------------------------------------------------------------------------
 439   A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;}
 440   A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;}
 441   A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;}
 442  //==============================================================================================================================
 443   A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;}
 444   A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;}
 445   A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;}
 446  //------------------------------------------------------------------------------------------------------------------------------
 447   A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;}
 448   A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;}
 449   A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;}
 450  //==============================================================================================================================
 451   A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;}
 452   A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;}
 453   A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;}
 454  //------------------------------------------------------------------------------------------------------------------------------
 455   A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;}
 456   A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;}
 457   A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;}
 458  //==============================================================================================================================
 459   A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;}
 460   A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;}
 461   A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;}
 462  //------------------------------------------------------------------------------------------------------------------------------
 463   A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;}
 464   A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;}
 465   A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;}
 466  //==============================================================================================================================
 467   A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;}
 468   A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;}
 469   A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;}
 470  //------------------------------------------------------------------------------------------------------------------------------
 471   A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;}
 472   A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;}
 473   A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;}
 474  //==============================================================================================================================
 475   A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){d[0]=-a[0];d[1]=-a[1];return d;}
 476   A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;}
 477   A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;}
 478  //------------------------------------------------------------------------------------------------------------------------------
 479   A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){d[0]=-a[0];d[1]=-a[1];return d;}
 480   A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;}
 481   A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;}
 482  //==============================================================================================================================
 483   A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);return d;}
 484   A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);return d;}
 485   A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);d[3]=ARcpD1(a[3]);return d;}
 486  //------------------------------------------------------------------------------------------------------------------------------
 487   A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);return d;}
 488   A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);return d;}
 489   A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);d[3]=ARcpF1(a[3]);return d;}
 490  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 491  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 492  //_____________________________________________________________/\_______________________________________________________________
 493  //==============================================================================================================================
 494  //                                                     HALF FLOAT PACKING
 495  //==============================================================================================================================
 496   // Convert float to half (in lower 16-bits of output).
 497   // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
 498   // Supports denormals.
 499   // Conversion rules are to make computations possibly "safer" on the GPU,
 500   //  -INF & -NaN -> -65504
 501   //  +INF & +NaN -> +65504
 502   A_STATIC AU1 AU1_AH1_AF1(AF1 f){
 503    static AW1 base[512]={
 504     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 505     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 506     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 507     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 508     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 509     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 510     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100,
 511     0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00,
 512     0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff,
 513     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 514     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 515     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 516     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 517     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 518     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 519     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 520     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 521     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 522     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 523     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 524     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 525     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 526     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100,
 527     0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00,
 528     0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff,
 529     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 530     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 531     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 532     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 533     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 534     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 535     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff};
 536    static AB1 shift[512]={
 537     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 538     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 539     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 540     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 541     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 542     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 543     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
 544     0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
 545     0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
 546     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 547     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 548     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 549     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 550     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 551     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 552     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 553     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 554     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 555     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 556     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 557     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 558     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 559     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
 560     0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
 561     0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
 562     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 563     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 564     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 565     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 566     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 567     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 568     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18};
 569    union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);}
 570  //------------------------------------------------------------------------------------------------------------------------------
 571   // Used to output packed constant.
 572   A_STATIC AU1 AU1_AH2_AF2(inAF2 a){return AU1_AH1_AF1(a[0])+(AU1_AH1_AF1(a[1])<<16);}
 573  #endif
 574  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 575  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 576  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 577  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 578  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 579  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 580  //_____________________________________________________________/\_______________________________________________________________
 581  //==============================================================================================================================
 582  //
 583  //
 584  //                                                            GLSL
 585  //
 586  //
 587  //==============================================================================================================================
 588  #if defined(A_GLSL) && defined(A_GPU)
 // Extension requests; define A_SKIP_EXT to suppress all of them.
 #ifndef A_SKIP_EXT
  // Requested when A_HALF is defined.
  #ifdef A_HALF
   #extension GL_EXT_shader_16bit_storage : require
   #extension GL_EXT_shader_explicit_arithmetic_types : require
  #endif
//------------------------------------------------------------------------------------------------------------------------------
  // Requested when A_LONG is defined.
  #ifdef A_LONG
   #extension GL_ARB_gpu_shader_int64 : require
   #extension GL_NV_shader_atomic_int64 : require
  #endif
//------------------------------------------------------------------------------------------------------------------------------
  // Requested when A_WAVE is defined.
  #ifdef A_WAVE
   #extension GL_KHR_shader_subgroup_arithmetic : require
   #extension GL_KHR_shader_subgroup_ballot : require
   #extension GL_KHR_shader_subgroup_quad : require
   #extension GL_KHR_shader_subgroup_shuffle : require
  #endif
 #endif
 607  //==============================================================================================================================
 // Portable type names mapped onto GLSL types; the digit suffix is the component count.
 // AP*: boolean.
 #define AP1  bool
 #define AP2  bvec2
 #define AP3  bvec3
 #define AP4  bvec4
//------------------------------------------------------------------------------------------------------------------------------
 // AF*: float.
 #define AF1  float
 #define AF2  vec2
 #define AF3  vec3
 #define AF4  vec4
//------------------------------------------------------------------------------------------------------------------------------
 // AU*: unsigned integer.
 #define AU1  uint
 #define AU2  uvec2
 #define AU3  uvec3
 #define AU4  uvec4
//------------------------------------------------------------------------------------------------------------------------------
 // ASU*: signed integer.
 #define ASU1 int
 #define ASU2 ivec2
 #define ASU3 ivec3
 #define ASU4 ivec4
 627  //==============================================================================================================================
 // Bit casts: reinterpret unsigned integer bits as float (argument is first converted to AU*).
 #define AF1_AU1(v) uintBitsToFloat(AU1(v))
 #define AF2_AU2(v) uintBitsToFloat(AU2(v))
 #define AF3_AU3(v) uintBitsToFloat(AU3(v))
 #define AF4_AU4(v) uintBitsToFloat(AU4(v))
//------------------------------------------------------------------------------------------------------------------------------
 // Bit casts: reinterpret float bits as unsigned integer (argument is first converted to AF*).
 #define AU1_AF1(v) floatBitsToUint(AF1(v))
 #define AU2_AF2(v) floatBitsToUint(AF2(v))
 #define AU3_AF3(v) floatBitsToUint(AF3(v))
 #define AU4_AF4(v) floatBitsToUint(AF4(v))
 637  //------------------------------------------------------------------------------------------------------------------------------
 // Packs 'a' together with 0.0 via packHalf2x16; 'a' is the first component of the packed pair.
 AU1 AU1_AH1_AF1_x(AF1 a){AF2 pair=AF2(a,0.0);return packHalf2x16(pair);}
 // Macro front end: converts its argument to AF1 before calling the helper.
 #define AU1_AH1_AF1(v) AU1_AH1_AF1_x(AF1(v))
 640  //------------------------------------------------------------------------------------------------------------------------------
 // Pack aliases: portable names for the GLSL pack built-ins.
 #define AU1_AH2_AF2      packHalf2x16
 #define AU1_AW2Unorm_AF2 packUnorm2x16
 #define AU1_AB4Unorm_AF4 packUnorm4x8
//------------------------------------------------------------------------------------------------------------------------------
 // Unpack aliases: portable names for the matching GLSL unpack built-ins.
 #define AF2_AH2_AU1      unpackHalf2x16
 #define AF2_AW2Unorm_AU1 unpackUnorm2x16
 #define AF4_AB4Unorm_AU1 unpackUnorm4x8
 648  //==============================================================================================================================
 // Broadcast one float to every component of the result.
 AF1 AF1_x(AF1 a){return a;}
 AF2 AF2_x(AF1 a){return AF2(a);}
 AF3 AF3_x(AF1 a){return AF3(a);}
 AF4 AF4_x(AF1 a){return AF4(a);}
 // Macro front ends: convert the argument to AF1 first, then broadcast.
 #define AF1_(v) AF1_x(AF1(v))
 #define AF2_(v) AF2_x(AF1(v))
 #define AF3_(v) AF3_x(AF1(v))
 #define AF4_(v) AF4_x(AF1(v))
 657  //------------------------------------------------------------------------------------------------------------------------------
 // Broadcast one unsigned integer to every component of the result.
 AU1 AU1_x(AU1 a){return a;}
 AU2 AU2_x(AU1 a){return AU2(a);}
 AU3 AU3_x(AU1 a){return AU3(a);}
 AU4 AU4_x(AU1 a){return AU4(a);}
 // Macro front ends: convert the argument to AU1 first, then broadcast.
 #define AU1_(v) AU1_x(AU1(v))
 #define AU2_(v) AU2_x(AU1(v))
 #define AU3_(v) AU3_x(AU1(v))
 #define AU4_(v) AU4_x(AU1(v))
 666  //==============================================================================================================================
 // abs() of the input converted to signed (ASU*), converted back to unsigned (AU*).
 AU1 AAbsSU1(AU1 a){ASU1 s=ASU1(a);return AU1(abs(s));}
 AU2 AAbsSU2(AU2 a){ASU2 s=ASU2(a);return AU2(abs(s));}
 AU3 AAbsSU3(AU3 a){ASU3 s=ASU3(a);return AU3(abs(s));}
 AU4 AAbsSU4(AU4 a){ASU4 s=ASU4(a);return AU4(abs(s));}
 671  //------------------------------------------------------------------------------------------------------------------------------
 // Bitfield extract: forwards to bitfieldExtract with 'off' and 'bits' converted to signed.
 AU1 ABfe(AU1 src,AU1 off,AU1 bits){
  ASU1 offset=ASU1(off);
  ASU1 count=ASU1(bits);
  return bitfieldExtract(src,offset,count);}
 // Bitfield insert by mask: bits of 'ins' where 'mask' is set, bits of 'src' elsewhere.
 AU1 ABfi(AU1 src,AU1 ins,AU1 mask){
  AU1 inserted=ins&mask;
  AU1 kept=src&(~mask);
  return inserted|kept;}
 // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
 AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){
  ASU1 count=ASU1(bits);
  return bitfieldInsert(src,ins,0,count);}
 676  //------------------------------------------------------------------------------------------------------------------------------
 // V_MED3_F32.
 // Clamp 'x' to the range [n,m] using the clamp() built-in.
 AF1 AClampF1(AF1 x,AF1 n,AF1 m){AF1 r=clamp(x,n,m);return r;}
 AF2 AClampF2(AF2 x,AF2 n,AF2 m){AF2 r=clamp(x,n,m);return r;}
 AF3 AClampF3(AF3 x,AF3 n,AF3 m){AF3 r=clamp(x,n,m);return r;}
 AF4 AClampF4(AF4 x,AF4 n,AF4 m){AF4 r=clamp(x,n,m);return r;}
 682  //------------------------------------------------------------------------------------------------------------------------------
 // V_FRACT_F32 (note DX frac() is different).
 // Thin wrappers over the fract() built-in.
 AF1 AFractF1(AF1 x){AF1 r=fract(x);return r;}
 AF2 AFractF2(AF2 x){AF2 r=fract(x);return r;}
 AF3 AFractF3(AF3 x){AF3 r=fract(x);return r;}
 AF4 AFractF4(AF4 x){AF4 r=fract(x);return r;}
 688  //------------------------------------------------------------------------------------------------------------------------------
 // Linear blend via the mix() built-in: mix(x,y,a), with a per-component weight 'a'.
 AF1 ALerpF1(AF1 x,AF1 y,AF1 a){AF1 r=mix(x,y,a);return r;}
 AF2 ALerpF2(AF2 x,AF2 y,AF2 a){AF2 r=mix(x,y,a);return r;}
 AF3 ALerpF3(AF3 x,AF3 y,AF3 a){AF3 r=mix(x,y,a);return r;}
 AF4 ALerpF4(AF4 x,AF4 y,AF4 a){AF4 r=mix(x,y,a);return r;}
 693  //------------------------------------------------------------------------------------------------------------------------------
 // V_MAX3_F32.
 // Maximum of three floats, evaluated as max(x,max(y,z)).
 AF1 AMax3F1(AF1 x,AF1 y,AF1 z){AF1 yz=max(y,z);return max(x,yz);}
 AF2 AMax3F2(AF2 x,AF2 y,AF2 z){AF2 yz=max(y,z);return max(x,yz);}
 AF3 AMax3F3(AF3 x,AF3 y,AF3 z){AF3 yz=max(y,z);return max(x,yz);}
 AF4 AMax3F4(AF4 x,AF4 y,AF4 z){AF4 yz=max(y,z);return max(x,yz);}
 699  //------------------------------------------------------------------------------------------------------------------------------
 // Maximum of three values compared as signed (ASU*), returned as unsigned (AU*).
 AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){ASU1 yz=max(ASU1(y),ASU1(z));return AU1(max(ASU1(x),yz));}
 AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){ASU2 yz=max(ASU2(y),ASU2(z));return AU2(max(ASU2(x),yz));}
 AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){ASU3 yz=max(ASU3(y),ASU3(z));return AU3(max(ASU3(x),yz));}
 AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){ASU4 yz=max(ASU4(y),ASU4(z));return AU4(max(ASU4(x),yz));}
 704  //------------------------------------------------------------------------------------------------------------------------------
 // Maximum of three unsigned integers, evaluated as max(x,max(y,z)).
 AU1 AMax3U1(AU1 x,AU1 y,AU1 z){AU1 yz=max(y,z);return max(x,yz);}
 AU2 AMax3U2(AU2 x,AU2 y,AU2 z){AU2 yz=max(y,z);return max(x,yz);}
 AU3 AMax3U3(AU3 x,AU3 y,AU3 z){AU3 yz=max(y,z);return max(x,yz);}
 AU4 AMax3U4(AU4 x,AU4 y,AU4 z){AU4 yz=max(y,z);return max(x,yz);}
 709  //------------------------------------------------------------------------------------------------------------------------------
 // Maximum of two values compared as signed (ASU*), returned as unsigned (AU*).
 AU1 AMaxSU1(AU1 a,AU1 b){ASU1 sa=ASU1(a);ASU1 sb=ASU1(b);return AU1(max(sa,sb));}
 AU2 AMaxSU2(AU2 a,AU2 b){ASU2 sa=ASU2(a);ASU2 sb=ASU2(b);return AU2(max(sa,sb));}
 AU3 AMaxSU3(AU3 a,AU3 b){ASU3 sa=ASU3(a);ASU3 sb=ASU3(b);return AU3(max(sa,sb));}
 AU4 AMaxSU4(AU4 a,AU4 b){ASU4 sa=ASU4(a);ASU4 sb=ASU4(b);return AU4(max(sa,sb));}
 714  //------------------------------------------------------------------------------------------------------------------------------
 // Clamp has an easier pattern match for med3 when some ordering is known.
 // V_MED3_F32.
 // Median of three, evaluated as max(min(x,y),min(max(x,y),z)).
 AF1 AMed3F1(AF1 x,AF1 y,AF1 z){AF1 lo=min(x,y);AF1 hi=max(x,y);return max(lo,min(hi,z));}
 AF2 AMed3F2(AF2 x,AF2 y,AF2 z){AF2 lo=min(x,y);AF2 hi=max(x,y);return max(lo,min(hi,z));}
 AF3 AMed3F3(AF3 x,AF3 y,AF3 z){AF3 lo=min(x,y);AF3 hi=max(x,y);return max(lo,min(hi,z));}
 AF4 AMed3F4(AF4 x,AF4 y,AF4 z){AF4 lo=min(x,y);AF4 hi=max(x,y);return max(lo,min(hi,z));}
 721  //------------------------------------------------------------------------------------------------------------------------------
 // V_MIN3_F32.
 // Minimum of three floats, evaluated as min(x,min(y,z)).
 AF1 AMin3F1(AF1 x,AF1 y,AF1 z){AF1 yz=min(y,z);return min(x,yz);}
 AF2 AMin3F2(AF2 x,AF2 y,AF2 z){AF2 yz=min(y,z);return min(x,yz);}
 AF3 AMin3F3(AF3 x,AF3 y,AF3 z){AF3 yz=min(y,z);return min(x,yz);}
 AF4 AMin3F4(AF4 x,AF4 y,AF4 z){AF4 yz=min(y,z);return min(x,yz);}
 727  //------------------------------------------------------------------------------------------------------------------------------
 // Minimum of three values compared as signed (ASU*), returned as unsigned (AU*).
 AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){ASU1 yz=min(ASU1(y),ASU1(z));return AU1(min(ASU1(x),yz));}
 AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){ASU2 yz=min(ASU2(y),ASU2(z));return AU2(min(ASU2(x),yz));}
 AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){ASU3 yz=min(ASU3(y),ASU3(z));return AU3(min(ASU3(x),yz));}
 AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){ASU4 yz=min(ASU4(y),ASU4(z));return AU4(min(ASU4(x),yz));}
 732  //------------------------------------------------------------------------------------------------------------------------------
 // Minimum of three unsigned integers, evaluated as min(x,min(y,z)).
 AU1 AMin3U1(AU1 x,AU1 y,AU1 z){AU1 yz=min(y,z);return min(x,yz);}
 AU2 AMin3U2(AU2 x,AU2 y,AU2 z){AU2 yz=min(y,z);return min(x,yz);}
 AU3 AMin3U3(AU3 x,AU3 y,AU3 z){AU3 yz=min(y,z);return min(x,yz);}
 AU4 AMin3U4(AU4 x,AU4 y,AU4 z){AU4 yz=min(y,z);return min(x,yz);}
 737  //------------------------------------------------------------------------------------------------------------------------------
 // Minimum of two values compared as signed (ASU*), returned as unsigned (AU*).
 AU1 AMinSU1(AU1 a,AU1 b){ASU1 sa=ASU1(a);ASU1 sb=ASU1(b);return AU1(min(sa,sb));}
 AU2 AMinSU2(AU2 a,AU2 b){ASU2 sa=ASU2(a);ASU2 sb=ASU2(b);return AU2(min(sa,sb));}
 AU3 AMinSU3(AU3 a,AU3 b){ASU3 sa=ASU3(a);ASU3 sb=ASU3(b);return AU3(min(sa,sb));}
 AU4 AMinSU4(AU4 a,AU4 b){ASU4 sa=ASU4(a);ASU4 sb=ASU4(b);return AU4(min(sa,sb));}
 742  //------------------------------------------------------------------------------------------------------------------------------
 // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
 // V_COS_F32.
 // The input is scaled by A_2PI before cos() is applied.
 AF1 ANCosF1(AF1 x){AF1 scale=AF1_(A_2PI);return cos(x*scale);}
 AF2 ANCosF2(AF2 x){AF2 scale=AF2_(A_2PI);return cos(x*scale);}
 AF3 ANCosF3(AF3 x){AF3 scale=AF3_(A_2PI);return cos(x*scale);}
 AF4 ANCosF4(AF4 x){AF4 scale=AF4_(A_2PI);return cos(x*scale);}
 749  //------------------------------------------------------------------------------------------------------------------------------
 // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
 // V_SIN_F32.
 // The input is scaled by A_2PI before sin() is applied.
 AF1 ANSinF1(AF1 x){AF1 scale=AF1_(A_2PI);return sin(x*scale);}
 AF2 ANSinF2(AF2 x){AF2 scale=AF2_(A_2PI);return sin(x*scale);}
 AF3 ANSinF3(AF3 x){AF3 scale=AF3_(A_2PI);return sin(x*scale);}
 AF4 ANSinF4(AF4 x){AF4 scale=AF4_(A_2PI);return sin(x*scale);}
 756  //------------------------------------------------------------------------------------------------------------------------------
 // Reciprocal, computed as 1.0/x per component.
 AF1 ARcpF1(AF1 x){AF1 one=AF1_(1.0);return one/x;}
 AF2 ARcpF2(AF2 x){AF2 one=AF2_(1.0);return one/x;}
 AF3 ARcpF3(AF3 x){AF3 one=AF3_(1.0);return one/x;}
 AF4 ARcpF4(AF4 x){AF4 one=AF4_(1.0);return one/x;}
 761  //------------------------------------------------------------------------------------------------------------------------------
 // Reciprocal square root, computed as 1.0/sqrt(x) per component.
 AF1 ARsqF1(AF1 x){AF1 one=AF1_(1.0);return one/sqrt(x);}
 AF2 ARsqF2(AF2 x){AF2 one=AF2_(1.0);return one/sqrt(x);}
 AF3 ARsqF3(AF3 x){AF3 one=AF3_(1.0);return one/sqrt(x);}
 AF4 ARsqF4(AF4 x){AF4 one=AF4_(1.0);return one/sqrt(x);}
 766  //------------------------------------------------------------------------------------------------------------------------------
 // Saturate: clamp each component to the range [0.0,1.0].
 AF1 ASatF1(AF1 x){AF1 lo=AF1_(0.0);AF1 hi=AF1_(1.0);return clamp(x,lo,hi);}
 AF2 ASatF2(AF2 x){AF2 lo=AF2_(0.0);AF2 hi=AF2_(1.0);return clamp(x,lo,hi);}
 AF3 ASatF3(AF3 x){AF3 lo=AF3_(0.0);AF3 hi=AF3_(1.0);return clamp(x,lo,hi);}
 AF4 ASatF4(AF4 x){AF4 lo=AF4_(0.0);AF4 hi=AF4_(1.0);return clamp(x,lo,hi);}
 771  //------------------------------------------------------------------------------------------------------------------------------
 // Right shift performed on the operands converted to signed (ASU*); result returned as unsigned (AU*).
 AU1 AShrSU1(AU1 a,AU1 b){ASU1 value=ASU1(a);ASU1 count=ASU1(b);return AU1(value>>count);}
 AU2 AShrSU2(AU2 a,AU2 b){ASU2 value=ASU2(a);ASU2 count=ASU2(b);return AU2(value>>count);}
 AU3 AShrSU3(AU3 a,AU3 b){ASU3 value=ASU3(a);ASU3 count=ASU3(b);return AU3(value>>count);}
 AU4 AShrSU4(AU4 a,AU4 b){ASU4 value=ASU4(a);ASU4 count=ASU4(b);return AU4(value>>count);}
 776  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 777  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 778  //_____________________________________________________________/\_______________________________________________________________
 779  //==============================================================================================================================
 780  //                                                          GLSL BYTE
 781  //==============================================================================================================================
 // 8-bit types and helpers; compiled only when A_BYTE is defined.
 #ifdef A_BYTE
  // AB*: unsigned 8-bit scalar and vectors.
  #define AB1  uint8_t
  #define AB2  u8vec2
  #define AB3  u8vec3
  #define AB4  u8vec4
//------------------------------------------------------------------------------------------------------------------------------
  // ASB*: signed 8-bit scalar and vectors.
  #define ASB1 int8_t
  #define ASB2 i8vec2
  #define ASB3 i8vec3
  #define ASB4 i8vec4
//------------------------------------------------------------------------------------------------------------------------------
  // Broadcast one byte to every component of the result.
  AB1 AB1_x(AB1 a){return a;}
  AB2 AB2_x(AB1 a){AB2 r=AB2(a,a);return r;}
  AB3 AB3_x(AB1 a){AB3 r=AB3(a,a,a);return r;}
  AB4 AB4_x(AB1 a){AB4 r=AB4(a,a,a,a);return r;}
  // Macro front ends: convert the argument to AB1 first, then broadcast.
  #define AB1_(v) AB1_x(AB1(v))
  #define AB2_(v) AB2_x(AB1(v))
  #define AB3_(v) AB3_x(AB1(v))
  #define AB4_(v) AB4_x(AB1(v))
 #endif
 802  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 803  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 804  //_____________________________________________________________/\_______________________________________________________________
 805  //==============================================================================================================================
 806  //                                                          GLSL HALF
 807  //==============================================================================================================================
 808   #ifdef A_HALF
 809    #define AH1 float16_t
 810    #define AH2 f16vec2
 811    #define AH3 f16vec3
 812    #define AH4 f16vec4
 813  //------------------------------------------------------------------------------------------------------------------------------
 814    #define AW1 uint16_t
 815    #define AW2 u16vec2
 816    #define AW3 u16vec3
 817    #define AW4 u16vec4
 818  //------------------------------------------------------------------------------------------------------------------------------
 819    #define ASW1 int16_t
 820    #define ASW2 i16vec2
 821    #define ASW3 i16vec3
 822    #define ASW4 i16vec4
 823  //==============================================================================================================================
        // Unpack 32-bit words into 16-bit vectors. Each macro wraps its argument in an AU1/AU2 constructor first;
        // AH4_AU2_x is a real function because it unpacks the two words of an AU2 separately.
 824    #define AH2_AU1(x) unpackFloat2x16(AU1(x))
 825    AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));}
 826    #define AH4_AU2(x) AH4_AU2_x(AU2(x))
 827    #define AW2_AU1(x) unpackUint2x16(AU1(x))
 828    #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x)))
 829  //------------------------------------------------------------------------------------------------------------------------------
        // Opposite direction: pack 16-bit vectors into 32-bit words (.xy goes to the first word, .zw to the second).
 830    #define AU1_AH2(x) packFloat2x16(AH2(x))
 831    AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));}
 832    #define AU2_AH4(x) AU2_AH4_x(AH4(x))
 833    #define AU1_AW2(x) packUint2x16(AW2(x))
 834    #define AU2_AW4(x) unpack32(packUint4x16(AW4(x)))
 835  //==============================================================================================================================
        // Half -> unsigned 16-bit conversions through halfBitsToUint16(); the argument is first converted to AHn.
        // NOTE(review): halfBitsToUint16/uint16BitsToHalf are not declared in this file; presumably they come from a
        // 16-bit GLSL extension enabled elsewhere -- confirm which one.
 836    #define AW1_AH1(x) halfBitsToUint16(AH1(x))
 837    #define AW2_AH2(x) halfBitsToUint16(AH2(x))
 838    #define AW3_AH3(x) halfBitsToUint16(AH3(x))
 839    #define AW4_AH4(x) halfBitsToUint16(AH4(x))
 840  //------------------------------------------------------------------------------------------------------------------------------
        // Unsigned 16-bit -> half conversions through uint16BitsToHalf(); the argument is first converted to AWn.
 841    #define AH1_AW1(x) uint16BitsToHalf(AW1(x))
 842    #define AH2_AW2(x) uint16BitsToHalf(AW2(x))
 843    #define AH3_AW3(x) uint16BitsToHalf(AW3(x))
 844    #define AH4_AW4(x) uint16BitsToHalf(AW4(x))
 845  //==============================================================================================================================
        // Broadcast ("splat") constructors: AHn_x() returns an n-wide half vector with every component set to `a`.
        // The AHn_() macros convert their argument to AH1 first, so a literal such as 1.0 can be passed directly.
        // A GLSL vector constructor given a single scalar replicates it into all components.
 846    AH1 AH1_x(AH1 a){return a;}
 847    AH2 AH2_x(AH1 a){return AH2(a);}
 848    AH3 AH3_x(AH1 a){return AH3(a);}
 849    AH4 AH4_x(AH1 a){return AH4(a);}
 850    #define AH1_(a) AH1_x(AH1(a))
 851    #define AH2_(a) AH2_x(AH1(a))
 852    #define AH3_(a) AH3_x(AH1(a))
 853    #define AH4_(a) AH4_x(AH1(a))
 854  //------------------------------------------------------------------------------------------------------------------------------
        // Broadcast ("splat") constructors for unsigned 16-bit integers: AWn_x() fills all n components with `a`.
        // The AWn_() macros convert their argument to AW1 before calling the function.
        // A GLSL vector constructor given a single scalar replicates it into all components.
 855    AW1 AW1_x(AW1 a){return a;}
 856    AW2 AW2_x(AW1 a){return AW2(a);}
 857    AW3 AW3_x(AW1 a){return AW3(a);}
 858    AW4 AW4_x(AW1 a){return AW4(a);}
 859    #define AW1_(a) AW1_x(AW1(a))
 860    #define AW2_(a) AW2_x(AW1(a))
 861    #define AW3_(a) AW3_x(AW1(a))
 862    #define AW4_(a) AW4_x(AW1(a))
 863  //==============================================================================================================================
        // Absolute value of 16-bit values held in unsigned containers: convert to the signed type (ASWn), take abs(),
        // and convert the result back to the unsigned type (AWn).
 864    AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));}
 865    AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
 866    AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
 867    AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
 868  //------------------------------------------------------------------------------------------------------------------------------
        // AClampHn: clamp x to the range [n,m] using the built-in clamp(), one overload name per vector width.
 869    AH1 AClampH1(AH1 x,AH1 n,AH1 m){return clamp(x,n,m);}
 870    AH2 AClampH2(AH2 x,AH2 n,AH2 m){return clamp(x,n,m);}
 871    AH3 AClampH3(AH3 x,AH3 n,AH3 m){return clamp(x,n,m);}
 872    AH4 AClampH4(AH4 x,AH4 n,AH4 m){return clamp(x,n,m);}
 873  //------------------------------------------------------------------------------------------------------------------------------
        // AFractHn: forwards to the built-in fract().
 874    AH1 AFractH1(AH1 x){return fract(x);}
 875    AH2 AFractH2(AH2 x){return fract(x);}
 876    AH3 AFractH3(AH3 x){return fract(x);}
 877    AH4 AFractH4(AH4 x){return fract(x);}
 878  //------------------------------------------------------------------------------------------------------------------------------
        // ALerpHn: forwards to the built-in mix(x,y,a); the weight `a` has the same width as x and y.
 879    AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);}
 880    AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);}
 881    AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);}
 882    AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);}
 883  //------------------------------------------------------------------------------------------------------------------------------
 884    // No packed version of max3.
        // AMax3Hn: component-wise maximum of three half values, built from two nested max() calls.
 885    AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
 886    AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
 887    AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
 888    AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
 889  //------------------------------------------------------------------------------------------------------------------------------
        // AMaxSWn: signed maximum of 16-bit values held in unsigned containers; the result is returned as AWn.
        // The operands must be converted through the 16-bit signed types (ASWn), as AAbsSWn and AShrSWn do. Going
        // through the 32-bit ASUn types keeps the unsigned value (0xFFFF becomes 65535, not -1), which turns the
        // comparison into an unsigned one.
 890    AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));}
 891    AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));}
 892    AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
 893    AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
 894  //------------------------------------------------------------------------------------------------------------------------------
 895    // No packed version of min3.
        // AMin3Hn: component-wise minimum of three half values, built from two nested min() calls.
 896    AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
 897    AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
 898    AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
 899    AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
 900  //------------------------------------------------------------------------------------------------------------------------------
        // AMinSWn: signed minimum of 16-bit values held in unsigned containers; the result is returned as AWn.
        // The operands must be converted through the 16-bit signed types (ASWn), as AAbsSWn and AShrSWn do. Going
        // through the 32-bit ASUn types keeps the unsigned value (0xFFFF becomes 65535, not -1), which turns the
        // comparison into an unsigned one.
 901    AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));}
 902    AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));}
 903    AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
 904    AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
 905  //------------------------------------------------------------------------------------------------------------------------------
        // ARcpHn: reciprocal, written as a splatted 1.0 divided by x.
 906    AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;}
 907    AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;}
 908    AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;}
 909    AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;}
 910  //------------------------------------------------------------------------------------------------------------------------------
        // ARsqHn: reciprocal square root, written as a splatted 1.0 divided by sqrt(x).
 911    AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);}
 912    AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);}
 913    AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);}
 914    AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);}
 915  //------------------------------------------------------------------------------------------------------------------------------
        // ASatHn: saturate, i.e. clamp x to [0.0,1.0] with splatted bounds.
 916    AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));}
 917    AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));}
 918    AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));}
 919    AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));}
 920  //------------------------------------------------------------------------------------------------------------------------------
        // AShrSWn: right shift performed on the signed 16-bit type. Both the value and the shift count are converted
        // to ASWn before the shift, and the result is converted back to the unsigned container AWn.
 921    AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));}
 922    AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
 923    AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
 924    AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
 925   #endif
 926  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 927  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 928  //_____________________________________________________________/\_______________________________________________________________
 929  //==============================================================================================================================
 930  //                                                         GLSL DOUBLE
 931  //==============================================================================================================================
 932   #ifdef A_DUBL
 933    #define AD1 double // AD1..AD4: double-precision scalar and 2/3/4-component vectors.
 934    #define AD2 dvec2
 935    #define AD3 dvec3
 936    #define AD4 dvec4
 937  //------------------------------------------------------------------------------------------------------------------------------
        // Broadcast ("splat") constructors for doubles: ADn_x() fills all n components with `a`.
        // The ADn_() macros convert their argument to AD1 before calling the function.
        // A GLSL vector constructor given a single scalar replicates it into all components.
 938    AD1 AD1_x(AD1 a){return a;}
 939    AD2 AD2_x(AD1 a){return AD2(a);}
 940    AD3 AD3_x(AD1 a){return AD3(a);}
 941    AD4 AD4_x(AD1 a){return AD4(a);}
 942    #define AD1_(a) AD1_x(AD1(a))
 943    #define AD2_(a) AD2_x(AD1(a))
 944    #define AD3_(a) AD3_x(AD1(a))
 945    #define AD4_(a) AD4_x(AD1(a))
 946  //==============================================================================================================================
        // AFractDn: forwards to the built-in fract().
 947    AD1 AFractD1(AD1 x){return fract(x);}
 948    AD2 AFractD2(AD2 x){return fract(x);}
 949    AD3 AFractD3(AD3 x){return fract(x);}
 950    AD4 AFractD4(AD4 x){return fract(x);}
 951  //------------------------------------------------------------------------------------------------------------------------------
        // ALerpDn: forwards to the built-in mix(x,y,a); the weight `a` has the same width as x and y.
 952    AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return mix(x,y,a);}
 953    AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return mix(x,y,a);}
 954    AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return mix(x,y,a);}
 955    AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return mix(x,y,a);}
 956  //------------------------------------------------------------------------------------------------------------------------------
        // ARcpDn: reciprocal, written as a splatted 1.0 divided by x.
 957    AD1 ARcpD1(AD1 x){return AD1_(1.0)/x;}
 958    AD2 ARcpD2(AD2 x){return AD2_(1.0)/x;}
 959    AD3 ARcpD3(AD3 x){return AD3_(1.0)/x;}
 960    AD4 ARcpD4(AD4 x){return AD4_(1.0)/x;}
 961  //------------------------------------------------------------------------------------------------------------------------------
        // ARsqDn: reciprocal square root, written as a splatted 1.0 divided by sqrt(x).
 962    AD1 ARsqD1(AD1 x){return AD1_(1.0)/sqrt(x);}
 963    AD2 ARsqD2(AD2 x){return AD2_(1.0)/sqrt(x);}
 964    AD3 ARsqD3(AD3 x){return AD3_(1.0)/sqrt(x);}
 965    AD4 ARsqD4(AD4 x){return AD4_(1.0)/sqrt(x);}
 966  //------------------------------------------------------------------------------------------------------------------------------
        // ASatDn: saturate, i.e. clamp x to [0.0,1.0] with splatted bounds.
 967    AD1 ASatD1(AD1 x){return clamp(x,AD1_(0.0),AD1_(1.0));}
 968    AD2 ASatD2(AD2 x){return clamp(x,AD2_(0.0),AD2_(1.0));}
 969    AD3 ASatD3(AD3 x){return clamp(x,AD3_(0.0),AD3_(1.0));}
 970    AD4 ASatD4(AD4 x){return clamp(x,AD4_(0.0),AD4_(1.0));}
 971   #endif
 972  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 973  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 974  //_____________________________________________________________/\_______________________________________________________________
 975  //==============================================================================================================================
 976  //                                                         GLSL LONG
 977  //==============================================================================================================================
 978   #ifdef A_LONG
 979    #define AL1 uint64_t // AL1..AL4: unsigned 64-bit integer scalar and vectors.
 980    #define AL2 u64vec2
 981    #define AL3 u64vec3
 982    #define AL4 u64vec4
 983  //------------------------------------------------------------------------------------------------------------------------------
 984    #define ASL1 int64_t // ASL1..ASL4: signed 64-bit integer scalar and vectors.
 985    #define ASL2 i64vec2
 986    #define ASL3 i64vec3
 987    #define ASL4 i64vec4
 988  //------------------------------------------------------------------------------------------------------------------------------
 989    #define AL1_AU2(x) packUint2x32(AU2(x)) // Two 32-bit words -> one 64-bit value.
 990    #define AU2_AL1(x) unpackUint2x32(AL1(x)) // One 64-bit value -> two 32-bit words.
 991  //------------------------------------------------------------------------------------------------------------------------------
        // Broadcast ("splat") constructors for unsigned 64-bit integers: ALn_x() fills all n components with `a`.
        // The ALn_() macros convert their argument to AL1 before calling the function.
        // A GLSL vector constructor given a single scalar replicates it into all components.
 992    AL1 AL1_x(AL1 a){return a;}
 993    AL2 AL2_x(AL1 a){return AL2(a);}
 994    AL3 AL3_x(AL1 a){return AL3(a);}
 995    AL4 AL4_x(AL1 a){return AL4(a);}
 996    #define AL1_(a) AL1_x(AL1(a))
 997    #define AL2_(a) AL2_x(AL1(a))
 998    #define AL3_(a) AL3_x(AL1(a))
 999    #define AL4_(a) AL4_x(AL1(a))
1000  //==============================================================================================================================
        // Absolute value of 64-bit values held in unsigned containers: convert to the signed type (ASLn), take abs(),
        // and convert the result back to the unsigned type (ALn).
1001    AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));}
1002    AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));}
1003    AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));}
1004    AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));}
1005  //------------------------------------------------------------------------------------------------------------------------------
        // AMaxSLn: signed maximum of 64-bit values held in unsigned containers; the result is returned as ALn.
        // The operands must be converted through the 64-bit signed types (ASLn), as AAbsSLn does. Converting through
        // the 32-bit ASUn types would discard the upper 32 bits of each operand before the comparison.
1006    AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASL1(a),ASL1(b)));}
1007    AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASL2(a),ASL2(b)));}
1008    AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASL3(a),ASL3(b)));}
1009    AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASL4(a),ASL4(b)));}
1010  //------------------------------------------------------------------------------------------------------------------------------
        // AMinSLn: signed minimum of 64-bit values held in unsigned containers; the result is returned as ALn.
        // The operands must be converted through the 64-bit signed types (ASLn), as AAbsSLn does. Converting through
        // the 32-bit ASUn types would discard the upper 32 bits of each operand before the comparison.
1011    AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASL1(a),ASL1(b)));}
1012    AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASL2(a),ASL2(b)));}
1013    AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASL3(a),ASL3(b)));}
1014    AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASL4(a),ASL4(b)));}
1015   #endif
1016  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1017  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1018  //_____________________________________________________________/\_______________________________________________________________
1019  //==============================================================================================================================
1020  //                                                      WAVE OPERATIONS
1021  //==============================================================================================================================
1022   #ifdef A_WAVE
1023    // Where 'x' must be a compile time literal.
        // AWaveXor{F,U}n: forward the value `v` and the mask `x` straight to subgroupShuffleXor(), one wrapper per
        // 32-bit float (AFn) and unsigned integer (AUn) width.
1024    AF1 AWaveXorF1(AF1 v,AU1 x){return subgroupShuffleXor(v,x);}
1025    AF2 AWaveXorF2(AF2 v,AU1 x){return subgroupShuffleXor(v,x);}
1026    AF3 AWaveXorF3(AF3 v,AU1 x){return subgroupShuffleXor(v,x);}
1027    AF4 AWaveXorF4(AF4 v,AU1 x){return subgroupShuffleXor(v,x);}
1028    AU1 AWaveXorU1(AU1 v,AU1 x){return subgroupShuffleXor(v,x);}
1029    AU2 AWaveXorU2(AU2 v,AU1 x){return subgroupShuffleXor(v,x);}
1030    AU3 AWaveXorU3(AU3 v,AU1 x){return subgroupShuffleXor(v,x);}
1031    AU4 AWaveXorU4(AU4 v,AU1 x){return subgroupShuffleXor(v,x);}
1032  //------------------------------------------------------------------------------------------------------------------------------
1033    #ifdef A_HALF
         // 16-bit variants: the vector is packed into 32-bit words (AU1/AU2), shuffled with subgroupShuffleXor(), and
         // unpacked back to the 16-bit vector type. Only the 2- and 4-component widths are provided here.
1034     AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(subgroupShuffleXor(AU1_AH2(v),x));}
1035     AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(subgroupShuffleXor(AU2_AH4(v),x));}
1036     AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(subgroupShuffleXor(AU1_AW2(v),x));}
1037     AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(subgroupShuffleXor(AU2_AW4(v),x));}
1038    #endif
1039   #endif
1040  //==============================================================================================================================
1041  #endif
1042  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1043  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1044  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1045  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1046  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1047  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1048  //_____________________________________________________________/\_______________________________________________________________
1049  //==============================================================================================================================
1050  //
1051  //
1052  //                                                            HLSL
1053  //
1054  //
1055  //==============================================================================================================================
1056  #if defined(A_HLSL) && defined(A_GPU)
1057   #ifdef A_HLSL_6_2
        // When A_HLSL_6_2 is defined the 32-bit float and integer aliases use the explicit-width type names.
        // AP = bool, AF = float, AU = unsigned integer, ASU = signed integer; the digit is the component count.
1058    #define AP1 bool
1059    #define AP2 bool2
1060    #define AP3 bool3
1061    #define AP4 bool4
1062  //------------------------------------------------------------------------------------------------------------------------------
1063    #define AF1 float32_t
1064    #define AF2 float32_t2
1065    #define AF3 float32_t3
1066    #define AF4 float32_t4
1067  //------------------------------------------------------------------------------------------------------------------------------
1068    #define AU1 uint32_t
1069    #define AU2 uint32_t2
1070    #define AU3 uint32_t3
1071    #define AU4 uint32_t4
1072  //------------------------------------------------------------------------------------------------------------------------------
1073    #define ASU1 int32_t
1074    #define ASU2 int32_t2
1075    #define ASU3 int32_t3
1076    #define ASU4 int32_t4
1077   #else
        // Otherwise the same aliases map to the classic HLSL names (float, uint, int and their vector forms).
1078    #define AP1 bool
1079    #define AP2 bool2
1080    #define AP3 bool3
1081    #define AP4 bool4
1082  //------------------------------------------------------------------------------------------------------------------------------
1083    #define AF1 float
1084    #define AF2 float2
1085    #define AF3 float3
1086    #define AF4 float4
1087  //------------------------------------------------------------------------------------------------------------------------------
1088    #define AU1 uint
1089    #define AU2 uint2
1090    #define AU3 uint3
1091    #define AU4 uint4
1092  //------------------------------------------------------------------------------------------------------------------------------
1093    #define ASU1 int
1094    #define ASU2 int2
1095    #define ASU3 int3
1096    #define ASU4 int4
1097   #endif
1098  //==============================================================================================================================
       // AFn_AUn: convert the argument to AUn, then reinterpret it as float with asfloat().
1099   #define AF1_AU1(x) asfloat(AU1(x))
1100   #define AF2_AU2(x) asfloat(AU2(x))
1101   #define AF3_AU3(x) asfloat(AU3(x))
1102   #define AF4_AU4(x) asfloat(AU4(x))
1103  //------------------------------------------------------------------------------------------------------------------------------
       // AUn_AFn: convert the argument to AFn, then reinterpret it as unsigned integer with asuint().
1104   #define AU1_AF1(x) asuint(AF1(x))
1105   #define AU2_AF2(x) asuint(AF2(x))
1106   #define AU3_AF3(x) asuint(AF3(x))
1107   #define AU4_AF4(x) asuint(AF4(x))
1108  //------------------------------------------------------------------------------------------------------------------------------
       // AU1_AH1_AF1: convert one float to half with f32tof16() and return the result in an AU1.
1109   AU1 AU1_AH1_AF1_x(AF1 a){return f32tof16(a);}
1110   #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
1111  //------------------------------------------------------------------------------------------------------------------------------
       // AU1_AH2_AF2: convert two floats to half and pack them into one AU1, a.x in bits 0..15 and a.y in bits 16..31.
1112   AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);}
1113   #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a)) 
       // NOTE(review): this forwards to D3DCOLORtoUBYTE4(); confirm that its return type and channel order match
       // what callers of a "pack 4 unorm bytes into one AU1" helper expect.
1114   #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x))
1115  //------------------------------------------------------------------------------------------------------------------------------
       // AF2_AH2_AU1: inverse of AU1_AH2_AF2 -- the low 16 bits become .x and the high 16 bits become .y.
1116   AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));}
1117   #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x))
1118  //==============================================================================================================================
       // Broadcast ("splat") constructors for floats: AFn_x() fills all n components with `a`.
       // The AFn_() macros convert their argument to AF1 before calling the function.
1119   AF1 AF1_x(AF1 a){return AF1(a);}
1120   AF2 AF2_x(AF1 a){return AF2(a,a);}
1121   AF3 AF3_x(AF1 a){return AF3(a,a,a);}
1122   AF4 AF4_x(AF1 a){return AF4(a,a,a,a);}
1123   #define AF1_(a) AF1_x(AF1(a))
1124   #define AF2_(a) AF2_x(AF1(a))
1125   #define AF3_(a) AF3_x(AF1(a))
1126   #define AF4_(a) AF4_x(AF1(a))
1127  //------------------------------------------------------------------------------------------------------------------------------
       // Same pattern for unsigned integers: AUn_x() fills all n components, AUn_() converts the argument to AU1.
1128   AU1 AU1_x(AU1 a){return AU1(a);}
1129   AU2 AU2_x(AU1 a){return AU2(a,a);}
1130   AU3 AU3_x(AU1 a){return AU3(a,a,a);}
1131   AU4 AU4_x(AU1 a){return AU4(a,a,a,a);}
1132   #define AU1_(a) AU1_x(AU1(a))
1133   #define AU2_(a) AU2_x(AU1(a))
1134   #define AU3_(a) AU3_x(AU1(a))
1135   #define AU4_(a) AU4_x(AU1(a))
1136  //==============================================================================================================================
       // Absolute value of 32-bit values held in unsigned containers: convert to the signed type (ASUn), take abs(),
       // and convert the result back to the unsigned type (AUn).
1137   AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));}
1138   AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));}
1139   AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));}
1140   AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));}
1141  //------------------------------------------------------------------------------------------------------------------------------
       // ABfe:  bit-field extract -- shift `src` right by `off` and keep the low `bits` bits.
       // ABfi:  bit-field insert with an explicit mask -- take `ins` where `mask` is set and `src` elsewhere.
       // ABfiM: like ABfi, but the mask is the low `bits` bits.
       // NOTE(review): the mask is built as (1u<<bits)-1, so the result for bits >= 32 depends on how the target
       // treats an out-of-range shift count -- confirm callers never pass that.
1142   AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1u<<bits)-1;return (src>>off)&mask;}
1143   AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));}
1144   AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1u<<bits)-1;return (ins&mask)|(src&(~mask));}
1145  //------------------------------------------------------------------------------------------------------------------------------
       // AClampFn: clamp x to [n,m], spelled as max(n,min(x,m)) rather than a clamp() intrinsic.
1146   AF1 AClampF1(AF1 x,AF1 n,AF1 m){return max(n,min(x,m));}
1147   AF2 AClampF2(AF2 x,AF2 n,AF2 m){return max(n,min(x,m));}
1148   AF3 AClampF3(AF3 x,AF3 n,AF3 m){return max(n,min(x,m));}
1149   AF4 AClampF4(AF4 x,AF4 n,AF4 m){return max(n,min(x,m));}
1150  //------------------------------------------------------------------------------------------------------------------------------
       // AFractFn: fractional part computed as x-floor(x).
1151   AF1 AFractF1(AF1 x){return x-floor(x);}
1152   AF2 AFractF2(AF2 x){return x-floor(x);}
1153   AF3 AFractF3(AF3 x){return x-floor(x);}
1154   AF4 AFractF4(AF4 x){return x-floor(x);}
1155  //------------------------------------------------------------------------------------------------------------------------------
       // ALerpFn: forwards to the intrinsic lerp(x,y,a); the weight `a` has the same width as x and y.
1156   AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return lerp(x,y,a);}
1157   AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return lerp(x,y,a);}
1158   AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return lerp(x,y,a);}
1159   AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return lerp(x,y,a);}
1160  //------------------------------------------------------------------------------------------------------------------------------
       // AMax3Fn: component-wise maximum of three floats, built from two nested max() calls.
1161   AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));}
1162   AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));}
1163   AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));}
1164   AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));}
1165  //------------------------------------------------------------------------------------------------------------------------------
       // AMax3SUn: maximum of three values compared as signed integers (ASUn); the result is returned as AUn.
1166   AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));}
1167   AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));}
1168   AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));}
1169   AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));}
1170  //------------------------------------------------------------------------------------------------------------------------------
       // AMax3Un: maximum of three values compared directly as unsigned integers.
1171   AU1 AMax3U1(AU1 x,AU1 y,AU1 z){return max(x,max(y,z));}
1172   AU2 AMax3U2(AU2 x,AU2 y,AU2 z){return max(x,max(y,z));}
1173   AU3 AMax3U3(AU3 x,AU3 y,AU3 z){return max(x,max(y,z));}
1174   AU4 AMax3U4(AU4 x,AU4 y,AU4 z){return max(x,max(y,z));}
1175  //------------------------------------------------------------------------------------------------------------------------------
       // AMaxSUn: maximum of two values compared as signed integers (ASUn); the result is returned as AUn.
1176   AU1 AMaxSU1(AU1 a,AU1 b){return AU1(max(ASU1(a),ASU1(b)));}
1177   AU2 AMaxSU2(AU2 a,AU2 b){return AU2(max(ASU2(a),ASU2(b)));}
1178   AU3 AMaxSU3(AU3 a,AU3 b){return AU3(max(ASU3(a),ASU3(b)));}
1179   AU4 AMaxSU4(AU4 a,AU4 b){return AU4(max(ASU4(a),ASU4(b)));}
1180  //------------------------------------------------------------------------------------------------------------------------------
       // AMed3Fn: component-wise median of three floats. min(x,y) is the lower bound; the larger of x and y is capped
       // by z, and the greater of those two intermediate values is the middle element.
1181   AF1 AMed3F1(AF1 x,AF1 y,AF1 z){return max(min(x,y),min(max(x,y),z));}
1182   AF2 AMed3F2(AF2 x,AF2 y,AF2 z){return max(min(x,y),min(max(x,y),z));}
1183   AF3 AMed3F3(AF3 x,AF3 y,AF3 z){return max(min(x,y),min(max(x,y),z));}
1184   AF4 AMed3F4(AF4 x,AF4 y,AF4 z){return max(min(x,y),min(max(x,y),z));}
1185  //------------------------------------------------------------------------------------------------------------------------------
       // AMin3Fn: component-wise minimum of three floats, built from two nested min() calls.
1186   AF1 AMin3F1(AF1 x,AF1 y,AF1 z){return min(x,min(y,z));}
1187   AF2 AMin3F2(AF2 x,AF2 y,AF2 z){return min(x,min(y,z));}
1188   AF3 AMin3F3(AF3 x,AF3 y,AF3 z){return min(x,min(y,z));}
1189   AF4 AMin3F4(AF4 x,AF4 y,AF4 z){return min(x,min(y,z));}
1190  //------------------------------------------------------------------------------------------------------------------------------
       // AMin3SUn: minimum of three values compared as signed integers (ASUn); the result is returned as AUn.
1191   AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){return AU1(min(ASU1(x),min(ASU1(y),ASU1(z))));}
1192   AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){return AU2(min(ASU2(x),min(ASU2(y),ASU2(z))));}
1193   AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){return AU3(min(ASU3(x),min(ASU3(y),ASU3(z))));}
1194   AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){return AU4(min(ASU4(x),min(ASU4(y),ASU4(z))));}
1195  //------------------------------------------------------------------------------------------------------------------------------
       // AMin3Un: minimum of three values compared directly as unsigned integers.
1196   AU1 AMin3U1(AU1 x,AU1 y,AU1 z){return min(x,min(y,z));}
1197   AU2 AMin3U2(AU2 x,AU2 y,AU2 z){return min(x,min(y,z));}
1198   AU3 AMin3U3(AU3 x,AU3 y,AU3 z){return min(x,min(y,z));}
1199   AU4 AMin3U4(AU4 x,AU4 y,AU4 z){return min(x,min(y,z));}
1200  //------------------------------------------------------------------------------------------------------------------------------
       // AMinSUn: minimum of two values compared as signed integers (ASUn); the result is returned as AUn.
1201   AU1 AMinSU1(AU1 a,AU1 b){return AU1(min(ASU1(a),ASU1(b)));}
1202   AU2 AMinSU2(AU2 a,AU2 b){return AU2(min(ASU2(a),ASU2(b)));}
1203   AU3 AMinSU3(AU3 a,AU3 b){return AU3(min(ASU3(a),ASU3(b)));}
1204   AU4 AMinSU4(AU4 a,AU4 b){return AU4(min(ASU4(a),ASU4(b)));}
1205  //------------------------------------------------------------------------------------------------------------------------------
       // ANCosFn: cosine of x scaled by A_2PI (a constant defined outside this section), so x is multiplied by that
       // constant before cos() is applied.
1206   AF1 ANCosF1(AF1 x){return cos(x*AF1_(A_2PI));}
1207   AF2 ANCosF2(AF2 x){return cos(x*AF2_(A_2PI));}
1208   AF3 ANCosF3(AF3 x){return cos(x*AF3_(A_2PI));}
1209   AF4 ANCosF4(AF4 x){return cos(x*AF4_(A_2PI));}
1210  //------------------------------------------------------------------------------------------------------------------------------
       // ANSinFn: sine counterpart -- x is multiplied by A_2PI before sin() is applied.
1211   AF1 ANSinF1(AF1 x){return sin(x*AF1_(A_2PI));}
1212   AF2 ANSinF2(AF2 x){return sin(x*AF2_(A_2PI));}
1213   AF3 ANSinF3(AF3 x){return sin(x*AF3_(A_2PI));}
1214   AF4 ANSinF4(AF4 x){return sin(x*AF4_(A_2PI));}
1215  //------------------------------------------------------------------------------------------------------------------------------
       // ARcpFn: reciprocal via the rcp() intrinsic.
1216   AF1 ARcpF1(AF1 x){return rcp(x);}
1217   AF2 ARcpF2(AF2 x){return rcp(x);}
1218   AF3 ARcpF3(AF3 x){return rcp(x);}
1219   AF4 ARcpF4(AF4 x){return rcp(x);}
1220  //------------------------------------------------------------------------------------------------------------------------------
       // ARsqFn: reciprocal square root via the rsqrt() intrinsic.
1221   AF1 ARsqF1(AF1 x){return rsqrt(x);}
1222   AF2 ARsqF2(AF2 x){return rsqrt(x);}
1223   AF3 ARsqF3(AF3 x){return rsqrt(x);}
1224   AF4 ARsqF4(AF4 x){return rsqrt(x);}
1225  //------------------------------------------------------------------------------------------------------------------------------
       // ASatFn: saturate via the saturate() intrinsic.
1226   AF1 ASatF1(AF1 x){return saturate(x);}
1227   AF2 ASatF2(AF2 x){return saturate(x);}
1228   AF3 ASatF3(AF3 x){return saturate(x);}
1229   AF4 ASatF4(AF4 x){return saturate(x);}
1230  //------------------------------------------------------------------------------------------------------------------------------
       // AShrSUn: right shift performed on the signed 32-bit type. Both the value and the shift count are converted
       // to ASUn before the shift, and the result is converted back to the unsigned container AUn.
1231   AU1 AShrSU1(AU1 a,AU1 b){return AU1(ASU1(a)>>ASU1(b));}
1232   AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));}
1233   AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));}
1234   AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));}
1235  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1236  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1237  //_____________________________________________________________/\_______________________________________________________________
1238  //==============================================================================================================================
1239  //                                                          HLSL BYTE
1240  //==============================================================================================================================
1241   #ifdef A_BYTE
1242   #endif
1243  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1244  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1245  //_____________________________________________________________/\_______________________________________________________________
1246  //==============================================================================================================================
1247  //                                                          HLSL HALF
1248  //==============================================================================================================================
1249   #ifdef A_HALF
1250    #ifdef A_HLSL_6_2
         // When A_HLSL_6_2 is defined the 16-bit aliases use the explicit-width types float16_t/uint16_t/int16_t.
         // AH = half float, AW = unsigned 16-bit integer, ASW = signed 16-bit integer; the digit is the width.
1251     #define AH1 float16_t
1252     #define AH2 float16_t2
1253     #define AH3 float16_t3
1254     #define AH4 float16_t4
1255  //------------------------------------------------------------------------------------------------------------------------------
1256     #define AW1 uint16_t
1257     #define AW2 uint16_t2
1258     #define AW3 uint16_t3
1259     #define AW4 uint16_t4
1260  //------------------------------------------------------------------------------------------------------------------------------
1261     #define ASW1 int16_t
1262     #define ASW2 int16_t2
1263     #define ASW3 int16_t3
1264     #define ASW4 int16_t4
1265    #else
         // Otherwise the aliases map to the min16float/min16uint/min16int types.
1266     #define AH1 min16float
1267     #define AH2 min16float2
1268     #define AH3 min16float3
1269     #define AH4 min16float4
1270  //------------------------------------------------------------------------------------------------------------------------------
1271     #define AW1 min16uint
1272     #define AW2 min16uint2
1273     #define AW3 min16uint3
1274     #define AW4 min16uint4
1275  //------------------------------------------------------------------------------------------------------------------------------
1276     #define ASW1 min16int
1277     #define ASW2 min16int2
1278     #define ASW3 min16int3
1279     #define ASW4 min16int4
1280    #endif
1281  //==============================================================================================================================
1282    // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly).
1283    // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/
        // AH2_AU1_x: split one 32-bit word into its low and high 16 bits, convert both with f16tof32(), return as AH2.
        // AW2_AU1_x: same split, but the two 16-bit halves are returned as integers (AW2) without conversion.
        // The *4_AU2_x variants apply the 2-wide helper to x.x and x.y. The macros convert the argument to AU1/AU2.
1284    AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);}
1285    AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));}
1286    AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);}
1287    AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));}
1288    #define AH2_AU1(x) AH2_AU1_x(AU1(x))
1289    #define AH4_AU2(x) AH4_AU2_x(AU2(x))
1290    #define AW2_AU1(x) AW2_AU1_x(AU1(x))
1291    #define AW4_AU2(x) AW4_AU2_x(AU2(x))
1292  //------------------------------------------------------------------------------------------------------------------------------
        // Pack helpers (inverse of the unpack helpers): .x lands in bits 0..15 and .y in bits 16..31 of the result.
        // AU1_AH2_x converts each half through f32tof16(); AU1_AW2_x widens each 16-bit integer to AU1 first.
        // The AU2_*4_x variants pack .xy into the first word and .zw into the second.
1293    AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);}
1294    AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));}
1295    AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);}
1296    AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));}
1297    #define AU1_AH2(x) AU1_AH2_x(AH2(x))
1298    #define AU2_AH4(x) AU2_AH4_x(AH4(x))
1299    #define AU1_AW2(x) AU1_AW2_x(AW2(x))
1300    #define AU2_AW4(x) AU2_AW4_x(AW4(x))
1301  //==============================================================================================================================
        // Half -> unsigned 16-bit. With A_HLSL_6_2 and without A_NO_16_BIT_CAST the asuint16() intrinsic is used
        // directly; otherwise each component goes through f32tof16() and the vector is rebuilt component by component.
1302    #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST)
1303     #define AW1_AH1(x) asuint16(x)
1304     #define AW2_AH2(x) asuint16(x)
1305     #define AW3_AH3(x) asuint16(x)
1306     #define AW4_AH4(x) asuint16(x)
1307    #else
1308     #define AW1_AH1(a) AW1(f32tof16(AF1(a)))
1309     #define AW2_AH2(a) AW2(AW1_AH1((a).x),AW1_AH1((a).y))
1310     #define AW3_AH3(a) AW3(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z))
1311     #define AW4_AH4(a) AW4(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z),AW1_AH1((a).w))
1312    #endif
1313  //------------------------------------------------------------------------------------------------------------------------------
        // Unsigned 16-bit -> half. Same split: asfloat16() when available, otherwise f16tof32() per component.
        // Note that the fallback macros expand their argument once per component.
1314    #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST)
1315     #define AH1_AW1(x) asfloat16(x)
1316     #define AH2_AW2(x) asfloat16(x)
1317     #define AH3_AW3(x) asfloat16(x)
1318     #define AH4_AW4(x) asfloat16(x)
1319    #else
1320     #define AH1_AW1(a) AH1(f16tof32(AU1(a)))
1321     #define AH2_AW2(a) AH2(AH1_AW1((a).x),AH1_AW1((a).y))
1322     #define AH3_AW3(a) AH3(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z))
1323     #define AH4_AW4(a) AH4(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z),AH1_AW1((a).w))
1324    #endif
1325  //==============================================================================================================================
        // Broadcast constructors for half types: AHn_x(a) returns an n-component vector with every component set to 'a'.
        // The AHn_() macros cast the argument to AH1 first, so literals of another scalar type can be passed.
1326    AH1 AH1_x(AH1 a){return AH1(a);}
1327    AH2 AH2_x(AH1 a){return AH2(a,a);}
1328    AH3 AH3_x(AH1 a){return AH3(a,a,a);}
1329    AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
1330    #define AH1_(a) AH1_x(AH1(a))
1331    #define AH2_(a) AH2_x(AH1(a))
1332    #define AH3_(a) AH3_x(AH1(a))
1333    #define AH4_(a) AH4_x(AH1(a))
1334  //------------------------------------------------------------------------------------------------------------------------------
        // Broadcast constructors for 16-bit unsigned types, same shape as the half versions above.
1335    AW1 AW1_x(AW1 a){return AW1(a);}
1336    AW2 AW2_x(AW1 a){return AW2(a,a);}
1337    AW3 AW3_x(AW1 a){return AW3(a,a,a);}
1338    AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
1339    #define AW1_(a) AW1_x(AW1(a))
1340    #define AW2_(a) AW2_x(AW1(a))
1341    #define AW3_(a) AW3_x(AW1(a))
1342    #define AW4_(a) AW4_x(AW1(a))
1343  //==============================================================================================================================
        // Absolute value of an AW* value treated as signed: cast to the signed ASW* type, take abs(), cast back to AW*.
1344    AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));}
1345    AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
1346    AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
1347    AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
1348  //------------------------------------------------------------------------------------------------------------------------------
        // Clamp 'x' to the range [n,m], component-wise. The min against 'm' is applied first, then the max against 'n',
        // so 'n' wins if the bounds are passed in the wrong order (n>m).
1349    AH1 AClampH1(AH1 x,AH1 n,AH1 m){return max(n,min(x,m));}
1350    AH2 AClampH2(AH2 x,AH2 n,AH2 m){return max(n,min(x,m));}
1351    AH3 AClampH3(AH3 x,AH3 n,AH3 m){return max(n,min(x,m));}
1352    AH4 AClampH4(AH4 x,AH4 n,AH4 m){return max(n,min(x,m));}
1353  //------------------------------------------------------------------------------------------------------------------------------
1354   // V_FRACT_F16 (note DX frac() is different).
        // Fractional part computed explicitly as x-floor(x) rather than through the frac() intrinsic.
1355    AH1 AFractH1(AH1 x){return x-floor(x);}
1356    AH2 AFractH2(AH2 x){return x-floor(x);}
1357    AH3 AFractH3(AH3 x){return x-floor(x);}
1358    AH4 AFractH4(AH4 x){return x-floor(x);}
1359  //------------------------------------------------------------------------------------------------------------------------------
        // Linear interpolation between 'x' and 'y' by weight 'a'; thin wrappers over the HLSL lerp() intrinsic.
1360    AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);}
1361    AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);}
1362    AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);}
1363    AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);}
1364  //------------------------------------------------------------------------------------------------------------------------------
        // Component-wise maximum of three half values, built from two nested max() calls.
1365    AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
1366    AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
1367    AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
1368    AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
1369  //------------------------------------------------------------------------------------------------------------------------------
        // Signed maximum of two AW* values, component-wise.
        // The operands are cast to the signed 16-bit ASW* types (as AAbsSW* and AShrSW* do) so that bit 15 acts as the
        // sign bit. Casting to the 32-bit ASU* types instead keeps the unsigned value, which would rank 0x8000..0xFFFF
        // above every positive input.
1370    AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));}
1371    AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));}
1372    AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
1373    AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
1374  //------------------------------------------------------------------------------------------------------------------------------
        // Component-wise minimum of three half values, built from two nested min() calls.
1375    AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
1376    AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
1377    AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
1378    AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
1379  //------------------------------------------------------------------------------------------------------------------------------
        // Signed minimum of two AW* values, component-wise.
        // The operands are cast to the signed 16-bit ASW* types (as AAbsSW* and AShrSW* do) so that bit 15 acts as the
        // sign bit. Casting to the 32-bit ASU* types instead keeps the unsigned value, which would rank 0x8000..0xFFFF
        // above every positive input.
1380    AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));}
1381    AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));}
1382    AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
1383    AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
1384  //------------------------------------------------------------------------------------------------------------------------------
        // Reciprocal; thin wrappers over the HLSL rcp() intrinsic.
1385    AH1 ARcpH1(AH1 x){return rcp(x);}
1386    AH2 ARcpH2(AH2 x){return rcp(x);}
1387    AH3 ARcpH3(AH3 x){return rcp(x);}
1388    AH4 ARcpH4(AH4 x){return rcp(x);}
1389  //------------------------------------------------------------------------------------------------------------------------------
        // Reciprocal square root; thin wrappers over the HLSL rsqrt() intrinsic.
1390    AH1 ARsqH1(AH1 x){return rsqrt(x);}
1391    AH2 ARsqH2(AH2 x){return rsqrt(x);}
1392    AH3 ARsqH3(AH3 x){return rsqrt(x);}
1393    AH4 ARsqH4(AH4 x){return rsqrt(x);}
1394  //------------------------------------------------------------------------------------------------------------------------------
        // Clamp to [0,1]; thin wrappers over the HLSL saturate() intrinsic.
1395    AH1 ASatH1(AH1 x){return saturate(x);}
1396    AH2 ASatH2(AH2 x){return saturate(x);}
1397    AH3 ASatH3(AH3 x){return saturate(x);}
1398    AH4 ASatH4(AH4 x){return saturate(x);}
1399  //------------------------------------------------------------------------------------------------------------------------------
        // Right shift of 'a' by 'b' with both operands cast to the signed ASW* type first, result cast back to AW*.
        // Performing the shift on the signed type is what makes it sign-propagating rather than zero-filling.
1400    AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));}
1401    AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
1402    AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
1403    AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
1404   #endif
1405  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1406  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1407  //_____________________________________________________________/\_______________________________________________________________
1408  //==============================================================================================================================
1409  //                                                         HLSL DOUBLE
1410  //==============================================================================================================================
1411   #ifdef A_DUBL
        // Double-precision support, compiled only when A_DUBL is defined.
        // AD1..AD4 name the scalar/vector double types: the explicit float64_t family under A_HLSL_6_2,
        // the classic double family otherwise.
1412    #ifdef A_HLSL_6_2
1413     #define AD1 float64_t
1414     #define AD2 float64_t2
1415     #define AD3 float64_t3
1416     #define AD4 float64_t4
1417    #else
1418     #define AD1 double
1419     #define AD2 double2
1420     #define AD3 double3
1421     #define AD4 double4
1422    #endif
1423  //------------------------------------------------------------------------------------------------------------------------------
        // Broadcast constructors: ADn_x(a) returns an n-component vector with every component set to 'a'.
        // The ADn_() macros cast the argument to AD1 first.
1424    AD1 AD1_x(AD1 a){return AD1(a);}
1425    AD2 AD2_x(AD1 a){return AD2(a,a);}
1426    AD3 AD3_x(AD1 a){return AD3(a,a,a);}
1427    AD4 AD4_x(AD1 a){return AD4(a,a,a,a);}
1428    #define AD1_(a) AD1_x(AD1(a))
1429    #define AD2_(a) AD2_x(AD1(a))
1430    #define AD3_(a) AD3_x(AD1(a))
1431    #define AD4_(a) AD4_x(AD1(a))
1432  //==============================================================================================================================
        // Fractional part computed explicitly as a-floor(a).
1433    AD1 AFractD1(AD1 a){return a-floor(a);}
1434    AD2 AFractD2(AD2 a){return a-floor(a);}
1435    AD3 AFractD3(AD3 a){return a-floor(a);}
1436    AD4 AFractD4(AD4 a){return a-floor(a);}
1437  //------------------------------------------------------------------------------------------------------------------------------
        // Linear interpolation between 'x' and 'y' by weight 'a' (lerp intrinsic).
1438    AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);}
1439    AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);}
1440    AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);}
1441    AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);}
1442  //------------------------------------------------------------------------------------------------------------------------------
        // Reciprocal (rcp intrinsic).
1443    AD1 ARcpD1(AD1 x){return rcp(x);}
1444    AD2 ARcpD2(AD2 x){return rcp(x);}
1445    AD3 ARcpD3(AD3 x){return rcp(x);}
1446    AD4 ARcpD4(AD4 x){return rcp(x);}
1447  //------------------------------------------------------------------------------------------------------------------------------
        // Reciprocal square root (rsqrt intrinsic).
1448    AD1 ARsqD1(AD1 x){return rsqrt(x);}
1449    AD2 ARsqD2(AD2 x){return rsqrt(x);}
1450    AD3 ARsqD3(AD3 x){return rsqrt(x);}
1451    AD4 ARsqD4(AD4 x){return rsqrt(x);}
1452  //------------------------------------------------------------------------------------------------------------------------------
        // Clamp to [0,1] (saturate intrinsic).
1453    AD1 ASatD1(AD1 x){return saturate(x);}
1454    AD2 ASatD2(AD2 x){return saturate(x);}
1455    AD3 ASatD3(AD3 x){return saturate(x);}
1456    AD4 ASatD4(AD4 x){return saturate(x);}
1457   #endif
1458  //==============================================================================================================================
1459  //                                                         HLSL WAVE
1460  //==============================================================================================================================
1461   #ifdef A_WAVE
1462    // Where 'x' must be a compile time literal.
        // Each helper returns the value of 'v' held by the lane whose index is (this lane's index XOR x).
1463    AF1 AWaveXorF1(AF1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1464    AF2 AWaveXorF2(AF2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1465    AF3 AWaveXorF3(AF3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1466    AF4 AWaveXorF4(AF4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1467    AU1 AWaveXorU1(AU1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
        // The AU2/AU3/AU4 overloads named AWaveXorU1 are kept so existing callers keep compiling.
1468    AU2 AWaveXorU1(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1469    AU3 AWaveXorU1(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1470    AU4 AWaveXorU1(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
        // Same operations under names that follow the <op><type><width> convention used by the AF* variants.
        AU2 AWaveXorU2(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
        AU3 AWaveXorU3(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
        AU4 AWaveXorU4(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1471  //------------------------------------------------------------------------------------------------------------------------------
1472    #ifdef A_HALF
         // 16-bit vectors are packed into 32-bit words for the lane read and unpacked afterwards.
         // Two components fit in one AU1; four components need an AU2, hence AU2_AW4/AW4_AU2 for the W4 variant.
1473     AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(WaveReadLaneAt(AU1_AH2(v),WaveGetLaneIndex()^x));}
1474     AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(WaveReadLaneAt(AU2_AH4(v),WaveGetLaneIndex()^x));}
1475     AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(WaveReadLaneAt(AU1_AW2(v),WaveGetLaneIndex()^x));}
1476     AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(WaveReadLaneAt(AU2_AW4(v),WaveGetLaneIndex()^x));}
1477    #endif
1478   #endif
1479  //==============================================================================================================================
1480  #endif
1481  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1482  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1483  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1484  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1485  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1486  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1487  //_____________________________________________________________/\_______________________________________________________________
1488  //==============================================================================================================================
1489  //
1490  //
1491  //                                                          GPU COMMON
1492  //
1493  //
1494  //==============================================================================================================================
1495  #ifdef A_GPU
1496   // Negative and positive infinity.
       // Built by reinterpreting the 32-bit patterns 0x7f800000 / 0xff800000 as float through AF1_AU1.
1497   #define A_INFP_F AF1_AU1(0x7f800000u)
1498   #define A_INFN_F AF1_AU1(0xff800000u)
1499  //------------------------------------------------------------------------------------------------------------------------------
1500   // Copy sign from 's' to positive 'd'.
       // Implemented as bits(d) | (bits(s) & 0x80000000). Because the sign bit is OR-ed in, it can be set but never
       // cleared, which is why 'd' is required to be positive.
1501   AF1 ACpySgnF1(AF1 d,AF1 s){return AF1_AU1(AU1_AF1(d)|(AU1_AF1(s)&AU1_(0x80000000u)));}
1502   AF2 ACpySgnF2(AF2 d,AF2 s){return AF2_AU2(AU2_AF2(d)|(AU2_AF2(s)&AU2_(0x80000000u)));}
1503   AF3 ACpySgnF3(AF3 d,AF3 s){return AF3_AU3(AU3_AF3(d)|(AU3_AF3(s)&AU3_(0x80000000u)));}
1504   AF4 ACpySgnF4(AF4 d,AF4 s){return AF4_AU4(AU4_AF4(d)|(AU4_AF4(s)&AU4_(0x80000000u)));}
1505  //------------------------------------------------------------------------------------------------------------------------------
1506   // Single operation to return (useful to create a mask to use in lerp for branch free logic),
1507   //  m=NaN := 0
1508   //  m>=0  := 0
1509   //  m<0   := 1
1510   // Uses the following useful floating point logic,
1511   //  saturate(+a*(-INF)==-INF) := 0
1512   //  saturate( 0*(-INF)== NaN) := 0
1513   //  saturate(-a*(-INF)==+INF) := 1
       // ASignedF*: multiply by negative infinity, then saturate (ASatF* is defined outside this chunk).
1514   AF1 ASignedF1(AF1 m){return ASatF1(m*AF1_(A_INFN_F));}
1515   AF2 ASignedF2(AF2 m){return ASatF2(m*AF2_(A_INFN_F));}
1516   AF3 ASignedF3(AF3 m){return ASatF3(m*AF3_(A_INFN_F));}
1517   AF4 ASignedF4(AF4 m){return ASatF4(m*AF4_(A_INFN_F));}
1518  //------------------------------------------------------------------------------------------------------------------------------
       // AGtZeroF*: same trick with positive infinity, so the mask is mirrored: m>0 := 1, and m<=0 or NaN := 0.
1519   AF1 AGtZeroF1(AF1 m){return ASatF1(m*AF1_(A_INFP_F));}
1520   AF2 AGtZeroF2(AF2 m){return ASatF2(m*AF2_(A_INFP_F));}
1521   AF3 AGtZeroF3(AF3 m){return ASatF3(m*AF3_(A_INFP_F));}
1522   AF4 AGtZeroF4(AF4 m){return ASatF4(m*AF4_(A_INFP_F));}
1523  //==============================================================================================================================
1524   #ifdef A_HALF
        // Half-precision infinities, built from the 16-bit patterns 0x7c00 (+INF) and 0xfc00 (-INF) through AH1_AW1.
        // The A_HLSL_6_2 branch adds an explicit (uint16_t) cast on the literal.
1525    #ifdef A_HLSL_6_2
1526     #define A_INFP_H AH1_AW1((uint16_t)0x7c00u)
1527     #define A_INFN_H AH1_AW1((uint16_t)0xfc00u)
1528    #else
1529     #define A_INFP_H AH1_AW1(0x7c00u)
1530     #define A_INFN_H AH1_AW1(0xfc00u)
1531    #endif
1532  
1533  //------------------------------------------------------------------------------------------------------------------------------
        // Copy the sign of 's' onto 'd': bits(d) | (bits(s) & 0x8000). The sign bit is OR-ed in, so it can be set but
        // not cleared; like the 32-bit version this expects a positive 'd'.
1534    AH1 ACpySgnH1(AH1 d,AH1 s){return AH1_AW1(AW1_AH1(d)|(AW1_AH1(s)&AW1_(0x8000u)));}
1535    AH2 ACpySgnH2(AH2 d,AH2 s){return AH2_AW2(AW2_AH2(d)|(AW2_AH2(s)&AW2_(0x8000u)));}
1536    AH3 ACpySgnH3(AH3 d,AH3 s){return AH3_AW3(AW3_AH3(d)|(AW3_AH3(s)&AW3_(0x8000u)));}
1537    AH4 ACpySgnH4(AH4 d,AH4 s){return AH4_AW4(AW4_AH4(d)|(AW4_AH4(s)&AW4_(0x8000u)));}
1538  //------------------------------------------------------------------------------------------------------------------------------
        // Half version of ASignedF*: saturate(m * -INF), giving 1 for m<0 and 0 otherwise.
1539    AH1 ASignedH1(AH1 m){return ASatH1(m*AH1_(A_INFN_H));}
1540    AH2 ASignedH2(AH2 m){return ASatH2(m*AH2_(A_INFN_H));}
1541    AH3 ASignedH3(AH3 m){return ASatH3(m*AH3_(A_INFN_H));}
1542    AH4 ASignedH4(AH4 m){return ASatH4(m*AH4_(A_INFN_H));}
1543  //------------------------------------------------------------------------------------------------------------------------------
        // Half version of AGtZeroF*: saturate(m * +INF), giving 1 for m>0 and 0 otherwise.
1544    AH1 AGtZeroH1(AH1 m){return ASatH1(m*AH1_(A_INFP_H));}
1545    AH2 AGtZeroH2(AH2 m){return ASatH2(m*AH2_(A_INFP_H));}
1546    AH3 AGtZeroH3(AH3 m){return ASatH3(m*AH3_(A_INFP_H));}
1547    AH4 AGtZeroH4(AH4 m){return ASatH4(m*AH4_(A_INFP_H));}
1548   #endif
1549  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1550  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1551  //_____________________________________________________________/\_______________________________________________________________
1552  //==============================================================================================================================
1553  //                                                [FIS] FLOAT INTEGER SORTABLE
1554  //------------------------------------------------------------------------------------------------------------------------------
1555  // Float to integer sortable.
1556  //  - If sign bit=0, flip the sign bit (positives).
1557  //  - If sign bit=1, flip all bits     (negatives).
1558  // Integer sortable to float.
1559  //  - If sign bit=1, flip the sign bit (positives).
1560  //  - If sign bit=0, flip all bits     (negatives).
1561  // Has nice side effects.
1562  //  - Larger integers are more positive values.
1563  //  - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage).
1564  // Burns 3 ops for conversion {shift,or,xor}.
1565  //==============================================================================================================================
       // Both directions XOR 'x' with a mask that always contains the sign bit (0x80000000).
       // AShrSU1 is defined outside this chunk; presumably a sign-propagating shift, so AShrSU1(x,31) is all-ones
       // when the sign bit is set and zero otherwise - TODO confirm.
       //  - To:   mask = shift | sign, i.e. flip everything when the sign bit is 1, else flip only the sign bit.
       //  - From: mask = ~shift | sign, i.e. flip everything when the sign bit is 0, else flip only the sign bit.
1566   AU1 AFisToU1(AU1 x){return x^(( AShrSU1(x,AU1_(31)))|AU1_(0x80000000));}
1567   AU1 AFisFromU1(AU1 x){return x^((~AShrSU1(x,AU1_(31)))|AU1_(0x80000000));}
1568  //------------------------------------------------------------------------------------------------------------------------------
1569   // Just adjust high 16-bit value (useful when upper part of 32-bit word is a 16-bit float value).
       // NOTE(review): a shift of 15 (rather than 31) also moves bits 30..15 of 'x' into the low 16 bits of the mask,
       // so the low half of the word is XOR-ed as well unless those bits are zero - confirm this matches the intent.
1570   AU1 AFisToHiU1(AU1 x){return x^(( AShrSU1(x,AU1_(15)))|AU1_(0x80000000));}
1571   AU1 AFisFromHiU1(AU1 x){return x^((~AShrSU1(x,AU1_(15)))|AU1_(0x80000000));}
1572  //------------------------------------------------------------------------------------------------------------------------------
1573   #ifdef A_HALF
        // 16-bit versions of the sortable conversion: the shift count is 15 and the sign-bit constant is 0x8000.
        // To:   x ^ ((x>>15 signed) | 0x8000);  From: x ^ (~(x>>15 signed) | 0x8000).
1574    AW1 AFisToW1(AW1 x){return x^(( AShrSW1(x,AW1_(15)))|AW1_(0x8000));}
1575    AW1 AFisFromW1(AW1 x){return x^((~AShrSW1(x,AW1_(15)))|AW1_(0x8000));}
1576  //------------------------------------------------------------------------------------------------------------------------------
        // Two-component versions; each component is converted independently.
1577    AW2 AFisToW2(AW2 x){return x^(( AShrSW2(x,AW2_(15)))|AW2_(0x8000));}
1578    AW2 AFisFromW2(AW2 x){return x^((~AShrSW2(x,AW2_(15)))|AW2_(0x8000));}
1579   #endif
1580  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1581  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1582  //_____________________________________________________________/\_______________________________________________________________
1583  //==============================================================================================================================
1584  //                                                      [PERM] V_PERM_B32
1585  //------------------------------------------------------------------------------------------------------------------------------
1586  // Support for V_PERM_B32 started in the 3rd generation of GCN.
1587  //------------------------------------------------------------------------------------------------------------------------------
1588  // yyyyxxxx - The 'i' input.
1589  // 76543210
1590  // ========
1591  // HGFEDCBA - Naming on permutation.
1592  //------------------------------------------------------------------------------------------------------------------------------
1593  // TODO
1594  // ====
1595  //  - Make sure compiler optimizes this.
1596  //==============================================================================================================================
1597   #ifdef A_HALF
        // Byte permutations of the 64-bit pair 'i'. Bytes of i.x are named A,B,C,D (low to high) and bytes of i.y
        // are named E,F,G,H. The four characters after "APerm" list the result bytes from most to least significant;
        // '0' means the byte is zero.
        // APerm0?0?: one byte of i.x goes to result byte 0 and the same-numbered byte of i.y goes to result byte 2.
1598    AU1 APerm0E0A(AU2 i){return((i.x    )&0xffu)|((i.y<<16)&0xff0000u);}
1599    AU1 APerm0F0B(AU2 i){return((i.x>> 8)&0xffu)|((i.y<< 8)&0xff0000u);}
1600    AU1 APerm0G0C(AU2 i){return((i.x>>16)&0xffu)|((i.y    )&0xff0000u);}
1601    AU1 APerm0H0D(AU2 i){return((i.x>>24)&0xffu)|((i.y>> 8)&0xff0000u);}
1602  //------------------------------------------------------------------------------------------------------------------------------
        // Single-byte insert: the result is i.y with exactly one byte replaced by byte A or byte C of i.x.
1603    AU1 APermHGFA(AU2 i){return((i.x    )&0x000000ffu)|(i.y&0xffffff00u);}
1604    AU1 APermHGFC(AU2 i){return((i.x>>16)&0x000000ffu)|(i.y&0xffffff00u);}
1605    AU1 APermHGAE(AU2 i){return((i.x<< 8)&0x0000ff00u)|(i.y&0xffff00ffu);}
1606    AU1 APermHGCE(AU2 i){return((i.x>> 8)&0x0000ff00u)|(i.y&0xffff00ffu);}
1607    AU1 APermHAFE(AU2 i){return((i.x<<16)&0x00ff0000u)|(i.y&0xff00ffffu);}
1608    AU1 APermHCFE(AU2 i){return((i.x    )&0x00ff0000u)|(i.y&0xff00ffffu);}
1609    AU1 APermAGFE(AU2 i){return((i.x<<24)&0xff000000u)|(i.y&0x00ffffffu);}
1610    AU1 APermCGFE(AU2 i){return((i.x<< 8)&0xff000000u)|(i.y&0x00ffffffu);}
1611  //------------------------------------------------------------------------------------------------------------------------------
        // Interleaves: both take bytes A and C from i.x and bytes E and G from i.y, in the order the name spells out.
1612    AU1 APermGCEA(AU2 i){return((i.x)&0x00ff00ffu)|((i.y<<8)&0xff00ff00u);}
1613    AU1 APermGECA(AU2 i){return(((i.x)&0xffu)|((i.x>>8)&0xff00u)|((i.y<<16)&0xff0000u)|((i.y<<8)&0xff000000u));}
1614   #endif
1615  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1616  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1617  //_____________________________________________________________/\_______________________________________________________________
1618  //==============================================================================================================================
1619  //                                               [BUC] BYTE UNSIGNED CONVERSION
1620  //------------------------------------------------------------------------------------------------------------------------------
1621  // Designed to use the optimal conversion, enables the scaling to possibly be factored into other computation.
1622  // Works on a range of {0 to A_BUC_<32,16>}, for <32-bit, and 16-bit> respectively.
1623  //------------------------------------------------------------------------------------------------------------------------------
1624  // OPCODE NOTES
1625  // ============
1626  // GCN does not do UNORM or SNORM for bytes in opcodes.
1627  //  - V_CVT_F32_UBYTE{0,1,2,3} - Unsigned byte to float.
1628  //  - V_CVT_PKACCUM_U8_F32 - Float to unsigned byte (does bit-field insert into 32-bit integer).
1629  // V_PERM_B32 does byte packing with ability to zero fill bytes as well.
1630  //  - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo. 
1631  //------------------------------------------------------------------------------------------------------------------------------
1632  // BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U1() - Designed for V_CVT_F32_UBYTE* and V_CVT_PKACCUM_U8_F32 ops.
1633  // ====   =====
1634  //    0 : 0
1635  //    1 : 1
1636  //     ...
1637  //  255 : 255
1638  //      : 256 (just outside the encoding range)
1639  //------------------------------------------------------------------------------------------------------------------------------
1640  // BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
1641  // ====   =====
1642  //    0 : 0
1643  //    1 : 1/512
1644  //    2 : 1/256
1645  //     ...
1646  //   64 : 1/8
1647  //  128 : 1/4
1648  //  255 : 255/512
1649  //      : 1/2 (just outside the encoding range)
1650  //------------------------------------------------------------------------------------------------------------------------------
1651  // OPTIMAL IMPLEMENTATIONS ON AMD ARCHITECTURES
1652  // ============================================
1653  // r=ABuc0FromU1(i)
1654  //   V_CVT_F32_UBYTE0 r,i
1655  // --------------------------------------------
1656  // r=ABuc0ToU1(d,i)
1657  //   V_CVT_PKACCUM_U8_F32 r,i,0,d
1658  // --------------------------------------------
1659  // d=ABuc0FromU2(i)
1660  //   Where 'k0' is an SGPR with 0x0E0A
1661  //   Where 'k1' is an SGPR with {32768.0} packed into the lower 16-bits
1662  //   V_PERM_B32 d,i.x,i.y,k0
1663  //   V_PK_FMA_F16 d,d,k1.x,0
1664  // --------------------------------------------
1665  // r=ABuc0ToU2(d,i)
1666  //   Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits
1667  //   Where 'k1' is an SGPR with 0x????
1668  //   Where 'k2' is an SGPR with 0x????
1669  //   V_PK_FMA_F16 i,i,k0.x,0
1670  //   V_PERM_B32 r.x,i,i,k1
1671  //   V_PERM_B32 r.y,i,i,k2
1672  //==============================================================================================================================
1673   // Peak range for 32-bit and 16-bit operations.
       // A_BUC_32 is the largest float value the byte encoding of the *U1 functions represents (255);
       // A_BUC_16 is the largest for the 16-bit *U2 functions (255/512).
1674   #define A_BUC_32 (255.0)
1675   #define A_BUC_16 (255.0/512.0)
1676  //==============================================================================================================================
1677   #if 1
1678    // Designed to be one V_CVT_PKACCUM_U8_F32.
1679    // The extra min is required to pattern match to V_CVT_PKACCUM_U8_F32.
        // ABuc<N>ToU1(d,i): converts 'i' to an unsigned integer, clamps it to 255, and writes it into byte N of 'd'
        // (byte 0 is the least significant). The other three bytes of 'd' are returned unchanged.
1680    AU1 ABuc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i),255u)    )&(0x000000ffu));}
1681    AU1 ABuc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i),255u)<< 8)&(0x0000ff00u));}
1682    AU1 ABuc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i),255u)<<16)&(0x00ff0000u));}
1683    AU1 ABuc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i),255u)<<24)&(0xff000000u));}
1684  //------------------------------------------------------------------------------------------------------------------------------
1685    // Designed to be one V_CVT_F32_UBYTE*.
        // ABuc<N>FromU1(i): extracts byte N of 'i' and converts it to float (0..255).
1686    AF1 ABuc0FromU1(AU1 i){return AF1((i    )&255u);}
1687    AF1 ABuc1FromU1(AU1 i){return AF1((i>> 8)&255u);}
1688    AF1 ABuc2FromU1(AU1 i){return AF1((i>>16)&255u);}
1689    AF1 ABuc3FromU1(AU1 i){return AF1((i>>24)&255u);}
1690   #endif
1691  //==============================================================================================================================
1692   #ifdef A_HALF
1693    // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
        // Both inputs are scaled by 1/32768 and their half bit patterns are packed into one 32-bit word each.
        // APermGCEA then interleaves the low byte of every 16-bit lane, so each output word holds x<n> in its low
        // byte and y<n> in its high byte.
1694    AW2 ABuc01ToW2(AH2 x,AH2 y){x*=AH2_(1.0/32768.0);y*=AH2_(1.0/32768.0);
1695     return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
1696  //------------------------------------------------------------------------------------------------------------------------------
1697    // Designed for 3 ops to do SOA to AOS and conversion.
        // ABuc<N>ToU2(d,i): encodes i.x into byte N of d.x and i.y into byte N of d.y; the other bytes of 'd' are kept.
        // 'b' holds the half bit patterns of i/32768: the byte for i.x is byte 0 (A) and the byte for i.y is byte 2 (C).
        // The APerm* helpers take bytes A..D from their .x operand and E..H from their .y operand, so 'b' must be
        // passed as .x and the destination word as .y. With the operands the other way round the result keeps a
        // single byte of 'd' and takes the remaining three from 'b', which does not round-trip with ABuc<N>FromU2.
1698    AU2 ABuc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
1699     return AU2(APermHGFA(AU2(b,d.x)),APermHGFC(AU2(b,d.y)));}
1700    AU2 ABuc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
1701     return AU2(APermHGAE(AU2(b,d.x)),APermHGCE(AU2(b,d.y)));}
1702    AU2 ABuc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
1703     return AU2(APermHAFE(AU2(b,d.x)),APermHCFE(AU2(b,d.y)));}
1704    AU2 ABuc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
1705     return AU2(APermAGFE(AU2(b,d.x)),APermCGFE(AU2(b,d.y)));}
1706  //------------------------------------------------------------------------------------------------------------------------------
1707    // Designed for 2 ops to do both AOS to SOA, and conversion.
        // ABuc<N>FromU2(i): APerm0?0? places byte N of i.x and byte N of i.y in the low byte of the two 16-bit lanes
        // (upper bytes zero); the lanes are reinterpreted as halves and scaled by 32768.
1708    AH2 ABuc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0);}
1709    AH2 ABuc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0);}
1710    AH2 ABuc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0);}
1711    AH2 ABuc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0);}
1712   #endif
1713  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1714  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1715  //_____________________________________________________________/\_______________________________________________________________
1716  //==============================================================================================================================
1717  //                                                 [BSC] BYTE SIGNED CONVERSION
1718  //------------------------------------------------------------------------------------------------------------------------------
1719  // Similar to [BUC].
1720  // Works on a range of {-/+ A_BSC_<32,16>}, for <32-bit, and 16-bit> respectively.
1721  //------------------------------------------------------------------------------------------------------------------------------
1722  // ENCODING (without zero-based encoding)
1723  // ========
1724  //   0 = unused (can be used to mean something else)
1725  //   1 = lowest value 
1726  // 128 = exact zero center (becomes 0 under the zero-based encoding described below)
1727  // 255 = highest value
1728  //------------------------------------------------------------------------------------------------------------------------------
1729  // Zero-based [Zb] flips the MSB bit of the byte (making 128 "exact zero" actually zero).
1730  // This is useful if there is a desire for cleared values to decode as zero.
1731  //------------------------------------------------------------------------------------------------------------------------------
1732  // BYTE : FLOAT - ABsc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
1733  // ====   =====
1734  //    0 : -128/512 (unused)
1735  //    1 : -127/512
1736  //    2 : -126/512
1737  //     ...
1738  //  128 : 0 
1739  //     ... 
1740  //  255 : 127/512
1741  //      : 1/4 (just outside the encoding range)
1742  //==============================================================================================================================
1743   // Peak range for 32-bit and 16-bit operations.
       // A_BSC_32 is the largest magnitude for the *U1 functions (127); A_BSC_16 is the largest for *U2 (127/512).
1744   #define A_BSC_32 (127.0)
1745   #define A_BSC_16 (127.0/512.0)
1746  //==============================================================================================================================
1747   #if 1
        // ABsc<N>ToU1(d,i): biases 'i' by +128, converts to unsigned, clamps to 255 and writes the result into
        // byte N of 'd'. The other three bytes of 'd' are returned unchanged.
1748    AU1 ABsc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i+128.0),255u)    )&(0x000000ffu));}
1749    AU1 ABsc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i+128.0),255u)<< 8)&(0x0000ff00u));}
1750    AU1 ABsc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i+128.0),255u)<<16)&(0x00ff0000u));}
1751    AU1 ABsc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i+128.0),255u)<<24)&(0xff000000u));}
1752  //------------------------------------------------------------------------------------------------------------------------------
        // Zero-based variants: same insert, then the most significant bit of byte N is flipped so that an input of
        // zero is stored as byte value 0.
        // Unlike the plain variants these apply trunc() to 'i' before adding the bias, so fractional negative
        // inputs can encode differently between the two families.
1753    AU1 ABsc0ToZbU1(AU1 d,AF1 i){return ((d&0xffffff00u)|((min(AU1(trunc(i)+128.0),255u)    )&(0x000000ffu)))^0x00000080u;}
1754    AU1 ABsc1ToZbU1(AU1 d,AF1 i){return ((d&0xffff00ffu)|((min(AU1(trunc(i)+128.0),255u)<< 8)&(0x0000ff00u)))^0x00008000u;}
1755    AU1 ABsc2ToZbU1(AU1 d,AF1 i){return ((d&0xff00ffffu)|((min(AU1(trunc(i)+128.0),255u)<<16)&(0x00ff0000u)))^0x00800000u;}
1756    AU1 ABsc3ToZbU1(AU1 d,AF1 i){return ((d&0x00ffffffu)|((min(AU1(trunc(i)+128.0),255u)<<24)&(0xff000000u)))^0x80000000u;}
1757  //------------------------------------------------------------------------------------------------------------------------------
        // ABsc<N>FromU1(i): extracts byte N, converts it to float and removes the +128 bias.
1758    AF1 ABsc0FromU1(AU1 i){return AF1((i    )&255u)-128.0;}
1759    AF1 ABsc1FromU1(AU1 i){return AF1((i>> 8)&255u)-128.0;}
1760    AF1 ABsc2FromU1(AU1 i){return AF1((i>>16)&255u)-128.0;}
1761    AF1 ABsc3FromU1(AU1 i){return AF1((i>>24)&255u)-128.0;}
1762  //------------------------------------------------------------------------------------------------------------------------------
        // Zero-based decode: flips the most significant bit of the extracted byte back before removing the bias.
1763    AF1 ABsc0FromZbU1(AU1 i){return AF1(((i    )&255u)^0x80u)-128.0;}
1764    AF1 ABsc1FromZbU1(AU1 i){return AF1(((i>> 8)&255u)^0x80u)-128.0;}
1765    AF1 ABsc2FromZbU1(AU1 i){return AF1(((i>>16)&255u)^0x80u)-128.0;}
1766    AF1 ABsc3FromZbU1(AU1 i){return AF1(((i>>24)&255u)^0x80u)-128.0;}
1767   #endif
1768  //==============================================================================================================================
1769   #ifdef A_HALF
1770    // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
1771    AW2 ABsc01ToW2(AH2 x,AH2 y){x=x*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);y=y*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
1772     return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
1773  //------------------------------------------------------------------------------------------------------------------------------
1774    AU2 ABsc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
1775     return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));}
1776    AU2 ABsc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
1777     return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));}
1778    AU2 ABsc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
1779     return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));}
1780    AU2 ABsc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
1781     return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));}
1782  //------------------------------------------------------------------------------------------------------------------------------
1783    AU2 ABsc0ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
1784     return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));}
1785    AU2 ABsc1ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
1786     return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));}
1787    AU2 ABsc2ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
1788     return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));}
1789    AU2 ABsc3ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
1790     return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));}
1791  //------------------------------------------------------------------------------------------------------------------------------
1792    AH2 ABsc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0)-AH2_(0.25);}
1793    AH2 ABsc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0)-AH2_(0.25);}
1794    AH2 ABsc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0)-AH2_(0.25);}
1795    AH2 ABsc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0)-AH2_(0.25);}
1796  //------------------------------------------------------------------------------------------------------------------------------
1797    AH2 ABsc0FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
1798    AH2 ABsc1FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
1799    AH2 ABsc2FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
1800    AH2 ABsc3FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
1801   #endif
1802  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1803  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1804  //_____________________________________________________________/\_______________________________________________________________
1805  //==============================================================================================================================
1806  //                                                     HALF APPROXIMATIONS
1807  //------------------------------------------------------------------------------------------------------------------------------
1808  // These support only positive inputs.
1809  // Did not see value yet in specialization for range.
1810  // Using quick testing, ended up mostly getting the same "best" approximation for various ranges.
1811  // With hardware that can co-execute transcendentals, the value in approximations could be less than expected.
1812  // However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total.
1813  // And co-execution would require a compiler interleaving a lot of independent work for packed usage.
1814  //------------------------------------------------------------------------------------------------------------------------------
1815  // The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total).
1816  // Same with sqrt(), as this could be x*rsq() (7 ops).
1817  //==============================================================================================================================
1818   #ifdef A_HALF
1819    // Minimize squared error across full positive range, 2 ops.
1820    // The 0x1de2 based approximation maps {0 to 1} input maps to < 1 output.
1821    AH1 APrxLoSqrtH1(AH1 a){return AH1_AW1((AW1_AH1(a)>>AW1_(1))+AW1_(0x1de2));}
1822    AH2 APrxLoSqrtH2(AH2 a){return AH2_AW2((AW2_AH2(a)>>AW2_(1))+AW2_(0x1de2));}
1823    AH3 APrxLoSqrtH3(AH3 a){return AH3_AW3((AW3_AH3(a)>>AW3_(1))+AW3_(0x1de2));}
1824    AH4 APrxLoSqrtH4(AH4 a){return AH4_AW4((AW4_AH4(a)>>AW4_(1))+AW4_(0x1de2));}
1825  //------------------------------------------------------------------------------------------------------------------------------
1826    // Lower precision estimation, 1 op.
1827    // Minimize squared error across {smallest normal to 16384.0}.
1828    AH1 APrxLoRcpH1(AH1 a){return AH1_AW1(AW1_(0x7784)-AW1_AH1(a));}
1829    AH2 APrxLoRcpH2(AH2 a){return AH2_AW2(AW2_(0x7784)-AW2_AH2(a));}
1830    AH3 APrxLoRcpH3(AH3 a){return AH3_AW3(AW3_(0x7784)-AW3_AH3(a));}
1831    AH4 APrxLoRcpH4(AH4 a){return AH4_AW4(AW4_(0x7784)-AW4_AH4(a));}
1832  //------------------------------------------------------------------------------------------------------------------------------
1833    // Medium precision estimation, one Newton Raphson iteration, 3 ops.
1834    AH1 APrxMedRcpH1(AH1 a){AH1 b=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));return b*(-b*a+AH1_(2.0));}
1835    AH2 APrxMedRcpH2(AH2 a){AH2 b=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));return b*(-b*a+AH2_(2.0));}
1836    AH3 APrxMedRcpH3(AH3 a){AH3 b=AH3_AW3(AW3_(0x778d)-AW3_AH3(a));return b*(-b*a+AH3_(2.0));}
1837    AH4 APrxMedRcpH4(AH4 a){AH4 b=AH4_AW4(AW4_(0x778d)-AW4_AH4(a));return b*(-b*a+AH4_(2.0));}
1838  //------------------------------------------------------------------------------------------------------------------------------
1839    // Minimize squared error across {smallest normal to 16384.0}, 2 ops.
1840    AH1 APrxLoRsqH1(AH1 a){return AH1_AW1(AW1_(0x59a3)-(AW1_AH1(a)>>AW1_(1)));}
1841    AH2 APrxLoRsqH2(AH2 a){return AH2_AW2(AW2_(0x59a3)-(AW2_AH2(a)>>AW2_(1)));}
1842    AH3 APrxLoRsqH3(AH3 a){return AH3_AW3(AW3_(0x59a3)-(AW3_AH3(a)>>AW3_(1)));}
1843    AH4 APrxLoRsqH4(AH4 a){return AH4_AW4(AW4_(0x59a3)-(AW4_AH4(a)>>AW4_(1)));}
1844   #endif
1845  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1846  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1847  //_____________________________________________________________/\_______________________________________________________________
1848  //==============================================================================================================================
1849  //                                                    FLOAT APPROXIMATIONS
1850  //------------------------------------------------------------------------------------------------------------------------------
1851  // Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN",
1852  //  - Idea dates back to SGI, then to Quake 3, etc.
1853  //  - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf
1854  //     - sqrt(x)=rsqrt(x)*x
1855  //     - rcp(x)=rsqrt(x)*rsqrt(x) for positive x
1856  //  - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h
1857  //------------------------------------------------------------------------------------------------------------------------------
1858  // These below are from perhaps less complete searching for optimal.
1859  // Used FP16 normal range for testing with +4096 32-bit step size for sampling error.
1860  // So these match up well with the half approximations.
1861  //==============================================================================================================================
1862   AF1 APrxLoSqrtF1(AF1 a){return AF1_AU1((AU1_AF1(a)>>AU1_(1))+AU1_(0x1fbc4639));}
1863   AF1 APrxLoRcpF1(AF1 a){return AF1_AU1(AU1_(0x7ef07ebb)-AU1_AF1(a));}
1864   AF1 APrxMedRcpF1(AF1 a){AF1 b=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));return b*(-b*a+AF1_(2.0));}
1865   AF1 APrxLoRsqF1(AF1 a){return AF1_AU1(AU1_(0x5f347d74)-(AU1_AF1(a)>>AU1_(1)));}
1866  //------------------------------------------------------------------------------------------------------------------------------
1867   AF2 APrxLoSqrtF2(AF2 a){return AF2_AU2((AU2_AF2(a)>>AU2_(1))+AU2_(0x1fbc4639));}
1868   AF2 APrxLoRcpF2(AF2 a){return AF2_AU2(AU2_(0x7ef07ebb)-AU2_AF2(a));}
1869   AF2 APrxMedRcpF2(AF2 a){AF2 b=AF2_AU2(AU2_(0x7ef19fff)-AU2_AF2(a));return b*(-b*a+AF2_(2.0));}
1870   AF2 APrxLoRsqF2(AF2 a){return AF2_AU2(AU2_(0x5f347d74)-(AU2_AF2(a)>>AU2_(1)));}
1871  //------------------------------------------------------------------------------------------------------------------------------
1872   AF3 APrxLoSqrtF3(AF3 a){return AF3_AU3((AU3_AF3(a)>>AU3_(1))+AU3_(0x1fbc4639));}
1873   AF3 APrxLoRcpF3(AF3 a){return AF3_AU3(AU3_(0x7ef07ebb)-AU3_AF3(a));}
1874   AF3 APrxMedRcpF3(AF3 a){AF3 b=AF3_AU3(AU3_(0x7ef19fff)-AU3_AF3(a));return b*(-b*a+AF3_(2.0));}
1875   AF3 APrxLoRsqF3(AF3 a){return AF3_AU3(AU3_(0x5f347d74)-(AU3_AF3(a)>>AU3_(1)));}
1876  //------------------------------------------------------------------------------------------------------------------------------
1877   AF4 APrxLoSqrtF4(AF4 a){return AF4_AU4((AU4_AF4(a)>>AU4_(1))+AU4_(0x1fbc4639));}
1878   AF4 APrxLoRcpF4(AF4 a){return AF4_AU4(AU4_(0x7ef07ebb)-AU4_AF4(a));}
1879   AF4 APrxMedRcpF4(AF4 a){AF4 b=AF4_AU4(AU4_(0x7ef19fff)-AU4_AF4(a));return b*(-b*a+AF4_(2.0));}
1880   AF4 APrxLoRsqF4(AF4 a){return AF4_AU4(AU4_(0x5f347d74)-(AU4_AF4(a)>>AU4_(1)));}
1881  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1882  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1883  //_____________________________________________________________/\_______________________________________________________________
1884  //==============================================================================================================================
1885  //                                                    PQ APPROXIMATIONS
1886  //------------------------------------------------------------------------------------------------------------------------------
// PQ is very close to x^(1/8). The functions below use the fast float approximation method to do
1888  // PQ<~>Gamma2 (4th power and fast 4th root) and PQ<~>Linear (8th power and fast 8th root). Maximum error is ~0.2%.
1889  //==============================================================================================================================
// Helpers
 // Quart(a) = a^4 via two successive squarings; Oct(a) = a^8 via three.
 AF1 Quart(AF1 a) { AF1 s = a * a; return s * s; }
 AF1 Oct(AF1 a) { AF1 s = a * a; AF1 q = s * s; return q * q; }
 AF2 Quart(AF2 a) { AF2 s = a * a; return s * s; }
 AF2 Oct(AF2 a) { AF2 s = a * a; AF2 q = s * s; return q * q; }
 AF3 Quart(AF3 a) { AF3 s = a * a; return s * s; }
 AF3 Oct(AF3 a) { AF3 s = a * a; AF3 q = s * s; return q * q; }
 AF4 Quart(AF4 a) { AF4 s = a * a; return s * s; }
 AF4 Oct(AF4 a) { AF4 s = a * a; AF4 q = s * s; return q * q; }
1899   //------------------------------------------------------------------------------------------------------------------------------
1900   AF1 APrxPQToGamma2(AF1 a) { return Quart(a); }
1901   AF1 APrxPQToLinear(AF1 a) { return Oct(a); }
1902   AF1 APrxLoGamma2ToPQ(AF1 a) { return AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); }
1903   AF1 APrxMedGamma2ToPQ(AF1 a) { AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); AF1 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); }
1904   AF1 APrxHighGamma2ToPQ(AF1 a) { return sqrt(sqrt(a)); }
1905   AF1 APrxLoLinearToPQ(AF1 a) { return AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); }
1906   AF1 APrxMedLinearToPQ(AF1 a) { AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); AF1 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); }
1907   AF1 APrxHighLinearToPQ(AF1 a) { return sqrt(sqrt(sqrt(a))); }
1908   //------------------------------------------------------------------------------------------------------------------------------
1909   AF2 APrxPQToGamma2(AF2 a) { return Quart(a); }
1910   AF2 APrxPQToLinear(AF2 a) { return Oct(a); }
1911   AF2 APrxLoGamma2ToPQ(AF2 a) { return AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); }
1912   AF2 APrxMedGamma2ToPQ(AF2 a) { AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); AF2 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); }
1913   AF2 APrxHighGamma2ToPQ(AF2 a) { return sqrt(sqrt(a)); }
1914   AF2 APrxLoLinearToPQ(AF2 a) { return AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); }
1915   AF2 APrxMedLinearToPQ(AF2 a) { AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); AF2 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); }
1916   AF2 APrxHighLinearToPQ(AF2 a) { return sqrt(sqrt(sqrt(a))); }
1917   //------------------------------------------------------------------------------------------------------------------------------
1918   AF3 APrxPQToGamma2(AF3 a) { return Quart(a); }
1919   AF3 APrxPQToLinear(AF3 a) { return Oct(a); }
1920   AF3 APrxLoGamma2ToPQ(AF3 a) { return AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); }
1921   AF3 APrxMedGamma2ToPQ(AF3 a) { AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); AF3 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); }
1922   AF3 APrxHighGamma2ToPQ(AF3 a) { return sqrt(sqrt(a)); }
1923   AF3 APrxLoLinearToPQ(AF3 a) { return AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); }
1924   AF3 APrxMedLinearToPQ(AF3 a) { AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); AF3 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); }
1925   AF3 APrxHighLinearToPQ(AF3 a) { return sqrt(sqrt(sqrt(a))); }
1926   //------------------------------------------------------------------------------------------------------------------------------
1927   AF4 APrxPQToGamma2(AF4 a) { return Quart(a); }
1928   AF4 APrxPQToLinear(AF4 a) { return Oct(a); }
1929   AF4 APrxLoGamma2ToPQ(AF4 a) { return AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); }
1930   AF4 APrxMedGamma2ToPQ(AF4 a) { AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); AF4 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); }
1931   AF4 APrxHighGamma2ToPQ(AF4 a) { return sqrt(sqrt(a)); }
1932   AF4 APrxLoLinearToPQ(AF4 a) { return AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); }
1933   AF4 APrxMedLinearToPQ(AF4 a) { AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); AF4 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); }
1934   AF4 APrxHighLinearToPQ(AF4 a) { return sqrt(sqrt(sqrt(a))); }
1935  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1936  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1937  //_____________________________________________________________/\_______________________________________________________________
1938  //==============================================================================================================================
1939  //                                                    PARABOLIC SIN & COS
1940  //------------------------------------------------------------------------------------------------------------------------------
1941  // Approximate answers to transcendental questions.
1942  //------------------------------------------------------------------------------------------------------------------------------
1943  //==============================================================================================================================
1944   #if 1
1945    // Valid input range is {-1 to 1} representing {0 to 2 pi}.
1946    // Output range is {-1/4 to 1/4} representing {-1 to 1}.
1947    AF1 APSinF1(AF1 x){return x*abs(x)-x;} // MAD.
1948    AF2 APSinF2(AF2 x){return x*abs(x)-x;}
1949    AF1 APCosF1(AF1 x){x=AFractF1(x*AF1_(0.5)+AF1_(0.75));x=x*AF1_(2.0)-AF1_(1.0);return APSinF1(x);} // 3x MAD, FRACT
1950    AF2 APCosF2(AF2 x){x=AFractF2(x*AF2_(0.5)+AF2_(0.75));x=x*AF2_(2.0)-AF2_(1.0);return APSinF2(x);}
1951    AF2 APSinCosF1(AF1 x){AF1 y=AFractF1(x*AF1_(0.5)+AF1_(0.75));y=y*AF1_(2.0)-AF1_(1.0);return APSinF2(AF2(x,y));}
1952   #endif
1953  //------------------------------------------------------------------------------------------------------------------------------
1954   #ifdef A_HALF
1955    // For a packed {sin,cos} pair,
1956    //  - Native takes 16 clocks and 4 issue slots (no packed transcendentals).
1957    //  - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed).
1958    AH1 APSinH1(AH1 x){return x*abs(x)-x;}
1959    AH2 APSinH2(AH2 x){return x*abs(x)-x;} // AND,FMA
1960    AH1 APCosH1(AH1 x){x=AFractH1(x*AH1_(0.5)+AH1_(0.75));x=x*AH1_(2.0)-AH1_(1.0);return APSinH1(x);} 
1961    AH2 APCosH2(AH2 x){x=AFractH2(x*AH2_(0.5)+AH2_(0.75));x=x*AH2_(2.0)-AH2_(1.0);return APSinH2(x);} // 3x FMA, 2xFRACT, AND
1962    AH2 APSinCosH1(AH1 x){AH1 y=AFractH1(x*AH1_(0.5)+AH1_(0.75));y=y*AH1_(2.0)-AH1_(1.0);return APSinH2(AH2(x,y));}
1963   #endif
1964  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1965  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1966  //_____________________________________________________________/\_______________________________________________________________
1967  //==============================================================================================================================
1968  //                                                     [ZOL] ZERO ONE LOGIC
1969  //------------------------------------------------------------------------------------------------------------------------------
1970  // Conditional free logic designed for easy 16-bit packing, and backwards porting to 32-bit.
1971  //------------------------------------------------------------------------------------------------------------------------------
1972  // 0 := false
1973  // 1 := true
1974  //------------------------------------------------------------------------------------------------------------------------------
1975  // AndNot(x,y)   -> !(x&y) .... One op.
1976  // AndOr(x,y,z)  -> (x&y)|z ... One op.
1977  // GtZero(x)     -> x>0.0 ..... One op.
1978  // Sel(x,y,z)    -> x?y:z ..... Two ops, has no precision loss.
1979  // Signed(x)     -> x<0.0 ..... One op.
1980  // ZeroPass(x,y) -> x?0:y ..... Two ops, 'y' is a pass through safe for aliasing as integer.
1981  //------------------------------------------------------------------------------------------------------------------------------
1982  // OPTIMIZATION NOTES
1983  // ==================
1984  // - On Vega to use 2 constants in a packed op, pass in as one AW2 or one AH2 'k.xy' and use as 'k.xx' and 'k.yy'.
1985  //   For example 'a.xy*k.xx+k.yy'.
1986  //==============================================================================================================================
1987   #if 1
1988    AU1 AZolAndU1(AU1 x,AU1 y){return min(x,y);}
1989    AU2 AZolAndU2(AU2 x,AU2 y){return min(x,y);}
1990    AU3 AZolAndU3(AU3 x,AU3 y){return min(x,y);}
1991    AU4 AZolAndU4(AU4 x,AU4 y){return min(x,y);}
1992  //------------------------------------------------------------------------------------------------------------------------------
1993    AU1 AZolNotU1(AU1 x){return x^AU1_(1);}
1994    AU2 AZolNotU2(AU2 x){return x^AU2_(1);}
1995    AU3 AZolNotU3(AU3 x){return x^AU3_(1);}
1996    AU4 AZolNotU4(AU4 x){return x^AU4_(1);}
1997  //------------------------------------------------------------------------------------------------------------------------------
1998    AU1 AZolOrU1(AU1 x,AU1 y){return max(x,y);}
1999    AU2 AZolOrU2(AU2 x,AU2 y){return max(x,y);}
2000    AU3 AZolOrU3(AU3 x,AU3 y){return max(x,y);}
2001    AU4 AZolOrU4(AU4 x,AU4 y){return max(x,y);}
2002  //==============================================================================================================================
2003    AU1 AZolF1ToU1(AF1 x){return AU1(x);}
2004    AU2 AZolF2ToU2(AF2 x){return AU2(x);}
2005    AU3 AZolF3ToU3(AF3 x){return AU3(x);}
2006    AU4 AZolF4ToU4(AF4 x){return AU4(x);}
2007  //------------------------------------------------------------------------------------------------------------------------------
2008    // 2 ops, denormals don't work in 32-bit on PC (and if they are enabled, OMOD is disabled).
2009    AU1 AZolNotF1ToU1(AF1 x){return AU1(AF1_(1.0)-x);}
2010    AU2 AZolNotF2ToU2(AF2 x){return AU2(AF2_(1.0)-x);}
2011    AU3 AZolNotF3ToU3(AF3 x){return AU3(AF3_(1.0)-x);}
2012    AU4 AZolNotF4ToU4(AF4 x){return AU4(AF4_(1.0)-x);}
2013  //------------------------------------------------------------------------------------------------------------------------------
2014    AF1 AZolU1ToF1(AU1 x){return AF1(x);}
2015    AF2 AZolU2ToF2(AU2 x){return AF2(x);}
2016    AF3 AZolU3ToF3(AU3 x){return AF3(x);}
2017    AF4 AZolU4ToF4(AU4 x){return AF4(x);}
2018  //==============================================================================================================================
2019    AF1 AZolAndF1(AF1 x,AF1 y){return min(x,y);}
2020    AF2 AZolAndF2(AF2 x,AF2 y){return min(x,y);}
2021    AF3 AZolAndF3(AF3 x,AF3 y){return min(x,y);}
2022    AF4 AZolAndF4(AF4 x,AF4 y){return min(x,y);}
2023  //------------------------------------------------------------------------------------------------------------------------------
2024    AF1 ASolAndNotF1(AF1 x,AF1 y){return (-x)*y+AF1_(1.0);}
2025    AF2 ASolAndNotF2(AF2 x,AF2 y){return (-x)*y+AF2_(1.0);}
2026    AF3 ASolAndNotF3(AF3 x,AF3 y){return (-x)*y+AF3_(1.0);}
2027    AF4 ASolAndNotF4(AF4 x,AF4 y){return (-x)*y+AF4_(1.0);}
2028  //------------------------------------------------------------------------------------------------------------------------------
2029    AF1 AZolAndOrF1(AF1 x,AF1 y,AF1 z){return ASatF1(x*y+z);}
2030    AF2 AZolAndOrF2(AF2 x,AF2 y,AF2 z){return ASatF2(x*y+z);}
2031    AF3 AZolAndOrF3(AF3 x,AF3 y,AF3 z){return ASatF3(x*y+z);}
2032    AF4 AZolAndOrF4(AF4 x,AF4 y,AF4 z){return ASatF4(x*y+z);}
2033  //------------------------------------------------------------------------------------------------------------------------------
2034    AF1 AZolGtZeroF1(AF1 x){return ASatF1(x*AF1_(A_INFP_F));}
2035    AF2 AZolGtZeroF2(AF2 x){return ASatF2(x*AF2_(A_INFP_F));}
2036    AF3 AZolGtZeroF3(AF3 x){return ASatF3(x*AF3_(A_INFP_F));}
2037    AF4 AZolGtZeroF4(AF4 x){return ASatF4(x*AF4_(A_INFP_F));}
2038  //------------------------------------------------------------------------------------------------------------------------------
2039    AF1 AZolNotF1(AF1 x){return AF1_(1.0)-x;}
2040    AF2 AZolNotF2(AF2 x){return AF2_(1.0)-x;}
2041    AF3 AZolNotF3(AF3 x){return AF3_(1.0)-x;}
2042    AF4 AZolNotF4(AF4 x){return AF4_(1.0)-x;}
2043  //------------------------------------------------------------------------------------------------------------------------------
2044    AF1 AZolOrF1(AF1 x,AF1 y){return max(x,y);}
2045    AF2 AZolOrF2(AF2 x,AF2 y){return max(x,y);}
2046    AF3 AZolOrF3(AF3 x,AF3 y){return max(x,y);}
2047    AF4 AZolOrF4(AF4 x,AF4 y){return max(x,y);}
2048  //------------------------------------------------------------------------------------------------------------------------------
2049    AF1 AZolSelF1(AF1 x,AF1 y,AF1 z){AF1 r=(-x)*z+z;return x*y+r;}
2050    AF2 AZolSelF2(AF2 x,AF2 y,AF2 z){AF2 r=(-x)*z+z;return x*y+r;}
2051    AF3 AZolSelF3(AF3 x,AF3 y,AF3 z){AF3 r=(-x)*z+z;return x*y+r;}
2052    AF4 AZolSelF4(AF4 x,AF4 y,AF4 z){AF4 r=(-x)*z+z;return x*y+r;}
2053  //------------------------------------------------------------------------------------------------------------------------------
2054    AF1 AZolSignedF1(AF1 x){return ASatF1(x*AF1_(A_INFN_F));}
2055    AF2 AZolSignedF2(AF2 x){return ASatF2(x*AF2_(A_INFN_F));}
2056    AF3 AZolSignedF3(AF3 x){return ASatF3(x*AF3_(A_INFN_F));}
2057    AF4 AZolSignedF4(AF4 x){return ASatF4(x*AF4_(A_INFN_F));}
2058  //------------------------------------------------------------------------------------------------------------------------------
2059    AF1 AZolZeroPassF1(AF1 x,AF1 y){return AF1_AU1((AU1_AF1(x)!=AU1_(0))?AU1_(0):AU1_AF1(y));}
2060    AF2 AZolZeroPassF2(AF2 x,AF2 y){return AF2_AU2((AU2_AF2(x)!=AU2_(0))?AU2_(0):AU2_AF2(y));}
2061    AF3 AZolZeroPassF3(AF3 x,AF3 y){return AF3_AU3((AU3_AF3(x)!=AU3_(0))?AU3_(0):AU3_AF3(y));}
2062    AF4 AZolZeroPassF4(AF4 x,AF4 y){return AF4_AU4((AU4_AF4(x)!=AU4_(0))?AU4_(0):AU4_AF4(y));}
2063   #endif
2064  //==============================================================================================================================
2065   #ifdef A_HALF
2066    AW1 AZolAndW1(AW1 x,AW1 y){return min(x,y);}
2067    AW2 AZolAndW2(AW2 x,AW2 y){return min(x,y);}
2068    AW3 AZolAndW3(AW3 x,AW3 y){return min(x,y);}
2069    AW4 AZolAndW4(AW4 x,AW4 y){return min(x,y);}
2070  //------------------------------------------------------------------------------------------------------------------------------
2071    AW1 AZolNotW1(AW1 x){return x^AW1_(1);}
2072    AW2 AZolNotW2(AW2 x){return x^AW2_(1);}
2073    AW3 AZolNotW3(AW3 x){return x^AW3_(1);}
2074    AW4 AZolNotW4(AW4 x){return x^AW4_(1);}
2075  //------------------------------------------------------------------------------------------------------------------------------
2076    AW1 AZolOrW1(AW1 x,AW1 y){return max(x,y);}
2077    AW2 AZolOrW2(AW2 x,AW2 y){return max(x,y);}
2078    AW3 AZolOrW3(AW3 x,AW3 y){return max(x,y);}
2079    AW4 AZolOrW4(AW4 x,AW4 y){return max(x,y);}
2080  //==============================================================================================================================
2081    // Uses denormal trick.
2082    AW1 AZolH1ToW1(AH1 x){return AW1_AH1(x*AH1_AW1(AW1_(1)));}
2083    AW2 AZolH2ToW2(AH2 x){return AW2_AH2(x*AH2_AW2(AW2_(1)));}
2084    AW3 AZolH3ToW3(AH3 x){return AW3_AH3(x*AH3_AW3(AW3_(1)));}
2085    AW4 AZolH4ToW4(AH4 x){return AW4_AH4(x*AH4_AW4(AW4_(1)));}
2086  //------------------------------------------------------------------------------------------------------------------------------
2087    // AMD arch lacks a packed conversion opcode.
2088    AH1 AZolW1ToH1(AW1 x){return AH1_AW1(x*AW1_AH1(AH1_(1.0)));}
2089    AH2 AZolW2ToH2(AW2 x){return AH2_AW2(x*AW2_AH2(AH2_(1.0)));}
2090    AH3 AZolW1ToH3(AW3 x){return AH3_AW3(x*AW3_AH3(AH3_(1.0)));}
2091    AH4 AZolW2ToH4(AW4 x){return AH4_AW4(x*AW4_AH4(AH4_(1.0)));}
2092  //==============================================================================================================================
2093    AH1 AZolAndH1(AH1 x,AH1 y){return min(x,y);}
2094    AH2 AZolAndH2(AH2 x,AH2 y){return min(x,y);}
2095    AH3 AZolAndH3(AH3 x,AH3 y){return min(x,y);}
2096    AH4 AZolAndH4(AH4 x,AH4 y){return min(x,y);}
2097  //------------------------------------------------------------------------------------------------------------------------------
2098    AH1 ASolAndNotH1(AH1 x,AH1 y){return (-x)*y+AH1_(1.0);}
2099    AH2 ASolAndNotH2(AH2 x,AH2 y){return (-x)*y+AH2_(1.0);}
2100    AH3 ASolAndNotH3(AH3 x,AH3 y){return (-x)*y+AH3_(1.0);}
2101    AH4 ASolAndNotH4(AH4 x,AH4 y){return (-x)*y+AH4_(1.0);}
2102  //------------------------------------------------------------------------------------------------------------------------------
2103    AH1 AZolAndOrH1(AH1 x,AH1 y,AH1 z){return ASatH1(x*y+z);}
2104    AH2 AZolAndOrH2(AH2 x,AH2 y,AH2 z){return ASatH2(x*y+z);}
2105    AH3 AZolAndOrH3(AH3 x,AH3 y,AH3 z){return ASatH3(x*y+z);}
2106    AH4 AZolAndOrH4(AH4 x,AH4 y,AH4 z){return ASatH4(x*y+z);}
2107  //------------------------------------------------------------------------------------------------------------------------------
2108    AH1 AZolGtZeroH1(AH1 x){return ASatH1(x*AH1_(A_INFP_H));}
2109    AH2 AZolGtZeroH2(AH2 x){return ASatH2(x*AH2_(A_INFP_H));}
2110    AH3 AZolGtZeroH3(AH3 x){return ASatH3(x*AH3_(A_INFP_H));}
2111    AH4 AZolGtZeroH4(AH4 x){return ASatH4(x*AH4_(A_INFP_H));}
2112  //------------------------------------------------------------------------------------------------------------------------------
2113    AH1 AZolNotH1(AH1 x){return AH1_(1.0)-x;}
2114    AH2 AZolNotH2(AH2 x){return AH2_(1.0)-x;}
2115    AH3 AZolNotH3(AH3 x){return AH3_(1.0)-x;}
2116    AH4 AZolNotH4(AH4 x){return AH4_(1.0)-x;}
2117  //------------------------------------------------------------------------------------------------------------------------------
2118    AH1 AZolOrH1(AH1 x,AH1 y){return max(x,y);}
2119    AH2 AZolOrH2(AH2 x,AH2 y){return max(x,y);}
2120    AH3 AZolOrH3(AH3 x,AH3 y){return max(x,y);}
2121    AH4 AZolOrH4(AH4 x,AH4 y){return max(x,y);}
2122  //------------------------------------------------------------------------------------------------------------------------------
2123    AH1 AZolSelH1(AH1 x,AH1 y,AH1 z){AH1 r=(-x)*z+z;return x*y+r;}
2124    AH2 AZolSelH2(AH2 x,AH2 y,AH2 z){AH2 r=(-x)*z+z;return x*y+r;}
2125    AH3 AZolSelH3(AH3 x,AH3 y,AH3 z){AH3 r=(-x)*z+z;return x*y+r;}
2126    AH4 AZolSelH4(AH4 x,AH4 y,AH4 z){AH4 r=(-x)*z+z;return x*y+r;}
2127  //------------------------------------------------------------------------------------------------------------------------------
2128    AH1 AZolSignedH1(AH1 x){return ASatH1(x*AH1_(A_INFN_H));}
2129    AH2 AZolSignedH2(AH2 x){return ASatH2(x*AH2_(A_INFN_H));}
2130    AH3 AZolSignedH3(AH3 x){return ASatH3(x*AH3_(A_INFN_H));}
2131    AH4 AZolSignedH4(AH4 x){return ASatH4(x*AH4_(A_INFN_H));}
2132   #endif
2133  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2134  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2135  //_____________________________________________________________/\_______________________________________________________________
2136  //==============================================================================================================================
2137  //                                                      COLOR CONVERSIONS
2138  //------------------------------------------------------------------------------------------------------------------------------
2139  // These are all linear to/from some other space (where 'linear' has been shortened out of the function name).
2140  // So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'.
2141  // These are branch free implementations.
2142  // The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion.
2143  //------------------------------------------------------------------------------------------------------------------------------
2144  // TRANSFER FUNCTIONS
2145  // ==================
2146  // 709 ..... Rec709 used for some HDTVs
2147  // Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native
2148  // Pq ...... PQ native for HDR10
2149  // Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type
2150  // Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations)
2151  // Three ... Gamma 3.0, less fast, but good for HDR.
2152  //------------------------------------------------------------------------------------------------------------------------------
2153  // KEEPING TO SPEC
2154  // ===============
2155  // Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times.
2156  //  (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range).
2157  //  (b.) For 8-bit  709, steps {0 to 20.7} are in the linear region (8% of the encoding range).
2158  // Also there is a slight step in the transition regions.
2159  // Precision of the coefficients in the spec being the likely cause.
2160  // Main usage case of the sRGB code is to do the linear->sRGB conversion in a compute shader before store.
2161  // This is to work around lack of hardware (typically only ROP does the conversion for free).
2162  // To "correct" the linear segment, would be to introduce error, because hardware decode of sRGB->linear is fixed (and free).
2163  // So this header keeps with the spec.
2164  // For linear->sRGB transforms, the linear segment in some respects reduces error, because rounding in that region is linear.
2165  // Rounding in the curved region in hardware (and fast software code) introduces error due to rounding in non-linear.
2166  //------------------------------------------------------------------------------------------------------------------------------
2167  // FOR PQ
2168  // ======
2169  // Both input and output is {0.0-1.0}, and where output 1.0 represents 10000.0 cd/m^2.
2170  // All constants are only specified to FP32 precision.
2171  // External PQ source reference,
2172  //  - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl
2173  //------------------------------------------------------------------------------------------------------------------------------
2174  // PACKED VERSIONS
2175  // ===============
2176  // These are the A*H2() functions.
2177  // There are no PQ functions as FP16 did not seem to have enough precision for the conversion.
2178  // The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors.
2179  // Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least).
2180  //------------------------------------------------------------------------------------------------------------------------------
2181  // NOTES
2182  // =====
2183  // Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case.
2184  //==============================================================================================================================
2185   #if 1
2186    AF1 ATo709F1(AF1 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
2187     return clamp(j.x  ,c*j.y  ,pow(c,j.z  )*k.x  +k.y  );}
2188    AF2 ATo709F2(AF2 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
2189     return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
2190    AF3 ATo709F3(AF3 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
2191     return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
2192  //------------------------------------------------------------------------------------------------------------------------------
2193    // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma().
2194    AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,AF1_(rcpX));} 
2195    AF2 AToGammaF2(AF2 c,AF1 rcpX){return pow(c,AF2_(rcpX));} 
2196    AF3 AToGammaF3(AF3 c,AF1 rcpX){return pow(c,AF3_(rcpX));} 
2197  //------------------------------------------------------------------------------------------------------------------------------
2198    AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302));
2199     return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));}
2200    AF2 AToPqF1(AF2 x){AF2 p=pow(x,AF2_(0.159302));
2201     return pow((AF2_(0.835938)+AF2_(18.8516)*p)/(AF2_(1.0)+AF2_(18.6875)*p),AF2_(78.8438));}
2202    AF3 AToPqF1(AF3 x){AF3 p=pow(x,AF3_(0.159302));
2203     return pow((AF3_(0.835938)+AF3_(18.8516)*p)/(AF3_(1.0)+AF3_(18.6875)*p),AF3_(78.8438));}
2204  //------------------------------------------------------------------------------------------------------------------------------
2205    AF1 AToSrgbF1(AF1 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
2206     return clamp(j.x  ,c*j.y  ,pow(c,j.z  )*k.x  +k.y  );}
2207    AF2 AToSrgbF2(AF2 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
2208     return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
2209    AF3 AToSrgbF3(AF3 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
2210     return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
2211  //------------------------------------------------------------------------------------------------------------------------------
2212    AF1 AToTwoF1(AF1 c){return sqrt(c);}
2213    AF2 AToTwoF2(AF2 c){return sqrt(c);}
2214    AF3 AToTwoF3(AF3 c){return sqrt(c);}
2215  //------------------------------------------------------------------------------------------------------------------------------
2216    AF1 AToThreeF1(AF1 c){return pow(c,AF1_(1.0/3.0));}
2217    AF2 AToThreeF2(AF2 c){return pow(c,AF2_(1.0/3.0));}
2218    AF3 AToThreeF3(AF3 c){return pow(c,AF3_(1.0/3.0));}
2219   #endif
2220  //==============================================================================================================================
2221   #if 1
2222    // Unfortunately median won't work here.
2223    AF1 AFrom709F1(AF1 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
2224     return AZolSelF1(AZolSignedF1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
2225    AF2 AFrom709F2(AF2 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
2226     return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
2227    AF3 AFrom709F3(AF3 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
2228     return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
2229  //------------------------------------------------------------------------------------------------------------------------------
2230    AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,AF1_(x));} 
2231    AF2 AFromGammaF2(AF2 c,AF1 x){return pow(c,AF2_(x));} 
2232    AF3 AFromGammaF3(AF3 c,AF1 x){return pow(c,AF3_(x));} 
2233  //------------------------------------------------------------------------------------------------------------------------------
2234    AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833));
2235     return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));}
2236    AF2 AFromPqF1(AF2 x){AF2 p=pow(x,AF2_(0.0126833));
2237     return pow(ASatF2(p-AF2_(0.835938))/(AF2_(18.8516)-AF2_(18.6875)*p),AF2_(6.27739));}
2238    AF3 AFromPqF1(AF3 x){AF3 p=pow(x,AF3_(0.0126833));
2239     return pow(ASatF3(p-AF3_(0.835938))/(AF3_(18.8516)-AF3_(18.6875)*p),AF3_(6.27739));}
2240  //------------------------------------------------------------------------------------------------------------------------------
2241    // Unfortunately median won't work here.
2242    AF1 AFromSrgbF1(AF1 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
2243     return AZolSelF1(AZolSignedF1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
2244    AF2 AFromSrgbF2(AF2 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
2245     return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
2246    AF3 AFromSrgbF3(AF3 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
2247     return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
2248  //------------------------------------------------------------------------------------------------------------------------------
2249    AF1 AFromTwoF1(AF1 c){return c*c;}
2250    AF2 AFromTwoF2(AF2 c){return c*c;}
2251    AF3 AFromTwoF3(AF3 c){return c*c;}
2252  //------------------------------------------------------------------------------------------------------------------------------
2253    AF1 AFromThreeF1(AF1 c){return c*c*c;}
2254    AF2 AFromThreeF2(AF2 c){return c*c*c;}
2255    AF3 AFromThreeF3(AF3 c){return c*c*c;}
2256   #endif
2257  //==============================================================================================================================
2258   #ifdef A_HALF
2259    AH1 ATo709H1(AH1 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
2260     return clamp(j.x  ,c*j.y  ,pow(c,j.z  )*k.x  +k.y  );}
2261    AH2 ATo709H2(AH2 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
2262     return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
2263    AH3 ATo709H3(AH3 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
2264     return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
2265  //------------------------------------------------------------------------------------------------------------------------------
2266    AH1 AToGammaH1(AH1 c,AH1 rcpX){return pow(c,AH1_(rcpX));}
2267    AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));}
2268    AH3 AToGammaH3(AH3 c,AH1 rcpX){return pow(c,AH3_(rcpX));}
2269  //------------------------------------------------------------------------------------------------------------------------------
2270    AH1 AToSrgbH1(AH1 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
2271     return clamp(j.x  ,c*j.y  ,pow(c,j.z  )*k.x  +k.y  );}
2272    AH2 AToSrgbH2(AH2 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
2273     return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
2274    AH3 AToSrgbH3(AH3 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
2275     return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
2276  //------------------------------------------------------------------------------------------------------------------------------
2277    AH1 AToTwoH1(AH1 c){return sqrt(c);}
2278    AH2 AToTwoH2(AH2 c){return sqrt(c);}
2279    AH3 AToTwoH3(AH3 c){return sqrt(c);}
2280  //------------------------------------------------------------------------------------------------------------------------------
2281    AH1 AToThreeF1(AH1 c){return pow(c,AH1_(1.0/3.0));}
2282    AH2 AToThreeF2(AH2 c){return pow(c,AH2_(1.0/3.0));}
2283    AH3 AToThreeF3(AH3 c){return pow(c,AH3_(1.0/3.0));}
2284   #endif
2285  //==============================================================================================================================
2286   #ifdef A_HALF
2287    AH1 AFrom709H1(AH1 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
2288     return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
2289    AH2 AFrom709H2(AH2 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
2290     return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
2291    AH3 AFrom709H3(AH3 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
2292     return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
2293  //------------------------------------------------------------------------------------------------------------------------------
2294    AH1 AFromGammaH1(AH1 c,AH1 x){return pow(c,AH1_(x));}
2295    AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));}
2296    AH3 AFromGammaH3(AH3 c,AH1 x){return pow(c,AH3_(x));}
2297  //------------------------------------------------------------------------------------------------------------------------------
2298    AH1 AHromSrgbF1(AH1 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
2299     return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
2300    AH2 AHromSrgbF2(AH2 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
2301     return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
2302    AH3 AHromSrgbF3(AH3 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
2303     return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
2304  //------------------------------------------------------------------------------------------------------------------------------
2305    AH1 AFromTwoH1(AH1 c){return c*c;}
2306    AH2 AFromTwoH2(AH2 c){return c*c;}
2307    AH3 AFromTwoH3(AH3 c){return c*c;}
2308  //------------------------------------------------------------------------------------------------------------------------------
2309    AH1 AFromThreeH1(AH1 c){return c*c*c;}
2310    AH2 AFromThreeH2(AH2 c){return c*c*c;}
2311    AH3 AFromThreeH3(AH3 c){return c*c*c;}
2312   #endif
2313  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2314  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2315  //_____________________________________________________________/\_______________________________________________________________
2316  //==============================================================================================================================
2317  //                                                          CS REMAP
2318  //==============================================================================================================================
2319   // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear.
2320   //  543210
2321   //  ======
2322   //  ..xxx.
2323   //  yy...y
2324   AU2 ARmp8x8(AU1 a){return AU2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));}
2325  //==============================================================================================================================
2326   // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions.
2327   //  543210
2328   //  ======
2329   //  .xx..x
2330   //  y..yy.
2331   // Details,
2332   //  LANE TO 8x8 MAPPING
2333   //  ===================
2334   //  00 01 08 09 10 11 18 19 
2335   //  02 03 0a 0b 12 13 1a 1b
2336   //  04 05 0c 0d 14 15 1c 1d
2337   //  06 07 0e 0f 16 17 1e 1f 
2338   //  20 21 28 29 30 31 38 39 
2339   //  22 23 2a 2b 32 33 3a 3b
2340   //  24 25 2c 2d 34 35 3c 3d
2341   //  26 27 2e 2f 36 37 3e 3f 
2342   AU2 ARmpRed8x8(AU1 a){return AU2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));}
2343  //==============================================================================================================================
2344   #ifdef A_HALF
2345    AW2 ARmp8x8H(AU1 a){return AW2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));}
2346    AW2 ARmpRed8x8H(AU1 a){return AW2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));}
2347   #endif
2348  #endif
2349  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2350  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2351  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2352  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2353  //_____________________________________________________________/\_______________________________________________________________
2354  //==============================================================================================================================
2355  //
2356  //                                                          REFERENCE
2357  //
2358  //------------------------------------------------------------------------------------------------------------------------------
2359  // IEEE FLOAT RULES
2360  // ================
2361  //  - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1
2362  //  - {+/-}0 * {+/-}INF = NaN
2363  //  - -INF + (+INF) = NaN
2364  //  - {+/-}0 / {+/-}0 = NaN
2365  //  - {+/-}INF / {+/-}INF = NaN
2366  //  - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN)
2367  //  - 0 == -0
2368  //  - 4/0 = +INF
2369  //  - 4/-0 = -INF
2370  //  - 4+INF = +INF
2371  //  - 4-INF = -INF
2372  //  - 4*(+INF) = +INF
2373  //  - 4*(-INF) = -INF
2374  //  - -4*(+INF) = -INF
2375  //  - sqrt(+INF) = +INF
2376  //------------------------------------------------------------------------------------------------------------------------------
2377  // FP16 ENCODING
2378  // =============
2379  // fedcba9876543210
2380  // ----------------
2381  // ......mmmmmmmmmm  10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals)
2382  // .eeeee..........  5-bit exponent
2383  // .00000..........  denormals
2384  // .00001..........  -14 exponent
2385  // .11110..........   15 exponent
2386  // .111110000000000  infinity
2387  // .11111nnnnnnnnnn  NaN with n!=0
2388  // s...............  sign
2389  //------------------------------------------------------------------------------------------------------------------------------
2390  // FP16/INT16 ALIASING DENORMAL
2391  // ============================
2392  // 11-bit unsigned integers alias with half float denormal/normal values,
2393  //     1 = 2^(-24) = 1/16777216 ....................... first denormal value
2394  //     2 = 2^(-23)
2395  //   ...
2396  //  1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value
2397  //  1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers
2398  //  2047 .............................................. last normal value that still maps to integers 
2399  // Scaling limits,
2400  //  2^15 = 32768 ...................................... largest power of 2 scaling
2401  // Largest pow2 conversion mapping is at *32768,
2402  //     1 : 2^(-9) = 1/512
2403  //     2 : 1/256
2404  //     4 : 1/128
2405  //     8 : 1/64
2406  //    16 : 1/32
2407  //    32 : 1/16
2408  //    64 : 1/8
2409  //   128 : 1/4
2410  //   256 : 1/2
2411  //   512 : 1
2412  //  1024 : 2
2413  //  2047 : a little less than 4
2414  //==============================================================================================================================
2415  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2416  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2417  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2418  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2419  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2420  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2421  //_____________________________________________________________/\_______________________________________________________________
2422  //==============================================================================================================================
2423  //
2424  //
2425  //                                                     GPU/CPU PORTABILITY
2426  //
2427  //
2428  //------------------------------------------------------------------------------------------------------------------------------
2429  // This is the GPU implementation.
2430  // See the CPU implementation for docs.
2431  //==============================================================================================================================
2432  #ifdef A_GPU
// GPU spellings of the portability tokens: boolean literals map to the shader language's true/false,
// and A_STATIC expands to nothing.
2433   #define A_TRUE true
2434   #define A_FALSE false
2435   #define A_STATIC
2436  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2437  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2438  //_____________________________________________________________/\_______________________________________________________________
2439  //==============================================================================================================================
2440  //                                     VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
2441  //==============================================================================================================================
// Vector return-type spellings: on the GPU each retXXn token is simply the vector type XXn.
2442   #define retAD2 AD2
2443   #define retAD3 AD3
2444   #define retAD4 AD4
2445   #define retAF2 AF2
2446   #define retAF3 AF3
2447   #define retAF4 AF4
2448   #define retAL2 AL2
2449   #define retAL3 AL3
2450   #define retAL4 AL4
2451   #define retAU2 AU2
2452   #define retAU3 AU3
2453   #define retAU4 AU4
2454  //------------------------------------------------------------------------------------------------------------------------------
// Input vector parameter spellings: each inXXn token expands to the `in` qualifier followed by the type XXn.
2455   #define inAD2 in AD2
2456   #define inAD3 in AD3
2457   #define inAD4 in AD4
2458   #define inAF2 in AF2
2459   #define inAF3 in AF3
2460   #define inAF4 in AF4
2461   #define inAL2 in AL2
2462   #define inAL3 in AL3
2463   #define inAL4 in AL4
2464   #define inAU2 in AU2
2465   #define inAU3 in AU3
2466   #define inAU4 in AU4
2467  //------------------------------------------------------------------------------------------------------------------------------
// In/out vector parameter spellings: each inoutXXn token expands to the `inout` qualifier followed by the type XXn.
2468   #define inoutAD2 inout AD2
2469   #define inoutAD3 inout AD3
2470   #define inoutAD4 inout AD4
2471   #define inoutAF2 inout AF2
2472   #define inoutAF3 inout AF3
2473   #define inoutAF4 inout AF4
2474   #define inoutAL2 inout AL2
2475   #define inoutAL3 inout AL3
2476   #define inoutAL4 inout AL4
2477   #define inoutAU2 inout AU2
2478   #define inoutAU3 inout AU3
2479   #define inoutAU4 inout AU4
2480  //------------------------------------------------------------------------------------------------------------------------------
// Output vector parameter spellings: each outXXn token expands to the `out` qualifier followed by the type XXn.
2481   #define outAD2 out AD2
2482   #define outAD3 out AD3
2483   #define outAD4 out AD4
2484   #define outAF2 out AF2
2485   #define outAF3 out AF3
2486   #define outAF4 out AF4
2487   #define outAL2 out AL2
2488   #define outAL3 out AL3
2489   #define outAL4 out AL4
2490   #define outAU2 out AU2
2491   #define outAU3 out AU3
2492   #define outAU4 out AU4
2493  //------------------------------------------------------------------------------------------------------------------------------
2494   #define varAD2(x) AD2 x
2495   #define varAD3(x) AD3 x
2496   #define varAD4(x) AD4 x
2497   #define varAF2(x) AF2 x
2498   #define varAF3(x) AF3 x
2499   #define varAF4(x) AF4 x
2500   #define varAL2(x) AL2 x
2501   #define varAL3(x) AL3 x
2502   #define varAL4(x) AL4 x
2503   #define varAU2(x) AU2 x
2504   #define varAU3(x) AU3 x
2505   #define varAU4(x) AU4 x
2506  //------------------------------------------------------------------------------------------------------------------------------
2507   #define initAD2(x,y) AD2(x,y)
2508   #define initAD3(x,y,z) AD3(x,y,z)
2509   #define initAD4(x,y,z,w) AD4(x,y,z,w)
2510   #define initAF2(x,y) AF2(x,y)
2511   #define initAF3(x,y,z) AF3(x,y,z)
2512   #define initAF4(x,y,z,w) AF4(x,y,z,w)
2513   #define initAL2(x,y) AL2(x,y)
2514   #define initAL3(x,y,z) AL3(x,y,z)
2515   #define initAL4(x,y,z,w) AL4(x,y,z,w)
2516   #define initAU2(x,y) AU2(x,y)
2517   #define initAU3(x,y,z) AU3(x,y,z)
2518   #define initAU4(x,y,z,w) AU4(x,y,z,w)
2519  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2520  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2521  //_____________________________________________________________/\_______________________________________________________________
2522  //==============================================================================================================================
2523  //                                                     SCALAR RETURN OPS
2524  //==============================================================================================================================
2525   #define AAbsD1(a) abs(AD1(a))
2526   #define AAbsF1(a) abs(AF1(a))
2527  //------------------------------------------------------------------------------------------------------------------------------
2528   #define ACosD1(a) cos(AD1(a))
2529   #define ACosF1(a) cos(AF1(a))
2530  //------------------------------------------------------------------------------------------------------------------------------
2531   #define ADotD2(a,b) dot(AD2(a),AD2(b))
2532   #define ADotD3(a,b) dot(AD3(a),AD3(b))
2533   #define ADotD4(a,b) dot(AD4(a),AD4(b))
2534   #define ADotF2(a,b) dot(AF2(a),AF2(b))
2535   #define ADotF3(a,b) dot(AF3(a),AF3(b))
2536   #define ADotF4(a,b) dot(AF4(a),AF4(b))
2537  //------------------------------------------------------------------------------------------------------------------------------
2538   #define AExp2D1(a) exp2(AD1(a))
2539   #define AExp2F1(a) exp2(AF1(a))
2540  //------------------------------------------------------------------------------------------------------------------------------
2541   #define AFloorD1(a) floor(AD1(a))
2542   #define AFloorF1(a) floor(AF1(a))
2543  //------------------------------------------------------------------------------------------------------------------------------
2544   #define ALog2D1(a) log2(AD1(a))
2545   #define ALog2F1(a) log2(AF1(a))
2546  //------------------------------------------------------------------------------------------------------------------------------
2547   #define AMaxD1(a,b) max(a,b)
2548   #define AMaxF1(a,b) max(a,b)
2549   #define AMaxL1(a,b) max(a,b)
2550   #define AMaxU1(a,b) max(a,b)
2551  //------------------------------------------------------------------------------------------------------------------------------
2552   #define AMinD1(a,b) min(a,b)
2553   #define AMinF1(a,b) min(a,b)
2554   #define AMinL1(a,b) min(a,b)
2555   #define AMinU1(a,b) min(a,b)
2556  //------------------------------------------------------------------------------------------------------------------------------
2557   #define ASinD1(a) sin(AD1(a))
2558   #define ASinF1(a) sin(AF1(a))
2559  //------------------------------------------------------------------------------------------------------------------------------
2560   #define ASqrtD1(a) sqrt(AD1(a))
2561   #define ASqrtF1(a) sqrt(AF1(a))
2562  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2563  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2564  //_____________________________________________________________/\_______________________________________________________________
2565  //==============================================================================================================================
2566  //                                               SCALAR RETURN OPS - DEPENDENT
2567  //==============================================================================================================================
2568   #define APowD1(a,b) pow(AD1(a),AF1(b))
2569   #define APowF1(a,b) pow(AF1(a),AF1(b))
2570  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2571  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2572  //_____________________________________________________________/\_______________________________________________________________
2573  //==============================================================================================================================
2574  //                                                         VECTOR OPS
2575  //------------------------------------------------------------------------------------------------------------------------------
2576  // These are added as needed for production or prototyping, so not necessarily a complete set.
2577  // They follow a convention of taking in a destination and also returning the destination value to increase utility.
2578  //==============================================================================================================================
2579   #ifdef A_DUBL
2580    AD2 opAAbsD2(outAD2 d,inAD2 a){d=abs(a);return d;}
2581    AD3 opAAbsD3(outAD3 d,inAD3 a){d=abs(a);return d;}
2582    AD4 opAAbsD4(outAD4 d,inAD4 a){d=abs(a);return d;}
2583  //------------------------------------------------------------------------------------------------------------------------------
2584    AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d=a+b;return d;}
2585    AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d=a+b;return d;}
2586    AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d=a+b;return d;}
2587  //------------------------------------------------------------------------------------------------------------------------------
2588    AD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d=a+AD2_(b);return d;}
2589    AD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d=a+AD3_(b);return d;}
2590    AD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d=a+AD4_(b);return d;}
2591  //------------------------------------------------------------------------------------------------------------------------------
2592    AD2 opACpyD2(outAD2 d,inAD2 a){d=a;return d;}
2593    AD3 opACpyD3(outAD3 d,inAD3 a){d=a;return d;}
2594    AD4 opACpyD4(outAD4 d,inAD4 a){d=a;return d;}
2595  //------------------------------------------------------------------------------------------------------------------------------
2596    AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d=ALerpD2(a,b,c);return d;}
2597    AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d=ALerpD3(a,b,c);return d;}
2598    AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d=ALerpD4(a,b,c);return d;}
2599  //------------------------------------------------------------------------------------------------------------------------------
2600    AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d=ALerpD2(a,b,AD2_(c));return d;}
2601    AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d=ALerpD3(a,b,AD3_(c));return d;}
2602    AD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d=ALerpD4(a,b,AD4_(c));return d;}
2603  //------------------------------------------------------------------------------------------------------------------------------
2604    AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d=max(a,b);return d;}
2605    AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d=max(a,b);return d;}
2606    AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d=max(a,b);return d;}
2607  //------------------------------------------------------------------------------------------------------------------------------
2608    AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d=min(a,b);return d;}
2609    AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d=min(a,b);return d;}
2610    AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d=min(a,b);return d;}
2611  //------------------------------------------------------------------------------------------------------------------------------
2612    AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d=a*b;return d;}
2613    AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d=a*b;return d;}
2614    AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d=a*b;return d;}
2615  //------------------------------------------------------------------------------------------------------------------------------
2616    AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d=a*AD2_(b);return d;}
2617    AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d=a*AD3_(b);return d;}
2618    AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d=a*AD4_(b);return d;}
2619  //------------------------------------------------------------------------------------------------------------------------------
2620    AD2 opANegD2(outAD2 d,inAD2 a){d=-a;return d;}
2621    AD3 opANegD3(outAD3 d,inAD3 a){d=-a;return d;}
2622    AD4 opANegD4(outAD4 d,inAD4 a){d=-a;return d;}
2623  //------------------------------------------------------------------------------------------------------------------------------
2624    AD2 opARcpD2(outAD2 d,inAD2 a){d=ARcpD2(a);return d;}
2625    AD3 opARcpD3(outAD3 d,inAD3 a){d=ARcpD3(a);return d;}
2626    AD4 opARcpD4(outAD4 d,inAD4 a){d=ARcpD4(a);return d;}
2627   #endif
2628  //==============================================================================================================================
2629   AF2 opAAbsF2(outAF2 d,inAF2 a){d=abs(a);return d;}
2630   AF3 opAAbsF3(outAF3 d,inAF3 a){d=abs(a);return d;}
2631   AF4 opAAbsF4(outAF4 d,inAF4 a){d=abs(a);return d;}
2632  //------------------------------------------------------------------------------------------------------------------------------
2633   AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d=a+b;return d;}
2634   AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d=a+b;return d;}
2635   AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d=a+b;return d;}
2636  //------------------------------------------------------------------------------------------------------------------------------
2637   AF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d=a+AF2_(b);return d;}
2638   AF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d=a+AF3_(b);return d;}
2639   AF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d=a+AF4_(b);return d;}
2640  //------------------------------------------------------------------------------------------------------------------------------
2641   AF2 opACpyF2(outAF2 d,inAF2 a){d=a;return d;}
2642   AF3 opACpyF3(outAF3 d,inAF3 a){d=a;return d;}
2643   AF4 opACpyF4(outAF4 d,inAF4 a){d=a;return d;}
2644  //------------------------------------------------------------------------------------------------------------------------------
2645   AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d=ALerpF2(a,b,c);return d;}
2646   AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d=ALerpF3(a,b,c);return d;}
2647   AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d=ALerpF4(a,b,c);return d;}
2648  //------------------------------------------------------------------------------------------------------------------------------
2649   AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d=ALerpF2(a,b,AF2_(c));return d;}
2650   AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d=ALerpF3(a,b,AF3_(c));return d;}
2651   AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d=ALerpF4(a,b,AF4_(c));return d;}
2652  //------------------------------------------------------------------------------------------------------------------------------
2653   AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d=max(a,b);return d;}
2654   AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d=max(a,b);return d;}
2655   AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d=max(a,b);return d;}
2656  //------------------------------------------------------------------------------------------------------------------------------
2657   AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d=min(a,b);return d;}
2658   AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d=min(a,b);return d;}
2659   AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d=min(a,b);return d;}
2660  //------------------------------------------------------------------------------------------------------------------------------
2661   AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d=a*b;return d;}
2662   AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d=a*b;return d;}
2663   AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d=a*b;return d;}
2664  //------------------------------------------------------------------------------------------------------------------------------
2665   AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d=a*AF2_(b);return d;}
2666   AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d=a*AF3_(b);return d;}
2667   AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d=a*AF4_(b);return d;}
2668  //------------------------------------------------------------------------------------------------------------------------------
2669   AF2 opANegF2(outAF2 d,inAF2 a){d=-a;return d;}
2670   AF3 opANegF3(outAF3 d,inAF3 a){d=-a;return d;}
2671   AF4 opANegF4(outAF4 d,inAF4 a){d=-a;return d;}
2672  //------------------------------------------------------------------------------------------------------------------------------
2673   AF2 opARcpF2(outAF2 d,inAF2 a){d=ARcpF2(a);return d;}
2674   AF3 opARcpF3(outAF3 d,inAF3 a){d=ARcpF3(a);return d;}
2675   AF4 opARcpF4(outAF4 d,inAF4 a){d=ARcpF4(a);return d;}
2676  #endif
2677  
2678  
// Shader-side glue for the FSR1 header that follows.
// NOTE(review): presumably selects the 32-bit (*F) RCAS path, the same way FSR_EASU_F gates the
// 32-bit EASU section; the RCAS section itself is not visible from here - confirm.
#define FSR_RCAS_F 1
// File-scope constant vector; it is not initialised here.
// NOTE(review): presumably filled with the RCAS constants before the filter runs - confirm against the entry point.
AU4 con0;

// RCAS load callback: fetches the texel at integer coordinate 'p' from mip level 0 of 'source'.
AF4 FsrRcasLoadF(ASU2 p)
{
    AF4 texel = AF4(texelFetch(source, p, 0));
    return texel;
}

// RCAS input callback: intentionally empty, so r, g and b pass through unchanged.
void FsrRcasInputF(inout AF1 r, inout AF1 g, inout AF1 b)
{
}
2684  
2685  //_____________________________________________________________/\_______________________________________________________________
2686  //==============================================================================================================================
2687  //
2688  //
2689  //                    AMD FidelityFX SUPER RESOLUTION [FSR 1] ::: SPATIAL SCALING & EXTRAS - v1.20210629
2690  //
2691  //
2692  //------------------------------------------------------------------------------------------------------------------------------
2693  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2694  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2695  //------------------------------------------------------------------------------------------------------------------------------
2696  // FidelityFX Super Resolution Sample
2697  //
2698  // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
2699  // Permission is hereby granted, free of charge, to any person obtaining a copy
2700  // of this software and associated documentation files(the "Software"), to deal
2701  // in the Software without restriction, including without limitation the rights
2702  // to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
2703  // copies of the Software, and to permit persons to whom the Software is
2704  // furnished to do so, subject to the following conditions :
2705  // The above copyright notice and this permission notice shall be included in
2706  // all copies or substantial portions of the Software.
2707  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2708  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2709  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
2710  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2711  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2712  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
2713  // THE SOFTWARE.
2714  //------------------------------------------------------------------------------------------------------------------------------
2715  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2716  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2717  //------------------------------------------------------------------------------------------------------------------------------
2718  // ABOUT
2719  // =====
2720  // FSR is a collection of algorithms relating to generating a higher resolution image.
2721  // This specific header focuses on single-image non-temporal image scaling, and related tools.
2722  // 
2723  // The core functions are EASU and RCAS:
2724  //  [EASU] Edge Adaptive Spatial Upsampling ....... 1x to 4x area range spatial scaling, clamped adaptive elliptical filter.
2725  //  [RCAS] Robust Contrast Adaptive Sharpening .... A non-scaling variation on CAS.
2726  // RCAS needs to be applied after EASU as a separate pass.
2727  // 
2728  // Optional utility functions are:
2729  //  [LFGA] Linear Film Grain Applicator ........... Tool to apply film grain after scaling.
2730  //  [SRTM] Simple Reversible Tone-Mapper .......... Linear HDR {0 to FP16_MAX} to {0 to 1} and back.
2731  //  [TEPD] Temporal Energy Preserving Dither ...... Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion.
2732  // See each individual sub-section for inline documentation.
2733  //------------------------------------------------------------------------------------------------------------------------------
2734  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2735  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2736  //------------------------------------------------------------------------------------------------------------------------------
2737  // FUNCTION PERMUTATIONS
2738  // =====================
2739  // *F() ..... Single item computation with 32-bit.
2740  // *H() ..... Single item computation with 16-bit, with packing (aka two 16-bit ops in parallel) when possible.
2741  // *Hx2() ... Processing two items in parallel with 16-bit, easier packing.
2742  //            Not all interfaces in this file have a *Hx2() form.
2743  //==============================================================================================================================
2744  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2745  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2746  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2747  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2748  //_____________________________________________________________/\_______________________________________________________________
2749  //==============================================================================================================================
2750  //
2751  //                                        FSR - [EASU] EDGE ADAPTIVE SPATIAL UPSAMPLING
2752  //
2753  //------------------------------------------------------------------------------------------------------------------------------
2754  // EASU provides a high quality spatial-only scaling at relatively low cost.
// Meaning EASU is appropriate for laptops and other low-end GPUs.
2756  // Quality from 1x to 4x area scaling is good.
2757  //------------------------------------------------------------------------------------------------------------------------------
// The scaler uses a modified fast approximation to the standard lanczos(size=2) kernel.
2759  // EASU runs in a single pass, so it applies a directionally and anisotropically adaptive radial lanczos.
2760  // This is also kept as simple as possible to have minimum runtime.
2761  //------------------------------------------------------------------------------------------------------------------------------
// The lanczos filter has negative lobes, so by itself it will introduce ringing.
2763  // To remove all ringing, the algorithm uses the nearest 2x2 input texels as a neighborhood,
2764  // and limits output to the minimum and maximum of that neighborhood.
2765  //------------------------------------------------------------------------------------------------------------------------------
2766  // Input image requirements:
2767  // 
2768  // Color needs to be encoded as 3 channel[red, green, blue](e.g.XYZ not supported)
2769  // Each channel needs to be in the range[0, 1]
2770  // Any color primaries are supported
2771  // Display / tonemapping curve needs to be as if presenting to sRGB display or similar(e.g.Gamma 2.0)
2772  // There should be no banding in the input
2773  // There should be no high amplitude noise in the input
2774  // There should be no noise in the input that is not at input pixel granularity
2775  // For performance purposes, use 32bpp formats
2776  //------------------------------------------------------------------------------------------------------------------------------
2777  // Best to apply EASU at the end of the frame after tonemapping 
2778  // but before film grain or composite of the UI.
2779  //------------------------------------------------------------------------------------------------------------------------------
2780  // Example of including this header for D3D HLSL :
2781  // 
2782  //  #define A_GPU 1
2783  //  #define A_HLSL 1
2784  //  #define A_HALF 1
2785  //  #include "ffx_a.h"
2786  //  #define FSR_EASU_H 1
2787  //  #define FSR_RCAS_H 1
2788  //  //declare input callbacks
2789  //  #include "ffx_fsr1.h"
2790  // 
2791  // Example of including this header for Vulkan GLSL :
2792  // 
2793  //  #define A_GPU 1
2794  //  #define A_GLSL 1
2795  //  #define A_HALF 1
2796  //  #include "ffx_a.h"
2797  //  #define FSR_EASU_H 1
2798  //  #define FSR_RCAS_H 1
2799  //  //declare input callbacks
2800  //  #include "ffx_fsr1.h"
2801  // 
2802  // Example of including this header for Vulkan HLSL :
2803  // 
2804  //  #define A_GPU 1
2805  //  #define A_HLSL 1
2806  //  #define A_HLSL_6_2 1
2807  //  #define A_NO_16_BIT_CAST 1
2808  //  #define A_HALF 1
2809  //  #include "ffx_a.h"
2810  //  #define FSR_EASU_H 1
2811  //  #define FSR_RCAS_H 1
2812  //  //declare input callbacks
2813  //  #include "ffx_fsr1.h"
2814  // 
2815  //  Example of declaring the required input callbacks for GLSL :
2816  //  The callbacks need to gather4 for each color channel using the specified texture coordinate 'p'.
2817  //  EASU uses gather4 to reduce position computation logic and for free Arrays of Structures to Structures of Arrays conversion.
2818  // 
2819  //  AH4 FsrEasuRH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,0));}
2820  //  AH4 FsrEasuGH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,1));}
2821  //  AH4 FsrEasuBH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,2));}
2822  //  ...
2823  //  The FsrEasuCon function needs to be called from the CPU or GPU to set up constants.
2824  //  The difference in viewport and input image size is there to support Dynamic Resolution Scaling.
2825  //  To use FsrEasuCon() on the CPU, define A_CPU before including ffx_a and ffx_fsr1.
2826  //  Including a GPU example here, the 'con0' through 'con3' values would be stored out to a constant buffer.
2827  //  AU4 con0,con1,con2,con3;
2828  //  FsrEasuCon(con0,con1,con2,con3,
2829  //    1920.0,1080.0,  // Viewport size (top left aligned) in the input image which is to be scaled.
2830  //    3840.0,2160.0,  // The size of the input image.
2831  //    2560.0,1440.0); // The output resolution.
2832  //==============================================================================================================================
2833  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2834  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2835  //_____________________________________________________________/\_______________________________________________________________
2836  //==============================================================================================================================
2837  //                                                      CONSTANT SETUP
2838  //==============================================================================================================================
2839  // Call to setup required constant values (works on CPU or GPU).
// Builds the four EASU constant vectors (works on CPU or GPU).
// Every float is stored through AU1_AF1(); FsrEasuF() reads the values back with AF2_AU2().
//  con0.xy .. input viewport size / output size (output pixel -> viewport pixel scale).
//  con0.zw .. 0.5 * that scale - 0.5 (bias applied together with the scale).
//  con1.xy .. 1 / input image size (viewport pixel -> normalized image space).
//  con1.zw .. offset from the upper-left of tap 'F' to gather centre (0).
//  con2.xy .. offset from (0) to gather centre (1).
//  con2.zw .. offset from (0) to gather centre (2).
//  con3.xy .. offset from (0) to gather centre (3).
//  con3.zw .. written as zero.
A_STATIC void FsrEasuCon(
outAU4 con0,
outAU4 con1,
outAU4 con2,
outAU4 con3,
// Resolution of the rendered image that is being upscaled.
AF1 inputViewportInPixelsX,
AF1 inputViewportInPixelsY,
// Resolution of the resource that contains the input image (useful for dynamic resolution).
AF1 inputSizeInPixelsX,
AF1 inputSizeInPixelsY,
// Display resolution the input image is upscaled to.
AF1 outputSizeInPixelsX,
AF1 outputSizeInPixelsY){
 // Each reciprocal is needed several times, so evaluate it once up front.
 AF1 rcpOutX=ARcpF1(outputSizeInPixelsX);
 AF1 rcpOutY=ARcpF1(outputSizeInPixelsY);
 AF1 rcpInX=ARcpF1(inputSizeInPixelsX);
 AF1 rcpInY=ARcpF1(inputSizeInPixelsY);
 // Output integer position to a pixel position in the viewport.
 con0[0]=AU1_AF1(inputViewportInPixelsX*rcpOutX);
 con0[1]=AU1_AF1(inputViewportInPixelsY*rcpOutY);
 con0[2]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsX*rcpOutX-AF1_(0.5));
 con0[3]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsY*rcpOutY-AF1_(0.5));
 // Viewport pixel position to normalized image space.
 // This is used to get the upper-left of the 'F' tap.
 con1[0]=AU1_AF1(rcpInX);
 con1[1]=AU1_AF1(rcpInY);
 // Centres of the four gather4 fetches; the first offset is measured from the upper-left of 'F'.
 //      +---+---+
 //      |   |   |
 //      +--(0)--+
 //      | b | c |
 //  +---F---+---+---+
 //  | e | f | g | h |
 //  +--(1)--+--(2)--+
 //  | i | j | k | l |
 //  +---+---+---+---+
 //      | n | o |
 //      +--(3)--+
 //      |   |   |
 //      +---+---+
 con1[2]=AU1_AF1(AF1_( 1.0)*rcpInX);
 con1[3]=AU1_AF1(AF1_(-1.0)*rcpInY);
 // The remaining offsets are measured from (0) instead of 'F'.
 con2[0]=AU1_AF1(AF1_(-1.0)*rcpInX);
 con2[1]=AU1_AF1(AF1_( 2.0)*rcpInY);
 con2[2]=AU1_AF1(AF1_( 1.0)*rcpInX);
 con2[3]=AU1_AF1(AF1_( 2.0)*rcpInY);
 con3[0]=AU1_AF1(AF1_( 0.0)*rcpInX);
 con3[1]=AU1_AF1(AF1_( 4.0)*rcpInY);
 // The last two components are written as zero.
 con3[3]=0;
 con3[2]=0;}
2887  
// Variant of FsrEasuCon() for an input image that starts at an offset inside its resource.
// It produces the same constants, except that con0[2] and con0[3] also include the offset.
A_STATIC void FsrEasuConOffset(
    outAU4 con0,
    outAU4 con1,
    outAU4 con2,
    outAU4 con3,
    // Resolution of the rendered image that is being upscaled.
    AF1 inputViewportInPixelsX,
    AF1 inputViewportInPixelsY,
    // Resolution of the resource that contains the input image (useful for dynamic resolution).
    AF1 inputSizeInPixelsX,
    AF1 inputSizeInPixelsY,
    // Display resolution the input image is upscaled to.
    AF1 outputSizeInPixelsX,
    AF1 outputSizeInPixelsY,
    // Offset of the input image inside the resource that contains it (useful for dynamic resolution).
    AF1 inputOffsetInPixelsX,
    AF1 inputOffsetInPixelsY) {
    // Start from the constants FsrEasuCon() generates without any offset.
    FsrEasuCon(con0, con1, con2, con3,
               inputViewportInPixelsX, inputViewportInPixelsY,
               inputSizeInPixelsX, inputSizeInPixelsY,
               outputSizeInPixelsX, outputSizeInPixelsY);
    // Recompute the same bias FsrEasuCon() stores in con0[2] and con0[3], then add the offset to it.
    AF1 biasX = AF1_(0.5) * inputViewportInPixelsX * ARcpF1(outputSizeInPixelsX) - AF1_(0.5);
    AF1 biasY = AF1_(0.5) * inputViewportInPixelsY * ARcpF1(outputSizeInPixelsY) - AF1_(0.5);
    con0[2] = AU1_AF1(biasX + inputOffsetInPixelsX);
    con0[3] = AU1_AF1(biasY + inputOffsetInPixelsY);
}
2910  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2911  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2912  //_____________________________________________________________/\_______________________________________________________________
2913  //==============================================================================================================================
2914  //                                                   NON-PACKED 32-BIT VERSION
2915  //==============================================================================================================================
2916  #if defined(A_GPU)&&defined(FSR_EASU_F)
 // Input callback prototypes, need to be implemented by calling shader.
 // Each callback performs a gather4 of one colour channel at texture coordinate 'p'
 // and returns the four gathered values.
 AF4 FsrEasuRF(AF2 p); // Red channel.
 AF4 FsrEasuGF(AF2 p); // Green channel.
 AF4 FsrEasuBF(AF2 p); // Blue channel.
2921  //------------------------------------------------------------------------------------------------------------------------------
2922   // Filtering for a given tap for the scalar.
2923   void FsrEasuTapF(
2924   inout AF3 aC, // Accumulated color, with negative lobe.
2925   inout AF1 aW, // Accumulated weight.
2926   AF2 off, // Pixel offset from resolve position to tap.
2927   AF2 dir, // Gradient direction.
2928   AF2 len, // Length.
2929   AF1 lob, // Negative lobe strength.
2930   AF1 clp, // Clipping point.
2931   AF3 c){ // Tap color.
2932    // Rotate offset by direction.
2933    AF2 v;
2934    v.x=(off.x*( dir.x))+(off.y*dir.y);
2935    v.y=(off.x*(-dir.y))+(off.y*dir.x);
2936    // Anisotropy.
2937    v*=len;
2938    // Compute distance^2.
2939    AF1 d2=v.x*v.x+v.y*v.y;
2940    // Limit to the window as at corner, 2 taps can easily be outside.
2941    d2=min(d2,clp);
2942    // Approximation of lancos2 without sin() or rcp(), or sqrt() to get x.
2943    //  (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2
2944    //  |_______________________________________|   |_______________|
2945    //                   base                             window
2946    // The general form of the 'base' is,
2947    //  (a*(b*x^2-1)^2-(a-1))
2948    // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe.
2949    AF1 wB=AF1_(2.0/5.0)*d2+AF1_(-1.0);
2950    AF1 wA=lob*d2+AF1_(-1.0);
2951    wB*=wB;
2952    wA*=wA;
2953    wB=AF1_(25.0/16.0)*wB+AF1_(-(25.0/16.0-1.0));
2954    AF1 w=wB*wA;
2955    // Do weighted average.
2956    aC+=c*w;aW+=w;}
2957  //------------------------------------------------------------------------------------------------------------------------------
2958   // Accumulate direction and length.
2959   void FsrEasuSetF(
2960   inout AF2 dir,
2961   inout AF1 len,
2962   AF2 pp,
2963   AP1 biS,AP1 biT,AP1 biU,AP1 biV,
2964   AF1 lA,AF1 lB,AF1 lC,AF1 lD,AF1 lE){
2965    // Compute bilinear weight, branches factor out as predicates are compiler time immediates.
2966    //  s t
2967    //  u v
2968    AF1 w = AF1_(0.0);
2969    if(biS)w=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y);
2970    if(biT)w=           pp.x *(AF1_(1.0)-pp.y);
2971    if(biU)w=(AF1_(1.0)-pp.x)*           pp.y ;
2972    if(biV)w=           pp.x *           pp.y ;
2973    // Direction is the '+' diff.
2974    //    a
2975    //  b c d
2976    //    e
2977    // Then takes magnitude from abs average of both sides of 'c'.
2978    // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms.
2979    AF1 dc=lD-lC;
2980    AF1 cb=lC-lB;
2981    AF1 lenX=max(abs(dc),abs(cb));
2982    lenX=APrxLoRcpF1(lenX);
2983    AF1 dirX=lD-lB;
2984    dir.x+=dirX*w;
2985    lenX=ASatF1(abs(dirX)*lenX);
2986    lenX*=lenX;
2987    len+=lenX*w;
2988    // Repeat for the y axis.
2989    AF1 ec=lE-lC;
2990    AF1 ca=lC-lA;
2991    AF1 lenY=max(abs(ec),abs(ca));
2992    lenY=APrxLoRcpF1(lenY);
2993    AF1 dirY=lE-lA;
2994    dir.y+=dirY*w;
2995    lenY=ASatF1(abs(dirY)*lenY);
2996    lenY*=lenY;
2997    len+=lenY*w;}
2998  //------------------------------------------------------------------------------------------------------------------------------
2999   void FsrEasuF(
3000   out AF3 pix,
3001   AU2 ip, // Integer pixel position in output.
3002   AU4 con0, // Constants generated by FsrEasuCon().
3003   AU4 con1,
3004   AU4 con2,
3005   AU4 con3){
3006  //------------------------------------------------------------------------------------------------------------------------------
3007    // Get position of 'f'.
3008    AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw);
3009    AF2 fp=floor(pp);
3010    pp-=fp;
3011  //------------------------------------------------------------------------------------------------------------------------------
3012    // 12-tap kernel.
3013    //    b c
3014    //  e f g h
3015    //  i j k l
3016    //    n o
3017    // Gather 4 ordering.
3018    //  a b
3019    //  r g
3020    // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions,
3021    //    a b    <- unused (z)
3022    //    r g
3023    //  a b a b
3024    //  r g r g
3025    //    a b
3026    //    r g    <- unused (z)
3027    // Allowing dead-code removal to remove the 'z's.
3028    AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw);
3029    // These are from p0 to avoid pulling two constants on pre-Navi hardware.
3030    AF2 p1=p0+AF2_AU2(con2.xy);
3031    AF2 p2=p0+AF2_AU2(con2.zw);
3032    AF2 p3=p0+AF2_AU2(con3.xy);
3033    AF4 bczzR=FsrEasuRF(p0);
3034    AF4 bczzG=FsrEasuGF(p0);
3035    AF4 bczzB=FsrEasuBF(p0);
3036    AF4 ijfeR=FsrEasuRF(p1);
3037    AF4 ijfeG=FsrEasuGF(p1);
3038    AF4 ijfeB=FsrEasuBF(p1);
3039    AF4 klhgR=FsrEasuRF(p2);
3040    AF4 klhgG=FsrEasuGF(p2);
3041    AF4 klhgB=FsrEasuBF(p2);
3042    AF4 zzonR=FsrEasuRF(p3);
3043    AF4 zzonG=FsrEasuGF(p3);
3044    AF4 zzonB=FsrEasuBF(p3);
3045  //------------------------------------------------------------------------------------------------------------------------------
3046    // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD).
3047    AF4 bczzL=bczzB*AF4_(0.5)+(bczzR*AF4_(0.5)+bczzG);
3048    AF4 ijfeL=ijfeB*AF4_(0.5)+(ijfeR*AF4_(0.5)+ijfeG);
3049    AF4 klhgL=klhgB*AF4_(0.5)+(klhgR*AF4_(0.5)+klhgG);
3050    AF4 zzonL=zzonB*AF4_(0.5)+(zzonR*AF4_(0.5)+zzonG);
3051    // Rename.
3052    AF1 bL=bczzL.x;
3053    AF1 cL=bczzL.y;
3054    AF1 iL=ijfeL.x;
3055    AF1 jL=ijfeL.y;
3056    AF1 fL=ijfeL.z;
3057    AF1 eL=ijfeL.w;
3058    AF1 kL=klhgL.x;
3059    AF1 lL=klhgL.y;
3060    AF1 hL=klhgL.z;
3061    AF1 gL=klhgL.w;
3062    AF1 oL=zzonL.z;
3063    AF1 nL=zzonL.w;
3064    // Accumulate for bilinear interpolation.
3065    AF2 dir=AF2_(0.0);
3066    AF1 len=AF1_(0.0);
3067    FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL);
3068    FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL);
3069    FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL);
3070    FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL);
3071  //------------------------------------------------------------------------------------------------------------------------------
3072    // Normalize with approximation, and cleanup close to zero.
3073    AF2 dir2=dir*dir;
3074    AF1 dirR=dir2.x+dir2.y;
3075    AP1 zro=dirR<AF1_(1.0/32768.0);
3076    dirR=APrxLoRsqF1(dirR);
3077    dirR=zro?AF1_(1.0):dirR;
3078    dir.x=zro?AF1_(1.0):dir.x;
3079    dir*=AF2_(dirR);
3080    // Transform from {0 to 2} to {0 to 1} range, and shape with square.
3081    len=len*AF1_(0.5);
3082    len*=len;
3083    // Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}.
3084    AF1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpF1(max(abs(dir.x),abs(dir.y)));
3085    // Anisotropic length after rotation,
3086    //  x := 1.0 lerp to 'stretch' on edges
3087    //  y := 1.0 lerp to 2x on edges
3088    AF2 len2=AF2(AF1_(1.0)+(stretch-AF1_(1.0))*len,AF1_(1.0)+AF1_(-0.5)*len);
3089    // Based on the amount of 'edge',
3090    // the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}.
3091    AF1 lob=AF1_(0.5)+AF1_((1.0/4.0-0.04)-0.5)*len;
3092    // Set distance^2 clipping point to the end of the adjustable window.
3093    AF1 clp=APrxLoRcpF1(lob);
3094  //------------------------------------------------------------------------------------------------------------------------------
3095    // Accumulation mixed with min/max of 4 nearest.
3096    //    b c
3097    //  e f g h
3098    //  i j k l
3099    //    n o
3100    AF3 min4=min(AMin3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)),
3101                 AF3(klhgR.x,klhgG.x,klhgB.x));
3102    AF3 max4=max(AMax3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)),
3103                 AF3(klhgR.x,klhgG.x,klhgB.x));
3104    // Accumulation.
3105    AF3 aC=AF3_(0.0);
3106    AF1 aW=AF1_(0.0);
3107    FsrEasuTapF(aC,aW,AF2( 0.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.x,bczzG.x,bczzB.x)); // b
3108    FsrEasuTapF(aC,aW,AF2( 1.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.y,bczzG.y,bczzB.y)); // c
3109    FsrEasuTapF(aC,aW,AF2(-1.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.x,ijfeG.x,ijfeB.x)); // i
3110    FsrEasuTapF(aC,aW,AF2( 0.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.y,ijfeG.y,ijfeB.y)); // j
3111    FsrEasuTapF(aC,aW,AF2( 0.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.z,ijfeG.z,ijfeB.z)); // f
3112    FsrEasuTapF(aC,aW,AF2(-1.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.w,ijfeG.w,ijfeB.w)); // e
3113    FsrEasuTapF(aC,aW,AF2( 1.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.x,klhgG.x,klhgB.x)); // k
3114    FsrEasuTapF(aC,aW,AF2( 2.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.y,klhgG.y,klhgB.y)); // l
3115    FsrEasuTapF(aC,aW,AF2( 2.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.z,klhgG.z,klhgB.z)); // h
3116    FsrEasuTapF(aC,aW,AF2( 1.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.w,klhgG.w,klhgB.w)); // g
3117    FsrEasuTapF(aC,aW,AF2( 1.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.z,zzonG.z,zzonB.z)); // o
3118    FsrEasuTapF(aC,aW,AF2( 0.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.w,zzonG.w,zzonB.w)); // n
3119  //------------------------------------------------------------------------------------------------------------------------------
3120    // Normalize and dering.
3121    pix=min(max4,max(min4,aC*AF3_(ARcpF1(aW))));}
3122  #endif
3123  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3124  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3125  //_____________________________________________________________/\_______________________________________________________________
3126  //==============================================================================================================================
3127  //                                                    PACKED 16-BIT VERSION
3128  //==============================================================================================================================
3129  #if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_EASU_H)
3130  // Input callback prototypes, need to be implemented by calling shader
3131   AH4 FsrEasuRH(AF2 p);
3132   AH4 FsrEasuGH(AF2 p);
3133   AH4 FsrEasuBH(AF2 p);
3134  //------------------------------------------------------------------------------------------------------------------------------
3135   // Accumulates 2 EASU filter taps in parallel: the two lanes of every AH2 argument carry two independent taps.
3136   void FsrEasuTapH(
3137   inout AH2 aCR,inout AH2 aCG,inout AH2 aCB, // Per-channel weighted color sums, one partial sum per lane.
3138   inout AH2 aW, // Weight sums, one partial sum per lane.
3139   AH2 offX,AH2 offY, // X offsets of both taps, Y offsets of both taps.
3140   AH2 dir, // Direction pair used to rotate the offsets below.
3141   AH2 len, // Per-axis scale applied to the rotated offsets.
3142   AH1 lob, // Factor multiplying the squared distance in the 'wA' term.
3143   AH1 clp, // Upper clamp for the squared distance.
3144   AH2 cR,AH2 cG,AH2 cB){ // Tap colors, split per channel.
3145    AH2 vX,vY;
3146    vX=offX*  dir.xx +offY*dir.yy; // Rotate the offsets by 'dir'.
3147    vY=offX*(-dir.yy)+offY*dir.xx;
3148    vX*=len.x;vY*=len.y; // Scale each rotated axis independently.
3149    AH2 d2=vX*vX+vY*vY; // Squared distance per tap.
3150    d2=min(d2,AH2_(clp)); // Limit the distance so the weight expression is only evaluated up to 'clp'.
3151    AH2 wB=AH2_(2.0/5.0)*d2+AH2_(-1.0);
3152    AH2 wA=AH2_(lob)*d2+AH2_(-1.0);
3153    wB*=wB;
3154    wA*=wA;
3155    wB=AH2_(25.0/16.0)*wB+AH2_(-(25.0/16.0-1.0));
3156    AH2 w=wB*wA; // Tap weight: (25/16*(2/5*d2-1)^2-(25/16-1))*(lob*d2-1)^2.
3157    aCR+=cR*w;aCG+=cG*w;aCB+=cB*w;aW+=w;}
3158  //------------------------------------------------------------------------------------------------------------------------------
3159   // Accumulates direction and length estimates for 2 positions in parallel (one position per AH2 lane).
3160   void FsrEasuSetH(
3161   inout AH2 dirPX,inout AH2 dirPY, // Weighted sums of the X difference (lD-lB) and Y difference (lE-lA).
3162   inout AH2 lenP, // Weighted sum of the squared edge ratios of both axes.
3163   AH2 pp, // Fractional position used to build the weights.
3164   AP1 biST,AP1 biUV, // Select the (1-pp.y) row or the pp.y row of weights; 'w' stays zero if neither is set.
3165   AH2 lA,AH2 lB,AH2 lC,AH2 lD,AH2 lE){ // lB,lC,lD feed the X terms and lA,lC,lE the Y terms.
3166    AH2 w = AH2_(0.0);
3167    if(biST)w=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_(AH1_(1.0)-pp.y); // w=(1-pp.x,pp.x)*(1-pp.y)
3168    if(biUV)w=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_(          pp.y); // w=(1-pp.x,pp.x)*pp.y
3169    // ABS is not free in the packed FP16 path.
3170    AH2 dc=lD-lC;
3171    AH2 cb=lC-lB;
3172    AH2 lenX=max(abs(dc),abs(cb)); // Larger of the two one-sided X differences.
3173    lenX=ARcpH2(lenX);
3174    AH2 dirX=lD-lB;
3175    dirPX+=dirX*w;
3176    lenX=ASatH2(abs(dirX)*lenX); // |lD-lB| relative to the larger one-sided difference, passed through ASatH2.
3177    lenX*=lenX;
3178    lenP+=lenX*w;
3179    AH2 ec=lE-lC;
3180    AH2 ca=lC-lA;
3181    AH2 lenY=max(abs(ec),abs(ca)); // Same construction for the Y axis.
3182    lenY=ARcpH2(lenY);
3183    AH2 dirY=lE-lA;
3184    dirPY+=dirY*w;
3185    lenY=ASatH2(abs(dirY)*lenY);
3186    lenY*=lenY;
3187    lenP+=lenY*w;}
3188  //------------------------------------------------------------------------------------------------------------------------------
3189   void FsrEasuH( // Packed 16-bit EASU: filters one pixel from 12 taps fetched through the FsrEasu{R,G,B}H callbacks.
3190   out AH3 pix, // Filtered color, clamped to the min/max of taps f,g,j,k.
3191   AU2 ip, // Integer pixel position, mapped to a fetch position through con0.
3192   AU4 con0, // .xy scale and .zw offset applied to 'ip' (read through AF2_AU2).
3193   AU4 con1, // .xy scale and .zw offset turning the floored position into fetch position p0.
3194   AU4 con2, // .xy and .zw are the offsets from p0 to p1 and to p2.
3195   AU4 con3){ // .xy is the offset from p0 to p3; .zw is not read here.
3196  //------------------------------------------------------------------------------------------------------------------------------
3197    AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw);
3198    AF2 fp=floor(pp);
3199    pp-=fp; // Keep only the fractional part; the integer part 'fp' selects the taps.
3200    AH2 ppp=AH2(pp);
3201  //------------------------------------------------------------------------------------------------------------------------------
3202    AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw);
3203    AF2 p1=p0+AF2_AU2(con2.xy);
3204    AF2 p2=p0+AF2_AU2(con2.zw);
3205    AF2 p3=p0+AF2_AU2(con3.xy);
3206    AH4 bczzR=FsrEasuRH(p0); // Variable names list the tap held in .x,.y,.z,.w ('z' = unused lane).
3207    AH4 bczzG=FsrEasuGH(p0);
3208    AH4 bczzB=FsrEasuBH(p0);
3209    AH4 ijfeR=FsrEasuRH(p1);
3210    AH4 ijfeG=FsrEasuGH(p1);
3211    AH4 ijfeB=FsrEasuBH(p1);
3212    AH4 klhgR=FsrEasuRH(p2);
3213    AH4 klhgG=FsrEasuGH(p2);
3214    AH4 klhgB=FsrEasuBH(p2);
3215    AH4 zzonR=FsrEasuRH(p3);
3216    AH4 zzonG=FsrEasuGH(p3);
3217    AH4 zzonB=FsrEasuBH(p3);
3218  //------------------------------------------------------------------------------------------------------------------------------
3219    AH4 bczzL=bczzB*AH4_(0.5)+(bczzR*AH4_(0.5)+bczzG); // Luma approximation: 0.5*B+0.5*R+G.
3220    AH4 ijfeL=ijfeB*AH4_(0.5)+(ijfeR*AH4_(0.5)+ijfeG);
3221    AH4 klhgL=klhgB*AH4_(0.5)+(klhgR*AH4_(0.5)+klhgG);
3222    AH4 zzonL=zzonB*AH4_(0.5)+(zzonR*AH4_(0.5)+zzonG);
3223    AH1 bL=bczzL.x;
3224    AH1 cL=bczzL.y;
3225    AH1 iL=ijfeL.x;
3226    AH1 jL=ijfeL.y;
3227    AH1 fL=ijfeL.z;
3228    AH1 eL=ijfeL.w;
3229    AH1 kL=klhgL.x;
3230    AH1 lL=klhgL.y;
3231    AH1 hL=klhgL.z;
3232    AH1 gL=klhgL.w;
3233    AH1 oL=zzonL.z;
3234    AH1 nL=zzonL.w;
3235    // This part is different, accumulating 2 taps in parallel.
3236    AH2 dirPX=AH2_(0.0);
3237    AH2 dirPY=AH2_(0.0);
3238    AH2 lenP=AH2_(0.0);
3239    FsrEasuSetH(dirPX,dirPY,lenP,ppp,true, false,AH2(bL,cL),AH2(eL,fL),AH2(fL,gL),AH2(gL,hL),AH2(jL,kL)); // Centers f and g.
3240    FsrEasuSetH(dirPX,dirPY,lenP,ppp,false,true ,AH2(fL,gL),AH2(iL,jL),AH2(jL,kL),AH2(kL,lL),AH2(nL,oL)); // Centers j and k.
3241    AH2 dir=AH2(dirPX.r+dirPX.g,dirPY.r+dirPY.g); // Sum the two lanes into one direction.
3242    AH1 len=lenP.r+lenP.g;
3243  //------------------------------------------------------------------------------------------------------------------------------
3244    AH2 dir2=dir*dir;
3245    AH1 dirR=dir2.x+dir2.y;
3246    AP1 zro=dirR<AH1_(1.0/32768.0); // Squared length too small to normalize.
3247    dirR=APrxLoRsqH1(dirR);
3248    dirR=zro?AH1_(1.0):dirR; // In the near-zero case skip the rescale and force dir.x to 1.
3249    dir.x=zro?AH1_(1.0):dir.x;
3250    dir*=AH2_(dirR);
3251    len=len*AH1_(0.5);
3252    len*=len;
3253    AH1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpH1(max(abs(dir.x),abs(dir.y)));
3254    AH2 len2=AH2(AH1_(1.0)+(stretch-AH1_(1.0))*len,AH1_(1.0)+AH1_(-0.5)*len); // Per-axis scale handed to FsrEasuTapH.
3255    AH1 lob=AH1_(0.5)+AH1_((1.0/4.0-0.04)-0.5)*len;
3256    AH1 clp=APrxLoRcpH1(lob); // Squared-distance clamp is the approximate reciprocal of 'lob'.
3257  //------------------------------------------------------------------------------------------------------------------------------
3258    // FP16 is different, using packed trick to do min and max in same operation.
3259    AH2 bothR=max(max(AH2(-ijfeR.z,ijfeR.z),AH2(-klhgR.w,klhgR.w)),max(AH2(-ijfeR.y,ijfeR.y),AH2(-klhgR.x,klhgR.x))); // .x=-min, .y=max of f,g,j,k.
3260    AH2 bothG=max(max(AH2(-ijfeG.z,ijfeG.z),AH2(-klhgG.w,klhgG.w)),max(AH2(-ijfeG.y,ijfeG.y),AH2(-klhgG.x,klhgG.x)));
3261    AH2 bothB=max(max(AH2(-ijfeB.z,ijfeB.z),AH2(-klhgB.w,klhgB.w)),max(AH2(-ijfeB.y,ijfeB.y),AH2(-klhgB.x,klhgB.x)));
3262    // This part is different for FP16, working pairs of taps at a time.
3263    AH2 pR=AH2_(0.0);
3264    AH2 pG=AH2_(0.0);
3265    AH2 pB=AH2_(0.0);
3266    AH2 pW=AH2_(0.0);
3267    FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0, 1.0)-ppp.xx,AH2(-1.0,-1.0)-ppp.yy,dir,len2,lob,clp,bczzR.xy,bczzG.xy,bczzB.xy); // b,c
3268    FsrEasuTapH(pR,pG,pB,pW,AH2(-1.0, 0.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,ijfeR.xy,ijfeG.xy,ijfeB.xy); // i,j
3269    FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0,-1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,ijfeR.zw,ijfeG.zw,ijfeB.zw); // f,e
3270    FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 2.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,klhgR.xy,klhgG.xy,klhgB.xy); // k,l
3271    FsrEasuTapH(pR,pG,pB,pW,AH2( 2.0, 1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,klhgR.zw,klhgG.zw,klhgB.zw); // h,g
3272    FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 0.0)-ppp.xx,AH2( 2.0, 2.0)-ppp.yy,dir,len2,lob,clp,zzonR.zw,zzonG.zw,zzonB.zw); // o,n
3273    AH3 aC=AH3(pR.x+pR.y,pG.x+pG.y,pB.x+pB.y); // Fold the two lanes into the final sums.
3274    AH1 aW=pW.x+pW.y;
3275  //------------------------------------------------------------------------------------------------------------------------------
3276    // Slightly different for FP16 version due to combined min and max.
3277    pix=min(AH3(bothR.y,bothG.y,bothB.y),max(-AH3(bothR.x,bothG.x,bothB.x),aC*AH3_(ARcpH1(aW))));}
3278  #endif
3279  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3280  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3281  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3282  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3283  //_____________________________________________________________/\_______________________________________________________________
3284  //==============================================================================================================================
3285  //
3286  //                                      FSR - [RCAS] ROBUST CONTRAST ADAPTIVE SHARPENING
3287  //
3288  //------------------------------------------------------------------------------------------------------------------------------
3289  // CAS uses a simplified mechanism to convert local contrast into a variable amount of sharpness.
3290  // RCAS uses a more exact mechanism, solving for the maximum local sharpness possible before clipping.
3291  // RCAS also has a built in process to limit sharpening of what it detects as possible noise.
3292  // RCAS sharper does not support scaling, as it should be applied after EASU scaling.
3293  // Pass EASU output straight into RCAS, no color conversions necessary.
3294  //------------------------------------------------------------------------------------------------------------------------------
3295  // RCAS is based on the following logic.
3296  // RCAS uses a 5 tap filter in a cross pattern (same as CAS),
3297  //    w                n
3298  //  w 1 w  for taps  w m e 
3299  //    w                s
3300  // Where 'w' is the negative lobe weight.
3301  //  output = (w*(n+e+w+s)+m)/(4*w+1)
3302  // RCAS solves for 'w' by seeing where the signal might clip out of the {0 to 1} input range,
3303  //  0 == (w*(n+e+w+s)+m)/(4*w+1) -> w = -m/(n+e+w+s)
3304  //  1 == (w*(n+e+w+s)+m)/(4*w+1) -> w = (1-m)/(n+e+w+s-4*1)
3305  // Then chooses the 'w' which results in no clipping, limits 'w', and multiplies by the 'sharp' amount.
3306  // This solution above has issues with MSAA input as the steps along the gradient cause edge detection issues.
3307  // So RCAS uses 4x the maximum and 4x the minimum (depending on equation) in place of the individual taps.
3308  // As well as switching from 'm' to either the minimum or maximum (depending on side), to help in energy conservation.
3309  // This stabilizes RCAS.
3310  // RCAS does a simple highpass which is normalized against the local contrast then shaped,
3311  //       0.25
3312  //  0.25  -1  0.25
3313  //       0.25
3314  // This is used as a noise detection filter, to reduce the effect of RCAS on grain, and focus on real edges.
3315  //
3316  //  GLSL example for the required callbacks :
3317  // 
3318  //  AH4 FsrRcasLoadH(ASW2 p){return AH4(imageLoad(imgSrc,ASU2(p)));}
3319  //  void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b)
3320  //  {
3321  //    //do any simple input color conversions here or leave empty if none needed
3322  //  }
3323  //  
3324  //  FsrRcasCon needs to be called from the CPU or GPU to set up constants.
3325  //  Including a GPU example here, the 'con' value would be stored out to a constant buffer.
3326  // 
3327  //  AU4 con;
3328  //  FsrRcasCon(con,
3329  //   0.0); // The scale is {0.0 := maximum sharpness, to N>0, where N is the number of stops (halving) of the reduction of sharpness}.
3330  // ---------------
3331  // RCAS sharpening supports a CAS-like pass-through alpha via,
3332  //  #define FSR_RCAS_PASSTHROUGH_ALPHA 1
3333  // RCAS also supports a define to enable a more expensive path to avoid some sharpening of noise.
3334  // Would suggest it is better to apply film grain after RCAS sharpening (and after scaling) instead of using this define,
3335  //  #define FSR_RCAS_DENOISE 1
3336  //==============================================================================================================================
3337  // This is set at the limit of providing unnatural results for sharpening.
3338  #define FSR_RCAS_LIMIT (0.25-(1.0/16.0))
3339  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3340  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3341  //_____________________________________________________________/\_______________________________________________________________
3342  //==============================================================================================================================
3343  //                                                      CONSTANT SETUP
3344  //==============================================================================================================================
// Builds the RCAS constant block from a sharpness value expressed in stops (usable on CPU or GPU).
A_STATIC void FsrRcasCon(
outAU4 con,
// The scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the reduction of sharpness}.
AF1 sharpness){
 // Stops to linear value.
 AF1 linearSharp=AExp2F1(-sharpness);
 // The same value twice, for the entry written through AU1_AH2_AF2.
 varAF2(pairSharp)=initAF2(linearSharp,linearSharp);
 // con[0] is read by the 32-bit filter (con.x), con[1] by the 16-bit filters (con.y); the rest is cleared.
 con[0]=AU1_AF1(linearSharp);
 con[1]=AU1_AH2_AF2(pairSharp);
 con[2]=0;
 con[3]=0;}
3357  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3358  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3359  //_____________________________________________________________/\_______________________________________________________________
3360  //==============================================================================================================================
3361  //                                                   NON-PACKED 32-BIT VERSION
3362  //==============================================================================================================================
3363  #if defined(A_GPU)&&defined(FSR_RCAS_F)
3364   // Input callback prototypes that need to be implemented by calling shader
3365   AF4 FsrRcasLoadF(ASU2 p);
3366   void FsrRcasInputF(inout AF1 r,inout AF1 g,inout AF1 b);
3367  //------------------------------------------------------------------------------------------------------------------------------
3368   void FsrRcasF( // 32-bit RCAS: sharpens one pixel from a 5-tap cross (b,d,e,f,h) loaded through FsrRcasLoadF.
3369   out AF1 pixR, // Output values, non-vector so port between FsrRcasF() and FsrRcasH() is easy.
3370   out AF1 pixG,
3371   out AF1 pixB,
3372   #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
3373    out AF1 pixA, // Alpha of the center tap, copied through unmodified.
3374   #endif
3375   AU2 ip, // Integer pixel position in output.
3376   AU4 con){ // Constant generated by FsrRcasCon(); only con.x is read here.
3377    // Algorithm uses minimal 3x3 pixel neighborhood.
3378    //    b 
3379    //  d e f
3380    //    h
3381    ASU2 sp=ASU2(ip);
3382    AF3 b=FsrRcasLoadF(sp+ASU2( 0,-1)).rgb;
3383    AF3 d=FsrRcasLoadF(sp+ASU2(-1, 0)).rgb;
3384    #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
3385     AF4 ee=FsrRcasLoadF(sp);
3386     AF3 e=ee.rgb;pixA=ee.a;
3387    #else
3388     AF3 e=FsrRcasLoadF(sp).rgb;
3389    #endif
3390    AF3 f=FsrRcasLoadF(sp+ASU2( 1, 0)).rgb;
3391    AF3 h=FsrRcasLoadF(sp+ASU2( 0, 1)).rgb;
3392    // Rename (32-bit) or regroup (16-bit).
3393    AF1 bR=b.r;
3394    AF1 bG=b.g;
3395    AF1 bB=b.b;
3396    AF1 dR=d.r;
3397    AF1 dG=d.g;
3398    AF1 dB=d.b;
3399    AF1 eR=e.r;
3400    AF1 eG=e.g;
3401    AF1 eB=e.b;
3402    AF1 fR=f.r;
3403    AF1 fG=f.g;
3404    AF1 fB=f.b;
3405    AF1 hR=h.r;
3406    AF1 hG=h.g;
3407    AF1 hB=h.b;
3408    // Run optional input transform.
3409    FsrRcasInputF(bR,bG,bB);
3410    FsrRcasInputF(dR,dG,dB);
3411    FsrRcasInputF(eR,eG,eB);
3412    FsrRcasInputF(fR,fG,fB);
3413    FsrRcasInputF(hR,hG,hB);
3414    // Luma times 2.
3415    AF1 bL=bB*AF1_(0.5)+(bR*AF1_(0.5)+bG);
3416    AF1 dL=dB*AF1_(0.5)+(dR*AF1_(0.5)+dG);
3417    AF1 eL=eB*AF1_(0.5)+(eR*AF1_(0.5)+eG);
3418    AF1 fL=fB*AF1_(0.5)+(fR*AF1_(0.5)+fG);
3419    AF1 hL=hB*AF1_(0.5)+(hR*AF1_(0.5)+hG);
3420    // Noise detection: ring average minus center, normalized by the luma range of all 5 taps ('nz' is only consumed under FSR_RCAS_DENOISE).
3421    AF1 nz=AF1_(0.25)*bL+AF1_(0.25)*dL+AF1_(0.25)*fL+AF1_(0.25)*hL-eL;
3422    nz=ASatF1(abs(nz)*APrxMedRcpF1(AMax3F1(AMax3F1(bL,dL,eL),fL,hL)-AMin3F1(AMin3F1(bL,dL,eL),fL,hL)));
3423    nz=AF1_(-0.5)*nz+AF1_(1.0);
3424    // Min and max of ring.
3425    AF1 mn4R=min(AMin3F1(bR,dR,fR),hR);
3426    AF1 mn4G=min(AMin3F1(bG,dG,fG),hG);
3427    AF1 mn4B=min(AMin3F1(bB,dB,fB),hB);
3428    AF1 mx4R=max(AMax3F1(bR,dR,fR),hR);
3429    AF1 mx4G=max(AMax3F1(bG,dG,fG),hG);
3430    AF1 mx4B=max(AMax3F1(bB,dB,fB),hB);
3431    // Immediate constants for peak range.
3432    AF2 peakC=AF2(1.0,-1.0*4.0);
3433    // Limiters, these need to be high precision RCPs.
3434    AF1 hitMinR=min(mn4R,eR)*ARcpF1(AF1_(4.0)*mx4R); // NOTE(review): the RCP argument is 0 when the ring max is 0 - confirm ARcpF1 behavior for black input.
3435    AF1 hitMinG=min(mn4G,eG)*ARcpF1(AF1_(4.0)*mx4G);
3436    AF1 hitMinB=min(mn4B,eB)*ARcpF1(AF1_(4.0)*mx4B);
3437    AF1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpF1(AF1_(4.0)*mn4R+peakC.y);
3438    AF1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpF1(AF1_(4.0)*mn4G+peakC.y);
3439    AF1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpF1(AF1_(4.0)*mn4B+peakC.y);
3440    AF1 lobeR=max(-hitMinR,hitMaxR);
3441    AF1 lobeG=max(-hitMinG,hitMaxG);
3442    AF1 lobeB=max(-hitMinB,hitMaxB);
3443    AF1 lobe=max(AF1_(-FSR_RCAS_LIMIT),min(AMax3F1(lobeR,lobeG,lobeB),AF1_(0.0)))*AF1_AU1(con.x); // Kept within [-FSR_RCAS_LIMIT,0], then scaled by the sharpness in con.x.
3444    // Apply noise removal.
3445    #ifdef FSR_RCAS_DENOISE
3446     lobe*=nz;
3447    #endif
3448    // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
3449    AF1 rcpL=APrxMedRcpF1(AF1_(4.0)*lobe+AF1_(1.0));
3450    pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; // (lobe*(b+d+h+f)+e)/(4*lobe+1)
3451    pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
3452    pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;
3453    return;} 
3454  #endif
3455  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3456  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3457  //_____________________________________________________________/\_______________________________________________________________
3458  //==============================================================================================================================
3459  //                                                  NON-PACKED 16-BIT VERSION
3460  //==============================================================================================================================
3461  #if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_H)
3462   // Input callback prototypes that need to be implemented by calling shader
3463   AH4 FsrRcasLoadH(ASW2 p);
3464   void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b);
3465  //------------------------------------------------------------------------------------------------------------------------------
3466   void FsrRcasH( // Non-packed 16-bit RCAS: sharpens one pixel from a 5-tap cross (b,d,e,f,h) loaded through FsrRcasLoadH.
3467   out AH1 pixR, // Output values, non-vector so port between FsrRcasF() and FsrRcasH() is easy.
3468   out AH1 pixG,
3469   out AH1 pixB,
3470   #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
3471    out AH1 pixA, // Alpha of the center tap, copied through unmodified.
3472   #endif
3473   AU2 ip, // Integer pixel position in output.
3474   AU4 con){ // Constant generated by FsrRcasCon(); only con.y is read here.
3475    // Sharpening algorithm uses minimal 3x3 pixel neighborhood.
3476    //    b 
3477    //  d e f
3478    //    h
3479    ASW2 sp=ASW2(ip);
3480    AH3 b=FsrRcasLoadH(sp+ASW2( 0,-1)).rgb;
3481    AH3 d=FsrRcasLoadH(sp+ASW2(-1, 0)).rgb;
3482    #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
3483     AH4 ee=FsrRcasLoadH(sp);
3484     AH3 e=ee.rgb;pixA=ee.a;
3485    #else
3486     AH3 e=FsrRcasLoadH(sp).rgb;
3487    #endif
3488    AH3 f=FsrRcasLoadH(sp+ASW2( 1, 0)).rgb;
3489    AH3 h=FsrRcasLoadH(sp+ASW2( 0, 1)).rgb;
3490    // Rename (32-bit) or regroup (16-bit).
3491    AH1 bR=b.r;
3492    AH1 bG=b.g;
3493    AH1 bB=b.b;
3494    AH1 dR=d.r;
3495    AH1 dG=d.g;
3496    AH1 dB=d.b;
3497    AH1 eR=e.r;
3498    AH1 eG=e.g;
3499    AH1 eB=e.b;
3500    AH1 fR=f.r;
3501    AH1 fG=f.g;
3502    AH1 fB=f.b;
3503    AH1 hR=h.r;
3504    AH1 hG=h.g;
3505    AH1 hB=h.b;
3506    // Run optional input transform.
3507    FsrRcasInputH(bR,bG,bB);
3508    FsrRcasInputH(dR,dG,dB);
3509    FsrRcasInputH(eR,eG,eB);
3510    FsrRcasInputH(fR,fG,fB);
3511    FsrRcasInputH(hR,hG,hB);
3512    // Luma times 2.
3513    AH1 bL=bB*AH1_(0.5)+(bR*AH1_(0.5)+bG);
3514    AH1 dL=dB*AH1_(0.5)+(dR*AH1_(0.5)+dG);
3515    AH1 eL=eB*AH1_(0.5)+(eR*AH1_(0.5)+eG);
3516    AH1 fL=fB*AH1_(0.5)+(fR*AH1_(0.5)+fG);
3517    AH1 hL=hB*AH1_(0.5)+(hR*AH1_(0.5)+hG);
3518    // Noise detection: ring average minus center, normalized by the luma range of all 5 taps ('nz' is only consumed under FSR_RCAS_DENOISE).
3519    AH1 nz=AH1_(0.25)*bL+AH1_(0.25)*dL+AH1_(0.25)*fL+AH1_(0.25)*hL-eL;
3520    nz=ASatH1(abs(nz)*APrxMedRcpH1(AMax3H1(AMax3H1(bL,dL,eL),fL,hL)-AMin3H1(AMin3H1(bL,dL,eL),fL,hL)));
3521    nz=AH1_(-0.5)*nz+AH1_(1.0);
3522    // Min and max of ring.
3523    AH1 mn4R=min(AMin3H1(bR,dR,fR),hR);
3524    AH1 mn4G=min(AMin3H1(bG,dG,fG),hG);
3525    AH1 mn4B=min(AMin3H1(bB,dB,fB),hB);
3526    AH1 mx4R=max(AMax3H1(bR,dR,fR),hR);
3527    AH1 mx4G=max(AMax3H1(bG,dG,fG),hG);
3528    AH1 mx4B=max(AMax3H1(bB,dB,fB),hB);
3529    // Immediate constants for peak range.
3530    AH2 peakC=AH2(1.0,-1.0*4.0);
3531    // Limiters, these need to be high precision RCPs.
3532    AH1 hitMinR=min(mn4R,eR)*ARcpH1(AH1_(4.0)*mx4R); // NOTE(review): the RCP argument is 0 when the ring max is 0 - confirm ARcpH1 behavior for black input.
3533    AH1 hitMinG=min(mn4G,eG)*ARcpH1(AH1_(4.0)*mx4G);
3534    AH1 hitMinB=min(mn4B,eB)*ARcpH1(AH1_(4.0)*mx4B);
3535    AH1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH1(AH1_(4.0)*mn4R+peakC.y);
3536    AH1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH1(AH1_(4.0)*mn4G+peakC.y);
3537    AH1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH1(AH1_(4.0)*mn4B+peakC.y);
3538    AH1 lobeR=max(-hitMinR,hitMaxR);
3539    AH1 lobeG=max(-hitMinG,hitMaxG);
3540    AH1 lobeB=max(-hitMinB,hitMaxB);
3541    AH1 lobe=max(AH1_(-FSR_RCAS_LIMIT),min(AMax3H1(lobeR,lobeG,lobeB),AH1_(0.0)))*AH2_AU1(con.y).x; // Kept within [-FSR_RCAS_LIMIT,0], then scaled by the sharpness read from con.y.
3542    // Apply noise removal.
3543    #ifdef FSR_RCAS_DENOISE
3544     lobe*=nz;
3545    #endif
3546    // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
3547    AH1 rcpL=APrxMedRcpH1(AH1_(4.0)*lobe+AH1_(1.0));
3548    pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; // (lobe*(b+d+h+f)+e)/(4*lobe+1)
3549    pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
3550    pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;}
3551  #endif
3552  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3553  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3554  //_____________________________________________________________/\_______________________________________________________________
3555  //==============================================================================================================================
3556  //                                                     PACKED 16-BIT VERSION
3557  //==============================================================================================================================
3558  #if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_HX2)
3559   // Input callback prototypes that need to be implemented by the calling shader
3560   AH4 FsrRcasLoadHx2(ASW2 p);
3561   void FsrRcasInputHx2(inout AH2 r,inout AH2 g,inout AH2 b);
3562  //------------------------------------------------------------------------------------------------------------------------------
// Regroups the packed per-channel pairs (Structures of Arrays) into two RGB pixels (Arrays of Structures) for store.
// Lane .x of every input goes to 'pix0', lane .y goes to 'pix1'.
void FsrRcasDepackHx2(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){
 AH3 rgb0=AH3(pixR.x,pixG.x,pixB.x);
 AH3 rgb1=AH3(pixR.y,pixG.y,pixB.y);
 #ifdef A_HLSL
  // Slower path for DX only, since it won't allow uninitialized values: alpha is written as zero.
  pix1.a=0.0;
  pix0.a=pix1.a;
 #endif
 pix0.rgb=rgb0;
 pix1.rgb=rgb1;}
3571  //------------------------------------------------------------------------------------------------------------------------------
3572   void FsrRcasHx2( // Packed 16-bit RCAS: sharpens the pixel at 'ip' and the pixel 8 columns to its right in the two AH2 lanes.
3573   // Output values are for 2 8x8 tiles in a 16x8 region.
3574   //  pix<R,G,B>.x =  left 8x8 tile
3575   //  pix<R,G,B>.y = right 8x8 tile
3576   // This enables later processing to easily be packed as well.
3577   out AH2 pixR,
3578   out AH2 pixG,
3579   out AH2 pixB,
3580   #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
3581    out AH2 pixA, // Alpha of both center taps, copied through unmodified.
3582   #endif
3583   AU2 ip, // Integer pixel position in output.
3584   AU4 con){ // Constant generated by FsrRcasCon(); only con.y is read here.
3585    // No scaling algorithm uses minimal 3x3 pixel neighborhood.
3586    ASW2 sp0=ASW2(ip);
3587    AH3 b0=FsrRcasLoadHx2(sp0+ASW2( 0,-1)).rgb;
3588    AH3 d0=FsrRcasLoadHx2(sp0+ASW2(-1, 0)).rgb;
3589    #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
3590     AH4 ee0=FsrRcasLoadHx2(sp0);
3591     AH3 e0=ee0.rgb;pixA.r=ee0.a;
3592    #else
3593     AH3 e0=FsrRcasLoadHx2(sp0).rgb;
3594    #endif
3595    AH3 f0=FsrRcasLoadHx2(sp0+ASW2( 1, 0)).rgb;
3596    AH3 h0=FsrRcasLoadHx2(sp0+ASW2( 0, 1)).rgb;
3597    ASW2 sp1=sp0+ASW2(8,0); // Second pixel: same row, 8 columns to the right.
3598    AH3 b1=FsrRcasLoadHx2(sp1+ASW2( 0,-1)).rgb;
3599    AH3 d1=FsrRcasLoadHx2(sp1+ASW2(-1, 0)).rgb;
3600    #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
3601     AH4 ee1=FsrRcasLoadHx2(sp1);
3602     AH3 e1=ee1.rgb;pixA.g=ee1.a;
3603    #else
3604     AH3 e1=FsrRcasLoadHx2(sp1).rgb;
3605    #endif
3606    AH3 f1=FsrRcasLoadHx2(sp1+ASW2( 1, 0)).rgb;
3607    AH3 h1=FsrRcasLoadHx2(sp1+ASW2( 0, 1)).rgb;
3608    // Arrays of Structures to Structures of Arrays conversion.
3609    AH2 bR=AH2(b0.r,b1.r);
3610    AH2 bG=AH2(b0.g,b1.g);
3611    AH2 bB=AH2(b0.b,b1.b);
3612    AH2 dR=AH2(d0.r,d1.r);
3613    AH2 dG=AH2(d0.g,d1.g);
3614    AH2 dB=AH2(d0.b,d1.b);
3615    AH2 eR=AH2(e0.r,e1.r);
3616    AH2 eG=AH2(e0.g,e1.g);
3617    AH2 eB=AH2(e0.b,e1.b);
3618    AH2 fR=AH2(f0.r,f1.r);
3619    AH2 fG=AH2(f0.g,f1.g);
3620    AH2 fB=AH2(f0.b,f1.b);
3621    AH2 hR=AH2(h0.r,h1.r);
3622    AH2 hG=AH2(h0.g,h1.g);
3623    AH2 hB=AH2(h0.b,h1.b);
3624    // Run optional input transform.
3625    FsrRcasInputHx2(bR,bG,bB);
3626    FsrRcasInputHx2(dR,dG,dB);
3627    FsrRcasInputHx2(eR,eG,eB);
3628    FsrRcasInputHx2(fR,fG,fB);
3629    FsrRcasInputHx2(hR,hG,hB);
3630    // Luma times 2.
3631    AH2 bL=bB*AH2_(0.5)+(bR*AH2_(0.5)+bG);
3632    AH2 dL=dB*AH2_(0.5)+(dR*AH2_(0.5)+dG);
3633    AH2 eL=eB*AH2_(0.5)+(eR*AH2_(0.5)+eG);
3634    AH2 fL=fB*AH2_(0.5)+(fR*AH2_(0.5)+fG);
3635    AH2 hL=hB*AH2_(0.5)+(hR*AH2_(0.5)+hG);
3636    // Noise detection: ring average minus center, normalized by the luma range of all 5 taps ('nz' is only consumed under FSR_RCAS_DENOISE).
3637    AH2 nz=AH2_(0.25)*bL+AH2_(0.25)*dL+AH2_(0.25)*fL+AH2_(0.25)*hL-eL;
3638    nz=ASatH2(abs(nz)*APrxMedRcpH2(AMax3H2(AMax3H2(bL,dL,eL),fL,hL)-AMin3H2(AMin3H2(bL,dL,eL),fL,hL)));
3639    nz=AH2_(-0.5)*nz+AH2_(1.0);
3640    // Min and max of ring.
3641    AH2 mn4R=min(AMin3H2(bR,dR,fR),hR);
3642    AH2 mn4G=min(AMin3H2(bG,dG,fG),hG);
3643    AH2 mn4B=min(AMin3H2(bB,dB,fB),hB);
3644    AH2 mx4R=max(AMax3H2(bR,dR,fR),hR);
3645    AH2 mx4G=max(AMax3H2(bG,dG,fG),hG);
3646    AH2 mx4B=max(AMax3H2(bB,dB,fB),hB);
3647    // Immediate constants for peak range.
3648    AH2 peakC=AH2(1.0,-1.0*4.0);
3649    // Limiters, these need to be high precision RCPs.
3650    AH2 hitMinR=min(mn4R,eR)*ARcpH2(AH2_(4.0)*mx4R); // NOTE(review): the RCP argument is 0 when the ring max is 0 - confirm ARcpH2 behavior for black input.
3651    AH2 hitMinG=min(mn4G,eG)*ARcpH2(AH2_(4.0)*mx4G);
3652    AH2 hitMinB=min(mn4B,eB)*ARcpH2(AH2_(4.0)*mx4B);
3653    AH2 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH2(AH2_(4.0)*mn4R+peakC.y);
3654    AH2 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH2(AH2_(4.0)*mn4G+peakC.y);
3655    AH2 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH2(AH2_(4.0)*mn4B+peakC.y);
3656    AH2 lobeR=max(-hitMinR,hitMaxR);
3657    AH2 lobeG=max(-hitMinG,hitMaxG);
3658    AH2 lobeB=max(-hitMinB,hitMaxB);
3659    AH2 lobe=max(AH2_(-FSR_RCAS_LIMIT),min(AMax3H2(lobeR,lobeG,lobeB),AH2_(0.0)))*AH2_(AH2_AU1(con.y).x); // Kept within [-FSR_RCAS_LIMIT,0], then scaled by the sharpness read from con.y.
3660    // Apply noise removal.
3661    #ifdef FSR_RCAS_DENOISE
3662     lobe*=nz;
3663    #endif
3664    // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
3665    AH2 rcpL=APrxMedRcpH2(AH2_(4.0)*lobe+AH2_(1.0));
3666    pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; // (lobe*(b+d+h+f)+e)/(4*lobe+1)
3667    pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
3668    pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;}
3669  #endif
3670  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3671  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3672  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3673  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3674  //_____________________________________________________________/\_______________________________________________________________
3675  //==============================================================================================================================
3676  //
3677  //                                          FSR - [LFGA] LINEAR FILM GRAIN APPLICATOR
3678  //
3679  //------------------------------------------------------------------------------------------------------------------------------
3680  // Adding output-resolution film grain after scaling is a good way to mask both rendering and scaling artifacts.
3681  // Suggest using tiled blue noise as film grain input, with peak noise frequency set for a specific look and feel.
3682  // The 'Lfga*()' functions provide a convenient way to introduce grain.
3683  // These functions limit grain based on distance to signal limits.
3684  // This is done so that the grain is temporally energy preserving, and thus won't modify image tonality.
3685  // Grain application should be done in a linear colorspace.
3686  // The grain should be temporally changing, but have a temporal sum per pixel that adds to zero (non-biased).
3687  //------------------------------------------------------------------------------------------------------------------------------
3688  // Usage,
3689  //   FsrLfga*(
3690  //    color, // In/out linear colorspace color {0 to 1} ranged.
3691  //    grain, // Per pixel grain texture value {-0.5 to 0.5} ranged, input is 3-channel to support colored grain.
3692  //    amount); // Amount of grain {0 to 1} ranged.
3693  //------------------------------------------------------------------------------------------------------------------------------
3694  // Example if grain texture is monochrome: 'FsrLfgaF(color,AF3_(grain),amount)'
3695  //==============================================================================================================================
3696  #if defined(A_GPU)
// Adds grain 't' scaled by amount 'a' to color 'c' in place (32-bit path).
// Maximum grain is the minimum distance to the signal limit, i.e. min(1-c,c) per channel.
void FsrLfgaF(inout AF3 c,AF3 t,AF1 a){
 AF3 scaledGrain=t*AF3_(a);
 AF3 headroom=min(AF3_(1.0)-c,c);
 c+=scaledGrain*headroom;}
3699  #endif
3700  //==============================================================================================================================
3701  #if defined(A_GPU)&&defined(A_HALF)
// Half precision version (slower): adds grain 't' scaled by 'a' to 'c', limited per channel by min(1-c,c).
void FsrLfgaH(inout AH3 c,AH3 t,AH1 a){
 AH3 scaledGrain=t*AH3_(a);
 AH3 headroom=min(AH3_(1.0)-c,c);
 c+=scaledGrain*headroom;}
3704  //------------------------------------------------------------------------------------------------------------------------------
// Packed half precision version (faster): the same grain formula applied to two pixels at once,
// with the color and grain supplied as separate per-channel lane pairs.
void FsrLfgaHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 tR,AH2 tG,AH2 tB,AH1 a){
 AH2 amount=AH2_(a);
 AH2 unit=AH2_(1.0);
 cR+=(tR*amount)*min(unit-cR,cR);
 cG+=(tG*amount)*min(unit-cG,cG);
 cB+=(tB*amount)*min(unit-cB,cB);}
3708  #endif
3709  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3710  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3711  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3712  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3713  //_____________________________________________________________/\_______________________________________________________________
3714  //==============================================================================================================================
3715  //
3716  //                                          FSR - [SRTM] SIMPLE REVERSIBLE TONE-MAPPER
3717  //
3718  //------------------------------------------------------------------------------------------------------------------------------
3719  // This provides a way to take linear HDR color {0 to FP16_MAX} and convert it into a temporary {0 to 1} ranged post-tonemapped linear.
3720  // The tonemapper preserves RGB ratio, which helps maintain HDR color bleed during filtering.
3721  //------------------------------------------------------------------------------------------------------------------------------
3722  // Reversible tonemapper usage,
3723  //  FsrSrtm*(color); // {0 to FP16_MAX} converted to {0 to 1}.
3724  //  FsrSrtmInv*(color); // {0 to 1} converted into {0 to 32768, output peak safe for FP16}.
3725  //==============================================================================================================================
3726  #if defined(A_GPU)
// Forward reversible tonemap, in place: c *= 1 / (max(r,g,b) + 1).
// All three channels share one scale factor, so the RGB ratio is preserved.
void FsrSrtmF(inout AF3 c)
{
  AF1 peak=AMax3F1(c.r,c.g,c.b);
  AF1 scale=ARcpF1(peak+AF1_(1.0));
  c=c*AF3_(scale);
}
// Inverse of the reversible tonemap, in place: c *= 1 / max(1/32768, 1 - max(r,g,b)).
void FsrSrtmInvF(inout AF3 c)
{
  AF1 peak=AMax3F1(c.r,c.g,c.b);
  // Keep the denominator at or above 1/32768 so a peak of 1.0 does not divide by zero.
  AF1 denom=max(AF1_(1.0/32768.0),AF1_(1.0)-peak);
  c=c*AF3_(ARcpF1(denom));
}
3730  #endif
3731  //==============================================================================================================================
3732  #if defined(A_GPU)&&defined(A_HALF)
// Half precision forward reversible tonemap, in place: c *= 1 / (max(r,g,b) + 1).
void FsrSrtmH(inout AH3 c)
{
  AH1 peak=AMax3H1(c.r,c.g,c.b);
  AH1 scale=ARcpH1(peak+AH1_(1.0));
  c=c*AH3_(scale);
}
// Half precision inverse reversible tonemap, in place:
// c *= 1 / max(1/32768, 1 - max(r,g,b)).
void FsrSrtmInvH(inout AH3 c)
{
  AH1 peak=AMax3H1(c.r,c.g,c.b);
  // The lower bound on the denominator avoids a divide by zero when the peak is 1.0.
  AH1 denom=max(AH1_(1.0/32768.0),AH1_(1.0)-peak);
  c=c*AH3_(ARcpH1(denom));
}
3735  //------------------------------------------------------------------------------------------------------------------------------
// Packed half precision forward reversible tonemap.
// Each AH2 carries one channel for two pixels; every channel is multiplied in place
// by the per-lane factor 1 / (max(R,G,B) + 1).
void FsrSrtmHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB)
{
  AH2 peak=AMax3H2(cR,cG,cB);
  AH2 scale=ARcpH2(peak+AH2_(1.0));
  cR=cR*scale;
  cG=cG*scale;
  cB=cB*scale;
}
// Packed half precision inverse reversible tonemap.
// Each AH2 carries one channel for two pixels; every channel is multiplied in place
// by the per-lane factor 1 / max(1/32768, 1 - max(R,G,B)).
void FsrSrtmInvHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB)
{
  AH2 peak=AMax3H2(cR,cG,cB);
  // The lower bound on the denominator avoids a divide by zero when the peak is 1.0.
  AH2 denom=max(AH2_(1.0/32768.0),AH2_(1.0)-peak);
  AH2 scale=ARcpH2(denom);
  cR=cR*scale;
  cG=cG*scale;
  cB=cB*scale;
}
3740  #endif
3741  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3742  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3743  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3744  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3745  //_____________________________________________________________/\_______________________________________________________________
3746  //==============================================================================================================================
3747  //
3748  //                                       FSR - [TEPD] TEMPORAL ENERGY PRESERVING DITHER
3749  //
3750  //------------------------------------------------------------------------------------------------------------------------------
3751  // Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion.
3752  // Gamma 2.0 is used so that the conversion back to linear is just to square the color.
3753  // The conversion comes in 8-bit and 10-bit modes, designed for output to 8-bit UNORM or 10:10:10:2 respectively.
3754  // Given good non-biased temporal blue noise as dither input,
3755  // the output dither will temporally conserve energy.
3756  // This is done by choosing the linear nearest step point instead of perceptual nearest.
3757  // See code below for details.
3758  //------------------------------------------------------------------------------------------------------------------------------
3759  // DX SPEC RULES FOR FLOAT->UNORM 8-BIT CONVERSION
3760  // ===============================================
3761  // - Output is 'uint(floor(saturate(n)*255.0+0.5))'.
3762  // - Thus rounding is to nearest.
3763  // - NaN gets converted to zero.
3764  // - INF is clamped to {0.0 to 1.0}.
3765  //==============================================================================================================================
3766  #if defined(A_GPU)
3767   // Hand tuned integer position to dither value, with more values than simple checkerboard.
3768   // Only 32-bit has enough precision for this computation.
3769   // Output is {0 to <1}.
// Maps an integer position to a dither value: fract((p.x + f) * golden + p.y / 3.69).
//  p - integer position.
//  f - integer added to p.x before the conversion to float
//      (presumably a per-frame offset - confirm with the caller).
AF1 FsrTepdDitF(AU2 p,AU1 f)
{
  // The 1.61803 golden ratio.
  AF1 golden=AF1_((1.0+sqrt(5.0))/2.0);
  // Row weight designed to provide a good visual pattern.
  AF1 rowScale=AF1_(1.0/3.69);
  AF1 col=AF1_(p.x+f);
  AF1 row=AF1_(p.y);
  return AFractF1(col*golden+(row*rowScale));
}
3779  //------------------------------------------------------------------------------------------------------------------------------
// 8-bit gamma 2.0 dithered conversion, in place.
//  c   - in: {0 to 1} linear color; out: {0 to 1} value ready for image store.
//  dit - dither value compared against the per-channel ratio below.
void FsrTepdC8F(inout AF3 c,AF1 dit)
{
  // Size of one 8-bit step.
  AF3 quantum=AF3_(1.0/255.0);
  // Gamma 2.0 value floored to a step, and the step above it.
  AF3 below=floor(sqrt(c)*AF3_(255.0))*quantum;
  AF3 above=below+quantum;
  // The same two steps squared back to linear.
  AF3 belowLin=below*below;
  AF3 aboveLin=above*above;
  // Ratio of 'belowLin' to 'aboveLin' required to produce 'c'.
  // APrxLoRcpF1() won't work here (at least for very high dynamic ranges).
  AF3 ratio=(c-aboveLin)*APrxMedRcpF3(belowLin-aboveLin);
  // The ratio is the cutoff that chooses between the lower and the upper step.
  AF3 bump=AGtZeroF3(AF3_(dit)-ratio);
  c=ASatF3(below+bump*quantum);
}
3795  //------------------------------------------------------------------------------------------------------------------------------
// 10-bit gamma 2.0 dithered conversion, in place.
//  c   - in: {0 to 1} linear color; out: {0 to 1} value ready for image store.
//  dit - dither value compared against the per-channel ratio below.
void FsrTepdC10F(inout AF3 c,AF1 dit)
{
  // Size of one 10-bit step.
  AF3 quantum=AF3_(1.0/1023.0);
  // Gamma 2.0 value floored to a step, and the step above it.
  AF3 below=floor(sqrt(c)*AF3_(1023.0))*quantum;
  AF3 above=below+quantum;
  // The same two steps squared back to linear.
  AF3 belowLin=below*below;
  AF3 aboveLin=above*above;
  // Ratio of 'belowLin' to 'aboveLin' required to produce 'c'.
  AF3 ratio=(c-aboveLin)*APrxMedRcpF3(belowLin-aboveLin);
  // The ratio is the cutoff that chooses between the lower and the upper step.
  AF3 bump=AGtZeroF3(AF3_(dit)-ratio);
  c=ASatF3(below+bump*quantum);
}
3806  #endif
3807  //==============================================================================================================================
3808  #if defined(A_GPU)&&defined(A_HALF)
// Half precision result of the position-to-dither mapping.
// The arithmetic is done in 32-bit float and only the final fract() is converted to AH1.
//  p - integer position.
//  f - integer added to p.x before the conversion to float
//      (presumably a per-frame offset - confirm with the caller).
AH1 FsrTepdDitH(AU2 p,AU1 f)
{
  // The 1.61803 golden ratio.
  AF1 golden=AF1_((1.0+sqrt(5.0))/2.0);
  // Row weight, same constant as the 32-bit version.
  AF1 rowScale=AF1_(1.0/3.69);
  AF1 col=AF1_(p.x+f);
  AF1 row=AF1_(p.y);
  AF1 mixed=col*golden+(row*rowScale);
  return AH1(AFractF1(mixed));
}
3816  //------------------------------------------------------------------------------------------------------------------------------
// Half precision 8-bit gamma 2.0 dithered conversion, in place.
//  c   - in/out color.
//  dit - dither value compared against the per-channel ratio below.
void FsrTepdC8H(inout AH3 c,AH1 dit)
{
  // Size of one 8-bit step.
  AH3 quantum=AH3_(1.0/255.0);
  // Gamma 2.0 value floored to a step, and the step above it.
  AH3 below=floor(sqrt(c)*AH3_(255.0))*quantum;
  AH3 above=below+quantum;
  // The same two steps squared back to linear.
  AH3 belowLin=below*below;
  AH3 aboveLin=above*above;
  // Ratio used as the cutoff between the lower and the upper step.
  AH3 ratio=(c-aboveLin)*APrxMedRcpH3(belowLin-aboveLin);
  AH3 bump=AGtZeroH3(AH3_(dit)-ratio);
  c=ASatH3(below+bump*quantum);
}
3824  //------------------------------------------------------------------------------------------------------------------------------
// Half precision 10-bit gamma 2.0 dithered conversion, in place.
//  c   - in/out color.
//  dit - dither value compared against the per-channel ratio below.
void FsrTepdC10H(inout AH3 c,AH1 dit)
{
  // Size of one 10-bit step.
  AH3 quantum=AH3_(1.0/1023.0);
  // Gamma 2.0 value floored to a step, and the step above it.
  AH3 below=floor(sqrt(c)*AH3_(1023.0))*quantum;
  AH3 above=below+quantum;
  // The same two steps squared back to linear.
  AH3 belowLin=below*below;
  AH3 aboveLin=above*above;
  // Ratio used as the cutoff between the lower and the upper step.
  AH3 ratio=(c-aboveLin)*APrxMedRcpH3(belowLin-aboveLin);
  AH3 bump=AGtZeroH3(AH3_(dit)-ratio);
  c=ASatH3(below+bump*quantum);
}
3832  //==============================================================================================================================
// Computes the dither value for positions 'p' and 'p+{8,0}' and returns both as an AH2.
// The arithmetic is done in 32-bit float; only the final fract() is converted to half.
//  p - integer position of the first pixel.
//  f - integer added to p.x before the conversion to float
//      (presumably a per-frame offset - confirm with the caller).
AH2 FsrTepdDitHx2(AU2 p,AU1 f)
{
  // The 1.61803 golden ratio.
  AF1 golden=AF1_((1.0+sqrt(5.0))/2.0);
  // Row weight, same constant as the scalar versions.
  AF1 rowScale=AF1_(1.0/3.69);
  // Column of the first pixel, and of the pixel 8 to its right.
  AF1 col0=AF1_(p.x+f);
  AF2 cols=AF2(col0,col0+AF1_(8.0));
  AF1 row=AF1_(p.y);
  AF2 mixed=cols*AF2_(golden)+AF2_(row*rowScale);
  return AH2(AFractF2(mixed));
}
3843  //------------------------------------------------------------------------------------------------------------------------------
// Packed half precision 8-bit gamma 2.0 dithered conversion, in place.
// Each AH2 carries one channel for two pixels; the channels are processed independently.
//  cR,cG,cB - in/out color channels.
//  dit      - dither values, one per lane.
void FsrTepdC8Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit)
{
  AH2 levels=AH2_(255.0);
  // Size of one 8-bit step.
  AH2 quantum=AH2_(1.0/255.0);

  // Red: floored gamma 2.0 step, the step above, then the linear-space cutoff ratio.
  AH2 belowR=floor(sqrt(cR)*levels)*quantum;
  AH2 aboveR=belowR+quantum;
  AH2 belowLinR=belowR*belowR;
  AH2 aboveLinR=aboveR*aboveR;
  AH2 ratioR=(cR-aboveLinR)*APrxMedRcpH2(belowLinR-aboveLinR);
  cR=ASatH2(belowR+AGtZeroH2(dit-ratioR)*quantum);

  // Green.
  AH2 belowG=floor(sqrt(cG)*levels)*quantum;
  AH2 aboveG=belowG+quantum;
  AH2 belowLinG=belowG*belowG;
  AH2 aboveLinG=aboveG*aboveG;
  AH2 ratioG=(cG-aboveLinG)*APrxMedRcpH2(belowLinG-aboveLinG);
  cG=ASatH2(belowG+AGtZeroH2(dit-ratioG)*quantum);

  // Blue.
  AH2 belowB=floor(sqrt(cB)*levels)*quantum;
  AH2 aboveB=belowB+quantum;
  AH2 belowLinB=belowB*belowB;
  AH2 aboveLinB=aboveB*aboveB;
  AH2 ratioB=(cB-aboveLinB)*APrxMedRcpH2(belowLinB-aboveLinB);
  cB=ASatH2(belowB+AGtZeroH2(dit-ratioB)*quantum);
}
3863  //------------------------------------------------------------------------------------------------------------------------------
// Packed half precision 10-bit gamma 2.0 dithered conversion, in place.
// Each AH2 carries one channel for two pixels; the channels are processed independently.
//  cR,cG,cB - in/out color channels.
//  dit      - dither values, one per lane.
void FsrTepdC10Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit)
{
  AH2 levels=AH2_(1023.0);
  // Size of one 10-bit step.
  AH2 quantum=AH2_(1.0/1023.0);

  // Red: floored gamma 2.0 step, the step above, then the linear-space cutoff ratio.
  AH2 belowR=floor(sqrt(cR)*levels)*quantum;
  AH2 aboveR=belowR+quantum;
  AH2 belowLinR=belowR*belowR;
  AH2 aboveLinR=aboveR*aboveR;
  AH2 ratioR=(cR-aboveLinR)*APrxMedRcpH2(belowLinR-aboveLinR);
  cR=ASatH2(belowR+AGtZeroH2(dit-ratioR)*quantum);

  // Green.
  AH2 belowG=floor(sqrt(cG)*levels)*quantum;
  AH2 aboveG=belowG+quantum;
  AH2 belowLinG=belowG*belowG;
  AH2 aboveLinG=aboveG*aboveG;
  AH2 ratioG=(cG-aboveLinG)*APrxMedRcpH2(belowLinG-aboveLinG);
  cG=ASatH2(belowG+AGtZeroH2(dit-ratioG)*quantum);

  // Blue.
  AH2 belowB=floor(sqrt(cB)*levels)*quantum;
  AH2 aboveB=belowB+quantum;
  AH2 belowLinB=belowB*belowB;
  AH2 aboveLinB=aboveB*aboveB;
  AH2 ratioB=(cB-aboveLinB)*APrxMedRcpH2(belowLinB-aboveLinB);
  cB=ASatH2(belowB+AGtZeroH2(dit-ratioB)*quantum);
}
3883  #endif
3884  
3885  
// Runs FsrRcasF for one pixel and stores the result, with alpha 1, into imgOutput.
//  pos - pixel coordinate, used both as the filter position and as the store location.
void CurrFilter(AU2 pos)
{
    AF1 r;
    AF1 g;
    AF1 b;
    // con0 holds the constants written by FsrRcasCon() in main().
    FsrRcasF(r, g, b, pos, con0);
    imageStore(imgOutput, ASU2(pos), AF4(r, g, b, 1.0));
}
3892  
// Compute entry point. Each invocation filters four pixels: its own position inside
// the work group's 16x16 tile plus the positions offset by 8 in x, in x and y, and in y.
void main()
{
    // Fill con0 from the sharpening uniform.
    FsrRcasCon(con0, sharpening_data);

    // Work group IDs are scaled by 16 (<< 4) to get the tile origin.
    AU2 tileOrigin = AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u);
    // ARmp8x8 presumably remaps the linear invocation index into an 8x8 layout - confirm in ffx_a.
    AU2 base = ARmp8x8(gl_LocalInvocationID.x) + tileOrigin;

    // Same visiting order as stepping x+8, then y+8, then x-8.
    CurrFilter(base);
    CurrFilter(base + AU2(8u, 0u));
    CurrFilter(base + AU2(8u, 8u));
    CurrFilter(base + AU2(0u, 8u));
}