/ src / Ryujinx.Graphics.Vulkan / Effects / Shaders / FsrScaling.glsl
FsrScaling.glsl
   1  // Scaling
   2  
#version 430 core
// Compute shader: each workgroup is 64 invocations wide in X (Y and Z sizes are not specified here).
layout (local_size_x = 64) in;
// rgba8 storage image at set 3, binding 0. Presumably the destination of this pass; its use is outside this view.
layout( rgba8, binding = 0, set = 3) uniform image2D imgOutput;
// 2D sampler at set 2, binding 1. Presumably the image being scaled; its use is outside this view.
layout( binding = 1, set = 2) uniform sampler2D Source;
// Uniform block 'dimensions' at binding 2 (no explicit set qualifier), holding ten floats.
// NOTE(review): the names suggest source/destination rectangle edges plus per-axis scale factors, but the code that
// reads them is not visible here; confirm meaning, units and member order against the host-side writer.
layout( binding = 2 ) uniform dimensions{
 float srcX0;
 float srcX1;
 float srcY0;
 float srcY1;
 float dstX0;
 float dstX1;
 float dstY0;
 float dstY1;
 float scaleX;
 float scaleY;
};
  19  
// Configure the portability header that follows: include its GPU related code and tell it the language is GLSL.
#define A_GPU 1
#define A_GLSL 1
  22  //==============================================================================================================================
  23  //
  24  //                                               [A] SHADER PORTABILITY 1.20210629
  25  //
  26  //==============================================================================================================================
  27  // FidelityFX Super Resolution Sample
  28  //
  29  // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
  30  // Permission is hereby granted, free of charge, to any person obtaining a copy
  31  // of this software and associated documentation files(the "Software"), to deal
  32  // in the Software without restriction, including without limitation the rights
  33  // to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
  34  // copies of the Software, and to permit persons to whom the Software is
  35  // furnished to do so, subject to the following conditions :
  36  // The above copyright notice and this permission notice shall be included in
  37  // all copies or substantial portions of the Software.
  38  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  39  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  40  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
  41  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  42  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  43  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  44  // THE SOFTWARE.
  45  //------------------------------------------------------------------------------------------------------------------------------
  46  // MIT LICENSE
  47  // ===========
  48  // Copyright (c) 2014 Michal Drobot (for concepts used in "FLOAT APPROXIMATIONS").
  49  // -----------
  50  // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
  51  // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
  52  // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
  53  // Software is furnished to do so, subject to the following conditions:
  54  // -----------
  55  // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
  56  // Software.
  57  // -----------
  58  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
  59  // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR
  60  // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  61  // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  62  //------------------------------------------------------------------------------------------------------------------------------
  63  // ABOUT
  64  // =====
  65  // Common central point for high-level shading language and C portability for various shader headers.
  66  //------------------------------------------------------------------------------------------------------------------------------
  67  // DEFINES
  68  // =======
  69  // A_CPU ..... Include the CPU related code.
  70  // A_GPU ..... Include the GPU related code.
  71  // A_GLSL .... Using GLSL.
  72  // A_HLSL .... Using HLSL.
  73  // A_HLSL_6_2  Using HLSL 6.2 with new 'uint16_t' and related types (requires '-enable-16bit-types').
// A_NO_16_BIT_CAST Don't use instructions that are not available in SPIR-V (needed for running A_HLSL_6_2 on Vulkan)
  75  // A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default).
  76  // =======
  77  // A_BYTE .... Support 8-bit integer.
  78  // A_HALF .... Support 16-bit integer and floating point.
  79  // A_LONG .... Support 64-bit integer.
  80  // A_DUBL .... Support 64-bit floating point.
  81  // =======
  82  // A_WAVE .... Support wave-wide operations.
  83  //------------------------------------------------------------------------------------------------------------------------------
  84  // To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'.
  85  //------------------------------------------------------------------------------------------------------------------------------
  86  // SIMPLIFIED TYPE SYSTEM
  87  // ======================
  88  //  - All ints will be unsigned with exception of when signed is required.
  89  //  - Type naming simplified and shortened "A<type><#components>",
  90  //     - H = 16-bit float (half)
  91  //     - F = 32-bit float (float)
  92  //     - D = 64-bit float (double)
  93  //     - P = 1-bit integer (predicate, not using bool because 'B' is used for byte)
  94  //     - B = 8-bit integer (byte)
  95  //     - W = 16-bit integer (word)
  96  //     - U = 32-bit integer (unsigned)
  97  //     - L = 64-bit integer (long)
  98  //  - Using "AS<type><#components>" for signed when required.
  99  //------------------------------------------------------------------------------------------------------------------------------
 100  // TODO
 101  // ====
 102  //  - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops).
 103  //------------------------------------------------------------------------------------------------------------------------------
 104  // CHANGE LOG
 105  // ==========
 106  // 20200914 - Expanded wave ops and prx code.
 107  // 20200713 - Added [ZOL] section, fixed serious bugs in sRGB and Rec.709 color conversion code, etc.
 108  //==============================================================================================================================
 109  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 110  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 111  //_____________________________________________________________/\_______________________________________________________________
 112  //==============================================================================================================================
 113  //                                                           COMMON
 114  //==============================================================================================================================
// Two times pi, defined ahead of the CPU section so it does not depend on A_CPU.
#define A_2PI 6.28318530718
 116  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 117  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 118  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 119  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 120  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 121  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 122  //_____________________________________________________________/\_______________________________________________________________
 123  //==============================================================================================================================
 124  //
 125  //
 126  //                                                             CPU
 127  //
 128  //
 129  //==============================================================================================================================
 130  #ifdef A_CPU
 // Supporting user defined overrides.
 // A_RESTRICT is the no-alias pointer qualifier used by the vector argument macros further down. Define it before this
 // point to substitute another spelling; otherwise '__restrict' is used.
 #ifndef A_RESTRICT
  #define A_RESTRICT __restrict
 #endif
 135  //------------------------------------------------------------------------------------------------------------------------------
 // A_STATIC is the prefix placed in front of the CPU helper functions defined below. It defaults to 'static' unless the
 // including code has already defined it.
 #ifndef A_STATIC
  #define A_STATIC static
 #endif
 139  //------------------------------------------------------------------------------------------------------------------------------
 // Same types across CPU and GPU.
 // Predicate uses 32-bit integer (C friendly bool).
 // Letters follow the header's scheme: P=predicate, F=float, D=double, B=8-bit, W=16-bit, U=32-bit, L=64-bit; the
 // plain integer names are unsigned and the 'AS' prefixed names are their signed counterparts.
 // The fixed-width names used here must already be declared (they are the <stdint.h> names).
 typedef uint32_t AP1;
 typedef float AF1;
 typedef double AD1;
 typedef uint8_t AB1;
 typedef uint16_t AW1;
 typedef uint32_t AU1;
 typedef uint64_t AL1;
 typedef int8_t ASB1;
 typedef int16_t ASW1;
 typedef int32_t ASU1;
 typedef int64_t ASL1;
 153  //------------------------------------------------------------------------------------------------------------------------------
 // Value casts to the scalar types (a C cast, i.e. a numeric conversion rather than a bit reinterpretation).
 // The argument is parenthesized, so an arbitrary expression may be passed.
 #define AD1_(a) ((AD1)(a))
 #define AF1_(a) ((AF1)(a))
 #define AL1_(a) ((AL1)(a))
 #define AU1_(a) ((AU1)(a))
 158  //------------------------------------------------------------------------------------------------------------------------------
 // Value casts to the signed 64-bit and signed 32-bit integer types.
 #define ASL1_(a) ((ASL1)(a))
 #define ASU1_(a) ((ASU1)(a))
 161  //------------------------------------------------------------------------------------------------------------------------------
 162   A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;}
 163  //------------------------------------------------------------------------------------------------------------------------------
 // Integer truth values for the CPU side (the predicate type AP1 is a 32-bit integer, not a bool).
 #define A_TRUE 1
 #define A_FALSE 0
 166  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 167  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 168  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 169  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 170  //_____________________________________________________________/\_______________________________________________________________
 171  //==============================================================================================================================
 172  //
 173  //                                                       CPU/GPU PORTING
 174  //
 175  //------------------------------------------------------------------------------------------------------------------------------
 176  // Get CPU and GPU to share all setup code, without duplicate code paths.
 177  // This uses a lower-case prefix for special vector constructs.
 178  //  - In C restrict pointers are used.
 179  //  - In the shading language, in/inout/out arguments are used.
 180  // This depends on the ability to access a vector value in both languages via array syntax (aka color[2]).
 181  //==============================================================================================================================
 182  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 183  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 184  //_____________________________________________________________/\_______________________________________________________________
 185  //==============================================================================================================================
 186  //                                     VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
 187  //==============================================================================================================================
 // CPU spelling of a vector return type: a restrict-qualified pointer to the component type.
 // The 2/3/4 variants expand identically; the component count is carried by the macro name only.
 #define retAD2 AD1 *A_RESTRICT
 #define retAD3 AD1 *A_RESTRICT
 #define retAD4 AD1 *A_RESTRICT
 #define retAF2 AF1 *A_RESTRICT
 #define retAF3 AF1 *A_RESTRICT
 #define retAF4 AF1 *A_RESTRICT
 #define retAL2 AL1 *A_RESTRICT
 #define retAL3 AL1 *A_RESTRICT
 #define retAL4 AL1 *A_RESTRICT
 #define retAU2 AU1 *A_RESTRICT
 #define retAU3 AU1 *A_RESTRICT
 #define retAU4 AU1 *A_RESTRICT
 200  //------------------------------------------------------------------------------------------------------------------------------
 // CPU spelling of an 'in' vector argument: a restrict-qualified pointer to the component type.
 // Note the expansion is not const-qualified, so read-only use is by convention rather than enforced by the compiler.
 #define inAD2 AD1 *A_RESTRICT
 #define inAD3 AD1 *A_RESTRICT
 #define inAD4 AD1 *A_RESTRICT
 #define inAF2 AF1 *A_RESTRICT
 #define inAF3 AF1 *A_RESTRICT
 #define inAF4 AF1 *A_RESTRICT
 #define inAL2 AL1 *A_RESTRICT
 #define inAL3 AL1 *A_RESTRICT
 #define inAL4 AL1 *A_RESTRICT
 #define inAU2 AU1 *A_RESTRICT
 #define inAU3 AU1 *A_RESTRICT
 #define inAU4 AU1 *A_RESTRICT
 213  //------------------------------------------------------------------------------------------------------------------------------
 // CPU spelling of an 'inout' vector argument: a restrict-qualified pointer to the component type.
 // Same expansion as the 'in' and 'out' forms; the direction is expressed only by the macro name.
 #define inoutAD2 AD1 *A_RESTRICT
 #define inoutAD3 AD1 *A_RESTRICT
 #define inoutAD4 AD1 *A_RESTRICT
 #define inoutAF2 AF1 *A_RESTRICT
 #define inoutAF3 AF1 *A_RESTRICT
 #define inoutAF4 AF1 *A_RESTRICT
 #define inoutAL2 AL1 *A_RESTRICT
 #define inoutAL3 AL1 *A_RESTRICT
 #define inoutAL4 AL1 *A_RESTRICT
 #define inoutAU2 AU1 *A_RESTRICT
 #define inoutAU3 AU1 *A_RESTRICT
 #define inoutAU4 AU1 *A_RESTRICT
 226  //------------------------------------------------------------------------------------------------------------------------------
 // CPU spelling of an 'out' vector argument: a restrict-qualified pointer to the component type that the callee
 // writes its result through.
 #define outAD2 AD1 *A_RESTRICT
 #define outAD3 AD1 *A_RESTRICT
 #define outAD4 AD1 *A_RESTRICT
 #define outAF2 AF1 *A_RESTRICT
 #define outAF3 AF1 *A_RESTRICT
 #define outAF4 AF1 *A_RESTRICT
 #define outAL2 AL1 *A_RESTRICT
 #define outAL3 AL1 *A_RESTRICT
 #define outAL4 AL1 *A_RESTRICT
 #define outAU2 AU1 *A_RESTRICT
 #define outAU3 AU1 *A_RESTRICT
 #define outAU4 AU1 *A_RESTRICT
 239  //------------------------------------------------------------------------------------------------------------------------------
 // Declarator for a vector variable on the CPU: 'x' becomes a fixed-size array of 2, 3 or 4 components.
 // Example: 'varAF3(color);' expands to 'AF1 color[3];'.
 #define varAD2(x) AD1 x[2]
 #define varAD3(x) AD1 x[3]
 #define varAD4(x) AD1 x[4]
 #define varAF2(x) AF1 x[2]
 #define varAF3(x) AF1 x[3]
 #define varAF4(x) AF1 x[4]
 #define varAL2(x) AL1 x[2]
 #define varAL3(x) AL1 x[3]
 #define varAL4(x) AL1 x[4]
 #define varAU2(x) AU1 x[2]
 #define varAU3(x) AU1 x[3]
 #define varAU4(x) AU1 x[4]
 252  //------------------------------------------------------------------------------------------------------------------------------
 // Vector initializers on the CPU: each expands to a brace-enclosed list of the given components.
 // The arguments are pasted as-is (no cast and no parentheses are added by the macro).
 #define initAD2(x,y) {x,y}
 #define initAD3(x,y,z) {x,y,z}
 #define initAD4(x,y,z,w) {x,y,z,w}
 #define initAF2(x,y) {x,y}
 #define initAF3(x,y,z) {x,y,z}
 #define initAF4(x,y,z,w) {x,y,z,w}
 #define initAL2(x,y) {x,y}
 #define initAL3(x,y,z) {x,y,z}
 #define initAL4(x,y,z,w) {x,y,z,w}
 #define initAU2(x,y) {x,y}
 #define initAU3(x,y,z) {x,y,z}
 #define initAU4(x,y,z,w) {x,y,z,w}
 265  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 266  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 267  //_____________________________________________________________/\_______________________________________________________________
 268  //==============================================================================================================================
 269  //                                                     SCALAR RETURN OPS
 270  //------------------------------------------------------------------------------------------------------------------------------
 271  // TODO
 272  // ====
 273  //  - Replace transcendentals with manual versions. 
 274  //==============================================================================================================================
 275   #ifdef A_GCC
 276    A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);}
 277    A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);}
 278    A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));}
 279    A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_llabs(ASL1_(a)));}
 280   #else
 281    A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);}
 282    A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);}
 283    A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));}
 284    A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(labs((long)ASL1_(a)));}
 285   #endif
 286  //------------------------------------------------------------------------------------------------------------------------------
 287   #ifdef A_GCC
 288    A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);}
 289    A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);}
 290   #else
 291    A_STATIC AD1 ACosD1(AD1 a){return cos(a);}
 292    A_STATIC AF1 ACosF1(AF1 a){return cosf(a);}
 293   #endif
 294  //------------------------------------------------------------------------------------------------------------------------------
 295   A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];}
 296   A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
 297   A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
 298   A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];}
 299   A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
 300   A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
 301  //------------------------------------------------------------------------------------------------------------------------------
 302   #ifdef A_GCC
 303    A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);}
 304    A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);}
 305   #else
 306    A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);}
 307    A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);}
 308   #endif
 309  //------------------------------------------------------------------------------------------------------------------------------
 310   #ifdef A_GCC
 311    A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);}
 312    A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);}
 313   #else
 314    A_STATIC AD1 AFloorD1(AD1 a){return floor(a);}
 315    A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);}
 316   #endif
 317  //------------------------------------------------------------------------------------------------------------------------------
 318   A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);}
 319   A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);}
 320  //------------------------------------------------------------------------------------------------------------------------------
 321   #ifdef A_GCC
 322    A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);}
 323    A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);}
 324   #else
 325    A_STATIC AD1 ALog2D1(AD1 a){return log2(a);}
 326    A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);}
 327   #endif
 328  //------------------------------------------------------------------------------------------------------------------------------
 329   A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;}
 330   A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;}
 331   A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;}
 332   A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;}
 333  //------------------------------------------------------------------------------------------------------------------------------
 334   // These follow the convention that A integer types don't have signage, until they are operated on. 
 335   A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;}
 336   A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;}
 337  //------------------------------------------------------------------------------------------------------------------------------
 338   A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a<b?a:b;}
 339   A_STATIC AF1 AMinF1(AF1 a,AF1 b){return a<b?a:b;}
 340   A_STATIC AL1 AMinL1(AL1 a,AL1 b){return a<b?a:b;}
 341   A_STATIC AU1 AMinU1(AU1 a,AU1 b){return a<b?a:b;}
 342  //------------------------------------------------------------------------------------------------------------------------------
 343   A_STATIC AL1 AMinSL1(AL1 a,AL1 b){return (ASL1_(a)<ASL1_(b))?a:b;}
 344   A_STATIC AU1 AMinSU1(AU1 a,AU1 b){return (ASU1_(a)<ASU1_(b))?a:b;}
 345  //------------------------------------------------------------------------------------------------------------------------------
 346   A_STATIC AD1 ARcpD1(AD1 a){return 1.0/a;}
 347   A_STATIC AF1 ARcpF1(AF1 a){return 1.0f/a;}
 348  //------------------------------------------------------------------------------------------------------------------------------
 349   A_STATIC AL1 AShrSL1(AL1 a,AL1 b){return AL1_(ASL1_(a)>>ASL1_(b));}
 350   A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));}
 351  //------------------------------------------------------------------------------------------------------------------------------
 352   #ifdef A_GCC
 353    A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);}
 354    A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);}
 355   #else
 356    A_STATIC AD1 ASinD1(AD1 a){return sin(a);}
 357    A_STATIC AF1 ASinF1(AF1 a){return sinf(a);}
 358   #endif
 359  //------------------------------------------------------------------------------------------------------------------------------
 360   #ifdef A_GCC
 361    A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);}
 362    A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);}
 363   #else
 364    A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);}
 365    A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);}
 366   #endif
 367  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 368  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 369  //_____________________________________________________________/\_______________________________________________________________
 370  //==============================================================================================================================
 371  //                                               SCALAR RETURN OPS - DEPENDENT
 372  //==============================================================================================================================
 373   A_STATIC AD1 AClampD1(AD1 x,AD1 n,AD1 m){return AMaxD1(n,AMinD1(x,m));}
 374   A_STATIC AF1 AClampF1(AF1 x,AF1 n,AF1 m){return AMaxF1(n,AMinF1(x,m));}
 375  //------------------------------------------------------------------------------------------------------------------------------
 376   A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);}
 377   A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);}
 378  //------------------------------------------------------------------------------------------------------------------------------
 379   A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));}
 380   A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));}
 381  //------------------------------------------------------------------------------------------------------------------------------
 382   A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));}
 383   A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));}
 384  //------------------------------------------------------------------------------------------------------------------------------
 385   A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));}
 386   A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));}
 387  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 388  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 389  //_____________________________________________________________/\_______________________________________________________________
 390  //==============================================================================================================================
 391  //                                                         VECTOR OPS
 392  //------------------------------------------------------------------------------------------------------------------------------
 393  // These are added as needed for production or prototyping, so not necessarily a complete set.
 394  // They follow a convention of taking in a destination and also returning the destination value to increase utility.
 395  //==============================================================================================================================
 396   A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;}
 397   A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;}
 398   A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;}
 399  //------------------------------------------------------------------------------------------------------------------------------
 400   A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;}
 401   A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;}
 402   A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;}
 403  //==============================================================================================================================
 404   A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
 405   A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
 406   A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
 407  //------------------------------------------------------------------------------------------------------------------------------
 408   A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
 409   A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
 410   A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
 411  //==============================================================================================================================
 412   A_STATIC retAD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;}
 413   A_STATIC retAD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;}
 414   A_STATIC retAD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;}
 415  //------------------------------------------------------------------------------------------------------------------------------
 416   A_STATIC retAF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;}
 417   A_STATIC retAF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;}
 418   A_STATIC retAF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;}
 419  //==============================================================================================================================
 420   A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;}
 421   A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;}
 422   A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;}
 423  //------------------------------------------------------------------------------------------------------------------------------
 424   A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;}
 425   A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;}
 426   A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;}
 427  //==============================================================================================================================
 428   A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;}
 429   A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;}
 430   A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;}
 431  //------------------------------------------------------------------------------------------------------------------------------
 432   A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;}
 433   A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;}
 434   A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;}
 435  //==============================================================================================================================
 // Component-wise lerp with ONE shared blend factor: d[i]=ALerp*1(a[i],b[i],c), where c is a single scalar
 // (AD1/AF1) applied to every component. Returns d.
 436   A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;}
 437   A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;}
 438   A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;}
 439  //------------------------------------------------------------------------------------------------------------------------------
 440   A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;}
 441   A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;}
 442   A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;}
 443  //==============================================================================================================================
 // Component-wise maximum: d[i]=AMax*1(a[i],b[i]) for every component; returns d.
 // The scalar AMaxD1/AMaxF1 helpers are defined outside this chunk.
 444   A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;}
 445   A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;}
 446   A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;}
 447  //------------------------------------------------------------------------------------------------------------------------------
 448   A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;}
 449   A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;}
 450   A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;}
 451  //==============================================================================================================================
 // Component-wise minimum: d[i]=AMin*1(a[i],b[i]) for every component; returns d.
 // The scalar AMinD1/AMinF1 helpers are defined outside this chunk.
 452   A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;}
 453   A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;}
 454   A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;}
 455  //------------------------------------------------------------------------------------------------------------------------------
 456   A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;}
 457   A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;}
 458   A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;}
 459  //==============================================================================================================================
 // Component-wise product: d[i]=a[i]*b[i] for every component; returns d.
 460   A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;}
 461   A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;}
 462   A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;}
 463  //------------------------------------------------------------------------------------------------------------------------------
 464   A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;}
 465   A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;}
 466   A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;}
 467  //==============================================================================================================================
 // Scale by one scalar: d[i]=a[i]*b for every component, where b is a single AD1/AF1 value; returns d.
 468   A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;}
 469   A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;}
 470   A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;}
 471  //------------------------------------------------------------------------------------------------------------------------------
 472   A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;}
 473   A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;}
 474   A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;}
 475  //==============================================================================================================================
 // Component-wise negation: d[i]=-a[i] for every component; returns d.
 476   A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){d[0]=-a[0];d[1]=-a[1];return d;}
 477   A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;}
 478   A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;}
 479  //------------------------------------------------------------------------------------------------------------------------------
 480   A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){d[0]=-a[0];d[1]=-a[1];return d;}
 481   A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;}
 482   A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;}
 483  //==============================================================================================================================
 // Component-wise reciprocal: d[i]=ARcp*1(a[i]) for every component; returns d.
 // The scalar ARcpD1/ARcpF1 helpers are defined outside this chunk, so zero handling depends on them.
 484   A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);return d;}
 485   A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);return d;}
 486   A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);d[3]=ARcpD1(a[3]);return d;}
 487  //------------------------------------------------------------------------------------------------------------------------------
 488   A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);return d;}
 489   A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);return d;}
 490   A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);d[3]=ARcpF1(a[3]);return d;}
 491  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 492  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 493  //_____________________________________________________________/\_______________________________________________________________
 494  //==============================================================================================================================
 495  //                                                     HALF FLOAT PACKING
 496  //==============================================================================================================================
 497   // Convert float to half (in lower 16-bits of output).
 498   // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
 499   // Supports denormals.
 500   // Conversion rules are to make computations possibly "safer" on the GPU,
 501   //  -INF & -NaN -> -65504
 502   //  +INF & +NaN -> +65504
 503   A_STATIC AU1 AU1_AH1_AF1(AF1 f){
  // base[i]: half-float bits contributed by the top 9 bits of the input (i=u>>23). Assuming AF1 is a 32-bit IEEE-754
  // float (typedef is outside this chunk), i is the sign bit followed by the 8 exponent bits, so entries 0..255 cover
  // positive inputs and 256..511 the same exponents with the half sign bit 0x8000 set.
  //  - 0x0000/0x8000: magnitude too small for a half denormal, result is signed zero.
  //  - 0x0001..0x0200: single set bit that becomes the leading bit of a half denormal.
  //  - 0x0400..0x7800 in steps of 0x0400: half exponent field for the normal range.
  //  - 0x7bff/0xfbff: clamp value for everything larger (the +/-65504 rule in the comment above the function).
 504    static AW1 base[512]={
 505     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 506     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 507     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 508     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 509     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 510     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 511     0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100,
 512     0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00,
 513     0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff,
 514     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 515     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 516     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 517     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 518     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 519     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 520     0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 521     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 522     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 523     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 524     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 525     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 526     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 527     0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100,
 528     0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00,
 529     0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff,
 530     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 531     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 532     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 533     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 534     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 535     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 536     0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff};
  // shift[i]: right shift applied to the low 23 bits of the input before they are added to base[i].
  //  - 0x18 (24) discards all 23 bits, so base[i] alone is the result (zero and clamp ranges).
  //  - 0x0d (13) keeps the top 10 of the 23 bits, matching the 10-bit half mantissa (normal range).
  //  - 0x17 down to 0x0e keeps progressively more bits across the half-denormal range.
  // Low bits are dropped by the shift, i.e. the conversion truncates rather than rounds.
 537    static AB1 shift[512]={
 538     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 539     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 540     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 541     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 542     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 543     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 544     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
 545     0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
 546     0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
 547     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 548     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 549     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 550     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 551     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 552     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 553     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 554     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 555     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 556     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 557     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 558     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 559     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 560     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
 561     0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
 562     0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
 563     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 564     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 565     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 566     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 567     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 568     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 569     0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18};
  // Read the raw bits of f through a union, index both tables with the top 9 bits, and add the shifted low 23 bits
  // to the base pattern. The result occupies the low 16 bits of the returned AU1.
 570    union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);}
 571  //------------------------------------------------------------------------------------------------------------------------------
 572   // Used to output packed constant.
 // Converts a[0] and a[1] with AU1_AH1_AF1 and packs them into one AU1: a[0] in the low 16 bits, a[1] shifted into
 // the high 16 bits.
 573   A_STATIC AU1 AU1_AH2_AF2(inAF2 a){return AU1_AH1_AF1(a[0])+(AU1_AH1_AF1(a[1])<<16);}
 574  #endif
 575  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 576  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 577  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 578  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 579  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 580  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 581  //_____________________________________________________________/\_______________________________________________________________
 582  //==============================================================================================================================
 583  //
 584  //
 585  //                                                            GLSL
 586  //
 587  //
 588  //==============================================================================================================================
 589  #if defined(A_GLSL) && defined(A_GPU)
 // Extension requirements for the optional feature sets. Define A_SKIP_EXT before this point to suppress every
 // #extension line below (for example when the including shader enables them itself).
 590   #ifndef A_SKIP_EXT
  // A_HALF: 16-bit storage and explicit-width arithmetic types.
 591    #ifdef A_HALF
 592     #extension GL_EXT_shader_16bit_storage:require
 593     #extension GL_EXT_shader_explicit_arithmetic_types:require 
 594    #endif
 595  //------------------------------------------------------------------------------------------------------------------------------
  // A_LONG: 64-bit integers and 64-bit integer atomics.
 596    #ifdef A_LONG
 597     #extension GL_ARB_gpu_shader_int64:require
 598     #extension GL_NV_shader_atomic_int64:require
 599    #endif
 600  //------------------------------------------------------------------------------------------------------------------------------
  // A_WAVE: subgroup arithmetic, ballot, quad and shuffle operations.
 601    #ifdef A_WAVE
 602     #extension GL_KHR_shader_subgroup_arithmetic:require
 603     #extension GL_KHR_shader_subgroup_ballot:require
 604     #extension GL_KHR_shader_subgroup_quad:require
 605     #extension GL_KHR_shader_subgroup_shuffle:require
 606    #endif
 607   #endif
 608  //==============================================================================================================================
 // Portable type names mapped onto GLSL built-in types. The digit is the component count.
 //  AP* = bool / bvecN, AF* = float / vecN, AU* = uint / uvecN, ASU* = int / ivecN.
 609   #define AP1 bool
 610   #define AP2 bvec2
 611   #define AP3 bvec3
 612   #define AP4 bvec4
 613  //------------------------------------------------------------------------------------------------------------------------------
 614   #define AF1 float
 615   #define AF2 vec2
 616   #define AF3 vec3
 617   #define AF4 vec4
 618  //------------------------------------------------------------------------------------------------------------------------------
 619   #define AU1 uint
 620   #define AU2 uvec2
 621   #define AU3 uvec3
 622   #define AU4 uvec4
 623  //------------------------------------------------------------------------------------------------------------------------------
 624   #define ASU1 int
 625   #define ASU2 ivec2
 626   #define ASU3 ivec3
 627   #define ASU4 ivec4
 628  //==============================================================================================================================
 // Bit casts between uint and float vectors (no numeric conversion of the payload).
 // AFn_AUn(x): x is first constructed as AUn, then passed to uintBitsToFloat().
 // AUn_AFn(x): x is first constructed as AFn, then passed to floatBitsToUint().
 629   #define AF1_AU1(x) uintBitsToFloat(AU1(x))
 630   #define AF2_AU2(x) uintBitsToFloat(AU2(x))
 631   #define AF3_AU3(x) uintBitsToFloat(AU3(x))
 632   #define AF4_AU4(x) uintBitsToFloat(AU4(x))
 633  //------------------------------------------------------------------------------------------------------------------------------
 634   #define AU1_AF1(x) floatBitsToUint(AF1(x))
 635   #define AU2_AF2(x) floatBitsToUint(AF2(x))
 636   #define AU3_AF3(x) floatBitsToUint(AF3(x))
 637   #define AU4_AF4(x) floatBitsToUint(AF4(x))
 638  //------------------------------------------------------------------------------------------------------------------------------
 // Float to half conversion returned in a uint: packHalf2x16() is called on (a,0.0), so the half of a lands in the
 // low 16 bits and the upper 16 bits hold the half encoding of 0.0. The macro constructs AF1 from its argument first.
 639   AU1 AU1_AH1_AF1_x(AF1 a){return packHalf2x16(AF2(a,0.0));}
 640   #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
 641  //------------------------------------------------------------------------------------------------------------------------------
 // Direct aliases for the GLSL pack/unpack built-ins.
 // Pack: vec2 -> two halves, vec2 -> two 16-bit unorms, vec4 -> four 8-bit unorms, each returned in one uint.
 // Unpack: the inverse direction, from one uint back to vec2/vec4.
 642   #define AU1_AH2_AF2 packHalf2x16
 643   #define AU1_AW2Unorm_AF2 packUnorm2x16
 644   #define AU1_AB4Unorm_AF4 packUnorm4x8
 645  //------------------------------------------------------------------------------------------------------------------------------
 646   #define AF2_AH2_AU1 unpackHalf2x16
 647   #define AF2_AW2Unorm_AU1 unpackUnorm2x16
 648   #define AF4_AB4Unorm_AU1 unpackUnorm4x8
 649  //==============================================================================================================================
 // Broadcast constructors: AFn_x(a) returns an n-component float vector with every component equal to a.
 // The AFn_(a) macros construct AF1 from the argument first, so any scalar accepted by float() can be passed.
 650   AF1 AF1_x(AF1 a){return AF1(a);}
 651   AF2 AF2_x(AF1 a){return AF2(a,a);}
 652   AF3 AF3_x(AF1 a){return AF3(a,a,a);}
 653   AF4 AF4_x(AF1 a){return AF4(a,a,a,a);}
 654   #define AF1_(a) AF1_x(AF1(a))
 655   #define AF2_(a) AF2_x(AF1(a))
 656   #define AF3_(a) AF3_x(AF1(a))
 657   #define AF4_(a) AF4_x(AF1(a))
 658  //------------------------------------------------------------------------------------------------------------------------------
 // Broadcast constructors: AUn_x(a) returns an n-component uint vector with every component equal to a.
 // The AUn_(a) macros construct AU1 from the argument first, so any scalar accepted by uint() can be passed.
 659   AU1 AU1_x(AU1 a){return AU1(a);}
 660   AU2 AU2_x(AU1 a){return AU2(a,a);}
 661   AU3 AU3_x(AU1 a){return AU3(a,a,a);}
 662   AU4 AU4_x(AU1 a){return AU4(a,a,a,a);}
 663   #define AU1_(a) AU1_x(AU1(a))
 664   #define AU2_(a) AU2_x(AU1(a))
 665   #define AU3_(a) AU3_x(AU1(a))
 666   #define AU4_(a) AU4_x(AU1(a))
 667  //==============================================================================================================================
 // Signed absolute value on values carried in unsigned types: the argument is converted to the matching int
 // type, abs() is applied, and the result is converted back to uint.
 668   AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));}
 669   AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));}
 670   AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));}
 671   AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));}
 672  //------------------------------------------------------------------------------------------------------------------------------
 // Bitfield helpers.
 // ABfe: extracts 'bits' bits of src starting at bit 'off' via bitfieldExtract().
 // ABfi: per-bit select; bits come from ins where mask is 1 and from src where mask is 0.
 673   AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));}
 674   AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));}
 675   // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
 // ABfiM: replaces the low 'bits' bits of src with the low 'bits' bits of ins (bitfieldInsert at offset 0).
 676   AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){return bitfieldInsert(src,ins,0,ASU1(bits));}
 677  //------------------------------------------------------------------------------------------------------------------------------
 678   // V_MED3_F32.
 // Component-wise clamp of x to the range [n,m], forwarded to GLSL clamp().
 679   AF1 AClampF1(AF1 x,AF1 n,AF1 m){return clamp(x,n,m);}
 680   AF2 AClampF2(AF2 x,AF2 n,AF2 m){return clamp(x,n,m);}
 681   AF3 AClampF3(AF3 x,AF3 n,AF3 m){return clamp(x,n,m);}
 682   AF4 AClampF4(AF4 x,AF4 n,AF4 m){return clamp(x,n,m);}
 683  //------------------------------------------------------------------------------------------------------------------------------
 684   // V_FRACT_F32 (note DX frac() is different).
 // Component-wise fractional part, forwarded to GLSL fract().
 685   AF1 AFractF1(AF1 x){return fract(x);}
 686   AF2 AFractF2(AF2 x){return fract(x);}
 687   AF3 AFractF3(AF3 x){return fract(x);}
 688   AF4 AFractF4(AF4 x){return fract(x);}
 689  //------------------------------------------------------------------------------------------------------------------------------
 // Component-wise linear interpolation between x and y with blend factor a, forwarded to GLSL mix().
 // In the vector overloads a is a vector, so every component has its own blend factor.
 690   AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return mix(x,y,a);}
 691   AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return mix(x,y,a);}
 692   AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return mix(x,y,a);}
 693   AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return mix(x,y,a);}
 694  //------------------------------------------------------------------------------------------------------------------------------
 695   // V_MAX3_F32.
 // Component-wise maximum of three float operands, built from two nested max() calls.
 696   AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));}
 697   AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));}
 698   AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));}
 699   AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));}
 700  //------------------------------------------------------------------------------------------------------------------------------
 // Component-wise SIGNED maximum of three operands carried in unsigned types: each operand is converted to the
 // matching int type for the comparison and the winner is converted back to uint.
 701   AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));}
 702   AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));}
 703   AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));}
 704   AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));}
 705  //------------------------------------------------------------------------------------------------------------------------------
 // Component-wise unsigned maximum of three operands, built from two nested max() calls.
 706   AU1 AMax3U1(AU1 x,AU1 y,AU1 z){return max(x,max(y,z));}
 707   AU2 AMax3U2(AU2 x,AU2 y,AU2 z){return max(x,max(y,z));}
 708   AU3 AMax3U3(AU3 x,AU3 y,AU3 z){return max(x,max(y,z));}
 709   AU4 AMax3U4(AU4 x,AU4 y,AU4 z){return max(x,max(y,z));}
 710  //------------------------------------------------------------------------------------------------------------------------------
 // Component-wise SIGNED maximum of two operands carried in unsigned types: both are converted to the matching
 // int type for the comparison and the result is converted back to uint.
 711   AU1 AMaxSU1(AU1 a,AU1 b){return AU1(max(ASU1(a),ASU1(b)));}
 712   AU2 AMaxSU2(AU2 a,AU2 b){return AU2(max(ASU2(a),ASU2(b)));}
 713   AU3 AMaxSU3(AU3 a,AU3 b){return AU3(max(ASU3(a),ASU3(b)));}
 714   AU4 AMaxSU4(AU4 a,AU4 b){return AU4(max(ASU4(a),ASU4(b)));}
 715  //------------------------------------------------------------------------------------------------------------------------------
 716   // Clamp has an easier pattern match for med3 when some ordering is known.
 717   // V_MED3_F32.
 // Component-wise median of three values with no ordering assumed: max(min(x,y), min(max(x,y), z)).
 718   AF1 AMed3F1(AF1 x,AF1 y,AF1 z){return max(min(x,y),min(max(x,y),z));}
 719   AF2 AMed3F2(AF2 x,AF2 y,AF2 z){return max(min(x,y),min(max(x,y),z));}
 720   AF3 AMed3F3(AF3 x,AF3 y,AF3 z){return max(min(x,y),min(max(x,y),z));}
 721   AF4 AMed3F4(AF4 x,AF4 y,AF4 z){return max(min(x,y),min(max(x,y),z));}
 722  //------------------------------------------------------------------------------------------------------------------------------
 723   // V_MIN3_F32.
 // Component-wise minimum of three float operands, built from two nested min() calls.
 724   AF1 AMin3F1(AF1 x,AF1 y,AF1 z){return min(x,min(y,z));}
 725   AF2 AMin3F2(AF2 x,AF2 y,AF2 z){return min(x,min(y,z));}
 726   AF3 AMin3F3(AF3 x,AF3 y,AF3 z){return min(x,min(y,z));}
 727   AF4 AMin3F4(AF4 x,AF4 y,AF4 z){return min(x,min(y,z));}
 728  //------------------------------------------------------------------------------------------------------------------------------
 // Component-wise SIGNED minimum of three operands carried in unsigned types: each operand is converted to the
 // matching int type for the comparison and the winner is converted back to uint.
 729   AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){return AU1(min(ASU1(x),min(ASU1(y),ASU1(z))));}
 730   AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){return AU2(min(ASU2(x),min(ASU2(y),ASU2(z))));}
 731   AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){return AU3(min(ASU3(x),min(ASU3(y),ASU3(z))));}
 732   AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){return AU4(min(ASU4(x),min(ASU4(y),ASU4(z))));}
 733  //------------------------------------------------------------------------------------------------------------------------------
 // Component-wise unsigned minimum of three operands, built from two nested min() calls.
 734   AU1 AMin3U1(AU1 x,AU1 y,AU1 z){return min(x,min(y,z));}
 735   AU2 AMin3U2(AU2 x,AU2 y,AU2 z){return min(x,min(y,z));}
 736   AU3 AMin3U3(AU3 x,AU3 y,AU3 z){return min(x,min(y,z));}
 737   AU4 AMin3U4(AU4 x,AU4 y,AU4 z){return min(x,min(y,z));}
 738  //------------------------------------------------------------------------------------------------------------------------------
 // Component-wise SIGNED minimum of two operands carried in unsigned types: both are converted to the matching
 // int type for the comparison and the result is converted back to uint.
 739   AU1 AMinSU1(AU1 a,AU1 b){return AU1(min(ASU1(a),ASU1(b)));}
 740   AU2 AMinSU2(AU2 a,AU2 b){return AU2(min(ASU2(a),ASU2(b)));}
 741   AU3 AMinSU3(AU3 a,AU3 b){return AU3(min(ASU3(a),ASU3(b)));}
 742   AU4 AMinSU4(AU4 a,AU4 b){return AU4(min(ASU4(a),ASU4(b)));}
 743  //------------------------------------------------------------------------------------------------------------------------------
 744   // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
 745   // V_COS_F32.
 // Emulated as cos(x*A_2PI) per component. A_2PI is defined outside this chunk (presumably 2*pi, which would make
 // x a count of full turns; confirm against its definition).
 746   AF1 ANCosF1(AF1 x){return cos(x*AF1_(A_2PI));}
 747   AF2 ANCosF2(AF2 x){return cos(x*AF2_(A_2PI));}
 748   AF3 ANCosF3(AF3 x){return cos(x*AF3_(A_2PI));}
 749   AF4 ANCosF4(AF4 x){return cos(x*AF4_(A_2PI));}
 750  //------------------------------------------------------------------------------------------------------------------------------
 751   // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
 752   // V_SIN_F32.
 // Emulated as sin(x*A_2PI) per component. A_2PI is defined outside this chunk (presumably 2*pi, which would make
 // x a count of full turns; confirm against its definition).
 753   AF1 ANSinF1(AF1 x){return sin(x*AF1_(A_2PI));}
 754   AF2 ANSinF2(AF2 x){return sin(x*AF2_(A_2PI));}
 755   AF3 ANSinF3(AF3 x){return sin(x*AF3_(A_2PI));}
 756   AF4 ANSinF4(AF4 x){return sin(x*AF4_(A_2PI));}
 757  //------------------------------------------------------------------------------------------------------------------------------
 // Component-wise reciprocal written as a plain division 1.0/x; no special handling of x==0 is added here.
 758   AF1 ARcpF1(AF1 x){return AF1_(1.0)/x;}
 759   AF2 ARcpF2(AF2 x){return AF2_(1.0)/x;}
 760   AF3 ARcpF3(AF3 x){return AF3_(1.0)/x;}
 761   AF4 ARcpF4(AF4 x){return AF4_(1.0)/x;}
 762  //------------------------------------------------------------------------------------------------------------------------------
 // Component-wise reciprocal square root written as 1.0/sqrt(x); no special handling of x<=0 is added here.
 763   AF1 ARsqF1(AF1 x){return AF1_(1.0)/sqrt(x);}
 764   AF2 ARsqF2(AF2 x){return AF2_(1.0)/sqrt(x);}
 765   AF3 ARsqF3(AF3 x){return AF3_(1.0)/sqrt(x);}
 766   AF4 ARsqF4(AF4 x){return AF4_(1.0)/sqrt(x);}
 767  //------------------------------------------------------------------------------------------------------------------------------
 // Saturate: component-wise clamp of x to the range [0.0,1.0].
 768   AF1 ASatF1(AF1 x){return clamp(x,AF1_(0.0),AF1_(1.0));}
 769   AF2 ASatF2(AF2 x){return clamp(x,AF2_(0.0),AF2_(1.0));}
 770   AF3 ASatF3(AF3 x){return clamp(x,AF3_(0.0),AF3_(1.0));}
 771   AF4 ASatF4(AF4 x){return clamp(x,AF4_(0.0),AF4_(1.0));}
 772  //------------------------------------------------------------------------------------------------------------------------------
 // Right shift performed on SIGNED operands: a and b are converted to the matching int type, shifted with >>,
 // and the result is converted back to uint. Each component of a is shifted by the matching component of b.
 773   AU1 AShrSU1(AU1 a,AU1 b){return AU1(ASU1(a)>>ASU1(b));}
 774   AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));}
 775   AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));}
 776   AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));}
 777  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 778  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 779  //_____________________________________________________________/\_______________________________________________________________
 780  //==============================================================================================================================
 781  //                                                          GLSL BYTE
 782  //==============================================================================================================================
 783   #ifdef A_BYTE
  // Optional 8-bit integer support, compiled only when A_BYTE is defined.
  // AB* = uint8_t / u8vecN (unsigned), ASB* = int8_t / i8vecN (signed).
  // NOTE(review): the visible A_SKIP_EXT block enables no extension for A_BYTE; presumably the including shader
  // must enable 8-bit types itself. Confirm.
 784    #define AB1 uint8_t
 785    #define AB2 u8vec2
 786    #define AB3 u8vec3
 787    #define AB4 u8vec4
 788  //------------------------------------------------------------------------------------------------------------------------------
 789    #define ASB1 int8_t
 790    #define ASB2 i8vec2
 791    #define ASB3 i8vec3
 792    #define ASB4 i8vec4
 793  //------------------------------------------------------------------------------------------------------------------------------
  // Broadcast constructors: ABn_x(a) returns an n-component byte vector with every component equal to a.
  // The ABn_(a) macros construct AB1 from the argument first.
 794    AB1 AB1_x(AB1 a){return AB1(a);}
 795    AB2 AB2_x(AB1 a){return AB2(a,a);}
 796    AB3 AB3_x(AB1 a){return AB3(a,a,a);}
 797    AB4 AB4_x(AB1 a){return AB4(a,a,a,a);}
 798    #define AB1_(a) AB1_x(AB1(a))
 799    #define AB2_(a) AB2_x(AB1(a))
 800    #define AB3_(a) AB3_x(AB1(a))
 801    #define AB4_(a) AB4_x(AB1(a))
 802   #endif
 803  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 804  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 805  //_____________________________________________________________/\_______________________________________________________________
 806  //==============================================================================================================================
 807  //                                                          GLSL HALF
 808  //==============================================================================================================================
 809   #ifdef A_HALF
  // 16-bit type names. The digit is the component count.
  //  AH* = float16_t / f16vecN, AW* = uint16_t / u16vecN, ASW* = int16_t / i16vecN.
 810    #define AH1 float16_t
 811    #define AH2 f16vec2
 812    #define AH3 f16vec3
 813    #define AH4 f16vec4
 814  //------------------------------------------------------------------------------------------------------------------------------
 815    #define AW1 uint16_t
 816    #define AW2 u16vec2
 817    #define AW3 u16vec3
 818    #define AW4 u16vec4
 819  //------------------------------------------------------------------------------------------------------------------------------
 820    #define ASW1 int16_t
 821    #define ASW2 i16vec2
 822    #define ASW3 i16vec3
 823    #define ASW4 i16vec4
 824  //==============================================================================================================================
  // Unpack 32-bit words into 16-bit vectors.
  //  AH2_AU1: one uint -> two halves. AH4_AU2: uvec2 -> four halves (x.x gives the first pair, x.y the second).
  //  AW2_AU1: one uint -> two 16-bit uints. AW4_AU2: uvec2 -> four 16-bit uints, going through pack64().
 825    #define AH2_AU1(x) unpackFloat2x16(AU1(x))
 826    AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));}
 827    #define AH4_AU2(x) AH4_AU2_x(AU2(x))
 828    #define AW2_AU1(x) unpackUint2x16(AU1(x))
 829    #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x)))
 830  //------------------------------------------------------------------------------------------------------------------------------
  // Pack 16-bit vectors into 32-bit words (inverse of the macros above).
  //  AU1_AH2: two halves -> one uint. AU2_AH4: four halves -> uvec2 (x.xy packed first, x.zw second).
  //  AU1_AW2: two 16-bit uints -> one uint. AU2_AW4: four 16-bit uints -> uvec2, going through unpack32().
 831    #define AU1_AH2(x) packFloat2x16(AH2(x))
 832    AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));}
 833    #define AU2_AH4(x) AU2_AH4_x(AH4(x))
 834    #define AU1_AW2(x) packUint2x16(AW2(x))
 835    #define AU2_AW4(x) unpack32(packUint4x16(AW4(x)))
 836  //==============================================================================================================================
        // Half float -> unsigned 16-bit, routed through halfBitsToUint16 (a bit reinterpretation, per the intrinsic's name).
 837    #define AW1_AH1(x) halfBitsToUint16(AH1(x))
 838    #define AW2_AH2(x) halfBitsToUint16(AH2(x))
 839    #define AW3_AH3(x) halfBitsToUint16(AH3(x))
 840    #define AW4_AH4(x) halfBitsToUint16(AH4(x))
 841  //------------------------------------------------------------------------------------------------------------------------------
        // Unsigned 16-bit -> half float, routed through uint16BitsToHalf (the reverse reinterpretation).
 842    #define AH1_AW1(x) uint16BitsToHalf(AW1(x))
 843    #define AH2_AW2(x) uint16BitsToHalf(AW2(x))
 844    #define AH3_AW3(x) uint16BitsToHalf(AW3(x))
 845    #define AH4_AW4(x) uint16BitsToHalf(AW4(x))
 846  //==============================================================================================================================
        // Splat constructors: replicate one half-float scalar into every lane of an AH1..AH4 value.
        // The AHn_(a) macros cast the argument to AH1 before calling the helper.
 847    AH1 AH1_x(AH1 a){return AH1(a);}
 848    AH2 AH2_x(AH1 a){return AH2(a,a);}
 849    AH3 AH3_x(AH1 a){return AH3(a,a,a);}
 850    AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
 851    #define AH1_(a) AH1_x(AH1(a))
 852    #define AH2_(a) AH2_x(AH1(a))
 853    #define AH3_(a) AH3_x(AH1(a))
 854    #define AH4_(a) AH4_x(AH1(a))
 855  //------------------------------------------------------------------------------------------------------------------------------
        // Same splat pattern for unsigned 16-bit integers (AW1..AW4).
 856    AW1 AW1_x(AW1 a){return AW1(a);}
 857    AW2 AW2_x(AW1 a){return AW2(a,a);}
 858    AW3 AW3_x(AW1 a){return AW3(a,a,a);}
 859    AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
 860    #define AW1_(a) AW1_x(AW1(a))
 861    #define AW2_(a) AW2_x(AW1(a))
 862    #define AW3_(a) AW3_x(AW1(a))
 863    #define AW4_(a) AW4_x(AW1(a))
 864  //==============================================================================================================================
        // Absolute value of each lane treated as a signed 16-bit integer (cast to ASW*), returned in the unsigned AW* type.
 865    AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));}
 866    AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
 867    AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
 868    AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
 869  //------------------------------------------------------------------------------------------------------------------------------
        // Clamp x to the range [n,m], per lane.
 870    AH1 AClampH1(AH1 x,AH1 n,AH1 m){return clamp(x,n,m);}
 871    AH2 AClampH2(AH2 x,AH2 n,AH2 m){return clamp(x,n,m);}
 872    AH3 AClampH3(AH3 x,AH3 n,AH3 m){return clamp(x,n,m);}
 873    AH4 AClampH4(AH4 x,AH4 n,AH4 m){return clamp(x,n,m);}
 874  //------------------------------------------------------------------------------------------------------------------------------
        // Fractional part of x, per lane.
 875    AH1 AFractH1(AH1 x){return fract(x);}
 876    AH2 AFractH2(AH2 x){return fract(x);}
 877    AH3 AFractH3(AH3 x){return fract(x);}
 878    AH4 AFractH4(AH4 x){return fract(x);}
 879  //------------------------------------------------------------------------------------------------------------------------------
        // Linear interpolation from x to y by weight a (GLSL mix), per lane.
 880    AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);}
 881    AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);}
 882    AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);}
 883    AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);}
 884  //------------------------------------------------------------------------------------------------------------------------------
 885    // No packed version of max3.
        // Maximum of three values, built from two nested max calls.
 886    AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
 887    AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
 888    AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
 889    AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
 890  //------------------------------------------------------------------------------------------------------------------------------
        // Signed maximum of 16-bit lanes that are carried in the unsigned AW* types.
        // Operands are cast through the 16-bit signed ASW* types (as AAbsSW*/AShrSW* do) so the sign bit is honoured.
        // Casting through the 32-bit ASU* types would zero-extend the lanes and compare them as unsigned values.
 891    AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));}
 892    AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));}
 893    AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
 894    AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
 895  //------------------------------------------------------------------------------------------------------------------------------
 896    // No packed version of min3.
        // Minimum of three values, built from two nested min calls.
 897    AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
 898    AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
 899    AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
 900    AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
 901  //------------------------------------------------------------------------------------------------------------------------------
        // Signed minimum of 16-bit lanes that are carried in the unsigned AW* types.
        // Operands are cast through the 16-bit signed ASW* types (as AAbsSW*/AShrSW* do) so the sign bit is honoured.
        // Casting through the 32-bit ASU* types would zero-extend the lanes and compare them as unsigned values.
 902    AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));}
 903    AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));}
 904    AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
 905    AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
 906  //------------------------------------------------------------------------------------------------------------------------------
        // Reciprocal: 1.0/x per lane.
 907    AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;}
 908    AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;}
 909    AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;}
 910    AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;}
 911  //------------------------------------------------------------------------------------------------------------------------------
        // Reciprocal square root: 1.0/sqrt(x) per lane.
 912    AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);}
 913    AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);}
 914    AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);}
 915    AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);}
 916  //------------------------------------------------------------------------------------------------------------------------------
        // Saturate: clamp x to [0.0,1.0] per lane.
 917    AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));}
 918    AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));}
 919    AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));}
 920    AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));}
 921  //------------------------------------------------------------------------------------------------------------------------------
        // Right shift performed on the signed 16-bit view (ASW*) of a and b; the result is returned as unsigned AW*.
 922    AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));}
 923    AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
 924    AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
 925    AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
 926   #endif
 927  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 928  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 929  //_____________________________________________________________/\_______________________________________________________________
 930  //==============================================================================================================================
 931  //                                                         GLSL DOUBLE
 932  //==============================================================================================================================
 933   #ifdef A_DUBL
        // Double-precision support for the GLSL path, compiled only when A_DUBL is defined.
        // AD1..AD4 alias double/dvec2/dvec3/dvec4.
 934    #define AD1 double
 935    #define AD2 dvec2
 936    #define AD3 dvec3
 937    #define AD4 dvec4
 938  //------------------------------------------------------------------------------------------------------------------------------
        // Splat constructors: replicate one double into every lane; the ADn_(a) macros cast the argument to AD1 first.
 939    AD1 AD1_x(AD1 a){return AD1(a);}
 940    AD2 AD2_x(AD1 a){return AD2(a,a);}
 941    AD3 AD3_x(AD1 a){return AD3(a,a,a);}
 942    AD4 AD4_x(AD1 a){return AD4(a,a,a,a);}
 943    #define AD1_(a) AD1_x(AD1(a))
 944    #define AD2_(a) AD2_x(AD1(a))
 945    #define AD3_(a) AD3_x(AD1(a))
 946    #define AD4_(a) AD4_x(AD1(a))
 947  //==============================================================================================================================
        // Fractional part of x, per lane.
 948    AD1 AFractD1(AD1 x){return fract(x);}
 949    AD2 AFractD2(AD2 x){return fract(x);}
 950    AD3 AFractD3(AD3 x){return fract(x);}
 951    AD4 AFractD4(AD4 x){return fract(x);}
 952  //------------------------------------------------------------------------------------------------------------------------------
        // Linear interpolation from x to y by weight a (GLSL mix), per lane.
 953    AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return mix(x,y,a);}
 954    AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return mix(x,y,a);}
 955    AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return mix(x,y,a);}
 956    AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return mix(x,y,a);}
 957  //------------------------------------------------------------------------------------------------------------------------------
        // Reciprocal: 1.0/x per lane.
 958    AD1 ARcpD1(AD1 x){return AD1_(1.0)/x;}
 959    AD2 ARcpD2(AD2 x){return AD2_(1.0)/x;}
 960    AD3 ARcpD3(AD3 x){return AD3_(1.0)/x;}
 961    AD4 ARcpD4(AD4 x){return AD4_(1.0)/x;}
 962  //------------------------------------------------------------------------------------------------------------------------------
        // Reciprocal square root: 1.0/sqrt(x) per lane.
 963    AD1 ARsqD1(AD1 x){return AD1_(1.0)/sqrt(x);}
 964    AD2 ARsqD2(AD2 x){return AD2_(1.0)/sqrt(x);}
 965    AD3 ARsqD3(AD3 x){return AD3_(1.0)/sqrt(x);}
 966    AD4 ARsqD4(AD4 x){return AD4_(1.0)/sqrt(x);}
 967  //------------------------------------------------------------------------------------------------------------------------------
        // Saturate: clamp x to [0.0,1.0] per lane.
 968    AD1 ASatD1(AD1 x){return clamp(x,AD1_(0.0),AD1_(1.0));}
 969    AD2 ASatD2(AD2 x){return clamp(x,AD2_(0.0),AD2_(1.0));}
 970    AD3 ASatD3(AD3 x){return clamp(x,AD3_(0.0),AD3_(1.0));}
 971    AD4 ASatD4(AD4 x){return clamp(x,AD4_(0.0),AD4_(1.0));}
 972   #endif
 973  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 974  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 975  //_____________________________________________________________/\_______________________________________________________________
 976  //==============================================================================================================================
 977  //                                                         GLSL LONG
 978  //==============================================================================================================================
 979   #ifdef A_LONG
        // 64-bit integer support for the GLSL path, compiled only when A_LONG is defined.
        // AL* = unsigned 64-bit, ASL* = signed 64-bit; the trailing digit is the component count.
 980    #define AL1 uint64_t
 981    #define AL2 u64vec2
 982    #define AL3 u64vec3
 983    #define AL4 u64vec4
 984  //------------------------------------------------------------------------------------------------------------------------------
 985    #define ASL1 int64_t
 986    #define ASL2 i64vec2
 987    #define ASL3 i64vec3
 988    #define ASL4 i64vec4
 989  //------------------------------------------------------------------------------------------------------------------------------
        // Conversion between one 64-bit value and a pair of 32-bit words (packUint2x32 / unpackUint2x32).
 990    #define AL1_AU2(x) packUint2x32(AU2(x))
 991    #define AU2_AL1(x) unpackUint2x32(AL1(x))
 992  //------------------------------------------------------------------------------------------------------------------------------
        // Splat constructors: replicate one 64-bit scalar into every lane; the ALn_(a) macros cast the argument to AL1 first.
 993    AL1 AL1_x(AL1 a){return AL1(a);}
 994    AL2 AL2_x(AL1 a){return AL2(a,a);}
 995    AL3 AL3_x(AL1 a){return AL3(a,a,a);}
 996    AL4 AL4_x(AL1 a){return AL4(a,a,a,a);}
 997    #define AL1_(a) AL1_x(AL1(a))
 998    #define AL2_(a) AL2_x(AL1(a))
 999    #define AL3_(a) AL3_x(AL1(a))
1000    #define AL4_(a) AL4_x(AL1(a))
1001  //==============================================================================================================================
        // Absolute value of each lane treated as a signed 64-bit integer (cast to ASL*), returned in the unsigned AL* type.
1002    AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));}
1003    AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));}
1004    AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));}
1005    AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));}
1006  //------------------------------------------------------------------------------------------------------------------------------
        // Signed maximum of 64-bit lanes that are carried in the unsigned AL* types.
        // Operands are cast through the 64-bit signed ASL* types (as AAbsSL* does) so all 64 bits are compared.
        // Casting through the 32-bit ASU* types would discard the upper half of every operand.
1007    AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASL1(a),ASL1(b)));}
1008    AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASL2(a),ASL2(b)));}
1009    AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASL3(a),ASL3(b)));}
1010    AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASL4(a),ASL4(b)));}
1011  //------------------------------------------------------------------------------------------------------------------------------
        // Signed minimum of 64-bit lanes that are carried in the unsigned AL* types.
        // Operands are cast through the 64-bit signed ASL* types (as AAbsSL* does) so all 64 bits are compared.
        // Casting through the 32-bit ASU* types would discard the upper half of every operand.
1012    AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASL1(a),ASL1(b)));}
1013    AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASL2(a),ASL2(b)));}
1014    AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASL3(a),ASL3(b)));}
1015    AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASL4(a),ASL4(b)));}
1016   #endif
1017  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1018  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1019  //_____________________________________________________________/\_______________________________________________________________
1020  //==============================================================================================================================
1021  //                                                      WAVE OPERATIONS
1022  //==============================================================================================================================
1023   #ifdef A_WAVE
        // Subgroup ("wave") helpers, compiled only when A_WAVE is defined.
        // Each wrapper forwards value v and lane mask x to subgroupShuffleXor.
1024    // Where 'x' must be a compile time literal.
1025    AF1 AWaveXorF1(AF1 v,AU1 x){return subgroupShuffleXor(v,x);}
1026    AF2 AWaveXorF2(AF2 v,AU1 x){return subgroupShuffleXor(v,x);}
1027    AF3 AWaveXorF3(AF3 v,AU1 x){return subgroupShuffleXor(v,x);}
1028    AF4 AWaveXorF4(AF4 v,AU1 x){return subgroupShuffleXor(v,x);}
1029    AU1 AWaveXorU1(AU1 v,AU1 x){return subgroupShuffleXor(v,x);}
1030    AU2 AWaveXorU2(AU2 v,AU1 x){return subgroupShuffleXor(v,x);}
1031    AU3 AWaveXorU3(AU3 v,AU1 x){return subgroupShuffleXor(v,x);}
1032    AU4 AWaveXorU4(AU4 v,AU1 x){return subgroupShuffleXor(v,x);}
1033  //------------------------------------------------------------------------------------------------------------------------------
1034    #ifdef A_HALF
         // 16-bit variants: the value is packed into 32-bit words, shuffled, then unpacked back to 16-bit lanes.
1035     AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(subgroupShuffleXor(AU1_AH2(v),x));}
1036     AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(subgroupShuffleXor(AU2_AH4(v),x));}
1037     AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(subgroupShuffleXor(AU1_AW2(v),x));}
1038     AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(subgroupShuffleXor(AU2_AW4(v),x));}
1039    #endif
1040   #endif
1041  //==============================================================================================================================
1042  #endif
1043  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1044  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1045  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1046  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1047  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1048  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1049  //_____________________________________________________________/\_______________________________________________________________
1050  //==============================================================================================================================
1051  //
1052  //
1053  //                                                            HLSL
1054  //
1055  //
1056  //==============================================================================================================================
1057  #if defined(A_HLSL) && defined(A_GPU)
       // Base type aliases for the HLSL path; the trailing digit is the component count.
       // AP* = bool, AF* = 32-bit float, AU* = unsigned int, ASU* = signed int.
       // With A_HLSL_6_2 the explicitly sized float32_t/uint32_t/int32_t names are used instead of float/uint/int.
1058   #ifdef A_HLSL_6_2
1059    #define AP1 bool
1060    #define AP2 bool2
1061    #define AP3 bool3
1062    #define AP4 bool4
1063  //------------------------------------------------------------------------------------------------------------------------------
1064    #define AF1 float32_t
1065    #define AF2 float32_t2
1066    #define AF3 float32_t3
1067    #define AF4 float32_t4
1068  //------------------------------------------------------------------------------------------------------------------------------
1069    #define AU1 uint32_t
1070    #define AU2 uint32_t2
1071    #define AU3 uint32_t3
1072    #define AU4 uint32_t4
1073  //------------------------------------------------------------------------------------------------------------------------------
1074    #define ASU1 int32_t
1075    #define ASU2 int32_t2
1076    #define ASU3 int32_t3
1077    #define ASU4 int32_t4
1078   #else
1079    #define AP1 bool
1080    #define AP2 bool2
1081    #define AP3 bool3
1082    #define AP4 bool4
1083  //------------------------------------------------------------------------------------------------------------------------------
1084    #define AF1 float
1085    #define AF2 float2
1086    #define AF3 float3
1087    #define AF4 float4
1088  //------------------------------------------------------------------------------------------------------------------------------
1089    #define AU1 uint
1090    #define AU2 uint2
1091    #define AU3 uint3
1092    #define AU4 uint4
1093  //------------------------------------------------------------------------------------------------------------------------------
1094    #define ASU1 int
1095    #define ASU2 int2
1096    #define ASU3 int3
1097    #define ASU4 int4
1098   #endif
1099  //==============================================================================================================================
       // uint -> float via asfloat; the argument is first coerced to the matching AU* type.
1100   #define AF1_AU1(x) asfloat(AU1(x))
1101   #define AF2_AU2(x) asfloat(AU2(x))
1102   #define AF3_AU3(x) asfloat(AU3(x))
1103   #define AF4_AU4(x) asfloat(AU4(x))
1104  //------------------------------------------------------------------------------------------------------------------------------
       // float -> uint via asuint; the argument is first coerced to the matching AF* type.
1105   #define AU1_AF1(x) asuint(AF1(x))
1106   #define AU2_AF2(x) asuint(AF2(x))
1107   #define AU3_AF3(x) asuint(AF3(x))
1108   #define AU4_AF4(x) asuint(AF4(x))
1109  //------------------------------------------------------------------------------------------------------------------------------
       // One float converted with f32tof16 and returned as a uint.
1110   AU1 AU1_AH1_AF1_x(AF1 a){return f32tof16(a);}
1111   #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
1112  //------------------------------------------------------------------------------------------------------------------------------
       // Two floats converted with f32tof16 and packed into one uint: a.x in the low 16 bits, a.y in the high 16 bits.
1113   AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);}
1114   #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a)) 
       // NOTE(review): forwards to D3DCOLORtoUBYTE4; confirm it produces the packed value the macro name suggests.
1115   #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x))
1116  //------------------------------------------------------------------------------------------------------------------------------
       // Inverse of AU1_AH2_AF2: low 16 bits -> x, high 16 bits -> y, each expanded with f16tof32.
1117   AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));}
1118   #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x))
1119  //==============================================================================================================================
       // Splat constructors: replicate one float into every lane; the AFn_(a) macros cast the argument to AF1 first.
1120   AF1 AF1_x(AF1 a){return AF1(a);}
1121   AF2 AF2_x(AF1 a){return AF2(a,a);}
1122   AF3 AF3_x(AF1 a){return AF3(a,a,a);}
1123   AF4 AF4_x(AF1 a){return AF4(a,a,a,a);}
1124   #define AF1_(a) AF1_x(AF1(a))
1125   #define AF2_(a) AF2_x(AF1(a))
1126   #define AF3_(a) AF3_x(AF1(a))
1127   #define AF4_(a) AF4_x(AF1(a))
1128  //------------------------------------------------------------------------------------------------------------------------------
       // Same splat pattern for unsigned integers (AU1..AU4).
1129   AU1 AU1_x(AU1 a){return AU1(a);}
1130   AU2 AU2_x(AU1 a){return AU2(a,a);}
1131   AU3 AU3_x(AU1 a){return AU3(a,a,a);}
1132   AU4 AU4_x(AU1 a){return AU4(a,a,a,a);}
1133   #define AU1_(a) AU1_x(AU1(a))
1134   #define AU2_(a) AU2_x(AU1(a))
1135   #define AU3_(a) AU3_x(AU1(a))
1136   #define AU4_(a) AU4_x(AU1(a))
1137  //==============================================================================================================================
       // Absolute value of each lane treated as a signed integer (cast to ASU*), returned in the unsigned AU* type.
1138   AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));}
1139   AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));}
1140   AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));}
1141   AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));}
1142  //------------------------------------------------------------------------------------------------------------------------------
       // ABfe : extract 'bits' bits of src starting at bit 'off' (shift right, then mask).
       // ABfi : take the bits of 'ins' where 'mask' is set and the bits of 'src' elsewhere.
       // ABfiM: like ABfi, with the mask covering the low 'bits' bits.
       // NOTE(review): the mask is built as (1u<<bits)-1, so bits>=32 relies on the target's shift behaviour;
       // confirm callers never pass a full-width field.
1143   AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1u<<bits)-1;return (src>>off)&mask;}
1144   AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));}
1145   AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1u<<bits)-1;return (ins&mask)|(src&(~mask));}
1146  //------------------------------------------------------------------------------------------------------------------------------
       // Clamp x to [n,m], written as max(n,min(x,m)): the upper bound is applied first, then the lower bound.
1147   AF1 AClampF1(AF1 x,AF1 n,AF1 m){return max(n,min(x,m));}
1148   AF2 AClampF2(AF2 x,AF2 n,AF2 m){return max(n,min(x,m));}
1149   AF3 AClampF3(AF3 x,AF3 n,AF3 m){return max(n,min(x,m));}
1150   AF4 AClampF4(AF4 x,AF4 n,AF4 m){return max(n,min(x,m));}
1151  //------------------------------------------------------------------------------------------------------------------------------
       // Fractional part, computed as x-floor(x).
1152   AF1 AFractF1(AF1 x){return x-floor(x);}
1153   AF2 AFractF2(AF2 x){return x-floor(x);}
1154   AF3 AFractF3(AF3 x){return x-floor(x);}
1155   AF4 AFractF4(AF4 x){return x-floor(x);}
1156  //------------------------------------------------------------------------------------------------------------------------------
       // Linear interpolation from x to y by weight a (HLSL lerp), per lane.
1157   AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return lerp(x,y,a);}
1158   AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return lerp(x,y,a);}
1159   AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return lerp(x,y,a);}
1160   AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return lerp(x,y,a);}
1161  //------------------------------------------------------------------------------------------------------------------------------
       // Maximum of three floats, built from two nested max calls.
1162   AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));}
1163   AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));}
1164   AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));}
1165   AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));}
1166  //------------------------------------------------------------------------------------------------------------------------------
       // Maximum of three values compared as signed integers (cast to ASU*), returned as unsigned AU*.
1167   AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));}
1168   AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));}
1169   AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));}
1170   AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));}
1171  //------------------------------------------------------------------------------------------------------------------------------
       // Maximum of three unsigned integers.
1172   AU1 AMax3U1(AU1 x,AU1 y,AU1 z){return max(x,max(y,z));}
1173   AU2 AMax3U2(AU2 x,AU2 y,AU2 z){return max(x,max(y,z));}
1174   AU3 AMax3U3(AU3 x,AU3 y,AU3 z){return max(x,max(y,z));}
1175   AU4 AMax3U4(AU4 x,AU4 y,AU4 z){return max(x,max(y,z));}
1176  //------------------------------------------------------------------------------------------------------------------------------
       // Maximum of two values compared as signed integers (cast to ASU*), returned as unsigned AU*.
1177   AU1 AMaxSU1(AU1 a,AU1 b){return AU1(max(ASU1(a),ASU1(b)));}
1178   AU2 AMaxSU2(AU2 a,AU2 b){return AU2(max(ASU2(a),ASU2(b)));}
1179   AU3 AMaxSU3(AU3 a,AU3 b){return AU3(max(ASU3(a),ASU3(b)));}
1180   AU4 AMaxSU4(AU4 a,AU4 b){return AU4(max(ASU4(a),ASU4(b)));}
1181  //------------------------------------------------------------------------------------------------------------------------------
       // Median of three: the larger of min(x,y) and min(max(x,y),z).
1182   AF1 AMed3F1(AF1 x,AF1 y,AF1 z){return max(min(x,y),min(max(x,y),z));}
1183   AF2 AMed3F2(AF2 x,AF2 y,AF2 z){return max(min(x,y),min(max(x,y),z));}
1184   AF3 AMed3F3(AF3 x,AF3 y,AF3 z){return max(min(x,y),min(max(x,y),z));}
1185   AF4 AMed3F4(AF4 x,AF4 y,AF4 z){return max(min(x,y),min(max(x,y),z));}
1186  //------------------------------------------------------------------------------------------------------------------------------
       // Minimum of three floats, built from two nested min calls.
1187   AF1 AMin3F1(AF1 x,AF1 y,AF1 z){return min(x,min(y,z));}
1188   AF2 AMin3F2(AF2 x,AF2 y,AF2 z){return min(x,min(y,z));}
1189   AF3 AMin3F3(AF3 x,AF3 y,AF3 z){return min(x,min(y,z));}
1190   AF4 AMin3F4(AF4 x,AF4 y,AF4 z){return min(x,min(y,z));}
1191  //------------------------------------------------------------------------------------------------------------------------------
       // Minimum of three values compared as signed integers (cast to ASU*), returned as unsigned AU*.
1192   AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){return AU1(min(ASU1(x),min(ASU1(y),ASU1(z))));}
1193   AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){return AU2(min(ASU2(x),min(ASU2(y),ASU2(z))));}
1194   AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){return AU3(min(ASU3(x),min(ASU3(y),ASU3(z))));}
1195   AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){return AU4(min(ASU4(x),min(ASU4(y),ASU4(z))));}
1196  //------------------------------------------------------------------------------------------------------------------------------
       // Minimum of three unsigned integers.
1197   AU1 AMin3U1(AU1 x,AU1 y,AU1 z){return min(x,min(y,z));}
1198   AU2 AMin3U2(AU2 x,AU2 y,AU2 z){return min(x,min(y,z));}
1199   AU3 AMin3U3(AU3 x,AU3 y,AU3 z){return min(x,min(y,z));}
1200   AU4 AMin3U4(AU4 x,AU4 y,AU4 z){return min(x,min(y,z));}
1201  //------------------------------------------------------------------------------------------------------------------------------
       // Minimum of two values compared as signed integers (cast to ASU*), returned as unsigned AU*.
1202   AU1 AMinSU1(AU1 a,AU1 b){return AU1(min(ASU1(a),ASU1(b)));}
1203   AU2 AMinSU2(AU2 a,AU2 b){return AU2(min(ASU2(a),ASU2(b)));}
1204   AU3 AMinSU3(AU3 a,AU3 b){return AU3(min(ASU3(a),ASU3(b)));}
1205   AU4 AMinSU4(AU4 a,AU4 b){return AU4(min(ASU4(a),ASU4(b)));}
1206  //------------------------------------------------------------------------------------------------------------------------------
       // Cosine of x scaled by A_2PI (defined outside this section; presumably 2*pi, so x is in turns rather than radians).
1207   AF1 ANCosF1(AF1 x){return cos(x*AF1_(A_2PI));}
1208   AF2 ANCosF2(AF2 x){return cos(x*AF2_(A_2PI));}
1209   AF3 ANCosF3(AF3 x){return cos(x*AF3_(A_2PI));}
1210   AF4 ANCosF4(AF4 x){return cos(x*AF4_(A_2PI));}
1211  //------------------------------------------------------------------------------------------------------------------------------
       // Sine counterpart, with the same A_2PI scaling of the argument.
1212   AF1 ANSinF1(AF1 x){return sin(x*AF1_(A_2PI));}
1213   AF2 ANSinF2(AF2 x){return sin(x*AF2_(A_2PI));}
1214   AF3 ANSinF3(AF3 x){return sin(x*AF3_(A_2PI));}
1215   AF4 ANSinF4(AF4 x){return sin(x*AF4_(A_2PI));}
1216  //------------------------------------------------------------------------------------------------------------------------------
       // Reciprocal, via the HLSL rcp intrinsic.
1217   AF1 ARcpF1(AF1 x){return rcp(x);}
1218   AF2 ARcpF2(AF2 x){return rcp(x);}
1219   AF3 ARcpF3(AF3 x){return rcp(x);}
1220   AF4 ARcpF4(AF4 x){return rcp(x);}
1221  //------------------------------------------------------------------------------------------------------------------------------
       // Reciprocal square root, via the HLSL rsqrt intrinsic.
1222   AF1 ARsqF1(AF1 x){return rsqrt(x);}
1223   AF2 ARsqF2(AF2 x){return rsqrt(x);}
1224   AF3 ARsqF3(AF3 x){return rsqrt(x);}
1225   AF4 ARsqF4(AF4 x){return rsqrt(x);}
1226  //------------------------------------------------------------------------------------------------------------------------------
       // Saturate, via the HLSL saturate intrinsic.
1227   AF1 ASatF1(AF1 x){return saturate(x);}
1228   AF2 ASatF2(AF2 x){return saturate(x);}
1229   AF3 ASatF3(AF3 x){return saturate(x);}
1230   AF4 ASatF4(AF4 x){return saturate(x);}
1231  //------------------------------------------------------------------------------------------------------------------------------
       // Right shift performed on the signed view (ASU*) of a and b; the result is returned as unsigned AU*.
1232   AU1 AShrSU1(AU1 a,AU1 b){return AU1(ASU1(a)>>ASU1(b));}
1233   AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));}
1234   AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));}
1235   AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));}
1236  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1237  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1238  //_____________________________________________________________/\_______________________________________________________________
1239  //==============================================================================================================================
1240  //                                                          HLSL BYTE
1241  //==============================================================================================================================
1242   #ifdef A_BYTE
1243   #endif
1244  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1245  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1246  //_____________________________________________________________/\_______________________________________________________________
1247  //==============================================================================================================================
1248  //                                                          HLSL HALF
1249  //==============================================================================================================================
1250   #ifdef A_HALF
        // 16-bit type aliases for the HLSL path; the trailing digit is the component count.
        // With A_HLSL_6_2 the float16_t/uint16_t/int16_t types are used; otherwise the min16float/min16uint/min16int types.
1251    #ifdef A_HLSL_6_2
1252     #define AH1 float16_t
1253     #define AH2 float16_t2
1254     #define AH3 float16_t3
1255     #define AH4 float16_t4
1256  //------------------------------------------------------------------------------------------------------------------------------
1257     #define AW1 uint16_t
1258     #define AW2 uint16_t2
1259     #define AW3 uint16_t3
1260     #define AW4 uint16_t4
1261  //------------------------------------------------------------------------------------------------------------------------------
1262     #define ASW1 int16_t
1263     #define ASW2 int16_t2
1264     #define ASW3 int16_t3
1265     #define ASW4 int16_t4
1266    #else
1267     #define AH1 min16float
1268     #define AH2 min16float2
1269     #define AH3 min16float3
1270     #define AH4 min16float4
1271  //------------------------------------------------------------------------------------------------------------------------------
1272     #define AW1 min16uint
1273     #define AW2 min16uint2
1274     #define AW3 min16uint3
1275     #define AW4 min16uint4
1276  //------------------------------------------------------------------------------------------------------------------------------
1277     #define ASW1 min16int
1278     #define ASW2 min16int2
1279     #define ASW3 min16int3
1280     #define ASW4 min16int4
1281    #endif
1282  //==============================================================================================================================
1283    // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly).
1284    // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/
        // Each 32-bit word is split into its low 16 bits (first lane) and high 16 bits (second lane).
        // The half variants expand both halves with f16tof32 before narrowing to AH2; AU2 inputs repeat this for x then y.
1285    AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);}
1286    AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));}
1287    AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);}
1288    AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));}
1289    #define AH2_AU1(x) AH2_AU1_x(AU1(x))
1290    #define AH4_AU2(x) AH4_AU2_x(AU2(x))
1291    #define AW2_AU1(x) AW2_AU1_x(AU1(x))
1292    #define AW4_AU2(x) AW4_AU2_x(AU2(x))
1293  //------------------------------------------------------------------------------------------------------------------------------
        // Pack two 16-bit lanes into one 32-bit word: lane x goes in the low 16 bits, lane y is shifted into the high 16 bits.
        // Half lanes are converted with f32tof16 first; four-lane inputs pack .xy and .zw into separate words.
1294    AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);}
1295    AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));}
1296    AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);}
1297    AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));}
1298    #define AU1_AH2(x) AU1_AH2_x(AH2(x))
1299    #define AU2_AH4(x) AU2_AH4_x(AH4(x))
1300    #define AU1_AW2(x) AU1_AW2_x(AW2(x))
1301    #define AU2_AW4(x) AU2_AW4_x(AW4(x))
1302  //==============================================================================================================================
        // Half float -> unsigned 16-bit.
        // Uses asuint16 when A_HLSL_6_2 is defined and A_NO_16_BIT_CAST is not; otherwise goes through f32tof16 one lane at a time.
1303    #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST)
1304     #define AW1_AH1(x) asuint16(x)
1305     #define AW2_AH2(x) asuint16(x)
1306     #define AW3_AH3(x) asuint16(x)
1307     #define AW4_AH4(x) asuint16(x)
1308    #else
1309     #define AW1_AH1(a) AW1(f32tof16(AF1(a)))
1310     #define AW2_AH2(a) AW2(AW1_AH1((a).x),AW1_AH1((a).y))
1311     #define AW3_AH3(a) AW3(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z))
1312     #define AW4_AH4(a) AW4(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z),AW1_AH1((a).w))
1313    #endif
1314  //------------------------------------------------------------------------------------------------------------------------------
        // 16-bit word -> half conversion macros (AH{1..4}_AW{1..4}); the counterpart of AW*_AH*.
        //  - With A_HLSL_6_2 defined and A_NO_16_BIT_CAST not defined, every width maps to asfloat16().
        //  - Otherwise the scalar form widens to AU1, decodes with f16tof32() and casts to AH1; the vector forms
        //    apply the scalar macro to each component.
1315    #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST)
1316     #define AH1_AW1(x) asfloat16(x)
1317     #define AH2_AW2(x) asfloat16(x)
1318     #define AH3_AW3(x) asfloat16(x)
1319     #define AH4_AW4(x) asfloat16(x)
1320    #else
1321     #define AH1_AW1(a) AH1(f16tof32(AU1(a)))
1322     #define AH2_AW2(a) AH2(AH1_AW1((a).x),AH1_AW1((a).y))
1323     #define AH3_AW3(a) AH3(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z))
1324     #define AH4_AW4(a) AH4(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z),AH1_AW1((a).w))
1325    #endif
1326  //==============================================================================================================================
        // Half splat constructors: replicate one AH1 scalar into every component of an AH1..AH4 value.
        // The AHn_(a) macros cast 'a' to AH1 first, so literals of other types can be passed.
1327    AH1 AH1_x(AH1 a){return AH1(a);}
1328    AH2 AH2_x(AH1 a){return AH2(a,a);}
1329    AH3 AH3_x(AH1 a){return AH3(a,a,a);}
1330    AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
1331    #define AH1_(a) AH1_x(AH1(a))
1332    #define AH2_(a) AH2_x(AH1(a))
1333    #define AH3_(a) AH3_x(AH1(a))
1334    #define AH4_(a) AH4_x(AH1(a))
1335  //------------------------------------------------------------------------------------------------------------------------------
        // 16-bit word splat constructors: replicate one AW1 scalar into every component of an AW1..AW4 value.
        // The AWn_(a) macros cast 'a' to AW1 first, so literals of other types can be passed.
1336    AW1 AW1_x(AW1 a){return AW1(a);}
1337    AW2 AW2_x(AW1 a){return AW2(a,a);}
1338    AW3 AW3_x(AW1 a){return AW3(a,a,a);}
1339    AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
1340    #define AW1_(a) AW1_x(AW1(a))
1341    #define AW2_(a) AW2_x(AW1(a))
1342    #define AW3_(a) AW3_x(AW1(a))
1343    #define AW4_(a) AW4_x(AW1(a))
1344  //==============================================================================================================================
        // Absolute value of a 16-bit word treated as signed: cast to ASW*, take abs(), cast back to AW*.
1345    AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));}
1346    AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
1347    AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
1348    AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
1349  //------------------------------------------------------------------------------------------------------------------------------
        // Half clamp of 'x' between lower bound 'n' and upper bound 'm', evaluated as max(n,min(x,m)).
1350    AH1 AClampH1(AH1 x,AH1 n,AH1 m){return max(n,min(x,m));}
1351    AH2 AClampH2(AH2 x,AH2 n,AH2 m){return max(n,min(x,m));}
1352    AH3 AClampH3(AH3 x,AH3 n,AH3 m){return max(n,min(x,m));}
1353    AH4 AClampH4(AH4 x,AH4 n,AH4 m){return max(n,min(x,m));}
1354  //------------------------------------------------------------------------------------------------------------------------------
1355   // V_FRACT_F16 (note DX frac() is different).
        // Fractional part of a half value, computed explicitly as x-floor(x) rather than through frac().
1356    AH1 AFractH1(AH1 x){return x-floor(x);}
1357    AH2 AFractH2(AH2 x){return x-floor(x);}
1358    AH3 AFractH3(AH3 x){return x-floor(x);}
1359    AH4 AFractH4(AH4 x){return x-floor(x);}
1360  //------------------------------------------------------------------------------------------------------------------------------
        // Half linear interpolation: forwards (x,y,a) unchanged to the lerp() intrinsic.
1361    AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);}
1362    AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);}
1363    AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);}
1364    AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);}
1365  //------------------------------------------------------------------------------------------------------------------------------
        // Three-input half maximum, built from two nested max() calls.
1366    AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
1367    AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
1368    AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
1369    AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
1370  //------------------------------------------------------------------------------------------------------------------------------
        // Maximum of two 16-bit words compared as SIGNED values; the result is returned as an unsigned word.
        // The operands are reinterpreted through the 16-bit signed types (ASW*), matching AAbsSW*/AShrSW*.
        // Casting through the 32-bit ASU* types would widen the unsigned word without sign extension, so a word
        // with its top bit set (e.g. 0xFFFF, i.e. -1) would compare as a large positive number.
1371    AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));}
1372    AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));}
1373    AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
1374    AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
1375  //------------------------------------------------------------------------------------------------------------------------------
        // Three-input half minimum, built from two nested min() calls.
1376    AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
1377    AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
1378    AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
1379    AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
1380  //------------------------------------------------------------------------------------------------------------------------------
        // Minimum of two 16-bit words compared as SIGNED values; the result is returned as an unsigned word.
        // The operands are reinterpreted through the 16-bit signed types (ASW*), matching AAbsSW*/AShrSW*.
        // Casting through the 32-bit ASU* types would widen the unsigned word without sign extension, so a word
        // with its top bit set (e.g. 0xFFFF, i.e. -1) would compare as a large positive number.
1381    AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));}
1382    AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));}
1383    AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
1384    AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
1385  //------------------------------------------------------------------------------------------------------------------------------
        // Half reciprocal: forwards to the rcp() intrinsic.
1386    AH1 ARcpH1(AH1 x){return rcp(x);}
1387    AH2 ARcpH2(AH2 x){return rcp(x);}
1388    AH3 ARcpH3(AH3 x){return rcp(x);}
1389    AH4 ARcpH4(AH4 x){return rcp(x);}
1390  //------------------------------------------------------------------------------------------------------------------------------
        // Half reciprocal square root: forwards to the rsqrt() intrinsic.
1391    AH1 ARsqH1(AH1 x){return rsqrt(x);}
1392    AH2 ARsqH2(AH2 x){return rsqrt(x);}
1393    AH3 ARsqH3(AH3 x){return rsqrt(x);}
1394    AH4 ARsqH4(AH4 x){return rsqrt(x);}
1395  //------------------------------------------------------------------------------------------------------------------------------
        // Half saturate: forwards to the saturate() intrinsic.
1396    AH1 ASatH1(AH1 x){return saturate(x);}
1397    AH2 ASatH2(AH2 x){return saturate(x);}
1398    AH3 ASatH3(AH3 x){return saturate(x);}
1399    AH4 ASatH4(AH4 x){return saturate(x);}
1400  //------------------------------------------------------------------------------------------------------------------------------
        // Right shift of a 16-bit word performed on its signed reinterpretation: both operands are cast to ASW*,
        // shifted with '>>', and the result is cast back to AW*.
1401    AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));}
1402    AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
1403    AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
1404    AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
1405   #endif
1406  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1407  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1408  //_____________________________________________________________/\_______________________________________________________________
1409  //==============================================================================================================================
1410  //                                                         HLSL DOUBLE
1411  //==============================================================================================================================
1412   #ifdef A_DUBL
        // Double-precision support, compiled only when A_DUBL is defined.
        // AD1..AD4 alias float64_t* when A_HLSL_6_2 is defined and double* otherwise.
1413    #ifdef A_HLSL_6_2
1414     #define AD1 float64_t
1415     #define AD2 float64_t2
1416     #define AD3 float64_t3
1417     #define AD4 float64_t4
1418    #else
1419     #define AD1 double
1420     #define AD2 double2
1421     #define AD3 double3
1422     #define AD4 double4
1423    #endif
1424  //------------------------------------------------------------------------------------------------------------------------------
        // Splat constructors: replicate one AD1 scalar into every component. The ADn_(a) macros cast 'a' to AD1 first.
1425    AD1 AD1_x(AD1 a){return AD1(a);}
1426    AD2 AD2_x(AD1 a){return AD2(a,a);}
1427    AD3 AD3_x(AD1 a){return AD3(a,a,a);}
1428    AD4 AD4_x(AD1 a){return AD4(a,a,a,a);}
1429    #define AD1_(a) AD1_x(AD1(a))
1430    #define AD2_(a) AD2_x(AD1(a))
1431    #define AD3_(a) AD3_x(AD1(a))
1432    #define AD4_(a) AD4_x(AD1(a))
1433  //==============================================================================================================================
        // Fractional part, computed explicitly as a-floor(a).
1434    AD1 AFractD1(AD1 a){return a-floor(a);}
1435    AD2 AFractD2(AD2 a){return a-floor(a);}
1436    AD3 AFractD3(AD3 a){return a-floor(a);}
1437    AD4 AFractD4(AD4 a){return a-floor(a);}
1438  //------------------------------------------------------------------------------------------------------------------------------
        // Linear interpolation: forwards (x,y,a) unchanged to lerp().
1439    AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);}
1440    AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);}
1441    AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);}
1442    AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);}
1443  //------------------------------------------------------------------------------------------------------------------------------
        // Reciprocal: forwards to rcp().
1444    AD1 ARcpD1(AD1 x){return rcp(x);}
1445    AD2 ARcpD2(AD2 x){return rcp(x);}
1446    AD3 ARcpD3(AD3 x){return rcp(x);}
1447    AD4 ARcpD4(AD4 x){return rcp(x);}
1448  //------------------------------------------------------------------------------------------------------------------------------
        // Reciprocal square root: forwards to rsqrt().
1449    AD1 ARsqD1(AD1 x){return rsqrt(x);}
1450    AD2 ARsqD2(AD2 x){return rsqrt(x);}
1451    AD3 ARsqD3(AD3 x){return rsqrt(x);}
1452    AD4 ARsqD4(AD4 x){return rsqrt(x);}
1453  //------------------------------------------------------------------------------------------------------------------------------
        // Saturate: forwards to saturate().
1454    AD1 ASatD1(AD1 x){return saturate(x);}
1455    AD2 ASatD2(AD2 x){return saturate(x);}
1456    AD3 ASatD3(AD3 x){return saturate(x);}
1457    AD4 ASatD4(AD4 x){return saturate(x);}
1458   #endif
1459  //==============================================================================================================================
1460  //                                                         HLSL WAVE
1461  //==============================================================================================================================
1462   #ifdef A_WAVE
1463    // Where 'x' must be a compile time literal.
        // Each helper returns the value of 'v' held by the lane whose index is (WaveGetLaneIndex()^x),
        // read with WaveReadLaneAt().
1464    AF1 AWaveXorF1(AF1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1465    AF2 AWaveXorF2(AF2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1466    AF3 AWaveXorF3(AF3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1467    AF4 AWaveXorF4(AF4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1468    AU1 AWaveXorU1(AU1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
        // Width-suffixed names for the vector forms, consistent with the AWaveXorF{1..4} naming.
        AU2 AWaveXorU2(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
        AU3 AWaveXorU3(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
        AU4 AWaveXorU4(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
        // Overloads under the AWaveXorU1 name, kept so existing callers of the vector forms still resolve.
1469    AU2 AWaveXorU1(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1470    AU3 AWaveXorU1(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1471    AU4 AWaveXorU1(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
1472  //------------------------------------------------------------------------------------------------------------------------------
1473    #ifdef A_HALF
         // 16-bit vectors are packed into 32-bit words, read across lanes, and unpacked again:
         // 2-wide values travel as one AU1, 4-wide values as one AU2.
1474     AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(WaveReadLaneAt(AU1_AH2(v),WaveGetLaneIndex()^x));}
1475     AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(WaveReadLaneAt(AU2_AH4(v),WaveGetLaneIndex()^x));}
1476     AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(WaveReadLaneAt(AU1_AW2(v),WaveGetLaneIndex()^x));}
         // Four words need two dwords, so this uses the AU2 pack/unpack pair (there is no AW4<->AU1 helper).
1477     AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(WaveReadLaneAt(AU2_AW4(v),WaveGetLaneIndex()^x));}
1478    #endif
1479   #endif
1480  //==============================================================================================================================
1481  #endif
1482  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1483  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1484  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1485  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1486  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1487  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1488  //_____________________________________________________________/\_______________________________________________________________
1489  //==============================================================================================================================
1490  //
1491  //
1492  //                                                          GPU COMMON
1493  //
1494  //
1495  //==============================================================================================================================
1496  #ifdef A_GPU
1497   // Positive (A_INFP_F) and negative (A_INFN_F) infinity, built from their 32-bit float bit patterns via AF1_AU1.
1498   #define A_INFP_F AF1_AU1(0x7f800000u)
1499   #define A_INFN_F AF1_AU1(0xff800000u)
1500  //------------------------------------------------------------------------------------------------------------------------------
1501   // Copy sign from 's' to positive 'd'.
       // Works on the raw bits: the sign bit of 's' (mask 0x80000000) is ORed into 'd'. Because it is an OR,
       // a 'd' that already has its sign bit set keeps it regardless of 's' - hence the "positive 'd'" requirement.
1502   AF1 ACpySgnF1(AF1 d,AF1 s){return AF1_AU1(AU1_AF1(d)|(AU1_AF1(s)&AU1_(0x80000000u)));}
1503   AF2 ACpySgnF2(AF2 d,AF2 s){return AF2_AU2(AU2_AF2(d)|(AU2_AF2(s)&AU2_(0x80000000u)));}
1504   AF3 ACpySgnF3(AF3 d,AF3 s){return AF3_AU3(AU3_AF3(d)|(AU3_AF3(s)&AU3_(0x80000000u)));}
1505   AF4 ACpySgnF4(AF4 d,AF4 s){return AF4_AU4(AU4_AF4(d)|(AU4_AF4(s)&AU4_(0x80000000u)));}
1506  //------------------------------------------------------------------------------------------------------------------------------
1507   // Single operation returning a 0/1 mask for "m is negative" (useful as a lerp factor for branch-free logic),
1508   //  m=NaN := 0
1509   //  m>=0  := 0
1510   //  m<0   := 1
1511   // Implemented as saturate(m * -INF), relying on the following floating point behavior,
1512   //  saturate(+a*(-INF)==-INF) := 0
1513   //  saturate( 0*(-INF)== NaN) := 0
1514   //  saturate(-a*(-INF)==+INF) := 1
1515   AF1 ASignedF1(AF1 m){return ASatF1(m*AF1_(A_INFN_F));}
1516   AF2 ASignedF2(AF2 m){return ASatF2(m*AF2_(A_INFN_F));}
1517   AF3 ASignedF3(AF3 m){return ASatF3(m*AF3_(A_INFN_F));}
1518   AF4 ASignedF4(AF4 m){return ASatF4(m*AF4_(A_INFN_F));}
1519  //------------------------------------------------------------------------------------------------------------------------------
       // Mask for "m is greater than zero": saturate(m * +INF).
       //  m>0 := 1 (+INF saturates to 1), m<0 := 0 (-INF saturates to 0).
       //  m=0 or m=NaN produces NaN before the saturate; this relies on saturate(NaN) being 0, as for ASignedF*.
1520   AF1 AGtZeroF1(AF1 m){return ASatF1(m*AF1_(A_INFP_F));}
1521   AF2 AGtZeroF2(AF2 m){return ASatF2(m*AF2_(A_INFP_F));}
1522   AF3 AGtZeroF3(AF3 m){return ASatF3(m*AF3_(A_INFP_F));}
1523   AF4 AGtZeroF4(AF4 m){return ASatF4(m*AF4_(A_INFP_F));}
1524  //==============================================================================================================================
1525   #ifdef A_HALF
        // Half-precision counterparts of the infinity constants and of ACpySgnF*/ASignedF*/AGtZeroF*.
        // The infinities are built from 16-bit patterns (0x7c00 / 0xfc00) via AH1_AW1; the A_HLSL_6_2 path
        // casts the literal to uint16_t first.
1526    #ifdef A_HLSL_6_2
1527     #define A_INFP_H AH1_AW1((uint16_t)0x7c00u)
1528     #define A_INFN_H AH1_AW1((uint16_t)0xfc00u)
1529    #else
1530     #define A_INFP_H AH1_AW1(0x7c00u)
1531     #define A_INFN_H AH1_AW1(0xfc00u)
1532    #endif
1533  
1534  //------------------------------------------------------------------------------------------------------------------------------
        // Copy sign: ORs the sign bit of 's' (mask 0x8000) into the bits of 'd'; 'd' is expected to be positive.
1535    AH1 ACpySgnH1(AH1 d,AH1 s){return AH1_AW1(AW1_AH1(d)|(AW1_AH1(s)&AW1_(0x8000u)));}
1536    AH2 ACpySgnH2(AH2 d,AH2 s){return AH2_AW2(AW2_AH2(d)|(AW2_AH2(s)&AW2_(0x8000u)));}
1537    AH3 ACpySgnH3(AH3 d,AH3 s){return AH3_AW3(AW3_AH3(d)|(AW3_AH3(s)&AW3_(0x8000u)));}
1538    AH4 ACpySgnH4(AH4 d,AH4 s){return AH4_AW4(AW4_AH4(d)|(AW4_AH4(s)&AW4_(0x8000u)));}
1539  //------------------------------------------------------------------------------------------------------------------------------
        // "Is negative" mask: saturate(m * -INF).
1540    AH1 ASignedH1(AH1 m){return ASatH1(m*AH1_(A_INFN_H));}
1541    AH2 ASignedH2(AH2 m){return ASatH2(m*AH2_(A_INFN_H));}
1542    AH3 ASignedH3(AH3 m){return ASatH3(m*AH3_(A_INFN_H));}
1543    AH4 ASignedH4(AH4 m){return ASatH4(m*AH4_(A_INFN_H));}
1544  //------------------------------------------------------------------------------------------------------------------------------
        // "Is greater than zero" mask: saturate(m * +INF).
1545    AH1 AGtZeroH1(AH1 m){return ASatH1(m*AH1_(A_INFP_H));}
1546    AH2 AGtZeroH2(AH2 m){return ASatH2(m*AH2_(A_INFP_H));}
1547    AH3 AGtZeroH3(AH3 m){return ASatH3(m*AH3_(A_INFP_H));}
1548    AH4 AGtZeroH4(AH4 m){return ASatH4(m*AH4_(A_INFP_H));}
1549   #endif
1550  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1551  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1552  //_____________________________________________________________/\_______________________________________________________________
1553  //==============================================================================================================================
1554  //                                                [FIS] FLOAT INTEGER SORTABLE
1555  //------------------------------------------------------------------------------------------------------------------------------
1556  // Float to integer sortable.
1557  //  - If sign bit=0, flip the sign bit (positives).
1558  //  - If sign bit=1, flip all bits     (negatives).
1559  // Integer sortable to float.
1560  //  - If sign bit=1, flip the sign bit (positives).
1561  //  - If sign bit=0, flip all bits     (negatives).
1562  // Has nice side effects.
1563  //  - Larger integers are more positive values.
1564  //  - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage).
1565  // Burns 3 ops for conversion {shift,or,xor}.
1566  //==============================================================================================================================
       // 32-bit conversion. AShrSU1(x,31) is used to spread the sign bit over the word, giving an XOR mask of
       // 0x80000000 (sign bit only) or 0xffffffff (all bits) after the OR with 0x80000000.
       //  - AFisToU1:   float bits -> sortable integer.
       //  - AFisFromU1: sortable integer -> float bits (mask built from the complemented shift result).
1567   AU1 AFisToU1(AU1 x){return x^(( AShrSU1(x,AU1_(31)))|AU1_(0x80000000));}
1568   AU1 AFisFromU1(AU1 x){return x^((~AShrSU1(x,AU1_(31)))|AU1_(0x80000000));}
1569  //------------------------------------------------------------------------------------------------------------------------------
1570   // Just adjust high 16-bit value (useful when upper part of 32-bit word is a 16-bit float value).
       // Same form as AFisToU1/AFisFromU1 but with a shift count of 15 instead of 31.
       // NOTE(review): with a shift of 15 the mask's low 16 bits are a copy of bits 30..15 of 'x', so the low half
       // of the result is not preserved - confirm callers only consume the high 16 bits.
1571   AU1 AFisToHiU1(AU1 x){return x^(( AShrSU1(x,AU1_(15)))|AU1_(0x80000000));}
1572   AU1 AFisFromHiU1(AU1 x){return x^((~AShrSU1(x,AU1_(15)))|AU1_(0x80000000));}
1573  //------------------------------------------------------------------------------------------------------------------------------
1574   #ifdef A_HALF
        // 16-bit versions of AFisToU1/AFisFromU1: the sign is spread with AShrSW*(x,15) and the sign-bit mask is 0x8000.
        // W1 operates on one word, W2 on two words at once.
1575    AW1 AFisToW1(AW1 x){return x^(( AShrSW1(x,AW1_(15)))|AW1_(0x8000));}
1576    AW1 AFisFromW1(AW1 x){return x^((~AShrSW1(x,AW1_(15)))|AW1_(0x8000));}
1577  //------------------------------------------------------------------------------------------------------------------------------
1578    AW2 AFisToW2(AW2 x){return x^(( AShrSW2(x,AW2_(15)))|AW2_(0x8000));}
1579    AW2 AFisFromW2(AW2 x){return x^((~AShrSW2(x,AW2_(15)))|AW2_(0x8000));}
1580   #endif
1581  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1582  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1583  //_____________________________________________________________/\_______________________________________________________________
1584  //==============================================================================================================================
1585  //                                                      [PERM] V_PERM_B32
1586  //------------------------------------------------------------------------------------------------------------------------------
1587  // Support for V_PERM_B32 started in the 3rd generation of GCN.
1588  //------------------------------------------------------------------------------------------------------------------------------
1589  // yyyyxxxx - The 'i' input.
1590  // 76543210
1591  // ========
1592  // HGFEDCBA - Naming on permutation.
1593  //------------------------------------------------------------------------------------------------------------------------------
1594  // TODO
1595  // ====
1596  //  - Make sure compiler optimizes this.
1597  //==============================================================================================================================
1598   #ifdef A_HALF
        // Byte permutes written with shifts and masks. Bytes of i.x are named A..D (low to high) and bytes of i.y
        // are named E..H; the four letters after 'APerm' list the result bytes from most to least significant,
        // with '0' meaning a zero byte.
        // Zero-filled forms: one byte from i.x in byte 0 and one byte from i.y in byte 2.
1599    AU1 APerm0E0A(AU2 i){return((i.x    )&0xffu)|((i.y<<16)&0xff0000u);}
1600    AU1 APerm0F0B(AU2 i){return((i.x>> 8)&0xffu)|((i.y<< 8)&0xff0000u);}
1601    AU1 APerm0G0C(AU2 i){return((i.x>>16)&0xffu)|((i.y    )&0xff0000u);}
1602    AU1 APerm0H0D(AU2 i){return((i.x>>24)&0xffu)|((i.y>> 8)&0xff0000u);}
1603  //------------------------------------------------------------------------------------------------------------------------------
        // Insert forms: keep three bytes of i.y and replace the remaining one with byte A or byte C of i.x.
1604    AU1 APermHGFA(AU2 i){return((i.x    )&0x000000ffu)|(i.y&0xffffff00u);}
1605    AU1 APermHGFC(AU2 i){return((i.x>>16)&0x000000ffu)|(i.y&0xffffff00u);}
1606    AU1 APermHGAE(AU2 i){return((i.x<< 8)&0x0000ff00u)|(i.y&0xffff00ffu);}
1607    AU1 APermHGCE(AU2 i){return((i.x>> 8)&0x0000ff00u)|(i.y&0xffff00ffu);}
1608    AU1 APermHAFE(AU2 i){return((i.x<<16)&0x00ff0000u)|(i.y&0xff00ffffu);}
1609    AU1 APermHCFE(AU2 i){return((i.x    )&0x00ff0000u)|(i.y&0xff00ffffu);}
1610    AU1 APermAGFE(AU2 i){return((i.x<<24)&0xff000000u)|(i.y&0x00ffffffu);}
1611    AU1 APermCGFE(AU2 i){return((i.x<< 8)&0xff000000u)|(i.y&0x00ffffffu);}
1612  //------------------------------------------------------------------------------------------------------------------------------
        // Merge forms: combine bytes A and C of i.x with bytes E and G of i.y, interleaved (GCEA) or grouped (GECA).
1613    AU1 APermGCEA(AU2 i){return((i.x)&0x00ff00ffu)|((i.y<<8)&0xff00ff00u);}
1614    AU1 APermGECA(AU2 i){return(((i.x)&0xffu)|((i.x>>8)&0xff00u)|((i.y<<16)&0xff0000u)|((i.y<<8)&0xff000000u));}
1615   #endif
1616  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1617  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1618  //_____________________________________________________________/\_______________________________________________________________
1619  //==============================================================================================================================
1620  //                                               [BUC] BYTE UNSIGNED CONVERSION
1621  //------------------------------------------------------------------------------------------------------------------------------
1622  // Designed to use the optimal conversion, enables the scaling to possibly be factored into other computation.
1623  // Works on a range of {0 to A_BUC_<32,16>}, for <32-bit, and 16-bit> respectively.
1624  //------------------------------------------------------------------------------------------------------------------------------
1625  // OPCODE NOTES
1626  // ============
1627  // GCN does not do UNORM or SNORM for bytes in opcodes.
1628  //  - V_CVT_F32_UBYTE{0,1,2,3} - Unsigned byte to float.
1629  //  - V_CVT_PKACC_U8_F32 - Float to unsigned byte (does bit-field insert into 32-bit integer).
1630  // V_PERM_B32 does byte packing with ability to zero fill bytes as well.
1631  //  - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo. 
1632  //------------------------------------------------------------------------------------------------------------------------------
1633  // BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U1() - Designed for V_CVT_F32_UBYTE* and V_CVT_PKACCUM_U8_F32 ops.
1634  // ====   =====
1635  //    0 : 0
1636  //    1 : 1
1637  //     ...
1638  //  255 : 255
1639  //      : 256 (just outside the encoding range)
1640  //------------------------------------------------------------------------------------------------------------------------------
1641  // BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
1642  // ====   =====
1643  //    0 : 0
1644  //    1 : 1/512
1645  //    2 : 1/256
1646  //     ...
1647  //   64 : 1/8
1648  //  128 : 1/4
1649  //  255 : 255/512
1650  //      : 1/2 (just outside the encoding range)
1651  //------------------------------------------------------------------------------------------------------------------------------
1652  // OPTIMAL IMPLEMENTATIONS ON AMD ARCHITECTURES
1653  // ============================================
1654  // r=ABuc0FromU1(i)
1655  //   V_CVT_F32_UBYTE0 r,i
1656  // --------------------------------------------
1657  // r=ABuc0ToU1(d,i)
1658  //   V_CVT_PKACCUM_U8_F32 r,i,0,d
1659  // --------------------------------------------
1660  // d=ABuc0FromU2(i)
1661  //   Where 'k0' is an SGPR with 0x0E0A
1662  //   Where 'k1' is an SGPR with {32768.0} packed into the lower 16-bits
1663  //   V_PERM_B32 d,i.x,i.y,k0
1664  //   V_PK_FMA_F16 d,d,k1.x,0
1665  // --------------------------------------------
1666  // r=ABuc0ToU2(d,i)
1667  //   Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits
1668  //   Where 'k1' is an SGPR with 0x????
1669  //   Where 'k2' is an SGPR with 0x????
1670  //   V_PK_FMA_F16 i,i,k0.x,0
1671  //   V_PERM_B32 r.x,i,i,k1
1672  //   V_PERM_B32 r.y,i,i,k2
1673  //==============================================================================================================================
1674   // Peak range for 32-bit and 16-bit operations: the float value that byte 255 maps to in each encoding.
1675   #define A_BUC_32 (255.0)
1676   #define A_BUC_16 (255.0/512.0)
1677  //==============================================================================================================================
1678   #if 1
1679    // Designed to be one V_CVT_PKACCUM_U8_F32.
1680    // The extra min is required to pattern match to V_CVT_PKACCUM_U8_F32.
        // ABuc{N}ToU1(d,i): converts float 'i' to an unsigned integer, limits it to 255, and writes it into byte N
        // of 'd'; the other three bytes of 'd' are passed through unchanged.
1681    AU1 ABuc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i),255u)    )&(0x000000ffu));}
1682    AU1 ABuc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i),255u)<< 8)&(0x0000ff00u));}
1683    AU1 ABuc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i),255u)<<16)&(0x00ff0000u));}
1684    AU1 ABuc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i),255u)<<24)&(0xff000000u));}
1685  //------------------------------------------------------------------------------------------------------------------------------
1686    // Designed to be one V_CVT_F32_UBYTE*.
        // ABuc{N}FromU1(i): extracts byte N of 'i' and converts it to float (0.0 to 255.0).
1687    AF1 ABuc0FromU1(AU1 i){return AF1((i    )&255u);}
1688    AF1 ABuc1FromU1(AU1 i){return AF1((i>> 8)&255u);}
1689    AF1 ABuc2FromU1(AU1 i){return AF1((i>>16)&255u);}
1690    AF1 ABuc3FromU1(AU1 i){return AF1((i>>24)&255u);}
1691   #endif
1692  //==============================================================================================================================
1693   #ifdef A_HALF
1694    // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
        // Both inputs are scaled by 1.0/32768.0, reinterpreted as 16-bit words, packed into one dword each, and
        // merged with APermGCEA so the low byte of every word ends up byte-interleaved in the result.
1695    AW2 ABuc01ToW2(AH2 x,AH2 y){x*=AH2_(1.0/32768.0);y*=AH2_(1.0/32768.0);
1696     return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
1697  //------------------------------------------------------------------------------------------------------------------------------
1698    // Designed for 3 ops to do SOA to AOS and conversion.
        // ABuc{N}ToU2(d,i): scales 'i' by 1.0/32768.0 and packs its two half bit patterns into dword 'b'; byte 0 of
        // 'b' (from i.x) is inserted into byte N of d.x and byte 2 of 'b' (from i.y) into byte N of d.y.
1699    AU2 ABuc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
1700     return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));}
1701    AU2 ABuc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
1702     return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));}
1703    AU2 ABuc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
1704     return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));}
1705    AU2 ABuc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
1706     return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));}
1707  //------------------------------------------------------------------------------------------------------------------------------
1708    // Designed for 2 ops to do both AOS to SOA, and conversion.
        // ABuc{N}FromU2(i): gathers byte N of i.x and byte N of i.y into the low bytes of two 16-bit words
        // (zero upper bytes), reinterprets the words as halves, and scales by 32768.0.
1709    AH2 ABuc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0);}
1710    AH2 ABuc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0);}
1711    AH2 ABuc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0);}
1712    AH2 ABuc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0);}
1713   #endif
1714  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1715  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1716  //_____________________________________________________________/\_______________________________________________________________
1717  //==============================================================================================================================
1718  //                                                 [BSC] BYTE SIGNED CONVERSION
1719  //------------------------------------------------------------------------------------------------------------------------------
1720  // Similar to [BUC].
1721  // Works on a range of {-/+ A_BSC_<32,16>}, for <32-bit, and 16-bit> respectively.
1722  //------------------------------------------------------------------------------------------------------------------------------
1723  // ENCODING (without zero-based encoding)
1724  // ========
1725  //   0 = unused (can be used to mean something else)
1726  //   1 = lowest value 
1727  // 128 = exact zero center (becomes 0 under the zero-based encoding described below)
1728  // 255 = highest value
1729  //------------------------------------------------------------------------------------------------------------------------------
1730  // Zero-based [Zb] flips the MSB bit of the byte (making 128 "exact zero" actually zero).
1731  // This is useful if there is a desire for cleared values to decode as zero.
1732  //------------------------------------------------------------------------------------------------------------------------------
1733  // BYTE : FLOAT - ABsc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
1734  // ====   =====
1735  //    0 : -128/512 (unused)
1736  //    1 : -127/512
1737  //    2 : -126/512
1738  //     ...
1739  //  128 : 0 
1740  //     ... 
1741  //  255 : 127/512
1742  //      : 1/4 (just outside the encoding range)
1743  //==============================================================================================================================
1744   // Peak range for 32-bit and 16-bit operations: the magnitude that byte 255 maps to in each encoding.
1745   #define A_BSC_32 (127.0)
1746   #define A_BSC_16 (127.0/512.0)
1747  //==============================================================================================================================
1748   #if 1
1749    AU1 ABsc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i+128.0),255u)    )&(0x000000ffu));}
1750    AU1 ABsc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i+128.0),255u)<< 8)&(0x0000ff00u));}
1751    AU1 ABsc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i+128.0),255u)<<16)&(0x00ff0000u));}
1752    AU1 ABsc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i+128.0),255u)<<24)&(0xff000000u));}
1753  //------------------------------------------------------------------------------------------------------------------------------
1754    AU1 ABsc0ToZbU1(AU1 d,AF1 i){return ((d&0xffffff00u)|((min(AU1(trunc(i)+128.0),255u)    )&(0x000000ffu)))^0x00000080u;}
1755    AU1 ABsc1ToZbU1(AU1 d,AF1 i){return ((d&0xffff00ffu)|((min(AU1(trunc(i)+128.0),255u)<< 8)&(0x0000ff00u)))^0x00008000u;}
1756    AU1 ABsc2ToZbU1(AU1 d,AF1 i){return ((d&0xff00ffffu)|((min(AU1(trunc(i)+128.0),255u)<<16)&(0x00ff0000u)))^0x00800000u;}
1757    AU1 ABsc3ToZbU1(AU1 d,AF1 i){return ((d&0x00ffffffu)|((min(AU1(trunc(i)+128.0),255u)<<24)&(0xff000000u)))^0x80000000u;}
1758  //------------------------------------------------------------------------------------------------------------------------------
// Byte load: extract the lane named by the digit from 'i', convert it to float and remove the +128 bias.
AF1 ABsc0FromU1(AU1 i)
{
  AU1 b=(i    )&255u;
  return AF1(b)-128.0;
}
AF1 ABsc1FromU1(AU1 i)
{
  AU1 b=(i>> 8)&255u;
  return AF1(b)-128.0;
}
AF1 ABsc2FromU1(AU1 i)
{
  AU1 b=(i>>16)&255u;
  return AF1(b)-128.0;
}
AF1 ABsc3FromU1(AU1 i)
{
  AU1 b=(i>>24)&255u;
  return AF1(b)-128.0;
}
1763  //------------------------------------------------------------------------------------------------------------------------------
// Zero-based byte load: extract the lane, flip its top bit (undoing the zero-based store),
// convert to float and remove the +128 bias. A stored byte of 0 therefore decodes to 0.0.
AF1 ABsc0FromZbU1(AU1 i)
{
  AU1 b=((i    )&255u)^0x80u;
  return AF1(b)-128.0;
}
AF1 ABsc1FromZbU1(AU1 i)
{
  AU1 b=((i>> 8)&255u)^0x80u;
  return AF1(b)-128.0;
}
AF1 ABsc2FromZbU1(AU1 i)
{
  AU1 b=((i>>16)&255u)^0x80u;
  return AF1(b)-128.0;
}
AF1 ABsc3FromZbU1(AU1 i)
{
  AU1 b=((i>>24)&255u)^0x80u;
  return AF1(b)-128.0;
}
1768   #endif
1769  //==============================================================================================================================
1770   #ifdef A_HALF
// Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
// Both inputs get the same scale (1/32768) and offset (0.25/32768) before their raw bits are
// handed to APermGCEA for the byte shuffle.
AW2 ABsc01ToW2(AH2 x,AH2 y)
{
  AH2 sx=x*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
  AH2 sy=y*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
  AU1 bx=AU1_AW2(AW2_AH2(sx));
  AU1 by=AU1_AW2(AW2_AH2(sy));
  return AW2_AU1(APermGCEA(AU2(bx,by)));
}
1774  //------------------------------------------------------------------------------------------------------------------------------
// Packed 16-bit store: scale 'i' by 1/32768, add 0.25/32768, take the raw bits as 'b', then let the
// APerm* helpers combine 'b' with d.x and d.y. The digit selects which APerm pair is used
// (presumably which byte lane of 'd' is overwritten - confirm against the APerm definitions).
AU2 ABsc0ToU2(AU2 d,AH2 i)
{
  AH2 s=i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
  AU1 b=AU1_AW2(AW2_AH2(s));
  return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));
}
AU2 ABsc1ToU2(AU2 d,AH2 i)
{
  AH2 s=i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
  AU1 b=AU1_AW2(AW2_AH2(s));
  return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));
}
AU2 ABsc2ToU2(AU2 d,AH2 i)
{
  AH2 s=i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
  AU1 b=AU1_AW2(AW2_AH2(s));
  return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));
}
AU2 ABsc3ToU2(AU2 d,AH2 i)
{
  AH2 s=i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
  AU1 b=AU1_AW2(AW2_AH2(s));
  return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));
}
1783  //------------------------------------------------------------------------------------------------------------------------------
// Zero-based packed 16-bit store: same scale/offset as the non-Zb form, but the raw bits are
// XORed with 0x00800080 (bit 7 of each 16-bit half) before the APerm* helpers merge them into 'd'.
AU2 ABsc0ToZbU2(AU2 d,AH2 i)
{
  AH2 s=i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
  AU1 b=AU1_AW2(AW2_AH2(s))^0x00800080u;
  return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));
}
AU2 ABsc1ToZbU2(AU2 d,AH2 i)
{
  AH2 s=i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
  AU1 b=AU1_AW2(AW2_AH2(s))^0x00800080u;
  return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));
}
AU2 ABsc2ToZbU2(AU2 d,AH2 i)
{
  AH2 s=i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
  AU1 b=AU1_AW2(AW2_AH2(s))^0x00800080u;
  return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));
}
AU2 ABsc3ToZbU2(AU2 d,AH2 i)
{
  AH2 s=i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
  AU1 b=AU1_AW2(AW2_AH2(s))^0x00800080u;
  return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));
}
1792  //------------------------------------------------------------------------------------------------------------------------------
// Packed 16-bit load: the APerm0?0? helper picks the bytes for the lane named by the digit, the
// result is reinterpreted as a half pair, scaled by 32768 and offset by -0.25.
AH2 ABsc0FromU2(AU2 i)
{
  AH2 h=AH2_AW2(AW2_AU1(APerm0E0A(i)));
  return h*AH2_(32768.0)-AH2_(0.25);
}
AH2 ABsc1FromU2(AU2 i)
{
  AH2 h=AH2_AW2(AW2_AU1(APerm0F0B(i)));
  return h*AH2_(32768.0)-AH2_(0.25);
}
AH2 ABsc2FromU2(AU2 i)
{
  AH2 h=AH2_AW2(AW2_AU1(APerm0G0C(i)));
  return h*AH2_(32768.0)-AH2_(0.25);
}
AH2 ABsc3FromU2(AU2 i)
{
  AH2 h=AH2_AW2(AW2_AU1(APerm0H0D(i)));
  return h*AH2_(32768.0)-AH2_(0.25);
}
1797  //------------------------------------------------------------------------------------------------------------------------------
// Zero-based packed 16-bit load: as the non-Zb load, but the permuted bits are XORed with
// 0x00800080 before being reinterpreted as a half pair.
AH2 ABsc0FromZbU2(AU2 i)
{
  AH2 h=AH2_AW2(AW2_AU1(APerm0E0A(i)^0x00800080u));
  return h*AH2_(32768.0)-AH2_(0.25);
}
AH2 ABsc1FromZbU2(AU2 i)
{
  AH2 h=AH2_AW2(AW2_AU1(APerm0F0B(i)^0x00800080u));
  return h*AH2_(32768.0)-AH2_(0.25);
}
AH2 ABsc2FromZbU2(AU2 i)
{
  AH2 h=AH2_AW2(AW2_AU1(APerm0G0C(i)^0x00800080u));
  return h*AH2_(32768.0)-AH2_(0.25);
}
AH2 ABsc3FromZbU2(AU2 i)
{
  AH2 h=AH2_AW2(AW2_AU1(APerm0H0D(i)^0x00800080u));
  return h*AH2_(32768.0)-AH2_(0.25);
}
1802   #endif
1803  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1804  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1805  //_____________________________________________________________/\_______________________________________________________________
1806  //==============================================================================================================================
1807  //                                                     HALF APPROXIMATIONS
1808  //------------------------------------------------------------------------------------------------------------------------------
1809  // These support only positive inputs.
1810  // Did not see value yet in specialization for range.
1811  // Using quick testing, ended up mostly getting the same "best" approximation for various ranges.
1812  // With hardware that can co-execute transcendentals, the value in approximations could be less than expected.
1813  // However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total.
1814  // And co-execution would require a compiler interleaving a lot of independent work for packed usage.
1815  //------------------------------------------------------------------------------------------------------------------------------
1816  // The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total).
1817  // Same with sqrt(), as this could be x*rsq() (7 ops).
1818  //==============================================================================================================================
1819   #ifdef A_HALF
// Low-precision half sqrt, 2 ops: halve the raw bit pattern, then add the 0x1de2 bias.
// Minimizes squared error across the full positive range.
// The 0x1de2 based approximation maps {0 to 1} input to < 1 output.
AH1 APrxLoSqrtH1(AH1 a)
{
  AW1 h=AW1_AH1(a)>>AW1_(1);
  return AH1_AW1(h+AW1_(0x1de2));
}
AH2 APrxLoSqrtH2(AH2 a)
{
  AW2 h=AW2_AH2(a)>>AW2_(1);
  return AH2_AW2(h+AW2_(0x1de2));
}
AH3 APrxLoSqrtH3(AH3 a)
{
  AW3 h=AW3_AH3(a)>>AW3_(1);
  return AH3_AW3(h+AW3_(0x1de2));
}
AH4 APrxLoSqrtH4(AH4 a)
{
  AW4 h=AW4_AH4(a)>>AW4_(1);
  return AH4_AW4(h+AW4_(0x1de2));
}
1826  //------------------------------------------------------------------------------------------------------------------------------
// Lower precision half reciprocal estimate, 1 op: subtract the raw bits of 'a' from 0x7784.
// Minimizes squared error across {smallest normal to 16384.0}.
AH1 APrxLoRcpH1(AH1 a)
{
  AW1 r=AW1_(0x7784)-AW1_AH1(a);
  return AH1_AW1(r);
}
AH2 APrxLoRcpH2(AH2 a)
{
  AW2 r=AW2_(0x7784)-AW2_AH2(a);
  return AH2_AW2(r);
}
AH3 APrxLoRcpH3(AH3 a)
{
  AW3 r=AW3_(0x7784)-AW3_AH3(a);
  return AH3_AW3(r);
}
AH4 APrxLoRcpH4(AH4 a)
{
  AW4 r=AW4_(0x7784)-AW4_AH4(a);
  return AH4_AW4(r);
}
1833  //------------------------------------------------------------------------------------------------------------------------------
// Medium precision half reciprocal, 3 ops: bit-trick seed (0x778d minus the raw bits of 'a')
// refined by one Newton Raphson iteration, b*(2-b*a).
AH1 APrxMedRcpH1(AH1 a)
{
  AH1 b=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));
  AH1 e=-b*a+AH1_(2.0);
  return b*e;
}
AH2 APrxMedRcpH2(AH2 a)
{
  AH2 b=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));
  AH2 e=-b*a+AH2_(2.0);
  return b*e;
}
AH3 APrxMedRcpH3(AH3 a)
{
  AH3 b=AH3_AW3(AW3_(0x778d)-AW3_AH3(a));
  AH3 e=-b*a+AH3_(2.0);
  return b*e;
}
AH4 APrxMedRcpH4(AH4 a)
{
  AH4 b=AH4_AW4(AW4_(0x778d)-AW4_AH4(a));
  AH4 e=-b*a+AH4_(2.0);
  return b*e;
}
1839  //------------------------------------------------------------------------------------------------------------------------------
// Low-precision half reciprocal square root, 2 ops: 0x59a3 minus half of the raw bit pattern.
// Minimizes squared error across {smallest normal to 16384.0}.
AH1 APrxLoRsqH1(AH1 a)
{
  AW1 h=AW1_AH1(a)>>AW1_(1);
  return AH1_AW1(AW1_(0x59a3)-h);
}
AH2 APrxLoRsqH2(AH2 a)
{
  AW2 h=AW2_AH2(a)>>AW2_(1);
  return AH2_AW2(AW2_(0x59a3)-h);
}
AH3 APrxLoRsqH3(AH3 a)
{
  AW3 h=AW3_AH3(a)>>AW3_(1);
  return AH3_AW3(AW3_(0x59a3)-h);
}
AH4 APrxLoRsqH4(AH4 a)
{
  AW4 h=AW4_AH4(a)>>AW4_(1);
  return AH4_AW4(AW4_(0x59a3)-h);
}
1845   #endif
1846  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1847  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1848  //_____________________________________________________________/\_______________________________________________________________
1849  //==============================================================================================================================
1850  //                                                    FLOAT APPROXIMATIONS
1851  //------------------------------------------------------------------------------------------------------------------------------
1852  // Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN",
1853  //  - Idea dates back to SGI, then to Quake 3, etc.
1854  //  - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf
1855  //     - sqrt(x)=rsqrt(x)*x
1856  //     - rcp(x)=rsqrt(x)*rsqrt(x) for positive x
1857  //  - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h
1858  //------------------------------------------------------------------------------------------------------------------------------
1859  // These below are from perhaps less complete searching for optimal.
1860  // Used FP16 normal range for testing with +4096 32-bit step size for sampling error.
1861  // So these match up well with the half approximations.
1862  //==============================================================================================================================
// Scalar float approximations built on integer arithmetic over the raw float bits.
// LoSqrt: (bits>>1)+0x1fbc4639.
AF1 APrxLoSqrtF1(AF1 a)
{
  AU1 h=AU1_AF1(a)>>AU1_(1);
  return AF1_AU1(h+AU1_(0x1fbc4639));
}
// LoRcp: 0x7ef07ebb-bits.
AF1 APrxLoRcpF1(AF1 a)
{
  AU1 r=AU1_(0x7ef07ebb)-AU1_AF1(a);
  return AF1_AU1(r);
}
// MedRcp: seed 0x7ef19fff-bits, then one refinement step b*(2-b*a).
AF1 APrxMedRcpF1(AF1 a)
{
  AF1 b=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));
  AF1 e=-b*a+AF1_(2.0);
  return b*e;
}
// LoRsq: 0x5f347d74-(bits>>1).
AF1 APrxLoRsqF1(AF1 a)
{
  AU1 h=AU1_AF1(a)>>AU1_(1);
  return AF1_AU1(AU1_(0x5f347d74)-h);
}
1867  //------------------------------------------------------------------------------------------------------------------------------
// Two-component float approximations; every lane goes through the same integer bit trick.
// LoSqrt: (bits>>1)+0x1fbc4639.
AF2 APrxLoSqrtF2(AF2 a)
{
  AU2 h=AU2_AF2(a)>>AU2_(1);
  return AF2_AU2(h+AU2_(0x1fbc4639));
}
// LoRcp: 0x7ef07ebb-bits.
AF2 APrxLoRcpF2(AF2 a)
{
  AU2 r=AU2_(0x7ef07ebb)-AU2_AF2(a);
  return AF2_AU2(r);
}
// MedRcp: seed 0x7ef19fff-bits, then one refinement step b*(2-b*a).
AF2 APrxMedRcpF2(AF2 a)
{
  AF2 b=AF2_AU2(AU2_(0x7ef19fff)-AU2_AF2(a));
  AF2 e=-b*a+AF2_(2.0);
  return b*e;
}
// LoRsq: 0x5f347d74-(bits>>1).
AF2 APrxLoRsqF2(AF2 a)
{
  AU2 h=AU2_AF2(a)>>AU2_(1);
  return AF2_AU2(AU2_(0x5f347d74)-h);
}
1872  //------------------------------------------------------------------------------------------------------------------------------
// Three-component float approximations; every lane goes through the same integer bit trick.
// LoSqrt: (bits>>1)+0x1fbc4639.
AF3 APrxLoSqrtF3(AF3 a)
{
  AU3 h=AU3_AF3(a)>>AU3_(1);
  return AF3_AU3(h+AU3_(0x1fbc4639));
}
// LoRcp: 0x7ef07ebb-bits.
AF3 APrxLoRcpF3(AF3 a)
{
  AU3 r=AU3_(0x7ef07ebb)-AU3_AF3(a);
  return AF3_AU3(r);
}
// MedRcp: seed 0x7ef19fff-bits, then one refinement step b*(2-b*a).
AF3 APrxMedRcpF3(AF3 a)
{
  AF3 b=AF3_AU3(AU3_(0x7ef19fff)-AU3_AF3(a));
  AF3 e=-b*a+AF3_(2.0);
  return b*e;
}
// LoRsq: 0x5f347d74-(bits>>1).
AF3 APrxLoRsqF3(AF3 a)
{
  AU3 h=AU3_AF3(a)>>AU3_(1);
  return AF3_AU3(AU3_(0x5f347d74)-h);
}
1877  //------------------------------------------------------------------------------------------------------------------------------
// Four-component float approximations; every lane goes through the same integer bit trick.
// LoSqrt: (bits>>1)+0x1fbc4639.
AF4 APrxLoSqrtF4(AF4 a)
{
  AU4 h=AU4_AF4(a)>>AU4_(1);
  return AF4_AU4(h+AU4_(0x1fbc4639));
}
// LoRcp: 0x7ef07ebb-bits.
AF4 APrxLoRcpF4(AF4 a)
{
  AU4 r=AU4_(0x7ef07ebb)-AU4_AF4(a);
  return AF4_AU4(r);
}
// MedRcp: seed 0x7ef19fff-bits, then one refinement step b*(2-b*a).
AF4 APrxMedRcpF4(AF4 a)
{
  AF4 b=AF4_AU4(AU4_(0x7ef19fff)-AU4_AF4(a));
  AF4 e=-b*a+AF4_(2.0);
  return b*e;
}
// LoRsq: 0x5f347d74-(bits>>1).
AF4 APrxLoRsqF4(AF4 a)
{
  AU4 h=AU4_AF4(a)>>AU4_(1);
  return AF4_AU4(AU4_(0x5f347d74)-h);
}
1882  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1883  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1884  //_____________________________________________________________/\_______________________________________________________________
1885  //==============================================================================================================================
1886  //                                                    PQ APPROXIMATIONS
1887  //------------------------------------------------------------------------------------------------------------------------------
// PQ is very close to x^(1/8). The functions below use the fast float approximation method to do
1889  // PQ<~>Gamma2 (4th power and fast 4th root) and PQ<~>Linear (8th power and fast 8th root). Maximum error is ~0.2%.
1890  //==============================================================================================================================
// Helpers: Quart() raises to the 4th power (two squarings), Oct() to the 8th power (three squarings).
AF1 Quart(AF1 a)
{
  AF1 p2=a*a;
  return p2*p2;
}
AF1 Oct(AF1 a)
{
  AF1 p2=a*a;
  AF1 p4=p2*p2;
  return p4*p4;
}
AF2 Quart(AF2 a)
{
  AF2 p2=a*a;
  return p2*p2;
}
AF2 Oct(AF2 a)
{
  AF2 p2=a*a;
  AF2 p4=p2*p2;
  return p4*p4;
}
AF3 Quart(AF3 a)
{
  AF3 p2=a*a;
  return p2*p2;
}
AF3 Oct(AF3 a)
{
  AF3 p2=a*a;
  AF3 p4=p2*p2;
  return p4*p4;
}
AF4 Quart(AF4 a)
{
  AF4 p2=a*a;
  return p2*p2;
}
AF4 Oct(AF4 a)
{
  AF4 p2=a*a;
  AF4 p4=p2*p2;
  return p4*p4;
}
//------------------------------------------------------------------------------------------------------------------------------
1900   //------------------------------------------------------------------------------------------------------------------------------
// Scalar PQ conversions. PQ->Gamma2 is a 4th power, PQ->Linear an 8th power; the reverse
// directions are 4th/8th roots in three qualities:
//   Lo   - bit trick only ((bits>>2)+0x2F9A4E46 or (bits>>3)+0x378D8723),
//   Med  - the Lo estimate 'b' refined once as b-b*(b^n-a)/(n*b^n),
//   High - nested sqrt().
AF1 APrxPQToGamma2(AF1 a)
{
  return Quart(a);
}
AF1 APrxPQToLinear(AF1 a)
{
  return Oct(a);
}
AF1 APrxLoGamma2ToPQ(AF1 a)
{
  AU1 q=AU1_AF1(a) >> AU1_(2);
  return AF1_AU1(q + AU1_(0x2F9A4E46));
}
AF1 APrxMedGamma2ToPQ(AF1 a)
{
  AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46));
  AF1 b4 = Quart(b);
  AF1 num = b * (b4 - a);
  AF1 den = AF1_(4.0) * b4;
  return b - num / den;
}
AF1 APrxHighGamma2ToPQ(AF1 a)
{
  AF1 r = sqrt(a);
  return sqrt(r);
}
AF1 APrxLoLinearToPQ(AF1 a)
{
  AU1 q=AU1_AF1(a) >> AU1_(3);
  return AF1_AU1(q + AU1_(0x378D8723));
}
AF1 APrxMedLinearToPQ(AF1 a)
{
  AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723));
  AF1 b8 = Oct(b);
  AF1 num = b * (b8 - a);
  AF1 den = AF1_(8.0) * b8;
  return b - num / den;
}
AF1 APrxHighLinearToPQ(AF1 a)
{
  AF1 r = sqrt(sqrt(a));
  return sqrt(r);
}
1909   //------------------------------------------------------------------------------------------------------------------------------
// Two-component PQ conversions (component-wise). Lo = bit trick, Med = Lo estimate plus one
// refinement step b-b*(b^n-a)/(n*b^n), High = nested sqrt().
AF2 APrxPQToGamma2(AF2 a)
{
  return Quart(a);
}
AF2 APrxPQToLinear(AF2 a)
{
  return Oct(a);
}
AF2 APrxLoGamma2ToPQ(AF2 a)
{
  AU2 q=AU2_AF2(a) >> AU2_(2);
  return AF2_AU2(q + AU2_(0x2F9A4E46));
}
AF2 APrxMedGamma2ToPQ(AF2 a)
{
  AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46));
  AF2 b4 = Quart(b);
  AF2 num = b * (b4 - a);
  AF2 den = AF1_(4.0) * b4; // scalar constant times vector
  return b - num / den;
}
AF2 APrxHighGamma2ToPQ(AF2 a)
{
  AF2 r = sqrt(a);
  return sqrt(r);
}
AF2 APrxLoLinearToPQ(AF2 a)
{
  AU2 q=AU2_AF2(a) >> AU2_(3);
  return AF2_AU2(q + AU2_(0x378D8723));
}
AF2 APrxMedLinearToPQ(AF2 a)
{
  AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723));
  AF2 b8 = Oct(b);
  AF2 num = b * (b8 - a);
  AF2 den = AF1_(8.0) * b8; // scalar constant times vector
  return b - num / den;
}
AF2 APrxHighLinearToPQ(AF2 a)
{
  AF2 r = sqrt(sqrt(a));
  return sqrt(r);
}
1918   //------------------------------------------------------------------------------------------------------------------------------
// Three-component PQ conversions (component-wise). Lo = bit trick, Med = Lo estimate plus one
// refinement step b-b*(b^n-a)/(n*b^n), High = nested sqrt().
AF3 APrxPQToGamma2(AF3 a)
{
  return Quart(a);
}
AF3 APrxPQToLinear(AF3 a)
{
  return Oct(a);
}
AF3 APrxLoGamma2ToPQ(AF3 a)
{
  AU3 q=AU3_AF3(a) >> AU3_(2);
  return AF3_AU3(q + AU3_(0x2F9A4E46));
}
AF3 APrxMedGamma2ToPQ(AF3 a)
{
  AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46));
  AF3 b4 = Quart(b);
  AF3 num = b * (b4 - a);
  AF3 den = AF1_(4.0) * b4; // scalar constant times vector
  return b - num / den;
}
AF3 APrxHighGamma2ToPQ(AF3 a)
{
  AF3 r = sqrt(a);
  return sqrt(r);
}
AF3 APrxLoLinearToPQ(AF3 a)
{
  AU3 q=AU3_AF3(a) >> AU3_(3);
  return AF3_AU3(q + AU3_(0x378D8723));
}
AF3 APrxMedLinearToPQ(AF3 a)
{
  AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723));
  AF3 b8 = Oct(b);
  AF3 num = b * (b8 - a);
  AF3 den = AF1_(8.0) * b8; // scalar constant times vector
  return b - num / den;
}
AF3 APrxHighLinearToPQ(AF3 a)
{
  AF3 r = sqrt(sqrt(a));
  return sqrt(r);
}
1927   //------------------------------------------------------------------------------------------------------------------------------
// Four-component PQ conversions (component-wise). Lo = bit trick, Med = Lo estimate plus one
// refinement step b-b*(b^n-a)/(n*b^n), High = nested sqrt().
AF4 APrxPQToGamma2(AF4 a)
{
  return Quart(a);
}
AF4 APrxPQToLinear(AF4 a)
{
  return Oct(a);
}
AF4 APrxLoGamma2ToPQ(AF4 a)
{
  AU4 q=AU4_AF4(a) >> AU4_(2);
  return AF4_AU4(q + AU4_(0x2F9A4E46));
}
AF4 APrxMedGamma2ToPQ(AF4 a)
{
  AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46));
  AF4 b4 = Quart(b);
  AF4 num = b * (b4 - a);
  AF4 den = AF1_(4.0) * b4; // scalar constant times vector
  return b - num / den;
}
AF4 APrxHighGamma2ToPQ(AF4 a)
{
  AF4 r = sqrt(a);
  return sqrt(r);
}
AF4 APrxLoLinearToPQ(AF4 a)
{
  AU4 q=AU4_AF4(a) >> AU4_(3);
  return AF4_AU4(q + AU4_(0x378D8723));
}
AF4 APrxMedLinearToPQ(AF4 a)
{
  AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723));
  AF4 b8 = Oct(b);
  AF4 num = b * (b8 - a);
  AF4 den = AF1_(8.0) * b8; // scalar constant times vector
  return b - num / den;
}
AF4 APrxHighLinearToPQ(AF4 a)
{
  AF4 r = sqrt(sqrt(a));
  return sqrt(r);
}
1936  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1937  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1938  //_____________________________________________________________/\_______________________________________________________________
1939  //==============================================================================================================================
1940  //                                                    PARABOLIC SIN & COS
1941  //------------------------------------------------------------------------------------------------------------------------------
1942  // Approximate answers to transcendental questions.
1943  //------------------------------------------------------------------------------------------------------------------------------
1944  //==============================================================================================================================
1945   #if 1
// Valid input range is {-1 to 1} representing {0 to 2 pi}.
// Output range is {-1/4 to 1/4} representing {-1 to 1}.
// Sine is the parabola x*|x|-x (one MAD).
AF1 APSinF1(AF1 x)
{
  AF1 m=abs(x);
  return x*m-x;
}
AF2 APSinF2(AF2 x)
{
  AF2 m=abs(x);
  return x*m-x;
}
// Cosine remaps the input with fract(x*0.5+0.75)*2-1 and reuses the sine parabola (3x MAD, FRACT).
AF1 APCosF1(AF1 x)
{
  AF1 t=AFractF1(x*AF1_(0.5)+AF1_(0.75));
  t=t*AF1_(2.0)-AF1_(1.0);
  return APSinF1(t);
}
AF2 APCosF2(AF2 x)
{
  AF2 t=AFractF2(x*AF2_(0.5)+AF2_(0.75));
  t=t*AF2_(2.0)-AF2_(1.0);
  return APSinF2(t);
}
// Returns {sin,cos} of the same input: the remapped value rides in the second lane of one APSinF2 call.
AF2 APSinCosF1(AF1 x)
{
  AF1 c=AFractF1(x*AF1_(0.5)+AF1_(0.75));
  c=c*AF1_(2.0)-AF1_(1.0);
  return APSinF2(AF2(x,c));
}
1953   #endif
1954  //------------------------------------------------------------------------------------------------------------------------------
1955   #ifdef A_HALF
// For a packed {sin,cos} pair,
//  - Native takes 16 clocks and 4 issue slots (no packed transcendentals).
//  - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed).
// Half-precision forms of the parabolic sine x*|x|-x (AND,FMA when packed).
AH1 APSinH1(AH1 x)
{
  AH1 m=abs(x);
  return x*m-x;
}
AH2 APSinH2(AH2 x)
{
  AH2 m=abs(x);
  return x*m-x;
}
// Cosine remaps the input with fract(x*0.5+0.75)*2-1, then applies the sine parabola
// (packed form: 3x FMA, 2xFRACT, AND).
AH1 APCosH1(AH1 x)
{
  AH1 t=AFractH1(x*AH1_(0.5)+AH1_(0.75));
  t=t*AH1_(2.0)-AH1_(1.0);
  return APSinH1(t);
}
AH2 APCosH2(AH2 x)
{
  AH2 t=AFractH2(x*AH2_(0.5)+AH2_(0.75));
  t=t*AH2_(2.0)-AH2_(1.0);
  return APSinH2(t);
}
// Returns {sin,cos} of the same input via a single packed APSinH2 call.
AH2 APSinCosH1(AH1 x)
{
  AH1 c=AFractH1(x*AH1_(0.5)+AH1_(0.75));
  c=c*AH1_(2.0)-AH1_(1.0);
  return APSinH2(AH2(x,c));
}
1964   #endif
1965  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1966  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1967  //_____________________________________________________________/\_______________________________________________________________
1968  //==============================================================================================================================
1969  //                                                     [ZOL] ZERO ONE LOGIC
1970  //------------------------------------------------------------------------------------------------------------------------------
1971  // Conditional free logic designed for easy 16-bit packing, and backwards porting to 32-bit.
1972  //------------------------------------------------------------------------------------------------------------------------------
1973  // 0 := false
1974  // 1 := true
1975  //------------------------------------------------------------------------------------------------------------------------------
1976  // AndNot(x,y)   -> !(x&y) .... One op.
1977  // AndOr(x,y,z)  -> (x&y)|z ... One op.
1978  // GtZero(x)     -> x>0.0 ..... One op.
1979  // Sel(x,y,z)    -> x?y:z ..... Two ops, has no precision loss.
1980  // Signed(x)     -> x<0.0 ..... One op.
1981  // ZeroPass(x,y) -> x?0:y ..... Two ops, 'y' is a pass through safe for aliasing as integer.
1982  //------------------------------------------------------------------------------------------------------------------------------
1983  // OPTIMIZATION NOTES
1984  // ==================
1985  // - On Vega to use 2 constants in a packed op, pass in as one AW2 or one AH2 'k.xy' and use as 'k.xx' and 'k.yy'.
1986  //   For example 'a.xy*k.xx+k.yy'.
1987  //==============================================================================================================================
1988   #if 1
// ZOL AND on unsigned {0,1} operands: the component-wise minimum.
AU1 AZolAndU1(AU1 x,AU1 y)
{
  return min(x,y);
}
AU2 AZolAndU2(AU2 x,AU2 y)
{
  return min(x,y);
}
AU3 AZolAndU3(AU3 x,AU3 y)
{
  return min(x,y);
}
AU4 AZolAndU4(AU4 x,AU4 y)
{
  return min(x,y);
}
1993  //------------------------------------------------------------------------------------------------------------------------------
// ZOL NOT on unsigned {0,1} operands: flips the lowest bit with XOR 1.
AU1 AZolNotU1(AU1 x)
{
  return x^AU1_(1);
}
AU2 AZolNotU2(AU2 x)
{
  return x^AU2_(1);
}
AU3 AZolNotU3(AU3 x)
{
  return x^AU3_(1);
}
AU4 AZolNotU4(AU4 x)
{
  return x^AU4_(1);
}
1998  //------------------------------------------------------------------------------------------------------------------------------
// ZOL OR on unsigned {0,1} operands: the component-wise maximum.
AU1 AZolOrU1(AU1 x,AU1 y)
{
  return max(x,y);
}
AU2 AZolOrU2(AU2 x,AU2 y)
{
  return max(x,y);
}
AU3 AZolOrU3(AU3 x,AU3 y)
{
  return max(x,y);
}
AU4 AZolOrU4(AU4 x,AU4 y)
{
  return max(x,y);
}
2003  //==============================================================================================================================
// Convert a float ZOL value to its unsigned form with a plain numeric conversion.
AU1 AZolF1ToU1(AF1 x)
{
  return AU1(x);
}
AU2 AZolF2ToU2(AF2 x)
{
  return AU2(x);
}
AU3 AZolF3ToU3(AF3 x)
{
  return AU3(x);
}
AU4 AZolF4ToU4(AF4 x)
{
  return AU4(x);
}
2008  //------------------------------------------------------------------------------------------------------------------------------
// Negate a float ZOL value (1.0-x) and convert the result to unsigned.
// 2 ops, denormals don't work in 32-bit on PC (and if they are enabled, OMOD is disabled).
AU1 AZolNotF1ToU1(AF1 x)
{
  AF1 n=AF1_(1.0)-x;
  return AU1(n);
}
AU2 AZolNotF2ToU2(AF2 x)
{
  AF2 n=AF2_(1.0)-x;
  return AU2(n);
}
AU3 AZolNotF3ToU3(AF3 x)
{
  AF3 n=AF3_(1.0)-x;
  return AU3(n);
}
AU4 AZolNotF4ToU4(AF4 x)
{
  AF4 n=AF4_(1.0)-x;
  return AU4(n);
}
2014  //------------------------------------------------------------------------------------------------------------------------------
// Convert an unsigned ZOL value to its float form with a plain numeric conversion.
AF1 AZolU1ToF1(AU1 x)
{
  return AF1(x);
}
AF2 AZolU2ToF2(AU2 x)
{
  return AF2(x);
}
AF3 AZolU3ToF3(AU3 x)
{
  return AF3(x);
}
AF4 AZolU4ToF4(AU4 x)
{
  return AF4(x);
}
2019  //==============================================================================================================================
// ZOL AND on float {0.0,1.0} operands: the component-wise minimum.
AF1 AZolAndF1(AF1 x,AF1 y)
{
  return min(x,y);
}
AF2 AZolAndF2(AF2 x,AF2 y)
{
  return min(x,y);
}
AF3 AZolAndF3(AF3 x,AF3 y)
{
  return min(x,y);
}
AF4 AZolAndF4(AF4 x,AF4 y)
{
  return min(x,y);
}
2024  //------------------------------------------------------------------------------------------------------------------------------
// AndNot(x,y) -> !(x&y), computed as 1-x*y in one multiply-add.
// The 'ASol' spelling (rather than 'AZol') is kept exactly as declared so existing callers resolve.
AF1 ASolAndNotF1(AF1 x,AF1 y)
{
  AF1 nx=-x;
  return nx*y+AF1_(1.0);
}
AF2 ASolAndNotF2(AF2 x,AF2 y)
{
  AF2 nx=-x;
  return nx*y+AF2_(1.0);
}
AF3 ASolAndNotF3(AF3 x,AF3 y)
{
  AF3 nx=-x;
  return nx*y+AF3_(1.0);
}
AF4 ASolAndNotF4(AF4 x,AF4 y)
{
  AF4 nx=-x;
  return nx*y+AF4_(1.0);
}
2029  //------------------------------------------------------------------------------------------------------------------------------
// AndOr(x,y,z) -> (x&y)|z, computed as the saturated multiply-add x*y+z.
AF1 AZolAndOrF1(AF1 x,AF1 y,AF1 z)
{
  AF1 s=x*y+z;
  return ASatF1(s);
}
AF2 AZolAndOrF2(AF2 x,AF2 y,AF2 z)
{
  AF2 s=x*y+z;
  return ASatF2(s);
}
AF3 AZolAndOrF3(AF3 x,AF3 y,AF3 z)
{
  AF3 s=x*y+z;
  return ASatF3(s);
}
AF4 AZolAndOrF4(AF4 x,AF4 y,AF4 z)
{
  AF4 s=x*y+z;
  return ASatF4(s);
}
2034  //------------------------------------------------------------------------------------------------------------------------------
// GtZero(x) -> x>0.0: scale by A_INFP_F (presumably +infinity; defined elsewhere in the file) and saturate.
AF1 AZolGtZeroF1(AF1 x)
{
  AF1 s=x*AF1_(A_INFP_F);
  return ASatF1(s);
}
AF2 AZolGtZeroF2(AF2 x)
{
  AF2 s=x*AF2_(A_INFP_F);
  return ASatF2(s);
}
AF3 AZolGtZeroF3(AF3 x)
{
  AF3 s=x*AF3_(A_INFP_F);
  return ASatF3(s);
}
AF4 AZolGtZeroF4(AF4 x)
{
  AF4 s=x*AF4_(A_INFP_F);
  return ASatF4(s);
}
2039  //------------------------------------------------------------------------------------------------------------------------------
// ZOL NOT on float {0.0,1.0} operands: 1.0-x.
AF1 AZolNotF1(AF1 x)
{
  return AF1_(1.0)-x;
}
AF2 AZolNotF2(AF2 x)
{
  return AF2_(1.0)-x;
}
AF3 AZolNotF3(AF3 x)
{
  return AF3_(1.0)-x;
}
AF4 AZolNotF4(AF4 x)
{
  return AF4_(1.0)-x;
}
2044  //------------------------------------------------------------------------------------------------------------------------------
// ZOL OR on float {0.0,1.0} operands: the component-wise maximum.
AF1 AZolOrF1(AF1 x,AF1 y)
{
  return max(x,y);
}
AF2 AZolOrF2(AF2 x,AF2 y)
{
  return max(x,y);
}
AF3 AZolOrF3(AF3 x,AF3 y)
{
  return max(x,y);
}
AF4 AZolOrF4(AF4 x,AF4 y)
{
  return max(x,y);
}
2049  //------------------------------------------------------------------------------------------------------------------------------
// Sel(x,y,z) -> x?y:z in two multiply-adds: first z-x*z (the 'z' term, zero when x is 1), then x*y added on top.
AF1 AZolSelF1(AF1 x,AF1 y,AF1 z)
{
  AF1 keepZ=(-x)*z+z;
  return x*y+keepZ;
}
AF2 AZolSelF2(AF2 x,AF2 y,AF2 z)
{
  AF2 keepZ=(-x)*z+z;
  return x*y+keepZ;
}
AF3 AZolSelF3(AF3 x,AF3 y,AF3 z)
{
  AF3 keepZ=(-x)*z+z;
  return x*y+keepZ;
}
AF4 AZolSelF4(AF4 x,AF4 y,AF4 z)
{
  AF4 keepZ=(-x)*z+z;
  return x*y+keepZ;
}
2054  //------------------------------------------------------------------------------------------------------------------------------
// Signed(x) -> x<0.0: scale by A_INFN_F (presumably -infinity; defined elsewhere in the file) and saturate.
AF1 AZolSignedF1(AF1 x)
{
  AF1 s=x*AF1_(A_INFN_F);
  return ASatF1(s);
}
AF2 AZolSignedF2(AF2 x)
{
  AF2 s=x*AF2_(A_INFN_F);
  return ASatF2(s);
}
AF3 AZolSignedF3(AF3 x)
{
  AF3 s=x*AF3_(A_INFN_F);
  return ASatF3(s);
}
AF4 AZolSignedF4(AF4 x)
{
  AF4 s=x*AF4_(A_INFN_F);
  return ASatF4(s);
}
2059  //------------------------------------------------------------------------------------------------------------------------------
// ZeroPass(x,y) -> x?0:y. 'x' counts as true when its raw bit pattern is non-zero.
// 'y' travels as an integer bit pattern, so it passes through unchanged when selected.
AF1 AZolZeroPassF1(AF1 x,AF1 y)
{
  return AF1_AU1((AU1_AF1(x)!=AU1_(0))?AU1_(0):AU1_AF1(y));
}
// Vector forms select per component by routing every lane through the scalar form.
// A vector '!=' feeding '?:' is not component-wise in GLSL: '!=' on vectors yields a single
// scalar bool, so one non-zero lane of 'x' would zero the entire result.
AF2 AZolZeroPassF2(AF2 x,AF2 y)
{
  return AF2(AZolZeroPassF1(x.x,y.x),
             AZolZeroPassF1(x.y,y.y));
}
AF3 AZolZeroPassF3(AF3 x,AF3 y)
{
  return AF3(AZolZeroPassF1(x.x,y.x),
             AZolZeroPassF1(x.y,y.y),
             AZolZeroPassF1(x.z,y.z));
}
AF4 AZolZeroPassF4(AF4 x,AF4 y)
{
  return AF4(AZolZeroPassF1(x.x,y.x),
             AZolZeroPassF1(x.y,y.y),
             AZolZeroPassF1(x.z,y.z),
             AZolZeroPassF1(x.w,y.w));
}
2064   #endif
2065  //==============================================================================================================================
2066   #ifdef A_HALF
// ZOL AND on 16-bit unsigned {0,1} operands: the component-wise minimum.
AW1 AZolAndW1(AW1 x,AW1 y)
{
  return min(x,y);
}
AW2 AZolAndW2(AW2 x,AW2 y)
{
  return min(x,y);
}
AW3 AZolAndW3(AW3 x,AW3 y)
{
  return min(x,y);
}
AW4 AZolAndW4(AW4 x,AW4 y)
{
  return min(x,y);
}
2071  //------------------------------------------------------------------------------------------------------------------------------
// ZOL NOT on 16-bit unsigned {0,1} operands: flips the lowest bit with XOR 1.
AW1 AZolNotW1(AW1 x)
{
  return x^AW1_(1);
}
AW2 AZolNotW2(AW2 x)
{
  return x^AW2_(1);
}
AW3 AZolNotW3(AW3 x)
{
  return x^AW3_(1);
}
AW4 AZolNotW4(AW4 x)
{
  return x^AW4_(1);
}
2076  //------------------------------------------------------------------------------------------------------------------------------
// ZOL OR on 16-bit unsigned {0,1} operands: the component-wise maximum.
AW1 AZolOrW1(AW1 x,AW1 y)
{
  return max(x,y);
}
AW2 AZolOrW2(AW2 x,AW2 y)
{
  return max(x,y);
}
AW3 AZolOrW3(AW3 x,AW3 y)
{
  return max(x,y);
}
AW4 AZolOrW4(AW4 x,AW4 y)
{
  return max(x,y);
}
2081  //==============================================================================================================================
// Half ZOL value to 16-bit integer. Uses denormal trick: 'x' is multiplied by the half whose raw
// bit pattern is integer 1, and the product's bits are read back as the integer result.
AW1 AZolH1ToW1(AH1 x)
{
  AH1 tiny=AH1_AW1(AW1_(1));
  return AW1_AH1(x*tiny);
}
AW2 AZolH2ToW2(AH2 x)
{
  AH2 tiny=AH2_AW2(AW2_(1));
  return AW2_AH2(x*tiny);
}
AW3 AZolH3ToW3(AH3 x)
{
  AH3 tiny=AH3_AW3(AW3_(1));
  return AW3_AH3(x*tiny);
}
AW4 AZolH4ToW4(AH4 x)
{
  AH4 tiny=AH4_AW4(AW4_(1));
  return AW4_AH4(x*tiny);
}
2087  //------------------------------------------------------------------------------------------------------------------------------
// 16-bit integer ZOL value to half. AMD arch lacks a packed conversion opcode, so the integer is
// multiplied by the raw bit pattern of half 1.0 and the product is reinterpreted as a half.
AH1 AZolW1ToH1(AW1 x)
{
  return AH1_AW1(x*AW1_AH1(AH1_(1.0)));
}
AH2 AZolW2ToH2(AW2 x)
{
  return AH2_AW2(x*AW2_AH2(AH2_(1.0)));
}
// Three- and four-component forms under the names that match AZolH3ToW3/AZolH4ToW4.
AH3 AZolW3ToH3(AW3 x)
{
  return AH3_AW3(x*AW3_AH3(AH3_(1.0)));
}
AH4 AZolW4ToH4(AW4 x)
{
  return AH4_AW4(x*AW4_AH4(AH4_(1.0)));
}
// Original (misnumbered) spellings, retained so existing callers keep compiling.
AH3 AZolW1ToH3(AW3 x)
{
  return AZolW3ToH3(x);
}
AH4 AZolW2ToH4(AW4 x)
{
  return AZolW4ToH4(x);
}
2093  //==============================================================================================================================
// ZOL AND on half {0.0,1.0} operands: the component-wise minimum.
AH1 AZolAndH1(AH1 x,AH1 y)
{
  return min(x,y);
}
AH2 AZolAndH2(AH2 x,AH2 y)
{
  return min(x,y);
}
AH3 AZolAndH3(AH3 x,AH3 y)
{
  return min(x,y);
}
AH4 AZolAndH4(AH4 x,AH4 y)
{
  return min(x,y);
}
2098  //------------------------------------------------------------------------------------------------------------------------------
// AndNot(x,y) -> !(x&y) for half operands, computed as 1-x*y in one multiply-add.
// The 'ASol' spelling (rather than 'AZol') is kept exactly as declared so existing callers resolve.
AH1 ASolAndNotH1(AH1 x,AH1 y)
{
  AH1 nx=-x;
  return nx*y+AH1_(1.0);
}
AH2 ASolAndNotH2(AH2 x,AH2 y)
{
  AH2 nx=-x;
  return nx*y+AH2_(1.0);
}
AH3 ASolAndNotH3(AH3 x,AH3 y)
{
  AH3 nx=-x;
  return nx*y+AH3_(1.0);
}
AH4 ASolAndNotH4(AH4 x,AH4 y)
{
  AH4 nx=-x;
  return nx*y+AH4_(1.0);
}
2103  //------------------------------------------------------------------------------------------------------------------------------
// AndOr(x,y,z) -> (x&y)|z for half operands, computed as the saturated multiply-add x*y+z.
AH1 AZolAndOrH1(AH1 x,AH1 y,AH1 z)
{
  AH1 s=x*y+z;
  return ASatH1(s);
}
AH2 AZolAndOrH2(AH2 x,AH2 y,AH2 z)
{
  AH2 s=x*y+z;
  return ASatH2(s);
}
AH3 AZolAndOrH3(AH3 x,AH3 y,AH3 z)
{
  AH3 s=x*y+z;
  return ASatH3(s);
}
AH4 AZolAndOrH4(AH4 x,AH4 y,AH4 z)
{
  AH4 s=x*y+z;
  return ASatH4(s);
}
2108  //------------------------------------------------------------------------------------------------------------------------------
// GtZero(x) -> x>0.0 for half operands: scale by A_INFP_H (presumably +infinity; defined elsewhere) and saturate.
AH1 AZolGtZeroH1(AH1 x)
{
  AH1 s=x*AH1_(A_INFP_H);
  return ASatH1(s);
}
AH2 AZolGtZeroH2(AH2 x)
{
  AH2 s=x*AH2_(A_INFP_H);
  return ASatH2(s);
}
AH3 AZolGtZeroH3(AH3 x)
{
  AH3 s=x*AH3_(A_INFP_H);
  return ASatH3(s);
}
AH4 AZolGtZeroH4(AH4 x)
{
  AH4 s=x*AH4_(A_INFP_H);
  return ASatH4(s);
}
2113  //------------------------------------------------------------------------------------------------------------------------------
// ZOL NOT on half {0.0,1.0} operands: 1.0-x.
AH1 AZolNotH1(AH1 x)
{
  return AH1_(1.0)-x;
}
AH2 AZolNotH2(AH2 x)
{
  return AH2_(1.0)-x;
}
AH3 AZolNotH3(AH3 x)
{
  return AH3_(1.0)-x;
}
AH4 AZolNotH4(AH4 x)
{
  return AH4_(1.0)-x;
}
2118  //------------------------------------------------------------------------------------------------------------------------------
// ZOL OR on half {0.0,1.0} operands: the component-wise maximum.
AH1 AZolOrH1(AH1 x,AH1 y)
{
  return max(x,y);
}
AH2 AZolOrH2(AH2 x,AH2 y)
{
  return max(x,y);
}
AH3 AZolOrH3(AH3 x,AH3 y)
{
  return max(x,y);
}
AH4 AZolOrH4(AH4 x,AH4 y)
{
  return max(x,y);
}
2123  //------------------------------------------------------------------------------------------------------------------------------
// Sel(x,y,z) -> x?y:z for half operands in two multiply-adds: z-x*z first, then x*y added on top.
AH1 AZolSelH1(AH1 x,AH1 y,AH1 z)
{
  AH1 keepZ=(-x)*z+z;
  return x*y+keepZ;
}
AH2 AZolSelH2(AH2 x,AH2 y,AH2 z)
{
  AH2 keepZ=(-x)*z+z;
  return x*y+keepZ;
}
AH3 AZolSelH3(AH3 x,AH3 y,AH3 z)
{
  AH3 keepZ=(-x)*z+z;
  return x*y+keepZ;
}
AH4 AZolSelH4(AH4 x,AH4 y,AH4 z)
{
  AH4 keepZ=(-x)*z+z;
  return x*y+keepZ;
}
2128  //------------------------------------------------------------------------------------------------------------------------------
// Signed(x) -> x<0.0 for half operands: scale by A_INFN_H (presumably -infinity; defined elsewhere) and saturate.
AH1 AZolSignedH1(AH1 x)
{
  AH1 s=x*AH1_(A_INFN_H);
  return ASatH1(s);
}
AH2 AZolSignedH2(AH2 x)
{
  AH2 s=x*AH2_(A_INFN_H);
  return ASatH2(s);
}
AH3 AZolSignedH3(AH3 x)
{
  AH3 s=x*AH3_(A_INFN_H);
  return ASatH3(s);
}
AH4 AZolSignedH4(AH4 x)
{
  AH4 s=x*AH4_(A_INFN_H);
  return ASatH4(s);
}
2133   #endif
2134  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2135  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2136  //_____________________________________________________________/\_______________________________________________________________
2137  //==============================================================================================================================
2138  //                                                      COLOR CONVERSIONS
2139  //------------------------------------------------------------------------------------------------------------------------------
2140  // These are all linear to/from some other space (where 'linear' has been shortened out of the function name).
2141  // So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'.
2142  // These are branch free implementations.
2143  // The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion.
2144  //------------------------------------------------------------------------------------------------------------------------------
2145  // TRANSFER FUNCTIONS
2146  // ==================
2147  // 709 ..... Rec709 used for some HDTVs
2148  // Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native
2149  // Pq ...... PQ native for HDR10
2150  // Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type
2151  // Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations)
2152  // Three ... Gamma 3.0, less fast, but good for HDR.
2153  //------------------------------------------------------------------------------------------------------------------------------
2154  // KEEPING TO SPEC
2155  // ===============
2156  // Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times.
2157  //  (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range).
2158  //  (b.) For 8-bit  709, steps {0 to 20.7} are in the linear region (8% of the encoding range).
2159  // Also there is a slight step in the transition regions.
2160  // Precision of the coefficients in the spec being the likely cause.
2161  // Main usage case of the sRGB code is to do the linear->sRGB conversion in a compute shader before store.
2162  // This is to work around lack of hardware (typically only ROP does the conversion for free).
2163  // To "correct" the linear segment, would be to introduce error, because hardware decode of sRGB->linear is fixed (and free).
2164  // So this header keeps with the spec.
2165  // For linear->sRGB transforms, the linear segment in some respects reduces error, because rounding in that region is linear.
2166  // Rounding in the curved region in hardware (and fast software code) introduces error due to rounding in non-linear.
2167  //------------------------------------------------------------------------------------------------------------------------------
2168  // FOR PQ
2169  // ======
2170  // Both input and output is {0.0-1.0}, and where output 1.0 represents 10000.0 cd/m^2.
2171  // All constants are only specified to FP32 precision.
2172  // External PQ source reference,
2173  //  - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl
2174  //------------------------------------------------------------------------------------------------------------------------------
2175  // PACKED VERSIONS
2176  // ===============
2177  // These are the A*H2() functions.
2178  // There are no PQ functions as FP16 seemed to not have enough precision for the conversion.
2179  // The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors.
2180  // Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least).
2181  //------------------------------------------------------------------------------------------------------------------------------
2182  // NOTES
2183  // =====
2184  // Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case.
2185  //==============================================================================================================================
2186   #if 1
// Linear -> Rec.709, FP32.
// Result is the median of {0.018*4.5, c*4.5, 1.099*pow(c,0.45)-0.099}: the linear
// segment below the knee and the power curve above it.
// The median is spelled out with min()/max() instead of clamp(x,lo,hi), because GLSL
// leaves clamp() undefined when lo>hi, and here lo=c*4.5 exceeds the curve term for
// most inputs (e.g. c=0.5 gives 2.25 against roughly 0.71).
  AF1 ATo709F1(AF1 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
   AF1 lin=c*j.y;AF1 crv=pow(c,j.z)*k.x+k.y;
   return max(min(j.x,lin),min(max(j.x,lin),crv));}
  AF2 ATo709F2(AF2 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
   AF2 lin=c*j.yy;AF2 crv=pow(c,j.zz)*k.xx+k.yy;
   return max(min(j.xx,lin),min(max(j.xx,lin),crv));}
  AF3 ATo709F3(AF3 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
   AF3 lin=c*j.yyy;AF3 crv=pow(c,j.zzz)*k.xxx+k.yyy;
   return max(min(j.xxx,lin),min(max(j.xxx,lin),crv));}
2193  //------------------------------------------------------------------------------------------------------------------------------
// Linear -> gamma, FP32: raises c to the power rcpX per component.
// 'rcpX' is '1/x', where 'x' is the exponent that would be passed to AFromGamma*().
  AF1 AToGammaF1(AF1 c,AF1 rcpX){AF1 e=AF1_(rcpX);return pow(c,e);}
  AF2 AToGammaF2(AF2 c,AF1 rcpX){AF2 e=AF2_(rcpX);return pow(c,e);}
  AF3 AToGammaF3(AF3 c,AF1 rcpX){AF3 e=AF3_(rcpX);return pow(c,e);}
2198  //------------------------------------------------------------------------------------------------------------------------------
// Linear -> PQ, FP32. Input and output are both {0.0-1.0}.
// Computes pow((0.835938+18.8516*p)/(1.0+18.6875*p),78.8438) with p=pow(x,0.159302).
// The vector forms are named with the F2/F3 suffix used by the rest of this family.
  AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302));
   return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));}
  AF2 AToPqF2(AF2 x){AF2 p=pow(x,AF2_(0.159302));
   return pow((AF2_(0.835938)+AF2_(18.8516)*p)/(AF2_(1.0)+AF2_(18.6875)*p),AF2_(78.8438));}
  AF3 AToPqF3(AF3 x){AF3 p=pow(x,AF3_(0.159302));
   return pow((AF3_(0.835938)+AF3_(18.8516)*p)/(AF3_(1.0)+AF3_(18.6875)*p),AF3_(78.8438));}
// Original vector overloads carried the F1 suffix; kept so existing callers still resolve.
  AF2 AToPqF1(AF2 x){return AToPqF2(x);}
  AF3 AToPqF1(AF3 x){return AToPqF3(x);}
2205  //------------------------------------------------------------------------------------------------------------------------------
// Linear -> sRGB, FP32.
// Result is the median of {0.0031308*12.92, c*12.92, 1.055*pow(c,1/2.4)-0.055}: the
// linear segment below the knee and the power curve above it.
// The median is spelled out with min()/max() instead of clamp(x,lo,hi), because GLSL
// leaves clamp() undefined when lo>hi, and here lo=c*12.92 exceeds the curve term for
// most inputs (e.g. c=0.5 gives 6.46 against roughly 0.74).
  AF1 AToSrgbF1(AF1 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
   AF1 lin=c*j.y;AF1 crv=pow(c,j.z)*k.x+k.y;
   return max(min(j.x,lin),min(max(j.x,lin),crv));}
  AF2 AToSrgbF2(AF2 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
   AF2 lin=c*j.yy;AF2 crv=pow(c,j.zz)*k.xx+k.yy;
   return max(min(j.xx,lin),min(max(j.xx,lin),crv));}
  AF3 AToSrgbF3(AF3 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
   AF3 lin=c*j.yyy;AF3 crv=pow(c,j.zzz)*k.xxx+k.yyy;
   return max(min(j.xxx,lin),min(max(j.xxx,lin),crv));}
2212  //------------------------------------------------------------------------------------------------------------------------------
// Linear -> gamma 2.0, FP32: component-wise square root.
  AF1 AToTwoF1(AF1 c){AF1 root=sqrt(c);return root;}
  AF2 AToTwoF2(AF2 c){AF2 root=sqrt(c);return root;}
  AF3 AToTwoF3(AF3 c){AF3 root=sqrt(c);return root;}
2216  //------------------------------------------------------------------------------------------------------------------------------
// Linear -> gamma 3.0, FP32: raises c to the power 1/3 per component.
  AF1 AToThreeF1(AF1 c){AF1 e=AF1_(1.0/3.0);return pow(c,e);}
  AF2 AToThreeF2(AF2 c){AF2 e=AF2_(1.0/3.0);return pow(c,e);}
  AF3 AToThreeF3(AF3 c){AF3 e=AF3_(1.0/3.0);return pow(c,e);}
2220   #endif
2221  //==============================================================================================================================
2222   #if 1
// Rec.709 -> linear, FP32.
// Unfortunately median won't work here, so the segment is picked with AZolSel*():
// c*(1/4.5) when c is below the knee, pow((c+0.099)/1.099,1/0.45) otherwise.
// The knee j.x is compared against the ENCODED input c, so it must be 0.081
// (=0.018*4.5, the value ATo709*() produces at the knee), not the linear-domain 0.018.
  AF1 AFrom709F1(AF1 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
   return AZolSelF1(AZolSignedF1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
  AF2 AFrom709F2(AF2 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
   return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
  AF3 AFrom709F3(AF3 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
   return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
2230  //------------------------------------------------------------------------------------------------------------------------------
// Gamma -> linear, FP32: raises c to the power x per component.
  AF1 AFromGammaF1(AF1 c,AF1 x){AF1 e=AF1_(x);return pow(c,e);}
  AF2 AFromGammaF2(AF2 c,AF1 x){AF2 e=AF2_(x);return pow(c,e);}
  AF3 AFromGammaF3(AF3 c,AF1 x){AF3 e=AF3_(x);return pow(c,e);}
2234  //------------------------------------------------------------------------------------------------------------------------------
// PQ -> linear, FP32. Input and output are both {0.0-1.0}.
// Computes pow(sat(p-0.835938)/(18.8516-18.6875*p),6.27739) with p=pow(x,0.0126833).
// The vector forms are named with the F2/F3 suffix used by the rest of this family.
  AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833));
   return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));}
  AF2 AFromPqF2(AF2 x){AF2 p=pow(x,AF2_(0.0126833));
   return pow(ASatF2(p-AF2_(0.835938))/(AF2_(18.8516)-AF2_(18.6875)*p),AF2_(6.27739));}
  AF3 AFromPqF3(AF3 x){AF3 p=pow(x,AF3_(0.0126833));
   return pow(ASatF3(p-AF3_(0.835938))/(AF3_(18.8516)-AF3_(18.6875)*p),AF3_(6.27739));}
// Original vector overloads carried the F1 suffix; kept so existing callers still resolve.
  AF2 AFromPqF1(AF2 x){return AFromPqF2(x);}
  AF3 AFromPqF1(AF3 x){return AFromPqF3(x);}
2241  //------------------------------------------------------------------------------------------------------------------------------
// sRGB -> linear, FP32.
// Unfortunately median won't work here, so the segment is picked with AZolSel*():
// c*(1/12.92) when c is below the knee, pow((c+0.055)/1.055,2.4) otherwise.
// The knee j.x is compared against the ENCODED input c, so it must be 0.04045,
// not the linear-domain 0.04045/12.92.
  AF1 AFromSrgbF1(AF1 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
   return AZolSelF1(AZolSignedF1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
  AF2 AFromSrgbF2(AF2 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
   return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
  AF3 AFromSrgbF3(AF3 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
   return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
2249  //------------------------------------------------------------------------------------------------------------------------------
// Gamma 2.0 -> linear, FP32: component-wise square.
  AF1 AFromTwoF1(AF1 c){AF1 sq=c*c;return sq;}
  AF2 AFromTwoF2(AF2 c){AF2 sq=c*c;return sq;}
  AF3 AFromTwoF3(AF3 c){AF3 sq=c*c;return sq;}
2253  //------------------------------------------------------------------------------------------------------------------------------
// Gamma 3.0 -> linear, FP32: component-wise cube, evaluated as (c*c)*c.
  AF1 AFromThreeF1(AF1 c){AF1 sq=c*c;return sq*c;}
  AF2 AFromThreeF2(AF2 c){AF2 sq=c*c;return sq*c;}
  AF3 AFromThreeF3(AF3 c){AF3 sq=c*c;return sq*c;}
2257   #endif
2258  //==============================================================================================================================
2259   #ifdef A_HALF
// Linear -> Rec.709, half precision.
// Result is the median of {0.018*4.5, c*4.5, 1.099*pow(c,0.45)-0.099}.
// The median is spelled out with min()/max() instead of clamp(x,lo,hi), because GLSL
// leaves clamp() undefined when lo>hi, and here lo=c*4.5 exceeds the curve term for
// most inputs.
  AH1 ATo709H1(AH1 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
   AH1 lin=c*j.y;AH1 crv=pow(c,j.z)*k.x+k.y;
   return max(min(j.x,lin),min(max(j.x,lin),crv));}
  AH2 ATo709H2(AH2 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
   AH2 lin=c*j.yy;AH2 crv=pow(c,j.zz)*k.xx+k.yy;
   return max(min(j.xx,lin),min(max(j.xx,lin),crv));}
  AH3 ATo709H3(AH3 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
   AH3 lin=c*j.yyy;AH3 crv=pow(c,j.zzz)*k.xxx+k.yyy;
   return max(min(j.xxx,lin),min(max(j.xxx,lin),crv));}
2266  //------------------------------------------------------------------------------------------------------------------------------
// Linear -> gamma, half precision: raises c to the power rcpX per component.
  AH1 AToGammaH1(AH1 c,AH1 rcpX){AH1 e=AH1_(rcpX);return pow(c,e);}
  AH2 AToGammaH2(AH2 c,AH1 rcpX){AH2 e=AH2_(rcpX);return pow(c,e);}
  AH3 AToGammaH3(AH3 c,AH1 rcpX){AH3 e=AH3_(rcpX);return pow(c,e);}
2270  //------------------------------------------------------------------------------------------------------------------------------
// Linear -> sRGB, half precision.
// Result is the median of {0.0031308*12.92, c*12.92, 1.055*pow(c,1/2.4)-0.055}.
// The median is spelled out with min()/max() instead of clamp(x,lo,hi), because GLSL
// leaves clamp() undefined when lo>hi, and here lo=c*12.92 exceeds the curve term for
// most inputs.
  AH1 AToSrgbH1(AH1 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
   AH1 lin=c*j.y;AH1 crv=pow(c,j.z)*k.x+k.y;
   return max(min(j.x,lin),min(max(j.x,lin),crv));}
  AH2 AToSrgbH2(AH2 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
   AH2 lin=c*j.yy;AH2 crv=pow(c,j.zz)*k.xx+k.yy;
   return max(min(j.xx,lin),min(max(j.xx,lin),crv));}
  AH3 AToSrgbH3(AH3 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
   AH3 lin=c*j.yyy;AH3 crv=pow(c,j.zzz)*k.xxx+k.yyy;
   return max(min(j.xxx,lin),min(max(j.xxx,lin),crv));}
2277  //------------------------------------------------------------------------------------------------------------------------------
// Linear -> gamma 2.0, half precision: component-wise square root.
  AH1 AToTwoH1(AH1 c){AH1 root=sqrt(c);return root;}
  AH2 AToTwoH2(AH2 c){AH2 root=sqrt(c);return root;}
  AH3 AToTwoH3(AH3 c){AH3 root=sqrt(c);return root;}
2281  //------------------------------------------------------------------------------------------------------------------------------
// Linear -> gamma 3.0, half precision: raises c to the power 1/3 per component.
// Named with the H suffix used by every other half-precision conversion here.
  AH1 AToThreeH1(AH1 c){return pow(c,AH1_(1.0/3.0));}
  AH2 AToThreeH2(AH2 c){return pow(c,AH2_(1.0/3.0));}
  AH3 AToThreeH3(AH3 c){return pow(c,AH3_(1.0/3.0));}
// Original half overloads carried the F suffix; kept so existing callers still resolve.
  AH1 AToThreeF1(AH1 c){return AToThreeH1(c);}
  AH2 AToThreeF2(AH2 c){return AToThreeH2(c);}
  AH3 AToThreeF3(AH3 c){return AToThreeH3(c);}
2285   #endif
2286  //==============================================================================================================================
2287   #ifdef A_HALF
// Rec.709 -> linear, half precision.
// AZolSelH*() picks c*(1/4.5) when AZolSignedH*(c-j.x) is 1.0 (c below the knee) and
// pow((c+0.099)/1.099,1/0.45) otherwise.
// The knee j.x is compared against the ENCODED input c, so it must be 0.081
// (=0.018*4.5, the value ATo709*() produces at the knee), not the linear-domain 0.018.
  AH1 AFrom709H1(AH1 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
   return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
  AH2 AFrom709H2(AH2 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
   return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
  AH3 AFrom709H3(AH3 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
   return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
2294  //------------------------------------------------------------------------------------------------------------------------------
// Gamma -> linear, half precision: raises c to the power x per component.
  AH1 AFromGammaH1(AH1 c,AH1 x){AH1 e=AH1_(x);return pow(c,e);}
  AH2 AFromGammaH2(AH2 c,AH1 x){AH2 e=AH2_(x);return pow(c,e);}
  AH3 AFromGammaH3(AH3 c,AH1 x){AH3 e=AH3_(x);return pow(c,e);}
2298  //------------------------------------------------------------------------------------------------------------------------------
// sRGB -> linear, half precision.
// AZolSelH*() picks c*(1/12.92) when AZolSignedH*(c-j.x) is 1.0 (c below the knee) and
// pow((c+0.055)/1.055,2.4) otherwise.
// The knee j.x is compared against the ENCODED input c, so it must be 0.04045,
// not the linear-domain 0.04045/12.92.
  AH1 AFromSrgbH1(AH1 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
   return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
  AH2 AFromSrgbH2(AH2 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
   return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
  AH3 AFromSrgbH3(AH3 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
   return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
// Original misspelled names ("AHrom...F"); kept so existing callers still resolve.
  AH1 AHromSrgbF1(AH1 c){return AFromSrgbH1(c);}
  AH2 AHromSrgbF2(AH2 c){return AFromSrgbH2(c);}
  AH3 AHromSrgbF3(AH3 c){return AFromSrgbH3(c);}
2305  //------------------------------------------------------------------------------------------------------------------------------
// Gamma 2.0 -> linear, half precision: component-wise square.
  AH1 AFromTwoH1(AH1 c){AH1 sq=c*c;return sq;}
  AH2 AFromTwoH2(AH2 c){AH2 sq=c*c;return sq;}
  AH3 AFromTwoH3(AH3 c){AH3 sq=c*c;return sq;}
2309  //------------------------------------------------------------------------------------------------------------------------------
// Gamma 3.0 -> linear, half precision: component-wise cube, evaluated as (c*c)*c.
  AH1 AFromThreeH1(AH1 c){AH1 sq=c*c;return sq*c;}
  AH2 AFromThreeH2(AH2 c){AH2 sq=c*c;return sq*c;}
  AH3 AFromThreeH3(AH3 c){AH3 sq=c*c;return sq*c;}
2313   #endif
2314  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2315  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2316  //_____________________________________________________________/\_______________________________________________________________
2317  //==============================================================================================================================
2318  //                                                          CS REMAP
2319  //==============================================================================================================================
// Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear.
// Bit usage of the 6-bit lane index 'a':
//  543210
//  ======
//  ..xxx.
//  yy...y
// x is ABfe(a,1u,3u); y is ABfe(a,3u,3u) with ABfiM(...,a,1u) applied on top.
 AU2 ARmp8x8(AU1 a){
  AU1 col=ABfe(a,1u,3u);
  AU1 row=ABfiM(ABfe(a,3u,3u),a,1u);
  return AU2(col,row);}
2326  //==============================================================================================================================
// More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions.
// Bit usage of the 6-bit lane index 'a':
//  543210
//  ======
//  .xx..x
//  y..yy.
// Details,
//  LANE TO 8x8 MAPPING
//  ===================
//  00 01 08 09 10 11 18 19
//  02 03 0a 0b 12 13 1a 1b
//  04 05 0c 0d 14 15 1c 1d
//  06 07 0e 0f 16 17 1e 1f
//  20 21 28 29 30 31 38 39
//  22 23 2a 2b 32 33 3a 3b
//  24 25 2c 2d 34 35 3c 3d
//  26 27 2e 2f 36 37 3e 3f
 AU2 ARmpRed8x8(AU1 a){
  AU1 col=ABfiM(ABfe(a,2u,3u),a,1u);
  AU1 row=ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u);
  return AU2(col,row);}
2344  //==============================================================================================================================
2345   #ifdef A_HALF
// AW2-returning variants of ARmp8x8() and ARmpRed8x8(); same bit-field expressions,
// with the two coordinates passed to the AW2 constructor.
  AW2 ARmp8x8H(AU1 a){
   AU1 col=ABfe(a,1u,3u);
   AU1 row=ABfiM(ABfe(a,3u,3u),a,1u);
   return AW2(col,row);}
  AW2 ARmpRed8x8H(AU1 a){
   AU1 col=ABfiM(ABfe(a,2u,3u),a,1u);
   AU1 row=ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u);
   return AW2(col,row);}
2348   #endif
2349  #endif
2350  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2351  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2352  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2353  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2354  //_____________________________________________________________/\_______________________________________________________________
2355  //==============================================================================================================================
2356  //
2357  //                                                          REFERENCE
2358  //
2359  //------------------------------------------------------------------------------------------------------------------------------
2360  // IEEE FLOAT RULES
2361  // ================
2362  //  - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1
2363  //  - {+/-}0 * {+/-}INF = NaN
2364  //  - -INF + (+INF) = NaN
2365  //  - {+/-}0 / {+/-}0 = NaN
2366  //  - {+/-}INF / {+/-}INF = NaN
2367  //  - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN)
2368  //  - 0 == -0
2369  //  - 4/0 = +INF
2370  //  - 4/-0 = -INF
2371  //  - 4+INF = +INF
2372  //  - 4-INF = -INF
2373  //  - 4*(+INF) = +INF
2374  //  - 4*(-INF) = -INF
2375  //  - -4*(+INF) = -INF
2376  //  - sqrt(+INF) = +INF
2377  //------------------------------------------------------------------------------------------------------------------------------
2378  // FP16 ENCODING
2379  // =============
2380  // fedcba9876543210
2381  // ----------------
2382  // ......mmmmmmmmmm  10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals)
2383  // .eeeee..........  5-bit exponent
2384  // .00000..........  denormals
2385  // .00001..........  -14 exponent
2386  // .11110..........   15 exponent
2387  // .111110000000000  infinity
2388  // .11111nnnnnnnnnn  NaN with n!=0
2389  // s...............  sign
2390  //------------------------------------------------------------------------------------------------------------------------------
2391  // FP16/INT16 ALIASING DENORMAL
2392  // ============================
2393  // 11-bit unsigned integers alias with half float denormal/normal values,
2394  //     1 = 2^(-24) = 1/16777216 ....................... first denormal value
2395  //     2 = 2^(-23)
2396  //   ...
2397  //  1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value
2398  //  1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers
2399  //  2047 .............................................. last normal value that still maps to integers 
2400  // Scaling limits,
2401  //  2^15 = 32768 ...................................... largest power of 2 scaling
2402  // Largest pow2 conversion mapping is at *32768,
2403  //     1 : 2^(-9) = 1/512
2404  //     2 : 1/256
2405  //     4 : 1/128
2406  //     8 : 1/64
2407  //    16 : 1/32
2408  //    32 : 1/16
2409  //    64 : 1/8
2410  //   128 : 1/4
2411  //   256 : 1/2
2412  //   512 : 1
2413  //  1024 : 2
2414  //  2047 : a little less than 4
2415  //==============================================================================================================================
2416  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2417  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2418  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2419  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2420  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2421  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2422  //_____________________________________________________________/\_______________________________________________________________
2423  //==============================================================================================================================
2424  //
2425  //
2426  //                                                     GPU/CPU PORTABILITY
2427  //
2428  //
2429  //------------------------------------------------------------------------------------------------------------------------------
2430  // This is the GPU implementation.
2431  // See the CPU implementation for docs.
2432  //==============================================================================================================================
2433  #ifdef A_GPU
// GPU spellings of the portability keywords: A_TRUE/A_FALSE map to the shader
// language's boolean literals, and A_STATIC expands to nothing.
2434   #define A_TRUE true
2435   #define A_FALSE false
2436   #define A_STATIC
2437  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2438  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2439  //_____________________________________________________________/\_______________________________________________________________
2440  //==============================================================================================================================
2441  //                                     VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
2442  //==============================================================================================================================
// Return-type spellings for vector types. On this GPU path each ret<T> expands to
// the plain vector type <T>.
2443   #define retAD2 AD2
2444   #define retAD3 AD3
2445   #define retAD4 AD4
2446   #define retAF2 AF2
2447   #define retAF3 AF3
2448   #define retAF4 AF4
2449   #define retAL2 AL2
2450   #define retAL3 AL3
2451   #define retAL4 AL4
2452   #define retAU2 AU2
2453   #define retAU3 AU3
2454   #define retAU4 AU4
2455  //------------------------------------------------------------------------------------------------------------------------------
// Input-parameter spellings for vector types. Each in<T> expands to 'in <T>'.
2456   #define inAD2 in AD2
2457   #define inAD3 in AD3
2458   #define inAD4 in AD4
2459   #define inAF2 in AF2
2460   #define inAF3 in AF3
2461   #define inAF4 in AF4
2462   #define inAL2 in AL2
2463   #define inAL3 in AL3
2464   #define inAL4 in AL4
2465   #define inAU2 in AU2
2466   #define inAU3 in AU3
2467   #define inAU4 in AU4
2468  //------------------------------------------------------------------------------------------------------------------------------
// In/out-parameter spellings for vector types. Each inout<T> expands to 'inout <T>'.
2469   #define inoutAD2 inout AD2
2470   #define inoutAD3 inout AD3
2471   #define inoutAD4 inout AD4
2472   #define inoutAF2 inout AF2
2473   #define inoutAF3 inout AF3
2474   #define inoutAF4 inout AF4
2475   #define inoutAL2 inout AL2
2476   #define inoutAL3 inout AL3
2477   #define inoutAL4 inout AL4
2478   #define inoutAU2 inout AU2
2479   #define inoutAU3 inout AU3
2480   #define inoutAU4 inout AU4
2481  //------------------------------------------------------------------------------------------------------------------------------
// Output-parameter spellings for vector types. Each out<T> expands to 'out <T>'.
2482   #define outAD2 out AD2
2483   #define outAD3 out AD3
2484   #define outAD4 out AD4
2485   #define outAF2 out AF2
2486   #define outAF3 out AF3
2487   #define outAF4 out AF4
2488   #define outAL2 out AL2
2489   #define outAL3 out AL3
2490   #define outAL4 out AL4
2491   #define outAU2 out AU2
2492   #define outAU3 out AU3
2493   #define outAU4 out AU4
2494  //------------------------------------------------------------------------------------------------------------------------------
// Variable-declaration spellings for vector types. var<T>(x) expands to '<T> x',
// i.e. a declaration of x with vector type <T>.
2495   #define varAD2(x) AD2 x
2496   #define varAD3(x) AD3 x
2497   #define varAD4(x) AD4 x
2498   #define varAF2(x) AF2 x
2499   #define varAF3(x) AF3 x
2500   #define varAF4(x) AF4 x
2501   #define varAL2(x) AL2 x
2502   #define varAL3(x) AL3 x
2503   #define varAL4(x) AL4 x
2504   #define varAU2(x) AU2 x
2505   #define varAU3(x) AU3 x
2506   #define varAU4(x) AU4 x
2507  //------------------------------------------------------------------------------------------------------------------------------
// Initializer spellings for vector types. init<T>(...) expands to the constructor
// call <T>(...) with the same component arguments.
2508   #define initAD2(x,y) AD2(x,y)
2509   #define initAD3(x,y,z) AD3(x,y,z)
2510   #define initAD4(x,y,z,w) AD4(x,y,z,w)
2511   #define initAF2(x,y) AF2(x,y)
2512   #define initAF3(x,y,z) AF3(x,y,z)
2513   #define initAF4(x,y,z,w) AF4(x,y,z,w)
2514   #define initAL2(x,y) AL2(x,y)
2515   #define initAL3(x,y,z) AL3(x,y,z)
2516   #define initAL4(x,y,z,w) AL4(x,y,z,w)
2517   #define initAU2(x,y) AU2(x,y)
2518   #define initAU3(x,y,z) AU3(x,y,z)
2519   #define initAU4(x,y,z,w) AU4(x,y,z,w)
2520  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2521  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2522  //_____________________________________________________________/\_______________________________________________________________
2523  //==============================================================================================================================
2524  //                                                     SCALAR RETURN OPS
2525  //==============================================================================================================================
// Scalar-returning wrappers over the shader built-ins. The D1/F1 forms convert their
// argument(s) with AD1()/AF1() (or AD2..4()/AF2..4() for the dot products) before
// calling the built-in. The Max/Min forms pass their arguments through unconverted.
2526   #define AAbsD1(a) abs(AD1(a))
2527   #define AAbsF1(a) abs(AF1(a))
2528  //------------------------------------------------------------------------------------------------------------------------------
2529   #define ACosD1(a) cos(AD1(a))
2530   #define ACosF1(a) cos(AF1(a))
2531  //------------------------------------------------------------------------------------------------------------------------------
2532   #define ADotD2(a,b) dot(AD2(a),AD2(b))
2533   #define ADotD3(a,b) dot(AD3(a),AD3(b))
2534   #define ADotD4(a,b) dot(AD4(a),AD4(b))
2535   #define ADotF2(a,b) dot(AF2(a),AF2(b))
2536   #define ADotF3(a,b) dot(AF3(a),AF3(b))
2537   #define ADotF4(a,b) dot(AF4(a),AF4(b))
2538  //------------------------------------------------------------------------------------------------------------------------------
2539   #define AExp2D1(a) exp2(AD1(a))
2540   #define AExp2F1(a) exp2(AF1(a))
2541  //------------------------------------------------------------------------------------------------------------------------------
2542   #define AFloorD1(a) floor(AD1(a))
2543   #define AFloorF1(a) floor(AF1(a))
2544  //------------------------------------------------------------------------------------------------------------------------------
2545   #define ALog2D1(a) log2(AD1(a))
2546   #define ALog2F1(a) log2(AF1(a))
2547  //------------------------------------------------------------------------------------------------------------------------------
2548   #define AMaxD1(a,b) max(a,b)
2549   #define AMaxF1(a,b) max(a,b)
2550   #define AMaxL1(a,b) max(a,b)
2551   #define AMaxU1(a,b) max(a,b)
2552  //------------------------------------------------------------------------------------------------------------------------------
2553   #define AMinD1(a,b) min(a,b)
2554   #define AMinF1(a,b) min(a,b)
2555   #define AMinL1(a,b) min(a,b)
2556   #define AMinU1(a,b) min(a,b)
2557  //------------------------------------------------------------------------------------------------------------------------------
2558   #define ASinD1(a) sin(AD1(a))
2559   #define ASinF1(a) sin(AF1(a))
2560  //------------------------------------------------------------------------------------------------------------------------------
2561   #define ASqrtD1(a) sqrt(AD1(a))
2562   #define ASqrtF1(a) sqrt(AF1(a))
2563  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2564  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2565  //_____________________________________________________________/\_______________________________________________________________
2566  //==============================================================================================================================
2567  //                                               SCALAR RETURN OPS - DEPENDENT
2568  //==============================================================================================================================
// Power wrappers: convert base and exponent, then call pow().
// NOTE(review): APowD1 converts the base with AD1() but the exponent with AF1(),
// so the two arguments have different types - confirm whether AD1(b) was intended.
2569   #define APowD1(a,b) pow(AD1(a),AF1(b))
2570   #define APowF1(a,b) pow(AF1(a),AF1(b))
2571  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2572  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2573  //_____________________________________________________________/\_______________________________________________________________
2574  //==============================================================================================================================
2575  //                                                         VECTOR OPS
2576  //------------------------------------------------------------------------------------------------------------------------------
2577  // These are added as needed for production or prototyping, so not necessarily a complete set.
2578  // They follow a convention of taking in a destination and also returning the destination value to increase utility.
2579  //==============================================================================================================================
2580   #ifdef A_DUBL
// d = |a| per component (double vectors); the stored value is also returned.
  AD2 opAAbsD2(outAD2 d,inAD2 a){return d=abs(a);}
  AD3 opAAbsD3(outAD3 d,inAD3 a){return d=abs(a);}
  AD4 opAAbsD4(outAD4 d,inAD4 a){return d=abs(a);}
2584  //------------------------------------------------------------------------------------------------------------------------------
// d = a+b per component (double vectors); the stored value is also returned.
  AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){return d=a+b;}
  AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){return d=a+b;}
  AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){return d=a+b;}
2588  //------------------------------------------------------------------------------------------------------------------------------
// d = a+b with the scalar b broadcast to every component; the stored value is also returned.
  AD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){AD2 bb=AD2_(b);return d=a+bb;}
  AD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){AD3 bb=AD3_(b);return d=a+bb;}
  AD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){AD4 bb=AD4_(b);return d=a+bb;}
2592  //------------------------------------------------------------------------------------------------------------------------------
// d = a (double vectors); the stored value is also returned.
  AD2 opACpyD2(outAD2 d,inAD2 a){return d=a;}
  AD3 opACpyD3(outAD3 d,inAD3 a){return d=a;}
  AD4 opACpyD4(outAD4 d,inAD4 a){return d=a;}
2596  //------------------------------------------------------------------------------------------------------------------------------
// d = ALerpD*(a,b,c) with a per-component factor c; the stored value is also returned.
  AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){return d=ALerpD2(a,b,c);}
  AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){return d=ALerpD3(a,b,c);}
  AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){return d=ALerpD4(a,b,c);}
2600  //------------------------------------------------------------------------------------------------------------------------------
2601    AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d=ALerpD2(a,b,AD2_(c));return d;}
2602    AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d=ALerpD3(a,b,AD3_(c));return d;}
2603    AD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d=ALerpD4(a,b,AD4_(c));return d;}
2604  //------------------------------------------------------------------------------------------------------------------------------
2605    AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d=max(a,b);return d;}
2606    AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d=max(a,b);return d;}
2607    AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d=max(a,b);return d;}
2608  //------------------------------------------------------------------------------------------------------------------------------
2609    AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d=min(a,b);return d;}
2610    AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d=min(a,b);return d;}
2611    AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d=min(a,b);return d;}
2612  //------------------------------------------------------------------------------------------------------------------------------
2613    AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d=a*b;return d;}
2614    AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d=a*b;return d;}
2615    AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d=a*b;return d;}
2616  //------------------------------------------------------------------------------------------------------------------------------
2617    AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d=a*AD2_(b);return d;}
2618    AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d=a*AD3_(b);return d;}
2619    AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d=a*AD4_(b);return d;}
2620  //------------------------------------------------------------------------------------------------------------------------------
2621    AD2 opANegD2(outAD2 d,inAD2 a){d=-a;return d;}
2622    AD3 opANegD3(outAD3 d,inAD3 a){d=-a;return d;}
2623    AD4 opANegD4(outAD4 d,inAD4 a){d=-a;return d;}
2624  //------------------------------------------------------------------------------------------------------------------------------
2625    AD2 opARcpD2(outAD2 d,inAD2 a){d=ARcpD2(a);return d;}
2626    AD3 opARcpD3(outAD3 d,inAD3 a){d=ARcpD3(a);return d;}
2627    AD4 opARcpD4(outAD4 d,inAD4 a){d=ARcpD4(a);return d;}
2628   #endif
2629  //==============================================================================================================================
       // 32-bit float vector helpers. Each one stores its result in 'd' and returns that same value.
2630   AF2 opAAbsF2(outAF2 d,inAF2 a){return d=abs(a);}
2631   AF3 opAAbsF3(outAF3 d,inAF3 a){return d=abs(a);}
2632   AF4 opAAbsF4(outAF4 d,inAF4 a){return d=abs(a);}
2633  //------------------------------------------------------------------------------------------------------------------------------
2634   AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){return d=a+b;}
2635   AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){return d=a+b;}
2636   AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){return d=a+b;}
2637  //------------------------------------------------------------------------------------------------------------------------------
2638   AF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){return d=a+AF2_(b);}
2639   AF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){return d=a+AF3_(b);}
2640   AF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){return d=a+AF4_(b);}
2641  //------------------------------------------------------------------------------------------------------------------------------
2642   AF2 opACpyF2(outAF2 d,inAF2 a){return d=a;}
2643   AF3 opACpyF3(outAF3 d,inAF3 a){return d=a;}
2644   AF4 opACpyF4(outAF4 d,inAF4 a){return d=a;}
2645  //------------------------------------------------------------------------------------------------------------------------------
2646   AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){return d=ALerpF2(a,b,c);}
2647   AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){return d=ALerpF3(a,b,c);}
2648   AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){return d=ALerpF4(a,b,c);}
2649  //------------------------------------------------------------------------------------------------------------------------------
2650   AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){return d=ALerpF2(a,b,AF2_(c));}
2651   AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){return d=ALerpF3(a,b,AF3_(c));}
2652   AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){return d=ALerpF4(a,b,AF4_(c));}
2653  //------------------------------------------------------------------------------------------------------------------------------
2654   AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){return d=max(a,b);}
2655   AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){return d=max(a,b);}
2656   AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){return d=max(a,b);}
2657  //------------------------------------------------------------------------------------------------------------------------------
2658   AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){return d=min(a,b);}
2659   AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){return d=min(a,b);}
2660   AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){return d=min(a,b);}
2661  //------------------------------------------------------------------------------------------------------------------------------
2662   AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){return d=a*b;}
2663   AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){return d=a*b;}
2664   AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){return d=a*b;}
2665  //------------------------------------------------------------------------------------------------------------------------------
2666   AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){return d=a*AF2_(b);}
2667   AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){return d=a*AF3_(b);}
2668   AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){return d=a*AF4_(b);}
2669  //------------------------------------------------------------------------------------------------------------------------------
2670   AF2 opANegF2(outAF2 d,inAF2 a){return d=-a;}
2671   AF3 opANegF3(outAF3 d,inAF3 a){return d=-a;}
2672   AF4 opANegF4(outAF4 d,inAF4 a){return d=-a;}
2673  //------------------------------------------------------------------------------------------------------------------------------
2674   AF2 opARcpF2(outAF2 d,inAF2 a){return d=ARcpF2(a);}
2675   AF3 opARcpF3(outAF3 d,inAF3 a){return d=ARcpF3(a);}
2676   AF4 opARcpF4(outAF4 d,inAF4 a){return d=ARcpF4(a);}
2677  #endif
2678  
2679  #define FSR_EASU_F 1 // Enables the non-packed 32-bit EASU section guarded by defined(FSR_EASU_F) below.
2680  AU4 con0, con1, con2, con3; // EASU constant vectors; presumably filled via FsrEasuCon() by code outside this section - confirm.
2681  float srcW, srcH, dstW, dstH; // NOTE(review): presumably source/destination sizes in pixels; not written in this section - confirm.
2682  vec2 bLeft, tRight; // Bounds corners stored by setBounds().
2683  
2684  AF2 translate(AF2 pos) {
          // Scale each coordinate by the matching factor from the 'dimensions' uniform block.
2685      return pos * AF2(scaleX, scaleY);
2686  }
2687  
2688  void setBounds(vec2 bottomLeft, vec2 topRight) {
          // Remember both corners in the file-scope globals (the two stores are independent).
2689      tRight = topRight;
2690      bLeft = bottomLeft;
2691  }
2692  
2693  AF4 FsrEasuRF(AF2 p) { return textureGather(Source, translate(p), 0); } // EASU input callback: gather4 of component 0 (red)
2694  AF4 FsrEasuGF(AF2 p) { return textureGather(Source, translate(p), 1); } // EASU input callback: gather4 of component 1 (green)
2695  AF4 FsrEasuBF(AF2 p) { return textureGather(Source, translate(p), 2); } // EASU input callback: gather4 of component 2 (blue)
2696  
2697  //_____________________________________________________________/\_______________________________________________________________
2698  //==============================================================================================================================
2699  //
2700  //
2701  //                    AMD FidelityFX SUPER RESOLUTION [FSR 1] ::: SPATIAL SCALING & EXTRAS - v1.20210629
2702  //
2703  //
2704  //------------------------------------------------------------------------------------------------------------------------------
2705  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2706  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2707  //------------------------------------------------------------------------------------------------------------------------------
2708  // FidelityFX Super Resolution Sample
2709  //
2710  // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
2711  // Permission is hereby granted, free of charge, to any person obtaining a copy
2712  // of this software and associated documentation files(the "Software"), to deal
2713  // in the Software without restriction, including without limitation the rights
2714  // to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
2715  // copies of the Software, and to permit persons to whom the Software is
2716  // furnished to do so, subject to the following conditions :
2717  // The above copyright notice and this permission notice shall be included in
2718  // all copies or substantial portions of the Software.
2719  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2720  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2721  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
2722  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2723  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2724  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
2725  // THE SOFTWARE.
2726  //------------------------------------------------------------------------------------------------------------------------------
2727  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2728  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2729  //------------------------------------------------------------------------------------------------------------------------------
2730  // ABOUT
2731  // =====
2732  // FSR is a collection of algorithms relating to generating a higher resolution image.
2733  // This specific header focuses on single-image non-temporal image scaling, and related tools.
2734  // 
2735  // The core functions are EASU and RCAS:
2736  //  [EASU] Edge Adaptive Spatial Upsampling ....... 1x to 4x area range spatial scaling, clamped adaptive elliptical filter.
2737  //  [RCAS] Robust Contrast Adaptive Sharpening .... A non-scaling variation on CAS.
2738  // RCAS needs to be applied after EASU as a separate pass.
2739  // 
2740  // Optional utility functions are:
2741  //  [LFGA] Linear Film Grain Applicator ........... Tool to apply film grain after scaling.
2742  //  [SRTM] Simple Reversible Tone-Mapper .......... Linear HDR {0 to FP16_MAX} to {0 to 1} and back.
2743  //  [TEPD] Temporal Energy Preserving Dither ...... Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion.
2744  // See each individual sub-section for inline documentation.
2745  //------------------------------------------------------------------------------------------------------------------------------
2746  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2747  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2748  //------------------------------------------------------------------------------------------------------------------------------
2749  // FUNCTION PERMUTATIONS
2750  // =====================
2751  // *F() ..... Single item computation with 32-bit.
2752  // *H() ..... Single item computation with 16-bit, with packing (aka two 16-bit ops in parallel) when possible.
2753  // *Hx2() ... Processing two items in parallel with 16-bit, easier packing.
2754  //            Not all interfaces in this file have a *Hx2() form.
2755  //==============================================================================================================================
2756  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2757  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2758  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2759  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2760  //_____________________________________________________________/\_______________________________________________________________
2761  //==============================================================================================================================
2762  //
2763  //                                        FSR - [EASU] EDGE ADAPTIVE SPATIAL UPSAMPLING
2764  //
2765  //------------------------------------------------------------------------------------------------------------------------------
2766  // EASU provides a high quality spatial-only scaling at relatively low cost.
2767  // Meaning EASU is appropriate for laptops and other low-end GPUs.
2768  // Quality from 1x to 4x area scaling is good.
2769  //------------------------------------------------------------------------------------------------------------------------------
2770  // The scaler uses a modified fast approximation to the standard lanczos(size=2) kernel.
2771  // EASU runs in a single pass, so it applies a directionally and anisotropically adaptive radial lanczos.
2772  // This is also kept as simple as possible to have minimum runtime.
2773  //------------------------------------------------------------------------------------------------------------------------------
2774  // The lanczos filter has negative lobes, so by itself it will introduce ringing.
2775  // To remove all ringing, the algorithm uses the nearest 2x2 input texels as a neighborhood,
2776  // and limits output to the minimum and maximum of that neighborhood.
2777  //------------------------------------------------------------------------------------------------------------------------------
2778  // Input image requirements:
2779  // 
2780  // Color needs to be encoded as 3 channel [red, green, blue] (e.g. XYZ not supported)
2781  // Each channel needs to be in the range [0, 1]
2782  // Any color primaries are supported
2783  // Display / tonemapping curve needs to be as if presenting to sRGB display or similar (e.g. Gamma 2.0)
2784  // There should be no banding in the input
2785  // There should be no high amplitude noise in the input
2786  // There should be no noise in the input that is not at input pixel granularity
2787  // For performance purposes, use 32bpp formats
2788  //------------------------------------------------------------------------------------------------------------------------------
2789  // Best to apply EASU at the end of the frame after tonemapping 
2790  // but before film grain or composite of the UI.
2791  //------------------------------------------------------------------------------------------------------------------------------
2792  // Example of including this header for D3D HLSL :
2793  // 
2794  //  #define A_GPU 1
2795  //  #define A_HLSL 1
2796  //  #define A_HALF 1
2797  //  #include "ffx_a.h"
2798  //  #define FSR_EASU_H 1
2799  //  #define FSR_RCAS_H 1
2800  //  //declare input callbacks
2801  //  #include "ffx_fsr1.h"
2802  // 
2803  // Example of including this header for Vulkan GLSL :
2804  // 
2805  //  #define A_GPU 1
2806  //  #define A_GLSL 1
2807  //  #define A_HALF 1
2808  //  #include "ffx_a.h"
2809  //  #define FSR_EASU_H 1
2810  //  #define FSR_RCAS_H 1
2811  //  //declare input callbacks
2812  //  #include "ffx_fsr1.h"
2813  // 
2814  // Example of including this header for Vulkan HLSL :
2815  // 
2816  //  #define A_GPU 1
2817  //  #define A_HLSL 1
2818  //  #define A_HLSL_6_2 1
2819  //  #define A_NO_16_BIT_CAST 1
2820  //  #define A_HALF 1
2821  //  #include "ffx_a.h"
2822  //  #define FSR_EASU_H 1
2823  //  #define FSR_RCAS_H 1
2824  //  //declare input callbacks
2825  //  #include "ffx_fsr1.h"
2826  // 
2827  //  Example of declaring the required input callbacks for GLSL :
2828  //  The callbacks need to gather4 for each color channel using the specified texture coordinate 'p'.
2829  //  EASU uses gather4 to reduce position computation logic and for free Arrays of Structures to Structures of Arrays conversion.
2830  // 
2831  //  AH4 FsrEasuRH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,0));}
2832  //  AH4 FsrEasuGH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,1));}
2833  //  AH4 FsrEasuBH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,2));}
2834  //  ...
2835  //  The FsrEasuCon function needs to be called from the CPU or GPU to set up constants.
2836  //  The difference in viewport and input image size is there to support Dynamic Resolution Scaling.
2837  //  To use FsrEasuCon() on the CPU, define A_CPU before including ffx_a and ffx_fsr1.
2838  //  Including a GPU example here, the 'con0' through 'con3' values would be stored out to a constant buffer.
2839  //  AU4 con0,con1,con2,con3;
2840  //  FsrEasuCon(con0,con1,con2,con3,
2841  //    1920.0,1080.0,  // Viewport size (top left aligned) in the input image which is to be scaled.
2842  //    3840.0,2160.0,  // The size of the input image.
2843  //    2560.0,1440.0); // The output resolution.
2844  //==============================================================================================================================
2845  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2846  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2847  //_____________________________________________________________/\_______________________________________________________________
2848  //==============================================================================================================================
2849  //                                                      CONSTANT SETUP
2850  //==============================================================================================================================
2851  // Call to setup required constant values (works on CPU or GPU).
      // Fills con0..con3 with the values FsrEasuF() reads back through AF2_AU2():
      //  con0.xy = input viewport size / output size (scale from output pixel to input pixel position)
      //  con0.zw = 0.5 * that scale - 0.5 (offset added after the scale)
      //  con1.xy = 1 / input image size
      //  con1.zw = offset of gather4 center (0) from the upper-left of 'F'
      //  con2.xy, con2.zw, con3.xy = offsets of gather4 centers (1), (2), (3) from (0)
      //  con3.zw = zero (not read by FsrEasuF())
      // Every float is stored into a uint lane through AU1_AF1() (defined elsewhere in this file).
2852  A_STATIC void FsrEasuCon(
2853  outAU4 con0,
2854  outAU4 con1,
2855  outAU4 con2,
2856  outAU4 con3,
2857  // This is the rendered image resolution being upscaled
2858  AF1 inputViewportInPixelsX,
2859  AF1 inputViewportInPixelsY,
2860  // This is the resolution of the resource containing the input image (useful for dynamic resolution)
2861  AF1 inputSizeInPixelsX,
2862  AF1 inputSizeInPixelsY,
2863  // This is the display resolution which the input image gets upscaled to
2864  AF1 outputSizeInPixelsX,
2865  AF1 outputSizeInPixelsY){
2866   // Output integer position to a pixel position in viewport.
2867   con0[0]=AU1_AF1(inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX));
2868   con0[1]=AU1_AF1(inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY));
2869   con0[2]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)-AF1_(0.5));
2870   con0[3]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)-AF1_(0.5));
2871   // Viewport pixel position to normalized image space.
2872   // This is used to get upper-left of 'F' tap.
2873   con1[0]=AU1_AF1(ARcpF1(inputSizeInPixelsX));
2874   con1[1]=AU1_AF1(ARcpF1(inputSizeInPixelsY));
2875   // Centers of gather4, first offset from upper-left of 'F'.
2876   //      +---+---+
2877   //      |   |   |
2878   //      +--(0)--+
2879   //      | b | c |
2880   //  +---F---+---+---+
2881   //  | e | f | g | h |
2882   //  +--(1)--+--(2)--+
2883   //  | i | j | k | l |
2884   //  +---+---+---+---+
2885   //      | n | o |
2886   //      +--(3)--+
2887   //      |   |   |
2888   //      +---+---+
2889   con1[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX));
2890   con1[3]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsY));
2891   // These are from (0) instead of 'F'.
2892   con2[0]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsX));
2893   con2[1]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY));
2894   con2[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX));
2895   con2[3]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY));
2896   con3[0]=AU1_AF1(AF1_( 0.0)*ARcpF1(inputSizeInPixelsX));
2897   con3[1]=AU1_AF1(AF1_( 4.0)*ARcpF1(inputSizeInPixelsY));
2898   con3[2]=con3[3]=0;}
2899  
2900  // Variant of FsrEasuCon() for an input image located at an offset inside the input image resource.
      // Calls FsrEasuCon() with the same sizes, then overwrites con0[2] and con0[3] with the same
      // expression plus inputOffsetInPixelsX / inputOffsetInPixelsY.
2901  A_STATIC void FsrEasuConOffset(
2902      outAU4 con0,
2903      outAU4 con1,
2904      outAU4 con2,
2905      outAU4 con3,
2906      // This is the rendered image resolution being upscaled
2907      AF1 inputViewportInPixelsX,
2908      AF1 inputViewportInPixelsY,
2909      // This is the resolution of the resource containing the input image (useful for dynamic resolution)
2910      AF1 inputSizeInPixelsX,
2911      AF1 inputSizeInPixelsY,
2912      // This is the display resolution which the input image gets upscaled to
2913      AF1 outputSizeInPixelsX,
2914      AF1 outputSizeInPixelsY,
2915      // This is the input image offset into the resource containing it (useful for dynamic resolution)
2916      AF1 inputOffsetInPixelsX,
2917      AF1 inputOffsetInPixelsY) {
2918      FsrEasuCon(con0, con1, con2, con3, inputViewportInPixelsX, inputViewportInPixelsY, inputSizeInPixelsX, inputSizeInPixelsY, outputSizeInPixelsX, outputSizeInPixelsY);
          // Same offset term as FsrEasuCon(), shifted by the input image offset in pixels.
2919      con0[2] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsX * ARcpF1(outputSizeInPixelsX) - AF1_(0.5) + inputOffsetInPixelsX);
2920      con0[3] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsY * ARcpF1(outputSizeInPixelsY) - AF1_(0.5) + inputOffsetInPixelsY);
2921  }
2922  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2923  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2924  //_____________________________________________________________/\_______________________________________________________________
2925  //==============================================================================================================================
2926  //                                                   NON-PACKED 32-BIT VERSION
2927  //==============================================================================================================================
2928  #if defined(A_GPU)&&defined(FSR_EASU_F)
2929   // Input callback prototypes, need to be implemented by calling shader.
       // Each takes a texture coordinate 'p' and returns a gather4 of one color channel (R, G, B respectively).
2930   AF4 FsrEasuRF(AF2 p);
2931   AF4 FsrEasuGF(AF2 p);
2932   AF4 FsrEasuBF(AF2 p);
2933  //------------------------------------------------------------------------------------------------------------------------------
2934   // Filtering for a given tap for the scaler.
       // Rotates the tap offset by 'dir', scales it by 'len', evaluates the kernel weight at the
       // clamped squared distance, and adds the weighted color and the weight to the running sums.
2935   void FsrEasuTapF(
2936   inout AF3 aC, // Accumulated color, with negative lobe.
2937   inout AF1 aW, // Accumulated weight.
2938   AF2 off, // Pixel offset from resolve position to tap.
2939   AF2 dir, // Gradient direction.
2940   AF2 len, // Length.
2941   AF1 lob, // Negative lobe strength.
2942   AF1 clp, // Clipping point.
2943   AF3 c){ // Tap color.
2944    // Rotate offset by direction.
2945    AF2 v;
2946    v.x=(off.x*( dir.x))+(off.y*dir.y);
2947    v.y=(off.x*(-dir.y))+(off.y*dir.x);
2948    // Anisotropy.
2949    v*=len;
2950    // Compute distance^2.
2951    AF1 d2=v.x*v.x+v.y*v.y;
2952    // Limit to the window as at corner, 2 taps can easily be outside.
2953    d2=min(d2,clp);
2954    // Approximation of lanczos2 without sin() or rcp(), or sqrt() to get x.
2955    //  (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2
2956    //  |_______________________________________|   |_______________|
2957    //                   base                             window
2958    // The general form of the 'base' is,
2959    //  (a*(b*x^2-1)^2-(a-1))
2960    // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe.
        // wB is the 'base' term; wA is the 'window' term with 'lob' used in place of the fixed 1/4.
2961    AF1 wB=AF1_(2.0/5.0)*d2+AF1_(-1.0);
2962    AF1 wA=lob*d2+AF1_(-1.0);
2963    wB*=wB;
2964    wA*=wA;
2965    wB=AF1_(25.0/16.0)*wB+AF1_(-(25.0/16.0-1.0));
2966    AF1 w=wB*wA;
2967    // Do weighted average.
2968    aC+=c*w;aW+=w;}
2969  //------------------------------------------------------------------------------------------------------------------------------
2970   // Accumulate direction and length.
       // Adds one bilinear-weighted contribution to 'dir' and 'len' from a '+' shaped luma neighborhood.
       //  dir, len ........ Running sums, updated in place.
       //  pp .............. Fractional position used for the bilinear weight.
       //  biS/biT/biU/biV . Select which corner weight (s, t, u or v) applies; FsrEasuF() sets exactly one.
       //  lA..lE .......... Luma values laid out as the '+' pattern shown below, with lC at the center.
2971   void FsrEasuSetF(
2972   inout AF2 dir,
2973   inout AF1 len,
2974   AF2 pp,
2975   AP1 biS,AP1 biT,AP1 biU,AP1 biV,
2976   AF1 lA,AF1 lB,AF1 lC,AF1 lD,AF1 lE){
2977    // Compute bilinear weight, branches factor out as predicates are compiler time immediates.
2978    //  s t
2979    //  u v
2980    AF1 w = AF1_(0.0);
2981    if(biS)w=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y);
2982    if(biT)w=           pp.x *(AF1_(1.0)-pp.y);
2983    if(biU)w=(AF1_(1.0)-pp.x)*           pp.y ;
2984    if(biV)w=           pp.x *           pp.y ;
2985    // Direction is the '+' diff.
2986    //    a
2987    //  b c d
2988    //    e
2989    // Then takes magnitude from abs average of both sides of 'c'.
2990    // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms.
2991    AF1 dc=lD-lC;
2992    AF1 cb=lC-lB;
2993    AF1 lenX=max(abs(dc),abs(cb));
2994    lenX=APrxLoRcpF1(lenX);
2995    AF1 dirX=lD-lB;
2996    dir.x+=dirX*w;
2997    lenX=ASatF1(abs(dirX)*lenX);
2998    lenX*=lenX;
2999    len+=lenX*w;
3000    // Repeat for the y axis.
3001    AF1 ec=lE-lC;
3002    AF1 ca=lC-lA;
3003    AF1 lenY=max(abs(ec),abs(ca));
3004    lenY=APrxLoRcpF1(lenY);
3005    AF1 dirY=lE-lA;
3006    dir.y+=dirY*w;
3007    lenY=ASatF1(abs(dirY)*lenY);
3008    lenY*=lenY;
3009    len+=lenY*w;}
3010  //------------------------------------------------------------------------------------------------------------------------------
       // EASU entry point for the 32-bit path: computes one output pixel.
       // Steps: map the output pixel to a fractional input position, fetch the 12-tap neighborhood with four
       // gather4 calls per channel, accumulate edge direction and length from approximate luma, sum the 12
       // weighted taps, then clamp the normalized sum to the min/max of the 4 nearest texels (f, g, j, k).
3011   void FsrEasuF(
3012   out AF3 pix,
3013   AU2 ip, // Integer pixel position in output.
3014   AU4 con0, // Constants generated by FsrEasuCon().
3015   AU4 con1,
3016   AU4 con2,
3017   AU4 con3){
3018  //------------------------------------------------------------------------------------------------------------------------------
3019    // Get position of 'f'.
        // After these three lines, 'fp' holds the integer part and 'pp' the fractional part.
3020    AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw);
3021    AF2 fp=floor(pp);
3022    pp-=fp;
3023  //------------------------------------------------------------------------------------------------------------------------------
3024    // 12-tap kernel.
3025    //    b c
3026    //  e f g h
3027    //  i j k l
3028    //    n o
3029    // Gather 4 ordering.
3030    //  a b
3031    //  r g
3032    // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions,
3033    //    a b    <- unused (z)
3034    //    r g
3035    //  a b a b
3036    //  r g r g
3037    //    a b
3038    //    r g    <- unused (z)
3039    // Allowing dead-code removal to remove the 'z's.
3040    AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw);
3041    // These are from p0 to avoid pulling two constants on pre-Navi hardware.
3042    AF2 p1=p0+AF2_AU2(con2.xy);
3043    AF2 p2=p0+AF2_AU2(con2.zw);
3044    AF2 p3=p0+AF2_AU2(con3.xy);
        // Variable names spell out which kernel texel lands in each gather component (x,y,z,w); 'z' marks an unused one.
3045    AF4 bczzR=FsrEasuRF(p0);
3046    AF4 bczzG=FsrEasuGF(p0);
3047    AF4 bczzB=FsrEasuBF(p0);
3048    AF4 ijfeR=FsrEasuRF(p1);
3049    AF4 ijfeG=FsrEasuGF(p1);
3050    AF4 ijfeB=FsrEasuBF(p1);
3051    AF4 klhgR=FsrEasuRF(p2);
3052    AF4 klhgG=FsrEasuGF(p2);
3053    AF4 klhgB=FsrEasuBF(p2);
3054    AF4 zzonR=FsrEasuRF(p3);
3055    AF4 zzonG=FsrEasuGF(p3);
3056    AF4 zzonB=FsrEasuBF(p3);
3057  //------------------------------------------------------------------------------------------------------------------------------
3058    // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD).
3059    AF4 bczzL=bczzB*AF4_(0.5)+(bczzR*AF4_(0.5)+bczzG);
3060    AF4 ijfeL=ijfeB*AF4_(0.5)+(ijfeR*AF4_(0.5)+ijfeG);
3061    AF4 klhgL=klhgB*AF4_(0.5)+(klhgR*AF4_(0.5)+klhgG);
3062    AF4 zzonL=zzonB*AF4_(0.5)+(zzonR*AF4_(0.5)+zzonG);
3063    // Rename.
3064    AF1 bL=bczzL.x;
3065    AF1 cL=bczzL.y;
3066    AF1 iL=ijfeL.x;
3067    AF1 jL=ijfeL.y;
3068    AF1 fL=ijfeL.z;
3069    AF1 eL=ijfeL.w;
3070    AF1 kL=klhgL.x;
3071    AF1 lL=klhgL.y;
3072    AF1 hL=klhgL.z;
3073    AF1 gL=klhgL.w;
3074    AF1 oL=zzonL.z;
3075    AF1 nL=zzonL.w;
3076    // Accumulate for bilinear interpolation.
        // One call per '+' neighborhood centered on f, g, j and k, each with a different corner weight selected.
3077    AF2 dir=AF2_(0.0);
3078    AF1 len=AF1_(0.0);
3079    FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL);
3080    FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL);
3081    FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL);
3082    FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL);
3083  //------------------------------------------------------------------------------------------------------------------------------
3084    // Normalize with approximation, and cleanup close to zero.
        // When the squared length is below 1/32768, dir.x is forced to 1 and the scale factor to 1 instead of using the reciprocal sqrt.
3085    AF2 dir2=dir*dir;
3086    AF1 dirR=dir2.x+dir2.y;
3087    AP1 zro=dirR<AF1_(1.0/32768.0);
3088    dirR=APrxLoRsqF1(dirR);
3089    dirR=zro?AF1_(1.0):dirR;
3090    dir.x=zro?AF1_(1.0):dir.x;
3091    dir*=AF2_(dirR);
3092    // Transform from {0 to 2} to {0 to 1} range, and shape with square.
3093    len=len*AF1_(0.5);
3094    len*=len;
3095    // Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}.
3096    AF1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpF1(max(abs(dir.x),abs(dir.y)));
3097    // Anisotropic length after rotation,
3098    //  x := 1.0 lerp to 'stretch' on edges
3099    //  y := 1.0 lerp to 2x on edges
3100    AF2 len2=AF2(AF1_(1.0)+(stretch-AF1_(1.0))*len,AF1_(1.0)+AF1_(-0.5)*len);
3101    // Based on the amount of 'edge',
3102    // the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}.
3103    AF1 lob=AF1_(0.5)+AF1_((1.0/4.0-0.04)-0.5)*len;
3104    // Set distance^2 clipping point to the end of the adjustable window.
3105    AF1 clp=APrxLoRcpF1(lob);
3106  //------------------------------------------------------------------------------------------------------------------------------
3107    // Accumulation mixed with min/max of 4 nearest.
3108    //    b c
3109    //  e f g h
3110    //  i j k l
3111    //    n o
3112    AF3 min4=min(AMin3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)),
3113                 AF3(klhgR.x,klhgG.x,klhgB.x));
3114    AF3 max4=max(AMax3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)),
3115                 AF3(klhgR.x,klhgG.x,klhgB.x));
3116    // Accumulation.
        // Each tap's offset is its grid position relative to 'f' minus the fractional position 'pp'.
3117    AF3 aC=AF3_(0.0);
3118    AF1 aW=AF1_(0.0);
3119    FsrEasuTapF(aC,aW,AF2( 0.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.x,bczzG.x,bczzB.x)); // b
3120    FsrEasuTapF(aC,aW,AF2( 1.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.y,bczzG.y,bczzB.y)); // c
3121    FsrEasuTapF(aC,aW,AF2(-1.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.x,ijfeG.x,ijfeB.x)); // i
3122    FsrEasuTapF(aC,aW,AF2( 0.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.y,ijfeG.y,ijfeB.y)); // j
3123    FsrEasuTapF(aC,aW,AF2( 0.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.z,ijfeG.z,ijfeB.z)); // f
3124    FsrEasuTapF(aC,aW,AF2(-1.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.w,ijfeG.w,ijfeB.w)); // e
3125    FsrEasuTapF(aC,aW,AF2( 1.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.x,klhgG.x,klhgB.x)); // k
3126    FsrEasuTapF(aC,aW,AF2( 2.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.y,klhgG.y,klhgB.y)); // l
3127    FsrEasuTapF(aC,aW,AF2( 2.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.z,klhgG.z,klhgB.z)); // h
3128    FsrEasuTapF(aC,aW,AF2( 1.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.w,klhgG.w,klhgB.w)); // g
3129    FsrEasuTapF(aC,aW,AF2( 1.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.z,zzonG.z,zzonB.z)); // o
3130    FsrEasuTapF(aC,aW,AF2( 0.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.w,zzonG.w,zzonB.w)); // n
3131  //------------------------------------------------------------------------------------------------------------------------------
3132    // Normalize and dering.
        // Divide the color sum by the weight sum, then clamp to [min4, max4].
3133    pix=min(max4,max(min4,aC*AF3_(ARcpF1(aW))));}
3134  #endif
3135  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3136  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3137  //_____________________________________________________________/\_______________________________________________________________
3138  //==============================================================================================================================
3139  //                                                    PACKED 16-BIT VERSION
3140  //==============================================================================================================================
3141  #if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_EASU_H)
// Callbacks the including shader must define. FsrEasuH() calls each of them at four positions
// and reads the returned components as individual taps of one color channel (R, G or B).
 AH4 FsrEasuRH(AF2 p);
 AH4 FsrEasuGH(AF2 p);
 AH4 FsrEasuBH(AF2 p);
3146  //------------------------------------------------------------------------------------------------------------------------------
// Accumulate two filter taps at once; each AH2 lane carries one tap.
//  aCR,aCG,aCB - running weighted color sums, one per channel (in/out).
//  aW          - running weight sum (in/out).
//  offX,offY   - tap offsets from the resolve position, one tap per lane.
//  dir         - vector used to rotate the offsets.
//  len         - per-axis scale applied after the rotation.
//  lob         - multiplier on the squared distance in the window term.
//  clp         - upper limit applied to the squared distance.
//  cR,cG,cB    - tap colors, one tap per lane.
 void FsrEasuTapH(
 inout AH2 aCR,inout AH2 aCG,inout AH2 aCB,
 inout AH2 aW,
 AH2 offX,AH2 offY,
 AH2 dir,
 AH2 len,
 AH1 lob,
 AH1 clp,
 AH2 cR,AH2 cG,AH2 cB){
  // Rotate the offsets by 'dir', then scale each axis.
  AH2 rotX=(offX*  dir.xx +offY*dir.yy)*len.x;
  AH2 rotY=(offX*(-dir.yy)+offY*dir.xx)*len.y;
  // Squared distance, limited to 'clp'.
  AH2 dist2=min(rotX*rotX+rotY*rotY,AH2_(clp));
  // weight = ((25/16)*((2/5)*d2-1)^2-(25/16-1)) * (lob*d2-1)^2
  AH2 base=AH2_(2.0/5.0)*dist2+AH2_(-1.0);
  AH2 window=AH2_(lob)*dist2+AH2_(-1.0);
  base*=base;
  window*=window;
  base=AH2_(25.0/16.0)*base+AH2_(-(25.0/16.0-1.0));
  AH2 weight=base*window;
  // Fold the weighted colors and the weight into the running sums.
  aCR+=cR*weight;
  aCG+=cG*weight;
  aCB+=cB*weight;
  aW+=weight;}
3170  //------------------------------------------------------------------------------------------------------------------------------
// Accumulate direction and length estimates for two positions at once (one per AH2 lane).
//  dirPX,dirPY - running weighted direction sums (in/out).
//  lenP        - running weighted length sum (in/out).
//  pp          - fractional position the weights are built from.
//  biST,biUV   - pick (1-pp.y) or pp.y as the second weight factor; biUV wins if both are set,
//                and the weight stays zero if neither is set.
//  lA..lE      - luma inputs: lB,lC,lD feed the X terms and lA,lC,lE feed the Y terms.
 void FsrEasuSetH(
 inout AH2 dirPX,inout AH2 dirPY,
 inout AH2 lenP,
 AH2 pp,
 AP1 biST,AP1 biUV,
 AH2 lA,AH2 lB,AH2 lC,AH2 lD,AH2 lE){
  // Lane weights: .x uses (1-pp.x), .y uses pp.x.
  AH2 weight=AH2_(0.0);
  if(biST)weight=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_(AH1_(1.0)-pp.y);
  if(biUV)weight=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_(pp.y);
  // X axis. ABS is not free in the packed FP16 path.
  AH2 stepDC=lD-lC;
  AH2 stepCB=lC-lB;
  AH2 rcpX=ARcpH2(max(abs(stepDC),abs(stepCB)));
  AH2 dirX=lD-lB;
  dirPX+=dirX*weight;
  AH2 lenX=ASatH2(abs(dirX)*rcpX);
  lenX*=lenX;
  lenP+=lenX*weight;
  // Y axis, same computation on the other three inputs.
  AH2 stepEC=lE-lC;
  AH2 stepCA=lC-lA;
  AH2 rcpY=ARcpH2(max(abs(stepEC),abs(stepCA)));
  AH2 dirY=lE-lA;
  dirPY+=dirY*weight;
  AH2 lenY=ASatH2(abs(dirY)*rcpY);
  lenY*=lenY;
  lenP+=lenY*weight;}
3200  //------------------------------------------------------------------------------------------------------------------------------
// EASU resolve of one output pixel, packed 16-bit path (taps are processed two per call).
//  pix       - filtered color (out).
//  ip        - integer output pixel position.
//  con0      - .xy scale and .zw bias taking 'ip' to a source position (converted with AF2_AU2()).
//  con1      - .xy scale and .zw bias taking the floored source position to the first fetch position.
//  con2,con3 - con2.xy, con2.zw and con3.xy are offsets from the first fetch position to the other three.
// Tap names used below:
//    b c
//  e f g h
//  i j k l
//    n o
 void FsrEasuH(
 out AH3 pix,
 AU2 ip,
 AU4 con0,
 AU4 con1,
 AU4 con2,
 AU4 con3){
//------------------------------------------------------------------------------------------------------------------------------
  // Source position: the floored part selects the fetches, the fraction drives the filter.
  AF2 srcPos=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw);
  AF2 srcBase=floor(srcPos);
  srcPos-=srcBase;
  AH2 fpos=AH2(srcPos);
//------------------------------------------------------------------------------------------------------------------------------
  // Four fetch positions, three channel fetches each.
  AF2 p0=srcBase*AF2_AU2(con1.xy)+AF2_AU2(con1.zw);
  AF2 p1=p0+AF2_AU2(con2.xy);
  AF2 p2=p0+AF2_AU2(con2.zw);
  AF2 p3=p0+AF2_AU2(con3.xy);
  AH4 bczzR=FsrEasuRH(p0);
  AH4 bczzG=FsrEasuGH(p0);
  AH4 bczzB=FsrEasuBH(p0);
  AH4 ijfeR=FsrEasuRH(p1);
  AH4 ijfeG=FsrEasuGH(p1);
  AH4 ijfeB=FsrEasuBH(p1);
  AH4 klhgR=FsrEasuRH(p2);
  AH4 klhgG=FsrEasuGH(p2);
  AH4 klhgB=FsrEasuBH(p2);
  AH4 zzonR=FsrEasuRH(p3);
  AH4 zzonG=FsrEasuGH(p3);
  AH4 zzonB=FsrEasuBH(p3);
//------------------------------------------------------------------------------------------------------------------------------
  // Luma estimate per tap: 0.5*B+(0.5*R+G).
  AH4 bczzL=bczzB*AH4_(0.5)+(bczzR*AH4_(0.5)+bczzG);
  AH4 ijfeL=ijfeB*AH4_(0.5)+(ijfeR*AH4_(0.5)+ijfeG);
  AH4 klhgL=klhgB*AH4_(0.5)+(klhgR*AH4_(0.5)+klhgG);
  AH4 zzonL=zzonB*AH4_(0.5)+(zzonR*AH4_(0.5)+zzonG);
  // Luma pairs of horizontally adjacent taps, named after the taps they hold.
  AH2 bc=bczzL.xy;
  AH2 ef=ijfeL.wz;
  AH2 fg=AH2(ijfeL.z,klhgL.w);
  AH2 gh=klhgL.wz;
  AH2 ij=ijfeL.xy;
  AH2 jk=AH2(ijfeL.y,klhgL.x);
  AH2 kl=klhgL.xy;
  AH2 no=zzonL.wz;
  // This part is different, accumulating 2 taps in parallel.
  AH2 dirPX=AH2_(0.0);
  AH2 dirPY=AH2_(0.0);
  AH2 lenP=AH2_(0.0);
  FsrEasuSetH(dirPX,dirPY,lenP,fpos,true, false,bc,ef,fg,gh,jk); // centered on f and g
  FsrEasuSetH(dirPX,dirPY,lenP,fpos,false,true ,fg,ij,jk,kl,no); // centered on j and k
  // Collapse the two lanes.
  AH2 dir=AH2(dirPX.r+dirPX.g,dirPY.r+dirPY.g);
  AH1 len=lenP.r+lenP.g;
//------------------------------------------------------------------------------------------------------------------------------
  // Normalize the direction; below the threshold fall back to x=1 with unit scale.
  AH2 dirSq=dir*dir;
  AH1 dirMag2=dirSq.x+dirSq.y;
  AP1 tiny=dirMag2<AH1_(1.0/32768.0);
  AH1 dirScale=APrxLoRsqH1(dirMag2);
  if(tiny){
   dirScale=AH1_(1.0);
   dir.x=AH1_(1.0);}
  dir*=AH2_(dirScale);
  // Shape the length term: halve, then square.
  len=len*AH1_(0.5);
  len*=len;
  // Filter parameters derived from direction and length.
  AH1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpH1(max(abs(dir.x),abs(dir.y)));
  AH2 len2=AH2(AH1_(1.0)+(stretch-AH1_(1.0))*len,AH1_(1.0)+AH1_(-0.5)*len);
  AH1 lob=AH1_(0.5)+AH1_((1.0/4.0-0.04)-0.5)*len;
  AH1 clp=APrxLoRcpH1(lob);
//------------------------------------------------------------------------------------------------------------------------------
  // FP16 is different, using packed trick to do min and max in same operation:
  // .x ends up as -min and .y as max over the taps f, g, j, k.
  AH2 bothR=max(max(AH2(-ijfeR.z,ijfeR.z),AH2(-klhgR.w,klhgR.w)),max(AH2(-ijfeR.y,ijfeR.y),AH2(-klhgR.x,klhgR.x)));
  AH2 bothG=max(max(AH2(-ijfeG.z,ijfeG.z),AH2(-klhgG.w,klhgG.w)),max(AH2(-ijfeG.y,ijfeG.y),AH2(-klhgG.x,klhgG.x)));
  AH2 bothB=max(max(AH2(-ijfeB.z,ijfeB.z),AH2(-klhgB.w,klhgB.w)),max(AH2(-ijfeB.y,ijfeB.y),AH2(-klhgB.x,klhgB.x)));
  // This part is different for FP16, working pairs of taps at a time.
  AH2 sumR=AH2_(0.0);
  AH2 sumG=AH2_(0.0);
  AH2 sumB=AH2_(0.0);
  AH2 sumW=AH2_(0.0);
  FsrEasuTapH(sumR,sumG,sumB,sumW,AH2( 0.0, 1.0)-fpos.xx,AH2(-1.0,-1.0)-fpos.yy,dir,len2,lob,clp,bczzR.xy,bczzG.xy,bczzB.xy); // b,c
  FsrEasuTapH(sumR,sumG,sumB,sumW,AH2(-1.0, 0.0)-fpos.xx,AH2( 1.0, 1.0)-fpos.yy,dir,len2,lob,clp,ijfeR.xy,ijfeG.xy,ijfeB.xy); // i,j
  FsrEasuTapH(sumR,sumG,sumB,sumW,AH2( 0.0,-1.0)-fpos.xx,AH2( 0.0, 0.0)-fpos.yy,dir,len2,lob,clp,ijfeR.zw,ijfeG.zw,ijfeB.zw); // f,e
  FsrEasuTapH(sumR,sumG,sumB,sumW,AH2( 1.0, 2.0)-fpos.xx,AH2( 1.0, 1.0)-fpos.yy,dir,len2,lob,clp,klhgR.xy,klhgG.xy,klhgB.xy); // k,l
  FsrEasuTapH(sumR,sumG,sumB,sumW,AH2( 2.0, 1.0)-fpos.xx,AH2( 0.0, 0.0)-fpos.yy,dir,len2,lob,clp,klhgR.zw,klhgG.zw,klhgB.zw); // h,g
  FsrEasuTapH(sumR,sumG,sumB,sumW,AH2( 1.0, 0.0)-fpos.xx,AH2( 2.0, 2.0)-fpos.yy,dir,len2,lob,clp,zzonR.zw,zzonG.zw,zzonB.zw); // o,n
  AH3 aC=AH3(sumR.x+sumR.y,sumG.x+sumG.y,sumB.x+sumB.y);
  AH1 aW=sumW.x+sumW.y;
//------------------------------------------------------------------------------------------------------------------------------
  // Slightly different for FP16 version due to combined min and max.
  // Normalize by the weight sum, then clamp to the min/max of the four nearest taps (dering).
  AH3 hi=AH3(bothR.y,bothG.y,bothB.y);
  AH3 lo=-AH3(bothR.x,bothG.x,bothB.x);
  pix=min(hi,max(lo,aC*AH3_(ARcpH1(aW))));}
3290  #endif
3291  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3292  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3293  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3294  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3295  //_____________________________________________________________/\_______________________________________________________________
3296  //==============================================================================================================================
3297  //
3298  //                                      FSR - [RCAS] ROBUST CONTRAST ADAPTIVE SHARPENING
3299  //
3300  //------------------------------------------------------------------------------------------------------------------------------
3301  // CAS uses a simplified mechanism to convert local contrast into a variable amount of sharpness.
3302  // RCAS uses a more exact mechanism, solving for the maximum local sharpness possible before clipping.
3303  // RCAS also has a built in process to limit sharpening of what it detects as possible noise.
// RCAS sharpening does not support scaling, as it should be applied after EASU scaling.
3305  // Pass EASU output straight into RCAS, no color conversions necessary.
3306  //------------------------------------------------------------------------------------------------------------------------------
3307  // RCAS is based on the following logic.
3308  // RCAS uses a 5 tap filter in a cross pattern (same as CAS),
3309  //    w                n
3310  //  w 1 w  for taps  w m e 
3311  //    w                s
3312  // Where 'w' is the negative lobe weight.
3313  //  output = (w*(n+e+w+s)+m)/(4*w+1)
3314  // RCAS solves for 'w' by seeing where the signal might clip out of the {0 to 1} input range,
3315  //  0 == (w*(n+e+w+s)+m)/(4*w+1) -> w = -m/(n+e+w+s)
3316  //  1 == (w*(n+e+w+s)+m)/(4*w+1) -> w = (1-m)/(n+e+w+s-4*1)
3317  // Then chooses the 'w' which results in no clipping, limits 'w', and multiplies by the 'sharp' amount.
3318  // This solution above has issues with MSAA input as the steps along the gradient cause edge detection issues.
// So RCAS uses 4x the maximum and 4x the minimum (depending on equation) in place of the individual taps.
3320  // As well as switching from 'm' to either the minimum or maximum (depending on side), to help in energy conservation.
3321  // This stabilizes RCAS.
3322  // RCAS does a simple highpass which is normalized against the local contrast then shaped,
3323  //       0.25
3324  //  0.25  -1  0.25
3325  //       0.25
3326  // This is used as a noise detection filter, to reduce the effect of RCAS on grain, and focus on real edges.
3327  //
3328  //  GLSL example for the required callbacks :
3329  // 
3330  //  AH4 FsrRcasLoadH(ASW2 p){return AH4(imageLoad(imgSrc,ASU2(p)));}
3331  //  void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b)
3332  //  {
3333  //    //do any simple input color conversions here or leave empty if none needed
3334  //  }
3335  //  
//  FsrRcasCon needs to be called from the CPU or GPU to set up constants.
3337  //  Including a GPU example here, the 'con' value would be stored out to a constant buffer.
3338  // 
3339  //  AU4 con;
3340  //  FsrRcasCon(con,
3341  //   0.0); // The scale is {0.0 := maximum sharpness, to N>0, where N is the number of stops (halving) of the reduction of sharpness}.
3342  // ---------------
3343  // RCAS sharpening supports a CAS-like pass-through alpha via,
3344  //  #define FSR_RCAS_PASSTHROUGH_ALPHA 1
3345  // RCAS also supports a define to enable a more expensive path to avoid some sharpening of noise.
3346  // Would suggest it is better to apply film grain after RCAS sharpening (and after scaling) instead of using this define,
3347  //  #define FSR_RCAS_DENOISE 1
3348  //==============================================================================================================================
// Largest magnitude allowed for the negative lobe: the FsrRcas*() filters clamp the lobe to
// {-FSR_RCAS_LIMIT to 0} before scaling it by the sharpness constant.
// The value sits at the limit of providing unnatural results for sharpening.
#define FSR_RCAS_LIMIT (0.25-(1.0/16.0))
3351  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3352  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3353  //_____________________________________________________________/\_______________________________________________________________
3354  //==============================================================================================================================
3355  //                                                      CONSTANT SETUP
3356  //==============================================================================================================================
// Build the RCAS constant block (works on CPU or GPU).
//  con       - receives the constants: con[0] and con[1] carry the sharpness, con[2] and con[3] are zeroed.
//  sharpness - the scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the
//              reduction of sharpness}.
A_STATIC void FsrRcasCon(
outAU4 con,
AF1 sharpness){
 // Transform from stops to linear value.
 AF1 amount=AExp2F1(-sharpness);
 // Same value duplicated into a pair for the packed slot.
 varAF2(amountPair)=initAF2(amount,amount);
 con[0]=AU1_AF1(amount);         // read by FsrRcasF() as con.x
 con[1]=AU1_AH2_AF2(amountPair); // read by FsrRcasH() and FsrRcasHx2() as con.y
 con[2]=0;
 con[3]=0;}
3369  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3370  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3371  //_____________________________________________________________/\_______________________________________________________________
3372  //==============================================================================================================================
3373  //                                                   NON-PACKED 32-BIT VERSION
3374  //==============================================================================================================================
3375  #if defined(A_GPU)&&defined(FSR_RCAS_F)
// Callbacks the including shader must define.
//  FsrRcasLoadF  - returns the input color at integer position 'p'; FsrRcasF() reads .rgb
//                  (and .a of the center tap with FSR_RCAS_PASSTHROUGH_ALPHA).
//  FsrRcasInputF - in-place color conversion that FsrRcasF() runs on every loaded tap.
 AF4 FsrRcasLoadF(ASU2 p);
 void FsrRcasInputF(inout AF1 r,inout AF1 g,inout AF1 b);
3379  //------------------------------------------------------------------------------------------------------------------------------
// RCAS sharpening of one pixel, non-packed 32-bit path.
//  pixR,pixG,pixB - sharpened color (out); scalars rather than a vector so the body ports easily to FsrRcasH().
//  pixA           - alpha of the center tap, copied through unmodified (out, only with FSR_RCAS_PASSTHROUGH_ALPHA).
//  ip             - integer pixel position in output.
//  con            - constants generated by FsrRcasCon(); only con.x (sharpness) is read here.
// The luma and noise-detection math is compiled only with FSR_RCAS_DENOISE, which is its sole consumer.
 void FsrRcasF(
 out AF1 pixR,
 out AF1 pixG,
 out AF1 pixB,
 #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
  out AF1 pixA,
 #endif
 AU2 ip,
 AU4 con){
  // Only the cross of the 3x3 pixel neighborhood is read.
  //    b
  //  d e f
  //    h
  ASU2 sp=ASU2(ip);
  AF3 b=FsrRcasLoadF(sp+ASU2( 0,-1)).rgb;
  AF3 d=FsrRcasLoadF(sp+ASU2(-1, 0)).rgb;
  #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
   AF4 ee=FsrRcasLoadF(sp);
   AF3 e=ee.rgb;pixA=ee.a;
  #else
   AF3 e=FsrRcasLoadF(sp).rgb;
  #endif
  AF3 f=FsrRcasLoadF(sp+ASU2( 1, 0)).rgb;
  AF3 h=FsrRcasLoadF(sp+ASU2( 0, 1)).rgb;
  // Rename (32-bit) or regroup (16-bit).
  AF1 bR=b.r;
  AF1 bG=b.g;
  AF1 bB=b.b;
  AF1 dR=d.r;
  AF1 dG=d.g;
  AF1 dB=d.b;
  AF1 eR=e.r;
  AF1 eG=e.g;
  AF1 eB=e.b;
  AF1 fR=f.r;
  AF1 fG=f.g;
  AF1 fB=f.b;
  AF1 hR=h.r;
  AF1 hG=h.g;
  AF1 hB=h.b;
  // Run optional input transform.
  FsrRcasInputF(bR,bG,bB);
  FsrRcasInputF(dR,dG,dB);
  FsrRcasInputF(eR,eG,eB);
  FsrRcasInputF(fR,fG,fB);
  FsrRcasInputF(hR,hG,hB);
  // Min and max of ring.
  AF1 mn4R=min(AMin3F1(bR,dR,fR),hR);
  AF1 mn4G=min(AMin3F1(bG,dG,fG),hG);
  AF1 mn4B=min(AMin3F1(bB,dB,fB),hB);
  AF1 mx4R=max(AMax3F1(bR,dR,fR),hR);
  AF1 mx4G=max(AMax3F1(bG,dG,fG),hG);
  AF1 mx4B=max(AMax3F1(bB,dB,fB),hB);
  // Immediate constants for peak range.
  AF2 peakC=AF2(1.0,-1.0*4.0);
  // Limiters, these need to be high precision RCPs.
  AF1 hitMinR=min(mn4R,eR)*ARcpF1(AF1_(4.0)*mx4R);
  AF1 hitMinG=min(mn4G,eG)*ARcpF1(AF1_(4.0)*mx4G);
  AF1 hitMinB=min(mn4B,eB)*ARcpF1(AF1_(4.0)*mx4B);
  AF1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpF1(AF1_(4.0)*mn4R+peakC.y);
  AF1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpF1(AF1_(4.0)*mn4G+peakC.y);
  AF1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpF1(AF1_(4.0)*mn4B+peakC.y);
  AF1 lobeR=max(-hitMinR,hitMaxR);
  AF1 lobeG=max(-hitMinG,hitMaxG);
  AF1 lobeB=max(-hitMinB,hitMaxB);
  // Clamp the lobe to {-FSR_RCAS_LIMIT to 0}, then scale by the sharpness constant.
  AF1 lobe=max(AF1_(-FSR_RCAS_LIMIT),min(AMax3F1(lobeR,lobeG,lobeB),AF1_(0.0)))*AF1_AU1(con.x);
  // Apply noise removal.
  #ifdef FSR_RCAS_DENOISE
   // Luma times 2.
   AF1 bL=bB*AF1_(0.5)+(bR*AF1_(0.5)+bG);
   AF1 dL=dB*AF1_(0.5)+(dR*AF1_(0.5)+dG);
   AF1 eL=eB*AF1_(0.5)+(eR*AF1_(0.5)+eG);
   AF1 fL=fB*AF1_(0.5)+(fR*AF1_(0.5)+fG);
   AF1 hL=hB*AF1_(0.5)+(hR*AF1_(0.5)+hG);
   // Noise detection: highpass of the luma cross, normalized by the local luma range.
   AF1 nz=AF1_(0.25)*bL+AF1_(0.25)*dL+AF1_(0.25)*fL+AF1_(0.25)*hL-eL;
   nz=ASatF1(abs(nz)*APrxMedRcpF1(AMax3F1(AMax3F1(bL,dL,eL),fL,hL)-AMin3F1(AMin3F1(bL,dL,eL),fL,hL)));
   nz=AF1_(-0.5)*nz+AF1_(1.0);
   lobe*=nz;
  #endif
  // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
  AF1 rcpL=APrxMedRcpF1(AF1_(4.0)*lobe+AF1_(1.0));
  pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL;
  pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
  pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;}
3466  #endif
3467  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3468  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3469  //_____________________________________________________________/\_______________________________________________________________
3470  //==============================================================================================================================
3471  //                                                  NON-PACKED 16-BIT VERSION
3472  //==============================================================================================================================
3473  #if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_H)
// Callbacks the including shader must define.
//  FsrRcasLoadH  - returns the input color at integer position 'p'; FsrRcasH() reads .rgb
//                  (and .a of the center tap with FSR_RCAS_PASSTHROUGH_ALPHA).
//  FsrRcasInputH - in-place color conversion that FsrRcasH() runs on every loaded tap.
 AH4 FsrRcasLoadH(ASW2 p);
 void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b);
3477  //------------------------------------------------------------------------------------------------------------------------------
// RCAS sharpening of one pixel, non-packed 16-bit path.
//  pixR,pixG,pixB - sharpened color (out); scalars rather than a vector so the body ports easily from FsrRcasF().
//  pixA           - alpha of the center tap, copied through unmodified (out, only with FSR_RCAS_PASSTHROUGH_ALPHA).
//  ip             - integer pixel position in output.
//  con            - constants generated by FsrRcasCon(); only con.y (sharpness pair, .x lane) is read here.
// The luma and noise-detection math is compiled only with FSR_RCAS_DENOISE, which is its sole consumer.
 void FsrRcasH(
 out AH1 pixR,
 out AH1 pixG,
 out AH1 pixB,
 #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
  out AH1 pixA,
 #endif
 AU2 ip,
 AU4 con){
  // Only the cross of the 3x3 pixel neighborhood is read.
  //    b
  //  d e f
  //    h
  ASW2 sp=ASW2(ip);
  AH3 b=FsrRcasLoadH(sp+ASW2( 0,-1)).rgb;
  AH3 d=FsrRcasLoadH(sp+ASW2(-1, 0)).rgb;
  #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
   AH4 ee=FsrRcasLoadH(sp);
   AH3 e=ee.rgb;pixA=ee.a;
  #else
   AH3 e=FsrRcasLoadH(sp).rgb;
  #endif
  AH3 f=FsrRcasLoadH(sp+ASW2( 1, 0)).rgb;
  AH3 h=FsrRcasLoadH(sp+ASW2( 0, 1)).rgb;
  // Rename (32-bit) or regroup (16-bit).
  AH1 bR=b.r;
  AH1 bG=b.g;
  AH1 bB=b.b;
  AH1 dR=d.r;
  AH1 dG=d.g;
  AH1 dB=d.b;
  AH1 eR=e.r;
  AH1 eG=e.g;
  AH1 eB=e.b;
  AH1 fR=f.r;
  AH1 fG=f.g;
  AH1 fB=f.b;
  AH1 hR=h.r;
  AH1 hG=h.g;
  AH1 hB=h.b;
  // Run optional input transform.
  FsrRcasInputH(bR,bG,bB);
  FsrRcasInputH(dR,dG,dB);
  FsrRcasInputH(eR,eG,eB);
  FsrRcasInputH(fR,fG,fB);
  FsrRcasInputH(hR,hG,hB);
  // Min and max of ring.
  AH1 mn4R=min(AMin3H1(bR,dR,fR),hR);
  AH1 mn4G=min(AMin3H1(bG,dG,fG),hG);
  AH1 mn4B=min(AMin3H1(bB,dB,fB),hB);
  AH1 mx4R=max(AMax3H1(bR,dR,fR),hR);
  AH1 mx4G=max(AMax3H1(bG,dG,fG),hG);
  AH1 mx4B=max(AMax3H1(bB,dB,fB),hB);
  // Immediate constants for peak range.
  AH2 peakC=AH2(1.0,-1.0*4.0);
  // Limiters, these need to be high precision RCPs.
  AH1 hitMinR=min(mn4R,eR)*ARcpH1(AH1_(4.0)*mx4R);
  AH1 hitMinG=min(mn4G,eG)*ARcpH1(AH1_(4.0)*mx4G);
  AH1 hitMinB=min(mn4B,eB)*ARcpH1(AH1_(4.0)*mx4B);
  AH1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH1(AH1_(4.0)*mn4R+peakC.y);
  AH1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH1(AH1_(4.0)*mn4G+peakC.y);
  AH1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH1(AH1_(4.0)*mn4B+peakC.y);
  AH1 lobeR=max(-hitMinR,hitMaxR);
  AH1 lobeG=max(-hitMinG,hitMaxG);
  AH1 lobeB=max(-hitMinB,hitMaxB);
  // Clamp the lobe to {-FSR_RCAS_LIMIT to 0}, then scale by the sharpness constant.
  AH1 lobe=max(AH1_(-FSR_RCAS_LIMIT),min(AMax3H1(lobeR,lobeG,lobeB),AH1_(0.0)))*AH2_AU1(con.y).x;
  // Apply noise removal.
  #ifdef FSR_RCAS_DENOISE
   // Luma times 2.
   AH1 bL=bB*AH1_(0.5)+(bR*AH1_(0.5)+bG);
   AH1 dL=dB*AH1_(0.5)+(dR*AH1_(0.5)+dG);
   AH1 eL=eB*AH1_(0.5)+(eR*AH1_(0.5)+eG);
   AH1 fL=fB*AH1_(0.5)+(fR*AH1_(0.5)+fG);
   AH1 hL=hB*AH1_(0.5)+(hR*AH1_(0.5)+hG);
   // Noise detection: highpass of the luma cross, normalized by the local luma range.
   AH1 nz=AH1_(0.25)*bL+AH1_(0.25)*dL+AH1_(0.25)*fL+AH1_(0.25)*hL-eL;
   nz=ASatH1(abs(nz)*APrxMedRcpH1(AMax3H1(AMax3H1(bL,dL,eL),fL,hL)-AMin3H1(AMin3H1(bL,dL,eL),fL,hL)));
   nz=AH1_(-0.5)*nz+AH1_(1.0);
   lobe*=nz;
  #endif
  // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
  AH1 rcpL=APrxMedRcpH1(AH1_(4.0)*lobe+AH1_(1.0));
  pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL;
  pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
  pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;}
3563  #endif
3564  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3565  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3566  //_____________________________________________________________/\_______________________________________________________________
3567  //==============================================================================================================================
3568  //                                                     PACKED 16-BIT VERSION
3569  //==============================================================================================================================
3570  #if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_HX2)
// Callbacks the including shader must define.
//  FsrRcasLoadHx2  - returns the input color at integer position 'p'; FsrRcasHx2() reads .rgb
//                    (and .a of the two center taps with FSR_RCAS_PASSTHROUGH_ALPHA).
//  FsrRcasInputHx2 - in-place color conversion that FsrRcasHx2() runs on every tap pair.
 AH4 FsrRcasLoadHx2(ASW2 p);
 void FsrRcasInputHx2(inout AH2 r,inout AH2 g,inout AH2 b);
3574  //------------------------------------------------------------------------------------------------------------------------------
// Convert packed Structures of Arrays (one AH2 per channel) into Arrays of Structures (one AH4 per pixel) for store.
//  pix0           - receives the .x lane of each channel in .rgb (out).
//  pix1           - receives the .y lane of each channel in .rgb (out).
//  pixR,pixG,pixB - packed channels, as produced by FsrRcasHx2().
// Alpha is written (as zero) only under A_HLSL; otherwise .a of both outputs is left unassigned.
 void FsrRcasDepackHx2(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){
  #ifdef A_HLSL
   // Invoke a slower path for DX only, since it won't allow uninitialized values.
   pix1.a=0.0;
   pix0.a=0.0;
  #endif
  // Lane x goes to the first pixel.
  pix0.r=pixR.x;
  pix0.g=pixG.x;
  pix0.b=pixB.x;
  // Lane y goes to the second pixel.
  pix1.r=pixR.y;
  pix1.g=pixG.y;
  pix1.b=pixB.y;}
3583  //------------------------------------------------------------------------------------------------------------------------------
// RCAS sharpening of two pixels at once, packed 16-bit path.
//  pixR,pixG,pixB - sharpened color, one pixel per lane (out).
//  pixA           - alpha of the two center taps, copied through unmodified (out, only with FSR_RCAS_PASSTHROUGH_ALPHA).
//  ip             - integer pixel position in output of the first pixel; the second pixel is 8 to the right.
//  con            - constants generated by FsrRcasCon(); only con.y (sharpness pair, .x lane) is read here.
// The luma and noise-detection math is compiled only with FSR_RCAS_DENOISE, which is its sole consumer.
 void FsrRcasHx2(
 // Output values are for 2 8x8 tiles in a 16x8 region.
 //  pix<R,G,B>.x =  left 8x8 tile
 //  pix<R,G,B>.y = right 8x8 tile
 // This enables later processing to easily be packed as well.
 out AH2 pixR,
 out AH2 pixG,
 out AH2 pixB,
 #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
  out AH2 pixA,
 #endif
 AU2 ip,
 AU4 con){
  // Only the cross of the 3x3 pixel neighborhood is read, once per pixel.
  //    b
  //  d e f
  //    h
  ASW2 sp0=ASW2(ip);
  AH3 b0=FsrRcasLoadHx2(sp0+ASW2( 0,-1)).rgb;
  AH3 d0=FsrRcasLoadHx2(sp0+ASW2(-1, 0)).rgb;
  #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
   AH4 ee0=FsrRcasLoadHx2(sp0);
   AH3 e0=ee0.rgb;pixA.r=ee0.a;
  #else
   AH3 e0=FsrRcasLoadHx2(sp0).rgb;
  #endif
  AH3 f0=FsrRcasLoadHx2(sp0+ASW2( 1, 0)).rgb;
  AH3 h0=FsrRcasLoadHx2(sp0+ASW2( 0, 1)).rgb;
  // Second pixel: same row, 8 pixels to the right.
  ASW2 sp1=sp0+ASW2(8,0);
  AH3 b1=FsrRcasLoadHx2(sp1+ASW2( 0,-1)).rgb;
  AH3 d1=FsrRcasLoadHx2(sp1+ASW2(-1, 0)).rgb;
  #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
   AH4 ee1=FsrRcasLoadHx2(sp1);
   AH3 e1=ee1.rgb;pixA.g=ee1.a;
  #else
   AH3 e1=FsrRcasLoadHx2(sp1).rgb;
  #endif
  AH3 f1=FsrRcasLoadHx2(sp1+ASW2( 1, 0)).rgb;
  AH3 h1=FsrRcasLoadHx2(sp1+ASW2( 0, 1)).rgb;
  // Arrays of Structures to Structures of Arrays conversion.
  AH2 bR=AH2(b0.r,b1.r);
  AH2 bG=AH2(b0.g,b1.g);
  AH2 bB=AH2(b0.b,b1.b);
  AH2 dR=AH2(d0.r,d1.r);
  AH2 dG=AH2(d0.g,d1.g);
  AH2 dB=AH2(d0.b,d1.b);
  AH2 eR=AH2(e0.r,e1.r);
  AH2 eG=AH2(e0.g,e1.g);
  AH2 eB=AH2(e0.b,e1.b);
  AH2 fR=AH2(f0.r,f1.r);
  AH2 fG=AH2(f0.g,f1.g);
  AH2 fB=AH2(f0.b,f1.b);
  AH2 hR=AH2(h0.r,h1.r);
  AH2 hG=AH2(h0.g,h1.g);
  AH2 hB=AH2(h0.b,h1.b);
  // Run optional input transform.
  FsrRcasInputHx2(bR,bG,bB);
  FsrRcasInputHx2(dR,dG,dB);
  FsrRcasInputHx2(eR,eG,eB);
  FsrRcasInputHx2(fR,fG,fB);
  FsrRcasInputHx2(hR,hG,hB);
  // Min and max of ring.
  AH2 mn4R=min(AMin3H2(bR,dR,fR),hR);
  AH2 mn4G=min(AMin3H2(bG,dG,fG),hG);
  AH2 mn4B=min(AMin3H2(bB,dB,fB),hB);
  AH2 mx4R=max(AMax3H2(bR,dR,fR),hR);
  AH2 mx4G=max(AMax3H2(bG,dG,fG),hG);
  AH2 mx4B=max(AMax3H2(bB,dB,fB),hB);
  // Immediate constants for peak range.
  AH2 peakC=AH2(1.0,-1.0*4.0);
  // Limiters, these need to be high precision RCPs.
  AH2 hitMinR=min(mn4R,eR)*ARcpH2(AH2_(4.0)*mx4R);
  AH2 hitMinG=min(mn4G,eG)*ARcpH2(AH2_(4.0)*mx4G);
  AH2 hitMinB=min(mn4B,eB)*ARcpH2(AH2_(4.0)*mx4B);
  AH2 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH2(AH2_(4.0)*mn4R+peakC.y);
  AH2 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH2(AH2_(4.0)*mn4G+peakC.y);
  AH2 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH2(AH2_(4.0)*mn4B+peakC.y);
  AH2 lobeR=max(-hitMinR,hitMaxR);
  AH2 lobeG=max(-hitMinG,hitMaxG);
  AH2 lobeB=max(-hitMinB,hitMaxB);
  // Clamp the lobe to {-FSR_RCAS_LIMIT to 0}, then scale by the sharpness constant.
  AH2 lobe=max(AH2_(-FSR_RCAS_LIMIT),min(AMax3H2(lobeR,lobeG,lobeB),AH2_(0.0)))*AH2_(AH2_AU1(con.y).x);
  // Apply noise removal.
  #ifdef FSR_RCAS_DENOISE
   // Luma times 2.
   AH2 bL=bB*AH2_(0.5)+(bR*AH2_(0.5)+bG);
   AH2 dL=dB*AH2_(0.5)+(dR*AH2_(0.5)+dG);
   AH2 eL=eB*AH2_(0.5)+(eR*AH2_(0.5)+eG);
   AH2 fL=fB*AH2_(0.5)+(fR*AH2_(0.5)+fG);
   AH2 hL=hB*AH2_(0.5)+(hR*AH2_(0.5)+hG);
   // Noise detection: highpass of the luma cross, normalized by the local luma range.
   AH2 nz=AH2_(0.25)*bL+AH2_(0.25)*dL+AH2_(0.25)*fL+AH2_(0.25)*hL-eL;
   nz=ASatH2(abs(nz)*APrxMedRcpH2(AMax3H2(AMax3H2(bL,dL,eL),fL,hL)-AMin3H2(AMin3H2(bL,dL,eL),fL,hL)));
   nz=AH2_(-0.5)*nz+AH2_(1.0);
   lobe*=nz;
  #endif
  // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
  AH2 rcpL=APrxMedRcpH2(AH2_(4.0)*lobe+AH2_(1.0));
  pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL;
  pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
  pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;}
3681  #endif
3682  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3683  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3684  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3685  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3686  //_____________________________________________________________/\_______________________________________________________________
3687  //==============================================================================================================================
3688  //
3689  //                                          FSR - [LFGA] LINEAR FILM GRAIN APPLICATOR
3690  //
3691  //------------------------------------------------------------------------------------------------------------------------------
3692  // Adding output-resolution film grain after scaling is a good way to mask both rendering and scaling artifacts.
3693  // Suggest using tiled blue noise as film grain input, with peak noise frequency set for a specific look and feel.
3694  // The 'Lfga*()' functions provide a convenient way to introduce grain.
3695  // These functions limit grain based on distance to signal limits.
3696  // This is done so that the grain is temporally energy preserving, and thus won't modify image tonality.
3697  // Grain application should be done in a linear colorspace.
3698  // The grain should be temporally changing, but have a temporal sum per pixel that adds to zero (non-biased).
3699  //------------------------------------------------------------------------------------------------------------------------------
3700  // Usage,
3701  //   FsrLfga*(
3702  //    color, // In/out linear colorspace color {0 to 1} ranged.
3703  //    grain, // Per pixel grain texture value {-0.5 to 0.5} ranged, input is 3-channel to support colored grain.
//    amount); // Amount of grain {0 to 1} ranged.
3705  //------------------------------------------------------------------------------------------------------------------------------
3706  // Example if grain texture is monochrome: 'FsrLfgaF(color,AF3_(grain),amount)'
3707  //==============================================================================================================================
3708  #if defined(A_GPU)
// Add film grain to 'c' in place, 32-bit version.
//  c - color (in/out).
//  t - grain value per channel.
//  a - grain amount.
// Maximum grain is the minimum distance to the signal limit, i.e. min(1-c,c) per channel.
 void FsrLfgaF(inout AF3 c,AF3 t,AF1 a){
  AF3 grain=t*AF3_(a);
  AF3 headroom=min(AF3_(1.0)-c,c);
  c+=grain*headroom;}
3711  #endif
3712  //==============================================================================================================================
3713  #if defined(A_GPU)&&defined(A_HALF)
// Add film grain to 'c' in place, half precision version (slower).
//  c - color (in/out).
//  t - grain value per channel.
//  a - grain amount.
// Grain is scaled by min(1-c,c) per channel.
 void FsrLfgaH(inout AH3 c,AH3 t,AH1 a){
  AH3 grain=t*AH3_(a);
  AH3 headroom=min(AH3_(1.0)-c,c);
  c+=grain*headroom;}
3716  //------------------------------------------------------------------------------------------------------------------------------
// Add film grain in place, packed half precision version (faster): one AH2 per channel.
//  cR,cG,cB - packed color channels (in/out).
//  tR,tG,tB - packed grain values, one per channel.
//  a        - grain amount, shared by all lanes and channels.
// Each channel's grain is scaled by min(1-c,c) of that channel.
 void FsrLfgaHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 tR,AH2 tG,AH2 tB,AH1 a){
  AH2 headroomR=min(AH2_(1.0)-cR,cR);
  cR+=(tR*AH2_(a))*headroomR;
  AH2 headroomG=min(AH2_(1.0)-cG,cG);
  cG+=(tG*AH2_(a))*headroomG;
  AH2 headroomB=min(AH2_(1.0)-cB,cB);
  cB+=(tB*AH2_(a))*headroomB;}
3720  #endif
3721  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3722  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3723  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3724  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3725  //_____________________________________________________________/\_______________________________________________________________
3726  //==============================================================================================================================
3727  //
3728  //                                          FSR - [SRTM] SIMPLE REVERSIBLE TONE-MAPPER
3729  //
3730  //------------------------------------------------------------------------------------------------------------------------------
3731  // This provides a way to take linear HDR color {0 to FP16_MAX} and convert it into a temporary {0 to 1} ranged post-tonemapped linear.
3732  // The tonemapper preserves RGB ratio, which helps maintain HDR color bleed during filtering.
3733  //------------------------------------------------------------------------------------------------------------------------------
3734  // Reversible tonemapper usage,
3735  //  FsrSrtm*(color); // {0 to FP16_MAX} converted to {0 to 1}.
3736  //  FsrSrtmInv*(color); // {0 to 1} converted into {0 to 32768, output peak safe for FP16}.
3737  //==============================================================================================================================
3738  #if defined(A_GPU)
3739   void FsrSrtmF(inout AF3 c){c*=AF3_(ARcpF1(AMax3F1(c.r,c.g,c.b)+AF1_(1.0)));}
3740   // The extra max solves the c=1.0 case (which is a /0).
3741   void FsrSrtmInvF(inout AF3 c){c*=AF3_(ARcpF1(max(AF1_(1.0/32768.0),AF1_(1.0)-AMax3F1(c.r,c.g,c.b))));}
3742  #endif
3743  //==============================================================================================================================
3744  #if defined(A_GPU)&&defined(A_HALF)
3745   void FsrSrtmH(inout AH3 c){c*=AH3_(ARcpH1(AMax3H1(c.r,c.g,c.b)+AH1_(1.0)));}
3746   void FsrSrtmInvH(inout AH3 c){c*=AH3_(ARcpH1(max(AH1_(1.0/32768.0),AH1_(1.0)-AMax3H1(c.r,c.g,c.b))));}
3747  //------------------------------------------------------------------------------------------------------------------------------
3748   void FsrSrtmHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){
3749    AH2 rcp=ARcpH2(AMax3H2(cR,cG,cB)+AH2_(1.0));cR*=rcp;cG*=rcp;cB*=rcp;}
3750   void FsrSrtmInvHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){
3751    AH2 rcp=ARcpH2(max(AH2_(1.0/32768.0),AH2_(1.0)-AMax3H2(cR,cG,cB)));cR*=rcp;cG*=rcp;cB*=rcp;}
3752  #endif
3753  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3754  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3755  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3756  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3757  //_____________________________________________________________/\_______________________________________________________________
3758  //==============================================================================================================================
3759  //
3760  //                                       FSR - [TEPD] TEMPORAL ENERGY PRESERVING DITHER
3761  //
3762  //------------------------------------------------------------------------------------------------------------------------------
3763  // Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion.
3764  // Gamma 2.0 is used so that the conversion back to linear is just to square the color.
3765  // The conversion comes in 8-bit and 10-bit modes, designed for output to 8-bit UNORM or 10:10:10:2 respectively.
3766  // Given good non-biased temporal blue noise as dither input,
3767  // the output dither will temporally conserve energy.
3768  // This is done by choosing the linear nearest step point instead of perceptual nearest.
3769  // See code below for details.
3770  //------------------------------------------------------------------------------------------------------------------------------
3771  // DX SPEC RULES FOR FLOAT->UNORM 8-BIT CONVERSION
3772  // ===============================================
3773  // - Output is 'uint(floor(saturate(n)*255.0+0.5))'.
3774  // - Thus rounding is to nearest.
3775  // - NaN gets converted to zero.
3776  // - INF is clamped to {0.0 to 1.0}.
3777  //==============================================================================================================================
3778  #if defined(A_GPU)
3779   // Hand tuned integer position to dither value, with more values than simple checkerboard.
3780   // Only 32-bit has enough precision for this compddation.
3781   // Output is {0 to <1}.
3782   AF1 FsrTepdDitF(AU2 p,AU1 f){
3783    AF1 x=AF1_(p.x+f);
3784    AF1 y=AF1_(p.y);
3785    // The 1.61803 golden ratio.
3786    AF1 a=AF1_((1.0+sqrt(5.0))/2.0);
3787    // Number designed to provide a good visual pattern.
3788    AF1 b=AF1_(1.0/3.69);
3789    x=x*a+(y*b);
3790    return AFractF1(x);}
3791  //------------------------------------------------------------------------------------------------------------------------------
3792   // This version is 8-bit gamma 2.0.
3793   // The 'c' input is {0 to 1}.
3794   // Output is {0 to 1} ready for image store.
3795   void FsrTepdC8F(inout AF3 c,AF1 dit){
3796    AF3 n=sqrt(c);
3797    n=floor(n*AF3_(255.0))*AF3_(1.0/255.0);
3798    AF3 a=n*n;
3799    AF3 b=n+AF3_(1.0/255.0);b=b*b;
3800    // Ratio of 'a' to 'b' required to produce 'c'.
3801    // APrxLoRcpF1() won't work here (at least for very high dynamic ranges).
3802    // APrxMedRcpF1() is an IADD,FMA,MUL.
3803    AF3 r=(c-b)*APrxMedRcpF3(a-b);
3804    // Use the ratio as a cutoff to choose 'a' or 'b'.
3805    // AGtZeroF1() is a MUL.
3806    c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/255.0));}
3807  //------------------------------------------------------------------------------------------------------------------------------
3808   // This version is 10-bit gamma 2.0.
3809   // The 'c' input is {0 to 1}.
3810   // Output is {0 to 1} ready for image store.
3811   void FsrTepdC10F(inout AF3 c,AF1 dit){
3812    AF3 n=sqrt(c);
3813    n=floor(n*AF3_(1023.0))*AF3_(1.0/1023.0);
3814    AF3 a=n*n;
3815    AF3 b=n+AF3_(1.0/1023.0);b=b*b;
3816    AF3 r=(c-b)*APrxMedRcpF3(a-b);
3817    c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/1023.0));}
3818  #endif
3819  //==============================================================================================================================
3820  #if defined(A_GPU)&&defined(A_HALF)
3821   AH1 FsrTepdDitH(AU2 p,AU1 f){
3822    AF1 x=AF1_(p.x+f);
3823    AF1 y=AF1_(p.y);
3824    AF1 a=AF1_((1.0+sqrt(5.0))/2.0);
3825    AF1 b=AF1_(1.0/3.69);
3826    x=x*a+(y*b);
3827    return AH1(AFractF1(x));}
3828  //------------------------------------------------------------------------------------------------------------------------------
3829   void FsrTepdC8H(inout AH3 c,AH1 dit){
3830    AH3 n=sqrt(c);
3831    n=floor(n*AH3_(255.0))*AH3_(1.0/255.0);
3832    AH3 a=n*n;
3833    AH3 b=n+AH3_(1.0/255.0);b=b*b;
3834    AH3 r=(c-b)*APrxMedRcpH3(a-b);
3835    c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/255.0));}
3836  //------------------------------------------------------------------------------------------------------------------------------
3837   void FsrTepdC10H(inout AH3 c,AH1 dit){
3838    AH3 n=sqrt(c);
3839    n=floor(n*AH3_(1023.0))*AH3_(1.0/1023.0);
3840    AH3 a=n*n;
3841    AH3 b=n+AH3_(1.0/1023.0);b=b*b;
3842    AH3 r=(c-b)*APrxMedRcpH3(a-b);
3843    c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/1023.0));}
3844  //==============================================================================================================================
3845   // This computes dither for positions 'p' and 'p+{8,0}'.
3846   AH2 FsrTepdDitHx2(AU2 p,AU1 f){
3847    AF2 x;
3848    x.x=AF1_(p.x+f);
3849    x.y=x.x+AF1_(8.0);
3850    AF1 y=AF1_(p.y);
3851    AF1 a=AF1_((1.0+sqrt(5.0))/2.0);
3852    AF1 b=AF1_(1.0/3.69);
3853    x=x*AF2_(a)+AF2_(y*b);
3854    return AH2(AFractF2(x));}
3855  //------------------------------------------------------------------------------------------------------------------------------
3856   void FsrTepdC8Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){
3857    AH2 nR=sqrt(cR);
3858    AH2 nG=sqrt(cG);
3859    AH2 nB=sqrt(cB);
3860    nR=floor(nR*AH2_(255.0))*AH2_(1.0/255.0);
3861    nG=floor(nG*AH2_(255.0))*AH2_(1.0/255.0);
3862    nB=floor(nB*AH2_(255.0))*AH2_(1.0/255.0);
3863    AH2 aR=nR*nR;
3864    AH2 aG=nG*nG;
3865    AH2 aB=nB*nB;
3866    AH2 bR=nR+AH2_(1.0/255.0);bR=bR*bR;
3867    AH2 bG=nG+AH2_(1.0/255.0);bG=bG*bG;
3868    AH2 bB=nB+AH2_(1.0/255.0);bB=bB*bB;
3869    AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR);
3870    AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG);
3871    AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB);
3872    cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/255.0));
3873    cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/255.0));
3874    cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/255.0));}
3875  //------------------------------------------------------------------------------------------------------------------------------
3876   void FsrTepdC10Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){
3877    AH2 nR=sqrt(cR);
3878    AH2 nG=sqrt(cG);
3879    AH2 nB=sqrt(cB);
3880    nR=floor(nR*AH2_(1023.0))*AH2_(1.0/1023.0);
3881    nG=floor(nG*AH2_(1023.0))*AH2_(1.0/1023.0);
3882    nB=floor(nB*AH2_(1023.0))*AH2_(1.0/1023.0);
3883    AH2 aR=nR*nR;
3884    AH2 aG=nG*nG;
3885    AH2 aB=nB*nB;
3886    AH2 bR=nR+AH2_(1.0/1023.0);bR=bR*bR;
3887    AH2 bG=nG+AH2_(1.0/1023.0);bG=bG*bG;
3888    AH2 bB=nB+AH2_(1.0/1023.0);bB=bB*bB;
3889    AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR);
3890    AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG);
3891    AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB);
3892    cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/1023.0));
3893    cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/1023.0));
3894    cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/1023.0));}
3895  #endif
3896  
3897  
// Tests 'v' against the box described by the globals bLeft and tRight.
// With bLeft <= tRight on both axes the result is 1.0 when bLeft <= v < tRight
// holds for x and y, and 0.0 otherwise.
float insideBox(vec2 v) {
    vec2 atOrPastLow = step(bLeft, v);
    vec2 atOrPastHigh = step(tRight, v);
    vec2 within = atOrPastLow - atOrPastHigh;
    return within.x * within.y;
}
3902  
// Maps an output position to the coordinate that is actually written.
//  - X is mirrored inside [min(dstX0,dstX1), max(dstX0,dstX1)) when dstX1 < dstX0.
//  - Y is mirrored inside [dstY0, dstY1) when dstY0 < dstY1.
// Otherwise the component is returned unchanged.
AF2 translateDest(AF2 pos) {
    AF2 translatedPos = pos;
    if (dstX1 < dstX0) {
        // The previous 'dstX1 - x' is <= 0 for every x at or above dstX1, so it
        // never landed inside the destination box. Mirror around the box instead,
        // using the same form as the Y axis below.
        translatedPos.x = dstX0 + dstX1 - pos.x - 1;
    }
    if (dstY0 < dstY1) {
        translatedPos.y = dstY1 + dstY0 - pos.y - 1;
    }
    return translatedPos;
}
3909  
// Writes one output pixel.
// Positions for which insideBox() returns 0 get opaque black at 'pos' itself.
// All other positions get the FsrEasuF() result, evaluated at the position
// relative to bLeft and stored at translateDest(pos).
void CurrFilter(AU2 pos)
{
    bool outside = insideBox(vec2(pos.x, pos.y)) == 0.0;
    if (outside) {
        imageStore(imgOutput, ASU2(pos.x, pos.y), AF4(0,0,0,1));
    } else {
        AF3 c;
        FsrEasuF(c, AU2(pos.x - bLeft.x, pos.y - bLeft.y), con0, con1, con2, con3);
        imageStore(imgOutput, ASU2(translateDest(pos)), AF4(c, 1));
    }
}
3920  
// Compute shader entry point.
// Derives the source/destination extents from the 'dimensions' uniform block,
// sets the destination bounds and the EASU constants, then filters four pixels
// per invocation.
void main() {
    srcW = abs(srcX1 - srcX0);
    srcH = abs(srcY1 - srcY0);
    dstW = abs(dstX1 - dstX0);
    dstH = abs(dstY1 - dstY0);

    // Lower and upper corner of the destination rectangle, whichever way round
    // the dst* coordinates were supplied.
    setBounds(min(vec2(dstX1, dstY1), vec2(dstX0, dstY0)),
        max(vec2(dstX0, dstY0), vec2(dstX1, dstY1)));

    // Upscaling
    FsrEasuCon(con0, con1, con2, con3,
        srcW, srcH,  // Viewport size (top left aligned) in the input image which is to be scaled.
        srcW, srcH,  // The size of the input image.
        dstW, dstH); // The output resolution.

    // Each work group covers a 16x16 tile (work group id << 4); ARmp8x8() supplies
    // the position inside the tile's first 8x8 quadrant (presumably, confirm
    // against its definition).
    AU2 base = ARmp8x8(gl_LocalInvocationID.x) + AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u);

    // Same visiting order as before: (0,0), (8,0), (8,8), (0,8).
    CurrFilter(base);
    CurrFilter(base + AU2(8u, 0u));
    CurrFilter(base + AU2(8u, 8u));
    CurrFilter(base + AU2(0u, 8u));
}