FsrSharpening.glsl
// Sharpening
#version 430 core

// Compute work-group size: 64 invocations along X (Y and Z default to 1).
layout (local_size_x = 64) in;

// rgba8 storage image at set 3, binding 0.
// NOTE(review): presumably the image this pass writes; the code that uses it is outside this chunk.
layout (rgba8, binding = 0, set = 3) uniform image2D imgOutput;

// Uniform block at binding 2 holding one vec2.
// NOTE(review): the name suggests a reciprocal resolution - confirm against the host code.
layout (binding = 2) uniform invResolution
{
    vec2 invResolution_data;
};

// Uniform block at binding 3 holding one vec2.
// NOTE(review): the name suggests the output resolution - confirm against the host code.
layout (binding = 3) uniform outvResolution
{
    vec2 outvResolution_data;
};

// Combined image sampler at set 2, binding 1.
layout (binding = 1, set = 2) uniform sampler2D source;

// Uniform block at binding 4 holding one float.
layout (binding = 4) uniform sharpening
{
    float sharpening_data;
};

// Configure the portability header below for the GPU code path, GLSL flavour.
#define A_GPU 1
#define A_GLSL 1
//==============================================================================================================================
//
// [A] SHADER PORTABILITY 1.20210629
//
//==============================================================================================================================
// FidelityFX Super Resolution Sample
//
// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
37 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 38 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 39 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE 40 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 41 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 42 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 43 // THE SOFTWARE. 44 //------------------------------------------------------------------------------------------------------------------------------ 45 // MIT LICENSE 46 // =========== 47 // Copyright (c) 2014 Michal Drobot (for concepts used in "FLOAT APPROXIMATIONS"). 48 // ----------- 49 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation 50 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 51 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 52 // Software is furnished to do so, subject to the following conditions: 53 // ----------- 54 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the 55 // Software. 56 // ----------- 57 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 58 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 59 // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 60 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
//------------------------------------------------------------------------------------------------------------------------------
// ABOUT
// =====
// Common central point for high-level shading language and C portability for various shader headers.
//------------------------------------------------------------------------------------------------------------------------------
// DEFINES
// =======
// A_CPU ..... Include the CPU related code.
// A_GPU ..... Include the GPU related code.
// A_GLSL .... Using GLSL.
// A_HLSL .... Using HLSL.
// A_HLSL_6_2 Using HLSL 6.2 with new 'uint16_t' and related types (requires '-enable-16bit-types').
// A_NO_16_BIT_CAST Don't use instructions that are not available in SPIR-V (needed for running A_HLSL_6_2 on Vulkan)
// A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default).
// =======
// A_BYTE .... Support 8-bit integer.
// A_HALF .... Support 16-bit integer and floating point.
// A_LONG .... Support 64-bit integer.
// A_DUBL .... Support 64-bit floating point.
// =======
// A_WAVE .... Support wave-wide operations.
//------------------------------------------------------------------------------------------------------------------------------
// To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'.
//------------------------------------------------------------------------------------------------------------------------------
// SIMPLIFIED TYPE SYSTEM
// ======================
// - All ints will be unsigned with exception of when signed is required.
// - Type naming simplified and shortened "A<type><#components>",
// - H = 16-bit float (half)
// - F = 32-bit float (float)
// - D = 64-bit float (double)
// - P = 1-bit integer (predicate, not using bool because 'B' is used for byte)
// - B = 8-bit integer (byte)
// - W = 16-bit integer (word)
// - U = 32-bit integer (unsigned)
// - L = 64-bit integer (long)
// - Using "AS<type><#components>" for signed when required.
//------------------------------------------------------------------------------------------------------------------------------
// TODO
// ====
// - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops).
//------------------------------------------------------------------------------------------------------------------------------
// CHANGE LOG
// ==========
// 20200914 - Expanded wave ops and prx code.
// 20200713 - Added [ZOL] section, fixed serious bugs in sRGB and Rec.709 color conversion code, etc.
//==============================================================================================================================
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// COMMON
//==============================================================================================================================
// Two times pi, as a floating-point literal.
#define A_2PI 6.28318530718
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 117 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 118 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 119 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 120 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 121 //_____________________________________________________________/\_______________________________________________________________ 122 //============================================================================================================================== 123 // 124 // 125 // CPU 126 // 127 // 128 //============================================================================================================================== 129 #ifdef A_CPU 130 // Supporting user defined overrides. 131 #ifndef A_RESTRICT 132 #define A_RESTRICT __restrict 133 #endif 134 //------------------------------------------------------------------------------------------------------------------------------ 135 #ifndef A_STATIC 136 #define A_STATIC static 137 #endif 138 //------------------------------------------------------------------------------------------------------------------------------ 139 // Same types across CPU and GPU. 140 // Predicate uses 32-bit integer (C friendly bool). 
141 typedef uint32_t AP1; 142 typedef float AF1; 143 typedef double AD1; 144 typedef uint8_t AB1; 145 typedef uint16_t AW1; 146 typedef uint32_t AU1; 147 typedef uint64_t AL1; 148 typedef int8_t ASB1; 149 typedef int16_t ASW1; 150 typedef int32_t ASU1; 151 typedef int64_t ASL1; 152 //------------------------------------------------------------------------------------------------------------------------------ 153 #define AD1_(a) ((AD1)(a)) 154 #define AF1_(a) ((AF1)(a)) 155 #define AL1_(a) ((AL1)(a)) 156 #define AU1_(a) ((AU1)(a)) 157 //------------------------------------------------------------------------------------------------------------------------------ 158 #define ASL1_(a) ((ASL1)(a)) 159 #define ASU1_(a) ((ASU1)(a)) 160 //------------------------------------------------------------------------------------------------------------------------------ 161 A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;} 162 //------------------------------------------------------------------------------------------------------------------------------ 163 #define A_TRUE 1 164 #define A_FALSE 0 165 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 166 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 167 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 168 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 169 //_____________________________________________________________/\_______________________________________________________________ 170 //============================================================================================================================== 171 // 172 // CPU/GPU PORTING 173 // 174 
//------------------------------------------------------------------------------------------------------------------------------
// Get CPU and GPU to share all setup code, without duplicate code paths.
// This uses a lower-case prefix for special vector constructs.
// - In C restrict pointers are used.
// - In the shading language, in/inout/out arguments are used.
// This depends on the ability to access a vector value in both languages via array syntax (aka color[2]).
//==============================================================================================================================
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
//==============================================================================================================================
// Vector return types: on the CPU every vector is a restrict-qualified pointer to its scalar type.
#define retAD2 AD1 *A_RESTRICT
#define retAD3 AD1 *A_RESTRICT
#define retAD4 AD1 *A_RESTRICT
#define retAF2 AF1 *A_RESTRICT
#define retAF3 AF1 *A_RESTRICT
#define retAF4 AF1 *A_RESTRICT
#define retAL2 AL1 *A_RESTRICT
#define retAL3 AL1 *A_RESTRICT
#define retAL4 AL1 *A_RESTRICT
#define retAU2 AU1 *A_RESTRICT
#define retAU3 AU1 *A_RESTRICT
#define retAU4 AU1 *A_RESTRICT
//------------------------------------------------------------------------------------------------------------------------------
// Vector 'in' parameters (same pointer type; the direction is a naming convention only on the CPU).
#define inAD2 AD1 *A_RESTRICT
#define inAD3 AD1 *A_RESTRICT
#define inAD4 AD1 *A_RESTRICT
#define inAF2 AF1 *A_RESTRICT
#define inAF3 AF1 *A_RESTRICT
#define inAF4 AF1 *A_RESTRICT
#define inAL2 AL1 *A_RESTRICT
#define inAL3 AL1 *A_RESTRICT
#define inAL4 AL1 *A_RESTRICT
#define inAU2 AU1 *A_RESTRICT
#define inAU3 AU1 *A_RESTRICT
#define inAU4 AU1 *A_RESTRICT
//------------------------------------------------------------------------------------------------------------------------------
// Vector 'inout' parameters.
#define inoutAD2 AD1 *A_RESTRICT
#define inoutAD3 AD1 *A_RESTRICT
#define inoutAD4 AD1 *A_RESTRICT
#define inoutAF2 AF1 *A_RESTRICT
#define inoutAF3 AF1 *A_RESTRICT
#define inoutAF4 AF1 *A_RESTRICT
#define inoutAL2 AL1 *A_RESTRICT
#define inoutAL3 AL1 *A_RESTRICT
#define inoutAL4 AL1 *A_RESTRICT
#define inoutAU2 AU1 *A_RESTRICT
#define inoutAU3 AU1 *A_RESTRICT
#define inoutAU4 AU1 *A_RESTRICT
//------------------------------------------------------------------------------------------------------------------------------
// Vector 'out' parameters.
#define outAD2 AD1 *A_RESTRICT
#define outAD3 AD1 *A_RESTRICT
#define outAD4 AD1 *A_RESTRICT
#define outAF2 AF1 *A_RESTRICT
#define outAF3 AF1 *A_RESTRICT
#define outAF4 AF1 *A_RESTRICT
#define outAL2 AL1 *A_RESTRICT
#define outAL3 AL1 *A_RESTRICT
#define outAL4 AL1 *A_RESTRICT
#define outAU2 AU1 *A_RESTRICT
#define outAU3 AU1 *A_RESTRICT
#define outAU4 AU1 *A_RESTRICT
//------------------------------------------------------------------------------------------------------------------------------
// Vector variable declarations: varXXn(x) declares 'x' as an n-element array of the scalar type.
#define varAD2(x) AD1 x[2]
#define varAD3(x) AD1 x[3]
#define varAD4(x) AD1 x[4]
#define varAF2(x) AF1 x[2]
#define varAF3(x) AF1 x[3]
#define varAF4(x) AF1 x[4]
#define varAL2(x) AL1 x[2]
#define varAL3(x) AL1 x[3]
#define varAL4(x) AL1 x[4]
#define varAU2(x) AU1 x[2]
#define varAU3(x) AU1 x[3]
#define varAU4(x) AU1 x[4]
//------------------------------------------------------------------------------------------------------------------------------
// Vector initializers: each expands to a brace-enclosed initializer list of its arguments.
#define initAD2(x,y) {x,y}
#define initAD3(x,y,z) {x,y,z}
#define initAD4(x,y,z,w) {x,y,z,w}
#define initAF2(x,y) {x,y}
#define initAF3(x,y,z) {x,y,z}
#define initAF4(x,y,z,w) {x,y,z,w}
#define initAL2(x,y) {x,y}
#define initAL3(x,y,z) {x,y,z}
#define initAL4(x,y,z,w) {x,y,z,w}
#define initAU2(x,y) {x,y}
#define initAU3(x,y,z) {x,y,z}
#define initAU4(x,y,z,w) {x,y,z,w}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// SCALAR RETURN OPS
//------------------------------------------------------------------------------------------------------------------------------
// TODO
// ====
// - Replace transcendentals with manual versions.
273 //============================================================================================================================== 274 #ifdef A_GCC 275 A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);} 276 A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);} 277 A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));} 278 A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_llabs(ASL1_(a)));} 279 #else 280 A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);} 281 A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);} 282 A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));} 283 A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(labs((long)ASL1_(a)));} 284 #endif 285 //------------------------------------------------------------------------------------------------------------------------------ 286 #ifdef A_GCC 287 A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);} 288 A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);} 289 #else 290 A_STATIC AD1 ACosD1(AD1 a){return cos(a);} 291 A_STATIC AF1 ACosF1(AF1 a){return cosf(a);} 292 #endif 293 //------------------------------------------------------------------------------------------------------------------------------ 294 A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];} 295 A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} 296 A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} 297 A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];} 298 A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} 299 A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} 300 //------------------------------------------------------------------------------------------------------------------------------ 301 #ifdef A_GCC 302 A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);} 303 A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);} 304 #else 305 A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);} 306 A_STATIC AF1 
AExp2F1(AF1 a){return exp2f(a);} 307 #endif 308 //------------------------------------------------------------------------------------------------------------------------------ 309 #ifdef A_GCC 310 A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);} 311 A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);} 312 #else 313 A_STATIC AD1 AFloorD1(AD1 a){return floor(a);} 314 A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);} 315 #endif 316 //------------------------------------------------------------------------------------------------------------------------------ 317 A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);} 318 A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);} 319 //------------------------------------------------------------------------------------------------------------------------------ 320 #ifdef A_GCC 321 A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);} 322 A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);} 323 #else 324 A_STATIC AD1 ALog2D1(AD1 a){return log2(a);} 325 A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);} 326 #endif 327 //------------------------------------------------------------------------------------------------------------------------------ 328 A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;} 329 A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;} 330 A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;} 331 A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;} 332 //------------------------------------------------------------------------------------------------------------------------------ 333 // These follow the convention that A integer types don't have signage, until they are operated on. 
334 A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;} 335 A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;} 336 //------------------------------------------------------------------------------------------------------------------------------ 337 A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a<b?a:b;} 338 A_STATIC AF1 AMinF1(AF1 a,AF1 b){return a<b?a:b;} 339 A_STATIC AL1 AMinL1(AL1 a,AL1 b){return a<b?a:b;} 340 A_STATIC AU1 AMinU1(AU1 a,AU1 b){return a<b?a:b;} 341 //------------------------------------------------------------------------------------------------------------------------------ 342 A_STATIC AL1 AMinSL1(AL1 a,AL1 b){return (ASL1_(a)<ASL1_(b))?a:b;} 343 A_STATIC AU1 AMinSU1(AU1 a,AU1 b){return (ASU1_(a)<ASU1_(b))?a:b;} 344 //------------------------------------------------------------------------------------------------------------------------------ 345 A_STATIC AD1 ARcpD1(AD1 a){return 1.0/a;} 346 A_STATIC AF1 ARcpF1(AF1 a){return 1.0f/a;} 347 //------------------------------------------------------------------------------------------------------------------------------ 348 A_STATIC AL1 AShrSL1(AL1 a,AL1 b){return AL1_(ASL1_(a)>>ASL1_(b));} 349 A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));} 350 //------------------------------------------------------------------------------------------------------------------------------ 351 #ifdef A_GCC 352 A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);} 353 A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);} 354 #else 355 A_STATIC AD1 ASinD1(AD1 a){return sin(a);} 356 A_STATIC AF1 ASinF1(AF1 a){return sinf(a);} 357 #endif 358 //------------------------------------------------------------------------------------------------------------------------------ 359 #ifdef A_GCC 360 A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);} 361 A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);} 362 #else 363 A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);} 364 A_STATIC AF1 
ASqrtF1(AF1 a){return sqrtf(a);} 365 #endif 366 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 367 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 368 //_____________________________________________________________/\_______________________________________________________________ 369 //============================================================================================================================== 370 // SCALAR RETURN OPS - DEPENDENT 371 //============================================================================================================================== 372 A_STATIC AD1 AClampD1(AD1 x,AD1 n,AD1 m){return AMaxD1(n,AMinD1(x,m));} 373 A_STATIC AF1 AClampF1(AF1 x,AF1 n,AF1 m){return AMaxF1(n,AMinF1(x,m));} 374 //------------------------------------------------------------------------------------------------------------------------------ 375 A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);} 376 A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);} 377 //------------------------------------------------------------------------------------------------------------------------------ 378 A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));} 379 A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));} 380 //------------------------------------------------------------------------------------------------------------------------------ 381 A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));} 382 A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));} 383 //------------------------------------------------------------------------------------------------------------------------------ 384 A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));} 385 A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));} 386 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 387 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 388 //_____________________________________________________________/\_______________________________________________________________ 389 //============================================================================================================================== 390 // VECTOR OPS 391 //------------------------------------------------------------------------------------------------------------------------------ 392 // These are added as needed for production or prototyping, so not necessarily a complete set. 393 // They follow a convention of taking in a destination and also returning the destination value to increase utility. 394 //============================================================================================================================== 395 A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;} 396 A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;} 397 A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;} 398 //------------------------------------------------------------------------------------------------------------------------------ 399 A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;} 400 A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;} 401 A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;} 402 //============================================================================================================================== 403 A_STATIC retAD2 
opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} 404 A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} 405 A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} 406 //------------------------------------------------------------------------------------------------------------------------------ 407 A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} 408 A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} 409 A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} 410 //============================================================================================================================== 411 A_STATIC retAD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;} 412 A_STATIC retAD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;} 413 A_STATIC retAD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;} 414 //------------------------------------------------------------------------------------------------------------------------------ 415 A_STATIC retAF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;} 416 A_STATIC retAF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;} 417 A_STATIC retAF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;} 418 //============================================================================================================================== 419 A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;} 420 A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} 421 A_STATIC retAD4 opACpyD4(outAD4 
d,inAD4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} 422 //------------------------------------------------------------------------------------------------------------------------------ 423 A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;} 424 A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} 425 A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} 426 //============================================================================================================================== 427 A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;} 428 A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;} 429 A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;} 430 //------------------------------------------------------------------------------------------------------------------------------ 431 A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;} 432 A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;} 433 A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;} 434 //============================================================================================================================== 435 A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;} 436 A_STATIC retAD3 opALerpOneD3(outAD3 
d,inAD3 a,inAD3 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;} 437 A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;} 438 //------------------------------------------------------------------------------------------------------------------------------ 439 A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;} 440 A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;} 441 A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;} 442 //============================================================================================================================== 443 A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;} 444 A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;} 445 A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;} 446 //------------------------------------------------------------------------------------------------------------------------------ 447 A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;} 448 A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;} 449 A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;} 450 
//============================================================================================================================== 451 A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;} 452 A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;} 453 A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;} 454 //------------------------------------------------------------------------------------------------------------------------------ 455 A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;} 456 A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;} 457 A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;} 458 //============================================================================================================================== 459 A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} 460 A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} 461 A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} 462 //------------------------------------------------------------------------------------------------------------------------------ 463 A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} 464 A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} 465 A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return 
d;} 466 //============================================================================================================================== 467 A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} 468 A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;} 469 A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;} 470 //------------------------------------------------------------------------------------------------------------------------------ 471 A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} 472 A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;} 473 A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;} 474 //============================================================================================================================== 475 A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){d[0]=-a[0];d[1]=-a[1];return d;} 476 A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;} 477 A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;} 478 //------------------------------------------------------------------------------------------------------------------------------ 479 A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){d[0]=-a[0];d[1]=-a[1];return d;} 480 A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;} 481 A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;} 482 //============================================================================================================================== 483 A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);return d;} 484 A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 
a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);return d;} 485 A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);d[3]=ARcpD1(a[3]);return d;} 486 //------------------------------------------------------------------------------------------------------------------------------ 487 A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);return d;} 488 A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);return d;} 489 A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);d[3]=ARcpF1(a[3]);return d;} 490 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 491 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 492 //_____________________________________________________________/\_______________________________________________________________ 493 //============================================================================================================================== 494 // HALF FLOAT PACKING 495 //============================================================================================================================== 496 // Convert float to half (in lower 16-bits of output). 497 // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf 498 // Supports denormals. 
499 // Conversion rules are to make computations possibly "safer" on the GPU, 500 // -INF & -NaN -> -65504 501 // +INF & +NaN -> +65504 502 A_STATIC AU1 AU1_AH1_AF1(AF1 f){ 503 static AW1 base[512]={ 504 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 505 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 506 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 507 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 508 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 509 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 510 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100, 511 0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00, 512 0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff, 513 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 514 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 515 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 516 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 517 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 518 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 519 
0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 520 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 521 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 522 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 523 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 524 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 525 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 526 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100, 527 0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00, 528 0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff, 529 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 530 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 531 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 532 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 533 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 534 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 535 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff}; 536 static 
AB1 shift[512]={ 537 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 538 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 539 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 540 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 541 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 542 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 543 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, 544 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, 545 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, 546 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 547 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 548 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 549 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 550 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 551 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 552 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 553 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 554 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 555 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 556 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 557 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 558 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 559 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, 560 
0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, 561 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, 562 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 563 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 564 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 565 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 566 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 567 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 568 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18}; 569 union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);} 570 //------------------------------------------------------------------------------------------------------------------------------ 571 // Used to output packed constant. 
// Packs two floats as halves into one word: a[0] goes to the low 16 bits, a[1] to the high 16 bits.
A_STATIC AU1 AU1_AH2_AF2(inAF2 a){
 AU1 lowHalf=AU1_AH1_AF1(a[0]);
 AU1 highHalf=AU1_AH1_AF1(a[1]);
 return lowHalf+(highHalf<<16);
}
// Closes a conditional that was opened before this part of the file.
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//
//
//                                                            GLSL
//
//
//==============================================================================================================================
#if defined(A_GLSL) && defined(A_GPU)
 // Extension requests can be suppressed as a whole by defining A_SKIP_EXT.
 #ifndef A_SKIP_EXT
  #ifdef A_HALF
   #extension GL_EXT_shader_16bit_storage:require
   #extension GL_EXT_shader_explicit_arithmetic_types:require
  #endif
//------------------------------------------------------------------------------------------------------------------------------
  #ifdef A_LONG
   #extension GL_ARB_gpu_shader_int64:require
   #extension GL_NV_shader_atomic_int64:require
  #endif
//------------------------------------------------------------------------------------------------------------------------------
  #ifdef A_WAVE
   #extension GL_KHR_shader_subgroup_arithmetic:require
   #extension GL_KHR_shader_subgroup_ballot:require
   #extension GL_KHR_shader_subgroup_quad:require
   #extension GL_KHR_shader_subgroup_shuffle:require
  #endif
 #endif
//==============================================================================================================================
// Portable type names: P = bool, F = float, U = unsigned int, SU = signed int; the digit is the component count.
#define AP1 bool
#define AP2 bvec2
#define AP3 bvec3
#define AP4 bvec4
//------------------------------------------------------------------------------------------------------------------------------
#define AF1 float
#define AF2 vec2
#define AF3 vec3
#define AF4 vec4
//------------------------------------------------------------------------------------------------------------------------------
#define AU1 uint
#define AU2 uvec2
#define AU3 uvec3
#define AU4 uvec4
//------------------------------------------------------------------------------------------------------------------------------
#define ASU1 int
#define ASU2 ivec2
#define ASU3 ivec3
#define ASU4 ivec4
//==============================================================================================================================
// Bit-pattern reinterpretation: unsigned integer bits -> float.
#define AF1_AU1(x) uintBitsToFloat(AU1(x))
#define AF2_AU2(x) uintBitsToFloat(AU2(x))
#define AF3_AU3(x) uintBitsToFloat(AU3(x))
#define AF4_AU4(x) uintBitsToFloat(AU4(x))
//------------------------------------------------------------------------------------------------------------------------------
// Bit-pattern reinterpretation: float bits -> unsigned integer.
#define AU1_AF1(x) floatBitsToUint(AF1(x))
#define AU2_AF2(x) floatBitsToUint(AF2(x))
#define AU3_AF3(x) floatBitsToUint(AF3(x))
#define AU4_AF4(x) floatBitsToUint(AF4(x))
//------------------------------------------------------------------------------------------------------------------------------
// Packs 'a' through packHalf2x16 as the first component of a pair whose second component is 0.0.
AU1 AU1_AH1_AF1_x(AF1 a){
 AF2 padded=AF2(a,0.0);
 return packHalf2x16(padded);
}
#define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
// Packing aliases that map straight onto the GLSL built-ins.
#define AU1_AH2_AF2 packHalf2x16
#define AU1_AW2Unorm_AF2 packUnorm2x16
#define AU1_AB4Unorm_AF4 packUnorm4x8
//------------------------------------------------------------------------------------------------------------------------------
// Unpacking aliases that map straight onto the GLSL built-ins.
#define AF2_AH2_AU1 unpackHalf2x16
#define AF2_AW2Unorm_AU1 unpackUnorm2x16
#define AF4_AB4Unorm_AU1 unpackUnorm4x8
//==============================================================================================================================
// AF{1,2,3,4}_x: replicate one float into every component of the result.
// The AF*_() macros convert their argument to AF1 first.
AF1 AF1_x(AF1 a){
 return a;
}
AF2 AF2_x(AF1 a){
 return AF2(a);
}
AF3 AF3_x(AF1 a){
 return AF3(a);
}
AF4 AF4_x(AF1 a){
 return AF4(a);
}
#define AF1_(a) AF1_x(AF1(a))
#define AF2_(a) AF2_x(AF1(a))
#define AF3_(a) AF3_x(AF1(a))
#define AF4_(a) AF4_x(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
// AU{1,2,3,4}_x: replicate one unsigned integer into every component of the result.
// The AU*_() macros convert their argument to AU1 first.
AU1 AU1_x(AU1 a){
 return a;
}
AU2 AU2_x(AU1 a){
 return AU2(a);
}
AU3 AU3_x(AU1 a){
 return AU3(a);
}
AU4 AU4_x(AU1 a){
 return AU4(a);
}
#define AU1_(a) AU1_x(AU1(a))
#define AU2_(a) AU2_x(AU1(a))
#define AU3_(a) AU3_x(AU1(a))
#define AU4_(a) AU4_x(AU1(a))
//==============================================================================================================================
// AAbsSU{1,2,3,4}: absolute value taken on the operand converted to signed int, result converted back to unsigned.
AU1 AAbsSU1(AU1 a){
 ASU1 s=ASU1(a);
 return AU1(abs(s));
}
AU2 AAbsSU2(AU2 a){
 ASU2 s=ASU2(a);
 return AU2(abs(s));
}
AU3 AAbsSU3(AU3 a){
 ASU3 s=ASU3(a);
 return AU3(abs(s));
}
AU4 AAbsSU4(AU4 a){
 ASU4 s=ASU4(a);
 return AU4(abs(s));
}
//------------------------------------------------------------------------------------------------------------------------------
// ABfe: extract 'bits' bits of 'src' starting at bit 'off' (bitfieldExtract on unsigned data).
AU1 ABfe(AU1 src,AU1 off,AU1 bits){
 ASU1 firstBit=ASU1(off);
 ASU1 bitCount=ASU1(bits);
 return bitfieldExtract(src,firstBit,bitCount);
}
// ABfi: take the bits of 'ins' where 'mask' is set and the bits of 'src' everywhere else.
AU1 ABfi(AU1 src,AU1 ins,AU1 mask){
 AU1 inserted=ins&mask;
 AU1 preserved=src&(~mask);
 return inserted|preserved;
}
// Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
// Inserts the low 'bits' bits of 'ins' into 'src' starting at bit 0.
AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){
 ASU1 bitCount=ASU1(bits);
 return bitfieldInsert(src,ins,0,bitCount);
}
//------------------------------------------------------------------------------------------------------------------------------
// V_MED3_F32.
// AClampF{1,2,3,4}: clamp 'x' to the range [n,m], component-wise.
AF1 AClampF1(AF1 x,AF1 n,AF1 m){
 AF1 bounded=clamp(x,n,m);
 return bounded;
}
AF2 AClampF2(AF2 x,AF2 n,AF2 m){
 AF2 bounded=clamp(x,n,m);
 return bounded;
}
AF3 AClampF3(AF3 x,AF3 n,AF3 m){
 AF3 bounded=clamp(x,n,m);
 return bounded;
}
AF4 AClampF4(AF4 x,AF4 n,AF4 m){
 AF4 bounded=clamp(x,n,m);
 return bounded;
}
//------------------------------------------------------------------------------------------------------------------------------
// V_FRACT_F32 (note DX frac() is different).
// AFractF{1,2,3,4}: fractional part of 'x' as returned by GLSL fract().
AF1 AFractF1(AF1 x){
 AF1 fraction=fract(x);
 return fraction;
}
AF2 AFractF2(AF2 x){
 AF2 fraction=fract(x);
 return fraction;
}
AF3 AFractF3(AF3 x){
 AF3 fraction=fract(x);
 return fraction;
}
AF4 AFractF4(AF4 x){
 AF4 fraction=fract(x);
 return fraction;
}
//------------------------------------------------------------------------------------------------------------------------------
// ALerpF{1,2,3,4}: linear blend between 'x' and 'y' by weight 'a' (GLSL mix()).
AF1 ALerpF1(AF1 x,AF1 y,AF1 a){
 AF1 blended=mix(x,y,a);
 return blended;
}
AF2 ALerpF2(AF2 x,AF2 y,AF2 a){
 AF2 blended=mix(x,y,a);
 return blended;
}
AF3 ALerpF3(AF3 x,AF3 y,AF3 a){
 AF3 blended=mix(x,y,a);
 return blended;
}
AF4 ALerpF4(AF4 x,AF4 y,AF4 a){
 AF4 blended=mix(x,y,a);
 return blended;
}
//------------------------------------------------------------------------------------------------------------------------------
// V_MAX3_F32.
// AMax3F{1,2,3,4}: component-wise maximum of three float operands.
AF1 AMax3F1(AF1 x,AF1 y,AF1 z){
 AF1 yz=max(y,z);
 return max(x,yz);
}
AF2 AMax3F2(AF2 x,AF2 y,AF2 z){
 AF2 yz=max(y,z);
 return max(x,yz);
}
AF3 AMax3F3(AF3 x,AF3 y,AF3 z){
 AF3 yz=max(y,z);
 return max(x,yz);
}
AF4 AMax3F4(AF4 x,AF4 y,AF4 z){
 AF4 yz=max(y,z);
 return max(x,yz);
}
//------------------------------------------------------------------------------------------------------------------------------
// AMax3SU{1,2,3,4}: maximum of three operands compared after conversion to signed int; result converted back to unsigned.
AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){
 ASU1 sx=ASU1(x);
 ASU1 sy=ASU1(y);
 ASU1 sz=ASU1(z);
 return AU1(max(sx,max(sy,sz)));
}
AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){
 ASU2 sx=ASU2(x);
 ASU2 sy=ASU2(y);
 ASU2 sz=ASU2(z);
 return AU2(max(sx,max(sy,sz)));
}
AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){
 ASU3 sx=ASU3(x);
 ASU3 sy=ASU3(y);
 ASU3 sz=ASU3(z);
 return AU3(max(sx,max(sy,sz)));
}
AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){
 ASU4 sx=ASU4(x);
 ASU4 sy=ASU4(y);
 ASU4 sz=ASU4(z);
 return AU4(max(sx,max(sy,sz)));
}
//------------------------------------------------------------------------------------------------------------------------------
// AMax3U{1,2,3,4}: component-wise maximum of three unsigned operands.
AU1 AMax3U1(AU1 x,AU1 y,AU1 z){
 AU1 yz=max(y,z);
 return max(x,yz);
}
AU2 AMax3U2(AU2 x,AU2 y,AU2 z){
 AU2 yz=max(y,z);
 return max(x,yz);
}
AU3 AMax3U3(AU3 x,AU3 y,AU3 z){
 AU3 yz=max(y,z);
 return max(x,yz);
}
AU4 AMax3U4(AU4 x,AU4 y,AU4 z){
 AU4 yz=max(y,z);
 return max(x,yz);
}
//------------------------------------------------------------------------------------------------------------------------------
// AMaxSU{1,2,3,4}: maximum of two operands compared after conversion to signed int; result converted back to unsigned.
AU1 AMaxSU1(AU1 a,AU1 b){
 ASU1 sa=ASU1(a);
 ASU1 sb=ASU1(b);
 return AU1(max(sa,sb));
}
AU2 AMaxSU2(AU2 a,AU2 b){
 ASU2 sa=ASU2(a);
 ASU2 sb=ASU2(b);
 return AU2(max(sa,sb));
}
AU3 AMaxSU3(AU3 a,AU3 b){
 ASU3 sa=ASU3(a);
 ASU3 sb=ASU3(b);
 return AU3(max(sa,sb));
}
AU4 AMaxSU4(AU4 a,AU4 b){
 ASU4 sa=ASU4(a);
 ASU4 sb=ASU4(b);
 return AU4(max(sa,sb));
}
//------------------------------------------------------------------------------------------------------------------------------
// Clamp has an easier pattern match for med3 when some ordering is known.
// V_MED3_F32.
// AMed3F{1,2,3,4}: component-wise median of three floats, built from min/max only.
AF1 AMed3F1(AF1 x,AF1 y,AF1 z){
 AF1 lowXY=min(x,y);
 AF1 highXY=max(x,y);
 return max(lowXY,min(highXY,z));
}
AF2 AMed3F2(AF2 x,AF2 y,AF2 z){
 AF2 lowXY=min(x,y);
 AF2 highXY=max(x,y);
 return max(lowXY,min(highXY,z));
}
AF3 AMed3F3(AF3 x,AF3 y,AF3 z){
 AF3 lowXY=min(x,y);
 AF3 highXY=max(x,y);
 return max(lowXY,min(highXY,z));
}
AF4 AMed3F4(AF4 x,AF4 y,AF4 z){
 AF4 lowXY=min(x,y);
 AF4 highXY=max(x,y);
 return max(lowXY,min(highXY,z));
}
//------------------------------------------------------------------------------------------------------------------------------
// V_MIN3_F32.
// AMin3F{1,2,3,4}: component-wise minimum of three float operands.
AF1 AMin3F1(AF1 x,AF1 y,AF1 z){
 AF1 yz=min(y,z);
 return min(x,yz);
}
AF2 AMin3F2(AF2 x,AF2 y,AF2 z){
 AF2 yz=min(y,z);
 return min(x,yz);
}
AF3 AMin3F3(AF3 x,AF3 y,AF3 z){
 AF3 yz=min(y,z);
 return min(x,yz);
}
AF4 AMin3F4(AF4 x,AF4 y,AF4 z){
 AF4 yz=min(y,z);
 return min(x,yz);
}
//------------------------------------------------------------------------------------------------------------------------------
// AMin3SU{1,2,3,4}: minimum of three operands compared after conversion to signed int; result converted back to unsigned.
AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){
 ASU1 sx=ASU1(x);
 ASU1 sy=ASU1(y);
 ASU1 sz=ASU1(z);
 return AU1(min(sx,min(sy,sz)));
}
AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){
 ASU2 sx=ASU2(x);
 ASU2 sy=ASU2(y);
 ASU2 sz=ASU2(z);
 return AU2(min(sx,min(sy,sz)));
}
AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){
 ASU3 sx=ASU3(x);
 ASU3 sy=ASU3(y);
 ASU3 sz=ASU3(z);
 return AU3(min(sx,min(sy,sz)));
}
AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){
 ASU4 sx=ASU4(x);
 ASU4 sy=ASU4(y);
 ASU4 sz=ASU4(z);
 return AU4(min(sx,min(sy,sz)));
}
//------------------------------------------------------------------------------------------------------------------------------
// AMin3U{1,2,3,4}: component-wise minimum of three unsigned operands.
AU1 AMin3U1(AU1 x,AU1 y,AU1 z){
 AU1 yz=min(y,z);
 return min(x,yz);
}
AU2 AMin3U2(AU2 x,AU2 y,AU2 z){
 AU2 yz=min(y,z);
 return min(x,yz);
}
AU3 AMin3U3(AU3 x,AU3 y,AU3 z){
 AU3 yz=min(y,z);
 return min(x,yz);
}
AU4 AMin3U4(AU4 x,AU4 y,AU4 z){
 AU4 yz=min(y,z);
 return min(x,yz);
}
//------------------------------------------------------------------------------------------------------------------------------
// AMinSU{1,2,3,4}: minimum of two operands compared after conversion to signed int; result converted back to unsigned.
AU1 AMinSU1(AU1 a,AU1 b){
 ASU1 sa=ASU1(a);
 ASU1 sb=ASU1(b);
 return AU1(min(sa,sb));
}
AU2 AMinSU2(AU2 a,AU2 b){
 ASU2 sa=ASU2(a);
 ASU2 sb=ASU2(b);
 return AU2(min(sa,sb));
}
AU3 AMinSU3(AU3 a,AU3 b){
 ASU3 sa=ASU3(a);
 ASU3 sb=ASU3(b);
 return AU3(min(sa,sb));
}
AU4 AMinSU4(AU4 a,AU4 b){
 ASU4 sa=ASU4(a);
 ASU4 sb=ASU4(b);
 return AU4(min(sa,sb));
}
//------------------------------------------------------------------------------------------------------------------------------
// Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
// V_COS_F32.
// ANCosF{1,2,3,4}: cosine of 'x' after scaling it by A_2PI.
AF1 ANCosF1(AF1 x){
 AF1 scale=AF1_(A_2PI);
 return cos(x*scale);
}
AF2 ANCosF2(AF2 x){
 AF2 scale=AF2_(A_2PI);
 return cos(x*scale);
}
AF3 ANCosF3(AF3 x){
 AF3 scale=AF3_(A_2PI);
 return cos(x*scale);
}
AF4 ANCosF4(AF4 x){
 AF4 scale=AF4_(A_2PI);
 return cos(x*scale);
}
//------------------------------------------------------------------------------------------------------------------------------
// Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
// V_SIN_F32.
// ANSinF{1,2,3,4}: sine of 'x' after scaling it by A_2PI.
AF1 ANSinF1(AF1 x){
 AF1 scale=AF1_(A_2PI);
 return sin(x*scale);
}
AF2 ANSinF2(AF2 x){
 AF2 scale=AF2_(A_2PI);
 return sin(x*scale);
}
AF3 ANSinF3(AF3 x){
 AF3 scale=AF3_(A_2PI);
 return sin(x*scale);
}
AF4 ANSinF4(AF4 x){
 AF4 scale=AF4_(A_2PI);
 return sin(x*scale);
}
//------------------------------------------------------------------------------------------------------------------------------
// ARcpF{1,2,3,4}: component-wise reciprocal, 1.0/x.
AF1 ARcpF1(AF1 x){
 AF1 one=AF1_(1.0);
 return one/x;
}
AF2 ARcpF2(AF2 x){
 AF2 one=AF2_(1.0);
 return one/x;
}
AF3 ARcpF3(AF3 x){
 AF3 one=AF3_(1.0);
 return one/x;
}
AF4 ARcpF4(AF4 x){
 AF4 one=AF4_(1.0);
 return one/x;
}
//------------------------------------------------------------------------------------------------------------------------------
// ARsqF{1,2,3,4}: component-wise reciprocal square root, 1.0/sqrt(x).
AF1 ARsqF1(AF1 x){
 AF1 one=AF1_(1.0);
 return one/sqrt(x);
}
AF2 ARsqF2(AF2 x){
 AF2 one=AF2_(1.0);
 return one/sqrt(x);
}
AF3 ARsqF3(AF3 x){
 AF3 one=AF3_(1.0);
 return one/sqrt(x);
}
AF4 ARsqF4(AF4 x){
 AF4 one=AF4_(1.0);
 return one/sqrt(x);
}
//------------------------------------------------------------------------------------------------------------------------------
// ASatF{1,2,3,4}: clamp every component of 'x' to the range [0.0,1.0].
AF1 ASatF1(AF1 x){
 AF1 floorValue=AF1_(0.0);
 AF1 ceilValue=AF1_(1.0);
 return clamp(x,floorValue,ceilValue);
}
AF2 ASatF2(AF2 x){
 AF2 floorValue=AF2_(0.0);
 AF2 ceilValue=AF2_(1.0);
 return clamp(x,floorValue,ceilValue);
}
AF3 ASatF3(AF3 x){
 AF3 floorValue=AF3_(0.0);
 AF3 ceilValue=AF3_(1.0);
 return clamp(x,floorValue,ceilValue);
}
AF4 ASatF4(AF4 x){
 AF4 floorValue=AF4_(0.0);
 AF4 ceilValue=AF4_(1.0);
 return clamp(x,floorValue,ceilValue);
}
//------------------------------------------------------------------------------------------------------------------------------
// AShrSU{1,2,3,4}: right shift carried out on the operands converted to signed int; result converted back to unsigned.
AU1 AShrSU1(AU1 a,AU1 b){
 ASU1 value=ASU1(a);
 ASU1 amount=ASU1(b);
 return AU1(value>>amount);
}
AU2 AShrSU2(AU2 a,AU2 b){
 ASU2 value=ASU2(a);
 ASU2 amount=ASU2(b);
 return AU2(value>>amount);
}
AU3 AShrSU3(AU3 a,AU3 b){
 ASU3 value=ASU3(a);
 ASU3 amount=ASU3(b);
 return AU3(value>>amount);
}
AU4 AShrSU4(AU4 a,AU4 b){
 ASU4 value=ASU4(a);
 ASU4 amount=ASU4(b);
 return AU4(value>>amount);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                          GLSL BYTE
//==============================================================================================================================
// 8-bit types and splat helpers; only compiled when A_BYTE is defined.
#ifdef A_BYTE
 #define AB1 uint8_t
 #define AB2 u8vec2
 #define AB3 u8vec3
 #define AB4 u8vec4
//------------------------------------------------------------------------------------------------------------------------------
 #define ASB1 int8_t
 #define ASB2 i8vec2
 #define ASB3 i8vec3
 #define ASB4 i8vec4
//------------------------------------------------------------------------------------------------------------------------------
 // AB{1,2,3,4}_x: replicate one unsigned byte into every component of the result.
 AB1 AB1_x(AB1 a){
  return a;
 }
 AB2 AB2_x(AB1 a){
  return AB2(a);
 }
 AB3 AB3_x(AB1 a){
  return AB3(a);
 }
 AB4 AB4_x(AB1 a){
  return AB4(a);
 }
 #define AB1_(a) AB1_x(AB1(a))
 #define AB2_(a) AB2_x(AB1(a))
 #define AB3_(a) AB3_x(AB1(a))
 #define AB4_(a) AB4_x(AB1(a))
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                          GLSL HALF
//==============================================================================================================================
// 16-bit types and conversions; only compiled when A_HALF is defined.
// The matching #endif lies beyond this part of the file.
#ifdef A_HALF
 #define AH1 float16_t
 #define AH2 f16vec2
 #define AH3 f16vec3
 #define AH4 f16vec4
//------------------------------------------------------------------------------------------------------------------------------
 #define AW1 uint16_t
 #define AW2 u16vec2
 #define AW3 u16vec3
 #define AW4 u16vec4
//------------------------------------------------------------------------------------------------------------------------------
 #define ASW1 int16_t
 #define ASW2 i16vec2
 #define ASW3 i16vec3
 #define ASW4 i16vec4
//==============================================================================================================================
 // Unpack 32-bit words into 16-bit float / unsigned vectors.
 #define AH2_AU1(x) unpackFloat2x16(AU1(x))
 // x.x supplies the first two halves of the result, x.y the last two.
 AH4 AH4_AU2_x(AU2 x){
  AH2 firstPair=unpackFloat2x16(x.x);
  AH2 secondPair=unpackFloat2x16(x.y);
  return AH4(firstPair,secondPair);
 }
 #define AH4_AU2(x) AH4_AU2_x(AU2(x))
 #define AW2_AU1(x) unpackUint2x16(AU1(x))
 #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x)))
//------------------------------------------------------------------------------------------------------------------------------
 // Pack 16-bit float / unsigned vectors into 32-bit words.
 #define AU1_AH2(x) packFloat2x16(AH2(x))
 // x.xy is packed into the first word of the result, x.zw into the second.
 AU2 AU2_AH4_x(AH4 x){
  AU1 firstWord=packFloat2x16(x.xy);
  AU1 secondWord=packFloat2x16(x.zw);
  return AU2(firstWord,secondWord);
 }
 #define AU2_AH4(x) AU2_AH4_x(AH4(x))
 #define AU1_AW2(x) packUint2x16(AW2(x))
 #define AU2_AW4(x) unpack32(packUint4x16(AW4(x)))
//==============================================================================================================================
// 16-bit float -> 16-bit unsigned integer through halfBitsToUint16().
#define AW1_AH1(x) halfBitsToUint16(AH1(x))
#define AW2_AH2(x) halfBitsToUint16(AH2(x))
#define AW3_AH3(x) halfBitsToUint16(AH3(x))
#define AW4_AH4(x) halfBitsToUint16(AH4(x))
//------------------------------------------------------------------------------------------------------------------------------
// 16-bit unsigned integer -> 16-bit float through uint16BitsToHalf().
#define AH1_AW1(x) uint16BitsToHalf(AW1(x))
#define AH2_AW2(x) uint16BitsToHalf(AW2(x))
#define AH3_AW3(x) uint16BitsToHalf(AW3(x))
#define AH4_AW4(x) uint16BitsToHalf(AW4(x))
//==============================================================================================================================
// Splat helpers: replicate one AH1 value into every component of the result.
AH1 AH1_x(AH1 a) { return AH1(a); }
AH2 AH2_x(AH1 a) { return AH2(a, a); }
AH3 AH3_x(AH1 a) { return AH3(a, a, a); }
AH4 AH4_x(AH1 a) { return AH4(a, a, a, a); }
// Macro front-ends that convert the argument to AH1 before splatting.
#define AH1_(a) AH1_x(AH1(a))
#define AH2_(a) AH2_x(AH1(a))
#define AH3_(a) AH3_x(AH1(a))
#define AH4_(a) AH4_x(AH1(a))
//------------------------------------------------------------------------------------------------------------------------------
// Splat helpers: replicate one AW1 value into every component of the result.
AW1 AW1_x(AW1 a) { return AW1(a); }
AW2 AW2_x(AW1 a) { return AW2(a, a); }
AW3 AW3_x(AW1 a) { return AW3(a, a, a); }
AW4 AW4_x(AW1 a) { return AW4(a, a, a, a); }
// Macro front-ends that convert the argument to AW1 before splatting.
#define AW1_(a) AW1_x(AW1(a))
#define AW2_(a) AW2_x(AW1(a))
#define AW3_(a) AW3_x(AW1(a))
#define AW4_(a) AW4_x(AW1(a))
//==============================================================================================================================
// Absolute value of the operand converted to the signed 16-bit type, returned in the unsigned type.
AW1 AAbsSW1(AW1 a) { return AW1(abs(ASW1(a))); }
AW2 AAbsSW2(AW2 a) { return AW2(abs(ASW2(a))); }
AW3 AAbsSW3(AW3 a) { return AW3(abs(ASW3(a))); }
AW4 AAbsSW4(AW4 a) { return AW4(abs(ASW4(a))); }
//------------------------------------------------------------------------------------------------------------------------------
// Clamp x to the range [n, m].
AH1 AClampH1(AH1 x, AH1 n, AH1 m) { return clamp(x, n, m); }
AH2 AClampH2(AH2 x, AH2 n, AH2 m) { return clamp(x, n, m); }
AH3 AClampH3(AH3 x, AH3 n, AH3 m) { return clamp(x, n, m); }
AH4 AClampH4(AH4 x, AH4 n, AH4 m) { return clamp(x, n, m); }
//------------------------------------------------------------------------------------------------------------------------------
// Fractional part via the fract() built-in.
AH1 AFractH1(AH1 x) { return fract(x); }
AH2 AFractH2(AH2 x) { return fract(x); }
AH3 AFractH3(AH3 x) { return fract(x); }
AH4 AFractH4(AH4 x) { return fract(x); }
//------------------------------------------------------------------------------------------------------------------------------
// Linear interpolation between x and y with weight a, via the mix() built-in.
AH1 ALerpH1(AH1 x, AH1 y, AH1 a) { return mix(x, y, a); }
AH2 ALerpH2(AH2 x, AH2 y, AH2 a) { return mix(x, y, a); }
AH3 ALerpH3(AH3 x, AH3 y, AH3 a) { return mix(x, y, a); }
AH4 ALerpH4(AH4 x, AH4 y, AH4 a) { return mix(x, y, a); }
//------------------------------------------------------------------------------------------------------------------------------
// No packed version of max3.
AH1 AMax3H1(AH1 x, AH1 y, AH1 z) { return max(x, max(y, z)); }
AH2 AMax3H2(AH2 x, AH2 y, AH2 z) { return max(x, max(y, z)); }
AH3 AMax3H3(AH3 x, AH3 y, AH3 z) { return max(x, max(y, z)); }
AH4 AMax3H4(AH4 x, AH4 y, AH4 z) { return max(x, max(y, z)); }
//------------------------------------------------------------------------------------------------------------------------------
// Signed maximum of 16-bit values carried in the unsigned AW* types.
// The operands are converted to the 16-bit signed ASW* types (the same route AAbsSW* takes) so that values with the top
// bit set compare as negative. Going through the ASU* types instead would widen the unsigned 16-bit inputs without
// sign extension, which turns the comparison into an unsigned one.
AW1 AMaxSW1(AW1 a, AW1 b) { return AW1(max(ASW1(a), ASW1(b))); }
AW2 AMaxSW2(AW2 a, AW2 b) { return AW2(max(ASW2(a), ASW2(b))); }
AW3 AMaxSW3(AW3 a, AW3 b) { return AW3(max(ASW3(a), ASW3(b))); }
AW4 AMaxSW4(AW4 a, AW4 b) { return AW4(max(ASW4(a), ASW4(b))); }
//------------------------------------------------------------------------------------------------------------------------------
// No packed version of min3.
// Minimum of three values, built from two nested min() calls.
AH1 AMin3H1(AH1 x, AH1 y, AH1 z) { return min(x, min(y, z)); }
AH2 AMin3H2(AH2 x, AH2 y, AH2 z) { return min(x, min(y, z)); }
AH3 AMin3H3(AH3 x, AH3 y, AH3 z) { return min(x, min(y, z)); }
AH4 AMin3H4(AH4 x, AH4 y, AH4 z) { return min(x, min(y, z)); }
//------------------------------------------------------------------------------------------------------------------------------
// Signed minimum of 16-bit values carried in the unsigned AW* types.
// The operands are converted to the 16-bit signed ASW* types (the same route AShrSW* takes) so that values with the top
// bit set compare as negative. Going through the ASU* types instead would widen the unsigned 16-bit inputs without
// sign extension, which turns the comparison into an unsigned one.
AW1 AMinSW1(AW1 a, AW1 b) { return AW1(min(ASW1(a), ASW1(b))); }
AW2 AMinSW2(AW2 a, AW2 b) { return AW2(min(ASW2(a), ASW2(b))); }
AW3 AMinSW3(AW3 a, AW3 b) { return AW3(min(ASW3(a), ASW3(b))); }
AW4 AMinSW4(AW4 a, AW4 b) { return AW4(min(ASW4(a), ASW4(b))); }
//------------------------------------------------------------------------------------------------------------------------------
// Reciprocal, written as a division of a splatted 1.0 by x.
AH1 ARcpH1(AH1 x) { return AH1_(1.0) / x; }
AH2 ARcpH2(AH2 x) { return AH2_(1.0) / x; }
AH3 ARcpH3(AH3 x) { return AH3_(1.0) / x; }
AH4 ARcpH4(AH4 x) { return AH4_(1.0) / x; }
//------------------------------------------------------------------------------------------------------------------------------
// Reciprocal square root, written as a division of a splatted 1.0 by sqrt(x).
AH1 ARsqH1(AH1 x) { return AH1_(1.0) / sqrt(x); }
AH2 ARsqH2(AH2 x) { return AH2_(1.0) / sqrt(x); }
AH3 ARsqH3(AH3 x) { return AH3_(1.0) / sqrt(x); }
AH4 ARsqH4(AH4 x) { return AH4_(1.0) / sqrt(x); }
//------------------------------------------------------------------------------------------------------------------------------
// Saturate: clamp x to [0.0, 1.0].
AH1 ASatH1(AH1 x) { return clamp(x, AH1_(0.0), AH1_(1.0)); }
AH2 ASatH2(AH2 x) { return clamp(x, AH2_(0.0), AH2_(1.0)); }
AH3 ASatH3(AH3 x) { return clamp(x, AH3_(0.0), AH3_(1.0)); }
AH4 ASatH4(AH4 x) { return clamp(x, AH4_(0.0), AH4_(1.0)); }
//------------------------------------------------------------------------------------------------------------------------------
// Right shift carried out on the signed 16-bit type, result returned in the unsigned type.
AW1 AShrSW1(AW1 a, AW1 b) { return AW1(ASW1(a) >> ASW1(b)); }
AW2 AShrSW2(AW2 a, AW2 b) { return AW2(ASW2(a) >> ASW2(b)); }
AW3 AShrSW3(AW3 a, AW3 b) { return AW3(ASW3(a) >> ASW3(b)); }
AW4 AShrSW4(AW4 a, AW4 b) { return AW4(ASW4(a) >> ASW4(b)); }
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// GLSL DOUBLE
//==============================================================================================================================
#ifdef A_DUBL
// 64-bit float scalar and vector aliases.
#define AD1 double
#define AD2 dvec2
#define AD3 dvec3
#define AD4 dvec4
//------------------------------------------------------------------------------------------------------------------------------
// Splat helpers: replicate one AD1 value into every component of the result.
AD1 AD1_x(AD1 a) { return AD1(a); }
AD2 AD2_x(AD1 a) { return AD2(a, a); }
AD3 AD3_x(AD1 a) { return AD3(a, a, a); }
AD4 AD4_x(AD1 a) { return AD4(a, a, a, a); }
// Macro front-ends that convert the argument to AD1 before splatting.
#define AD1_(a) AD1_x(AD1(a))
#define AD2_(a) AD2_x(AD1(a))
#define AD3_(a) AD3_x(AD1(a))
#define AD4_(a) AD4_x(AD1(a))
//==============================================================================================================================
// Fractional part via the fract() built-in.
AD1 AFractD1(AD1 x) { return fract(x); }
AD2 AFractD2(AD2 x) { return fract(x); }
AD3 AFractD3(AD3 x) { return fract(x); }
AD4 AFractD4(AD4 x) { return fract(x); }
//------------------------------------------------------------------------------------------------------------------------------
// Linear interpolation between x and y with weight a, via the mix() built-in.
AD1 ALerpD1(AD1 x, AD1 y, AD1 a) { return mix(x, y, a); }
AD2 ALerpD2(AD2 x, AD2 y, AD2 a) { return mix(x, y, a); }
AD3 ALerpD3(AD3 x, AD3 y, AD3 a) { return mix(x, y, a); }
AD4 ALerpD4(AD4 x, AD4 y, AD4 a) { return mix(x, y, a); }
//------------------------------------------------------------------------------------------------------------------------------
// Reciprocal, written as a division of a splatted 1.0 by x.
AD1 ARcpD1(AD1 x) { return AD1_(1.0) / x; }
AD2 ARcpD2(AD2 x) { return AD2_(1.0) / x; }
AD3 ARcpD3(AD3 x) { return AD3_(1.0) / x; }
AD4 ARcpD4(AD4 x) { return AD4_(1.0) / x; }
//------------------------------------------------------------------------------------------------------------------------------
// Reciprocal square root, written as a division of a splatted 1.0 by sqrt(x).
AD1 ARsqD1(AD1 x) { return AD1_(1.0) / sqrt(x); }
AD2 ARsqD2(AD2 x) { return AD2_(1.0) / sqrt(x); }
AD3 ARsqD3(AD3 x) { return AD3_(1.0) / sqrt(x); }
AD4 ARsqD4(AD4 x) { return AD4_(1.0) / sqrt(x); }
//------------------------------------------------------------------------------------------------------------------------------
// Saturate: clamp x to [0.0, 1.0].
AD1 ASatD1(AD1 x) { return clamp(x, AD1_(0.0), AD1_(1.0)); }
AD2 ASatD2(AD2 x) { return clamp(x, AD2_(0.0), AD2_(1.0)); }
AD3 ASatD3(AD3 x) { return clamp(x, AD3_(0.0), AD3_(1.0)); }
AD4 ASatD4(AD4 x) { return clamp(x, AD4_(0.0), AD4_(1.0)); }
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// GLSL LONG
//==============================================================================================================================
#ifdef A_LONG
// 64-bit unsigned integer scalar and vector aliases.
#define AL1 uint64_t
#define AL2 u64vec2
#define AL3 u64vec3
#define AL4 u64vec4
//------------------------------------------------------------------------------------------------------------------------------
// 64-bit signed integer scalar and vector aliases (the 3- and 4-component aliases follow directly below).
#define ASL1 int64_t
#define ASL2 i64vec2
#define ASL3 i64vec3
#define ASL4 i64vec4
//------------------------------------------------------------------------------------------------------------------------------
// Conversions between one 64-bit value and a pair of 32-bit values.
#define AL1_AU2(x) packUint2x32(AU2(x))
#define AU2_AL1(x) unpackUint2x32(AL1(x))
//------------------------------------------------------------------------------------------------------------------------------
// Splat helpers: replicate one AL1 value into every component of the result.
AL1 AL1_x(AL1 a) { return AL1(a); }
AL2 AL2_x(AL1 a) { return AL2(a, a); }
AL3 AL3_x(AL1 a) { return AL3(a, a, a); }
AL4 AL4_x(AL1 a) { return AL4(a, a, a, a); }
// Macro front-ends that convert the argument to AL1 before splatting.
#define AL1_(a) AL1_x(AL1(a))
#define AL2_(a) AL2_x(AL1(a))
#define AL3_(a) AL3_x(AL1(a))
#define AL4_(a) AL4_x(AL1(a))
//==============================================================================================================================
// Absolute value of the operand converted to the signed 64-bit type, returned in the unsigned type.
AL1 AAbsSL1(AL1 a) { return AL1(abs(ASL1(a))); }
AL2 AAbsSL2(AL2 a) { return AL2(abs(ASL2(a))); }
AL3 AAbsSL3(AL3 a) { return AL3(abs(ASL3(a))); }
AL4 AAbsSL4(AL4 a) { return AL4(abs(ASL4(a))); }
//------------------------------------------------------------------------------------------------------------------------------
// Signed maximum of 64-bit values carried in the unsigned AL* types.
// The comparison is done on the 64-bit signed ASL* types (the same route AAbsSL* takes). Converting the operands to
// the ASU* types instead would not keep the full 64-bit value.
AL1 AMaxSL1(AL1 a, AL1 b) { return AL1(max(ASL1(a), ASL1(b))); }
AL2 AMaxSL2(AL2 a, AL2 b) { return AL2(max(ASL2(a), ASL2(b))); }
AL3 AMaxSL3(AL3 a, AL3 b) { return AL3(max(ASL3(a), ASL3(b))); }
AL4 AMaxSL4(AL4 a, AL4 b) { return AL4(max(ASL4(a), ASL4(b))); }
//------------------------------------------------------------------------------------------------------------------------------
// Signed minimum of 64-bit values carried in the unsigned AL* types; compared on ASL* for the same reason as AMaxSL*.
AL1 AMinSL1(AL1 a, AL1 b) { return AL1(min(ASL1(a), ASL1(b))); }
AL2 AMinSL2(AL2 a, AL2 b) { return AL2(min(ASL2(a), ASL2(b))); }
AL3 AMinSL3(AL3 a, AL3 b) { return AL3(min(ASL3(a), ASL3(b))); }
AL4 AMinSL4(AL4 a, AL4 b) { return AL4(min(ASL4(a), ASL4(b))); }
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// WAVE OPERATIONS
//==============================================================================================================================
#ifdef A_WAVE
// Where 'x' must be a compile time literal.
// Each wrapper forwards the value and 'x' to subgroupShuffleXor().
AF1 AWaveXorF1(AF1 v, AU1 x) { return subgroupShuffleXor(v, x); }
AF2 AWaveXorF2(AF2 v, AU1 x) { return subgroupShuffleXor(v, x); }
AF3 AWaveXorF3(AF3 v, AU1 x) { return subgroupShuffleXor(v, x); }
AF4 AWaveXorF4(AF4 v, AU1 x) { return subgroupShuffleXor(v, x); }
AU1 AWaveXorU1(AU1 v, AU1 x) { return subgroupShuffleXor(v, x); }
AU2 AWaveXorU2(AU2 v, AU1 x) { return subgroupShuffleXor(v, x); }
AU3 AWaveXorU3(AU3 v, AU1 x) { return subgroupShuffleXor(v, x); }
AU4 AWaveXorU4(AU4 v, AU1 x) { return subgroupShuffleXor(v, x); }
//------------------------------------------------------------------------------------------------------------------------------
#ifdef A_HALF
// 16-bit variants: the value is packed into 32-bit words, shuffled, then unpacked to the original type.
AH2 AWaveXorH2(AH2 v, AU1 x)
{
    AU1 word = AU1_AH2(v);
    return AH2_AU1(subgroupShuffleXor(word, x));
}
AH4 AWaveXorH4(AH4 v, AU1 x)
{
    AU2 words = AU2_AH4(v);
    return AH4_AU2(subgroupShuffleXor(words, x));
}
AW2 AWaveXorW2(AW2 v, AU1 x)
{
    AU1 word = AU1_AW2(v);
    return AW2_AU1(subgroupShuffleXor(word, x));
}
AW4 AWaveXorW4(AW4 v, AU1 x)
{
    AU2 words = AU2_AW4(v);
    return AW4_AU2(subgroupShuffleXor(words, x));
}
#endif
#endif
//==============================================================================================================================
// Closes a conditional opened earlier in the file (presumably the GLSL section; the opening directive is not in this part).
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//
//
// HLSL
//
//
//==============================================================================================================================
#if defined(A_HLSL) && defined(A_GPU)
// Base type aliases. With A_HLSL_6_2 the explicitly sized type names are used, otherwise the classic HLSL names.
#ifdef A_HLSL_6_2
 // Boolean.
 #define AP1 bool
 #define AP2 bool2
 #define AP3 bool3
 #define AP4 bool4
//------------------------------------------------------------------------------------------------------------------------------
 // 32-bit float.
 #define AF1 float32_t
 #define AF2 float32_t2
 #define AF3 float32_t3
 #define AF4 float32_t4
//------------------------------------------------------------------------------------------------------------------------------
 // 32-bit unsigned integer.
 #define AU1 uint32_t
 #define AU2 uint32_t2
 #define AU3 uint32_t3
 #define AU4 uint32_t4
//------------------------------------------------------------------------------------------------------------------------------
 // 32-bit signed integer.
 #define ASU1 int32_t
 #define ASU2 int32_t2
 #define ASU3 int32_t3
 #define ASU4 int32_t4
#else
 // Boolean.
 #define AP1 bool
 #define AP2 bool2
 #define AP3 bool3
 #define AP4 bool4
//------------------------------------------------------------------------------------------------------------------------------
 // Float.
 #define AF1 float
 #define AF2 float2
 #define AF3 float3
 #define AF4 float4
//------------------------------------------------------------------------------------------------------------------------------
 // Unsigned integer.
 #define AU1 uint
 #define AU2 uint2
 #define AU3 uint3
 #define AU4 uint4
//------------------------------------------------------------------------------------------------------------------------------
 // Signed integer.
 #define ASU1 int
 #define ASU2 int2
 #define ASU3 int3
 #define ASU4 int4
#endif
//==============================================================================================================================
// Unsigned integer -> float through asfloat().
#define AF1_AU1(x) asfloat(AU1(x))
#define AF2_AU2(x) asfloat(AU2(x))
#define AF3_AU3(x) asfloat(AU3(x))
#define AF4_AU4(x) asfloat(AU4(x))
//------------------------------------------------------------------------------------------------------------------------------
// Float -> unsigned integer through asuint().
#define AU1_AF1(x) asuint(AF1(x))
#define AU2_AF2(x) asuint(AF2(x))
#define AU3_AF3(x) asuint(AF3(x))
#define AU4_AF4(x) asuint(AF4(x))
//------------------------------------------------------------------------------------------------------------------------------
// One float converted with f32tof16(), returned as AU1.
AU1 AU1_AH1_AF1_x(AF1 a) { return f32tof16(a); }
#define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
// Two floats converted with f32tof16() and combined into one AU1: a.x in the low 16 bits, a.y shifted up by 16.
AU1 AU1_AH2_AF2_x(AF2 a)
{
    AU1 lowBits = f32tof16(a.x);
    AU1 highBits = f32tof16(a.y) << 16;
    return lowBits | highBits;
}
#define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a))
// NOTE(review): forwards to D3DCOLORtoUBYTE4(); confirm its result type and component order match what callers of an
// "AU1_..." conversion expect.
#define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x))
//------------------------------------------------------------------------------------------------------------------------------
// Splits one AU1 into its low and high 16-bit parts and converts each with f16tof32().
AF2 AF2_AH2_AU1_x(AU1 x)
{
    AU1 lowBits = x & 0xFFFF;
    AU1 highBits = x >> 16;
    return AF2(f16tof32(lowBits), f16tof32(highBits));
}
#define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x))
//==============================================================================================================================
// Splat helpers: replicate one AF1 value into every component of the result.
AF1 AF1_x(AF1 a) { return AF1(a); }
AF2 AF2_x(AF1 a) { return AF2(a, a); }
AF3 AF3_x(AF1 a) { return AF3(a, a, a); }
AF4 AF4_x(AF1 a) { return AF4(a, a, a, a); }
// Macro front-ends that convert the argument to AF1 before splatting.
#define AF1_(a) AF1_x(AF1(a))
#define AF2_(a) AF2_x(AF1(a))
#define AF3_(a) AF3_x(AF1(a))
#define AF4_(a) AF4_x(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
// Splat helpers: replicate one AU1 value into every component of the result.
AU1 AU1_x(AU1 a) { return AU1(a); }
AU2 AU2_x(AU1 a) { return AU2(a, a); }
AU3 AU3_x(AU1 a) { return AU3(a, a, a); }
AU4 AU4_x(AU1 a) { return AU4(a, a, a, a); }
// Macro front-ends that convert the argument to AU1 before splatting.
#define AU1_(a) AU1_x(AU1(a))
#define AU2_(a) AU2_x(AU1(a))
#define AU3_(a) AU3_x(AU1(a))
#define AU4_(a) AU4_x(AU1(a))
//==============================================================================================================================
// Absolute value of the operand converted to the signed type, returned in the unsigned type.
AU1 AAbsSU1(AU1 a) { return AU1(abs(ASU1(a))); }
AU2 AAbsSU2(AU2 a) { return AU2(abs(ASU2(a))); }
AU3 AAbsSU3(AU3 a) { return AU3(abs(ASU3(a))); }
AU4 AAbsSU4(AU4 a) { return AU4(abs(ASU4(a))); }
//------------------------------------------------------------------------------------------------------------------------------
// Bitfield extract: returns 'bits' bits of 'src' starting at bit 'off', placed in the low bits of the result.
// A width of 32 (or more) selects the whole shifted word. It needs its own case because a shift count of 32 does not
// produce an all-ones mask from (1u<<bits)-1 (shift counts are taken modulo 32 on this path).
AU1 ABfe(AU1 src, AU1 off, AU1 bits)
{
    AU1 mask = (bits < 32u) ? ((1u << bits) - 1u) : 0xFFFFFFFFu;
    return (src >> off) & mask;
}
// Bitfield insert with an explicit mask: bits of 'ins' where 'mask' is set, bits of 'src' everywhere else.
AU1 ABfi(AU1 src, AU1 ins, AU1 mask)
{
    return (ins & mask) | (src & (~mask));
}
// Bitfield insert of the low 'bits' bits of 'ins' into 'src'. A width of 32 (or more) returns 'ins' unchanged, for the
// same shift-count reason as in ABfe.
AU1 ABfiM(AU1 src, AU1 ins, AU1 bits)
{
    AU1 mask = (bits < 32u) ? ((1u << bits) - 1u) : 0xFFFFFFFFu;
    return (ins & mask) | (src & (~mask));
}
//------------------------------------------------------------------------------------------------------------------------------
// Clamp x to [n, m], written as max(n, min(x, m)).
AF1 AClampF1(AF1 x, AF1 n, AF1 m) { return max(n, min(x, m)); }
AF2 AClampF2(AF2 x, AF2 n, AF2 m) { return max(n, min(x, m)); }
AF3 AClampF3(AF3 x, AF3 n, AF3 m) { return max(n, min(x, m)); }
AF4 AClampF4(AF4 x, AF4 n, AF4 m) { return max(n, min(x, m)); }
//------------------------------------------------------------------------------------------------------------------------------
// Fractional part, written as x - floor(x).
AF1 AFractF1(AF1 x) { return x - floor(x); }
AF2 AFractF2(AF2 x) { return x - floor(x); }
AF3 AFractF3(AF3 x) { return x - floor(x); }
AF4 AFractF4(AF4 x) { return x - floor(x); }
//------------------------------------------------------------------------------------------------------------------------------
// Linear interpolation between x and y with weight a, via the lerp() intrinsic.
AF1 ALerpF1(AF1 x, AF1 y, AF1 a) { return lerp(x, y, a); }
AF2 ALerpF2(AF2 x, AF2 y, AF2 a) { return lerp(x, y, a); }
AF3 ALerpF3(AF3 x, AF3 y, AF3 a) { return lerp(x, y, a); }
AF4 ALerpF4(AF4 x, AF4 y, AF4 a) { return lerp(x, y, a); }
//------------------------------------------------------------------------------------------------------------------------------
// Maximum of three floats.
AF1 AMax3F1(AF1 x, AF1 y, AF1 z) { return max(x, max(y, z)); }
AF2 AMax3F2(AF2 x, AF2 y, AF2 z) { return max(x, max(y, z)); }
AF3 AMax3F3(AF3 x, AF3 y, AF3 z) { return max(x, max(y, z)); }
AF4 AMax3F4(AF4 x, AF4 y, AF4 z) { return max(x, max(y, z)); }
//------------------------------------------------------------------------------------------------------------------------------
// Maximum of three values compared on the signed type, returned in the unsigned type.
AU1 AMax3SU1(AU1 x, AU1 y, AU1 z) { return AU1(max(ASU1(x), max(ASU1(y), ASU1(z)))); }
AU2 AMax3SU2(AU2 x, AU2 y, AU2 z) { return AU2(max(ASU2(x), max(ASU2(y), ASU2(z)))); }
AU3 AMax3SU3(AU3 x, AU3 y, AU3 z) { return AU3(max(ASU3(x), max(ASU3(y), ASU3(z)))); }
AU4 AMax3SU4(AU4 x, AU4 y, AU4 z) { return AU4(max(ASU4(x), max(ASU4(y), ASU4(z)))); }
//------------------------------------------------------------------------------------------------------------------------------
// Maximum of three unsigned values.
AU1 AMax3U1(AU1 x, AU1 y, AU1 z) { return max(x, max(y, z)); }
AU2 AMax3U2(AU2 x, AU2 y, AU2 z) { return max(x, max(y, z)); }
AU3 AMax3U3(AU3 x, AU3 y, AU3 z) { return max(x, max(y, z)); }
AU4 AMax3U4(AU4 x, AU4 y, AU4 z) { return max(x, max(y, z)); }
//------------------------------------------------------------------------------------------------------------------------------
// Maximum of two values compared on the signed type, returned in the unsigned type.
AU1 AMaxSU1(AU1 a, AU1 b) { return AU1(max(ASU1(a), ASU1(b))); }
AU2 AMaxSU2(AU2 a, AU2 b) { return AU2(max(ASU2(a), ASU2(b))); }
AU3 AMaxSU3(AU3 a, AU3 b) { return AU3(max(ASU3(a), ASU3(b))); }
AU4 AMaxSU4(AU4 a, AU4 b) { return AU4(max(ASU4(a), ASU4(b))); }
//------------------------------------------------------------------------------------------------------------------------------
// Median of three: max(min(x, y), min(max(x, y), z)).
AF1 AMed3F1(AF1 x, AF1 y, AF1 z)
{
    AF1 smallerXY = min(x, y);
    AF1 largerXY = max(x, y);
    return max(smallerXY, min(largerXY, z));
}
AF2 AMed3F2(AF2 x, AF2 y, AF2 z)
{
    AF2 smallerXY = min(x, y);
    AF2 largerXY = max(x, y);
    return max(smallerXY, min(largerXY, z));
}
AF3 AMed3F3(AF3 x, AF3 y, AF3 z)
{
    AF3 smallerXY = min(x, y);
    AF3 largerXY = max(x, y);
    return max(smallerXY, min(largerXY, z));
}
AF4 AMed3F4(AF4 x, AF4 y, AF4 z)
{
    AF4 smallerXY = min(x, y);
    AF4 largerXY = max(x, y);
    return max(smallerXY, min(largerXY, z));
}
//------------------------------------------------------------------------------------------------------------------------------
// Minimum of three floats.
AF1 AMin3F1(AF1 x, AF1 y, AF1 z) { return min(x, min(y, z)); }
AF2 AMin3F2(AF2 x, AF2 y, AF2 z) { return min(x, min(y, z)); }
AF3 AMin3F3(AF3 x, AF3 y, AF3 z) { return min(x, min(y, z)); }
AF4 AMin3F4(AF4 x, AF4 y, AF4 z) { return min(x, min(y, z)); }
//------------------------------------------------------------------------------------------------------------------------------
// Minimum of three values compared on the signed type, returned in the unsigned type.
AU1 AMin3SU1(AU1 x, AU1 y, AU1 z) { return AU1(min(ASU1(x), min(ASU1(y), ASU1(z)))); }
AU2 AMin3SU2(AU2 x, AU2 y, AU2 z) { return AU2(min(ASU2(x), min(ASU2(y), ASU2(z)))); }
AU3 AMin3SU3(AU3 x, AU3 y, AU3 z) { return AU3(min(ASU3(x), min(ASU3(y), ASU3(z)))); }
AU4 AMin3SU4(AU4 x, AU4 y, AU4 z) { return AU4(min(ASU4(x), min(ASU4(y), ASU4(z)))); }
//------------------------------------------------------------------------------------------------------------------------------
// Minimum of three unsigned values.
AU1 AMin3U1(AU1 x, AU1 y, AU1 z) { return min(x, min(y, z)); }
AU2 AMin3U2(AU2 x, AU2 y, AU2 z) { return min(x, min(y, z)); }
AU3 AMin3U3(AU3 x, AU3 y, AU3 z) { return min(x, min(y, z)); }
AU4 AMin3U4(AU4 x, AU4 y, AU4 z) { return min(x, min(y, z)); }
//------------------------------------------------------------------------------------------------------------------------------
// Minimum of two values compared on the signed type, returned in the unsigned type.
AU1 AMinSU1(AU1 a, AU1 b) { return AU1(min(ASU1(a), ASU1(b))); }
AU2 AMinSU2(AU2 a, AU2 b) { return AU2(min(ASU2(a), ASU2(b))); }
AU3 AMinSU3(AU3 a, AU3 b) { return AU3(min(ASU3(a), ASU3(b))); }
AU4 AMinSU4(AU4 a, AU4 b) { return AU4(min(ASU4(a), ASU4(b))); }
//------------------------------------------------------------------------------------------------------------------------------
// Cosine of x scaled by A_2PI (A_2PI is defined elsewhere in the file).
AF1 ANCosF1(AF1 x) { return cos(x * AF1_(A_2PI)); }
AF2 ANCosF2(AF2 x) { return cos(x * AF2_(A_2PI)); }
AF3 ANCosF3(AF3 x) { return cos(x * AF3_(A_2PI)); }
AF4 ANCosF4(AF4 x) { return cos(x * AF4_(A_2PI)); }
//------------------------------------------------------------------------------------------------------------------------------
// Sine of x scaled by A_2PI.
AF1 ANSinF1(AF1 x) { return sin(x * AF1_(A_2PI)); }
AF2 ANSinF2(AF2 x) { return sin(x * AF2_(A_2PI)); }
AF3 ANSinF3(AF3 x) { return sin(x * AF3_(A_2PI)); }
AF4 ANSinF4(AF4 x) { return sin(x * AF4_(A_2PI)); }
//------------------------------------------------------------------------------------------------------------------------------
// Reciprocal via the rcp() intrinsic.
AF1 ARcpF1(AF1 x) { return rcp(x); }
AF2 ARcpF2(AF2 x) { return rcp(x); }
AF3 ARcpF3(AF3 x) { return rcp(x); }
AF4 ARcpF4(AF4 x) { return rcp(x); }
//------------------------------------------------------------------------------------------------------------------------------
// Reciprocal square root via the rsqrt() intrinsic.
AF1 ARsqF1(AF1 x) { return rsqrt(x); }
AF2 ARsqF2(AF2 x) { return rsqrt(x); }
AF3 ARsqF3(AF3 x) { return rsqrt(x); }
AF4 ARsqF4(AF4 x) { return rsqrt(x); }
//------------------------------------------------------------------------------------------------------------------------------
// Saturate via the saturate() intrinsic.
AF1 ASatF1(AF1 x) { return saturate(x); }
AF2 ASatF2(AF2 x) { return saturate(x); }
AF3 ASatF3(AF3 x) { return saturate(x); }
AF4 ASatF4(AF4 x) { return saturate(x); }
//------------------------------------------------------------------------------------------------------------------------------
// Right shift carried out on the signed type: both operands are converted to ASU*, shifted, and converted back to AU*.
AU1 AShrSU1(AU1 a, AU1 b) { return AU1(ASU1(a) >> ASU1(b)); }
AU2 AShrSU2(AU2 a, AU2 b) { return AU2(ASU2(a) >> ASU2(b)); }
AU3 AShrSU3(AU3 a, AU3 b) { return AU3(ASU3(a) >> ASU3(b)); }
AU4 AShrSU4(AU4 a, AU4 b) { return AU4(ASU4(a) >> ASU4(b)); }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// HLSL BYTE
//==============================================================================================================================
// Nothing is defined for A_BYTE on the HLSL path.
#ifdef A_BYTE
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// HLSL HALF
//==============================================================================================================================
#ifdef A_HALF
// 16-bit type aliases. With A_HLSL_6_2 the explicitly sized 16-bit types are used, otherwise the min16* types.
#ifdef A_HLSL_6_2
 // 16-bit float.
 #define AH1 float16_t
 #define AH2 float16_t2
 #define AH3 float16_t3
 #define AH4 float16_t4
//------------------------------------------------------------------------------------------------------------------------------
 // 16-bit unsigned integer.
 #define AW1 uint16_t
 #define AW2 uint16_t2
 #define AW3 uint16_t3
 #define AW4 uint16_t4
//------------------------------------------------------------------------------------------------------------------------------
 // 16-bit signed integer.
 #define ASW1 int16_t
 #define ASW2 int16_t2
 #define ASW3 int16_t3
 #define ASW4 int16_t4
#else
 // Minimum-16-bit float.
 #define AH1 min16float
 #define AH2 min16float2
 #define AH3 min16float3
 #define AH4 min16float4
//------------------------------------------------------------------------------------------------------------------------------
 // Minimum-16-bit unsigned integer.
 #define AW1 min16uint
 #define AW2 min16uint2
 #define AW3 min16uint3
 #define AW4 min16uint4
//------------------------------------------------------------------------------------------------------------------------------
 // Minimum-16-bit signed integer.
 #define ASW1 min16int
 #define ASW2 min16int2
 #define ASW3 min16int3
 #define ASW4 min16int4
#endif
//==============================================================================================================================
// Need to use manual unpack to get optimal execution (don't use packed types in buffers directly).
// Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/
// AU1 -> two halves: the low and high 16 bits are split first, converted together with f16tof32(), then narrowed.
AH2 AH2_AU1_x(AU1 x)
{
    AF2 wide = f16tof32(AU2(x & 0xFFFF, x >> 16));
    return AH2(wide);
}
// AU2 -> four halves: x.x supplies the first two components, x.y the last two.
AH4 AH4_AU2_x(AU2 x)
{
    return AH4(AH2_AU1_x(x.x), AH2_AU1_x(x.y));
}
// AU1 -> two 16-bit unsigned values: low 16 bits first, high 16 bits second.
AW2 AW2_AU1_x(AU1 x)
{
    AU2 parts = AU2(x & 0xFFFF, x >> 16);
    return AW2(parts);
}
// AU2 -> four 16-bit unsigned values.
AW4 AW4_AU2_x(AU2 x)
{
    return AW4(AW2_AU1_x(x.x), AW2_AU1_x(x.y));
}
#define AH2_AU1(x) AH2_AU1_x(AU1(x))
#define AH4_AU2(x) AH4_AU2_x(AU2(x))
#define AW2_AU1(x) AW2_AU1_x(AU1(x))
#define AW4_AU2(x) AW4_AU2_x(AU2(x))
//------------------------------------------------------------------------------------------------------------------------------
// Two halves -> AU1: x.x is converted into the low bits, x.y is converted and shifted up by 16, and the two are added.
AU1 AU1_AH2_x(AH2 x)
{
    return f32tof16(x.x) + (f32tof16(x.y) << 16);
}
// Four halves -> AU2.
AU2 AU2_AH4_x(AH4 x)
{
    return AU2(AU1_AH2_x(x.xy), AU1_AH2_x(x.zw));
}
// Two 16-bit unsigned values -> AU1: x.x in the low bits, x.y shifted up by 16.
AU1 AU1_AW2_x(AW2 x)
{
    return AU1(x.x) + (AU1(x.y) << 16);
}
// Four 16-bit unsigned values -> AU2.
AU2 AU2_AW4_x(AW4 x)
{
    return AU2(AU1_AW2_x(x.xy), AU1_AW2_x(x.zw));
}
#define AU1_AH2(x) AU1_AH2_x(AH2(x))
#define AU2_AH4(x) AU2_AH4_x(AH4(x))
#define AU1_AW2(x) AU1_AW2_x(AW2(x))
#define AU2_AW4(x) AU2_AW4_x(AW4(x))
//==============================================================================================================================
// Half -> 16-bit unsigned. asuint16() is used when A_HLSL_6_2 is set and A_NO_16_BIT_CAST is not; otherwise each
// component goes through f32tof16().
#if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST)
 #define AW1_AH1(x) asuint16(x)
 #define AW2_AH2(x) asuint16(x)
 #define AW3_AH3(x) asuint16(x)
 #define AW4_AH4(x) asuint16(x)
#else
 #define AW1_AH1(a) AW1(f32tof16(AF1(a)))
 #define AW2_AH2(a) AW2(AW1_AH1((a).x), AW1_AH1((a).y))
 #define AW3_AH3(a) AW3(AW1_AH1((a).x), AW1_AH1((a).y), AW1_AH1((a).z))
 #define AW4_AH4(a) AW4(AW1_AH1((a).x), AW1_AH1((a).y), AW1_AH1((a).z), AW1_AH1((a).w))
#endif
//------------------------------------------------------------------------------------------------------------------------------
// 16-bit unsigned -> half. asfloat16() is used under the same condition; otherwise each component goes through
// f16tof32().
#if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST)
 #define AH1_AW1(x) asfloat16(x)
 #define AH2_AW2(x) asfloat16(x)
 #define AH3_AW3(x) asfloat16(x)
 #define AH4_AW4(x) asfloat16(x)
#else
 #define AH1_AW1(a) AH1(f16tof32(AU1(a)))
 #define AH2_AW2(a) AH2(AH1_AW1((a).x), AH1_AW1((a).y))
 #define AH3_AW3(a) AH3(AH1_AW1((a).x), AH1_AW1((a).y), AH1_AW1((a).z))
 #define AH4_AW4(a) AH4(AH1_AW1((a).x), AH1_AW1((a).y), AH1_AW1((a).z), AH1_AW1((a).w))
#endif
//==============================================================================================================================
// Splat helpers: replicate one AH1 value into every component of the result.
AH1 AH1_x(AH1 a) { return AH1(a); }
AH2 AH2_x(AH1 a) { return AH2(a, a); }
AH3 AH3_x(AH1 a) { return AH3(a, a, a); }
AH4 AH4_x(AH1 a) { return AH4(a, a, a, a); }
// Macro front-ends that convert the argument to AH1 before splatting.
#define AH1_(a) AH1_x(AH1(a))
#define AH2_(a) AH2_x(AH1(a))
#define AH3_(a) AH3_x(AH1(a))
#define AH4_(a) AH4_x(AH1(a))
//------------------------------------------------------------------------------------------------------------------------------
// Splat helpers: replicate one AW1 value into every component of the result.
AW1 AW1_x(AW1 a) { return AW1(a); }
AW2 AW2_x(AW1 a) { return AW2(a, a); }
AW3 AW3_x(AW1 a) { return AW3(a, a, a); }
AW4 AW4_x(AW1 a) { return AW4(a, a, a, a); }
// Macro front-ends that convert the argument to AW1 before splatting.
#define AW1_(a) AW1_x(AW1(a))
#define AW2_(a) AW2_x(AW1(a))
#define AW3_(a) AW3_x(AW1(a))
#define AW4_(a) AW4_x(AW1(a))
//==============================================================================================================================
// Absolute value of the operand converted to the signed 16-bit type, returned in the unsigned type.
AW1 AAbsSW1(AW1 a) { return AW1(abs(ASW1(a))); }
AW2 AAbsSW2(AW2 a) { return AW2(abs(ASW2(a))); }
AW3 AAbsSW3(AW3 a) { return AW3(abs(ASW3(a))); }
AW4 AAbsSW4(AW4 a) { return AW4(abs(ASW4(a))); }
//------------------------------------------------------------------------------------------------------------------------------
// Clamp x to [n, m], written as max(n, min(x, m)).
AH1 AClampH1(AH1 x, AH1 n, AH1 m) { return max(n, min(x, m)); }
AH2 AClampH2(AH2 x, AH2 n, AH2 m) { return max(n, min(x, m)); }
AH3 AClampH3(AH3 x, AH3 n, AH3 m) { return max(n, min(x, m)); }
AH4 AClampH4(AH4 x, AH4 n, AH4 m) { return max(n, min(x, m)); }
//------------------------------------------------------------------------------------------------------------------------------
// V_FRACT_F16 (note DX frac() is different).
// Fractional part computed as x-floor(x).
AH1 AFractH1(AH1 x){return x-floor(x);}
AH2 AFractH2(AH2 x){return x-floor(x);}
AH3 AFractH3(AH3 x){return x-floor(x);}
AH4 AFractH4(AH4 x){return x-floor(x);}
//------------------------------------------------------------------------------------------------------------------------------
// Linear interpolation from 'x' to 'y' by 'a' (thin wrapper over lerp()).
AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);}
AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);}
AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);}
AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);}
//------------------------------------------------------------------------------------------------------------------------------
// Component-wise maximum of three half values.
AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
//------------------------------------------------------------------------------------------------------------------------------
// Signed maximum of 16-bit values stored in unsigned 16-bit containers.
// The compare goes through the 16-bit signed cast (ASW*), matching AAbsSW*/AShrSW*.
// Casting through the 32-bit ASU* would widen the value first and lose the 16-bit sign bit,
// turning the compare into an unsigned one.
AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));}
AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));}
AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
//------------------------------------------------------------------------------------------------------------------------------
// Component-wise minimum of three half values.
AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
//------------------------------------------------------------------------------------------------------------------------------
// Signed minimum of 16-bit values stored in unsigned 16-bit containers.
// The compare goes through the 16-bit signed cast (ASW*), matching AAbsSW*/AShrSW*.
// Casting through the 32-bit ASU* would widen the value first and lose the 16-bit sign bit,
// turning the compare into an unsigned one.
AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));}
AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));}
AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
//------------------------------------------------------------------------------------------------------------------------------
// Reciprocal (thin wrapper over rcp()).
AH1 ARcpH1(AH1 x){return rcp(x);}
AH2 ARcpH2(AH2 x){return rcp(x);}
AH3 ARcpH3(AH3 x){return rcp(x);}
AH4 ARcpH4(AH4 x){return rcp(x);}
//------------------------------------------------------------------------------------------------------------------------------
// Reciprocal square root (thin wrapper over rsqrt()).
AH1 ARsqH1(AH1 x){return rsqrt(x);}
AH2 ARsqH2(AH2 x){return rsqrt(x);}
AH3 ARsqH3(AH3 x){return rsqrt(x);}
AH4 ARsqH4(AH4 x){return rsqrt(x);}
//------------------------------------------------------------------------------------------------------------------------------
// Saturate (thin wrapper over saturate()).
AH1 ASatH1(AH1 x){return saturate(x);}
AH2 ASatH2(AH2 x){return saturate(x);}
AH3 ASatH3(AH3 x){return saturate(x);}
AH4 ASatH4(AH4 x){return saturate(x);}
//------------------------------------------------------------------------------------------------------------------------------
// Right shift performed on the ASW* cast of both operands, result returned in the unsigned 16-bit container.
AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));}
AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                         HLSL DOUBLE
//==============================================================================================================================
#ifdef A_DUBL
// Double-precision scalar/vector type aliases; HLSL 6.2 uses the explicit float64_t names.
#ifdef A_HLSL_6_2
#define AD1 float64_t
#define AD2 float64_t2
#define AD3 float64_t3
#define AD4 float64_t4
#else
#define AD1 double
#define AD2 double2
#define AD3 double3
#define AD4 double4
#endif
//------------------------------------------------------------------------------------------------------------------------------
// Broadcast a double scalar to every component of a 1..4 wide vector.
AD1 AD1_x(AD1 a) { return AD1(a); }
AD2 AD2_x(AD1 a) { return AD2(a, a); }
AD3 AD3_x(AD1 a) { return AD3(a, a, a); }
AD4 AD4_x(AD1 a) { return AD4(a, a, a, a); }
#define AD1_(a) AD1_x(AD1(a))
#define AD2_(a) AD2_x(AD1(a))
#define AD3_(a) AD3_x(AD1(a))
#define AD4_(a) AD4_x(AD1(a))
//==============================================================================================================================
// Fractional part computed as a-floor(a).
AD1 AFractD1(AD1 a) { return a - floor(a); }
AD2 AFractD2(AD2 a) { return a - floor(a); }
AD3 AFractD3(AD3 a) { return a - floor(a); }
AD4 AFractD4(AD4 a) { return a - floor(a); }
//------------------------------------------------------------------------------------------------------------------------------
// Linear interpolation from 'x' to 'y' by 'a'.
AD1 ALerpD1(AD1 x, AD1 y, AD1 a) { return lerp(x, y, a); }
AD2 ALerpD2(AD2 x, AD2 y, AD2 a) { return lerp(x, y, a); }
AD3 ALerpD3(AD3 x, AD3 y, AD3 a) { return lerp(x, y, a); }
AD4 ALerpD4(AD4 x, AD4 y, AD4 a) { return lerp(x, y, a); }
//------------------------------------------------------------------------------------------------------------------------------
// Reciprocal.
AD1 ARcpD1(AD1 x) { return rcp(x); }
AD2 ARcpD2(AD2 x) { return rcp(x); }
AD3 ARcpD3(AD3 x) { return rcp(x); }
AD4 ARcpD4(AD4 x) { return rcp(x); }
//------------------------------------------------------------------------------------------------------------------------------
// Reciprocal square root.
AD1 ARsqD1(AD1 x) { return rsqrt(x); }
AD2 ARsqD2(AD2 x) { return rsqrt(x); }
AD3 ARsqD3(AD3 x) { return rsqrt(x); }
AD4 ARsqD4(AD4 x) { return rsqrt(x); }
//------------------------------------------------------------------------------------------------------------------------------
// Saturate.
AD1 ASatD1(AD1 x) { return saturate(x); }
AD2 ASatD2(AD2 x) { return saturate(x); }
AD3 ASatD3(AD3 x) { return saturate(x); }
AD4 ASatD4(AD4 x) { return saturate(x); }
#endif
//==============================================================================================================================
//                                                         HLSL WAVE
//==============================================================================================================================
#ifdef A_WAVE
// Where 'x' must be a compile time literal.
// Lane exchange: every lane reads 'v' from the lane whose index is (WaveGetLaneIndex()^x).
AF1 AWaveXorF1(AF1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
AF2 AWaveXorF2(AF2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
AF3 AWaveXorF3(AF3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
AF4 AWaveXorF4(AF4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
// NOTE(review): the AU2/AU3/AU4 variants are overloads of AWaveXorU1 rather than AWaveXorU2/3/4 as the float
// family's naming would suggest. The names are kept so existing callers still resolve.
AU1 AWaveXorU1(AU1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
AU2 AWaveXorU1(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
AU3 AWaveXorU1(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
AU4 AWaveXorU1(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
//------------------------------------------------------------------------------------------------------------------------------
#ifdef A_HALF
// 16-bit vectors are packed into 32-bit words for the lane read and unpacked afterwards.
AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(WaveReadLaneAt(AU1_AH2(v),WaveGetLaneIndex()^x));}
AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(WaveReadLaneAt(AU2_AH4(v),WaveGetLaneIndex()^x));}
AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(WaveReadLaneAt(AU1_AW2(v),WaveGetLaneIndex()^x));}
// Four 16-bit values occupy two 32-bit words, so this uses the AU2 pack/unpack pair (AU2_AW4/AW4_AU2),
// the same way AWaveXorH4 does.
AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(WaveReadLaneAt(AU2_AW4(v),WaveGetLaneIndex()^x));}
#endif
#endif
//==============================================================================================================================
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//
//
//                                                          GPU COMMON
//
//
//==============================================================================================================================
#ifdef A_GPU
// Negative and positive infinity.
#define A_INFP_F AF1_AU1(0x7f800000u)
#define A_INFN_F AF1_AU1(0xff800000u)
//------------------------------------------------------------------------------------------------------------------------------
// Copy sign from 's' to positive 'd'.
// The sign bit of 's' is isolated first, then OR-ed into the bit pattern of 'd'.
AF1 ACpySgnF1(AF1 d, AF1 s) { AU1 sgn = AU1_AF1(s) & AU1_(0x80000000u); return AF1_AU1(AU1_AF1(d) | sgn); }
AF2 ACpySgnF2(AF2 d, AF2 s) { AU2 sgn = AU2_AF2(s) & AU2_(0x80000000u); return AF2_AU2(AU2_AF2(d) | sgn); }
AF3 ACpySgnF3(AF3 d, AF3 s) { AU3 sgn = AU3_AF3(s) & AU3_(0x80000000u); return AF3_AU3(AU3_AF3(d) | sgn); }
AF4 ACpySgnF4(AF4 d, AF4 s) { AU4 sgn = AU4_AF4(s) & AU4_(0x80000000u); return AF4_AU4(AU4_AF4(d) | sgn); }
//------------------------------------------------------------------------------------------------------------------------------
// Single operation to return (useful to create a mask to use in lerp for branch free logic),
//  m=NaN := 0
//  m>=0  := 0
//  m<0   := 1
// Uses the following useful floating point logic,
//  saturate(+a*(-INF)==-INF) := 0
//  saturate( 0*(-INF)== NaN) := 0
//  saturate(-a*(-INF)==+INF) := 1
AF1 ASignedF1(AF1 m) { return ASatF1(m * AF1_(A_INFN_F)); }
AF2 ASignedF2(AF2 m) { return ASatF2(m * AF2_(A_INFN_F)); }
AF3 ASignedF3(AF3 m) { return ASatF3(m * AF3_(A_INFN_F)); }
AF4 ASignedF4(AF4 m) { return ASatF4(m * AF4_(A_INFN_F)); }
//------------------------------------------------------------------------------------------------------------------------------
// Same construction as ASigned*, but multiplying by +INF instead of -INF.
AF1 AGtZeroF1(AF1 m) { return ASatF1(m * AF1_(A_INFP_F)); }
AF2 AGtZeroF2(AF2 m) { return ASatF2(m * AF2_(A_INFP_F)); }
AF3 AGtZeroF3(AF3 m) { return ASatF3(m * AF3_(A_INFP_F)); }
AF4 AGtZeroF4(AF4 m) { return ASatF4(m * AF4_(A_INFP_F)); }
//==============================================================================================================================
#ifdef A_HALF
// Half-precision infinities; HLSL 6.2 needs an explicit 16-bit literal cast.
#ifdef A_HLSL_6_2
#define A_INFP_H AH1_AW1((uint16_t)0x7c00u)
#define A_INFN_H AH1_AW1((uint16_t)0xfc00u)
#else
#define A_INFP_H AH1_AW1(0x7c00u)
#define A_INFN_H AH1_AW1(0xfc00u)
#endif

//------------------------------------------------------------------------------------------------------------------------------
// Half versions of ACpySgn*: the sign bit is bit 15.
AH1 ACpySgnH1(AH1 d, AH1 s) { AW1 sgn = AW1_AH1(s) & AW1_(0x8000u); return AH1_AW1(AW1_AH1(d) | sgn); }
AH2 ACpySgnH2(AH2 d, AH2 s) { AW2 sgn = AW2_AH2(s) & AW2_(0x8000u); return AH2_AW2(AW2_AH2(d) | sgn); }
AH3 ACpySgnH3(AH3 d, AH3 s) { AW3 sgn = AW3_AH3(s) & AW3_(0x8000u); return AH3_AW3(AW3_AH3(d) | sgn); }
AH4 ACpySgnH4(AH4 d, AH4 s) { AW4 sgn = AW4_AH4(s) & AW4_(0x8000u); return AH4_AW4(AW4_AH4(d) | sgn); }
//------------------------------------------------------------------------------------------------------------------------------
AH1 ASignedH1(AH1 m) { return ASatH1(m * AH1_(A_INFN_H)); }
AH2 ASignedH2(AH2 m) { return ASatH2(m * AH2_(A_INFN_H)); }
AH3 ASignedH3(AH3 m) { return ASatH3(m * AH3_(A_INFN_H)); }
AH4 ASignedH4(AH4 m) { return ASatH4(m * AH4_(A_INFN_H)); }
//------------------------------------------------------------------------------------------------------------------------------
AH1 AGtZeroH1(AH1 m) { return ASatH1(m * AH1_(A_INFP_H)); }
AH2 AGtZeroH2(AH2 m) { return ASatH2(m * AH2_(A_INFP_H)); }
AH3 AGtZeroH3(AH3 m) { return ASatH3(m * AH3_(A_INFP_H)); }
AH4 AGtZeroH4(AH4 m) { return ASatH4(m * AH4_(A_INFP_H)); }
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                [FIS] FLOAT INTEGER SORTABLE
//------------------------------------------------------------------------------------------------------------------------------
// Float to integer sortable.
//  - If sign bit=0, flip the sign bit (positives).
//  - If sign bit=1, flip all bits (negatives).
// Integer sortable to float.
//  - If sign bit=1, flip the sign bit (positives).
//  - If sign bit=0, flip all bits (negatives).
// Has nice side effects.
//  - Larger integers are more positive values.
//  - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage).
// Burns 3 ops for conversion {shift,or,xor}.
//==============================================================================================================================
// XOR mask is (x>>31)|0x80000000, where the shift goes through AShrSU1 (presumably an arithmetic shift; defined elsewhere).
// That gives all ones when the sign bit is set and only the sign bit otherwise.
AU1 AFisToU1(AU1 x){return x^(( AShrSU1(x,AU1_(31)))|AU1_(0x80000000));}
// Inverse of AFisToU1: the shifted sign is complemented before building the XOR mask.
AU1 AFisFromU1(AU1 x){return x^((~AShrSU1(x,AU1_(31)))|AU1_(0x80000000));}
//------------------------------------------------------------------------------------------------------------------------------
// Just adjust high 16-bit value (useful when upper part of 32-bit word is a 16-bit float value).
// NOTE(review): the shift by 15 leaves bits 30..15 of 'x' in the low 16 bits of the XOR mask, so the low half of the
// word is modified as well, not only the high half - confirm callers do not rely on the low 16 bits.
AU1 AFisToHiU1(AU1 x){return x^(( AShrSU1(x,AU1_(15)))|AU1_(0x80000000));}
AU1 AFisFromHiU1(AU1 x){return x^((~AShrSU1(x,AU1_(15)))|AU1_(0x80000000));}
//------------------------------------------------------------------------------------------------------------------------------
#ifdef A_HALF
// 16-bit versions: sign bit is bit 15, shift goes through AShrSW*.
AW1 AFisToW1(AW1 x){return x^(( AShrSW1(x,AW1_(15)))|AW1_(0x8000));}
AW1 AFisFromW1(AW1 x){return x^((~AShrSW1(x,AW1_(15)))|AW1_(0x8000));}
//------------------------------------------------------------------------------------------------------------------------------
// Two-component 16-bit versions.
AW2 AFisToW2(AW2 x){return x^(( AShrSW2(x,AW2_(15)))|AW2_(0x8000));}
AW2 AFisFromW2(AW2 x){return x^((~AShrSW2(x,AW2_(15)))|AW2_(0x8000));}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                     [PERM] V_PERM_B32
//------------------------------------------------------------------------------------------------------------------------------
// Support for V_PERM_B32 started in the 3rd generation of GCN.
//------------------------------------------------------------------------------------------------------------------------------
// yyyyxxxx - The 'i' input.
// 76543210
// ========
// HGFEDCBA - Naming on permutation.
//------------------------------------------------------------------------------------------------------------------------------
// TODO
// ====
//  - Make sure compiler optimizes this.
//==============================================================================================================================
#ifdef A_HALF
// Function names list the output bytes from most to least significant; '0' is a zero byte.
// A..D are bytes 0..3 of i.x, E..H are bytes 0..3 of i.y.
AU1 APerm0E0A(AU2 i) { return ((i.x      ) & 0xffu) | ((i.y << 16) & 0xff0000u); }
AU1 APerm0F0B(AU2 i) { return ((i.x >>  8) & 0xffu) | ((i.y <<  8) & 0xff0000u); }
AU1 APerm0G0C(AU2 i) { return ((i.x >> 16) & 0xffu) | ((i.y      ) & 0xff0000u); }
AU1 APerm0H0D(AU2 i) { return ((i.x >> 24) & 0xffu) | ((i.y >>  8) & 0xff0000u); }
//------------------------------------------------------------------------------------------------------------------------------
// Replace exactly one byte of i.y with byte A or byte C of i.x.
AU1 APermHGFA(AU2 i) { return ((i.x      ) & 0x000000ffu) | (i.y & 0xffffff00u); }
AU1 APermHGFC(AU2 i) { return ((i.x >> 16) & 0x000000ffu) | (i.y & 0xffffff00u); }
AU1 APermHGAE(AU2 i) { return ((i.x <<  8) & 0x0000ff00u) | (i.y & 0xffff00ffu); }
AU1 APermHGCE(AU2 i) { return ((i.x >>  8) & 0x0000ff00u) | (i.y & 0xffff00ffu); }
AU1 APermHAFE(AU2 i) { return ((i.x << 16) & 0x00ff0000u) | (i.y & 0xff00ffffu); }
AU1 APermHCFE(AU2 i) { return ((i.x      ) & 0x00ff0000u) | (i.y & 0xff00ffffu); }
AU1 APermAGFE(AU2 i) { return ((i.x << 24) & 0xff000000u) | (i.y & 0x00ffffffu); }
AU1 APermCGFE(AU2 i) { return ((i.x <<  8) & 0xff000000u) | (i.y & 0x00ffffffu); }
//------------------------------------------------------------------------------------------------------------------------------
// Interleave bytes A,C of i.x with bytes E,G of i.y.
AU1 APermGCEA(AU2 i) { return ((i.x) & 0x00ff00ffu) | ((i.y << 8) & 0xff00ff00u); }
AU1 APermGECA(AU2 i) { return (((i.x) & 0xffu) | ((i.x >> 8) & 0xff00u) | ((i.y << 16) & 0xff0000u) | ((i.y << 8) & 0xff000000u)); }
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                               [BUC] BYTE UNSIGNED CONVERSION
//------------------------------------------------------------------------------------------------------------------------------
// Designed to use the optimal conversion, enables the scaling to possibly be factored into other computation.
// Works on a range of {0 to A_BUC_<32,16>}, for <32-bit, and 16-bit> respectively.
//------------------------------------------------------------------------------------------------------------------------------
// OPCODE NOTES
// ============
// GCN does not do UNORM or SNORM for bytes in opcodes.
//  - V_CVT_F32_UBYTE{0,1,2,3} - Unsigned byte to float.
//  - V_CVT_PKACC_U8_F32 - Float to unsigned byte (does bit-field insert into 32-bit integer).
// V_PERM_B32 does byte packing with ability to zero fill bytes as well.
//  - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo.
1631 //------------------------------------------------------------------------------------------------------------------------------ 1632 // BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U1() - Designed for V_CVT_F32_UBYTE* and V_CVT_PKACCUM_U8_F32 ops. 1633 // ==== ===== 1634 // 0 : 0 1635 // 1 : 1 1636 // ... 1637 // 255 : 255 1638 // : 256 (just outside the encoding range) 1639 //------------------------------------------------------------------------------------------------------------------------------ 1640 // BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32. 1641 // ==== ===== 1642 // 0 : 0 1643 // 1 : 1/512 1644 // 2 : 1/256 1645 // ... 1646 // 64 : 1/8 1647 // 128 : 1/4 1648 // 255 : 255/512 1649 // : 1/2 (just outside the encoding range) 1650 //------------------------------------------------------------------------------------------------------------------------------ 1651 // OPTIMAL IMPLEMENTATIONS ON AMD ARCHITECTURES 1652 // ============================================ 1653 // r=ABuc0FromU1(i) 1654 // V_CVT_F32_UBYTE0 r,i 1655 // -------------------------------------------- 1656 // r=ABuc0ToU1(d,i) 1657 // V_CVT_PKACCUM_U8_F32 r,i,0,d 1658 // -------------------------------------------- 1659 // d=ABuc0FromU2(i) 1660 // Where 'k0' is an SGPR with 0x0E0A 1661 // Where 'k1' is an SGPR with {32768.0} packed into the lower 16-bits 1662 // V_PERM_B32 d,i.x,i.y,k0 1663 // V_PK_FMA_F16 d,d,k1.x,0 1664 // -------------------------------------------- 1665 // r=ABuc0ToU2(d,i) 1666 // Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits 1667 // Where 'k1' is an SGPR with 0x???? 1668 // Where 'k2' is an SGPR with 0x???? 1669 // V_PK_FMA_F16 i,i,k0.x,0 1670 // V_PERM_B32 r.x,i,i,k1 1671 // V_PERM_B32 r.y,i,i,k2 1672 //============================================================================================================================== 1673 // Peak range for 32-bit and 16-bit operations. 
#define A_BUC_32 (255.0)
#define A_BUC_16 (255.0/512.0)
//==============================================================================================================================
#if 1
// Designed to be one V_CVT_PKACCUM_U8_F32.
// The extra min is required to pattern match to V_CVT_PKACCUM_U8_F32.
// Inserts 'i' (converted to an unsigned integer and limited to 255) into byte {0,1,2,3} of 'd'; other bytes of 'd' are kept.
AU1 ABuc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i),255u)    )&(0x000000ffu));}
AU1 ABuc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i),255u)<< 8)&(0x0000ff00u));}
AU1 ABuc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i),255u)<<16)&(0x00ff0000u));}
AU1 ABuc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i),255u)<<24)&(0xff000000u));}
//------------------------------------------------------------------------------------------------------------------------------
// Designed to be one V_CVT_F32_UBYTE*.
// Extracts byte {0,1,2,3} of 'i' and converts it to float.
AF1 ABuc0FromU1(AU1 i){return AF1((i    )&255u);}
AF1 ABuc1FromU1(AU1 i){return AF1((i>> 8)&255u);}
AF1 ABuc2FromU1(AU1 i){return AF1((i>>16)&255u);}
AF1 ABuc3FromU1(AU1 i){return AF1((i>>24)&255u);}
#endif
//==============================================================================================================================
#ifdef A_HALF
// Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
AW2 ABuc01ToW2(AH2 x,AH2 y){x*=AH2_(1.0/32768.0);y*=AH2_(1.0/32768.0);
 return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
//------------------------------------------------------------------------------------------------------------------------------
// Designed for 3 ops to do SOA to AOS and conversion.
// 'b' holds the two converted values as bytes: i.x in byte 0 (A) and i.y in byte 2 (C).
// Byte {0,1,2,3} of d.x is replaced by the i.x byte and the same byte of d.y by the i.y byte.
// The APerm* helpers take A..D from their first component and E..H from the second, so 'b' must be passed first
// and the destination word second. Passing them the other way round keeps one byte of 'd' and overwrites the
// rest with 'b'.
AU2 ABuc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
 return AU2(APermHGFA(AU2(b,d.x)),APermHGFC(AU2(b,d.y)));}
AU2 ABuc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
 return AU2(APermHGAE(AU2(b,d.x)),APermHGCE(AU2(b,d.y)));}
AU2 ABuc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
 return AU2(APermHAFE(AU2(b,d.x)),APermHCFE(AU2(b,d.y)));}
AU2 ABuc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
 return AU2(APermAGFE(AU2(b,d.x)),APermCGFE(AU2(b,d.y)));}
//------------------------------------------------------------------------------------------------------------------------------
// Designed for 2 ops to do both AOS to SOA, and conversion.
// Reads byte {0,1,2,3} of i.x and of i.y into the two halves and rescales by 32768.
AH2 ABuc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0);}
AH2 ABuc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0);}
AH2 ABuc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0);}
AH2 ABuc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0);}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                [BSC] BYTE SIGNED CONVERSION
//------------------------------------------------------------------------------------------------------------------------------
// Similar to [BUC].
// Works on a range of {-/+ A_BSC_<32,16>}, for <32-bit, and 16-bit> respectively.
//------------------------------------------------------------------------------------------------------------------------------
// ENCODING (without zero-based encoding)
// ========
//   0 = unused (can be used to mean something else)
//   1 = lowest value
// 128 = exact zero center (becomes 0 with zero-based encoding)
// 255 = highest value
//------------------------------------------------------------------------------------------------------------------------------
// Zero-based [Zb] flips the MSB bit of the byte (making 128 "exact zero" actually zero).
// This is useful if there is a desire for cleared values to decode as zero.
//------------------------------------------------------------------------------------------------------------------------------
// BYTE : FLOAT - ABsc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
// ====   =====
//    0 : -128/512 (unused)
//    1 : -127/512
//    2 : -126/512
//     ...
//  128 : 0
//     ...
//  255 : 127/512
//      : 1/4 (just outside the encoding range)
//==============================================================================================================================
// Peak range for 32-bit and 16-bit operations.
#define A_BSC_32 (127.0)
#define A_BSC_16 (127.0/512.0)
//==============================================================================================================================
#if 1
// Inserts 'i'+128 (converted to an unsigned integer and limited to 255) into byte {0,1,2,3} of 'd'.
AU1 ABsc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i+128.0),255u)    )&(0x000000ffu));}
AU1 ABsc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i+128.0),255u)<< 8)&(0x0000ff00u));}
AU1 ABsc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i+128.0),255u)<<16)&(0x00ff0000u));}
AU1 ABsc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i+128.0),255u)<<24)&(0xff000000u));}
//------------------------------------------------------------------------------------------------------------------------------
// Zero-based variants: 'i' is truncated before the +128 bias, and the MSB of the written byte is flipped afterwards.
AU1 ABsc0ToZbU1(AU1 d,AF1 i){return ((d&0xffffff00u)|((min(AU1(trunc(i)+128.0),255u)    )&(0x000000ffu)))^0x00000080u;}
AU1 ABsc1ToZbU1(AU1 d,AF1 i){return ((d&0xffff00ffu)|((min(AU1(trunc(i)+128.0),255u)<< 8)&(0x0000ff00u)))^0x00008000u;}
AU1 ABsc2ToZbU1(AU1 d,AF1 i){return ((d&0xff00ffffu)|((min(AU1(trunc(i)+128.0),255u)<<16)&(0x00ff0000u)))^0x00800000u;}
AU1 ABsc3ToZbU1(AU1 d,AF1 i){return ((d&0x00ffffffu)|((min(AU1(trunc(i)+128.0),255u)<<24)&(0xff000000u)))^0x80000000u;}
//------------------------------------------------------------------------------------------------------------------------------
// Extracts byte {0,1,2,3} of 'i', converts to float and removes the 128 bias.
AF1 ABsc0FromU1(AU1 i){return AF1((i    )&255u)-128.0;}
AF1 ABsc1FromU1(AU1 i){return AF1((i>> 8)&255u)-128.0;}
AF1 ABsc2FromU1(AU1 i){return AF1((i>>16)&255u)-128.0;}
AF1 ABsc3FromU1(AU1 i){return AF1((i>>24)&255u)-128.0;}
//------------------------------------------------------------------------------------------------------------------------------
// Zero-based variants: the MSB of the extracted byte is flipped back before decoding.
AF1 ABsc0FromZbU1(AU1 i){return AF1(((i    )&255u)^0x80u)-128.0;}
AF1 ABsc1FromZbU1(AU1 i){return AF1(((i>> 8)&255u)^0x80u)-128.0;}
AF1 ABsc2FromZbU1(AU1 i){return AF1(((i>>16)&255u)^0x80u)-128.0;}
AF1 ABsc3FromZbU1(AU1 i){return AF1(((i>>24)&255u)^0x80u)-128.0;}
#endif
//==============================================================================================================================
#ifdef A_HALF
// Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
AW2 ABsc01ToW2(AH2 x,AH2 y){x=x*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);y=y*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
 return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
//------------------------------------------------------------------------------------------------------------------------------
// 'b' holds the two converted values as bytes: i.x in byte 0 (A) and i.y in byte 2 (C).
// Byte {0,1,2,3} of d.x is replaced by the i.x byte and the same byte of d.y by the i.y byte.
// The APerm* helpers take A..D from their first component and E..H from the second, so 'b' must be passed first
// and the destination word second. Passing them the other way round keeps one byte of 'd' and overwrites the
// rest with 'b'.
AU2 ABsc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
 return AU2(APermHGFA(AU2(b,d.x)),APermHGFC(AU2(b,d.y)));}
AU2 ABsc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
 return AU2(APermHGAE(AU2(b,d.x)),APermHGCE(AU2(b,d.y)));}
AU2 ABsc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
 return AU2(APermHAFE(AU2(b,d.x)),APermHCFE(AU2(b,d.y)));}
AU2 ABsc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
 return AU2(APermAGFE(AU2(b,d.x)),APermCGFE(AU2(b,d.y)));}
//------------------------------------------------------------------------------------------------------------------------------
// Zero-based variants: the MSB of both packed bytes is flipped before insertion. Argument order as for ABsc*ToU2.
AU2 ABsc0ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
 return AU2(APermHGFA(AU2(b,d.x)),APermHGFC(AU2(b,d.y)));}
AU2 ABsc1ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
 return AU2(APermHGAE(AU2(b,d.x)),APermHGCE(AU2(b,d.y)));}
AU2 ABsc2ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
 return AU2(APermHAFE(AU2(b,d.x)),APermHCFE(AU2(b,d.y)));}
AU2 ABsc3ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
 return AU2(APermAGFE(AU2(b,d.x)),APermCGFE(AU2(b,d.y)));}
//------------------------------------------------------------------------------------------------------------------------------
// Reads byte {0,1,2,3} of i.x and of i.y into the two halves, rescales by 32768 and removes the 0.25 bias.
AH2 ABsc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0)-AH2_(0.25);}
AH2 ABsc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0)-AH2_(0.25);}
AH2 ABsc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0)-AH2_(0.25);}
AH2 ABsc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0)-AH2_(0.25);}
//------------------------------------------------------------------------------------------------------------------------------
// Zero-based variants: the MSB of both extracted bytes is flipped back before decoding.
AH2 ABsc0FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
AH2 ABsc1FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
AH2 ABsc2FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
AH2 ABsc3FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                     HALF APPROXIMATIONS
//------------------------------------------------------------------------------------------------------------------------------
// These support only positive inputs.
// Did not see value yet in specialization for range.
// Using quick testing, ended up mostly getting the same "best" approximation for various ranges.
// With hardware that can co-execute transcendentals, the value in approximations could be less than expected.
// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total.
// And co-execution would require a compiler interleaving a lot of independent work for packed usage.
//------------------------------------------------------------------------------------------------------------------------------
// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total).
// Same with sqrt(), as this could be x*rsq() (7 ops).
//==============================================================================================================================
#ifdef A_HALF
 // Square root estimate: shift the word form of 'a' right by one and add the constant 0x1de2.
 // Minimize squared error across full positive range, 2 ops.
 // With the 0x1de2 constant, inputs in {0 to 1} map to outputs < 1.
 AH1 APrxLoSqrtH1(AH1 a){AW1 bits=AW1_AH1(a);return AH1_AW1((bits>>AW1_(1))+AW1_(0x1de2));}
 AH2 APrxLoSqrtH2(AH2 a){AW2 bits=AW2_AH2(a);return AH2_AW2((bits>>AW2_(1))+AW2_(0x1de2));}
 AH3 APrxLoSqrtH3(AH3 a){AW3 bits=AW3_AH3(a);return AH3_AW3((bits>>AW3_(1))+AW3_(0x1de2));}
 AH4 APrxLoSqrtH4(AH4 a){AW4 bits=AW4_AH4(a);return AH4_AW4((bits>>AW4_(1))+AW4_(0x1de2));}
//------------------------------------------------------------------------------------------------------------------------------
 // Reciprocal estimate: subtract the word form of 'a' from the constant 0x7784.
 // Lower precision estimation, 1 op.
 // Minimize squared error across {smallest normal to 16384.0}.
 AH1 APrxLoRcpH1(AH1 a){AW1 bits=AW1_AH1(a);return AH1_AW1(AW1_(0x7784)-bits);}
 AH2 APrxLoRcpH2(AH2 a){AW2 bits=AW2_AH2(a);return AH2_AW2(AW2_(0x7784)-bits);}
 AH3 APrxLoRcpH3(AH3 a){AW3 bits=AW3_AH3(a);return AH3_AW3(AW3_(0x7784)-bits);}
 AH4 APrxLoRcpH4(AH4 a){AW4 bits=AW4_AH4(a);return AH4_AW4(AW4_(0x7784)-bits);}
//------------------------------------------------------------------------------------------------------------------------------
 // Medium precision estimation, one Newton Raphson iteration, 3 ops.
 // 'est' is the integer-subtract seed (constant 0x778d); the result is est*(2-est*a).
 AH1 APrxMedRcpH1(AH1 a){AH1 est=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));AH1 corr=-est*a+AH1_(2.0);return est*corr;}
 AH2 APrxMedRcpH2(AH2 a){AH2 est=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));AH2 corr=-est*a+AH2_(2.0);return est*corr;}
 AH3 APrxMedRcpH3(AH3 a){AH3 est=AH3_AW3(AW3_(0x778d)-AW3_AH3(a));AH3 corr=-est*a+AH3_(2.0);return est*corr;}
 AH4 APrxMedRcpH4(AH4 a){AH4 est=AH4_AW4(AW4_(0x778d)-AW4_AH4(a));AH4 corr=-est*a+AH4_(2.0);return est*corr;}
//------------------------------------------------------------------------------------------------------------------------------
 // Reciprocal square root estimate: constant 0x59a3 minus the word form of 'a' shifted right by one.
 // Minimize squared error across {smallest normal to 16384.0}, 2 ops.
 AH1 APrxLoRsqH1(AH1 a){AW1 shifted=AW1_AH1(a)>>AW1_(1);return AH1_AW1(AW1_(0x59a3)-shifted);}
 AH2 APrxLoRsqH2(AH2 a){AW2 shifted=AW2_AH2(a)>>AW2_(1);return AH2_AW2(AW2_(0x59a3)-shifted);}
 AH3 APrxLoRsqH3(AH3 a){AW3 shifted=AW3_AH3(a)>>AW3_(1);return AH3_AW3(AW3_(0x59a3)-shifted);}
 AH4 APrxLoRsqH4(AH4 a){AW4 shifted=AW4_AH4(a)>>AW4_(1);return AH4_AW4(AW4_(0x59a3)-shifted);}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// FLOAT APPROXIMATIONS
//------------------------------------------------------------------------------------------------------------------------------
// Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN",
// - Idea dates back to SGI, then to Quake 3, etc.
// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf
// - sqrt(x)=rsqrt(x)*x
// - rcp(x)=rsqrt(x)*rsqrt(x) for positive x
// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h
//------------------------------------------------------------------------------------------------------------------------------
// These below are from perhaps less complete searching for optimal.
// Used FP16 normal range for testing with +4096 32-bit step size for sampling error.
// So these match up well with the half approximations.
//==============================================================================================================================
// For each width: LoSqrt = (bits>>1)+0x1fbc4639, LoRcp = 0x7ef07ebb-bits, LoRsq = 0x5f347d74-(bits>>1),
// MedRcp = seed*(2-seed*a) with seed = 0x7ef19fff-bits. 'bits' is the AU*_AF*() form of the input.
AF1 APrxLoSqrtF1(AF1 a){AU1 bits=AU1_AF1(a);return AF1_AU1((bits>>AU1_(1))+AU1_(0x1fbc4639));}
AF1 APrxLoRcpF1(AF1 a){AU1 bits=AU1_AF1(a);return AF1_AU1(AU1_(0x7ef07ebb)-bits);}
AF1 APrxMedRcpF1(AF1 a){AF1 est=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));AF1 corr=-est*a+AF1_(2.0);return est*corr;}
AF1 APrxLoRsqF1(AF1 a){AU1 shifted=AU1_AF1(a)>>AU1_(1);return AF1_AU1(AU1_(0x5f347d74)-shifted);}
//------------------------------------------------------------------------------------------------------------------------------
AF2 APrxLoSqrtF2(AF2 a){AU2 bits=AU2_AF2(a);return AF2_AU2((bits>>AU2_(1))+AU2_(0x1fbc4639));}
AF2 APrxLoRcpF2(AF2 a){AU2 bits=AU2_AF2(a);return AF2_AU2(AU2_(0x7ef07ebb)-bits);}
AF2 APrxMedRcpF2(AF2 a){AF2 est=AF2_AU2(AU2_(0x7ef19fff)-AU2_AF2(a));AF2 corr=-est*a+AF2_(2.0);return est*corr;}
AF2 APrxLoRsqF2(AF2 a){AU2 shifted=AU2_AF2(a)>>AU2_(1);return AF2_AU2(AU2_(0x5f347d74)-shifted);}
//------------------------------------------------------------------------------------------------------------------------------
AF3 APrxLoSqrtF3(AF3 a){AU3 bits=AU3_AF3(a);return AF3_AU3((bits>>AU3_(1))+AU3_(0x1fbc4639));}
AF3 APrxLoRcpF3(AF3 a){AU3 bits=AU3_AF3(a);return AF3_AU3(AU3_(0x7ef07ebb)-bits);}
AF3 APrxMedRcpF3(AF3 a){AF3 est=AF3_AU3(AU3_(0x7ef19fff)-AU3_AF3(a));AF3 corr=-est*a+AF3_(2.0);return est*corr;}
AF3 APrxLoRsqF3(AF3 a){AU3 shifted=AU3_AF3(a)>>AU3_(1);return AF3_AU3(AU3_(0x5f347d74)-shifted);}
//------------------------------------------------------------------------------------------------------------------------------
AF4 APrxLoSqrtF4(AF4 a){AU4 bits=AU4_AF4(a);return AF4_AU4((bits>>AU4_(1))+AU4_(0x1fbc4639));}
AF4 APrxLoRcpF4(AF4 a){AU4 bits=AU4_AF4(a);return AF4_AU4(AU4_(0x7ef07ebb)-bits);}
AF4 APrxMedRcpF4(AF4 a){AF4 est=AF4_AU4(AU4_(0x7ef19fff)-AU4_AF4(a));AF4 corr=-est*a+AF4_(2.0);return est*corr;}
AF4 APrxLoRsqF4(AF4 a){AU4 shifted=AU4_AF4(a)>>AU4_(1);return AF4_AU4(AU4_(0x5f347d74)-shifted);}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// PQ APPROXIMATIONS
//------------------------------------------------------------------------------------------------------------------------------
// PQ is very close to x^(1/8). The functions below use the fast float approximation method to do
// PQ<~>Gamma2 (4th power and fast 4th root) and PQ<~>Linear (8th power and fast 8th root). Maximum error is ~0.2%.
//==============================================================================================================================
// Helpers: Quart(a) = a^4 by squaring twice, Oct(a) = a^8 by squaring three times.
AF1 Quart(AF1 a){AF1 p2=a*a;return p2*p2;}
AF1 Oct(AF1 a){AF1 p2=a*a;AF1 p4=p2*p2;return p4*p4;}
AF2 Quart(AF2 a){AF2 p2=a*a;return p2*p2;}
AF2 Oct(AF2 a){AF2 p2=a*a;AF2 p4=p2*p2;return p4*p4;}
AF3 Quart(AF3 a){AF3 p2=a*a;return p2*p2;}
AF3 Oct(AF3 a){AF3 p2=a*a;AF3 p4=p2*p2;return p4*p4;}
AF4 Quart(AF4 a){AF4 p2=a*a;return p2*p2;}
AF4 Oct(AF4 a){AF4 p2=a*a;AF4 p4=p2*p2;return p4*p4;}
//------------------------------------------------------------------------------------------------------------------------------
// Lo  : integer shift/add estimate of the 4th (Gamma2) or 8th (Linear) root.
// Med : the Lo estimate refined by one step, seed - seed*(seed^n - a)/(n*seed^n), with n = 4 or 8.
// High: exact root via repeated sqrt().
AF1 APrxPQToGamma2(AF1 a){return Quart(a);}
AF1 APrxPQToLinear(AF1 a){return Oct(a);}
AF1 APrxLoGamma2ToPQ(AF1 a){AU1 bits=AU1_AF1(a);return AF1_AU1((bits>>AU1_(2))+AU1_(0x2F9A4E46));}
AF1 APrxMedGamma2ToPQ(AF1 a){AF1 seed=APrxLoGamma2ToPQ(a);AF1 seed4=Quart(seed);
 return seed-seed*(seed4-a)/(AF1_(4.0)*seed4);}
AF1 APrxHighGamma2ToPQ(AF1 a){AF1 r=sqrt(a);return sqrt(r);}
AF1 APrxLoLinearToPQ(AF1 a){AU1 bits=AU1_AF1(a);return AF1_AU1((bits>>AU1_(3))+AU1_(0x378D8723));}
AF1 APrxMedLinearToPQ(AF1 a){AF1 seed=APrxLoLinearToPQ(a);AF1 seed8=Oct(seed);
 return seed-seed*(seed8-a)/(AF1_(8.0)*seed8);}
AF1 APrxHighLinearToPQ(AF1 a){AF1 r=sqrt(sqrt(a));return sqrt(r);}
//------------------------------------------------------------------------------------------------------------------------------
AF2 APrxPQToGamma2(AF2 a){return Quart(a);}
AF2 APrxPQToLinear(AF2 a){return Oct(a);}
AF2 APrxLoGamma2ToPQ(AF2 a){AU2 bits=AU2_AF2(a);return AF2_AU2((bits>>AU2_(2))+AU2_(0x2F9A4E46));}
AF2 APrxMedGamma2ToPQ(AF2 a){AF2 seed=APrxLoGamma2ToPQ(a);AF2 seed4=Quart(seed);
 return seed-seed*(seed4-a)/(AF1_(4.0)*seed4);}
AF2 APrxHighGamma2ToPQ(AF2 a){AF2 r=sqrt(a);return sqrt(r);}
AF2 APrxLoLinearToPQ(AF2 a){AU2 bits=AU2_AF2(a);return AF2_AU2((bits>>AU2_(3))+AU2_(0x378D8723));}
AF2 APrxMedLinearToPQ(AF2 a){AF2 seed=APrxLoLinearToPQ(a);AF2 seed8=Oct(seed);
 return seed-seed*(seed8-a)/(AF1_(8.0)*seed8);}
AF2 APrxHighLinearToPQ(AF2 a){AF2 r=sqrt(sqrt(a));return sqrt(r);}
//------------------------------------------------------------------------------------------------------------------------------
AF3 APrxPQToGamma2(AF3 a){return Quart(a);}
AF3 APrxPQToLinear(AF3 a){return Oct(a);}
AF3 APrxLoGamma2ToPQ(AF3 a){AU3 bits=AU3_AF3(a);return AF3_AU3((bits>>AU3_(2))+AU3_(0x2F9A4E46));}
AF3 APrxMedGamma2ToPQ(AF3 a){AF3 seed=APrxLoGamma2ToPQ(a);AF3 seed4=Quart(seed);
 return seed-seed*(seed4-a)/(AF1_(4.0)*seed4);}
AF3 APrxHighGamma2ToPQ(AF3 a){AF3 r=sqrt(a);return sqrt(r);}
AF3 APrxLoLinearToPQ(AF3 a){AU3 bits=AU3_AF3(a);return AF3_AU3((bits>>AU3_(3))+AU3_(0x378D8723));}
AF3 APrxMedLinearToPQ(AF3 a){AF3 seed=APrxLoLinearToPQ(a);AF3 seed8=Oct(seed);
 return seed-seed*(seed8-a)/(AF1_(8.0)*seed8);}
AF3 APrxHighLinearToPQ(AF3 a){AF3 r=sqrt(sqrt(a));return sqrt(r);}
//------------------------------------------------------------------------------------------------------------------------------
AF4 APrxPQToGamma2(AF4 a){return Quart(a);}
AF4 APrxPQToLinear(AF4 a){return Oct(a);}
AF4 APrxLoGamma2ToPQ(AF4 a){AU4 bits=AU4_AF4(a);return AF4_AU4((bits>>AU4_(2))+AU4_(0x2F9A4E46));}
AF4 APrxMedGamma2ToPQ(AF4 a){AF4 seed=APrxLoGamma2ToPQ(a);AF4 seed4=Quart(seed);
 return seed-seed*(seed4-a)/(AF1_(4.0)*seed4);}
AF4 APrxHighGamma2ToPQ(AF4 a){AF4 r=sqrt(a);return sqrt(r);}
AF4 APrxLoLinearToPQ(AF4 a){AU4 bits=AU4_AF4(a);return AF4_AU4((bits>>AU4_(3))+AU4_(0x378D8723));}
AF4 APrxMedLinearToPQ(AF4 a){AF4 seed=APrxLoLinearToPQ(a);AF4 seed8=Oct(seed);
 return seed-seed*(seed8-a)/(AF1_(8.0)*seed8);}
AF4 APrxHighLinearToPQ(AF4 a){AF4 r=sqrt(sqrt(a));return sqrt(r);}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// PARABOLIC SIN & COS
//------------------------------------------------------------------------------------------------------------------------------
// Approximate answers to transcendental questions.
//------------------------------------------------------------------------------------------------------------------------------
//==============================================================================================================================
#if 1
 // Valid input range is {-1 to 1} representing {0 to 2 pi}.
 // Output range is {-1/4 to 1/4} representing {-1 to 1}.
 // Sin is the parabola x*|x|-x. Cos remaps the input with fract(x*0.5+0.75)*2-1 and evaluates the same parabola.
 AF1 APSinF1(AF1 x){AF1 mag=abs(x);return x*mag-x;} // MAD.
 AF2 APSinF2(AF2 x){AF2 mag=abs(x);return x*mag-x;}
 AF1 APCosF1(AF1 x){AF1 t=AFractF1(x*AF1_(0.5)+AF1_(0.75));return APSinF1(t*AF1_(2.0)-AF1_(1.0));} // 3x MAD, FRACT
 AF2 APCosF2(AF2 x){AF2 t=AFractF2(x*AF2_(0.5)+AF2_(0.75));return APSinF2(t*AF2_(2.0)-AF2_(1.0));}
 // Returns {sin,cos} of one scalar input by evaluating the parabola once on the pair {x, remapped x}.
 AF2 APSinCosF1(AF1 x){AF1 t=AFractF1(x*AF1_(0.5)+AF1_(0.75));AF1 shifted=t*AF1_(2.0)-AF1_(1.0);
  return APSinF2(AF2(x,shifted));}
#endif
//------------------------------------------------------------------------------------------------------------------------------
#ifdef A_HALF
 // For a packed {sin,cos} pair,
 //  - Native takes 16 clocks and 4 issue slots (no packed transcendentals).
 //  - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed).
 AH1 APSinH1(AH1 x){AH1 mag=abs(x);return x*mag-x;}
 AH2 APSinH2(AH2 x){AH2 mag=abs(x);return x*mag-x;} // AND,FMA
 AH1 APCosH1(AH1 x){AH1 t=AFractH1(x*AH1_(0.5)+AH1_(0.75));return APSinH1(t*AH1_(2.0)-AH1_(1.0));}
 AH2 APCosH2(AH2 x){AH2 t=AFractH2(x*AH2_(0.5)+AH2_(0.75));return APSinH2(t*AH2_(2.0)-AH2_(1.0));} // 3x FMA, 2xFRACT, AND
 AH2 APSinCosH1(AH1 x){AH1 t=AFractH1(x*AH1_(0.5)+AH1_(0.75));AH1 shifted=t*AH1_(2.0)-AH1_(1.0);
  return APSinH2(AH2(x,shifted));}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// [ZOL] ZERO ONE LOGIC
//------------------------------------------------------------------------------------------------------------------------------
// Conditional free logic designed for easy 16-bit packing, and backwards porting to 32-bit.
//------------------------------------------------------------------------------------------------------------------------------
// 0 := false
// 1 := true
//------------------------------------------------------------------------------------------------------------------------------
// AndNot(x,y)   -> !(x&y) .... One op.
// AndOr(x,y,z)  -> (x&y)|z ... One op.
// GtZero(x)     -> x>0.0 ..... One op.
// Sel(x,y,z)    -> x?y:z ..... Two ops, has no precision loss.
// Signed(x)     -> x<0.0 ..... One op.
// ZeroPass(x,y) -> x?0:y ..... Two ops, 'y' is a pass through safe for aliasing as integer.
//------------------------------------------------------------------------------------------------------------------------------
// OPTIMIZATION NOTES
// ==================
// - On Vega to use 2 constants in a packed op, pass in as one AW2 or one AH2 'k.xy' and use as 'k.xx' and 'k.yy'.
//   For example 'a.xy*k.xx+k.yy'.
//==============================================================================================================================
#if 1
 // Unsigned zero/one logic: AND is min, OR is max, NOT flips the low bit.
 AU1 AZolAndU1(AU1 x,AU1 y){return min(y,x);}
 AU2 AZolAndU2(AU2 x,AU2 y){return min(y,x);}
 AU3 AZolAndU3(AU3 x,AU3 y){return min(y,x);}
 AU4 AZolAndU4(AU4 x,AU4 y){return min(y,x);}
//------------------------------------------------------------------------------------------------------------------------------
 AU1 AZolNotU1(AU1 x){return AU1_(1)^x;}
 AU2 AZolNotU2(AU2 x){return AU2_(1)^x;}
 AU3 AZolNotU3(AU3 x){return AU3_(1)^x;}
 AU4 AZolNotU4(AU4 x){return AU4_(1)^x;}
//------------------------------------------------------------------------------------------------------------------------------
 AU1 AZolOrU1(AU1 x,AU1 y){return max(y,x);}
 AU2 AZolOrU2(AU2 x,AU2 y){return max(y,x);}
 AU3 AZolOrU3(AU3 x,AU3 y){return max(y,x);}
 AU4 AZolOrU4(AU4 x,AU4 y){return max(y,x);}
//==============================================================================================================================
 // Float zero/one to unsigned zero/one, by value conversion.
 AU1 AZolF1ToU1(AF1 x){AU1 r=AU1(x);return r;}
 AU2 AZolF2ToU2(AF2 x){AU2 r=AU2(x);return r;}
 AU3 AZolF3ToU3(AF3 x){AU3 r=AU3(x);return r;}
 AU4 AZolF4ToU4(AF4 x){AU4 r=AU4(x);return r;}
//------------------------------------------------------------------------------------------------------------------------------
 // 2 ops, denormals don't work in 32-bit on PC (and if they are enabled, OMOD is disabled).
2009 AU1 AZolNotF1ToU1(AF1 x){return AU1(AF1_(1.0)-x);} 2010 AU2 AZolNotF2ToU2(AF2 x){return AU2(AF2_(1.0)-x);} 2011 AU3 AZolNotF3ToU3(AF3 x){return AU3(AF3_(1.0)-x);} 2012 AU4 AZolNotF4ToU4(AF4 x){return AU4(AF4_(1.0)-x);} 2013 //------------------------------------------------------------------------------------------------------------------------------ 2014 AF1 AZolU1ToF1(AU1 x){return AF1(x);} 2015 AF2 AZolU2ToF2(AU2 x){return AF2(x);} 2016 AF3 AZolU3ToF3(AU3 x){return AF3(x);} 2017 AF4 AZolU4ToF4(AU4 x){return AF4(x);} 2018 //============================================================================================================================== 2019 AF1 AZolAndF1(AF1 x,AF1 y){return min(x,y);} 2020 AF2 AZolAndF2(AF2 x,AF2 y){return min(x,y);} 2021 AF3 AZolAndF3(AF3 x,AF3 y){return min(x,y);} 2022 AF4 AZolAndF4(AF4 x,AF4 y){return min(x,y);} 2023 //------------------------------------------------------------------------------------------------------------------------------ 2024 AF1 ASolAndNotF1(AF1 x,AF1 y){return (-x)*y+AF1_(1.0);} 2025 AF2 ASolAndNotF2(AF2 x,AF2 y){return (-x)*y+AF2_(1.0);} 2026 AF3 ASolAndNotF3(AF3 x,AF3 y){return (-x)*y+AF3_(1.0);} 2027 AF4 ASolAndNotF4(AF4 x,AF4 y){return (-x)*y+AF4_(1.0);} 2028 //------------------------------------------------------------------------------------------------------------------------------ 2029 AF1 AZolAndOrF1(AF1 x,AF1 y,AF1 z){return ASatF1(x*y+z);} 2030 AF2 AZolAndOrF2(AF2 x,AF2 y,AF2 z){return ASatF2(x*y+z);} 2031 AF3 AZolAndOrF3(AF3 x,AF3 y,AF3 z){return ASatF3(x*y+z);} 2032 AF4 AZolAndOrF4(AF4 x,AF4 y,AF4 z){return ASatF4(x*y+z);} 2033 //------------------------------------------------------------------------------------------------------------------------------ 2034 AF1 AZolGtZeroF1(AF1 x){return ASatF1(x*AF1_(A_INFP_F));} 2035 AF2 AZolGtZeroF2(AF2 x){return ASatF2(x*AF2_(A_INFP_F));} 2036 AF3 AZolGtZeroF3(AF3 x){return ASatF3(x*AF3_(A_INFP_F));} 2037 AF4 AZolGtZeroF4(AF4 x){return 
ASatF4(x*AF4_(A_INFP_F));} 2038 //------------------------------------------------------------------------------------------------------------------------------ 2039 AF1 AZolNotF1(AF1 x){return AF1_(1.0)-x;} 2040 AF2 AZolNotF2(AF2 x){return AF2_(1.0)-x;} 2041 AF3 AZolNotF3(AF3 x){return AF3_(1.0)-x;} 2042 AF4 AZolNotF4(AF4 x){return AF4_(1.0)-x;} 2043 //------------------------------------------------------------------------------------------------------------------------------ 2044 AF1 AZolOrF1(AF1 x,AF1 y){return max(x,y);} 2045 AF2 AZolOrF2(AF2 x,AF2 y){return max(x,y);} 2046 AF3 AZolOrF3(AF3 x,AF3 y){return max(x,y);} 2047 AF4 AZolOrF4(AF4 x,AF4 y){return max(x,y);} 2048 //------------------------------------------------------------------------------------------------------------------------------ 2049 AF1 AZolSelF1(AF1 x,AF1 y,AF1 z){AF1 r=(-x)*z+z;return x*y+r;} 2050 AF2 AZolSelF2(AF2 x,AF2 y,AF2 z){AF2 r=(-x)*z+z;return x*y+r;} 2051 AF3 AZolSelF3(AF3 x,AF3 y,AF3 z){AF3 r=(-x)*z+z;return x*y+r;} 2052 AF4 AZolSelF4(AF4 x,AF4 y,AF4 z){AF4 r=(-x)*z+z;return x*y+r;} 2053 //------------------------------------------------------------------------------------------------------------------------------ 2054 AF1 AZolSignedF1(AF1 x){return ASatF1(x*AF1_(A_INFN_F));} 2055 AF2 AZolSignedF2(AF2 x){return ASatF2(x*AF2_(A_INFN_F));} 2056 AF3 AZolSignedF3(AF3 x){return ASatF3(x*AF3_(A_INFN_F));} 2057 AF4 AZolSignedF4(AF4 x){return ASatF4(x*AF4_(A_INFN_F));} 2058 //------------------------------------------------------------------------------------------------------------------------------ 2059 AF1 AZolZeroPassF1(AF1 x,AF1 y){return AF1_AU1((AU1_AF1(x)!=AU1_(0))?AU1_(0):AU1_AF1(y));} 2060 AF2 AZolZeroPassF2(AF2 x,AF2 y){return AF2_AU2((AU2_AF2(x)!=AU2_(0))?AU2_(0):AU2_AF2(y));} 2061 AF3 AZolZeroPassF3(AF3 x,AF3 y){return AF3_AU3((AU3_AF3(x)!=AU3_(0))?AU3_(0):AU3_AF3(y));} 2062 AF4 AZolZeroPassF4(AF4 x,AF4 y){return AF4_AU4((AU4_AF4(x)!=AU4_(0))?AU4_(0):AU4_AF4(y));} 2063 
#endif
//==============================================================================================================================
#ifdef A_HALF
 // 16-bit word zero/one logic: AND is min, OR is max, NOT flips the low bit.
 AW1 AZolAndW1(AW1 x,AW1 y){return min(y,x);}
 AW2 AZolAndW2(AW2 x,AW2 y){return min(y,x);}
 AW3 AZolAndW3(AW3 x,AW3 y){return min(y,x);}
 AW4 AZolAndW4(AW4 x,AW4 y){return min(y,x);}
//------------------------------------------------------------------------------------------------------------------------------
 AW1 AZolNotW1(AW1 x){return AW1_(1)^x;}
 AW2 AZolNotW2(AW2 x){return AW2_(1)^x;}
 AW3 AZolNotW3(AW3 x){return AW3_(1)^x;}
 AW4 AZolNotW4(AW4 x){return AW4_(1)^x;}
//------------------------------------------------------------------------------------------------------------------------------
 AW1 AZolOrW1(AW1 x,AW1 y){return max(y,x);}
 AW2 AZolOrW2(AW2 x,AW2 y){return max(y,x);}
 AW3 AZolOrW3(AW3 x,AW3 y){return max(y,x);}
 AW4 AZolOrW4(AW4 x,AW4 y){return max(y,x);}
//==============================================================================================================================
 // Uses denormal trick.
 // 'tiny' is the half obtained from the word value 1; x*tiny is then converted back to a word.
 AW1 AZolH1ToW1(AH1 x){AH1 tiny=AH1_AW1(AW1_(1));return AW1_AH1(x*tiny);}
 AW2 AZolH2ToW2(AH2 x){AH2 tiny=AH2_AW2(AW2_(1));return AW2_AH2(x*tiny);}
 AW3 AZolH3ToW3(AH3 x){AH3 tiny=AH3_AW3(AW3_(1));return AW3_AH3(x*tiny);}
 AW4 AZolH4ToW4(AH4 x){AH4 tiny=AH4_AW4(AW4_(1));return AW4_AH4(x*tiny);}
//------------------------------------------------------------------------------------------------------------------------------
 // AMD arch lacks a packed conversion opcode.
2088 AH1 AZolW1ToH1(AW1 x){return AH1_AW1(x*AW1_AH1(AH1_(1.0)));} 2089 AH2 AZolW2ToH2(AW2 x){return AH2_AW2(x*AW2_AH2(AH2_(1.0)));} 2090 AH3 AZolW1ToH3(AW3 x){return AH3_AW3(x*AW3_AH3(AH3_(1.0)));} 2091 AH4 AZolW2ToH4(AW4 x){return AH4_AW4(x*AW4_AH4(AH4_(1.0)));} 2092 //============================================================================================================================== 2093 AH1 AZolAndH1(AH1 x,AH1 y){return min(x,y);} 2094 AH2 AZolAndH2(AH2 x,AH2 y){return min(x,y);} 2095 AH3 AZolAndH3(AH3 x,AH3 y){return min(x,y);} 2096 AH4 AZolAndH4(AH4 x,AH4 y){return min(x,y);} 2097 //------------------------------------------------------------------------------------------------------------------------------ 2098 AH1 ASolAndNotH1(AH1 x,AH1 y){return (-x)*y+AH1_(1.0);} 2099 AH2 ASolAndNotH2(AH2 x,AH2 y){return (-x)*y+AH2_(1.0);} 2100 AH3 ASolAndNotH3(AH3 x,AH3 y){return (-x)*y+AH3_(1.0);} 2101 AH4 ASolAndNotH4(AH4 x,AH4 y){return (-x)*y+AH4_(1.0);} 2102 //------------------------------------------------------------------------------------------------------------------------------ 2103 AH1 AZolAndOrH1(AH1 x,AH1 y,AH1 z){return ASatH1(x*y+z);} 2104 AH2 AZolAndOrH2(AH2 x,AH2 y,AH2 z){return ASatH2(x*y+z);} 2105 AH3 AZolAndOrH3(AH3 x,AH3 y,AH3 z){return ASatH3(x*y+z);} 2106 AH4 AZolAndOrH4(AH4 x,AH4 y,AH4 z){return ASatH4(x*y+z);} 2107 //------------------------------------------------------------------------------------------------------------------------------ 2108 AH1 AZolGtZeroH1(AH1 x){return ASatH1(x*AH1_(A_INFP_H));} 2109 AH2 AZolGtZeroH2(AH2 x){return ASatH2(x*AH2_(A_INFP_H));} 2110 AH3 AZolGtZeroH3(AH3 x){return ASatH3(x*AH3_(A_INFP_H));} 2111 AH4 AZolGtZeroH4(AH4 x){return ASatH4(x*AH4_(A_INFP_H));} 2112 //------------------------------------------------------------------------------------------------------------------------------ 2113 AH1 AZolNotH1(AH1 x){return AH1_(1.0)-x;} 2114 AH2 AZolNotH2(AH2 x){return AH2_(1.0)-x;} 2115 AH3 AZolNotH3(AH3 
x){return AH3_(1.0)-x;} 2116 AH4 AZolNotH4(AH4 x){return AH4_(1.0)-x;} 2117 //------------------------------------------------------------------------------------------------------------------------------ 2118 AH1 AZolOrH1(AH1 x,AH1 y){return max(x,y);} 2119 AH2 AZolOrH2(AH2 x,AH2 y){return max(x,y);} 2120 AH3 AZolOrH3(AH3 x,AH3 y){return max(x,y);} 2121 AH4 AZolOrH4(AH4 x,AH4 y){return max(x,y);} 2122 //------------------------------------------------------------------------------------------------------------------------------ 2123 AH1 AZolSelH1(AH1 x,AH1 y,AH1 z){AH1 r=(-x)*z+z;return x*y+r;} 2124 AH2 AZolSelH2(AH2 x,AH2 y,AH2 z){AH2 r=(-x)*z+z;return x*y+r;} 2125 AH3 AZolSelH3(AH3 x,AH3 y,AH3 z){AH3 r=(-x)*z+z;return x*y+r;} 2126 AH4 AZolSelH4(AH4 x,AH4 y,AH4 z){AH4 r=(-x)*z+z;return x*y+r;} 2127 //------------------------------------------------------------------------------------------------------------------------------ 2128 AH1 AZolSignedH1(AH1 x){return ASatH1(x*AH1_(A_INFN_H));} 2129 AH2 AZolSignedH2(AH2 x){return ASatH2(x*AH2_(A_INFN_H));} 2130 AH3 AZolSignedH3(AH3 x){return ASatH3(x*AH3_(A_INFN_H));} 2131 AH4 AZolSignedH4(AH4 x){return ASatH4(x*AH4_(A_INFN_H));} 2132 #endif 2133 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2134 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2135 //_____________________________________________________________/\_______________________________________________________________ 2136 //============================================================================================================================== 2137 // COLOR CONVERSIONS 2138 //------------------------------------------------------------------------------------------------------------------------------ 2139 // These are all linear to/from some other space (where 'linear' has been shortened out of 
the function name). 2140 // So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'. 2141 // These are branch free implementations. 2142 // The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion. 2143 //------------------------------------------------------------------------------------------------------------------------------ 2144 // TRANSFER FUNCTIONS 2145 // ================== 2146 // 709 ..... Rec709 used for some HDTVs 2147 // Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native 2148 // Pq ...... PQ native for HDR10 2149 // Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type 2150 // Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations) 2151 // Three ... Gamma 3.0, less fast, but good for HDR. 2152 //------------------------------------------------------------------------------------------------------------------------------ 2153 // KEEPING TO SPEC 2154 // =============== 2155 // Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times. 2156 // (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range). 2157 // (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range). 2158 // Also there is a slight step in the transition regions. 2159 // Precision of the coefficients in the spec being the likely cause. 2160 // Main usage case of the sRGB code is to do the linear->sRGB converstion in a compute shader before store. 2161 // This is to work around lack of hardware (typically only ROP does the conversion for free). 2162 // To "correct" the linear segment, would be to introduce error, because hardware decode of sRGB->linear is fixed (and free). 2163 // So this header keeps with the spec. 
// For linear->sRGB transforms, the linear segment in some respects reduces error, because rounding in that region is linear.
// Rounding in the curved region in hardware (and fast software code) introduces error due to rounding in non-linear.
//------------------------------------------------------------------------------------------------------------------------------
// FOR PQ
// ======
// Both input and output are {0.0-1.0}, where output 1.0 represents 10000.0 cd/m^2.
// All constants are only specified to FP32 precision.
// External PQ source reference,
//  - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl
//------------------------------------------------------------------------------------------------------------------------------
// PACKED VERSIONS
// ===============
// These are the A*H2() functions.
// There are no PQ functions as FP16 seemed to not have enough precision for the conversion.
// The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors.
// Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least).
//------------------------------------------------------------------------------------------------------------------------------
// NOTES
// =====
// Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case.
//==============================================================================================================================
// COLOUR TRANSFER FUNCTIONS
// =========================
// Naming convention: A{To|From}<Curve><F|H><1|2|3>, where F = FP32 (AF*), H = FP16 (AH*) and the digit is the
// component count of both the argument and the result.
// Several functions in this section historically did not follow that convention (vector PQ overloads all named
// ...F1, half cube-root encoders named ...F<n>, half sRGB decoders misspelled 'AHromSrgbF<n>'). The correctly named
// functions are defined below; the historical spellings are kept as forwarding overloads so existing callers compile.
//==============================================================================================================================
#if 1
 // Linear -> Rec.709. clamp(x,lo,hi)=min(max(x,lo),hi), so the result is min(max(knee,c*4.5),power segment).
 AF1 ATo709F1(AF1 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
  return clamp(j.x  ,c*j.y  ,pow(c,j.z  )*k.x  +k.y  );}
 AF2 ATo709F2(AF2 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
  return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
 AF3 ATo709F3(AF3 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
  return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
//------------------------------------------------------------------------------------------------------------------------------
 // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma().
 AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,AF1_(rcpX));}
 AF2 AToGammaF2(AF2 c,AF1 rcpX){return pow(c,AF2_(rcpX));}
 AF3 AToGammaF3(AF3 c,AF1 rcpX){return pow(c,AF3_(rcpX));}
//------------------------------------------------------------------------------------------------------------------------------
 // Linear {0.0-1.0} -> PQ {0.0-1.0}.
 AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302));
  return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));}
 AF2 AToPqF2(AF2 x){AF2 p=pow(x,AF2_(0.159302));
  return pow((AF2_(0.835938)+AF2_(18.8516)*p)/(AF2_(1.0)+AF2_(18.6875)*p),AF2_(78.8438));}
 AF3 AToPqF3(AF3 x){AF3 p=pow(x,AF3_(0.159302));
  return pow((AF3_(0.835938)+AF3_(18.8516)*p)/(AF3_(1.0)+AF3_(18.6875)*p),AF3_(78.8438));}
 // Historical spellings (vector overloads of the ...F1 name); prefer AToPqF2()/AToPqF3().
 AF2 AToPqF1(AF2 x){return AToPqF2(x);}
 AF3 AToPqF1(AF3 x){return AToPqF3(x);}
//------------------------------------------------------------------------------------------------------------------------------
 // Linear -> sRGB, same min(max(knee,linear segment),power segment) construction as ATo709*().
 AF1 AToSrgbF1(AF1 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
  return clamp(j.x  ,c*j.y  ,pow(c,j.z  )*k.x  +k.y  );}
 AF2 AToSrgbF2(AF2 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
  return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
 AF3 AToSrgbF3(AF3 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
  return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
//------------------------------------------------------------------------------------------------------------------------------
 // Gamma 2.0 encode.
 AF1 AToTwoF1(AF1 c){return sqrt(c);}
 AF2 AToTwoF2(AF2 c){return sqrt(c);}
 AF3 AToTwoF3(AF3 c){return sqrt(c);}
//------------------------------------------------------------------------------------------------------------------------------
 // Gamma 3.0 encode.
 AF1 AToThreeF1(AF1 c){return pow(c,AF1_(1.0/3.0));}
 AF2 AToThreeF2(AF2 c){return pow(c,AF2_(1.0/3.0));}
 AF3 AToThreeF3(AF3 c){return pow(c,AF3_(1.0/3.0));}
#endif
//==============================================================================================================================
#if 1
 // Unfortunately median won't work here.
 // NOTE(review): j.x is the knee divided by the linear slope (0.081/4.5) yet it is compared against the encoded
 // input 'c'; AZolSel*/AZolSigned* are defined elsewhere in this file - confirm the intended threshold there.
 AF1 AFrom709F1(AF1 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
  return AZolSelF1(AZolSignedF1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
 AF2 AFrom709F2(AF2 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
  return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
 AF3 AFrom709F3(AF3 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
  return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
//------------------------------------------------------------------------------------------------------------------------------
 // 'x' is the gamma exponent itself (the reciprocal of what AToGamma*() takes).
 AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,AF1_(x));}
 AF2 AFromGammaF2(AF2 c,AF1 x){return pow(c,AF2_(x));}
 AF3 AFromGammaF3(AF3 c,AF1 x){return pow(c,AF3_(x));}
//------------------------------------------------------------------------------------------------------------------------------
 // PQ {0.0-1.0} -> linear {0.0-1.0}.
 AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833));
  return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));}
 AF2 AFromPqF2(AF2 x){AF2 p=pow(x,AF2_(0.0126833));
  return pow(ASatF2(p-AF2_(0.835938))/(AF2_(18.8516)-AF2_(18.6875)*p),AF2_(6.27739));}
 AF3 AFromPqF3(AF3 x){AF3 p=pow(x,AF3_(0.0126833));
  return pow(ASatF3(p-AF3_(0.835938))/(AF3_(18.8516)-AF3_(18.6875)*p),AF3_(6.27739));}
 // Historical spellings (vector overloads of the ...F1 name); prefer AFromPqF2()/AFromPqF3().
 AF2 AFromPqF1(AF2 x){return AFromPqF2(x);}
 AF3 AFromPqF1(AF3 x){return AFromPqF3(x);}
//------------------------------------------------------------------------------------------------------------------------------
 // Unfortunately median won't work here.
 AF1 AFromSrgbF1(AF1 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
  return AZolSelF1(AZolSignedF1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
 AF2 AFromSrgbF2(AF2 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
  return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
 AF3 AFromSrgbF3(AF3 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
  return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
//------------------------------------------------------------------------------------------------------------------------------
 // Gamma 2.0 decode.
 AF1 AFromTwoF1(AF1 c){return c*c;}
 AF2 AFromTwoF2(AF2 c){return c*c;}
 AF3 AFromTwoF3(AF3 c){return c*c;}
//------------------------------------------------------------------------------------------------------------------------------
 // Gamma 3.0 decode.
 AF1 AFromThreeF1(AF1 c){return c*c*c;}
 AF2 AFromThreeF2(AF2 c){return c*c*c;}
 AF3 AFromThreeF3(AF3 c){return c*c*c;}
#endif
//==============================================================================================================================
#ifdef A_HALF
 // FP16 encoders. No PQ variants are provided for FP16.
 AH1 ATo709H1(AH1 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
  return clamp(j.x  ,c*j.y  ,pow(c,j.z  )*k.x  +k.y  );}
 AH2 ATo709H2(AH2 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
  return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
 AH3 ATo709H3(AH3 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
  return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
//------------------------------------------------------------------------------------------------------------------------------
 AH1 AToGammaH1(AH1 c,AH1 rcpX){return pow(c,AH1_(rcpX));}
 AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));}
 AH3 AToGammaH3(AH3 c,AH1 rcpX){return pow(c,AH3_(rcpX));}
//------------------------------------------------------------------------------------------------------------------------------
 AH1 AToSrgbH1(AH1 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
  return clamp(j.x  ,c*j.y  ,pow(c,j.z  )*k.x  +k.y  );}
 AH2 AToSrgbH2(AH2 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
  return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
 AH3 AToSrgbH3(AH3 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
  return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
//------------------------------------------------------------------------------------------------------------------------------
 AH1 AToTwoH1(AH1 c){return sqrt(c);}
 AH2 AToTwoH2(AH2 c){return sqrt(c);}
 AH3 AToTwoH3(AH3 c){return sqrt(c);}
//------------------------------------------------------------------------------------------------------------------------------
 AH1 AToThreeH1(AH1 c){return pow(c,AH1_(1.0/3.0));}
 AH2 AToThreeH2(AH2 c){return pow(c,AH2_(1.0/3.0));}
 AH3 AToThreeH3(AH3 c){return pow(c,AH3_(1.0/3.0));}
 // Historical spellings (half overloads of the ...F<n> names); prefer AToThreeH1/2/3().
 AH1 AToThreeF1(AH1 c){return AToThreeH1(c);}
 AH2 AToThreeF2(AH2 c){return AToThreeH2(c);}
 AH3 AToThreeF3(AH3 c){return AToThreeH3(c);}
#endif
//==============================================================================================================================
#ifdef A_HALF
 // FP16 decoders.
 AH1 AFrom709H1(AH1 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
  return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
 AH2 AFrom709H2(AH2 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
  return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
 AH3 AFrom709H3(AH3 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
  return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
//------------------------------------------------------------------------------------------------------------------------------
 AH1 AFromGammaH1(AH1 c,AH1 x){return pow(c,AH1_(x));}
 AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));}
 AH3 AFromGammaH3(AH3 c,AH1 x){return pow(c,AH3_(x));}
//------------------------------------------------------------------------------------------------------------------------------
 AH1 AFromSrgbH1(AH1 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
  return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
 AH2 AFromSrgbH2(AH2 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
  return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
 AH3 AFromSrgbH3(AH3 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
  return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
 // Historical misspellings ('AHromSrgbF<n>'); prefer AFromSrgbH1/2/3().
 AH1 AHromSrgbF1(AH1 c){return AFromSrgbH1(c);}
 AH2 AHromSrgbF2(AH2 c){return AFromSrgbH2(c);}
 AH3 AHromSrgbF3(AH3 c){return AFromSrgbH3(c);}
//------------------------------------------------------------------------------------------------------------------------------
 AH1 AFromTwoH1(AH1 c){return c*c;}
 AH2 AFromTwoH2(AH2 c){return c*c;}
 AH3 AFromTwoH3(AH3 c){return c*c;}
//------------------------------------------------------------------------------------------------------------------------------
 AH1 AFromThreeH1(AH1 c){return c*c*c;}
 AH2 AFromThreeH2(AH2 c){return c*c*c;}
 AH3 AFromThreeH3(AH3 c){return c*c*c;}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                          CS REMAP
//==============================================================================================================================
 // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear.
 //  543210
 //  ======
 //  ..xxx.
 //  yy...y
 // 'a' is the 6-bit lane index; the diagram above shows which of its bits feed x and which feed y.
 AU2 ARmp8x8(AU1 a)
 {
  AU1 x=ABfe(a,1u,3u);                // bits 3..1
  AU1 y=ABfiM(ABfe(a,3u,3u),a,1u);    // bits 5..4 with bit 0 inserted as the low bit
  return AU2(x,y);
 }
//==============================================================================================================================
 // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions.
 //  543210
 //  ======
 //  .xx..x
 //  y..yy.
 // Details,
 //  LANE TO 8x8 MAPPING
 //  ===================
 //  00 01 08 09 10 11 18 19
 //  02 03 0a 0b 12 13 1a 1b
 //  04 05 0c 0d 14 15 1c 1d
 //  06 07 0e 0f 16 17 1e 1f
 //  20 21 28 29 30 31 38 39
 //  22 23 2a 2b 32 33 3a 3b
 //  24 25 2c 2d 34 35 3c 3d
 //  26 27 2e 2f 36 37 3e 3f
 AU2 ARmpRed8x8(AU1 a)
 {
  AU1 x=ABfiM(ABfe(a,2u,3u),a,1u);
  AU1 y=ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u);
  return AU2(x,y);
 }
//==============================================================================================================================
 #ifdef A_HALF
  // Same remaps as above, returned as an AW2.
  AW2 ARmp8x8H(AU1 a)
  {
   AU1 x=ABfe(a,1u,3u);
   AU1 y=ABfiM(ABfe(a,3u,3u),a,1u);
   return AW2(x,y);
  }
  AW2 ARmpRed8x8H(AU1 a)
  {
   AU1 x=ABfiM(ABfe(a,2u,3u),a,1u);
   AU1 y=ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u);
   return AW2(x,y);
  }
 #endif
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//
//                                                          REFERENCE
//
//------------------------------------------------------------------------------------------------------------------------------
// IEEE FLOAT RULES
// ================
//  - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1
//  - {+/-}0 * {+/-}INF = NaN
//  - -INF + (+INF) = NaN
//  - {+/-}0 / {+/-}0 = NaN
//  - {+/-}INF / {+/-}INF = NaN
//  - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN)
//  - 0 == -0
//  - 4/0 = +INF
//  - 4/-0 = -INF
//  - 4+INF = +INF
//  - 4-INF = -INF
//  - 4*(+INF) = +INF
//  - 4*(-INF) = -INF
//  - -4*(+INF) = -INF
//  - sqrt(+INF) = +INF
//------------------------------------------------------------------------------------------------------------------------------
// FP16 ENCODING
// =============
// fedcba9876543210
// ----------------
// ......mmmmmmmmmm  10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals)
// .eeeee..........  5-bit exponent
// .00000..........  denormals
// .00001..........  -14 exponent
// .11110..........   15 exponent
// .111110000000000  infinity
// .11111nnnnnnnnnn  NaN with n!=0
// s...............  sign
//------------------------------------------------------------------------------------------------------------------------------
// FP16/INT16 ALIASING DENORMAL
// ============================
// 11-bit unsigned integers alias with half float denormal/normal values,
//     1 = 2^(-24) = 1/16777216 ....................... first denormal value
//     2 = 2^(-23)
//   ...
//  1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value
//  1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers
//  2047 .............................................. last normal value that still maps to integers
// Scaling limits,
//  2^15 = 32768 ...................................... largest power of 2 scaling
// Largest pow2 conversion mapping is at *32768,
//     1 : 2^(-9) = 1/512
//     2 : 1/256
//     4 : 1/128
//     8 : 1/64
//    16 : 1/32
//    32 : 1/16
//    64 : 1/8
//   128 : 1/4
//   256 : 1/2
//   512 : 1
//  1024 : 2
//  2047 : a little less than 4
//==============================================================================================================================
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//
//
//                                                     GPU/CPU PORTABILITY
//
//
//------------------------------------------------------------------------------------------------------------------------------
// This is the GPU implementation.
// See the CPU implementation for docs.
//==============================================================================================================================
#ifdef A_GPU
 // Boolean literals and an empty A_STATIC qualifier for the GPU build.
 #define A_TRUE   true
 #define A_FALSE  false
 #define A_STATIC
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                     VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
//==============================================================================================================================
 // ret<T>: return type of a function returning vector type <T>; on the GPU it is simply <T>.
 #define retAD2 AD2
 #define retAD3 AD3
 #define retAD4 AD4
 #define retAF2 AF2
 #define retAF3 AF3
 #define retAF4 AF4
 #define retAL2 AL2
 #define retAL3 AL3
 #define retAL4 AL4
 #define retAU2 AU2
 #define retAU3 AU3
 #define retAU4 AU4
//------------------------------------------------------------------------------------------------------------------------------
 // in<T>: read-only vector parameter.
 #define inAD2  in AD2
 #define inAD3  in AD3
 #define inAD4  in AD4
 #define inAF2  in AF2
 #define inAF3  in AF3
 #define inAF4  in AF4
 #define inAL2  in AL2
 #define inAL3  in AL3
 #define inAL4  in AL4
 #define inAU2  in AU2
 #define inAU3  in AU3
 #define inAU4  in AU4
//------------------------------------------------------------------------------------------------------------------------------
 // inout<T>: read-write vector parameter.
 #define inoutAD2 inout AD2
 #define inoutAD3 inout AD3
 #define inoutAD4 inout AD4
 #define inoutAF2 inout AF2
 #define inoutAF3 inout AF3
 #define inoutAF4 inout AF4
 #define inoutAL2 inout AL2
 #define inoutAL3 inout AL3
 #define inoutAL4 inout AL4
 #define inoutAU2 inout AU2
 #define inoutAU3 inout AU3
 #define inoutAU4 inout AU4
//------------------------------------------------------------------------------------------------------------------------------
 // out<T>: write-only vector parameter.
 #define outAD2 out AD2
 #define outAD3 out AD3
 #define outAD4 out AD4
 #define outAF2 out AF2
 #define outAF3 out AF3
 #define outAF4 out AF4
 #define outAL2 out AL2
 #define outAL3 out AL3
 #define outAL4 out AL4
 #define outAU2 out AU2
 #define outAU3 out AU3
 #define outAU4 out AU4
//------------------------------------------------------------------------------------------------------------------------------
 // var<T>(name): declares a variable 'name' of vector type <T>.
 #define varAD2(name) AD2 name
 #define varAD3(name) AD3 name
 #define varAD4(name) AD4 name
 #define varAF2(name) AF2 name
 #define varAF3(name) AF3 name
 #define varAF4(name) AF4 name
 #define varAL2(name) AL2 name
 #define varAL3(name) AL3 name
 #define varAL4(name) AL4 name
 #define varAU2(name) AU2 name
 #define varAU3(name) AU3 name
 #define varAU4(name) AU4 name
//------------------------------------------------------------------------------------------------------------------------------
 // init<T>(...): constructs a <T> from its components.
 #define initAD2(x,y)     AD2(x,y)
 #define initAD3(x,y,z)   AD3(x,y,z)
 #define initAD4(x,y,z,w) AD4(x,y,z,w)
 #define initAF2(x,y)     AF2(x,y)
 #define initAF3(x,y,z)   AF3(x,y,z)
 #define initAF4(x,y,z,w) AF4(x,y,z,w)
 #define initAL2(x,y)     AL2(x,y)
 #define initAL3(x,y,z)   AL3(x,y,z)
 #define initAL4(x,y,z,w) AL4(x,y,z,w)
 #define initAU2(x,y)     AU2(x,y)
 #define initAU3(x,y,z)   AU3(x,y,z)
 #define initAU4(x,y,z,w) AU4(x,y,z,w)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                     SCALAR RETURN OPS
//==============================================================================================================================
 // Each macro converts its argument(s) to the type named in the suffix and forwards to the matching built-in.
 #define AAbsD1(v)   abs(AD1(v))
 #define AAbsF1(v)   abs(AF1(v))
//------------------------------------------------------------------------------------------------------------------------------
 #define ACosD1(v)   cos(AD1(v))
 #define ACosF1(v)   cos(AF1(v))
//------------------------------------------------------------------------------------------------------------------------------
 #define ADotD2(u,v) dot(AD2(u),AD2(v))
 #define ADotD3(u,v) dot(AD3(u),AD3(v))
 #define ADotD4(u,v) dot(AD4(u),AD4(v))
 #define ADotF2(u,v) dot(AF2(u),AF2(v))
 #define ADotF3(u,v) dot(AF3(u),AF3(v))
 #define ADotF4(u,v) dot(AF4(u),AF4(v))
//------------------------------------------------------------------------------------------------------------------------------
 #define AExp2D1(v)  exp2(AD1(v))
 #define AExp2F1(v)  exp2(AF1(v))
//------------------------------------------------------------------------------------------------------------------------------
 #define AFloorD1(v) floor(AD1(v))
 #define AFloorF1(v) floor(AF1(v))
//------------------------------------------------------------------------------------------------------------------------------
 #define ALog2D1(v)  log2(AD1(v))
 #define ALog2F1(v)  log2(AF1(v))
//------------------------------------------------------------------------------------------------------------------------------
// Scalar max/min forward their arguments unchanged (no cast is applied, unlike the wrappers that follow).
 #define AMaxD1(u,v) max(u,v)
 #define AMaxF1(u,v) max(u,v)
 #define AMaxL1(u,v) max(u,v)
 #define AMaxU1(u,v) max(u,v)
//------------------------------------------------------------------------------------------------------------------------------
 #define AMinD1(u,v) min(u,v)
 #define AMinF1(u,v) min(u,v)
 #define AMinL1(u,v) min(u,v)
 #define AMinU1(u,v) min(u,v)
//------------------------------------------------------------------------------------------------------------------------------
 #define ASinD1(v)   sin(AD1(v))
 #define ASinF1(v)   sin(AF1(v))
//------------------------------------------------------------------------------------------------------------------------------
 #define ASqrtD1(v)  sqrt(AD1(v))
 #define ASqrtF1(v)  sqrt(AF1(v))
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                               SCALAR RETURN OPS - DEPENDENT
//==============================================================================================================================
 // NOTE(review): APowD1 casts the base to AD1 but the exponent to AF1 - confirm the mixed types are intended.
 #define APowD1(u,v) pow(AD1(u),AF1(v))
 #define APowF1(u,v) pow(AF1(u),AF1(v))
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                         VECTOR OPS
//------------------------------------------------------------------------------------------------------------------------------
// These are added as needed for production or prototyping, so not necessarily a complete set.
// They follow a convention of taking in a destination and also returning the destination value to increase utility.
//==============================================================================================================================
 #ifdef A_DUBL
  // d = |a|
  AD2 opAAbsD2(outAD2 d, inAD2 a) { return d = abs(a); }
  AD3 opAAbsD3(outAD3 d, inAD3 a) { return d = abs(a); }
  AD4 opAAbsD4(outAD4 d, inAD4 a) { return d = abs(a); }
//------------------------------------------------------------------------------------------------------------------------------
  // d = a + b
  AD2 opAAddD2(outAD2 d, inAD2 a, inAD2 b) { return d = a + b; }
  AD3 opAAddD3(outAD3 d, inAD3 a, inAD3 b) { return d = a + b; }
  AD4 opAAddD4(outAD4 d, inAD4 a, inAD4 b) { return d = a + b; }
//------------------------------------------------------------------------------------------------------------------------------
  // d = a + b, with the scalar b widened through AD<n>_()
  AD2 opAAddOneD2(outAD2 d, inAD2 a, AD1 b) { return d = a + AD2_(b); }
  AD3 opAAddOneD3(outAD3 d, inAD3 a, AD1 b) { return d = a + AD3_(b); }
  AD4 opAAddOneD4(outAD4 d, inAD4 a, AD1 b) { return d = a + AD4_(b); }
//------------------------------------------------------------------------------------------------------------------------------
  // d = a
  AD2 opACpyD2(outAD2 d, inAD2 a) { return d = a; }
  AD3 opACpyD3(outAD3 d, inAD3 a) { return d = a; }
  AD4 opACpyD4(outAD4 d, inAD4 a) { return d = a; }
//------------------------------------------------------------------------------------------------------------------------------
  // d = ALerpD<n>(a, b, c)
  AD2 opALerpD2(outAD2 d, inAD2 a, inAD2 b, inAD2 c) { return d = ALerpD2(a, b, c); }
  AD3 opALerpD3(outAD3 d, inAD3 a, inAD3 b, inAD3 c) { return d = ALerpD3(a, b, c); }
  AD4 opALerpD4(outAD4 d, inAD4 a, inAD4 b, inAD4 c) { return d = ALerpD4(a, b, c); }
//------------------------------------------------------------------------------------------------------------------------------
  // d = ALerpD<n>(a, b, c), with the scalar c widened through AD<n>_()
  AD2 opALerpOneD2(outAD2 d, inAD2 a, inAD2 b, AD1 c) { return d = ALerpD2(a, b, AD2_(c)); }
  AD3 opALerpOneD3(outAD3 d, inAD3 a, inAD3 b, AD1 c) { return d = ALerpD3(a, b, AD3_(c)); }
  AD4 opALerpOneD4(outAD4 d, inAD4 a, inAD4 b, AD1 c) { return d = ALerpD4(a, b, AD4_(c)); }
//------------------------------------------------------------------------------------------------------------------------------
  // d = max(a, b)
  AD2 opAMaxD2(outAD2 d, inAD2 a, inAD2 b) { return d = max(a, b); }
  AD3 opAMaxD3(outAD3 d, inAD3 a, inAD3 b) { return d = max(a, b); }
  AD4 opAMaxD4(outAD4 d, inAD4 a, inAD4 b) { return d = max(a, b); }
//------------------------------------------------------------------------------------------------------------------------------
  // d = min(a, b)
  AD2 opAMinD2(outAD2 d, inAD2 a, inAD2 b) { return d = min(a, b); }
  AD3 opAMinD3(outAD3 d, inAD3 a, inAD3 b) { return d = min(a, b); }
  AD4 opAMinD4(outAD4 d, inAD4 a, inAD4 b) { return d = min(a, b); }
//------------------------------------------------------------------------------------------------------------------------------
  // d = a * b (component-wise)
  AD2 opAMulD2(outAD2 d, inAD2 a, inAD2 b) { return d = a * b; }
  AD3 opAMulD3(outAD3 d, inAD3 a, inAD3 b) { return d = a * b; }
  AD4 opAMulD4(outAD4 d, inAD4 a, inAD4 b) { return d = a * b; }
//------------------------------------------------------------------------------------------------------------------------------
  // d = a * b, with the scalar b widened through AD<n>_()
  AD2 opAMulOneD2(outAD2 d, inAD2 a, AD1 b) { return d = a * AD2_(b); }
  AD3 opAMulOneD3(outAD3 d, inAD3 a, AD1 b) { return d = a * AD3_(b); }
  AD4 opAMulOneD4(outAD4 d, inAD4 a, AD1 b) { return d = a * AD4_(b); }
//------------------------------------------------------------------------------------------------------------------------------
  // d = -a
  AD2 opANegD2(outAD2 d, inAD2 a) { return d = -a; }
  AD3 opANegD3(outAD3 d, inAD3 a) { return d = -a; }
  AD4 opANegD4(outAD4 d, inAD4 a) { return d = -a; }
//------------------------------------------------------------------------------------------------------------------------------
  // d = ARcpD<n>(a)
  AD2 opARcpD2(outAD2 d, inAD2 a) { return d = ARcpD2(a); }
  AD3 opARcpD3(outAD3 d, inAD3 a) { return d = ARcpD3(a); }
  AD4 opARcpD4(outAD4 d, inAD4 a) { return d = ARcpD4(a); }
 #endif
//==============================================================================================================================
 // FP32 variants. Each writes the result to 'd' and also returns it.
 // d = |a|
 AF2 opAAbsF2(outAF2 d, inAF2 a) { return d = abs(a); }
 AF3 opAAbsF3(outAF3 d, inAF3 a) { return d = abs(a); }
 AF4 opAAbsF4(outAF4 d, inAF4 a) { return d = abs(a); }
//------------------------------------------------------------------------------------------------------------------------------
 // d = a + b
 AF2 opAAddF2(outAF2 d, inAF2 a, inAF2 b) { return d = a + b; }
 AF3 opAAddF3(outAF3 d, inAF3 a, inAF3 b) { return d = a + b; }
 AF4 opAAddF4(outAF4 d, inAF4 a, inAF4 b) { return d = a + b; }
//------------------------------------------------------------------------------------------------------------------------------
 // d = a + b, with the scalar b widened through AF<n>_()
 AF2 opAAddOneF2(outAF2 d, inAF2 a, AF1 b) { return d = a + AF2_(b); }
 AF3 opAAddOneF3(outAF3 d, inAF3 a, AF1 b) { return d = a + AF3_(b); }
 AF4 opAAddOneF4(outAF4 d, inAF4 a, AF1 b) { return d = a + AF4_(b); }
//------------------------------------------------------------------------------------------------------------------------------
 // d = a
 AF2 opACpyF2(outAF2 d, inAF2 a) { return d = a; }
 AF3 opACpyF3(outAF3 d, inAF3 a) { return d = a; }
 AF4 opACpyF4(outAF4 d, inAF4 a) { return d = a; }
//------------------------------------------------------------------------------------------------------------------------------
// AF* vector "op" helpers (continued). Every helper stores its result in the out-parameter 'd' and also returns 'd'.
//------------------------------------------------------------------------------------------------------------------------------
 // d = ALerpFn(a, b, c) with a vector 'c'.
 AF2 opALerpF2(outAF2 d, inAF2 a, inAF2 b, inAF2 c) {
  d = ALerpF2(a, b, c);
  return d;
 }
 AF3 opALerpF3(outAF3 d, inAF3 a, inAF3 b, inAF3 c) {
  d = ALerpF3(a, b, c);
  return d;
 }
 AF4 opALerpF4(outAF4 d, inAF4 a, inAF4 b, inAF4 c) {
  d = ALerpF4(a, b, c);
  return d;
 }
//------------------------------------------------------------------------------------------------------------------------------
 // d = ALerpFn(a, b, c) with a scalar 'c' that is first widened with AFn_().
 AF2 opALerpOneF2(outAF2 d, inAF2 a, inAF2 b, AF1 c) {
  AF2 t = AF2_(c);
  d = ALerpF2(a, b, t);
  return d;
 }
 AF3 opALerpOneF3(outAF3 d, inAF3 a, inAF3 b, AF1 c) {
  AF3 t = AF3_(c);
  d = ALerpF3(a, b, t);
  return d;
 }
 AF4 opALerpOneF4(outAF4 d, inAF4 a, inAF4 b, AF1 c) {
  AF4 t = AF4_(c);
  d = ALerpF4(a, b, t);
  return d;
 }
//------------------------------------------------------------------------------------------------------------------------------
 // d = max(a, b).
 AF2 opAMaxF2(outAF2 d, inAF2 a, inAF2 b) {
  d = max(a, b);
  return d;
 }
 AF3 opAMaxF3(outAF3 d, inAF3 a, inAF3 b) {
  d = max(a, b);
  return d;
 }
 AF4 opAMaxF4(outAF4 d, inAF4 a, inAF4 b) {
  d = max(a, b);
  return d;
 }
//------------------------------------------------------------------------------------------------------------------------------
 // d = min(a, b).
 AF2 opAMinF2(outAF2 d, inAF2 a, inAF2 b) {
  d = min(a, b);
  return d;
 }
 AF3 opAMinF3(outAF3 d, inAF3 a, inAF3 b) {
  d = min(a, b);
  return d;
 }
 AF4 opAMinF4(outAF4 d, inAF4 a, inAF4 b) {
  d = min(a, b);
  return d;
 }
//------------------------------------------------------------------------------------------------------------------------------
 // d = a * b.
 AF2 opAMulF2(outAF2 d, inAF2 a, inAF2 b) {
  d = a * b;
  return d;
 }
 AF3 opAMulF3(outAF3 d, inAF3 a, inAF3 b) {
  d = a * b;
  return d;
 }
 AF4 opAMulF4(outAF4 d, inAF4 a, inAF4 b) {
  d = a * b;
  return d;
 }
//------------------------------------------------------------------------------------------------------------------------------
 // d = a * b with a scalar 'b' that is first widened with AFn_().
 AF2 opAMulOneF2(outAF2 d, inAF2 a, AF1 b) {
  AF2 s = AF2_(b);
  d = a * s;
  return d;
 }
 AF3 opAMulOneF3(outAF3 d, inAF3 a, AF1 b) {
  AF3 s = AF3_(b);
  d = a * s;
  return d;
 }
 AF4 opAMulOneF4(outAF4 d, inAF4 a, AF1 b) {
  AF4 s = AF4_(b);
  d = a * s;
  return d;
 }
//------------------------------------------------------------------------------------------------------------------------------
 // d = -a.
 AF2 opANegF2(outAF2 d, inAF2 a) {
  d = -a;
  return d;
 }
 AF3 opANegF3(outAF3 d, inAF3 a) {
  d = -a;
  return d;
 }
 AF4 opANegF4(outAF4 d, inAF4 a) {
  d = -a;
  return d;
 }
//------------------------------------------------------------------------------------------------------------------------------
 // d = ARcpFn(a).
 AF2 opARcpF2(outAF2 d, inAF2 a) {
  d = ARcpF2(a);
  return d;
 }
 AF3 opARcpF3(outAF3 d, inAF3 a) {
  d = ARcpF3(a);
  return d;
 }
 AF4 opARcpF4(outAF4 d, inAF4 a) {
  d = ARcpF4(a);
  return d;
 }
#endif


// Request the *F (32-bit) RCAS permutation from the FSR code that follows.
#define FSR_RCAS_F 1
// Constant vector named 'con0'; it is only declared here.
// NOTE(review): presumably filled by the RCAS setup call in the shader entry point, which is not visible here - confirm.
AU4 con0;

// Loads one texel of the 'source' sampler at integer coordinate 'p', mip level 0, converted to AF4.
AF4 FsrRcasLoadF(ASU2 p) {
    AF4 texel = AF4(texelFetch(source, p, 0));
    return texel;
}

// Input hook that receives the r, g, b values by reference; this shader leaves them untouched.
void FsrRcasInputF(inout AF1 r, inout AF1 g, inout AF1 b) {
}

//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//
//
//                    AMD FidelityFX SUPER RESOLUTION [FSR 1] ::: SPATIAL SCALING & EXTRAS - v1.20210629
//
//
//------------------------------------------------------------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//------------------------------------------------------------------------------------------------------------------------------
// FidelityFX Super Resolution Sample
//
// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
2699 // Permission is hereby granted, free of charge, to any person obtaining a copy 2700 // of this software and associated documentation files(the "Software"), to deal 2701 // in the Software without restriction, including without limitation the rights 2702 // to use, copy, modify, merge, publish, distribute, sublicense, and / or sell 2703 // copies of the Software, and to permit persons to whom the Software is 2704 // furnished to do so, subject to the following conditions : 2705 // The above copyright notice and this permission notice shall be included in 2706 // all copies or substantial portions of the Software. 2707 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 2708 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 2709 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE 2710 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 2711 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 2712 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 2713 // THE SOFTWARE. 2714 //------------------------------------------------------------------------------------------------------------------------------ 2715 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2716 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2717 //------------------------------------------------------------------------------------------------------------------------------ 2718 // ABOUT 2719 // ===== 2720 // FSR is a collection of algorithms relating to generating a higher resolution image. 2721 // This specific header focuses on single-image non-temporal image scaling, and related tools. 
2722 // 2723 // The core functions are EASU and RCAS: 2724 // [EASU] Edge Adaptive Spatial Upsampling ....... 1x to 4x area range spatial scaling, clamped adaptive elliptical filter. 2725 // [RCAS] Robust Contrast Adaptive Sharpening .... A non-scaling variation on CAS. 2726 // RCAS needs to be applied after EASU as a separate pass. 2727 // 2728 // Optional utility functions are: 2729 // [LFGA] Linear Film Grain Applicator ........... Tool to apply film grain after scaling. 2730 // [SRTM] Simple Reversible Tone-Mapper .......... Linear HDR {0 to FP16_MAX} to {0 to 1} and back. 2731 // [TEPD] Temporal Energy Preserving Dither ...... Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion. 2732 // See each individual sub-section for inline documentation. 2733 //------------------------------------------------------------------------------------------------------------------------------ 2734 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2735 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2736 //------------------------------------------------------------------------------------------------------------------------------ 2737 // FUNCTION PERMUTATIONS 2738 // ===================== 2739 // *F() ..... Single item computation with 32-bit. 2740 // *H() ..... Single item computation with 16-bit, with packing (aka two 16-bit ops in parallel) when possible. 2741 // *Hx2() ... Processing two items in parallel with 16-bit, easier packing. 2742 // Not all interfaces in this file have a *Hx2() form. 
2743 //============================================================================================================================== 2744 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2745 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2746 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2747 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2748 //_____________________________________________________________/\_______________________________________________________________ 2749 //============================================================================================================================== 2750 // 2751 // FSR - [EASU] EDGE ADAPTIVE SPATIAL UPSAMPLING 2752 // 2753 //------------------------------------------------------------------------------------------------------------------------------ 2754 // EASU provides a high quality spatial-only scaling at relatively low cost. 2755 // Meaning EASU is appropriate for laptops and other low-end GPUs. 2756 // Quality from 1x to 4x area scaling is good. 2757 //------------------------------------------------------------------------------------------------------------------------------ 2758 // The scaler uses a modified fast approximation to the standard lanczos(size=2) kernel. 2759 // EASU runs in a single pass, so it applies a directionally and anisotropically adaptive radial lanczos. 2760 // This is also kept as simple as possible to have minimum runtime. 2761 //------------------------------------------------------------------------------------------------------------------------------ 2762 // The lanczos filter has negative lobes, so by itself it will introduce ringing. 
2763 // To remove all ringing, the algorithm uses the nearest 2x2 input texels as a neighborhood, 2764 // and limits output to the minimum and maximum of that neighborhood. 2765 //------------------------------------------------------------------------------------------------------------------------------ 2766 // Input image requirements: 2767 // 2768 // Color needs to be encoded as 3 channel[red, green, blue](e.g.XYZ not supported) 2769 // Each channel needs to be in the range[0, 1] 2770 // Any color primaries are supported 2771 // Display / tonemapping curve needs to be as if presenting to sRGB display or similar(e.g.Gamma 2.0) 2772 // There should be no banding in the input 2773 // There should be no high amplitude noise in the input 2774 // There should be no noise in the input that is not at input pixel granularity 2775 // For performance purposes, use 32bpp formats 2776 //------------------------------------------------------------------------------------------------------------------------------ 2777 // Best to apply EASU at the end of the frame after tonemapping 2778 // but before film grain or composite of the UI. 
2779 //------------------------------------------------------------------------------------------------------------------------------ 2780 // Example of including this header for D3D HLSL : 2781 // 2782 // #define A_GPU 1 2783 // #define A_HLSL 1 2784 // #define A_HALF 1 2785 // #include "ffx_a.h" 2786 // #define FSR_EASU_H 1 2787 // #define FSR_RCAS_H 1 2788 // //declare input callbacks 2789 // #include "ffx_fsr1.h" 2790 // 2791 // Example of including this header for Vulkan GLSL : 2792 // 2793 // #define A_GPU 1 2794 // #define A_GLSL 1 2795 // #define A_HALF 1 2796 // #include "ffx_a.h" 2797 // #define FSR_EASU_H 1 2798 // #define FSR_RCAS_H 1 2799 // //declare input callbacks 2800 // #include "ffx_fsr1.h" 2801 // 2802 // Example of including this header for Vulkan HLSL : 2803 // 2804 // #define A_GPU 1 2805 // #define A_HLSL 1 2806 // #define A_HLSL_6_2 1 2807 // #define A_NO_16_BIT_CAST 1 2808 // #define A_HALF 1 2809 // #include "ffx_a.h" 2810 // #define FSR_EASU_H 1 2811 // #define FSR_RCAS_H 1 2812 // //declare input callbacks 2813 // #include "ffx_fsr1.h" 2814 // 2815 // Example of declaring the required input callbacks for GLSL : 2816 // The callbacks need to gather4 for each color channel using the specified texture coordinate 'p'. 2817 // EASU uses gather4 to reduce position computation logic and for free Arrays of Structures to Structures of Arrays conversion. 2818 // 2819 // AH4 FsrEasuRH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,0));} 2820 // AH4 FsrEasuGH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,1));} 2821 // AH4 FsrEasuBH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,2));} 2822 // ... 2823 // The FsrEasuCon function needs to be called from the CPU or GPU to set up constants. 2824 // The difference in viewport and input image size is there to support Dynamic Resolution Scaling. 2825 // To use FsrEasuCon() on the CPU, define A_CPU before including ffx_a and ffx_fsr1. 
// Including a GPU example here, the 'con0' through 'con3' values would be stored out to a constant buffer.
//  AU4 con0,con1,con2,con3;
//  FsrEasuCon(con0,con1,con2,con3,
//    1920.0,1080.0,  // Viewport size (top left aligned) in the input image which is to be scaled.
//    3840.0,2160.0,  // The size of the input image.
//    2560.0,1440.0); // The output resolution.
//==============================================================================================================================
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                      CONSTANT SETUP
//==============================================================================================================================
// Computes the four EASU constant vectors (works on CPU or GPU).
// Each float value is passed through AU1_AF1() before being stored in its uint slot.
//   con0.xy = viewport / output size            con0.zw = 0.5 * viewport / output size - 0.5
//   con1.xy = 1 / input size                    con1.zw = (+1 / input width, -1 / input height)
//   con2.xy = (-1 / input width, 2 / input height)
//   con2.zw = (+1 / input width, 2 / input height)
//   con3.xy = ( 0 / input width, 4 / input height)   con3.zw = 0
A_STATIC void FsrEasuCon(
 outAU4 con0,
 outAU4 con1,
 outAU4 con2,
 outAU4 con3,
 // Rendered image resolution that is being upscaled.
 AF1 inputViewportInPixelsX,
 AF1 inputViewportInPixelsY,
 // Resolution of the resource containing the input image (useful for dynamic resolution).
 AF1 inputSizeInPixelsX,
 AF1 inputSizeInPixelsY,
 // Display resolution which the input image gets upscaled to.
 AF1 outputSizeInPixelsX,
 AF1 outputSizeInPixelsY)
{
 // Reciprocals that are reused by several constants below.
 AF1 rcpOutX=ARcpF1(outputSizeInPixelsX);
 AF1 rcpOutY=ARcpF1(outputSizeInPixelsY);
 AF1 rcpInX=ARcpF1(inputSizeInPixelsX);
 AF1 rcpInY=ARcpF1(inputSizeInPixelsY);

 // Output integer position to a pixel position in viewport.
 con0[0]=AU1_AF1(inputViewportInPixelsX*rcpOutX);
 con0[1]=AU1_AF1(inputViewportInPixelsY*rcpOutY);
 con0[2]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsX*rcpOutX-AF1_(0.5));
 con0[3]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsY*rcpOutY-AF1_(0.5));

 // Viewport pixel position to normalized image space.
 // This is used to get upper-left of 'F' tap.
 con1[0]=AU1_AF1(rcpInX);
 con1[1]=AU1_AF1(rcpInY);

 // Centers of gather4, first offset from upper-left of 'F'.
 //      +---+---+
 //      |   |   |
 //      +--(0)--+
 //      | b | c |
 //  +---F---+---+---+
 //  | e | f | g | h |
 //  +--(1)--+--(2)--+
 //  | i | j | k | l |
 //  +---+---+---+---+
 //      | n | o |
 //      +--(3)--+
 //      |   |   |
 //      +---+---+
 con1[2]=AU1_AF1(AF1_( 1.0)*rcpInX);
 con1[3]=AU1_AF1(AF1_(-1.0)*rcpInY);

 // These are from (0) instead of 'F'.
 con2[0]=AU1_AF1(AF1_(-1.0)*rcpInX);
 con2[1]=AU1_AF1(AF1_( 2.0)*rcpInY);
 con2[2]=AU1_AF1(AF1_( 1.0)*rcpInX);
 con2[3]=AU1_AF1(AF1_( 2.0)*rcpInY);
 con3[0]=AU1_AF1(AF1_( 0.0)*rcpInX);
 con3[1]=AU1_AF1(AF1_( 4.0)*rcpInY);

 // The last two slots are zeroed.
 con3[3]=0;
 con3[2]=0;
}

// Variant of FsrEasuCon() for an input image that starts at an offset inside its resource.
// All constants are produced by FsrEasuCon(); afterwards con0[2] and con0[3] are overwritten
// with the same expression plus the input offset.
A_STATIC void FsrEasuConOffset(
 outAU4 con0,
 outAU4 con1,
 outAU4 con2,
 outAU4 con3,
 // Rendered image resolution that is being upscaled.
 AF1 inputViewportInPixelsX,
 AF1 inputViewportInPixelsY,
 // Resolution of the resource containing the input image (useful for dynamic resolution).
 AF1 inputSizeInPixelsX,
 AF1 inputSizeInPixelsY,
 // Display resolution which the input image gets upscaled to.
 AF1 outputSizeInPixelsX,
 AF1 outputSizeInPixelsY,
 // Offset of the input image inside the resource containing it (useful for dynamic resolution).
 AF1 inputOffsetInPixelsX,
 AF1 inputOffsetInPixelsY)
{
 // Offset-adjusted replacements for con0[2] and con0[3]; they depend only on the arguments.
 AF1 biasX=AF1_(0.5)*inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)-AF1_(0.5)+inputOffsetInPixelsX;
 AF1 biasY=AF1_(0.5)*inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)-AF1_(0.5)+inputOffsetInPixelsY;

 FsrEasuCon(con0,con1,con2,con3,
            inputViewportInPixelsX,inputViewportInPixelsY,
            inputSizeInPixelsX,inputSizeInPixelsY,
            outputSizeInPixelsX,outputSizeInPixelsY);

 con0[2]=AU1_AF1(biasX);
 con0[3]=AU1_AF1(biasY);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                   NON-PACKED 32-BIT VERSION
//==============================================================================================================================
#if defined(A_GPU)&&defined(FSR_EASU_F)
 // Input callback prototypes, need to be implemented by calling shader.
 // Each one is called with a texture coordinate 'p' and returns four values of one color channel
 // (R, G or B); FsrEasuF() below reads the components in the order named by its variables
 // (for example 'ijfe': x=i, y=j, z=f, w=e).
 AF4 FsrEasuRF(AF2 p);
 AF4 FsrEasuGF(AF2 p);
 AF4 FsrEasuBF(AF2 p);
//------------------------------------------------------------------------------------------------------------------------------
 // Filtering for a given tap for the scaler.
 // Computes the filter weight of one tap and adds the weighted tap color to 'aC' and the weight to 'aW'.
 void FsrEasuTapF(
 inout AF3 aC, // Accumulated color, with negative lobe.
 inout AF1 aW, // Accumulated weight.
 AF2 off, // Pixel offset from resolve position to tap.
 AF2 dir, // Gradient direction.
 AF2 len, // Length.
 AF1 lob, // Negative lobe strength.
 AF1 clp, // Clipping point.
 AF3 c){ // Tap color.
  // Rotate offset by direction.
  AF2 v;
  v.x=(off.x*( dir.x))+(off.y*dir.y);
  v.y=(off.x*(-dir.y))+(off.y*dir.x);
  // Anisotropy.
  v*=len;
  // Compute distance^2.
  AF1 d2=v.x*v.x+v.y*v.y;
  // Limit to the window as at corner, 2 taps can easily be outside.
  d2=min(d2,clp);
  // Approximation of lanczos2 without sin() or rcp(), or sqrt() to get x.
  //  (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2
  //  |_______________________________________|   |_______________|
  //                   base                             window
  // The general form of the 'base' is,
  //  (a*(b*x^2-1)^2-(a-1))
  // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe.
  // In the code below 'wB' is the base term and 'wA' is the window term, with 'lob' taking the place of 1/4.
  AF1 wB=AF1_(2.0/5.0)*d2+AF1_(-1.0);
  AF1 wA=lob*d2+AF1_(-1.0);
  wB*=wB;
  wA*=wA;
  wB=AF1_(25.0/16.0)*wB+AF1_(-(25.0/16.0-1.0));
  AF1 w=wB*wA;
  // Do weighted average.
  aC+=c*w;aW+=w;}
//------------------------------------------------------------------------------------------------------------------------------
 // Accumulate direction and length.
 // Called once per corner of the 2x2 quad around the resolve position; exactly one of biS/biT/biU/biV is
 // true at each call site in FsrEasuF() and selects that corner's bilinear weight.
 // lA..lE are the luma values of the '+' shaped neighborhood a,b,c,d,e drawn below (c is the center).
 void FsrEasuSetF(
 inout AF2 dir,
 inout AF1 len,
 AF2 pp,
 AP1 biS,AP1 biT,AP1 biU,AP1 biV,
 AF1 lA,AF1 lB,AF1 lC,AF1 lD,AF1 lE){
  // Compute bilinear weight, branches factor out as predicates are compiler time immediates.
  //  s t
  //  u v
  AF1 w = AF1_(0.0);
  if(biS)w=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y);
  if(biT)w=           pp.x *(AF1_(1.0)-pp.y);
  if(biU)w=(AF1_(1.0)-pp.x)*           pp.y ;
  if(biV)w=           pp.x *           pp.y ;
  // Direction is the '+' diff.
  //    a
  //  b c d
  //    e
  // Then takes magnitude from abs average of both sides of 'c'.
  // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms.
  AF1 dc=lD-lC;
  AF1 cb=lC-lB;
  AF1 lenX=max(abs(dc),abs(cb));
  lenX=APrxLoRcpF1(lenX);
  AF1 dirX=lD-lB;
  dir.x+=dirX*w;
  lenX=ASatF1(abs(dirX)*lenX);
  lenX*=lenX;
  len+=lenX*w;
  // Repeat for the y axis.
  AF1 ec=lE-lC;
  AF1 ca=lC-lA;
  AF1 lenY=max(abs(ec),abs(ca));
  lenY=APrxLoRcpF1(lenY);
  AF1 dirY=lE-lA;
  dir.y+=dirY*w;
  lenY=ASatF1(abs(dirY)*lenY);
  lenY*=lenY;
  len+=lenY*w;}
//------------------------------------------------------------------------------------------------------------------------------
 // EASU entry point (32-bit path): computes the output color for one output pixel.
 // Steps: locate texel 'f' and the sub-texel position, fetch the 12-tap neighborhood through the
 // FsrEasuRF/GF/BF callbacks, estimate edge direction and length from luma, accumulate the 12 weighted
 // taps, then clamp the normalized result to the min/max of the four nearest texels (f, g, j, k).
 void FsrEasuF(
 out AF3 pix,
 AU2 ip, // Integer pixel position in output.
 AU4 con0, // Constants generated by FsrEasuCon().
 AU4 con1,
 AU4 con2,
 AU4 con3){
//------------------------------------------------------------------------------------------------------------------------------
  // Get position of 'f'.
  // 'fp' is the integer part, 'pp' keeps the fractional part.
  AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw);
  AF2 fp=floor(pp);
  pp-=fp;
//------------------------------------------------------------------------------------------------------------------------------
  // 12-tap kernel.
  //    b c
  //  e f g h
  //  i j k l
  //    n o
  // Gather 4 ordering.
  //  a b
  //  r g
  // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions,
  //    a b    <- unused (z)
  //    r g
  //  a b a b
  //  r g r g
  //    a b
  //    r g    <- unused (z)
  // Allowing dead-code removal to remove the 'z's.
  AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw);
  // These are from p0 to avoid pulling two constants on pre-Navi hardware.
  AF2 p1=p0+AF2_AU2(con2.xy);
  AF2 p2=p0+AF2_AU2(con2.zw);
  AF2 p3=p0+AF2_AU2(con3.xy);
  // Four fetches per channel; the variable name lists which tap sits in .x .y .z .w ('z' = unused).
  AF4 bczzR=FsrEasuRF(p0);
  AF4 bczzG=FsrEasuGF(p0);
  AF4 bczzB=FsrEasuBF(p0);
  AF4 ijfeR=FsrEasuRF(p1);
  AF4 ijfeG=FsrEasuGF(p1);
  AF4 ijfeB=FsrEasuBF(p1);
  AF4 klhgR=FsrEasuRF(p2);
  AF4 klhgG=FsrEasuGF(p2);
  AF4 klhgB=FsrEasuBF(p2);
  AF4 zzonR=FsrEasuRF(p3);
  AF4 zzonG=FsrEasuGF(p3);
  AF4 zzonB=FsrEasuBF(p3);
//------------------------------------------------------------------------------------------------------------------------------
  // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD).
  AF4 bczzL=bczzB*AF4_(0.5)+(bczzR*AF4_(0.5)+bczzG);
  AF4 ijfeL=ijfeB*AF4_(0.5)+(ijfeR*AF4_(0.5)+ijfeG);
  AF4 klhgL=klhgB*AF4_(0.5)+(klhgR*AF4_(0.5)+klhgG);
  AF4 zzonL=zzonB*AF4_(0.5)+(zzonR*AF4_(0.5)+zzonG);
  // Rename.
  AF1 bL=bczzL.x;
  AF1 cL=bczzL.y;
  AF1 iL=ijfeL.x;
  AF1 jL=ijfeL.y;
  AF1 fL=ijfeL.z;
  AF1 eL=ijfeL.w;
  AF1 kL=klhgL.x;
  AF1 lL=klhgL.y;
  AF1 hL=klhgL.z;
  AF1 gL=klhgL.w;
  AF1 oL=zzonL.z;
  AF1 nL=zzonL.w;
  // Accumulate for bilinear interpolation.
  // One call per corner, centered on f, g, j and k respectively (third luma argument of each call).
  AF2 dir=AF2_(0.0);
  AF1 len=AF1_(0.0);
  FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL);
  FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL);
  FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL);
  FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL);
//------------------------------------------------------------------------------------------------------------------------------
  // Normalize with approximation, and cleanup close to zero.
  // When the squared length is below 1/32768 the direction is replaced by (1, dir.y) with scale 1.
  AF2 dir2=dir*dir;
  AF1 dirR=dir2.x+dir2.y;
  AP1 zro=dirR<AF1_(1.0/32768.0);
  dirR=APrxLoRsqF1(dirR);
  dirR=zro?AF1_(1.0):dirR;
  dir.x=zro?AF1_(1.0):dir.x;
  dir*=AF2_(dirR);
  // Transform from {0 to 2} to {0 to 1} range, and shape with square.
  len=len*AF1_(0.5);
  len*=len;
  // Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}.
  AF1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpF1(max(abs(dir.x),abs(dir.y)));
  // Anisotropic length after rotation,
  //  x := 1.0 lerp to 'stretch' on edges
  //  y := 1.0 lerp to 2x on edges
  AF2 len2=AF2(AF1_(1.0)+(stretch-AF1_(1.0))*len,AF1_(1.0)+AF1_(-0.5)*len);
  // Based on the amount of 'edge',
  // the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}.
  AF1 lob=AF1_(0.5)+AF1_((1.0/4.0-0.04)-0.5)*len;
  // Set distance^2 clipping point to the end of the adjustable window.
  AF1 clp=APrxLoRcpF1(lob);
//------------------------------------------------------------------------------------------------------------------------------
  // Accumulation mixed with min/max of 4 nearest.
  //    b c
  //  e f g h
  //  i j k l
  //    n o
  // min4/max4 are taken over f (ijfe.z), g (klhg.w), j (ijfe.y) and k (klhg.x).
  AF3 min4=min(AMin3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)),
               AF3(klhgR.x,klhgG.x,klhgB.x));
  AF3 max4=max(AMax3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)),
               AF3(klhgR.x,klhgG.x,klhgB.x));
  // Accumulation.
  // The AF2 literal of each call is the tap position relative to 'f'; subtracting 'pp' gives the offset
  // from the resolve position.
  AF3 aC=AF3_(0.0);
  AF1 aW=AF1_(0.0);
  FsrEasuTapF(aC,aW,AF2( 0.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.x,bczzG.x,bczzB.x)); // b
  FsrEasuTapF(aC,aW,AF2( 1.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.y,bczzG.y,bczzB.y)); // c
  FsrEasuTapF(aC,aW,AF2(-1.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.x,ijfeG.x,ijfeB.x)); // i
  FsrEasuTapF(aC,aW,AF2( 0.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.y,ijfeG.y,ijfeB.y)); // j
  FsrEasuTapF(aC,aW,AF2( 0.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.z,ijfeG.z,ijfeB.z)); // f
  FsrEasuTapF(aC,aW,AF2(-1.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.w,ijfeG.w,ijfeB.w)); // e
  FsrEasuTapF(aC,aW,AF2( 1.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.x,klhgG.x,klhgB.x)); // k
  FsrEasuTapF(aC,aW,AF2( 2.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.y,klhgG.y,klhgB.y)); // l
  FsrEasuTapF(aC,aW,AF2( 2.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.z,klhgG.z,klhgB.z)); // h
  FsrEasuTapF(aC,aW,AF2( 1.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.w,klhgG.w,klhgB.w)); // g
  FsrEasuTapF(aC,aW,AF2( 1.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.z,zzonG.z,zzonB.z)); // o
  FsrEasuTapF(aC,aW,AF2( 0.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.w,zzonG.w,zzonB.w)); // n
//------------------------------------------------------------------------------------------------------------------------------
  // Normalize and dering.
  // Divide the accumulated color by the accumulated weight, then clamp to [min4, max4].
  pix=min(max4,max(min4,aC*AF3_(ARcpF1(aW))));}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                    PACKED 16-BIT VERSION
//==============================================================================================================================
#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_EASU_H)
// Input callback prototypes, need to be implemented by calling shader.
// Same role as FsrEasuRF/GF/BF above, but returning AH4 values.
 AH4 FsrEasuRH(AF2 p);
 AH4 FsrEasuGH(AF2 p);
 AH4 FsrEasuBH(AF2 p);
//------------------------------------------------------------------------------------------------------------------------------
 // This runs 2 taps in parallel.
 // Packed counterpart of FsrEasuTapF(): the two components of each AH2 argument belong to two different taps.
 //  aCR/aCG/aCB .. Accumulated red/green/blue, one partial sum per component.
 //  aW ........... Accumulated weight, one partial sum per component.
 //  offX/offY .... X and Y pixel offsets of the two taps.
 //  dir, len ..... Direction and anisotropic length, shared by both taps.
 //  lob, clp ..... Negative lobe strength and distance^2 clipping point, shared by both taps.
 //  cR/cG/cB ..... Red/green/blue of the two taps.
 void FsrEasuTapH(
 inout AH2 aCR,inout AH2 aCG,inout AH2 aCB,
 inout AH2 aW,
 AH2 offX,AH2 offY,
 AH2 dir,
 AH2 len,
 AH1 lob,
 AH1 clp,
 AH2 cR,AH2 cG,AH2 cB){
  // Rotate both offsets by the direction, then apply the anisotropic length.
  AH2 vX,vY;
  vX=offX* dir.xx +offY*dir.yy;
  vY=offX*(-dir.yy)+offY*dir.xx;
  vX*=len.x;vY*=len.y;
  // Distance^2 per tap, limited to the clipping point.
  AH2 d2=vX*vX+vY*vY;
  d2=min(d2,AH2_(clp));
  // Same base * window polynomial as in FsrEasuTapF().
  AH2 wB=AH2_(2.0/5.0)*d2+AH2_(-1.0);
  AH2 wA=AH2_(lob)*d2+AH2_(-1.0);
  wB*=wB;
  wA*=wA;
  wB=AH2_(25.0/16.0)*wB+AH2_(-(25.0/16.0-1.0));
  AH2 w=wB*wA;
  // Weighted accumulation per channel.
  aCR+=cR*w;aCG+=cG*w;aCB+=cB*w;aW+=w;}
//------------------------------------------------------------------------------------------------------------------------------
 // This runs 2 taps in parallel.
// Adds the direction/length contribution of two '+' shaped luma neighbourhoods
// (one per AH2 lane) to the running totals.
//   dirPX/dirPY : accumulated horizontal / vertical luma differences
//   lenP        : accumulated squared, normalised edge measure
//   pp          : fractional resolve position
//   biST/biUV   : select the weight pair; biST uses (1-pp.y), biUV uses pp.y
//                 (if both are set the biUV weights win, if neither the
//                 contribution is zero)
//   lA..lE      : lumas in a '+' around lC: lB/lD are differenced for X,
//                 lA/lE for Y
void FsrEasuSetH(
  inout AH2 dirPX, inout AH2 dirPY,
  inout AH2 lenP,
  AH2 pp,
  AP1 biST, AP1 biUV,
  AH2 lA, AH2 lB, AH2 lC, AH2 lD, AH2 lE)
{
  AH2 wgt = AH2_(0.0);
  if (biST) {
    wgt = (AH2(1.0,0.0) + AH2(-pp.x,pp.x)) * AH2_(AH1_(1.0) - pp.y);
  }
  if (biUV) {
    wgt = (AH2(1.0,0.0) + AH2(-pp.x,pp.x)) * AH2_(pp.y);
  }

  // Horizontal axis. ABS is not free in the packed FP16 path.
  AH2 dc = lD - lC;
  AH2 cb = lC - lB;
  AH2 lenX = max(abs(dc), abs(cb));
  lenX = ARcpH2(lenX);
  AH2 dirX = lD - lB;
  dirPX += dirX * wgt;
  lenX = ASatH2(abs(dirX) * lenX);
  lenX *= lenX;
  lenP += lenX * wgt;

  // Vertical axis, same recipe.
  AH2 ec = lE - lC;
  AH2 ca = lC - lA;
  AH2 lenY = max(abs(ec), abs(ca));
  lenY = ARcpH2(lenY);
  AH2 dirY = lE - lA;
  dirPY += dirY * wgt;
  lenY = ASatH2(abs(dirY) * lenY);
  lenY *= lenY;
  lenP += lenY * wgt;
}
//------------------------------------------------------------------------------
// Packed-FP16 EASU resolve for one output pixel.
//   pix        : resulting colour
//   ip         : integer pixel position in the output
//   con0..con3 : constants holding float bit patterns (read via AF2_AU2):
//                con0 maps 'ip' to a source position, con1 maps the integer
//                source position to the first fetch position, con2.xy, con2.zw
//                and con3.xy are the offsets of the other three fetches.
// Tap names follow the swizzles used below:
//   bczz = {b,c,-,-}  ijfe = {i,j,f,e}  klhg = {k,l,h,g}  zzon = {-,-,o,n}
void FsrEasuH(
  out AH3 pix,
  AU2 ip,
  AU4 con0,
  AU4 con1,
  AU4 con2,
  AU4 con3)
{
  // Source position: integer part in 'fp', fraction in 'pp' / 'ppH'.
  AF2 pp = AF2(ip) * AF2_AU2(con0.xy) + AF2_AU2(con0.zw);
  AF2 fp = floor(pp);
  pp -= fp;
  AH2 ppH = AH2(pp);

  // Four fetch positions, three channel fetches each.
  AF2 p0 = fp * AF2_AU2(con1.xy) + AF2_AU2(con1.zw);
  AF2 p1 = p0 + AF2_AU2(con2.xy);
  AF2 p2 = p0 + AF2_AU2(con2.zw);
  AF2 p3 = p0 + AF2_AU2(con3.xy);
  AH4 bczzR = FsrEasuRH(p0);
  AH4 bczzG = FsrEasuGH(p0);
  AH4 bczzB = FsrEasuBH(p0);
  AH4 ijfeR = FsrEasuRH(p1);
  AH4 ijfeG = FsrEasuGH(p1);
  AH4 ijfeB = FsrEasuBH(p1);
  AH4 klhgR = FsrEasuRH(p2);
  AH4 klhgG = FsrEasuGH(p2);
  AH4 klhgB = FsrEasuBH(p2);
  AH4 zzonR = FsrEasuRH(p3);
  AH4 zzonG = FsrEasuGH(p3);
  AH4 zzonB = FsrEasuBH(p3);

  // Per-tap luma proxy: 0.5*B + (0.5*R + G).
  AH4 bczzL = bczzB * AH4_(0.5) + (bczzR * AH4_(0.5) + bczzG);
  AH4 ijfeL = ijfeB * AH4_(0.5) + (ijfeR * AH4_(0.5) + ijfeG);
  AH4 klhgL = klhgB * AH4_(0.5) + (klhgR * AH4_(0.5) + klhgG);
  AH4 zzonL = zzonB * AH4_(0.5) + (zzonR * AH4_(0.5) + zzonG);
  AH1 bL = bczzL.x;
  AH1 cL = bczzL.y;
  AH1 iL = ijfeL.x;
  AH1 jL = ijfeL.y;
  AH1 fL = ijfeL.z;
  AH1 eL = ijfeL.w;
  AH1 kL = klhgL.x;
  AH1 lL = klhgL.y;
  AH1 hL = klhgL.z;
  AH1 gL = klhgL.w;
  AH1 oL = zzonL.z;
  AH1 nL = zzonL.w;

  // This part is different, accumulating 2 taps in parallel.
  AH2 dirPX = AH2_(0.0);
  AH2 dirPY = AH2_(0.0);
  AH2 lenP = AH2_(0.0);
  FsrEasuSetH(dirPX,dirPY,lenP,ppH,true, false,AH2(bL,cL),AH2(eL,fL),AH2(fL,gL),AH2(gL,hL),AH2(jL,kL));
  FsrEasuSetH(dirPX,dirPY,lenP,ppH,false,true ,AH2(fL,gL),AH2(iL,jL),AH2(jL,kL),AH2(kL,lL),AH2(nL,oL));
  // Collapse the two lanes into one direction and one length.
  AH2 dir = AH2(dirPX.r + dirPX.g, dirPY.r + dirPY.g);
  AH1 len = lenP.r + lenP.g;

  // Normalise the direction; a near-zero direction falls back to (1,0).
  AH2 dir2 = dir * dir;
  AH1 dirR = dir2.x + dir2.y;
  AP1 isZero = dirR < AH1_(1.0/32768.0);
  dirR = APrxLoRsqH1(dirR);
  dirR = isZero ? AH1_(1.0) : dirR;
  dir.x = isZero ? AH1_(1.0) : dir.x;
  dir *= AH2_(dirR);
  // Shape the length, then derive the per-axis kernel scale, the window
  // coefficient and the squared-distance clamp from it.
  len = len * AH1_(0.5);
  len *= len;
  AH1 stretch = (dir.x * dir.x + dir.y * dir.y) * APrxLoRcpH1(max(abs(dir.x), abs(dir.y)));
  AH2 len2 = AH2(AH1_(1.0) + (stretch - AH1_(1.0)) * len, AH1_(1.0) + AH1_(-0.5) * len);
  AH1 lob = AH1_(0.5) + AH1_((1.0/4.0-0.04)-0.5) * len;
  AH1 clp = APrxLoRcpH1(lob);

  // FP16 is different, using packed trick to do min and max in same operation:
  // lane x ends up as max(-f,-g,-j,-k) = -min, lane y as max(f,g,j,k).
  AH2 bothR = max(max(AH2(-ijfeR.z,ijfeR.z),AH2(-klhgR.w,klhgR.w)),max(AH2(-ijfeR.y,ijfeR.y),AH2(-klhgR.x,klhgR.x)));
  AH2 bothG = max(max(AH2(-ijfeG.z,ijfeG.z),AH2(-klhgG.w,klhgG.w)),max(AH2(-ijfeG.y,ijfeG.y),AH2(-klhgG.x,klhgG.x)));
  AH2 bothB = max(max(AH2(-ijfeB.z,ijfeB.z),AH2(-klhgB.w,klhgB.w)),max(AH2(-ijfeB.y,ijfeB.y),AH2(-klhgB.x,klhgB.x)));

  // This part is different for FP16, working pairs of taps at a time.
  AH2 pR = AH2_(0.0);
  AH2 pG = AH2_(0.0);
  AH2 pB = AH2_(0.0);
  AH2 pW = AH2_(0.0);
  FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0, 1.0)-ppH.xx,AH2(-1.0,-1.0)-ppH.yy,dir,len2,lob,clp,bczzR.xy,bczzG.xy,bczzB.xy);
  FsrEasuTapH(pR,pG,pB,pW,AH2(-1.0, 0.0)-ppH.xx,AH2( 1.0, 1.0)-ppH.yy,dir,len2,lob,clp,ijfeR.xy,ijfeG.xy,ijfeB.xy);
  FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0,-1.0)-ppH.xx,AH2( 0.0, 0.0)-ppH.yy,dir,len2,lob,clp,ijfeR.zw,ijfeG.zw,ijfeB.zw);
  FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 2.0)-ppH.xx,AH2( 1.0, 1.0)-ppH.yy,dir,len2,lob,clp,klhgR.xy,klhgG.xy,klhgB.xy);
  FsrEasuTapH(pR,pG,pB,pW,AH2( 2.0, 1.0)-ppH.xx,AH2( 0.0, 0.0)-ppH.yy,dir,len2,lob,clp,klhgR.zw,klhgG.zw,klhgB.zw);
  FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 0.0)-ppH.xx,AH2( 2.0, 2.0)-ppH.yy,dir,len2,lob,clp,zzonR.zw,zzonG.zw,zzonB.zw);
  AH3 aC = AH3(pR.x + pR.y, pG.x + pG.y, pB.x + pB.y);
  AH1 aW = pW.x + pW.y;

  // Slightly different for FP16 version due to combined min and max:
  // normalise by the weight sum and clamp to the {f,g,j,k} min/max.
  pix = min(AH3(bothR.y,bothG.y,bothB.y), max(-AH3(bothR.x,bothG.x,bothB.x), aC * AH3_(ARcpH1(aW))));
}
#endif
//==============================================================================
//
//               FSR - [RCAS] ROBUST CONTRAST ADAPTIVE SHARPENING
//
//------------------------------------------------------------------------------
// CAS uses a simplified mechanism to convert local contrast into a variable amount of sharpness.
// RCAS uses a more exact mechanism, solving for the maximum local sharpness possible before clipping.
// RCAS also has a built in process to limit sharpening of what it detects as possible noise.
// The RCAS sharpener does not support scaling, as it should be applied after EASU scaling.
// Pass EASU output straight into RCAS, no color conversions necessary.
//------------------------------------------------------------------------------
// RCAS is based on the following logic.
// RCAS uses a 5 tap filter in a cross pattern (same as CAS),
//    w                n
//  w 1 w  for taps  w m e
//    w                s
// Where 'w' is the negative lobe weight.
//  output = (w*(n+e+w+s)+m)/(4*w+1)
// RCAS solves for 'w' by seeing where the signal might clip out of the {0 to 1} input range,
//  0 == (w*(n+e+w+s)+m)/(4*w+1) -> w = -m/(n+e+w+s)
//  1 == (w*(n+e+w+s)+m)/(4*w+1) -> w = (1-m)/(n+e+w+s-4*1)
// Then chooses the 'w' which results in no clipping, limits 'w', and multiplies by the 'sharp' amount.
// This solution above has issues with MSAA input as the steps along the gradient cause edge detection issues.
// So RCAS uses 4x the maximum and 4x the minimum (depending on equation) in place of the individual taps.
// As well as switching from 'm' to either the minimum or maximum (depending on side), to help in energy conservation.
// This stabilizes RCAS.
// RCAS does a simple highpass which is normalized against the local contrast then shaped,
//         0.25
//   0.25   -1   0.25
//         0.25
// This is used as a noise detection filter, to reduce the effect of RCAS on grain, and focus on real edges.
//
//  GLSL example for the required callbacks :
//
//  AH4 FsrRcasLoadH(ASW2 p){return AH4(imageLoad(imgSrc,ASU2(p)));}
//  void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b)
//  {
//    //do any simple input color conversions here or leave empty if none needed
//  }
//
//  FsrRcasCon needs to be called from the CPU or GPU to set up constants.
//  Including a GPU example here, the 'con' value would be stored out to a constant buffer.
//
//  AU4 con;
//  FsrRcasCon(con,
//   0.0); // The scale is {0.0 := maximum sharpness, to N>0, where N is the number of stops (halving) of the reduction of sharpness}.
// ---------------
// RCAS sharpening supports a CAS-like pass-through alpha via,
//  #define FSR_RCAS_PASSTHROUGH_ALPHA 1
// RCAS also supports a define to enable a more expensive path to avoid some sharpening of noise.
// Would suggest it is better to apply film grain after RCAS sharpening (and after scaling) instead of using this define,
//  #define FSR_RCAS_DENOISE 1
//==============================================================================
// This is set at the limit of providing unnatural results for sharpening.
#define FSR_RCAS_LIMIT (0.25-(1.0/16.0))
//==============================================================================
//                               CONSTANT SETUP
//==============================================================================
// Call to setup required constant values (works on CPU or GPU).
//   con       : receives the constants; slot 0 is written from the scalar
//               sharpness, slot 1 from the same value duplicated into a
//               2-vector (presumably packed as two halves - see
//               AU1_AH2_AF2), slots 2 and 3 are zero.
//   sharpness : the scale is {0.0 := maximum, to N>0, where N is the number of
//               stops (halving) of the reduction of sharpness}.
A_STATIC void FsrRcasCon(
  outAU4 con,
  AF1 sharpness)
{
  // Transform from stops to linear value.
  sharpness = AExp2F1(-sharpness);
  varAF2(hSharp) = initAF2(sharpness, sharpness);
  con[0] = AU1_AF1(sharpness);
  con[1] = AU1_AH2_AF2(hSharp);
  con[2] = 0;
  con[3] = 0;
}
//==============================================================================
//                         NON-PACKED 32-BIT VERSION
//==============================================================================
#if defined(A_GPU)&&defined(FSR_RCAS_F)
// Input callbacks; the including shader has to supply the bodies.
AF4 FsrRcasLoadF(ASU2 p);
void FsrRcasInputF(inout AF1 r, inout AF1 g, inout AF1 b);
//------------------------------------------------------------------------------
// Sharpens one pixel with a 5-tap cross filter whose negative lobe weight is
// chosen per pixel. With FSR_RCAS_PASSTHROUGH_ALPHA defined, the centre tap's
// alpha is copied to 'pixA'. With FSR_RCAS_DENOISE defined, the lobe is further
// scaled by the noise estimate 'nz'.
void FsrRcasF(
  out AF1 pixR, // Output values, non-vector so port between RcasFilter() and RcasFilterH() is easy.
  out AF1 pixG,
  out AF1 pixB,
#ifdef FSR_RCAS_PASSTHROUGH_ALPHA
  out AF1 pixA,
#endif
  AU2 ip,  // Integer pixel position in output.
  AU4 con) // Constants; con.x is read as float bits of the sharpness scale.
{
  // Algorithm uses minimal 3x3 pixel neighborhood.
  //    b
  //  d e f
  //    h
  ASU2 sp = ASU2(ip);
  AF3 b = FsrRcasLoadF(sp + ASU2( 0,-1)).rgb;
  AF3 d = FsrRcasLoadF(sp + ASU2(-1, 0)).rgb;
#ifdef FSR_RCAS_PASSTHROUGH_ALPHA
  AF4 ee = FsrRcasLoadF(sp);
  AF3 e = ee.rgb;
  pixA = ee.a;
#else
  AF3 e = FsrRcasLoadF(sp).rgb;
#endif
  AF3 f = FsrRcasLoadF(sp + ASU2( 1, 0)).rgb;
  AF3 h = FsrRcasLoadF(sp + ASU2( 0, 1)).rgb;

  // Rename (32-bit) or regroup (16-bit).
  AF1 bR = b.r;
  AF1 bG = b.g;
  AF1 bB = b.b;
  AF1 dR = d.r;
  AF1 dG = d.g;
  AF1 dB = d.b;
  AF1 eR = e.r;
  AF1 eG = e.g;
  AF1 eB = e.b;
  AF1 fR = f.r;
  AF1 fG = f.g;
  AF1 fB = f.b;
  AF1 hR = h.r;
  AF1 hG = h.g;
  AF1 hB = h.b;

  // Run optional input transform.
  FsrRcasInputF(bR, bG, bB);
  FsrRcasInputF(dR, dG, dB);
  FsrRcasInputF(eR, eG, eB);
  FsrRcasInputF(fR, fG, fB);
  FsrRcasInputF(hR, hG, hB);

  // Luma times 2.
  AF1 bL = bB * AF1_(0.5) + (bR * AF1_(0.5) + bG);
  AF1 dL = dB * AF1_(0.5) + (dR * AF1_(0.5) + dG);
  AF1 eL = eB * AF1_(0.5) + (eR * AF1_(0.5) + eG);
  AF1 fL = fB * AF1_(0.5) + (fR * AF1_(0.5) + fG);
  AF1 hL = hB * AF1_(0.5) + (hR * AF1_(0.5) + hG);

  // Noise detection: |highpass| over the local luma range, mapped to {1 .. 0.5}.
  AF1 nz = AF1_(0.25) * bL + AF1_(0.25) * dL + AF1_(0.25) * fL + AF1_(0.25) * hL - eL;
  nz = ASatF1(abs(nz) * APrxMedRcpF1(AMax3F1(AMax3F1(bL,dL,eL),fL,hL) - AMin3F1(AMin3F1(bL,dL,eL),fL,hL)));
  nz = AF1_(-0.5) * nz + AF1_(1.0);

  // Min and max of ring.
  AF1 mn4R = min(AMin3F1(bR,dR,fR), hR);
  AF1 mn4G = min(AMin3F1(bG,dG,fG), hG);
  AF1 mn4B = min(AMin3F1(bB,dB,fB), hB);
  AF1 mx4R = max(AMax3F1(bR,dR,fR), hR);
  AF1 mx4G = max(AMax3F1(bG,dG,fG), hG);
  AF1 mx4B = max(AMax3F1(bB,dB,fB), hB);

  // Immediate constants for peak range.
  AF2 peakC = AF2(1.0, -1.0*4.0);

  // Limiters, these need to be high precision RCPs.
  AF1 hitMinR = min(mn4R, eR) * ARcpF1(AF1_(4.0) * mx4R);
  AF1 hitMinG = min(mn4G, eG) * ARcpF1(AF1_(4.0) * mx4G);
  AF1 hitMinB = min(mn4B, eB) * ARcpF1(AF1_(4.0) * mx4B);
  AF1 hitMaxR = (peakC.x - max(mx4R, eR)) * ARcpF1(AF1_(4.0) * mn4R + peakC.y);
  AF1 hitMaxG = (peakC.x - max(mx4G, eG)) * ARcpF1(AF1_(4.0) * mn4G + peakC.y);
  AF1 hitMaxB = (peakC.x - max(mx4B, eB)) * ARcpF1(AF1_(4.0) * mn4B + peakC.y);
  AF1 lobeR = max(-hitMinR, hitMaxR);
  AF1 lobeG = max(-hitMinG, hitMaxG);
  AF1 lobeB = max(-hitMinB, hitMaxB);
  // Clamp the shared lobe to {-FSR_RCAS_LIMIT .. 0}, then scale by the sharpness.
  AF1 lobe = max(AF1_(-FSR_RCAS_LIMIT), min(AMax3F1(lobeR,lobeG,lobeB), AF1_(0.0))) * AF1_AU1(con.x);

  // Apply noise removal.
#ifdef FSR_RCAS_DENOISE
  lobe *= nz;
#endif

  // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
  AF1 rcpL = APrxMedRcpF1(AF1_(4.0) * lobe + AF1_(1.0));
  pixR = (lobe * bR + lobe * dR + lobe * hR + lobe * fR + eR) * rcpL;
  pixG = (lobe * bG + lobe * dG + lobe * hG + lobe * fG + eG) * rcpL;
  pixB = (lobe * bB + lobe * dB + lobe * hB + lobe * fB + eB) * rcpL;
}
#endif
//==============================================================================
//                         NON-PACKED 16-BIT VERSION
//==============================================================================
#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_H)
// Input callbacks; the including shader has to supply the bodies.
AH4 FsrRcasLoadH(ASW2 p);
void FsrRcasInputH(inout AH1 r, inout AH1 g, inout AH1 b);
//------------------------------------------------------------------------------
// 16-bit variant of the RCAS resolve for one pixel. With
// FSR_RCAS_PASSTHROUGH_ALPHA defined, the centre tap's alpha is copied to
// 'pixA'. With FSR_RCAS_DENOISE defined, the lobe is scaled by 'nz'.
void FsrRcasH(
  out AH1 pixR, // Output values, non-vector so port between RcasFilter() and RcasFilterH() is easy.
  out AH1 pixG,
  out AH1 pixB,
#ifdef FSR_RCAS_PASSTHROUGH_ALPHA
  out AH1 pixA,
#endif
  AU2 ip,  // Integer pixel position in output.
  AU4 con) // Constants; con.y is read as a packed half pair, lane x is used.
{
  // Sharpening algorithm uses minimal 3x3 pixel neighborhood.
  //    b
  //  d e f
  //    h
  ASW2 sp = ASW2(ip);
  AH3 b = FsrRcasLoadH(sp + ASW2( 0,-1)).rgb;
  AH3 d = FsrRcasLoadH(sp + ASW2(-1, 0)).rgb;
#ifdef FSR_RCAS_PASSTHROUGH_ALPHA
  AH4 ee = FsrRcasLoadH(sp);
  AH3 e = ee.rgb;
  pixA = ee.a;
#else
  AH3 e = FsrRcasLoadH(sp).rgb;
#endif
  AH3 f = FsrRcasLoadH(sp + ASW2( 1, 0)).rgb;
  AH3 h = FsrRcasLoadH(sp + ASW2( 0, 1)).rgb;

  // Rename (32-bit) or regroup (16-bit).
  AH1 bR = b.r;
  AH1 bG = b.g;
  AH1 bB = b.b;
  AH1 dR = d.r;
  AH1 dG = d.g;
  AH1 dB = d.b;
  AH1 eR = e.r;
  AH1 eG = e.g;
  AH1 eB = e.b;
  AH1 fR = f.r;
  AH1 fG = f.g;
  AH1 fB = f.b;
  AH1 hR = h.r;
  AH1 hG = h.g;
  AH1 hB = h.b;

  // Run optional input transform.
  FsrRcasInputH(bR, bG, bB);
  FsrRcasInputH(dR, dG, dB);
  FsrRcasInputH(eR, eG, eB);
  FsrRcasInputH(fR, fG, fB);
  FsrRcasInputH(hR, hG, hB);

  // Luma times 2.
  AH1 bL = bB * AH1_(0.5) + (bR * AH1_(0.5) + bG);
  AH1 dL = dB * AH1_(0.5) + (dR * AH1_(0.5) + dG);
  AH1 eL = eB * AH1_(0.5) + (eR * AH1_(0.5) + eG);
  AH1 fL = fB * AH1_(0.5) + (fR * AH1_(0.5) + fG);
  AH1 hL = hB * AH1_(0.5) + (hR * AH1_(0.5) + hG);

  // Noise detection.
  AH1 nz = AH1_(0.25) * bL + AH1_(0.25) * dL + AH1_(0.25) * fL + AH1_(0.25) * hL - eL;
  nz = ASatH1(abs(nz) * APrxMedRcpH1(AMax3H1(AMax3H1(bL,dL,eL),fL,hL) - AMin3H1(AMin3H1(bL,dL,eL),fL,hL)));
  nz = AH1_(-0.5) * nz + AH1_(1.0);

  // Min and max of ring.
  AH1 mn4R = min(AMin3H1(bR,dR,fR), hR);
  AH1 mn4G = min(AMin3H1(bG,dG,fG), hG);
  AH1 mn4B = min(AMin3H1(bB,dB,fB), hB);
  AH1 mx4R = max(AMax3H1(bR,dR,fR), hR);
  AH1 mx4G = max(AMax3H1(bG,dG,fG), hG);
  AH1 mx4B = max(AMax3H1(bB,dB,fB), hB);

  // Immediate constants for peak range.
  AH2 peakC = AH2(1.0, -1.0*4.0);

  // Limiters, these need to be high precision RCPs.
  AH1 hitMinR = min(mn4R, eR) * ARcpH1(AH1_(4.0) * mx4R);
  AH1 hitMinG = min(mn4G, eG) * ARcpH1(AH1_(4.0) * mx4G);
  AH1 hitMinB = min(mn4B, eB) * ARcpH1(AH1_(4.0) * mx4B);
  AH1 hitMaxR = (peakC.x - max(mx4R, eR)) * ARcpH1(AH1_(4.0) * mn4R + peakC.y);
  AH1 hitMaxG = (peakC.x - max(mx4G, eG)) * ARcpH1(AH1_(4.0) * mn4G + peakC.y);
  AH1 hitMaxB = (peakC.x - max(mx4B, eB)) * ARcpH1(AH1_(4.0) * mn4B + peakC.y);
  AH1 lobeR = max(-hitMinR, hitMaxR);
  AH1 lobeG = max(-hitMinG, hitMaxG);
  AH1 lobeB = max(-hitMinB, hitMaxB);
  AH1 lobe = max(AH1_(-FSR_RCAS_LIMIT), min(AMax3H1(lobeR,lobeG,lobeB), AH1_(0.0))) * AH2_AU1(con.y).x;

  // Apply noise removal.
#ifdef FSR_RCAS_DENOISE
  lobe *= nz;
#endif

  // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
  AH1 rcpL = APrxMedRcpH1(AH1_(4.0) * lobe + AH1_(1.0));
  pixR = (lobe * bR + lobe * dR + lobe * hR + lobe * fR + eR) * rcpL;
  pixG = (lobe * bG + lobe * dG + lobe * hG + lobe * fG + eG) * rcpL;
  pixB = (lobe * bB + lobe * dB + lobe * hB + lobe * fB + eB) * rcpL;
}
#endif
//==============================================================================
//                           PACKED 16-BIT VERSION
//==============================================================================
#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_HX2)
// Input callbacks; the calling shader has to supply the bodies.
AH4 FsrRcasLoadHx2(ASW2 p);
void FsrRcasInputHx2(inout AH2 r, inout AH2 g, inout AH2 b);
//------------------------------------------------------------------------------
// Can be used to convert from packed Structures of Arrays to Arrays of Structures for store.
// Lane x of each channel goes to pix0.rgb, lane y to pix1.rgb; alpha is only
// written (to 0.0) on the HLSL path.
void FsrRcasDepackHx2(out AH4 pix0, out AH4 pix1, AH2 pixR, AH2 pixG, AH2 pixB)
{
#ifdef A_HLSL
  // Invoke a slower path for DX only, since it won't allow uninitialized values.
  pix0.a = pix1.a = 0.0;
#endif
  pix0.rgb = AH3(pixR.x, pixG.x, pixB.x);
  pix1.rgb = AH3(pixR.y, pixG.y, pixB.y);
}
//------------------------------------------------------------------------------
// Packed RCAS resolve for two pixels at once: the pixel at 'ip' (lane x) and
// the pixel 8 columns to its right (lane y).
void FsrRcasHx2(
  // Output values are for 2 8x8 tiles in a 16x8 region.
  //  pix<R,G,B>.x = left 8x8 tile
  //  pix<R,G,B>.y = right 8x8 tile
  // This enables later processing to easily be packed as well.
  out AH2 pixR,
  out AH2 pixG,
  out AH2 pixB,
#ifdef FSR_RCAS_PASSTHROUGH_ALPHA
  out AH2 pixA,
#endif
  AU2 ip,  // Integer pixel position in output.
  AU4 con) // Constants; con.y is read as a packed half pair, lane x is used.
{
  // No scaling algorithm uses minimal 3x3 pixel neighborhood.
  // First pixel.
  ASW2 sp0 = ASW2(ip);
  AH3 b0 = FsrRcasLoadHx2(sp0 + ASW2( 0,-1)).rgb;
  AH3 d0 = FsrRcasLoadHx2(sp0 + ASW2(-1, 0)).rgb;
#ifdef FSR_RCAS_PASSTHROUGH_ALPHA
  AH4 ee0 = FsrRcasLoadHx2(sp0);
  AH3 e0 = ee0.rgb;
  pixA.r = ee0.a;
#else
  AH3 e0 = FsrRcasLoadHx2(sp0).rgb;
#endif
  AH3 f0 = FsrRcasLoadHx2(sp0 + ASW2( 1, 0)).rgb;
  AH3 h0 = FsrRcasLoadHx2(sp0 + ASW2( 0, 1)).rgb;
  // Second pixel, 8 columns to the right.
  ASW2 sp1 = sp0 + ASW2(8,0);
  AH3 b1 = FsrRcasLoadHx2(sp1 + ASW2( 0,-1)).rgb;
  AH3 d1 = FsrRcasLoadHx2(sp1 + ASW2(-1, 0)).rgb;
#ifdef FSR_RCAS_PASSTHROUGH_ALPHA
  AH4 ee1 = FsrRcasLoadHx2(sp1);
  AH3 e1 = ee1.rgb;
  pixA.g = ee1.a;
#else
  AH3 e1 = FsrRcasLoadHx2(sp1).rgb;
#endif
  AH3 f1 = FsrRcasLoadHx2(sp1 + ASW2( 1, 0)).rgb;
  AH3 h1 = FsrRcasLoadHx2(sp1 + ASW2( 0, 1)).rgb;

  // Arrays of Structures to Structures of Arrays conversion.
  AH2 bR = AH2(b0.r, b1.r);
  AH2 bG = AH2(b0.g, b1.g);
  AH2 bB = AH2(b0.b, b1.b);
  AH2 dR = AH2(d0.r, d1.r);
  AH2 dG = AH2(d0.g, d1.g);
  AH2 dB = AH2(d0.b, d1.b);
  AH2 eR = AH2(e0.r, e1.r);
  AH2 eG = AH2(e0.g, e1.g);
  AH2 eB = AH2(e0.b, e1.b);
  AH2 fR = AH2(f0.r, f1.r);
  AH2 fG = AH2(f0.g, f1.g);
  AH2 fB = AH2(f0.b, f1.b);
  AH2 hR = AH2(h0.r, h1.r);
  AH2 hG = AH2(h0.g, h1.g);
  AH2 hB = AH2(h0.b, h1.b);

  // Run optional input transform.
  FsrRcasInputHx2(bR, bG, bB);
  FsrRcasInputHx2(dR, dG, dB);
  FsrRcasInputHx2(eR, eG, eB);
  FsrRcasInputHx2(fR, fG, fB);
  FsrRcasInputHx2(hR, hG, hB);

  // Luma times 2.
  AH2 bL = bB * AH2_(0.5) + (bR * AH2_(0.5) + bG);
  AH2 dL = dB * AH2_(0.5) + (dR * AH2_(0.5) + dG);
  AH2 eL = eB * AH2_(0.5) + (eR * AH2_(0.5) + eG);
  AH2 fL = fB * AH2_(0.5) + (fR * AH2_(0.5) + fG);
  AH2 hL = hB * AH2_(0.5) + (hR * AH2_(0.5) + hG);

  // Noise detection.
  AH2 nz = AH2_(0.25) * bL + AH2_(0.25) * dL + AH2_(0.25) * fL + AH2_(0.25) * hL - eL;
  nz = ASatH2(abs(nz) * APrxMedRcpH2(AMax3H2(AMax3H2(bL,dL,eL),fL,hL) - AMin3H2(AMin3H2(bL,dL,eL),fL,hL)));
  nz = AH2_(-0.5) * nz + AH2_(1.0);

  // Min and max of ring.
  AH2 mn4R = min(AMin3H2(bR,dR,fR), hR);
  AH2 mn4G = min(AMin3H2(bG,dG,fG), hG);
  AH2 mn4B = min(AMin3H2(bB,dB,fB), hB);
  AH2 mx4R = max(AMax3H2(bR,dR,fR), hR);
  AH2 mx4G = max(AMax3H2(bG,dG,fG), hG);
  AH2 mx4B = max(AMax3H2(bB,dB,fB), hB);

  // Immediate constants for peak range.
  AH2 peakC = AH2(1.0, -1.0*4.0);

  // Limiters, these need to be high precision RCPs.
  AH2 hitMinR = min(mn4R, eR) * ARcpH2(AH2_(4.0) * mx4R);
  AH2 hitMinG = min(mn4G, eG) * ARcpH2(AH2_(4.0) * mx4G);
  AH2 hitMinB = min(mn4B, eB) * ARcpH2(AH2_(4.0) * mx4B);
  AH2 hitMaxR = (peakC.x - max(mx4R, eR)) * ARcpH2(AH2_(4.0) * mn4R + peakC.y);
  AH2 hitMaxG = (peakC.x - max(mx4G, eG)) * ARcpH2(AH2_(4.0) * mn4G + peakC.y);
  AH2 hitMaxB = (peakC.x - max(mx4B, eB)) * ARcpH2(AH2_(4.0) * mn4B + peakC.y);
  AH2 lobeR = max(-hitMinR, hitMaxR);
  AH2 lobeG = max(-hitMinG, hitMaxG);
  AH2 lobeB = max(-hitMinB, hitMaxB);
  AH2 lobe = max(AH2_(-FSR_RCAS_LIMIT), min(AMax3H2(lobeR,lobeG,lobeB), AH2_(0.0))) * AH2_(AH2_AU1(con.y).x);

  // Apply noise removal.
#ifdef FSR_RCAS_DENOISE
  lobe *= nz;
#endif

  // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
  AH2 rcpL = APrxMedRcpH2(AH2_(4.0) * lobe + AH2_(1.0));
  pixR = (lobe * bR + lobe * dR + lobe * hR + lobe * fR + eR) * rcpL;
  pixG = (lobe * bG + lobe * dG + lobe * hG + lobe * fG + eG) * rcpL;
  pixB = (lobe * bB + lobe * dB + lobe * hB + lobe * fB + eB) * rcpL;
}
#endif
//==============================================================================
//
//                  FSR - [LFGA] LINEAR FILM GRAIN APPLICATOR
//
//------------------------------------------------------------------------------
// Adding output-resolution film grain after scaling is a good way to mask both rendering and scaling artifacts.
// Suggest using tiled blue noise as film grain input, with peak noise frequency set for a specific look and feel.
// The 'Lfga*()' functions provide a convenient way to introduce grain.
// These functions limit grain based on distance to signal limits.
// This is done so that the grain is temporally energy preserving, and thus won't modify image tonality.
// Grain application should be done in a linear colorspace.
// The grain should be temporally changing, but have a temporal sum per pixel that adds to zero (non-biased).
//------------------------------------------------------------------------------
// Usage,
//   FsrLfga*(
//    color,   // In/out linear colorspace color {0 to 1} ranged.
//    grain,   // Per pixel grain texture value {-0.5 to 0.5} ranged, input is 3-channel to support colored grain.
//    amount); // Amount of grain {0 to 1} ranged.
//------------------------------------------------------------------------------
// Example if grain texture is monochrome: 'FsrLfgaF(color,AF3_(grain),amount)'
//==============================================================================
#if defined(A_GPU)
// Adds grain 't' scaled by 'a' to 'c' in place. The grain is additionally
// scaled by min(1-c, c): maximum grain is the minimum distance to the signal
// limit.
void FsrLfgaF(inout AF3 c, AF3 t, AF1 a)
{
  AF3 headroom = min(AF3_(1.0) - c, c);
  c += (t * AF3_(a)) * headroom;
}
#endif
//==============================================================================
#if defined(A_GPU)&&defined(A_HALF)
// Half precision version (slower).
void FsrLfgaH(inout AH3 c, AH3 t, AH1 a)
{
  AH3 headroom = min(AH3_(1.0) - c, c);
  c += (t * AH3_(a)) * headroom;
}
//------------------------------------------------------------------------------
// Packed half precision version (faster).
// Same operation per channel, with two pixels per AH2.
void FsrLfgaHx2(inout AH2 cR, inout AH2 cG, inout AH2 cB, AH2 tR, AH2 tG, AH2 tB, AH1 a)
{
  AH2 amount = AH2_(a);
  cR += (tR * amount) * min(AH2_(1.0) - cR, cR);
  cG += (tG * amount) * min(AH2_(1.0) - cG, cG);
  cB += (tB * amount) * min(AH2_(1.0) - cB, cB);
}
#endif
//==============================================================================
//
//                 FSR - [SRTM] SIMPLE REVERSIBLE TONE-MAPPER
//
//------------------------------------------------------------------------------
// This provides a way to take linear HDR color {0 to FP16_MAX} and convert it into a temporary {0 to 1} ranged post-tonemapped linear.
// The tonemapper preserves RGB ratio, which helps maintain HDR color bleed during filtering.
//------------------------------------------------------------------------------
// Reversible tonemapper usage,
//  FsrSrtm*(color); // {0 to FP16_MAX} converted to {0 to 1}.
//  FsrSrtmInv*(color); // {0 to 1} converted into {0 to 32768, output peak safe for FP16}.
//==============================================================================
#if defined(A_GPU)
// Forward map: c = c / (max(c.r, c.g, c.b) + 1), in place.
void FsrSrtmF(inout AF3 c)
{
  AF1 peak = AMax3F1(c.r, c.g, c.b);
  c *= AF3_(ARcpF1(peak + AF1_(1.0)));
}
// Inverse map: c = c / max(1/32768, 1 - max(c.r, c.g, c.b)), in place.
// The extra max solves the c=1.0 case (which is a /0).
void FsrSrtmInvF(inout AF3 c)
{
  AF1 peak = AMax3F1(c.r, c.g, c.b);
  c *= AF3_(ARcpF1(max(AF1_(1.0/32768.0), AF1_(1.0) - peak)));
}
#endif
//==============================================================================
#if defined(A_GPU)&&defined(A_HALF)
// Half precision forward map.
void FsrSrtmH(inout AH3 c)
{
  AH1 peak = AMax3H1(c.r, c.g, c.b);
  c *= AH3_(ARcpH1(peak + AH1_(1.0)));
}
// Half precision inverse map (denominator floored at 1/32768).
void FsrSrtmInvH(inout AH3 c)
{
  AH1 peak = AMax3H1(c.r, c.g, c.b);
  c *= AH3_(ARcpH1(max(AH1_(1.0/32768.0), AH1_(1.0) - peak)));
}
//------------------------------------------------------------------------------
// Packed forward map: two pixels at once, one per lane, channels split.
void FsrSrtmHx2(inout AH2 cR, inout AH2 cG, inout AH2 cB)
{
  AH2 scale = ARcpH2(AMax3H2(cR, cG, cB) + AH2_(1.0));
  cR *= scale;
  cG *= scale;
  cB *= scale;
}
// Packed inverse map (denominator floored at 1/32768).
void FsrSrtmInvHx2(inout AH2 cR, inout AH2 cG, inout AH2 cB)
{
  AH2 scale = ARcpH2(max(AH2_(1.0/32768.0), AH2_(1.0) - AMax3H2(cR, cG, cB)));
  cR *= scale;
  cG *= scale;
  cB *= scale;
}
#endif
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
//==============================================================================================================================
//
//                                     FSR - [TEPD] TEMPORAL ENERGY PRESERVING DITHER
//
//------------------------------------------------------------------------------------------------------------------------------
// Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion.
// Gamma 2.0 is used so that the conversion back to linear is just to square the color.
// The conversion comes in 8-bit and 10-bit modes, designed for output to 8-bit UNORM or 10:10:10:2 respectively.
// Given good non-biased temporal blue noise as dither input,
// the output dither will temporally conserve energy.
// This is done by choosing the linear nearest step point instead of perceptual nearest.
// See code below for details.
//------------------------------------------------------------------------------------------------------------------------------
// DX SPEC RULES FOR FLOAT->UNORM 8-BIT CONVERSION
// ===============================================
// - Output is 'uint(floor(saturate(n)*255.0+0.5))'.
// - Thus rounding is to nearest.
// - NaN gets converted to zero.
// - INF is clamped to {0.0 to 1.0}.
//==============================================================================================================================
#if defined(A_GPU)
 // Hand tuned integer position to dither value, with more values than simple checkerboard.
 // Only 32-bit has enough precision for this computation.
 // Output is {0 to <1}.
 //  p - integer pixel position.
 //  f - integer offset added to p.x before conversion to float.
 AF1 FsrTepdDitF(AU2 p,AU1 f){
  AF1 col=AF1_(p.x+f);
  AF1 row=AF1_(p.y);
  // The 1.61803 golden ratio.
  AF1 golden=AF1_((1.0+sqrt(5.0))/2.0);
  // Number designed to provide a good visual pattern.
  AF1 rowScale=AF1_(1.0/3.69);
  return AFractF1(col*golden+(row*rowScale));
 }
//------------------------------------------------------------------------------------------------------------------------------
 // This version is 8-bit gamma 2.0.
 // The 'c' input is {0 to 1}.
 // Output is {0 to 1} ready for image store.
 void FsrTepdC8F(inout AF3 c,AF1 dit){
  AF3 oneStep=AF3_(1.0/255.0);
  // Gamma 2.0 value rounded down to an 8-bit step.
  AF3 lo=floor(sqrt(c)*AF3_(255.0))*oneStep;
  // Linear values of that step and of the next step up.
  AF3 loLin=lo*lo;
  AF3 hi=lo+oneStep;
  AF3 hiLin=hi*hi;
  // Ratio of 'loLin' to 'hiLin' required to produce 'c'.
  // APrxLoRcpF1() won't work here (at least for very high dynamic ranges).
  // APrxMedRcpF1() is an IADD,FMA,MUL.
  AF3 ratio=(c-hiLin)*APrxMedRcpF3(loLin-hiLin);
  // Use the ratio as a cutoff to choose the lower or the upper step.
  // AGtZeroF1() is a MUL.
  c=ASatF3(lo+AGtZeroF3(AF3_(dit)-ratio)*oneStep);
 }
//------------------------------------------------------------------------------------------------------------------------------
 // This version is 10-bit gamma 2.0.
 // The 'c' input is {0 to 1}.
 // Output is {0 to 1} ready for image store.
3799 void FsrTepdC10F(inout AF3 c,AF1 dit){ 3800 AF3 n=sqrt(c); 3801 n=floor(n*AF3_(1023.0))*AF3_(1.0/1023.0); 3802 AF3 a=n*n; 3803 AF3 b=n+AF3_(1.0/1023.0);b=b*b; 3804 AF3 r=(c-b)*APrxMedRcpF3(a-b); 3805 c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/1023.0));} 3806 #endif 3807 //============================================================================================================================== 3808 #if defined(A_GPU)&&defined(A_HALF) 3809 AH1 FsrTepdDitH(AU2 p,AU1 f){ 3810 AF1 x=AF1_(p.x+f); 3811 AF1 y=AF1_(p.y); 3812 AF1 a=AF1_((1.0+sqrt(5.0))/2.0); 3813 AF1 b=AF1_(1.0/3.69); 3814 x=x*a+(y*b); 3815 return AH1(AFractF1(x));} 3816 //------------------------------------------------------------------------------------------------------------------------------ 3817 void FsrTepdC8H(inout AH3 c,AH1 dit){ 3818 AH3 n=sqrt(c); 3819 n=floor(n*AH3_(255.0))*AH3_(1.0/255.0); 3820 AH3 a=n*n; 3821 AH3 b=n+AH3_(1.0/255.0);b=b*b; 3822 AH3 r=(c-b)*APrxMedRcpH3(a-b); 3823 c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/255.0));} 3824 //------------------------------------------------------------------------------------------------------------------------------ 3825 void FsrTepdC10H(inout AH3 c,AH1 dit){ 3826 AH3 n=sqrt(c); 3827 n=floor(n*AH3_(1023.0))*AH3_(1.0/1023.0); 3828 AH3 a=n*n; 3829 AH3 b=n+AH3_(1.0/1023.0);b=b*b; 3830 AH3 r=(c-b)*APrxMedRcpH3(a-b); 3831 c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/1023.0));} 3832 //============================================================================================================================== 3833 // This computes dither for positions 'p' and 'p+{8,0}'. 
3834 AH2 FsrTepdDitHx2(AU2 p,AU1 f){ 3835 AF2 x; 3836 x.x=AF1_(p.x+f); 3837 x.y=x.x+AF1_(8.0); 3838 AF1 y=AF1_(p.y); 3839 AF1 a=AF1_((1.0+sqrt(5.0))/2.0); 3840 AF1 b=AF1_(1.0/3.69); 3841 x=x*AF2_(a)+AF2_(y*b); 3842 return AH2(AFractF2(x));} 3843 //------------------------------------------------------------------------------------------------------------------------------ 3844 void FsrTepdC8Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ 3845 AH2 nR=sqrt(cR); 3846 AH2 nG=sqrt(cG); 3847 AH2 nB=sqrt(cB); 3848 nR=floor(nR*AH2_(255.0))*AH2_(1.0/255.0); 3849 nG=floor(nG*AH2_(255.0))*AH2_(1.0/255.0); 3850 nB=floor(nB*AH2_(255.0))*AH2_(1.0/255.0); 3851 AH2 aR=nR*nR; 3852 AH2 aG=nG*nG; 3853 AH2 aB=nB*nB; 3854 AH2 bR=nR+AH2_(1.0/255.0);bR=bR*bR; 3855 AH2 bG=nG+AH2_(1.0/255.0);bG=bG*bG; 3856 AH2 bB=nB+AH2_(1.0/255.0);bB=bB*bB; 3857 AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR); 3858 AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG); 3859 AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB); 3860 cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/255.0)); 3861 cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/255.0)); 3862 cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/255.0));} 3863 //------------------------------------------------------------------------------------------------------------------------------ 3864 void FsrTepdC10Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ 3865 AH2 nR=sqrt(cR); 3866 AH2 nG=sqrt(cG); 3867 AH2 nB=sqrt(cB); 3868 nR=floor(nR*AH2_(1023.0))*AH2_(1.0/1023.0); 3869 nG=floor(nG*AH2_(1023.0))*AH2_(1.0/1023.0); 3870 nB=floor(nB*AH2_(1023.0))*AH2_(1.0/1023.0); 3871 AH2 aR=nR*nR; 3872 AH2 aG=nG*nG; 3873 AH2 aB=nB*nB; 3874 AH2 bR=nR+AH2_(1.0/1023.0);bR=bR*bR; 3875 AH2 bG=nG+AH2_(1.0/1023.0);bG=bG*bG; 3876 AH2 bB=nB+AH2_(1.0/1023.0);bB=bB*bB; 3877 AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR); 3878 AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG); 3879 AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB); 3880 cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/1023.0)); 3881 cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/1023.0)); 3882 
cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/1023.0));} 3883 #endif 3884 3885 3886 void CurrFilter(AU2 pos) 3887 { 3888 AF3 c; 3889 FsrRcasF(c.r, c.g, c.b, pos, con0); 3890 imageStore(imgOutput, ASU2(pos), AF4(c, 1)); 3891 } 3892 3893 void main() { 3894 FsrRcasCon(con0, sharpening_data); 3895 3896 AU2 gxy = ARmp8x8(gl_LocalInvocationID.x) + AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u); 3897 CurrFilter(gxy); 3898 gxy.x += 8u; 3899 CurrFilter(gxy); 3900 gxy.y += 8u; 3901 CurrFilter(gxy); 3902 gxy.x -= 8u; 3903 CurrFilter(gxy); 3904 }