/ vulkan-shaders / generic_binary_head.comp
generic_binary_head.comp
#extension GL_EXT_shader_16bit_storage : require

// Shared preamble for two-input, one-output element-wise compute shaders.
// Declares the push constants, the workgroup size, the three storage buffers
// and helpers that turn a flat element number into a buffer index.
// A_TYPE, B_TYPE and D_TYPE are not defined here; they have to be supplied
// by whatever includes/compiles this file.

layout (push_constant) uniform parameter
{
    // Not referenced by the helpers below.
    uint ne;
    // Operand 0 (buffer A): extents ne00..ne03 and per-dimension multipliers nb00..nb03.
    uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
    // Operand 1 (buffer B): extents ne10..ne13 and per-dimension multipliers nb10..nb13.
    uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
    // Destination (buffer D): extents ne20..ne23 and per-dimension multipliers nb20..nb23.
    uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
    // Not referenced by the helpers below.
    uint d_offset;
    float param1; float param2;
} p;

layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};

// Splits the flat index `idx` into four coordinates using the operand-0
// extents (ne00 varies fastest), then returns the sum of each coordinate
// times its nb0x multiplier.
uint src0_idx(uint idx) {
    const uint row_len   = p.ne00;
    const uint plane_len = p.ne01 * row_len;
    const uint cube_len  = p.ne02 * plane_len;

    // `rest` holds what is left of idx after peeling off the outer coordinates.
    uint rest = idx;
    const uint c3 = rest / cube_len;
    rest -= c3 * cube_len;
    const uint c2 = rest / plane_len;
    rest -= c2 * plane_len;
    const uint c1 = rest / row_len;
    const uint c0 = rest - c1 * row_len;

    return c3*p.nb03 + c2*p.nb02 + c1*p.nb01 + c0*p.nb00;
}

// Splits the flat index `idx` using the operand-0 extents (same decomposition
// as src0_idx), wraps every coordinate modulo the matching operand-1 extent,
// and returns the sum of the wrapped coordinates times the nb1x multipliers.
// The wrap makes an operand 1 that is smaller than operand 0 repeat along
// that dimension.
uint src1_idx(uint idx) {
    const uint row_len   = p.ne00;
    const uint plane_len = p.ne01 * row_len;
    const uint cube_len  = p.ne02 * plane_len;

    uint rest = idx;
    const uint c3 = rest / cube_len;
    rest -= c3 * cube_len;
    const uint c2 = rest / plane_len;
    rest -= c2 * plane_len;
    const uint c1 = rest / row_len;
    const uint c0 = rest - c1 * row_len;

    return (c3 % p.ne13)*p.nb13 + (c2 % p.ne12)*p.nb12 + (c1 % p.ne11)*p.nb11 + (c0 % p.ne10)*p.nb10;
}

// Splits the flat index `idx` into four coordinates using the destination
// extents (ne20 varies fastest), then returns the sum of each coordinate
// times its nb2x multiplier.
uint dst_idx(uint idx) {
    const uint row_len   = p.ne20;
    const uint plane_len = p.ne21 * row_len;
    const uint cube_len  = p.ne22 * plane_len;

    uint rest = idx;
    const uint c3 = rest / cube_len;
    rest -= c3 * cube_len;
    const uint c2 = rest / plane_len;
    rest -= c2 * plane_len;
    const uint c1 = rest / row_len;
    const uint c0 = rest - c1 * row_len;

    return c3*p.nb23 + c2*p.nb22 + c1*p.nb21 + c0*p.nb20;
}