#include "ggml-vulkan.h"
#include <vulkan/vulkan_core.h>
#ifdef GGML_VULKAN_RUN_TESTS
#include <chrono>
#endif

#include <vulkan/vulkan.hpp>

#include <algorithm>
#include <cmath>
#include <iomanip>
#include <iostream>
#include <tuple>
#include <vector>
#include <sstream>
#include <utility>
#include <memory>
#include <limits>
#include <map>

#include "ggml.h"
#include "ggml-backend-impl.h"

#include "ggml-vulkan-shaders.hpp"

#define VK_API_VERSION VK_API_VERSION_1_2

#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
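// Integer round-up division, e.g. CEIL_DIV(130, 64) == 3; used throughout to
// turn element counts into workgroup/dispatch counts.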

#define VK_VENDOR_ID_AMD 0x1002
#define VK_VENDOR_ID_APPLE 0x106b
#define VK_VENDOR_ID_INTEL 0x8086
#define VK_VENDOR_ID_NVIDIA 0x10de

#define VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN 0
#define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1
#define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2

#define VK_NUM_TYPES 16

#define GGML_VK_MAX_NODES 8192

#define MAX_VK_BUFFERS 256

#ifndef K_QUANTS_PER_ITERATION
#define K_QUANTS_PER_ITERATION 1
#else
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
#endif

#define VK_CHECK(err, msg)                                          \
    do {                                                            \
        vk::Result err_ = (err);                                    \
        if (err_ != vk::Result::eSuccess) {                         \
            fprintf(stderr, "ggml_vulkan: %s error %s at %s:%d\n",  \
                #err, to_string(err_).c_str(), __FILE__, __LINE__); \
            exit(1);                                                \
        }                                                           \
    } while (0)
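// Usage sketch for VK_CHECK, assuming a call that returns vk::Result:
//   VK_CHECK(device.waitForFences({ fence }, true, UINT64_MAX), "wait for fence");
// Note that msg is currently unused; the stringized expression (#err) is
// printed instead.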

#ifdef GGML_VULKAN_DEBUG
#define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
#else
#define VK_LOG_DEBUG(msg) ((void) 0)
#endif // GGML_VULKAN_DEBUG

struct ggml_backend_vk_context;

struct vk_queue {
    uint32_t queue_family_index;
    vk::Queue queue;
    vk::CommandPool pool;
    uint32_t cmd_buffer_idx;
    std::vector<vk::CommandBuffer> cmd_buffers;

    vk::PipelineStageFlags stage_flags;
};

struct vk_pipeline_struct {
    std::string name;
    vk::ShaderModule shader_module;
    vk::DescriptorSetLayout dsl;
    std::vector<vk::DescriptorPool> descriptor_pools;
    std::vector<vk::DescriptorSet> descriptor_sets;
    uint32_t descriptor_set_idx;
    vk::PipelineLayout layout;
    vk::Pipeline pipeline;
    uint32_t push_constant_size;
    uint32_t parameter_count;
    std::array<uint32_t, 3> wg_denoms;
    uint32_t align;
};

typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
typedef std::weak_ptr<vk_pipeline_struct> vk_pipeline_ref;

static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline);

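// Matmul pipelines come in three tile sizes (l/m/s for large/medium/small) plus
// aligned variants (a_l/a_m/a_s) that may only be used when the relevant
// dimensions are a multiple of the pipeline's align value, which lets the
// shader skip bounds handling on its block loads.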
struct vk_matmul_pipeline_struct {
    vk_pipeline l, m, s;
    vk_pipeline a_l, a_m, a_s;
};

typedef std::shared_ptr<vk_matmul_pipeline_struct> vk_matmul_pipeline;

struct vk_device {
    vk::PhysicalDevice physical_device;
    vk::PhysicalDeviceProperties properties;
    std::string name;
    uint64_t max_memory_allocation_size;
    bool fp16;
    vk::Device device;
    uint32_t vendor_id;
    vk_queue compute_queue;
    vk_queue transfer_queue;
    bool single_queue;
    uint32_t descriptor_set_mode;
    uint32_t subgroup_size;
    bool uma;

    bool initialized;
    size_t idx;

    vk_matmul_pipeline pipeline_matmul_f32;
    vk_matmul_pipeline pipeline_matmul_f32_f16;
    vk_matmul_pipeline pipeline_matmul_f16;
    vk_matmul_pipeline pipeline_matmul_f16_f32;
    vk_pipeline pipeline_matmul_split_k_reduce;

    vk_matmul_pipeline pipeline_dequant_mul_mat_mat[VK_NUM_TYPES];

    vk_matmul_pipeline pipeline_matmul_id_f32;
    vk_matmul_pipeline pipeline_matmul_id_f16;
    vk_matmul_pipeline pipeline_matmul_id_f16_f32;

    vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[VK_NUM_TYPES];

    vk_pipeline pipeline_dequant[VK_NUM_TYPES];
    vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[VK_NUM_TYPES];
    vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[VK_NUM_TYPES];
    vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[VK_NUM_TYPES];

    vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
    vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
    vk_pipeline pipeline_get_rows[VK_NUM_TYPES];
    vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES];
    vk_pipeline pipeline_mul_f32;
    vk_pipeline pipeline_div_f32;
    vk_pipeline pipeline_add_f32;
    vk_pipeline pipeline_scale_f32;
    vk_pipeline pipeline_sqr_f32;
    vk_pipeline pipeline_clamp_f32;
    vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
    vk_pipeline pipeline_norm_f32;
    vk_pipeline pipeline_rms_norm_f32;
    vk_pipeline pipeline_gelu_f32;
    vk_pipeline pipeline_silu_f32;
    vk_pipeline pipeline_relu_f32;
    vk_pipeline pipeline_diag_mask_inf_f32;
    vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
    vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
    vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
    vk_pipeline pipeline_argsort_f32;
    vk_pipeline pipeline_sum_rows_f32;

    std::vector<vk_pipeline_ref> pipelines;

    ~vk_device() {
        VK_LOG_DEBUG("destroy device " << name);
        device.destroyCommandPool(compute_queue.pool);
        if (!single_queue) {
            device.destroyCommandPool(transfer_queue.pool);
        }

        for (auto& pipeline : pipelines) {
            if (pipeline.expired()) {
                continue;
            }

            vk_pipeline pl = pipeline.lock();
            ggml_vk_destroy_pipeline(device, pl);
        }
        pipelines.clear();

        device.destroy();
    }
};

struct vk_buffer_struct {
    vk::Buffer buffer;
    vk::DeviceMemory device_memory;
    vk::MemoryPropertyFlags memory_property_flags;
    void * ptr;
    size_t size = 0;

    ggml_backend_vk_context * ctx;

    std::shared_ptr<vk_device> device;

    ~vk_buffer_struct() {
        if (size == 0) {
            return;
        }
        VK_LOG_DEBUG("~vk_buffer_struct(" << buffer << ", " << size << ")");

        device->device.freeMemory(device_memory);
        device->device.destroyBuffer(buffer);
    }
};

typedef std::shared_ptr<vk_buffer_struct> vk_buffer;
typedef std::weak_ptr<vk_buffer_struct> vk_buffer_ref;

struct vk_subbuffer {
    vk_buffer buffer;
    uint64_t offset;
    uint64_t size;
};

struct vk_semaphore {
    vk::Semaphore s;
    uint64_t value;
};

struct vk_submission {
    vk::CommandBuffer buffer;
    std::vector<vk_semaphore> wait_semaphores;
    std::vector<vk_semaphore> signal_semaphores;
};

typedef std::vector<vk_submission> vk_sequence;
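// A vk_submission is one recorded command buffer plus the timeline semaphores
// it waits on and signals; a vk_sequence is a list of submissions that must
// execute in order. ggml_vk_submit() below flattens all sequences of a context
// into a single vkQueueSubmit call.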

struct vk_mat_mat_push_constants {
    uint32_t M; uint32_t N; uint32_t K;
    uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
    uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
    uint32_t k_split;
    uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
};
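// This struct, and the push-constant structs that follow, mirror the std430
// push_constant blocks of the corresponding shaders field for field. All of
// them stay well under the 128-byte maxPushConstantsSize that Vulkan
// guarantees; the largest here is vk_op_binary_push_constants at 28 * 4 = 112
// bytes.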
struct vk_mat_vec_push_constants {
    uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
    uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
    uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
};

struct vk_mat_mat_id_push_constants {
    uint32_t M; uint32_t N; uint32_t K;
    uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
    uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
    uint32_t nei0; uint32_t nei1; uint32_t nbi1; uint32_t ne11;
};
struct vk_mat_vec_id_push_constants {
    uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
    uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
    uint32_t nei0; uint32_t ne11;
};

struct vk_op_push_constants {
    uint32_t KX;
    uint32_t KY;
    float param1;
    float param2;
};

struct vk_op_unary_push_constants {
    uint32_t ne;
    uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
    uint32_t d_offset;
    float param1; float param2;
};

struct vk_op_binary_push_constants {
    uint32_t ne;
    uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
    uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
    uint32_t d_offset;
    float param1; float param2;
};

struct vk_op_diag_mask_push_constants {
    uint32_t ncols;
    uint32_t rows_per_channel;
    int32_t n_past;
};

struct vk_op_rope_push_constants {
    uint32_t ncols;
    uint32_t n_dims;
    float freq_scale;
    uint32_t p_delta_rows;
    float freq_base;
    float ext_factor;
    float attn_factor;
    float corr_dims[2];
    float theta_scale;
    uint32_t has_ff;
};

struct vk_op_soft_max_push_constants {
    uint32_t KX;
    uint32_t KY;
    float scale;
    float max_bias;
    float m0;
    float m1;
    uint32_t n_head_log2;
};

struct vk_op_argsort_push_constants {
    uint32_t ncols;
    uint32_t ncols_pad;
    int32_t order;
};

// Allow pre-recording command buffers
struct vk_staging_memcpy {
    vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}

    void * dst;
    const void * src;
    size_t n;
};
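// Host-side copies are queued as vk_staging_memcpy entries while commands are
// recorded and are only executed around actual submission, so command buffers
// can be pre-recorded before the source data is ready.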

struct vk_context {
    size_t idx;

    vk_submission * s;
    std::vector<vk_sequence> seqs;

    ggml_tensor * exit_tensor;

    std::vector<vk_staging_memcpy> in_memcpys;
    std::vector<vk_staging_memcpy> out_memcpys;

    vk_queue * q;
};

struct ggml_tensor_extra_gpu {
    size_t ctx_idx;

    vk_buffer_ref buffer_gpu;
    uint64_t offset;

    void reset() {
        ctx_idx = 0;
        buffer_gpu.reset();
        offset = 0;
    }
};

struct ggml_vk_garbage_collector {
    std::vector<vk_semaphore> tl_semaphores;
    std::vector<vk_semaphore> semaphores;
    std::vector<vk::Event> events;
    std::vector<vk_buffer> temp_buffers;
    std::vector<vk_context> contexts;
};

#if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG)
#include <mutex>

#define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl

static std::string format_size(size_t size) {
    const size_t kib = 1024;
    const size_t mib = kib * 1024;
    const size_t gib = mib * 1024;

    std::ostringstream oss;
    oss << std::fixed << std::setprecision(2);

    if (size >= gib) {
        oss << static_cast<double>(size) / gib << " GiB";
    } else if (size >= mib) {
        oss << static_cast<double>(size) / mib << " MiB";
    } else if (size >= kib) {
        oss << static_cast<double>(size) / kib << " KiB";
    } else {
        oss << size << " B";
    }

    return oss.str();
}
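// e.g. format_size(1536)                     == "1.50 KiB"
//      format_size(3ull * 1024*1024*1024)    == "3.00 GiB"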

static std::mutex log_mutex;

class vk_memory_logger {
public:
    vk_memory_logger(): total_device(0), total_host(0) {}
    void log_allocation(vk_buffer_ref buf_ref, size_t size);
    void log_deallocation(vk_buffer_ref buf_ref);

private:
    std::map<vk::Buffer, size_t> allocations; // Track allocations
    size_t total_device;
    size_t total_host;
};
#else
#define VK_LOG_MEMORY(msg) ((void) 0)
#endif // GGML_VULKAN_MEMORY_DEBUG || GGML_VULKAN_DEBUG

struct ggml_backend_vk_context {
    std::string name;

    std::shared_ptr<vk_device> device;

    size_t semaphore_idx, event_idx;
    ggml_vk_garbage_collector gc;
    std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
    size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
    vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
    vk::Fence fence;
    vk_buffer staging;
    size_t staging_size;
    size_t staging_offset;
    vk_buffer sync_staging;

    vk_buffer buffer_pool[MAX_VK_BUFFERS];

    vk_context * compute_ctx;
    vk_context * transfer_ctx;

    bool initialized;

    size_t idx;

#ifdef GGML_VULKAN_MEMORY_DEBUG
    vk_memory_logger memory_logger;
#endif
};

#ifdef GGML_VULKAN_MEMORY_DEBUG
void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
    std::lock_guard<std::mutex> guard(log_mutex);
    vk_buffer buf = buf_ref.lock();
    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
    const std::string type = device ? "device" : "host";
    allocations[buf->buffer] = size;
    total_device += device ? size : 0;
    total_host += device ? 0 : size;
    VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
}

void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
    if (buf_ref.expired() || buf_ref.lock()->size == 0) {
        return;
    }

    std::lock_guard<std::mutex> guard(log_mutex);
    vk_buffer buf = buf_ref.lock();
    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
    std::string type = device ? "device" : "host";
    auto it = allocations.find(buf->buffer);
    if (it != allocations.end()) {
        // Only dereference the iterator after confirming the lookup succeeded
        total_device -= device ? it->second : 0;
        total_host -= device ? 0 : it->second;
        VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
        allocations.erase(it);
    } else {
        VK_LOG_MEMORY("ERROR VULKAN" << buf->ctx->idx << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer);
    }
}
#endif // GGML_VULKAN_MEMORY_DEBUG

struct vk_instance_t {
    vk::Instance instance;

    std::vector<size_t> device_indices;

    ggml_backend_t backends[GGML_VK_MAX_DEVICES];
    ggml_backend_vk_context contexts[GGML_VK_MAX_DEVICES];
    ggml_backend_buffer_type buffer_types[GGML_VK_MAX_DEVICES];
    bool initialized[GGML_VK_MAX_DEVICES];
};

static std::shared_ptr<vk_device> ggml_vk_get_device(size_t idx) {
    VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
    static std::weak_ptr<vk_device> devices[GGML_VK_MAX_DEVICES];

    if (devices[idx].expired()) {
        VK_LOG_DEBUG("Initializing new vk_device");
        std::shared_ptr<vk_device> device = std::make_shared<vk_device>();
        device->initialized = false;
        devices[idx] = device;
        return device;
    }

    return devices[idx].lock();
}
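// The weak_ptr cache means all backend contexts using the same physical device
// share one vk_device; its destructor (and with it pipeline and command pool
// teardown) runs once the last shared_ptr owner goes away.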

#ifdef GGML_VULKAN_CHECK_RESULTS
static size_t vk_skip_checks;
static size_t vk_output_tensor;

static void ggml_vk_print_tensor(ggml_backend * ctx, const ggml_tensor * tensor, const char * name);
static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor);
static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor);
#endif

typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);

static bool vk_instance_initialized = false;
static vk_instance_t vk_instance;

GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);

static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
    VK_LOG_DEBUG("ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
    GGML_ASSERT(parameter_count > 0);
    GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT

    pipeline = std::make_shared<vk_pipeline_struct>();
    pipeline->name = name;
    pipeline->parameter_count = parameter_count;
    pipeline->push_constant_size = push_constant_size;
    pipeline->wg_denoms = wg_denoms;
    pipeline->align = align;

    vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
    pipeline->shader_module = ctx->device->device.createShaderModule(shader_module_create_info);

    std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
    std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
    for (uint32_t i = 0; i < parameter_count; i++) {
        dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
        dsl_binding_flags.push_back({});
    }

    vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };

    vk::PushConstantRange pcr(
        vk::ShaderStageFlagBits::eCompute,
        0,
        pipeline->push_constant_size
    );

    vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
        {},
        dsl_binding);
    descriptor_set_layout_create_info.setPNext(&dslbfci);
    pipeline->dsl = ctx->device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);

    // Check if the device supports allocating multiple descriptor sets per pool
    if (ctx->device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN) {
        const uint32_t alloc_count = 2;

        // Try allocating multiple sets from one pool
        // This fails on AMD for some reason, so add a fallback to allocating one pool per set
        vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
        vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, alloc_count, descriptor_pool_size);
        vk::DescriptorPool pool = ctx->device->device.createDescriptorPool(descriptor_pool_create_info);

        std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
        for (uint32_t i = 0; i < alloc_count; i++) {
            layouts[i] = pipeline->dsl;
        }
        try {
            vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pool, alloc_count, layouts.data());
            std::vector<vk::DescriptorSet> sets = ctx->device->device.allocateDescriptorSets(descriptor_set_alloc_info);
            // The probe succeeded, so multi-set pools can be used from here on
            ctx->device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI;
        } catch (vk::OutOfPoolMemoryError const&) {
            ctx->device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE;
        }

        ctx->device->device.destroyDescriptorPool(pool);
    }

    if (ctx->device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) {
        vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
        vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 128, descriptor_pool_size);
        pipeline->descriptor_pools.push_back(ctx->device->device.createDescriptorPool(descriptor_pool_create_info));
    }

    pipeline->descriptor_set_idx = 0;

    vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline->dsl, pcr);
    pipeline->layout = ctx->device->device.createPipelineLayout(pipeline_layout_create_info);

    std::vector<vk::SpecializationMapEntry> specialization_entries(specialization_constants.size());

    for (size_t i = 0; i < specialization_constants.size(); i++) {
        specialization_entries[i].constantID = i;
        specialization_entries[i].offset = i * sizeof(uint32_t);
        specialization_entries[i].size = sizeof(uint32_t);
    }

    vk::SpecializationInfo specialization_info(
        specialization_entries.size(),
        specialization_entries.data(),
        specialization_constants.size() * sizeof(uint32_t),
        specialization_constants.data()
    );

    vk::PipelineShaderStageCreateInfo pipeline_shader_create_info(
            vk::PipelineShaderStageCreateFlags(),
            vk::ShaderStageFlagBits::eCompute,
            pipeline->shader_module,
            entrypoint.c_str(),
            &specialization_info);
    vk::ComputePipelineCreateInfo compute_pipeline_create_info(
        vk::PipelineCreateFlags(),
        pipeline_shader_create_info,
        pipeline->layout);
    pipeline->pipeline = ctx->device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;

    ctx->device->pipelines.push_back(pipeline);
}
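// Dispatch sketch (hypothetical names, for illustration only): a pipeline
// created above is used by binding it with a descriptor set, pushing the
// constants, and dispatching CEIL_DIV(work, wg_denoms) workgroups:
//
//   cmd.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
//   cmd.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipeline->layout, 0, { set }, {});
//   cmd.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, pipeline->push_constant_size, &pc);
//   cmd.dispatch(CEIL_DIV(x, pipeline->wg_denoms[0]), CEIL_DIV(y, pipeline->wg_denoms[1]), CEIL_DIV(z, pipeline->wg_denoms[2]));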

static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
    VK_LOG_DEBUG("ggml_vk_destroy_pipeline(" << pipeline->name << ")");
    for (auto& pool : pipeline->descriptor_pools) {
        device.destroyDescriptorPool(pool);
    }
    pipeline->descriptor_pools.clear();
    pipeline->descriptor_sets.clear();
    pipeline->descriptor_set_idx = 0;

    device.destroyDescriptorSetLayout(pipeline->dsl);

    device.destroyPipelineLayout(pipeline->layout);

    device.destroyShaderModule(pipeline->shader_module);

    device.destroyPipeline(pipeline->pipeline);
}

static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) {
    VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
    if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
        // Enough descriptor sets are available
        return;
    }

    if (ctx->device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) {
        const uint32_t alloc_count = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size();

        std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
        for (uint32_t i = 0; i < alloc_count; i++) {
            layouts[i] = pipeline->dsl;
        }
        vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[0], alloc_count, layouts.data());
        std::vector<vk::DescriptorSet> sets = ctx->device->device.allocateDescriptorSets(descriptor_set_alloc_info);
        pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end());
    } else {
        for (uint32_t i = pipeline->descriptor_sets.size(); i < pipeline->descriptor_set_idx + n; i++) {
            vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
            vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 1, descriptor_pool_size);
            pipeline->descriptor_pools.push_back(ctx->device->device.createDescriptorPool(descriptor_pool_create_info));

            vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[i], 1, &pipeline->dsl);
            std::vector<vk::DescriptorSet> sets = ctx->device->device.allocateDescriptorSets(descriptor_set_alloc_info);
            pipeline->descriptor_sets.push_back(sets[0]);
        }
    }
}

static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
    VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
    pipeline->descriptor_set_idx = 0;
}

static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) {
    VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
    if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
        // Reuse command buffer
        return q.cmd_buffers[q.cmd_buffer_idx++];
    }

    vk::CommandBufferAllocateInfo command_buffer_alloc_info(
        q.pool,
        vk::CommandBufferLevel::ePrimary,
        1);
    const std::vector<vk::CommandBuffer> cmd_buffers = ctx->device->device.allocateCommandBuffers(command_buffer_alloc_info);
    auto buf = cmd_buffers.front();

    q.cmd_buffers.push_back(buf);
    q.cmd_buffer_idx++;

    return buf;
}

static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
    VK_LOG_DEBUG("ggml_vk_create_submission()");
    vk_submission s;
    s.buffer = ggml_vk_create_cmd_buffer(ctx, q);
    s.wait_semaphores = std::move(wait_semaphores);
    s.signal_semaphores = std::move(signal_semaphores);
    return s;
}

static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
    VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")");
    if (ctx->seqs.empty()) {
        return;
    }

    std::vector<std::vector<uint64_t>> tl_wait_vals;
    std::vector<std::vector<uint64_t>> tl_signal_vals;
    std::vector<std::vector<vk::Semaphore>> tl_wait_semaphores;
    std::vector<std::vector<vk::Semaphore>> tl_signal_semaphores;
    std::vector<vk::TimelineSemaphoreSubmitInfo> tl_submit_infos;
    std::vector<vk::SubmitInfo> submit_infos;
    int idx = -1;
    std::vector<std::vector<vk::PipelineStageFlags>> stage_flags;

    size_t reserve = 0;

    for (const auto& sequence : ctx->seqs) {
        reserve += sequence.size();
    }

    // Pre-reserve vectors to prevent reallocation, which invalidates pointers
    tl_wait_semaphores.reserve(reserve);
    tl_wait_vals.reserve(reserve);
    tl_signal_semaphores.reserve(reserve);
    tl_signal_vals.reserve(reserve);
    tl_submit_infos.reserve(reserve);
    submit_infos.reserve(reserve);
    stage_flags.reserve(reserve);

    for (const auto& sequence : ctx->seqs) {
        for (const auto& submission : sequence) {
            stage_flags.push_back({});
            idx++;
            tl_wait_vals.push_back({});
            tl_wait_semaphores.push_back({});
            tl_signal_vals.push_back({});
            tl_signal_semaphores.push_back({});
            for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
                stage_flags[idx].push_back(ctx->q->stage_flags);
                tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
                tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
            }
            for (size_t i = 0; i < submission.signal_semaphores.size(); i++) {
                tl_signal_vals[idx].push_back(submission.signal_semaphores[i].value);
                tl_signal_semaphores[idx].push_back(submission.signal_semaphores[i].s);
            }
            tl_submit_infos.push_back({
                (uint32_t) submission.wait_semaphores.size(),
                tl_wait_vals[idx].data(),
                (uint32_t) submission.signal_semaphores.size(),
                tl_signal_vals[idx].data(),
            });
            tl_submit_infos[idx].sType = vk::StructureType::eTimelineSemaphoreSubmitInfo;
            tl_submit_infos[idx].pNext = nullptr;
            vk::SubmitInfo si{
                (uint32_t) submission.wait_semaphores.size(),
                tl_wait_semaphores[idx].data(),
                stage_flags[idx].data(),
                1,
                &submission.buffer,
                (uint32_t) submission.signal_semaphores.size(),
                tl_signal_semaphores[idx].data(),
            };
            si.setPNext(&tl_submit_infos[idx]);
            submit_infos.push_back(si);
        }
    }

    ctx->q->queue.submit(submit_infos, fence);

    ctx->seqs.clear();
}
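// Because these are timeline semaphores, each vk::SubmitInfo above carries a
// vk::TimelineSemaphoreSubmitInfo with the 64-bit wait/signal values chained
// via pNext. The reserve() calls are load-bearing: the SubmitInfo structs hold
// raw pointers into those vectors, so they must not reallocate while the
// submission list is built.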

static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
    VK_LOG_DEBUG("ggml_vk_find_queue_family_index()");
    const uint32_t qfsize = queue_family_props.size();

    // Try with avoid preferences first
    for (uint32_t i = 0; i < qfsize; i++) {
        if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required && !(queue_family_props[i].queueFlags & avoid)) {
            return i;
        }
    }

    // Fall back to only required
    for (size_t i = 0; i < qfsize; i++) {
        if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required) {
            return i;
        }
    }

    // Fall back to reusing the compute queue family
    for (size_t i = 0; i < qfsize; i++) {
        if (queue_family_props[i].queueCount >= min_num_queues && queue_family_props[i].queueFlags & required) {
            return i;
        }
    }

    // Fall back to ignoring min_num_queues
    for (size_t i = 0; i < qfsize; i++) {
        if (queue_family_props[i].queueFlags & required) {
            return i;
        }
    }

    // All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations.
    // Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional.
    if (compute_index >= 0) {
        return compute_index;
    }

    std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;

    for (auto& q_family : queue_family_props) {
        std::cerr << "Queue number: " + std::to_string(q_family.queueCount) << " flags: " + to_string(q_family.queueFlags) << std::endl;
    }
    abort();
}
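// A typical call (the real call sites are further down in this file; the
// arguments here are illustrative): pick a transfer-capable family while
// avoiding the compute/graphics family, so copies can overlap compute work:
//   ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer,
//       vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 1);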

static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags) {
    VK_LOG_DEBUG("ggml_vk_create_queue()");
    q.queue_family_index = queue_family_index;

    vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
    q.pool = ctx->device->device.createCommandPool(command_pool_create_info_compute);

    q.cmd_buffer_idx = 0;

    q.queue = ctx->device->device.getQueue(queue_family_index, queue_index);

    q.stage_flags = stage_flags;
}

static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
    VK_LOG_DEBUG("ggml_vk_create_context()");
    ctx->gc.contexts.emplace_back();
    // Note: memset-ing the new element would be undefined behavior, since
    // vk_context contains std::vector members; initialize the POD fields explicitly instead.
    vk_context * result = &ctx->gc.contexts.back();
    result->idx = ctx->gc.contexts.size() - 1;
    result->s = nullptr;
    result->exit_tensor = nullptr;
    result->q = &q;
    return result;
}

static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
    VK_LOG_DEBUG("ggml_vk_create_binary_semaphore()");
    vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
    vk::SemaphoreCreateInfo ci{};
    ci.setPNext(&tci);
    vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci);
    ctx->gc.semaphores.push_back({ semaphore, 0 });
    return &ctx->gc.semaphores[ctx->gc.semaphores.size() - 1];
}

static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
    VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
    if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) {
        vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
        vk::SemaphoreCreateInfo ci{};
        ci.setPNext(&tci);
        vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci);
        ctx->gc.tl_semaphores.push_back({ semaphore, 0 });
    }
    return &ctx->gc.tl_semaphores[ctx->semaphore_idx++];
}

static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
    if (ctx->event_idx >= ctx->gc.events.size()) {
        ctx->gc.events.push_back(ctx->device->device.createEvent({}));
    }
    return ctx->gc.events[ctx->event_idx++];
}

static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
    VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
    // Requires command buffers to be done

    ctx->device->device.resetCommandPool(q.pool);
    q.cmd_buffer_idx = 0;
}

static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
    for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
        vk::MemoryType memory_type = mem_props->memoryTypes[i];
        if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
            (flags & memory_type.propertyFlags) == flags &&
            mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
            return i;
        }
    }
    return UINT32_MAX;
}

static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
    VK_LOG_DEBUG("ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")");
    vk_buffer buf = std::make_shared<vk_buffer_struct>();

    if (size == 0) {
        buf->size = 0;
        return buf;
    }

    buf->size = size;
    vk::BufferCreateInfo buffer_create_info{
        vk::BufferCreateFlags(),
        size,
        vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst,
        vk::SharingMode::eExclusive,
        0,
        nullptr,
    };

    buf->buffer = ctx->device->device.createBuffer(buffer_create_info);

    vk::MemoryRequirements mem_req = ctx->device->device.getBufferMemoryRequirements(buf->buffer);

    vk::PhysicalDeviceMemoryProperties mem_props = ctx->device->physical_device.getMemoryProperties();

    uint32_t memory_type_index = UINT32_MAX;

    memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
    buf->memory_property_flags = req_flags;

    if (memory_type_index == UINT32_MAX && fallback_flags) {
        memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
        buf->memory_property_flags = fallback_flags;
    }

    if (memory_type_index == UINT32_MAX) {
        ctx->device->device.destroyBuffer(buf->buffer);
        buf->size = 0;
        throw vk::OutOfDeviceMemoryError("No suitable memory type found");
    }

    try {
        buf->device_memory = ctx->device->device.allocateMemory({ mem_req.size, memory_type_index });
    } catch (const vk::SystemError& e) {
        // Out of Host/Device memory, clean up buffer
        ctx->device->device.destroyBuffer(buf->buffer);
        buf->size = 0;
        throw e;
    }
    buf->ptr = nullptr;

    if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
        buf->ptr = ctx->device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
    }

    ctx->device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);

    buf->ctx = ctx;

    buf->device = ctx->device;

#ifdef GGML_VULKAN_MEMORY_DEBUG
    ctx->memory_logger.log_allocation(buf, size);
#endif

    return buf;
}

static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
    try {
        return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags);
    } catch (const vk::SystemError& e) {
        std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
        throw e;
    }
}

static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
    vk_buffer buf;
    try {
        if (ctx->device->uma) {
            // Fall back to host memory type
            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
        } else {
            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
        }
    } catch (const vk::SystemError& e) {
        std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
        throw e;
    }

    return buf;
}

static void ggml_vk_destroy_buffer(vk_buffer& buf) {
    if (buf == nullptr) {
        return;
    }

#ifdef GGML_VULKAN_MEMORY_DEBUG
    buf->ctx->memory_logger.log_deallocation(buf);
#endif

    buf.reset();
}

static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
    return { buf, 0, VK_WHOLE_SIZE };
}

static void ggml_vk_sync_buffers(vk_context * ctx) {
    VK_LOG_DEBUG("ggml_vk_sync_buffers()");
    const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };

    ctx->s->buffer.pipelineBarrier(
        ctx->q->stage_flags,
        ctx->q->stage_flags,
        {},
        mem_barriers,
        {},
        {}
    );
}
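// This is effectively the bluntest barrier available: all prior memory
// accesses must complete and be visible before any subsequent access on this
// queue. Finer-grained per-buffer barriers would allow more overlap, at the
// cost of tracking which buffers each node touches.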

static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& events) {
    VK_LOG_DEBUG("ggml_vk_wait_events()");
    if (events.empty()) {
        return;
    }

    ctx->s->buffer.waitEvents(
        events,
        ctx->q->stage_flags,
        ctx->q->stage_flags,
        {},
        {},
        {}
    );
}

static bool ggml_vk_build_shader(ggml_type type) {
    switch (type) {
    case GGML_TYPE_F16:
    case GGML_TYPE_Q4_0:
    case GGML_TYPE_Q4_1:
    case GGML_TYPE_Q5_0:
    case GGML_TYPE_Q5_1:
    case GGML_TYPE_Q8_0:
    case GGML_TYPE_Q2_K:
    case GGML_TYPE_Q3_K:
    case GGML_TYPE_Q4_K:
    case GGML_TYPE_Q5_K:
    case GGML_TYPE_Q6_K:
        return true;
    default:
        return false;
    }
}

static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
    VK_LOG_DEBUG("ggml_vk_load_shaders(" << ctx->name << ")");

    const std::shared_ptr<vk_device> device = ctx->device;

    // mulmat
    std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
    std::initializer_list<uint32_t> warptile_m = { 128,  64,  64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
    std::initializer_list<uint32_t> warptile_s = { device->subgroup_size,  32,  32, 16, 32, 32, 2, 2, 2, device->subgroup_size };

    std::initializer_list<uint32_t> warptile_mmq_l = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
    std::initializer_list<uint32_t> warptile_mmq_m = { 128,  64,  64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
    std::initializer_list<uint32_t> warptile_mmq_s = { device->subgroup_size,  32,  32, 32, 32, 32, 2, 2, 2, device->subgroup_size };

    std::array<uint32_t, 3> l_wg_denoms = {128, 128, 1 };
    std::array<uint32_t, 3> m_wg_denoms = { 64,  64, 1 };
    std::array<uint32_t, 3> s_wg_denoms = { 32,  32, 1 };

    uint32_t l_align = 128;
    uint32_t m_align =  64;
    uint32_t s_align =  32;
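    // The warptile lists feed the matmul shaders' specialization constants; the
    // authoritative meaning of each slot is the layout(constant_id = ...) list
    // in the shader source. Roughly: workgroup tile sizes, the K step,
    // warp/thread tiling factors, and the device subgroup size. The *_wg_denoms
    // values must match the output tile each variant covers, and *_align is the
    // granularity the aligned pipelines assume.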

    ctx->device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_matmul_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_matmul_f16 = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();

    ctx->device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_matmul_id_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_matmul_id_f16 = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();

    if (device->fp16) {
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_l, "matmul_f16_aligned_l", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_m, "matmul_f16_aligned_m", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_s, "matmul_f16_aligned_s", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->l, "matmul_f16_f32_l", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->m, "matmul_f16_f32_m", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->s, "matmul_f16_f32_s", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->l, "matmul_q4_0_f32_l", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->m, "matmul_q4_0_f32_m", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->s, "matmul_q4_0_f32_s", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_l, "matmul_q4_0_f32_aligned_l", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_m, "matmul_q4_0_f32_aligned_m", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_s, "matmul_q4_0_f32_aligned_s", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->l, "matmul_q4_1_f32_l", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->m, "matmul_q4_1_f32_m", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->s, "matmul_q4_1_f32_s", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_l, "matmul_q4_1_f32_aligned_l", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_m, "matmul_q4_1_f32_aligned_m", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_s, "matmul_q4_1_f32_aligned_s", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->l, "matmul_q5_0_f32_l", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->m, "matmul_q5_0_f32_m", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->s, "matmul_q5_0_f32_s", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_l, "matmul_q5_0_f32_aligned_l", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_m, "matmul_q5_0_f32_aligned_m", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_s, "matmul_q5_0_f32_aligned_s", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->l, "matmul_q5_1_f32_l", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->m, "matmul_q5_1_f32_m", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->s, "matmul_q5_1_f32_s", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_l, "matmul_q5_1_f32_aligned_l", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_m, "matmul_q5_1_f32_aligned_m", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_s, "matmul_q5_1_f32_aligned_s", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->l, "matmul_q8_0_f32_l", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->m, "matmul_q8_0_f32_m", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1156          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->s, "matmul_q8_0_f32_s", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1157          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1158          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1159          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1160  
1161          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1162          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1163          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1164          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1165          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1166          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1167  
1168          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1169          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1170          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1171          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1172          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1173          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1174  
1175          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1176          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1177          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1178          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1179          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1180          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1181  
1182          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1183          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1184          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1185          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1186          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1187          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1188  
1189          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1190          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1191          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1192          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1193          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1194          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1195  
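        // mul_mat_id ("indirect" matmul) pipelines: the same six-variant
        // scheme, but with 4 descriptor bindings instead of 3 (the extra
        // binding carries the row-id tensor) and vk_mat_mat_id_push_constants.
        // These back GGML_OP_MUL_MAT_ID, which mixture-of-experts models use
        // to route rows through selected experts.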
1196          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
1197          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
1198          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
1199          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
1200          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
1201          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
1202  
1203          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
1204          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
1205          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->s, "matmul_id_f16_s", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
1206          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_l, "matmul_id_f16_aligned_l", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
1207          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_m, "matmul_id_f16_aligned_m", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
1208          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_s, "matmul_id_f16_aligned_s", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
1209  
1210          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->l, "matmul_id_f16_f32_l", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
1211          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->m, "matmul_id_f16_f32_m", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
1212          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->s, "matmul_id_f16_f32_s", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
1213          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_l, "matmul_id_f16_f32_aligned_l", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
1214          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_m, "matmul_id_f16_f32_aligned_m", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
1215          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_s, "matmul_id_f16_f32_aligned_s", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
1216  
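        // Quantized mul_mat_id pipelines, one set per supported quantization
        // type (Q4_0 .. Q6_K), mirroring the dequant_mul_mat_mat sets above.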
1217          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->l, "matmul_id_q4_0_f32_l", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1218          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->m, "matmul_id_q4_0_f32_m", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1219          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->s, "matmul_id_q4_0_f32_s", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1220          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_l, "matmul_id_q4_0_f32_aligned_l", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1221          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_m, "matmul_id_q4_0_f32_aligned_m", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1222          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_s, "matmul_id_q4_0_f32_aligned_s", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1223  
1224          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->l, "matmul_id_q4_1_f32_l", matmul_id_q4_1_f32_len, matmul_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1225          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->m, "matmul_id_q4_1_f32_m", matmul_id_q4_1_f32_len, matmul_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1226          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->s, "matmul_id_q4_1_f32_s", matmul_id_q4_1_f32_len, matmul_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1227          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_l, "matmul_id_q4_1_f32_aligned_l", matmul_id_q4_1_f32_aligned_len, matmul_id_q4_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1228          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_m, "matmul_id_q4_1_f32_aligned_m", matmul_id_q4_1_f32_aligned_len, matmul_id_q4_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1229          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_s, "matmul_id_q4_1_f32_aligned_s", matmul_id_q4_1_f32_aligned_len, matmul_id_q4_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1230  
1231          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->l, "matmul_id_q5_0_f32_l", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1232          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->m, "matmul_id_q5_0_f32_m", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1233          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->s, "matmul_id_q5_0_f32_s", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1234          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_l, "matmul_id_q5_0_f32_aligned_l", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1235          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_m, "matmul_id_q5_0_f32_aligned_m", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1236          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_s, "matmul_id_q5_0_f32_aligned_s", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1237  
1238          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->l, "matmul_id_q5_1_f32_l", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1239          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->m, "matmul_id_q5_1_f32_m", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1240          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->s, "matmul_id_q5_1_f32_s", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1241          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_l, "matmul_id_q5_1_f32_aligned_l", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1242          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_m, "matmul_id_q5_1_f32_aligned_m", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1243          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_s, "matmul_id_q5_1_f32_aligned_s", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1244  
1245          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->l, "matmul_id_q8_0_f32_l", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1246          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->m, "matmul_id_q8_0_f32_m", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1247          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->s, "matmul_id_q8_0_f32_s", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1248          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_l, "matmul_id_q8_0_f32_aligned_l", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1249          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_m, "matmul_id_q8_0_f32_aligned_m", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1250          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_s, "matmul_id_q8_0_f32_aligned_s", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1251  
1252          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->l, "matmul_id_q2_k_f32_l", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1253          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->m, "matmul_id_q2_k_f32_m", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1254          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->s, "matmul_id_q2_k_f32_s", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1255          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_l, "matmul_id_q2_k_f32_aligned_l", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1256          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_m, "matmul_id_q2_k_f32_aligned_m", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1257          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_s, "matmul_id_q2_k_f32_aligned_s", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1258  
1259          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->l, "matmul_id_q3_k_f32_l", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1260          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->m, "matmul_id_q3_k_f32_m", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1261          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->s, "matmul_id_q3_k_f32_s", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1262          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_l, "matmul_id_q3_k_f32_aligned_l", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1263          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_m, "matmul_id_q3_k_f32_aligned_m", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1264          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_s, "matmul_id_q3_k_f32_aligned_s", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1265  
1266          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->l, "matmul_id_q4_k_f32_l", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1267          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->m, "matmul_id_q4_k_f32_m", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1268          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->s, "matmul_id_q4_k_f32_s", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1269          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_l, "matmul_id_q4_k_f32_aligned_l", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1270          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_m, "matmul_id_q4_k_f32_aligned_m", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1271          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_s, "matmul_id_q4_k_f32_aligned_s", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1272  
1273          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->l, "matmul_id_q5_k_f32_l", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1274          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->m, "matmul_id_q5_k_f32_m", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1275          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->s, "matmul_id_q5_k_f32_s", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1276          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_l, "matmul_id_q5_k_f32_aligned_l", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1277          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_m, "matmul_id_q5_k_f32_aligned_m", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1278          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_s, "matmul_id_q5_k_f32_aligned_s", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1279  
1280          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->l, "matmul_id_q6_k_f32_l", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1281          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->m, "matmul_id_q6_k_f32_m", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1282          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->s, "matmul_id_q6_k_f32_s", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1283          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1284          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1285          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1286      } else {
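        // Fallback: the same pipeline sets as above, but built from the *_fp32
        // shader blobs, which keep all shader arithmetic in fp32. This branch
        // is presumably taken when the device cannot use the fp16 shader
        // variants.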
1287          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1288          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1289          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
1290          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
1291          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1292          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1293  
1294          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1295          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1296          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
1297          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
1298          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1299          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1300  
1301          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1302          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1303          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
1304          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_l, "matmul_f16_aligned_l", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
1305          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_m, "matmul_f16_aligned_m", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1306          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_s, "matmul_f16_aligned_s", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1307  
1308          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->l, "matmul_f16_f32_l", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1309          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->m, "matmul_f16_f32_m", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1310          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->s, "matmul_f16_f32_s", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
1311          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
1312          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1313          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1314  
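        // fp32 variants of the quantized mul_mat_mat pipelines (Q4_0 .. Q6_K).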
1315          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->l, "matmul_q4_0_f32_l", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1316          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->m, "matmul_q4_0_f32_m", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1317          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->s, "matmul_q4_0_f32_s", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1318          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_l, "matmul_q4_0_f32_aligned_l", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1319          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_m, "matmul_q4_0_f32_aligned_m", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1320          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_s, "matmul_q4_0_f32_aligned_s", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1321  
1322          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->l, "matmul_q4_1_f32_l", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1323          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->m, "matmul_q4_1_f32_m", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1324          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->s, "matmul_q4_1_f32_s", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1325          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_l, "matmul_q4_1_f32_aligned_l", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1326          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_m, "matmul_q4_1_f32_aligned_m", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1327          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_s, "matmul_q4_1_f32_aligned_s", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1328  
1329          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->l, "matmul_q5_0_f32_l", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1330          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->m, "matmul_q5_0_f32_m", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1331          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->s, "matmul_q5_0_f32_s", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1332          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_l, "matmul_q5_0_f32_aligned_l", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1333          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_m, "matmul_q5_0_f32_aligned_m", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1334          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_s, "matmul_q5_0_f32_aligned_s", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1335  
1336          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->l, "matmul_q5_1_f32_l", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1337          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->m, "matmul_q5_1_f32_m", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1338          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->s, "matmul_q5_1_f32_s", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1339          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_l, "matmul_q5_1_f32_aligned_l", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1340          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_m, "matmul_q5_1_f32_aligned_m", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1341          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_s, "matmul_q5_1_f32_aligned_s", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1342  
1343          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->l, "matmul_q8_0_f32_l", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1344          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->m, "matmul_q8_0_f32_m", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1345          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->s, "matmul_q8_0_f32_s", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1346          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1347          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1348          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1349  
1350          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1351          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1352          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1353          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1354          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1355          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1356  
1357          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1358          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1359          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1360          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1361          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1362          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1363  
1364          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1365          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1366          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1367          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1368          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1369          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1370  
1371          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1372          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1373          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1374          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1375          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1376          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1377  
1378          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1379          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1380          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1381          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1382          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1383          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1384  
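        // fp32 variants of the mul_mat_id pipelines.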
1385          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
1386          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
1387          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
1388          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
1389          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
1390          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
1391  
1392          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
1393          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
1394          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->s, "matmul_id_f16_s", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
1395          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_l, "matmul_id_f16_aligned_l", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
1396          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_m, "matmul_id_f16_aligned_m", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
1397          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_s, "matmul_id_f16_aligned_s", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
1398  
1399          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->l, "matmul_id_f16_f32_l", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
1400          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->m, "matmul_id_f16_f32_m", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
1401          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->s, "matmul_id_f16_f32_s", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
1402          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_l, "matmul_id_f16_f32_aligned_l", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
1403          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_m, "matmul_id_f16_f32_aligned_m", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
1404          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_s, "matmul_id_f16_f32_aligned_s", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
1405  
1406          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->l, "matmul_id_q4_0_f32_l", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1407          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->m, "matmul_id_q4_0_f32_m", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1408          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->s, "matmul_id_q4_0_f32_s", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1409          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_l, "matmul_id_q4_0_f32_aligned_l", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1410          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_m, "matmul_id_q4_0_f32_aligned_m", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1411          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_s, "matmul_id_q4_0_f32_aligned_s", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1412  
1413          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->l, "matmul_id_q4_1_f32_l", matmul_id_q4_1_f32_fp32_len, matmul_id_q4_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1414          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->m, "matmul_id_q4_1_f32_m", matmul_id_q4_1_f32_fp32_len, matmul_id_q4_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1415          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->s, "matmul_id_q4_1_f32_s", matmul_id_q4_1_f32_fp32_len, matmul_id_q4_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1416          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_l, "matmul_id_q4_1_f32_aligned_l", matmul_id_q4_1_f32_aligned_fp32_len, matmul_id_q4_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1417          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_m, "matmul_id_q4_1_f32_aligned_m", matmul_id_q4_1_f32_aligned_fp32_len, matmul_id_q4_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1418          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_s, "matmul_id_q4_1_f32_aligned_s", matmul_id_q4_1_f32_aligned_fp32_len, matmul_id_q4_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1419  
1420          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->l, "matmul_id_q5_0_f32_l", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1421          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->m, "matmul_id_q5_0_f32_m", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1422          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->s, "matmul_id_q5_0_f32_s", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1423          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_l, "matmul_id_q5_0_f32_aligned_l", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1424          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_m, "matmul_id_q5_0_f32_aligned_m", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1425          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_s, "matmul_id_q5_0_f32_aligned_s", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1426  
1427          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->l, "matmul_id_q5_1_f32_l", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1428          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->m, "matmul_id_q5_1_f32_m", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1429          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->s, "matmul_id_q5_1_f32_s", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1430          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_l, "matmul_id_q5_1_f32_aligned_l", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1431          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_m, "matmul_id_q5_1_f32_aligned_m", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1432          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_s, "matmul_id_q5_1_f32_aligned_s", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1433  
1434          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->l, "matmul_id_q8_0_f32_l", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1435          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->m, "matmul_id_q8_0_f32_m", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1436          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->s, "matmul_id_q8_0_f32_s", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1437          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_l, "matmul_id_q8_0_f32_aligned_l", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1438          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_m, "matmul_id_q8_0_f32_aligned_m", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1439          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_s, "matmul_id_q8_0_f32_aligned_s", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1440  
1441          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->l, "matmul_id_q2_k_f32_l", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1442          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->m, "matmul_id_q2_k_f32_m", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1443          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->s, "matmul_id_q2_k_f32_s", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1444          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_l, "matmul_id_q2_k_f32_aligned_l", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1445          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_m, "matmul_id_q2_k_f32_aligned_m", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1446          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_s, "matmul_id_q2_k_f32_aligned_s", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1447  
1448          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->l, "matmul_id_q3_k_f32_l", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1449          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->m, "matmul_id_q3_k_f32_m", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1450          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->s, "matmul_id_q3_k_f32_s", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1451          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_l, "matmul_id_q3_k_f32_aligned_l", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1452          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_m, "matmul_id_q3_k_f32_aligned_m", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1453          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_s, "matmul_id_q3_k_f32_aligned_s", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1454  
1455          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->l, "matmul_id_q4_k_f32_l", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1456          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->m, "matmul_id_q4_k_f32_m", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1457          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->s, "matmul_id_q4_k_f32_s", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1458          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_l, "matmul_id_q4_k_f32_aligned_l", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1459          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_m, "matmul_id_q4_k_f32_aligned_m", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1460          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_s, "matmul_id_q4_k_f32_aligned_s", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1461  
1462          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->l, "matmul_id_q5_k_f32_l", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1463          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->m, "matmul_id_q5_k_f32_m", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1464          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->s, "matmul_id_q5_k_f32_s", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1465          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_l, "matmul_id_q5_k_f32_aligned_l", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1466          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_m, "matmul_id_q5_k_f32_aligned_m", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1467          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_s, "matmul_id_q5_k_f32_aligned_s", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1468  
1469          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->l, "matmul_id_q6_k_f32_l", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1470          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->m, "matmul_id_q6_k_f32_m", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1471          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->s, "matmul_id_q6_k_f32_s", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1472          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1473          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1474          ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1475      }
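          // Note: the shaders above all use the *_fp32 SPIR-V data, presumably the fallback
          // path for devices without usable fp16. Each matmul family has l/m/s warptile
          // variants plus a_l/a_m/a_s "aligned" variants that appear to assume the
          // pipeline's align granularity.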
1476  
1477      // mul mat vec
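          // One mat-vec pipeline per source type. wg_denoms of {1, 1, 1} leaves the dispatch
          // size entirely to the caller; { device->subgroup_size } appears to be passed via the
          // specialization-constant slot so the workgroup width can match the hardware subgroup.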
1478      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32",  mul_mat_vec_f32_f32_f32_len,  mul_mat_vec_f32_f32_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1479      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32",  mul_mat_vec_f16_f32_f32_len,  mul_mat_vec_f16_f32_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1480      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1481      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1482      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1483      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1484      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1485      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1486      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1487      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1488      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1489      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1490  
1491      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32",  mul_mat_vec_f32_f16_f32_len,  mul_mat_vec_f32_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1492      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32",  mul_mat_vec_f16_f16_f32_len,  mul_mat_vec_f16_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1493      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1494      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1495      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1496      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1497      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1498      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1499      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1500      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1501      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1502      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1503  
1504      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32",  mul_mat_vec_id_f32_f32_len,  mul_mat_vec_id_f32_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1505      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32",  mul_mat_vec_id_f16_f32_len,  mul_mat_vec_id_f16_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1506      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1507      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1508      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1509      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1510      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1511      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1512      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1513      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1514      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1515      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1516  
1517      // dequant shaders
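          // Whole-tensor dequantization, presumably to f16 (the F32 entry is literally named
          // "f32_to_f16") for the dequantize-then-matmul fallback path; each shader binds
          // 2 buffers and takes five u32 push constants.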
1518      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16",   dequant_f32_len,  dequant_f32_data,  "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
1519      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_0], "dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
1520      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_1], "dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
1521      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
1522      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
1523      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
1524      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_k", dequant_q2_k_len, dequant_q2_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
1525      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_k", dequant_q3_k_len, dequant_q3_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
1526      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
1527      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
1528      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
1529  
1530      // get_rows
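          // Two sets per type, following the dst-type suffix convention of the mat-vec shaders
          // above: plain get_rows_* and get_rows_*_f32, the latter presumably writing f32 output.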
1531      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32",  get_rows_f32_len,  get_rows_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
1532      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16",  get_rows_f16_len,  get_rows_f16_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
1533      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1534      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1535      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1536      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1537      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1538  
1539      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32",  get_rows_f32_f32_len,  get_rows_f32_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
1540      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32",  get_rows_f16_f32_len,  get_rows_f16_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
1541      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1542      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1543      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1544      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1545      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1546  
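          // Reduction pass for split-k matmuls: sums the per-split partial outputs into the final result.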
1547      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
1548  
1549      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
1550      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
1551  
1552      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
1553      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
1554  
1555      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1556      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1557      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1558  
1559      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
1560  
1561      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
1562  
1563      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
1564  
1565      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1566  
1567      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1568  
1569      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1570  
1571      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1572      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1573      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1574  
1575      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
1576  
1577      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
1578      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
1579  
1580      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1581      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1582  
1583      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1584      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1585  
1586      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
1587  
1588      ggml_vk_create_pipeline(ctx, ctx->device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1589  }
1590  
1591  static void ggml_vk_print_gpu_info(size_t idx) {
1592      GGML_ASSERT(idx < vk_instance.device_indices.size());
1593      size_t dev_num = vk_instance.device_indices[idx];
1594      VK_LOG_DEBUG("ggml_vk_print_gpu_info(" << dev_num << ")");
1595      GGML_ASSERT(vk_instance.initialized);
1596  
1597      std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
1598  
1599      if (dev_num >= devices.size()) {
1600          std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl;
1601          throw std::runtime_error("Device not found");
1602      }
1603  
1604      vk::PhysicalDevice physical_device = devices[dev_num];
1605      std::vector<vk::ExtensionProperties> ext_props = physical_device.enumerateDeviceExtensionProperties();
1606  
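          // Chain the property structs through pNext so a single getProperties2() call fills
          // the core properties, maintenance3, subgroup and driver info at once.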
1607      vk::PhysicalDeviceProperties2 props2;
1608      vk::PhysicalDeviceMaintenance3Properties props3;
1609      vk::PhysicalDeviceSubgroupProperties subgroup_props;
1610      vk::PhysicalDeviceDriverProperties driver_props;
1611      props2.pNext = &props3;
1612      props3.pNext = &subgroup_props;
1613      subgroup_props.pNext = &driver_props;
1614      physical_device.getProperties2(&props2);
1615  
1616      const size_t subgroup_size = subgroup_props.subgroupSize;
1617      const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
1618  
1619      bool fp16_storage = false;
1620      bool fp16_compute = false;
1621  
1622      for (auto properties : ext_props) {
1623          if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
1624              fp16_storage = true;
1625          } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
1626              fp16_compute = true;
1627          }
1628      }
1629  
1630      const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16");
1631      bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr;
1632  
1633      bool fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
1634  
1635      vk::PhysicalDeviceFeatures device_features = physical_device.getFeatures();
1636  
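          // Same pNext pattern for features: chain the Vulkan 1.1 and 1.2 feature structs behind
          // VkPhysicalDeviceFeatures2; vk12_features.shaderFloat16 gates fp16 support below.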
1637      VkPhysicalDeviceFeatures2 device_features2;
1638      device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
1639      device_features2.pNext = nullptr;
1640      device_features2.features = (VkPhysicalDeviceFeatures)device_features;
1641  
1642      VkPhysicalDeviceVulkan11Features vk11_features;
1643      vk11_features.pNext = nullptr;
1644      vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
1645      device_features2.pNext = &vk11_features;
1646  
1647      VkPhysicalDeviceVulkan12Features vk12_features;
1648      vk12_features.pNext = nullptr;
1649      vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
1650      vk11_features.pNext = &vk12_features;
1651  
1652      vkGetPhysicalDeviceFeatures2(physical_device, &device_features2);
1653  
1654      fp16 = fp16 && vk12_features.shaderFloat16;
1655  
1656      std::string device_name = props2.properties.deviceName.data();
1657      std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
1658  
1659      if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
1660          std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
1661      }
1662  }
1663  
1664  static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
1665  static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
1666  
1667  void ggml_vk_instance_init() {
1668      if (vk_instance_initialized) {
1669          return;
1670      }
1671      VK_LOG_DEBUG("ggml_vk_instance_init()");
1672  
1673      vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
1674  
1675      const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
1676      const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
1677  #ifdef __APPLE__
1678      const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
1679  #endif
1680  
1681      std::vector<const char*> layers;
1682  
1683      if (validation_ext) {
1684          layers.push_back("VK_LAYER_KHRONOS_validation");
1685      }
1686      std::vector<const char*> extensions;
1687      if (validation_ext) {
1688          extensions.push_back("VK_EXT_validation_features");
1689      }
1690  #ifdef __APPLE__
1691      if (portability_enumeration_ext) {
1692          extensions.push_back("VK_KHR_portability_enumeration");
1693      }
1694  #endif
1695      vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
1696  #ifdef __APPLE__
1697      if (portability_enumeration_ext) {
1698          instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
1699      }
1700  #endif
1701  
1702      std::vector<vk::ValidationFeatureEnableEXT> features_enable;
1703      vk::ValidationFeaturesEXT validation_features;
1704  
1705      if (validation_ext) {
1706          features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
1707          validation_features = {
1708              features_enable,
1709              {},
1710          };
1711          validation_features.setPNext(nullptr);
1712          instance_create_info.setPNext(&validation_features);
1713  
1714          std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
1715      }
1716      vk_instance.instance = vk::createInstance(instance_create_info);
1717  
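          // Reset the per-device initialized flags for all GGML_VK_MAX_DEVICES slots.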
1718      memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
1719  
1720      size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
1721  
1722      // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
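          // e.g. GGML_VK_VISIBLE_DEVICES=0,2 exposes only physical devices 0 and 2, in that order.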
1723      char * devices_env = getenv("GGML_VK_VISIBLE_DEVICES");
1724      if (devices_env != nullptr) {
1725          std::string devices(devices_env);
1726          std::replace(devices.begin(), devices.end(), ',', ' ');
1727  
1728          std::stringstream ss(devices);
1729          size_t tmp;
1730          while (ss >> tmp) {
1731              if (tmp >= num_available_devices) {
1732                  std::cerr << "ggml_vulkan: Invalid device index " << tmp << " in GGML_VK_VISIBLE_DEVICES." << std::endl;
1733                  throw std::runtime_error("Invalid Vulkan device index");
1734              }
1735              vk_instance.device_indices.push_back(tmp);
1736          }
1737      } else {
1738          std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
1739  
1740          // Make sure at least one device exists
1741          if (devices.empty()) {
1742              std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
1743              GGML_ASSERT(false);
1744          }
1745  
1746          // Default to using all dedicated GPUs
1747          for (size_t i = 0; i < devices.size(); i++) {
1748              vk::PhysicalDeviceProperties props = devices[i].getProperties();
1749  
1750              if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
1751                  // Check if there are two physical devices corresponding to the same GPU
1752                  auto old_device = std::find_if(
1753                      vk_instance.device_indices.begin(),
1754                      vk_instance.device_indices.end(),
1755                      [&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; }
1756                  );
1757                  if (old_device == vk_instance.device_indices.end()) {
1758                      vk_instance.device_indices.push_back(i);
1759                  } else {
1760                      // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
1761                      // This can cause errors when splitting layers across the devices, so keep only one
1762                      VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same device id");
1763  
1764                      vk::PhysicalDeviceProperties2 old_prop;
1765                      vk::PhysicalDeviceDriverProperties old_driver;
1766                      old_prop.pNext = &old_driver;
1767                      devices[*old_device].getProperties2(&old_prop);
1768  
1769                      vk::PhysicalDeviceProperties2 new_prop;
1770                      vk::PhysicalDeviceDriverProperties new_driver;
1771                      new_prop.pNext = &new_driver;
1772                      devices[i].getProperties2(&new_prop);
1773  
1774                      std::map<vk::DriverId, int> driver_priorities {};
1775                      int old_priority = std::numeric_limits<int>::max();
1776                      int new_priority = std::numeric_limits<int>::max();
1777  
1778                      // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver ids
1779                      // Smaller number -> higher priority
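                            // e.g. for AMD this prefers RADV over AMDVLK (eAmdOpenSource) over the proprietary driver.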
1780                      switch (old_prop.properties.vendorID) {
1781                          case VK_VENDOR_ID_AMD:
1782                              driver_priorities[vk::DriverId::eMesaRadv] = 1;
1783                              driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
1784                              driver_priorities[vk::DriverId::eAmdProprietary] = 3;
1785                              break;
1786                          case VK_VENDOR_ID_INTEL:
1787                              driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
1788                              driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
1789                              break;
1790                          case VK_VENDOR_ID_NVIDIA:
1791                              driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
1792  #if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
1793                              driver_priorities[vk::DriverId::eMesaNvk] = 2;
1794  #endif
1795                              break;
1796                      }
1797  
1798                      if (driver_priorities.count(old_driver.driverID)) {
1799                          old_priority = driver_priorities[old_driver.driverID];
1800                      }
1801                      if (driver_priorities.count(new_driver.driverID)) {
1802                          new_priority = driver_priorities[new_driver.driverID];
1803                      }
1804  
1805                      if (new_priority < old_priority) {
1806                          auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
1807                          vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
1808                          vk_instance.device_indices.push_back(i);
1809  
1810                          VK_LOG_DEBUG("Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName);
1811                      }
1812                      else {
1813                          VK_LOG_DEBUG("Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName);
1814                      }
1815                  }
1816              }
1817          }
1818  
1819          // If no dedicated GPUs found, fall back to GPU 0
1820          if (vk_instance.device_indices.empty()) {
1821              vk_instance.device_indices.push_back(0);
1822          }
1823      }
1824  
1825      std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl;
1826  
1827      for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
1828          ggml_vk_print_gpu_info(i);
1829      }
1830  
1831      vk_instance_initialized = true;
1832  }
1833  
1834  static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
1835      GGML_ASSERT(idx < vk_instance.device_indices.size());
1836      size_t dev_num = vk_instance.device_indices[idx];
1837      VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << dev_num << ")");
1838      ggml_vk_instance_init();
1839  
1840      std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
1841  
1842      if (dev_num >= devices.size()) {
1843          std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl;
1844          throw std::runtime_error("Device not found");
1845      }
1846  
1847      ctx->device = ggml_vk_get_device(idx);
1848      if (!ctx->device->initialized) {
1849          ctx->device->physical_device = devices[dev_num];
1850          const std::vector<vk::ExtensionProperties> ext_props = ctx->device->physical_device.enumerateDeviceExtensionProperties();
1851  
1852          bool maintenance4_support = false;
1853  
1854          // Check if maintenance4 is supported
1855          for (const auto& properties : ext_props) {
1856              if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
1857                  maintenance4_support = true;
1858              }
1859          }
1860  
1861          vk::PhysicalDeviceProperties2 props2;
1862          vk::PhysicalDeviceMaintenance3Properties props3;
1863          vk::PhysicalDeviceMaintenance4Properties props4;
1864          vk::PhysicalDeviceSubgroupProperties subgroup_props;
1865          props2.pNext = &props3;
1866          props3.pNext = &subgroup_props;
1867          if (maintenance4_support) {
1868              subgroup_props.pNext = &props4;
1869          }
1870          ctx->device->physical_device.getProperties2(&props2);
1871          ctx->device->properties = props2.properties;
1872  
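              // The maximum allocation size can be overridden (value in bytes) via the
              // GGML_VK_FORCE_MAX_ALLOCATION_SIZE environment variable.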
1873          const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");
1874  
1875          if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) {
1876              ctx->device->max_memory_allocation_size = std::stoull(GGML_VK_FORCE_MAX_ALLOCATION_SIZE);
1877          } else if (maintenance4_support) {
1878              ctx->device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize);
1879          } else {
1880              ctx->device->max_memory_allocation_size = props3.maxMemoryAllocationSize;
1881          }
1882  
1883          ctx->device->vendor_id = ctx->device->properties.vendorID;
1884          ctx->device->subgroup_size = subgroup_props.subgroupSize;
1885          ctx->device->uma = ctx->device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
1886  
1887          bool fp16_storage = false;
1888          bool fp16_compute = false;
1889  
1890          for (const auto& properties : ext_props) {
1891              if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
1892                  fp16_storage = true;
1893              } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
1894                  fp16_compute = true;
1895              }
1896          }
1897  
1898          const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16");
1899          const bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr;
1900  
1901          ctx->device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
1902  
1903          std::vector<vk::QueueFamilyProperties> queue_family_props = ctx->device->physical_device.getQueueFamilyProperties();
1904  
1905          // Try to find a non-graphics compute queue and transfer-focused queues
1906          const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1);
1907          const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 1);
1908  
1909          const float priorities[] = { 1.0f, 1.0f };
1910          ctx->device->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1;
1911  
1912          std::vector<vk::DeviceQueueCreateInfo> device_queue_create_infos;
1913          if (compute_queue_family_index != transfer_queue_family_index) {
1914              device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities});
1915              device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1, priorities + 1});
1916          } else if (!ctx->device->single_queue) {
1917              device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 2, priorities});
1918          } else {
1919              device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities});
1920          }
1921          vk::DeviceCreateInfo device_create_info;
1922          std::vector<const char *> device_extensions;
1923          vk::PhysicalDeviceFeatures device_features = ctx->device->physical_device.getFeatures();
1924  
1925          VkPhysicalDeviceFeatures2 device_features2;
1926          device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
1927          device_features2.pNext = nullptr;
1928          device_features2.features = (VkPhysicalDeviceFeatures)device_features;
1929  
1930          VkPhysicalDeviceVulkan11Features vk11_features;
1931          vk11_features.pNext = nullptr;
1932          vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
1933          device_features2.pNext = &vk11_features;
1934  
1935          VkPhysicalDeviceVulkan12Features vk12_features;
1936          vk12_features.pNext = nullptr;
1937          vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
1938          vk11_features.pNext = &vk12_features;
1939  
1940          vkGetPhysicalDeviceFeatures2(ctx->device->physical_device, &device_features2);
1941  
1942          ctx->device->fp16 = ctx->device->fp16 && vk12_features.shaderFloat16;
1943  
1944          if (!vk11_features.storageBuffer16BitAccess) {
1945              std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl;
1946              throw std::runtime_error("Unsupported device");
1947          }
1948  
1949          device_extensions.push_back("VK_KHR_16bit_storage");
1950  
1951  #ifdef GGML_VULKAN_VALIDATE
1952          device_extensions.push_back("VK_KHR_shader_non_semantic_info");
1953  #endif
1954  
1955          if (ctx->device->fp16) {
1956              device_extensions.push_back("VK_KHR_shader_float16_int8");
1957          }
1958          ctx->device->name = ctx->device->properties.deviceName.data();
1959  
1960          device_create_info = {
1961              vk::DeviceCreateFlags(),
1962              device_queue_create_infos,
1963              {},
1964              device_extensions
1965          };
1966          device_create_info.setPNext(&device_features2);
1967          ctx->device->device = ctx->device->physical_device.createDevice(device_create_info);
1968  
1969          ctx->device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN;
1970  
1971          // Queues
1972          ggml_vk_create_queue(ctx, ctx->device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer });
1973  
1974          // Shaders
1975          ggml_vk_load_shaders(ctx);
1976  
1977          if (!ctx->device->single_queue) {
1978              const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
1979              ggml_vk_create_queue(ctx, ctx->device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer });
1980          } else {
1981              // TODO: Use pointer or reference to avoid copy
1982              ctx->device->transfer_queue = ctx->device->compute_queue;
1983          }
1984  
1985          ctx->device->idx = dev_num;
1986          ctx->device->initialized = true;
1987      } else if (ctx->device->idx != dev_num) {
1988          std::cerr << "ggml_vulkan: Device " << ctx->device->name << " already initialized with index " << ctx->device->idx << ", but trying to reinitialize with index " << dev_num << std::endl;
1989          throw std::runtime_error("Device already initialized");
1990      }
1991  
1992      ctx->fence = ctx->device->device.createFence({});
1993  
1994      ctx->compute_ctx = nullptr;
1995      ctx->transfer_ctx = nullptr;
1996  
1997      ctx->initialized = true;
1998  
1999      ctx->idx = idx;
2000  
2001  #ifdef GGML_VULKAN_CHECK_RESULTS
2002      const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
2003      vk_skip_checks = (skip_checks == nullptr ? 0 : atoi(skip_checks));
2004      const char* output_tensor = getenv("GGML_VULKAN_OUTPUT_TENSOR");
2005      vk_output_tensor = (output_tensor == nullptr ? 0 : atoi(output_tensor));
2006  #endif
2007  }
2008  
2009  static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
2010      VK_LOG_DEBUG("ggml_vk_get_to_fp16()");
2011      switch (type) {
2012          case GGML_TYPE_F32:
2013          case GGML_TYPE_Q4_0:
2014          case GGML_TYPE_Q4_1:
2015          case GGML_TYPE_Q5_0:
2016          case GGML_TYPE_Q5_1:
2017          case GGML_TYPE_Q8_0:
2018          case GGML_TYPE_Q2_K:
2019          case GGML_TYPE_Q3_K:
2020          case GGML_TYPE_Q4_K:
2021          case GGML_TYPE_Q5_K:
2022          case GGML_TYPE_Q6_K:
2023              break;
2024          default:
2025              return nullptr;
2026      }
2027  
2028      return ctx->device->pipeline_dequant[type];
2029  }
2030  
2031  static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
2032      VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()");
2033      if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
2034          return ctx->device->pipeline_matmul_f32;
2035      }
2036      if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
2037          return ctx->device->pipeline_matmul_f32_f16;
2038      }
2039      if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
2040          return ctx->device->pipeline_matmul_f16_f32;
2041      }
2042      if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
2043          return ctx->device->pipeline_matmul_f16;
2044      }
2045  
2046      GGML_ASSERT(src1_type == GGML_TYPE_F32);
2047  
2048      switch (src0_type) {
2049          case GGML_TYPE_Q4_0:
2050          case GGML_TYPE_Q4_1:
2051          case GGML_TYPE_Q5_0:
2052          case GGML_TYPE_Q5_1:
2053          case GGML_TYPE_Q8_0:
2054          case GGML_TYPE_Q2_K:
2055          case GGML_TYPE_Q3_K:
2056          case GGML_TYPE_Q4_K:
2057          case GGML_TYPE_Q5_K:
2058          case GGML_TYPE_Q6_K:
2059              break;
2060          default:
2061              return nullptr;
2062      }
2063  
2064      return ctx->device->pipeline_dequant_mul_mat_mat[src0_type];
2065  }
2066  
2067  static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
2068      VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
2069      GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);
2070  
2071      switch (a_type) {
2072          case GGML_TYPE_F32:
2073          case GGML_TYPE_F16:
2074          case GGML_TYPE_Q4_0:
2075          case GGML_TYPE_Q4_1:
2076          case GGML_TYPE_Q5_0:
2077          case GGML_TYPE_Q5_1:
2078          case GGML_TYPE_Q8_0:
2079          case GGML_TYPE_Q2_K:
2080          case GGML_TYPE_Q3_K:
2081          case GGML_TYPE_Q4_K:
2082          case GGML_TYPE_Q5_K:
2083          case GGML_TYPE_Q6_K:
2084              break;
2085          default:
2086              return nullptr;
2087      }
2088  
2089      return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[a_type] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[a_type];
2090  }
2091  
2092  static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
2093      VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()");
2094      if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
2095          return ctx->device->pipeline_matmul_id_f32;
2096      }
2097      if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
2098          return ctx->device->pipeline_matmul_id_f16_f32;
2099      }
2100      if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
2101          return ctx->device->pipeline_matmul_id_f16;
2102      }
2103  
2104      GGML_ASSERT(src1_type == GGML_TYPE_F32);
2105  
2106      switch (src0_type) {
2107          case GGML_TYPE_Q4_0:
2108          case GGML_TYPE_Q4_1:
2109          case GGML_TYPE_Q5_0:
2110          case GGML_TYPE_Q5_1:
2111          case GGML_TYPE_Q8_0:
2112          case GGML_TYPE_Q2_K:
2113          case GGML_TYPE_Q3_K:
2114          case GGML_TYPE_Q4_K:
2115          case GGML_TYPE_Q5_K:
2116          case GGML_TYPE_Q6_K:
2117              break;
2118          default:
2119              return nullptr;
2120      }
2121  
2122      return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type];
2123  }
2124  
2125  static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
2126      VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec_id()");
2127      GGML_ASSERT(b_type == GGML_TYPE_F32);
2128  
2129      switch (a_type) {
2130          case GGML_TYPE_F32:
2131          case GGML_TYPE_F16:
2132          case GGML_TYPE_Q4_0:
2133          case GGML_TYPE_Q4_1:
2134          case GGML_TYPE_Q5_0:
2135          case GGML_TYPE_Q5_1:
2136          case GGML_TYPE_Q8_0:
2137          case GGML_TYPE_Q2_K:
2138          case GGML_TYPE_Q3_K:
2139          case GGML_TYPE_Q4_K:
2140          case GGML_TYPE_Q5_K:
2141          case GGML_TYPE_Q6_K:
2142              break;
2143          default:
2144              return nullptr;
2145      }
2146  
2147      return ctx->device->pipeline_dequant_mul_mat_vec_id_f32[a_type];
2148  }
2149  
2150  static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
2151      VK_LOG_DEBUG("ggml_vk_pool_malloc(" << size << ")");
2152      VK_LOG_MEMORY("ggml_vk_pool_malloc");
2153  
2154      int best_i = -1;
2155      size_t best_size = std::numeric_limits<size_t>::max(); // smallest unused buffer that fits our needs
2156      int worst_i = -1;
2157      size_t worst_size = 0; // largest unused buffer seen so far
2158      for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
2159          vk_buffer &b = ctx->buffer_pool[i];
2160          if (b != nullptr && b->size >= size && b->size < best_size) {
2161              best_i = i;
2162              best_size = b->size;
2163          }
2164          if (b != nullptr && b->size > worst_size) {
2165              worst_i = i;
2166              worst_size = b->size;
2167          }
2168      }
2169      if (best_i != -1) {
2170          // found the smallest buffer that fits our needs
2171          vk_buffer b = ctx->buffer_pool[best_i];
2172          ctx->buffer_pool[best_i].reset();
2173          return b;
2174      }
2175      if (worst_i != -1) {
2176          // no unused buffer is large enough: free the largest one to limit memory use before allocating a new buffer
2177          vk_buffer& b = ctx->buffer_pool[worst_i];
2178          ggml_vk_destroy_buffer(b);
2179      }
2180  
2181      return ggml_vk_create_buffer_device(ctx, size);
2182  }
2183  
2184  static void ggml_vk_pool_free(ggml_backend_vk_context * ctx, vk_buffer& buffer) {
2185      VK_LOG_DEBUG("ggml_vk_pool_free(" << buffer->size << ")");
2186      for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
2187          vk_buffer& b = ctx->buffer_pool[i];
2188          if (b == nullptr) {
2189              b = buffer;
2190              return;
2191          }
2192      }
2193      std::cerr << "ggml_vulkan: WARNING: vk buffer pool full, increase MAX_VK_BUFFERS" << std::endl;
2194      ggml_vk_destroy_buffer(buffer);
2195  }
2196  
2197  // Returns an available temporary buffer. It is only valid temporarily: the buffer stays in ctx->gc.temp_buffers and will be reused.
2198  static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_t size) {
2199      // Try to find existing temp buffer with enough capacity
2200      for (auto& buffer : ctx->gc.temp_buffers) {
2201          if (buffer->size >= size) {
2202              return buffer;
2203          }
2204      }
2205  
2206      VK_LOG_MEMORY("ggml_vk_create_buffer_temp(" << size << ")");
2207  
2208      // Otherwise create new buffer
2209      vk_buffer buf = ggml_vk_pool_malloc(ctx, size);
2210      ctx->gc.temp_buffers.push_back(buf);
2211  
2212      return buf;
2213  }
2214  
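      // Allocates pinned (host-visible, preferably cached) memory and registers it in
      // ctx->pinned_memory so that ggml_vk_host_get can find it for async transfers.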
2215  static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
2216      VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
2217      vk_buffer buf = ggml_vk_create_buffer(ctx, size,
2218          vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
2219          vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
2220  
2221      if (!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
2222          fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
2223              size/1024.0/1024.0);
2224          ctx->device->device.freeMemory(buf->device_memory);
2225          ctx->device->device.destroyBuffer(buf->buffer);
2226          return nullptr;
2227      }
2228  
2229      ctx->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf));
2230  
2231      return buf->ptr;
2232  }
2233  
2234  static void ggml_vk_host_free(ggml_backend_vk_context * ctx, void* ptr) {
2235      if (ptr == nullptr) {
2236          return;
2237      }
2238      VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")");
2239      vk_buffer buf;
2240      size_t index = 0;
2241      for (size_t i = 0; i < ctx->pinned_memory.size(); i++) {
2242          const uint8_t* addr = (const uint8_t*) std::get<0>(ctx->pinned_memory[i]);
2243          const uint8_t* endr = addr + std::get<1>(ctx->pinned_memory[i]);
2244          if (ptr >= addr && ptr < endr) {
2245              buf = std::get<2>(ctx->pinned_memory[i]);
2246              index = i;
2247              break;
2248          }
2249      }
2250      if (buf == nullptr) {
2251          fprintf(stderr, "WARNING: failed to free pinned memory: memory not in map\n");
2252          return;
2253      }
2254  
2255      ggml_vk_destroy_buffer(buf);
2256  
2257      ctx->pinned_memory.erase(ctx->pinned_memory.begin() + index);
2258  }
2259  
2260  static void ggml_vk_host_get(ggml_backend_vk_context * ctx, const void * ptr, vk_buffer& buf, size_t& buf_offset) {
2261      buf = nullptr;
2262      buf_offset = 0;
2263      for (size_t i = 0; i < ctx->pinned_memory.size(); i++) {
2264          const uint8_t* addr = (const uint8_t*) std::get<0>(ctx->pinned_memory[i]);
2265          const uint8_t* endr = addr + std::get<1>(ctx->pinned_memory[i]);
2266          if (ptr >= addr && ptr < endr) {
2267              buf = std::get<2>(ctx->pinned_memory[i]);
2268              buf_offset = ((const uint8_t *)ptr) - addr;
2269              break;
2270          }
2271      }
2272  }
2273  
2274  static vk_submission ggml_vk_begin_submission(ggml_backend_vk_context * ctx, vk_queue& q, bool one_time = true) {
2275      vk_submission s;
2276      s.buffer = ggml_vk_create_cmd_buffer(ctx, q);
2277      if (one_time) {
2278          s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
2279      } else {
2280          s.buffer.begin({ vk::CommandBufferUsageFlags{} });
2281      }
2282  
2283      return s;
2284  }
2285  
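      // Records a compute dispatch: takes the next pre-allocated descriptor set of the
      // pipeline, binds the given buffers to it, pushes the push constants and dispatches
      // CEIL_DIV(elements, wg_denoms) workgroups on the current command buffer.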
2286  static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, std::vector<vk_subbuffer>&& buffers, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
2287      const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
2288      const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
2289      const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
2290      VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
2291      for (auto& buffer : buffers) {
2292          std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
2293      }
2294      std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
2295      std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
2296      std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
2297      GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
2298      GGML_ASSERT(buffers.size() == pipeline->parameter_count);
2299      vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
2300      for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
2301          descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size});
2302      }
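          // Addresses into descriptor_buffer_infos are taken in a second loop so the
          // vector cannot reallocate (and invalidate them) between push_backs.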
2303      for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
2304          write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]});
2305      }
2306  
2307      ctx->device->device.updateDescriptorSets(write_descriptor_sets, {});
2308  
2309      subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
2310      subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
2311      subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
2312                                  pipeline->layout,
2313                                  0,
2314                                  { descriptor_set },
2315                                  {});
2316      subctx->s->buffer.dispatch(wg0, wg1, wg2);
2317  }
2318  
2319  static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
2320      s.buffer.end();
2321  
2322      s.wait_semaphores = std::move(wait_semaphores);
2323      s.signal_semaphores = std::move(signal_semaphores);
2324  }
2325  
2326  static void ggml_vk_ctx_end(vk_context * ctx) {
2327      VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
2328      if (ctx->s == nullptr) {
2329          return;
2330      }
2331  
2332      ctx->s->buffer.end();
2333      ctx->s = nullptr;
2334  }
2335  
2336  static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx) {
2337      VK_LOG_DEBUG("ggml_vk_ctx_begin(" << ctx << ")");
2338      if (subctx->s != nullptr) {
2339          ggml_vk_ctx_end(subctx);
2340      }
2341  
2342      subctx->seqs.push_back({ ggml_vk_begin_submission(ctx, *subctx->q) });
2343      subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
2344  }
2345  
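      // Rounds width up to the next multiple of align, e.g. ggml_vk_align_size(18, 8) == 24.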
2346  static size_t ggml_vk_align_size(size_t width, size_t align) {
2347      VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")");
2348      return CEIL_DIV(width, align) * align;
2349  }
2350  
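      // Either executes the memcpy immediately (memcpys == nullptr) or records it for the
      // caller to flush later: in_memcpys are flushed before submitting (host -> staging),
      // out_memcpys after the fence signals (staging -> host).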
2351  static void deferred_memcpy(void * dst, const void * src, size_t size, std::vector<vk_staging_memcpy>* memcpys = nullptr) {
2352      if (memcpys == nullptr) {
2353          memcpy(dst, src, size);
2354      } else {
2355          memcpys->emplace_back(dst, src, size);
2356      }
2357  }
2358  
2359  static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
2360      if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
2361          VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
2362          ggml_vk_destroy_buffer(ctx->sync_staging);
2363          ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
2364              vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
2365              vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
2366      }
2367  }
2368  
2369  static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
2370      VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
2371      GGML_ASSERT(!ggml_is_contiguous(tensor));
2372      // Buffer is already mapped
2373      if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2374          std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
2375          GGML_ASSERT(false);
2376      }
2377      // Check if src is pinned memory
2378      vk_buffer buf;
2379      size_t buf_offset;
2380      ggml_vk_host_get(ctx, tensor->data, buf, buf_offset);
2381  
2382      const uint64_t ne0 = tensor->ne[0];
2383      const uint64_t ne1 = tensor->ne[1];
2384      const uint64_t ne2 = tensor->ne[2];
2385      const uint64_t ne3 = tensor->ne[3];
2386      const uint64_t nb0 = tensor->nb[0];
2387      const uint64_t nb1 = tensor->nb[1];
2388      const uint64_t nb2 = tensor->nb[2];
2389      const uint64_t nb3 = tensor->nb[3];
2390      const ggml_type type = tensor->type;
2391      const uint64_t ts = ggml_type_size(type);
2392      const uint64_t bs = ggml_blck_size(type);
2393  
2394      const uint64_t dstnb0 = ts;
2395      const uint64_t dstnb1 = dstnb0*(ne0/bs);
2396      const uint64_t dstnb2 = dstnb1*ne1;
2397      const uint64_t dstnb3 = dstnb2*ne2;
2398  
2399      const uint64_t ne = ggml_nelements(tensor);
2400  
2401      if (buf != nullptr) {
2402          // Memory is pinned, use as staging buffer
2403          std::vector<vk::BufferCopy> slices;
2404  
2405          for (uint64_t i3 = 0; i3 < ne3; i3++) {
2406              for (uint64_t i2 = 0; i2 < ne2; i2++) {
2407                  // Find longest contiguous slice
2408                  if (ne1*nb1 == dstnb2) {
2409                      slices.push_back({ buf_offset + i3*nb3 + i2*nb2, offset + i3*dstnb3 + i2*dstnb2, dstnb2 });
2410                  } else {
2411                      for (uint64_t i1 = 0; i1 < ne1; i1++) {
2412                          if (ne0*nb0/bs == dstnb1) {
2413                              slices.push_back({ buf_offset + i3*nb3 + i2*nb2 + i1*nb1, offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, dstnb1 });
2414                          } else {
2415                              const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
2416                              const uint64_t d_off = offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
2417                              for (uint64_t i0 = 0; i0 < ne0; i0++) {
2418                                  slices.push_back({ s_off + i0*nb0, d_off + i0*dstnb0, dstnb0 });
2419                              }
2420                          }
2421                      }
2422                  }
2423              }
2424          }
2425  
2426          ggml_vk_sync_buffers(subctx);
2427          subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
2428          return;
2429      }
2430  
2431      // Staging buffer required
2432      vk_buffer staging = ctx->staging;
2433      size_t staging_offset = ctx->staging_offset;
2434      const size_t copy_size = ts*ne/bs;
2435      if (ctx->staging == nullptr || ctx->staging->size < ctx->staging_offset + copy_size) {
2436          if (sync_staging) {
2437              // Create temporary larger buffer
2438              ggml_vk_ensure_sync_staging_buffer(ctx, copy_size);
2439  
2440              staging = ctx->sync_staging;
2441              staging_offset = 0;
2442          } else {
2443              GGML_ASSERT(false);
2444          }
2445      }
2446  
2447      VkBufferCopy buf_copy{ staging_offset, offset, copy_size };
2448  
2449      ggml_vk_sync_buffers(subctx);
2450      vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy);
2451  
2452      for (uint64_t i3 = 0; i3 < ne3; i3++) {
2453          for (uint64_t i2 = 0; i2 < ne2; i2++) {
2454              // Find longest contiguous slice
2455              if (ne1*nb1 == dstnb2) {
2456                  deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys);
2457              } else {
2458                  for (uint64_t i1 = 0; i1 < ne1; i1++) {
2459                      if (ne0*nb0/bs == dstnb1) {
2460                          deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys);
2461                      } else {
2462                          const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
2463                          const uint64_t d_off = staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
2464                          for (uint64_t i0 = 0; i0 < ne0; i0++) {
2465                              deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &subctx->in_memcpys);
2466                          }
2467                      }
2468                  }
2469              }
2470          }
2471      }
2472  }
2473  
2474  static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
2475      VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
2476      // Make sure ctx owns the buffer
2477      GGML_ASSERT(dst->ctx == ctx);
2478  
2479      // Buffer is already mapped
2480      if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2481          std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
2482          GGML_ASSERT(false);
2483      }
2484      // Check if src is pinned memory
2485      vk_buffer buf = nullptr;
2486      size_t buf_offset;
2487      ggml_vk_host_get(ctx, src, buf, buf_offset);
2488  
2489      if (buf != nullptr) {
2490          // Memory is pinned, use as staging buffer
2491          std::vector<vk::BufferCopy> slices(1);
2492          if (width == spitch) {
2493              // Only do single write if stride is equal
2494              slices[0].srcOffset = buf_offset;
2495              slices[0].dstOffset = offset;
2496              slices[0].size = width * height;
2497          } else {
2498              slices.resize(height);
2499              for (size_t i = 0; i < height; i++) {
2500                  slices[i].srcOffset = buf_offset + i * spitch;
2501                  slices[i].dstOffset = offset + i * width;
2502                  slices[i].size = width;
2503              }
2504          }
2505  
2506          ggml_vk_sync_buffers(subctx);
2507          subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
2508          return;
2509      }
2510      VK_LOG_DEBUG("STAGING");
2511  
2512      // Staging buffer required
2513      vk_buffer staging = ctx->staging;
2514      size_t staging_offset = ctx->staging_offset;
2515      const size_t copy_size = width*height;
2516      if (ctx->staging == nullptr || ctx->staging->size < ctx->staging_offset + copy_size) {
2517          if (sync_staging) {
2518              ggml_vk_ensure_sync_staging_buffer(ctx, copy_size);
2519  
2520              staging = ctx->sync_staging;
2521              staging_offset = 0;
2522          } else {
2523              GGML_ASSERT(false);
2524          }
2525      }
2526  
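          // The device-side copy is recorded now; the host -> staging memcpys below are
          // deferred via subctx->in_memcpys and must be flushed before submission.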
2527      VkBufferCopy buf_copy = {
2528          staging_offset,
2529          offset,
2530          copy_size};
2531  
2532      ggml_vk_sync_buffers(subctx);
2533      vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy);
2534  
2535      if (width == spitch) {
2536          deferred_memcpy((uint8_t *)staging->ptr + staging_offset, src, width * height, &subctx->in_memcpys);
2537      } else {
2538          for (size_t i = 0; i < height; i++) {
2539              deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys);
2540          }
2541      }
2542  }
2543  
2544  static void ggml_vk_buffer_write_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
2545      VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
2546      return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, size, size, 1, sync_staging);
2547  }
2548  
2549  static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) {
2550      VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")");
2551      // Buffer is already mapped
2552      if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2553          GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
2554  
2555          for (size_t i = 0; i < height; i++) {
2556              memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
2557          }
2558      } else {
2559          vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
2560          ggml_vk_ctx_begin(ctx, subctx);
2561          ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, spitch, width, height, true);
2562          ggml_vk_ctx_end(subctx);
2563  
2564          for (auto& cpy : subctx->in_memcpys) {
2565              memcpy(cpy.dst, cpy.src, cpy.n);
2566          }
2567  
2568          ggml_vk_submit(subctx, ctx->fence);
2569          VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
2570          ctx->device->device.resetFences({ ctx->fence });
2571      }
2572  }
2573  
2574  static void ggml_vk_buffer_write(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t size) {
2575      VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")");
2576      ggml_vk_buffer_write_2d(ctx, dst, offset, src, 0, size, 1);
2577  }
2578  
2579  static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) {
2580      VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")");
2581      GGML_ASSERT(width > 0);
2582      GGML_ASSERT(height > 0);
2583      GGML_ASSERT(src != nullptr);
2584      // Make sure ctx owns the buffer
2585      GGML_ASSERT(src->ctx == ctx);
2586  
2587      // Check if dst is pinned memory
2588      vk_buffer buf = nullptr;
2589      size_t buf_offset;
2590      ggml_vk_host_get(ctx, dst, buf, buf_offset);
2591  
2592      std::vector<vk::BufferCopy> slices(1);
2593      if (width == spitch && width == dpitch) {
2594          // Only do single write if stride is equal
2595          slices[0].srcOffset = offset;
2596          slices[0].dstOffset = buf_offset;
2597          slices[0].size = width * height;
2598      } else {
2599          slices.resize(height);
2600          for (size_t i = 0; i < height; i++) {
2601              slices[i].srcOffset = offset + i * spitch;
2602              slices[i].dstOffset = buf_offset + i * dpitch;
2603              slices[i].size = width;
2604          }
2605      }
2606  
2607      if (buf != nullptr) {
2608          // Memory is pinned, use as staging buffer
2609          ggml_vk_sync_buffers(subctx);
2610          subctx->s->buffer.copyBuffer(src->buffer, buf->buffer, slices);
2611  
2612          return;
2613      }
2614      VK_LOG_DEBUG("STAGING");
2615  
2616      // Fall back to staging buffer
2617      vk_buffer staging = ctx->staging;
2618      const size_t copy_size = dpitch * height;
2619      if (ctx->staging == nullptr || ctx->staging->size < ctx->staging_offset + copy_size) {
2620          if (sync_staging) {
2621              // Create temporary larger buffer
2622              ggml_vk_ensure_sync_staging_buffer(ctx, copy_size);
2623  
2624              staging = ctx->sync_staging;
2625          } else {
2626              GGML_ASSERT(false);
2627          }
2628      }
2629  
2630      ggml_vk_sync_buffers(subctx);
2631      subctx->s->buffer.copyBuffer(src->buffer, staging->buffer, slices);
2632  
2633      deferred_memcpy(dst, staging->ptr, copy_size, &subctx->out_memcpys);
2634  }
2635  
2636  static void ggml_vk_buffer_read_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t size, bool sync_staging = false) {
2637      return ggml_vk_buffer_read_2d_async(ctx, subctx, src, offset, dst, size, size, size, 1, sync_staging);
2638  }
2639  
2640  static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, size_t offset, void * dst, size_t size) {
2641      VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")");
2642      if (src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2643          GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
2644  
2645          memcpy(dst, (uint8_t *) src->ptr + offset, size);
2646      } else {
2647          vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
2648          ggml_vk_ctx_begin(ctx, subctx);
2649          ggml_vk_buffer_read_async(ctx, subctx, src, offset, dst, size, true);
2650          ggml_vk_ctx_end(subctx);
2651  
2652          ggml_vk_submit(subctx, ctx->fence);
2653          VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
2654          ctx->device->device.resetFences({ ctx->fence });
2655  
2656          for (auto& cpy : subctx->out_memcpys) {
2657              memcpy(cpy.dst, cpy.src, cpy.n);
2658          }
2659      }
2660  }
2661  
2662  static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
2663      VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
2664      // Make sure both buffers are on same ctx
2665      GGML_ASSERT(src->ctx == dst->ctx);
2666  
2667      VkBufferCopy bc{ src_offset, dst_offset, size };
2668  
2669      vkCmdCopyBuffer(ctx->s->buffer, src->buffer, dst->buffer, 1, &bc);
2670  }
2671  
2672  static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
2673      if (src->ctx == dst->ctx) {
2674          VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
2675          // Copy within the device
2676          ggml_backend_vk_context * ctx = src->ctx;
2677  
2678          vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
2679          ggml_vk_ctx_begin(ctx, subctx);
2680          ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
2681          ggml_vk_ctx_end(subctx);
2682          ggml_vk_submit(subctx, ctx->fence);
2683          VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
2684          ctx->device->device.resetFences({ ctx->fence });
2685      } else {
2686          VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
2687          // Copy device to device
2688          ggml_backend_vk_context * src_ctx = src->ctx;
2689          ggml_backend_vk_context * dst_ctx = dst->ctx;
2690  
2691          ggml_vk_ensure_sync_staging_buffer(src_ctx, size);
2692          ggml_vk_ensure_sync_staging_buffer(dst_ctx, size);
2693  
2694          // Copy to src staging buffer
2695          ggml_vk_buffer_copy(src_ctx->sync_staging, 0, src, src_offset, size);
2696          // memcpy to dst staging buffer
2697          memcpy(dst_ctx->sync_staging->ptr, src_ctx->sync_staging->ptr, size);
2698          // Copy to dst buffer
2699          ggml_vk_buffer_copy(dst, dst_offset, dst_ctx->sync_staging, 0, size);
2700      }
2701  }
2702  
2703  static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
2704      VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
2705      // Make sure ctx owns the buffer
2706      GGML_ASSERT(dst->ctx == ctx);
2707  
2708      vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
2709      ggml_vk_ctx_begin(ctx, subctx);
2710      subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
2711      ggml_vk_ctx_end(subctx);
2712  
2713      ggml_vk_submit(subctx, ctx->fence);
2714      VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_memset waitForFences");
2715      ctx->device->device.resetFences({ ctx->fence });
2716  }
2717  
2718  static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) {
2719      VK_LOG_DEBUG("ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")");
2720      const uint64_t ne0 = src->ne[0];
2721      const uint64_t ne1 = src->ne[1];
2722      const uint64_t nb0 = src->nb[0];
2723      const uint64_t nb1 = src->nb[1];
2724      const uint64_t nb2 = src->nb[2];
2725      const uint64_t nb3 = src->nb[3];
2726      const enum ggml_type type = src->type;
2727      const size_t ts = ggml_type_size(type);
2728      const size_t bs = ggml_blck_size(type);
2729      const size_t row_length = ts*ne0/bs;
2730  
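          // Three cases, from most to least contiguous:
          //  1. rows are packed back to back    -> one linear write
          //  2. rows are contiguous but strided -> 2D write, one slice per row
          //  3. anything else                   -> non-contiguous fallback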
2731      const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
2732      if (nb0 == ts && nb1 == row_length) {
2733          return ggml_vk_buffer_write_async(ctx, subctx, dst, offset, x, i1*nb1);
2734      }
2735      if (nb0 == ts && (i1 == ne1 || !ggml_is_permuted(src))) {
2736          return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, x, nb1, row_length, i1);
2737      }
2738  
2739      GGML_ASSERT(i3 == 0);
2740      GGML_ASSERT(i2 == 0);
2741      GGML_ASSERT(i1 == (uint64_t) ggml_nrows(src));
2742  
2743      return ggml_vk_buffer_write_nc_async(ctx, subctx, dst, offset, src);
2744  }
2745  
2746  static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, const ggml_tensor * dst) {
2747      VK_LOG_DEBUG("ggml_vk_d2h_tensor_2d()");
2748      const uint64_t ne0 = dst->ne[0];
2749      const uint64_t ne1 = dst->ne[1];
2750      const uint64_t ne2 = dst->ne[2];
2751      const uint64_t ne3 = dst->ne[3];
2752      const uint64_t nb0 = dst->nb[0];
2753      const uint64_t nb1 = dst->nb[1];
2754      // const uint64_t nb2 = dst->nb[2];
2755      // const uint64_t nb3 = dst->nb[3];
2756      const enum ggml_type type = dst->type;
2757      const size_t ts = ggml_type_size(type);
2758      const size_t bs = ggml_blck_size(type);
2759      const size_t row_length = ts*ne0/bs;
2760  
2761      if (ggml_is_contiguous(dst)) {
2762          return ggml_vk_buffer_read_async(ctx, subctx, src, offset, dst->data, ne1*nb1*ne2*ne3);
2763      }
2764      if (nb0 == ts) {
2765          return ggml_vk_buffer_read_2d_async(ctx, subctx, src, offset, dst->data, nb1, nb1, row_length, ne1*ne2*ne3);
2766      }
2767      GGML_ASSERT(false);
2768  }
2769  
2770  static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
2771      VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
2772      // if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
2773      //     return 4;
2774      // }
2775  
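          // Split-k is currently disabled; the commented-out heuristic above is kept for reference.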
2776      return 1;
2777  
2778      GGML_UNUSED(m); GGML_UNUSED(n); GGML_UNUSED(k);
2779  }
2780  
2781  static vk_pipeline ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
2782      if (m <= 32 || n <= 32) {
2783          return aligned ? mmp->a_s : mmp->s;
2784      }
2785      return aligned ? mmp->a_m : mmp->m;
2786  
2787      GGML_UNUSED(ctx);
2788  }
2789  
2790  static vk_pipeline ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, bool aligned) {
2791      return aligned ? mmp->a_m : mmp->m;
2792  
2793      GGML_UNUSED(ctx);
2794  }
2795  
2796  static vk_pipeline ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, bool aligned) {
2797      return aligned ? mmp->a_s : mmp->s;
2798  
2799      GGML_UNUSED(ctx);
2800  }
2801  
2802  static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
2803      VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")");
2804      switch (ctx->device->vendor_id) {
2805      case VK_VENDOR_ID_AMD:
2806          return ggml_vk_guess_matmul_pipeline_amd(ctx, mmp, m, n, aligned);
2807      case VK_VENDOR_ID_APPLE:
2808          return ggml_vk_guess_matmul_pipeline_apple(ctx, mmp, aligned);
2809      case VK_VENDOR_ID_INTEL:
2810          return ggml_vk_guess_matmul_pipeline_intel(ctx, mmp, aligned);
2811      default:
2812          break;
2813      }
2814  
2815      if (m <= 32 || n <= 32) {
2816          return aligned ? mmp->a_s : mmp->s;
2817      }
2818      if (m <= 64 || n <= 64) {
2819          return aligned ? mmp->a_m : mmp->m;
2820      }
2821      return aligned ? mmp->a_l : mmp->l;
2822  }
2823  
2824  static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) {
2825      VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")");
2826      return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align;
2827  }
2828  
2829  static void ggml_vk_matmul(
2830          ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline,
2831          vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer,
2832          uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
2833          uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
2834          uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3) {
2835          VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")");
2836      ggml_vk_sync_buffers(subctx);
2837      if (split_k == 1) {
2838          const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3 };
2839          ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, sizeof(vk_mat_mat_push_constants), &pc, { m, n, batch });
2840          return;
2841      }
2842  
2843      GGML_ASSERT(batch_stride_d == m * n);
2844  
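          // split_k > 1: each of the split_k slices of the k dimension writes a partial
          // m*n*batch result into split_k_buffer; a second dispatch sums them into d.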
2845      const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3 };
2846      // Make sure enough workgroups get assigned for split k to work
2847      ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, sizeof(vk_mat_mat_push_constants), &pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
2848      ggml_vk_sync_buffers(subctx);
2849      const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
2850      ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 });
2851  }
2852  
2853  static void ggml_vk_matmul_id(
2854          ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline,
2855          vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids,
2856          uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
2857          uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
2858          uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11) {
2859      VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
2860          "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " <<
2861          "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " <<
2862          "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")");
2863      ggml_vk_sync_buffers(subctx);
2864      const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
2865                                                nei0, nei1, nbi1, ne11 };
2866      ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, sizeof(vk_mat_mat_id_push_constants), &pc, { m, nei1, n_as });
2867  }
2868  
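      // True if dims 0 and 1 are laid out contiguously. nb[2] may include padding;
      // only nb[3] has to follow from it.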
2869  static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
2870      return
2871          tensor->nb[0] == ggml_type_size(tensor->type) &&
2872          tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
2873          tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
2874  }
2875  
2876  static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) {
2877      if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
2878          return ctx->device->pipeline_cpy_f32_f32;
2879      }
2880      if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
2881          return ctx->device->pipeline_cpy_f32_f16;
2882      }
2883      if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
2884          return ctx->device->pipeline_cpy_f16_f16;
2885      }
2886  
2887      std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
2888      GGML_ASSERT(false);
2889  }
2890  
2891  static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
2892      VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
2893      std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
2894      const int tensor_type_size = ggml_type_size(tensor->type);
2895  
2896      const uint32_t ne = ggml_nelements(tensor);
2897  
2898      const vk_op_unary_push_constants pc = {
2899          (uint32_t)ne,
2900          (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size,
2901          (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3],                       1                   , (uint32_t)tensor->ne[0]                   , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]),
2902          0,
2903          0.0f, 0.0f,
2904      };
2905      ggml_vk_sync_buffers(subctx);
2906      ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 });
2907  }
2908  
2909  static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2910      VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
2911      std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
2912      std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
2913      GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);  // NOLINT
2914      GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);  // NOLINT
2915  
2916      const uint64_t ne00 = src0->ne[0];
2917      const uint64_t ne01 = src0->ne[1];
2918      const uint64_t ne02 = src0->ne[2];
2919      const uint64_t ne03 = src0->ne[3];
2920  
2921      const uint64_t ne10 = src1->ne[0];
2922      const uint64_t ne11 = src1->ne[1];
2923      const uint64_t ne12 = src1->ne[2];
2924      const uint64_t ne13 = src1->ne[3];
2925  
2926      const uint64_t ne20 = dst->ne[0];
2927      const uint64_t ne21 = dst->ne[1];
2928  
2929      const uint64_t r2 = ne12 / ne02;
2930      const uint64_t r3 = ne13 / ne03;
2931  
2932      ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
2933      ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
2934      ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
2935  
2936      vk_buffer d_Qx;
2937      size_t qx_buf_offset = 0;
2938      vk_buffer d_Qy;
2939      size_t qy_buf_offset = 0;
2940  
2941      bool src0_uma = false;
2942      bool src1_uma = false;
2943  
2944      if (ctx->device->uma) {
2945          ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset);
2946          ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
2947          src0_uma = d_Qx != nullptr;
2948          src1_uma = d_Qy != nullptr;
2949      }
2950  
2951      const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
2952      const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
2953  
2954      const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;
2955  
2956      vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type);
2957  
2958      const bool qx_needs_dequant = mmp == nullptr || x_non_contig;
2959      const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;
2960  
2961      if (mmp == nullptr) {
2962          // Fall back to dequant + f16 mulmat
2963          mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16);
2964      }
2965  
2966      // Not implemented
2967      GGML_ASSERT(y_non_contig || !qy_needs_dequant);  // NOLINT
2968  
2969      const int x_ne = ne01 * ne00;
2970      const int y_ne = ne11 * ne10;
2971      const int d_ne = ne11 * ne01;
2972  
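          // Use the "aligned" pipeline variants only if k is already a multiple of the
          // pipeline's alignment and the matrix is large enough in m and n.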
2973      const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11));
2974      const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8;
2975  
2976      const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
2977  
2978      vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned);
2979  
2980      const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
2981      const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
2982      const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne;
2983      const uint64_t y_sz = y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
2984      const uint64_t d_sz = sizeof(float) * d_ne;
2985  
2986      vk_buffer d_D = extra->buffer_gpu.lock();
2987      const uint64_t d_buf_offset = extra->offset + dst->view_offs;
2988      GGML_ASSERT(d_D != nullptr);
2989      GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
2990      vk_buffer d_X;
2991      uint64_t x_buf_offset = 0;
2992      vk_buffer d_Y;
2993      uint64_t y_buf_offset = 0;
2994      if (!src0_uma) {
2995          d_Qx = extra_src0->buffer_gpu.lock();
2996          qx_buf_offset = extra_src0->offset + src0->view_offs;
2997          GGML_ASSERT(d_Qx != nullptr);
2998      }
2999      if (!src1_uma) {
3000          d_Qy = extra_src1->buffer_gpu.lock();
3001          qy_buf_offset = extra_src1->offset + src1->view_offs;
3002          GGML_ASSERT(d_Qy != nullptr);
3003      }
3004      if (qx_needs_dequant) {
3005          d_X = ctx->prealloc_x;
3006          GGML_ASSERT(d_X->size >= x_sz * ne02 * ne03);
3007      } else {
3008          d_X = d_Qx;
3009          x_buf_offset = qx_buf_offset;
3010          GGML_ASSERT(qx_sz == x_sz);
3011      }
3012      if (qy_needs_dequant) {
3013          d_Y = ctx->prealloc_y;
3014          GGML_ASSERT(d_Y->size >= y_sz * ne12 * ne13);  // y staging is sized by src1's batch dims
3015      } else {
3016          d_Y = d_Qy;
3017          y_buf_offset = qy_buf_offset;
3018          GGML_ASSERT(qy_sz == y_sz);
3019      }
3020  
3021      vk_pipeline to_fp16_vk_0 = nullptr;
3022      vk_pipeline to_fp16_vk_1 = nullptr;
3023  
3024      if (x_non_contig) {
3025          to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
3026      } else {
3027          to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
3028      }
3029      if (y_non_contig) {
3030          to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
3031      } else {
3032          to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
3033      }
3034      GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr);  // NOLINT
3035      GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT
3036  
3037      // Allocate descriptor sets
3038      ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);
3039      if (qx_needs_dequant) {
3040          ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_0, 1);
3041      }
3042      if (qy_needs_dequant) {
3043          ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, 1);
3044      }
3045      if (split_k > 1) {
3046          ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1);
3047      }
3048  
3049      if (x_non_contig) {
3050          ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
3051      } else if (qx_needs_dequant) {
3052          const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
3053          ggml_vk_sync_buffers(subctx);
3054          ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
3055      }
3056      if (y_non_contig) {
3057          ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
3058      }
3059  
3060      uint32_t stride_batch_x = ne00*ne01;
3061      uint32_t stride_batch_y = ne10*ne11;
3062  
3063      if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
3064          stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
3065      }
3066  
3067      if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
3068          stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
3069      }
3070  
3071      // compute
3072      ggml_vk_matmul(
3073          ctx, subctx, pipeline,
3074          { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 },
3075          { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k },
3076          ne01, ne11, ne10,
3077          ne10, ne10, ne01, stride_batch_x, stride_batch_y, ne20*ne21,
3078          split_k, ne12*ne13, ne02, ne12, r2, r3
3079      );  // NOLINT
3080  }
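
      // How the split_k path above is consumed (illustrative sketch, kept out of
      // the build): when K is large relative to the output size, the matmul is
      // split into split_k partial products over disjoint K ranges. Each partial
      // result occupies one d_sz * ne12 * ne13 slab of ctx->prealloc_split_k and
      // pipeline_matmul_split_k_reduce sums the slabs into the destination. A
      // minimal CPU restatement of that reduction, with hypothetical raw float
      // pointers standing in for the GPU buffers:
      #if 0
      static void example_split_k_reduce(const float * partials, float * out, size_t d_elems, uint32_t split_k) {
          for (size_t i = 0; i < d_elems; i++) {
              float sum = 0.0f;
              for (uint32_t k = 0; k < split_k; k++) {
                  sum += partials[k * d_elems + i]; // one slab per K-split
              }
              out[i] = sum;
          }
      }
      #endif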
3081  
3082  static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
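          // Note: the bare std::cerr statements below are still inside the
          // VK_LOG_DEBUG argument list (the macro's parenthesis stays open across
          // the semicolons), so the whole multi-statement print compiles away to
          // ((void) 0) when GGML_VULKAN_DEBUG is not defined. The other debug
          // prints in this file use the same pattern.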
3083      VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3084      std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3085      std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3086      GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);  // NOLINT
3087      GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);  // NOLINT
3088  
3089      const uint64_t ne00 = src0->ne[0];
3090      const uint64_t ne01 = src0->ne[1];
3091      const uint64_t ne02 = src0->ne[2];
3092      const uint64_t ne03 = src0->ne[3];
3093  
3094      const uint64_t ne10 = src1->ne[0];
3095      const uint64_t ne11 = src1->ne[1];
3096      const uint64_t ne12 = src1->ne[2];
3097      const uint64_t ne13 = src1->ne[3];
3098  
3099      GGML_ASSERT(ne11 == 1);
3100  
3101      const uint64_t ne20 = dst->ne[0];
3102      const uint64_t ne21 = dst->ne[1];
3103      const uint64_t ne22 = dst->ne[2];
3104      const uint64_t ne23 = dst->ne[3];
3105  
3106      const uint64_t r2 = ne12 / ne02;
3107      const uint64_t r3 = ne13 / ne03;
3108  
3109      ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3110      ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3111      ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
3112  
3113      vk_buffer d_Qx;
3114      size_t qx_buf_offset = 0;
3115      vk_buffer d_Qy;
3116      size_t qy_buf_offset = 0;
3117  
3118      bool src0_uma = false;
3119      bool src1_uma = false;
3120  
3121      if (ctx->device->uma) {
3122          ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset);
3123          ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
3124          src0_uma = d_Qx != nullptr;
3125          src1_uma = d_Qy != nullptr;
3126      }
3127  
3128      const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
3129      const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
3130  
3131      const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
3132  
3133      const bool qx_needs_dequant = x_non_contig;
3134      const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig;
3135  
3136      // Dequantizing a contiguous src1 is not implemented; y is only converted via the non-contiguous copy path
3137      GGML_ASSERT(y_non_contig || !qy_needs_dequant);  // NOLINT
3138  
3139      const uint64_t x_ne = ne01 * ne00;
3140      const uint64_t y_ne = ne11 * ne10;
3141      const uint64_t d_ne = ne11 * ne01;
3142  
3143      const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
3144      const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
3145      const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz;
3146      const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
3147      const uint64_t d_sz = sizeof(float) * d_ne;
3148  
3149      vk_buffer d_D = extra->buffer_gpu.lock();
3150      const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3151      GGML_ASSERT(d_D != nullptr);
3152      vk_buffer d_X;
3153      uint64_t x_buf_offset = 0;
3154      vk_buffer d_Y;
3155      uint64_t y_buf_offset = 0;
3156      if (!src0_uma) {
3157          d_Qx = extra_src0->buffer_gpu.lock();
3158          qx_buf_offset = extra_src0->offset + src0->view_offs;
3159          GGML_ASSERT(d_Qx != nullptr);
3160      }
3161      if (!src1_uma) {
3162          d_Qy = extra_src1->buffer_gpu.lock();
3163          qy_buf_offset = extra_src1->offset + src1->view_offs;
3164          GGML_ASSERT(d_Qy != nullptr);
3165      }
3166      if (qx_needs_dequant) {
3167          d_X = ctx->prealloc_x;
3168      } else {
3169          d_X = d_Qx;
3170          x_buf_offset = qx_buf_offset;
3171          GGML_ASSERT(qx_sz == x_sz);
3172      }
3173      if (qy_needs_dequant) {
3174          d_Y = ctx->prealloc_y;
3175      } else {
3176          d_Y = d_Qy;
3177          y_buf_offset = qy_buf_offset;
3178          GGML_ASSERT(qy_sz == y_sz);
3179      }
3180  
3181      vk_pipeline to_fp16_vk_0 = nullptr;
3182      vk_pipeline to_fp16_vk_1 = nullptr;
3183      if (x_non_contig) {
3184          to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
3185      }
3186      if (y_non_contig) {
3187          to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
3188      } else {
3189          to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
3190      }
3191      vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type);
3192      GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr);  // NOLINT
3193      GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT
3194      GGML_ASSERT(dmmv != nullptr);
3195  
3196      // Allocate descriptor sets
3197      if (qx_needs_dequant) {
3198          ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_0, 1);
3199      }
3200      if (qy_needs_dequant) {
3201          ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13);
3202      }
3203      ggml_pipeline_allocate_descriptor_sets(ctx, dmmv, ne12 * ne13);
3204  
3205      if (x_non_contig) {
3206          GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
3207          ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
3208      }
3209      if (y_non_contig) {
3210          GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
3211          ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
3212      }
3213  
3214      uint32_t stride_batch_x = ne00*ne01;
3215      uint32_t stride_batch_y = ne10*ne11;
3216  
3217      if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
3218          stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
3219      }
3220  
3221      if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
3222          stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
3223      }
3224  
3225      // compute
3226      const vk_mat_vec_push_constants pc = {
3227          (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
3228          stride_batch_x, stride_batch_y, (uint32_t)(ne20*ne21),
3229          (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
3230      };
3231      ggml_vk_sync_buffers(subctx);
3232      ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} }, sizeof(vk_mat_vec_push_constants), &pc, { (uint32_t)ne01, (uint32_t)(ne12 * ne13), 1});
3233  }
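
      // Dispatch geometry for the dmmv kernel above: one workgroup per output row
      // (x = ne01) and one per batch slice (y = ne12 * ne13). Each workgroup
      // reduces an entire row of src0 against the src1 vector, so no split-k
      // reduction pass is needed on this path.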
3234  
3235  static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3236      VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3237      std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3238      std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3239      GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
3240      GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]);  // NOLINT
3241      GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]);  // NOLINT
3242      GGML_ASSERT(src0->type == GGML_TYPE_F16);
3243      GGML_ASSERT(src1->type == GGML_TYPE_F32);
3244  
3245      const uint64_t ne00 = src0->ne[0];
3246      const uint64_t ne01 = src0->ne[1];
3247      const uint64_t ne02 = src0->ne[2];
3248      // const uint64_t ne03 = src0->ne[3];
3249  
3250      const uint64_t ne10 = src1->ne[0];
3251      const uint64_t ne11 = src1->ne[1];
3252      const uint64_t ne12 = src1->ne[2];
3253      // const uint64_t ne13 = src1->ne[3];
3254  
3255      GGML_ASSERT(ne11 == 1);
3256  
3257      ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3258      ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3259      ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
3260  
3261      vk_buffer d_Qy;
3262      size_t qy_buf_offset = 0;
3263  
3264      bool src1_uma = false;
3265  
3266      if (ctx->device->uma) {
3267          ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
3268          src1_uma = d_Qy != nullptr;
3269      }
3270  
3271      const uint64_t x_ne = ne00 * ne01 * ne02;
3272      const uint64_t y_ne = ne10 * ne11 * ne12;
3273      const uint64_t d_ne = ne01 * ne11 * ne12;
3274  
3275      const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
3276      const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
3277      const uint64_t d_sz = sizeof(float) * d_ne;
3278  
3279      vk_buffer d_D = extra->buffer_gpu.lock();
3280      const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3281      GGML_ASSERT(d_D != nullptr);
3282      vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
3283      const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
3284      GGML_ASSERT(d_Qx != nullptr);
3285      if (!src1_uma) {
3286          d_Qy = extra_src1->buffer_gpu.lock();
3287          qy_buf_offset = extra_src1->offset + src1->view_offs;
3288          GGML_ASSERT(d_Qy != nullptr);
3289      }
3290  
3291      // Allocate descriptor sets
3292      ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, 1);
3293  
3294      const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
3295      const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;
3296  
3297      const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
3298      const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
3299  
3300      // compute
3301      const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
3302      ggml_vk_sync_buffers(subctx);
3303      ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
3304  }
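
      // Sketch of the offset-splitting trick used above (kept out of the build):
      // storage-buffer bindings must honor minStorageBufferOffsetAlignment, so a
      // raw byte offset is split into an aligned binding offset plus a remainder
      // that the shader applies itself, passed in element units as a push constant.
      #if 0
      static void example_split_aligned_offset(uint64_t byte_offset, uint64_t min_align,
                                               uint64_t & binding_offset, uint64_t & shader_offset) {
          binding_offset = (byte_offset / min_align) * min_align; // round down to the alignment
          shader_offset  = byte_offset - binding_offset;          // < min_align, added back in-shader
          // e.g. byte_offset = 100, min_align = 64 -> binding_offset = 64, shader_offset = 36
      }
      #endif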
3305  
3306  static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3307      VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3308      std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3309      std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3310      GGML_ASSERT(!ggml_is_transposed(src0));
3311      GGML_ASSERT(!ggml_is_transposed(src1));
3312      GGML_ASSERT(!ggml_is_permuted(src0));
3313      GGML_ASSERT(src0->type == GGML_TYPE_F16);
3314      GGML_ASSERT(src1->type == GGML_TYPE_F32);
3315  
3316      const uint64_t ne00 = src0->ne[0];
3317      const uint64_t ne01 = src0->ne[1];
3318      const uint64_t ne02 = src0->ne[2];
3319      // const uint64_t ne03 = src0->ne[3];
3320  
3321      const uint64_t nb01 = src0->nb[1];
3322      const uint64_t nb02 = src0->nb[2];
3323  
3324      // const uint64_t ne10 = src1->ne[0];
3325      const uint64_t ne11 = src1->ne[1];
3326      const uint64_t ne12 = src1->ne[2];
3327      // const uint64_t ne13 = src1->ne[3];
3328  
3329      GGML_ASSERT(ne11 == 1);
3330  
3331      ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3332      ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3333      ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
3334  
3335      vk_buffer d_Qy = nullptr;
3336      size_t qy_buf_offset = 0;
3337  
3338      bool src1_uma = false;
3339  
3340      if (ctx->device->uma) {
3341          ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
3342          src1_uma = d_Qy != nullptr;
3343      }
3344  
3345      const uint64_t d_ne = ne01 * ne11 * ne12;
3346  
3347      const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
3348      const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t);
3349  
3350      const uint64_t qx_sz = ggml_nbytes(src0);
3351      const uint64_t qy_sz = ggml_nbytes(src1);
3352      const uint64_t d_sz = sizeof(float) * d_ne;
3353  
3354      vk_buffer d_D = extra->buffer_gpu.lock();
3355      const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3356      GGML_ASSERT(d_D != nullptr);
3357      vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
3358      const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
3359      GGML_ASSERT(d_Qx != nullptr);
3360      if (!src1_uma) {
3361          d_Qy = extra_src1->buffer_gpu.lock();
3362          qy_buf_offset = extra_src1->offset + src1->view_offs;
3363          GGML_ASSERT(d_Qy != nullptr);
3364      }
3365  
3366      // Allocate descriptor sets
3367      ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
3368  
3369      const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
3370      const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;
3371  
3372      const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
3373      const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
3374  
3375      // compute
3376      const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
3377      ggml_vk_sync_buffers(subctx);
3378      ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
3379  }
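
      // The _nc variant handles f16 src0 tensors that are contiguous within a row
      // but strided across rows and channels: row_stride_x and channel_stride_x
      // above are nb01/nb02 converted from bytes to f16 elements, e.g. nb01 = 8192
      // bytes becomes a row stride of 4096 elements.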
3380  
3381  static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3382      VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")");
3383      if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) {
3384          ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst);
3385      } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) {
3386          ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst);
3387      } else if (dst->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
3388          ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst);
3389      } else {
3390          ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst);
3391      }
3392  }
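
      // Dispatch summary for ggml_vk_mul_mat:
      //   - p021 kernel: f16 src0, both inputs permuted, single output row
      //   - nc kernel:   f16 src0, non-contiguous but not transposed, single output row
      //   - dmmv kernel: f32/f16/quantized src0, single output row
      //   - full matmul: everything else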
3393  
3394  static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
3395      VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3396      std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3397      std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
3398      std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3399      GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);  // NOLINT
3400      GGML_ASSERT(ids->type == GGML_TYPE_I32);
3401  
3402      const uint64_t ne00 = src0->ne[0];
3403      const uint64_t ne01 = src0->ne[1];
3404      const uint64_t ne02 = src0->ne[2];
3405      const uint64_t ne03 = src0->ne[3];
3406  
3407      const uint64_t ne10 = src1->ne[0];
3408      const uint64_t ne11 = src1->ne[1];
3409      const uint64_t ne12 = src1->ne[2];
3410      const uint64_t ne13 = src1->ne[3];
3411  
3412      const uint64_t nei0 = ids->ne[0];
3413      const uint64_t nei1 = ids->ne[1];
3414      GGML_ASSERT(nei0 * nei1 <= 2048);
3415  
3416      const uint32_t nbi1 = ids->nb[1];
3417      const uint32_t nbi2 = ids->nb[2];
3418  
3419      const uint64_t ne20 = dst->ne[0];
3420      const uint64_t ne21 = dst->ne[1];
3421      const uint64_t ne22 = dst->ne[2];
3422      const uint64_t ne23 = dst->ne[3];
3423  
3424      const uint64_t n_as = ne02;
3425  
3426      GGML_ASSERT(n_as <= 8);
3427  
3428      ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3429      ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3430      ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
3431      ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra;
3432  
3433      vk_buffer d_Qx;
3434      size_t qx_buf_offset = 0;
3435      vk_buffer d_Qy;
3436      size_t qy_buf_offset = 0;
3437      vk_buffer d_ids;
3438      size_t ids_buf_offset = 0;
3439  
3440      bool src0_uma = false;
3441      bool src1_uma = false;
3442      bool ids_uma = false;
3443  
3444      if (ctx->device->uma) {
3445          ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset);
3446          ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
3447          ggml_vk_host_get(ctx, ids->data, d_ids, ids_buf_offset);
3448          src0_uma = d_Qx != nullptr;
3449          src1_uma = d_Qy != nullptr;
3450          ids_uma = d_ids != nullptr;
3451      }
3452  
3453      const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
3454      const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
3455  
3456      const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;
3457  
3458      vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type);
3459  
3460      const bool qx_needs_dequant = mmp == nullptr || x_non_contig;
3461      const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;
3462  
3463      // unlike the non-id path, there is no dequant + f16 fallback here
3464      GGML_ASSERT(mmp != nullptr);
3466  
3467      // Dequantizing a contiguous src1 is not implemented; y is only converted via the non-contiguous copy path
3468      GGML_ASSERT(y_non_contig || !qy_needs_dequant);  // NOLINT
3469  
3470      const uint64_t x_ne = ne01 * ne00;
3471      const uint64_t y_ne = ne11 * ne10;
3472      const uint64_t d_ne = ne21 * ne20;
3473  
3474      const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, nei1));
3475      const bool aligned = ne10 == kpad && ne01 > 8 && nei1 > 8;
3476  
3477      vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, nei1, aligned);
3478  
3479      const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
3480      const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
3481      const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne;
3482      const uint64_t y_sz = y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
3483      const uint64_t ids_sz = nbi2;
3484      const uint64_t d_sz = sizeof(float) * d_ne;
3485  
3486      vk_buffer d_D = extra->buffer_gpu.lock();
3487      const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3488      GGML_ASSERT(d_D != nullptr);
3489      vk_buffer d_X;
3490      uint64_t x_buf_offset = 0;
3491      vk_buffer d_Y;
3492      uint64_t y_buf_offset = 0;
3493      if (!src0_uma) {
3494          d_Qx = extra_src0->buffer_gpu.lock();
3495          qx_buf_offset = extra_src0->offset + src0->view_offs;
3496          GGML_ASSERT(d_Qx != nullptr);
3497      }
3498      if (!src1_uma) {
3499          d_Qy = extra_src1->buffer_gpu.lock();
3500          qy_buf_offset = extra_src1->offset + src1->view_offs;
3501          GGML_ASSERT(d_Qy != nullptr);
3502      }
3503      if (!ids_uma) {
3504          d_ids = extra_ids->buffer_gpu.lock();
3505          ids_buf_offset = extra_ids->offset + ids->view_offs;
3506          GGML_ASSERT(d_ids != nullptr);
3507      }
3508      if (qx_needs_dequant) {
3509          d_X = ctx->prealloc_x;
3510          GGML_ASSERT(d_X->size >= x_sz * ne02 * ne03);
3511      } else {
3512          d_X = d_Qx;
3513          x_buf_offset = qx_buf_offset;
3514          GGML_ASSERT(qx_sz == x_sz);
3515      }
3516      if (qy_needs_dequant) {
3517          d_Y = ctx->prealloc_y;
3518          GGML_ASSERT(d_Y->size >= y_sz * ne12 * ne13);  // y staging is sized by src1's batch dims
3519      } else {
3520          d_Y = d_Qy;
3521          y_buf_offset = qy_buf_offset;
3522          GGML_ASSERT(qy_sz == y_sz);
3523      }
3524  
3525      vk_pipeline to_fp16_vk_0 = nullptr;
3526      vk_pipeline to_fp16_vk_1 = nullptr;
3527  
3528      if (x_non_contig) {
3529          to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
3530      } else {
3531          to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
3532      }
3533      if (y_non_contig) {
3534          to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
3535      } else {
3536          to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
3537      }
3538      GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr);  // NOLINT
3539      GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT
3540  
3541      // Allocate descriptor sets
3542      ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);
3543      if (qx_needs_dequant) {
3544          ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_0, 1);
3545      }
3546      if (qy_needs_dequant) {
3547          ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, 1);
3548      }
3549  
3550      if (x_non_contig) {
3551          ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
3552      } else if (qx_needs_dequant) {
3553          const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
3554          ggml_vk_sync_buffers(subctx);
3555          ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
3556      }
3557      if (y_non_contig) {
3558          ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
3559      }
3560  
3561      uint32_t stride_batch_x = ne00*ne01;
3562      uint32_t stride_batch_y = ne10*ne11;
3563  
3564      if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
3565          stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
3566      }
3567  
3568      if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
3569          stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
3570      }
3571  
3572      // compute
3573      ggml_vk_matmul_id(
3574          ctx, subctx, pipeline,
3575          { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 },
3576          { d_D, d_buf_offset, d_sz * ne22 * ne23 }, { d_ids, ids_buf_offset, ids_sz },
3577          ne01, ne21, ne10, ne10, ne10, ne01,
3578          stride_batch_x, stride_batch_y, ne20*ne21,
3579          n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11
3580      );  // NOLINT
3581  }
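
      // How the ids tensor drives expert selection in the _id matmul (illustrative
      // sketch, kept out of the build): entry (i0, i1) of ids names the expert, i.e.
      // the slice of src0 along dim 2, that expert slot i0 of token i1 routes to.
      // The shader walks the same layout via nbi1/nbi2; a hypothetical CPU
      // equivalent of the lookup:
      #if 0
      static int32_t example_expert_for_slot(const void * ids_data, uint64_t nbi1, uint64_t i0, uint64_t i1) {
          // i32 ids: dim 0 stride is sizeof(int32_t), dim 1 stride is nbi1 bytes
          return *(const int32_t *)((const char *) ids_data + i0*sizeof(int32_t) + i1*nbi1);
      }
      #endif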
3582  
3583  static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
3584      VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3585      std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3586      std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
3587      std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3588      GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);  // NOLINT
3589      GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);  // NOLINT
3590      GGML_ASSERT(ids->type == GGML_TYPE_I32);
3591  
3592      const uint64_t ne00 = src0->ne[0];
3593      const uint64_t ne01 = src0->ne[1];
3594      const uint64_t ne02 = src0->ne[2];
3595      const uint64_t ne03 = src0->ne[3];
3596  
3597      const uint64_t ne10 = src1->ne[0];
3598      const uint64_t ne11 = src1->ne[1];
3599      const uint64_t ne12 = src1->ne[2];
3600      const uint64_t ne13 = src1->ne[3];
3601  
3602      const uint64_t nei0 = ids->ne[0];
3603      const uint64_t nei1 = ids->ne[1];
3604  
3605      const uint64_t nbi2 = ids->nb[2];
3606  
3607      GGML_ASSERT(nei1 == 1);
3608  
3609      const uint64_t ne20 = dst->ne[0];
3610      const uint64_t ne21 = dst->ne[1];
3611      const uint64_t ne22 = dst->ne[2];
3612      const uint64_t ne23 = dst->ne[3];
3613  
3614      ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3615      ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3616      ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
3617      ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra;
3618  
3619      vk_buffer d_Qx;
3620      size_t qx_buf_offset = 0;
3621      vk_buffer d_Qy;
3622      size_t qy_buf_offset = 0;
3623      vk_buffer d_ids;
3624      size_t ids_buf_offset = 0;
3625  
3626      bool src0_uma = false;
3627      bool src1_uma = false;
3628      bool ids_uma = false;
3629  
3630      if (ctx->device->uma) {
3631          ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset);
3632          ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
3633          ggml_vk_host_get(ctx, ids->data, d_ids, ids_buf_offset);
3634          src0_uma = d_Qx != nullptr;
3635          src1_uma = d_Qy != nullptr;
3636          ids_uma = d_ids != nullptr;
3637      }
3638  
3639      const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
3640      const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
3641  
3642      const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
3643  
3644      const bool qx_needs_dequant = x_non_contig;
3645      const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig;
3646  
3647      // Dequantizing a contiguous src1 is not implemented; y is only converted via the non-contiguous copy path
3648      GGML_ASSERT(y_non_contig || !qy_needs_dequant);  // NOLINT
3649  
3650      const uint64_t x_ne = ne01 * ne00;
3651      const uint64_t y_ne = ne11 * ne10;
3652      const uint64_t d_ne = ne21 * ne20;
3653  
3654      const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
3655      const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
3656      const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz;
3657      const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
3658      const uint64_t ids_sz = nbi2;
3659      const uint64_t d_sz = sizeof(float) * d_ne;
3660  
3661      vk_buffer d_D = extra->buffer_gpu.lock();
3662      const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3663      GGML_ASSERT(d_D != nullptr);
3664      vk_buffer d_X;
3665      uint64_t x_buf_offset = 0;
3666      vk_buffer d_Y;
3667      uint64_t y_buf_offset = 0;
3668      if (!src0_uma) {
3669          d_Qx = extra_src0->buffer_gpu.lock();
3670          qx_buf_offset = extra_src0->offset + src0->view_offs;
3671          GGML_ASSERT(d_Qx != nullptr);
3672      }
3673      if (!src1_uma) {
3674          d_Qy = extra_src1->buffer_gpu.lock();
3675          qy_buf_offset = extra_src1->offset + src1->view_offs;
3676          GGML_ASSERT(d_Qy != nullptr);
3677      }
3678      if (!ids_uma) {
3679          d_ids = extra_ids->buffer_gpu.lock();
3680          ids_buf_offset = extra_ids->offset + ids->view_offs;
3681          GGML_ASSERT(d_ids != nullptr);
3682      }
3683      if (qx_needs_dequant) {
3684          d_X = ctx->prealloc_x;
3685      } else {
3686          d_X = d_Qx;
3687          x_buf_offset = qx_buf_offset;
3688          GGML_ASSERT(qx_sz == x_sz);
3689      }
3690      if (qy_needs_dequant) {
3691          d_Y = ctx->prealloc_y;
3692      } else {
3693          d_Y = d_Qy;
3694          y_buf_offset = qy_buf_offset;
3695          GGML_ASSERT(qy_sz == y_sz);
3696      }
3697  
3698      vk_pipeline to_fp16_vk_0 = nullptr;
3699      vk_pipeline to_fp16_vk_1 = nullptr;
3700      if (x_non_contig) {
3701          to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
3702      }
3703      if (y_non_contig) {
3704          to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
3705      } else {
3706          to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
3707      }
3708      vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec_id(ctx, src0->type, src1->type);
3709      GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr);  // NOLINT
3710      GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT
3711      GGML_ASSERT(dmmv != nullptr);
3712  
3713      // Allocate descriptor sets
3714      if (qx_needs_dequant) {
3715          ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_0, 1);
3716      }
3717      if (qy_needs_dequant) {
3718          ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13);
3719      }
3720      ggml_pipeline_allocate_descriptor_sets(ctx, dmmv, ne12 * ne13);
3721  
3722      if (x_non_contig) {
3723          GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
3724          ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
3725      }
3726      if (y_non_contig) {
3727          GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
3728          ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
3729      }
3730  
3731      uint32_t stride_batch_y = ne10*ne11;
3732  
3733      if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
3734          stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
3735      }
3736  
3737      // compute
3738      const vk_mat_vec_id_push_constants pc = {
3739          (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
3740          (uint32_t)x_ne, stride_batch_y, (uint32_t)(ne20*ne21),
3741          (uint32_t)nei0, (uint32_t)ne11,
3742      };
3743      ggml_vk_sync_buffers(subctx);
3744      ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
3745          { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23}, { d_ids, ids_buf_offset, ids_sz } },
3746          sizeof(vk_mat_vec_id_push_constants), &pc, { (uint32_t)ne01, (uint32_t)nei0, 1 });
3747  }
3748  
3749  static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
3750      VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")");
3751      if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
3752          ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst);
3753      } else {
3754          ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst);
3755      }
3756  }
3757  
3758  static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3759      // guaranteed to be an integer due to the check in ggml_can_repeat
3760      const uint64_t ne0 = dst->ne[0];
3761      const uint64_t ne1 = dst->ne[1];
3762      const uint64_t ne2 = dst->ne[2];
3763      const uint64_t ne3 = dst->ne[3];
3764  
3765      const uint64_t ne00 = src0->ne[0];
3766      const uint64_t ne01 = src0->ne[1];
3767      const uint64_t ne02 = src0->ne[2];
3768      const uint64_t ne03 = src0->ne[3];
3769  
3770      const uint64_t nb0 = dst->nb[0];
3771      const uint64_t nb1 = dst->nb[1];
3772      const uint64_t nb2 = dst->nb[2];
3773      const uint64_t nb3 = dst->nb[3];
3774  
3775      const uint64_t nb00 = src0->nb[0];
3776      const uint64_t nb01 = src0->nb[1];
3777      const uint64_t nb02 = src0->nb[2];
3778      const uint64_t nb03 = src0->nb[3];
3779  
3780      const uint64_t nr0 = ne0/ne00;
3781      const uint64_t nr1 = ne1/ne01;
3782      const uint64_t nr2 = ne2/ne02;
3783      const uint64_t nr3 = ne3/ne03;
3784  
3785      // TODO: support for transposed / permuted tensors
3786      GGML_ASSERT(nb0  == sizeof(float));
3787      GGML_ASSERT(nb00 == sizeof(float));
3788  
3789      ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3790      ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3791  
3792      const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
3793      const uint64_t src_offset = extra_src0->offset + src0->view_offs;
3794      vk_buffer dst_buf = extra->buffer_gpu.lock();
3795      const uint64_t dst_offset = extra->offset + dst->view_offs;
3796  
3797      std::vector<vk::BufferCopy> copies;
3798  
3799      for                         (uint64_t i3 = 0; i3 < nr3;  i3++) {
3800          for                     (uint64_t k3 = 0; k3 < ne03; k3++) {
3801              for                 (uint64_t i2 = 0; i2 < nr2;  i2++) {
3802                  for             (uint64_t k2 = 0; k2 < ne02; k2++) {
3803                      for         (uint64_t i1 = 0; i1 < nr1;  i1++) {
3804                          for     (uint64_t k1 = 0; k1 < ne01; k1++) {
3805                              for (uint64_t i0 = 0; i0 < nr0;  i0++) {
3806                                  copies.push_back({
                                          // vk::BufferCopy is { srcOffset, dstOffset, size }: read row (k1, k2, k3)
                                          // of src0 and write it to the repeated position in dst
3807                                      src_offset + (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01,
3808                                      dst_offset + (i3*ne03 + k3)*nb3  + (i2*ne02 + k2)*nb2  + (i1*ne01 + k1)*nb1  + (i0*ne00)*nb0,
3809                                      ne00*nb0,
3810                                  });
3811                              }
3812                          }
3813                      }
3814                  }
3815              }
3816          }
3817      }
3818  
3819      ggml_vk_sync_buffers(subctx);
3820      subctx->s->buffer.copyBuffer(src_buf->buffer, dst_buf->buffer, copies);
3821  
3822      GGML_UNUSED(ctx);
3823      GGML_UNUSED(src1);
3824  }
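
      // CPU restatement of the repeat mapping above (illustrative sketch, kept out
      // of the build): every ne00-element row of src0 is copied nr0*nr1*nr2*nr3
      // times into dst, mirroring ggml_compute_forward_repeat. The 2D case with
      // contiguous float data and hypothetical raw pointers:
      #if 0
      static void example_repeat_f32_2d(const float * src, float * dst,
                                        uint64_t ne00, uint64_t ne01, uint64_t nr0, uint64_t nr1) {
          for (uint64_t i1 = 0; i1 < nr1; i1++) {
              for (uint64_t k1 = 0; k1 < ne01; k1++) {
                  for (uint64_t i0 = 0; i0 < nr0; i0++) {
                      // dst rows are nr0*ne00 elements long; row k1 of src repeats across i0 and i1
                      memcpy(dst + (i1*ne01 + k1)*nr0*ne00 + i0*ne00, src + k1*ne00, ne00*sizeof(float));
                  }
              }
          }
      }
      #endif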
3825  
3826  
3827  static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
3828      switch (op) {
3829      case GGML_OP_ADD:
3830          if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3831              return ctx->device->pipeline_add_f32;
3832          }
3833          return nullptr;
3834      case GGML_OP_GET_ROWS:
3835          GGML_ASSERT(src1->type == GGML_TYPE_I32);
3836          if (dst->type == GGML_TYPE_F16) {
3837              return ctx->device->pipeline_get_rows[src0->type];
3838          }
3839          if (dst->type == GGML_TYPE_F32) {
3840              return ctx->device->pipeline_get_rows_f32[src0->type];
3841          }
3842          return nullptr;
3843      case GGML_OP_MUL:
3844          if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3845              return ctx->device->pipeline_mul_f32;
3846          }
3847          return nullptr;
3848      case GGML_OP_DIV:
3849          if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3850              return ctx->device->pipeline_div_f32;
3851          }
3852          return nullptr;
3853      case GGML_OP_SCALE:
3854          if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3855              return ctx->device->pipeline_scale_f32;
3856          }
3857          return nullptr;
3858      case GGML_OP_SQR:
3859          if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3860              return ctx->device->pipeline_sqr_f32;
3861          }
3862          return nullptr;
3863      case GGML_OP_CLAMP:
3864          if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3865              return ctx->device->pipeline_clamp_f32;
3866          }
3867          return nullptr;
3868      case GGML_OP_CPY:
3869      case GGML_OP_CONT:
3870      case GGML_OP_DUP:
3871          return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type);
3872      case GGML_OP_NORM:
3873          if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3874              return ctx->device->pipeline_norm_f32;
3875          }
3876          return nullptr;
3877      case GGML_OP_RMS_NORM:
3878          if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3879              return ctx->device->pipeline_rms_norm_f32;
3880          }
3881          return nullptr;
3882      case GGML_OP_UNARY:
3883          switch (ggml_get_unary_op(dst)) {
3884              case GGML_UNARY_OP_SILU:
3885                  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3886                      return ctx->device->pipeline_silu_f32;
3887                  }
3888                  break;
3889              case GGML_UNARY_OP_GELU:
3890                  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3891                      return ctx->device->pipeline_gelu_f32;
3892                  }
3893                  break;
3894              case GGML_UNARY_OP_RELU:
3895                  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3896                      return ctx->device->pipeline_relu_f32;
3897                  }
3898                  break;
3899              default:
3900                  break;
3901          }
3902          return nullptr;
3903      case GGML_OP_DIAG_MASK_INF:
3904          if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3905              return ctx->device->pipeline_diag_mask_inf_f32;
3906          }
3907          return nullptr;
3908      case GGML_OP_SOFT_MAX:
3909          GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
3910  
3911          if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
3912              return ctx->device->pipeline_soft_max_f32;
3913          }
3914      if (src0->type == GGML_TYPE_F32 && src1 != nullptr && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
3915              return ctx->device->pipeline_soft_max_f32_f16;
3916          }
3917          return nullptr;
3918      case GGML_OP_ROPE:
3919          {
3920              const int mode = ((const int32_t *) dst->op_params)[2];
3921              const bool is_neox = mode & 2;
3922  
3923              if (is_neox) {
3924                  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3925                      return ctx->device->pipeline_rope_neox_f32;
3926                  }
3927                  if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
3928                      return ctx->device->pipeline_rope_neox_f16;
3929                  }
3930              } else {
3931                  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3932                      return ctx->device->pipeline_rope_norm_f32;
3933                  }
3934                  if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
3935                      return ctx->device->pipeline_rope_norm_f16;
3936                  }
3937              }
3938              return nullptr;
3939          }
3940      case GGML_OP_ARGSORT:
3941          if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_I32) {
3942              return ctx->device->pipeline_argsort_f32;
3943          }
3944          return nullptr;
3945      case GGML_OP_SUM_ROWS:
3946          if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3947              return ctx->device->pipeline_sum_rows_f32;
3948          }
3949          return nullptr;
3950      default:
3951          return nullptr;
3952      }
3953  
3954      GGML_UNUSED(src2);
3955  }
3956  
3957  static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
3958      switch (op) {
3959      case GGML_OP_REPEAT:
3960          return ggml_vk_op_repeat;
3961      default:
3962          return nullptr;
3963      }
3964  }
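
      // Two-level op dispatch: ggml_vk_op_get_pipeline returns a compute pipeline
      // when a shader exists for the type combination, and ggml_vk_op_get_func
      // supplies a host-side fallback otherwise (currently only GGML_OP_REPEAT,
      // which is implemented with plain buffer copies instead of a shader).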
3965  
3966  static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
3967      switch (op) {
3968      case GGML_OP_CPY:
3969      case GGML_OP_GET_ROWS:
3970      case GGML_OP_ADD:
3971      case GGML_OP_MUL:
3972      case GGML_OP_DIV:
3973      case GGML_OP_SCALE:
3974      case GGML_OP_SQR:
3975      case GGML_OP_CLAMP:
3976          return true;
3977      default:
3978          return false;
3979      }
3980  }
3981  
3982  template<typename PC>
3983  static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
3984      VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3985      if (src1 != nullptr) {
3986          std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3987      }
3988      if (src2 != nullptr) {
3989          std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
3990      }
3991      std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")");
3992      GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type))));  // NOLINT
3993      GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0));  // NOLINT
3994      GGML_ASSERT(dst->extra != nullptr);
3995      const uint64_t ne00 = src0->ne[0];
3996      const uint64_t ne01 = src0->ne[1];
3997      const uint64_t ne02 = src0->ne[2];
3998      const uint64_t ne03 = src0->ne[3];
3999      const uint64_t ne0 = ne00 * ne01;
4000  
4001      const bool use_src1 = src1 != nullptr;
4002      const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
4003      const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
4004      const uint64_t ne12 = use_src1 ? src1->ne[2] : 0;
4005      const uint64_t ne13 = use_src1 ? src1->ne[3] : 0;
4006      const uint64_t ne1 = ne10 * ne11;
4007      // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
4008  
4009      const bool use_src2 = src2 != nullptr;
4010      const uint64_t ne20 = use_src2 ? src2->ne[0] : 0;
4011      const uint64_t ne21 = use_src2 ? src2->ne[1] : 0;
4012      const uint64_t ne22 = use_src2 ? src2->ne[2] : 0;
4013      const uint64_t ne23 = use_src2 ? src2->ne[3] : 0;
4014      const uint64_t ne2 = ne20 * ne21;
4015  
4016      const uint64_t ned0 = dst->ne[0];
4017      const uint64_t ned1 = dst->ne[1];
4018      const uint64_t ned2 = dst->ne[2];
4019      const uint64_t ned3 = dst->ne[3];
4020      const uint64_t ned = ned0 * ned1;
4021  
4022      vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
4023      ggml_vk_func_t op_func;
4024  
4025      if (pipeline == nullptr) {
4026          op_func = ggml_vk_op_get_func(op);
4027          if (op_func == nullptr) {
4028              std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(op) << " for " << ggml_type_name(src0->type);
4029              if (src1 != nullptr) {
4030                  std::cerr << " and " << ggml_type_name(src1->type);
4031              }
4032              std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
4033              GGML_ASSERT(false);
4034          }
4035  
4036          op_func(ctx, subctx, src0, src1, dst);
4037          return;
4038      }
4039  
4040      const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op);
4041  
4042      ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
4043      ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
4044      ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
4045      ggml_tensor_extra_gpu * extra_src2 = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr;
4046  
4047      vk_buffer d_X = nullptr;
4048      size_t x_buf_offset = 0;
4049      vk_buffer d_Y = nullptr;
4050      size_t y_buf_offset = 0;
4051      vk_buffer d_Z = nullptr;
4052      size_t z_buf_offset = 0;
4053  
4054      bool src0_uma = false;
4055      bool src1_uma = false;
4056      bool src2_uma = false;
4057  
4058      if (ctx->device->uma) {
4059          ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset);
4060          src0_uma = d_X != nullptr;
4061          if (use_src1) {
4062              ggml_vk_host_get(ctx, src1->data, d_Y, y_buf_offset);
4063              src1_uma = d_Y != nullptr;
4064          }
4065          if (use_src2) {
4066              ggml_vk_host_get(ctx, src2->data, d_Z, z_buf_offset);
4067              src2_uma = d_Z != nullptr;
4068          }
4069      }
4070  
4071      uint64_t x_sz = ggml_type_size(src0->type) * ne0 / ggml_blck_size(src0->type);  // multiply before dividing so block-quantized sizes don't truncate
4072      uint64_t y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 : 0;
4073      uint64_t z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 : 0;
4074      uint64_t d_sz = ggml_type_size(dst->type) * ned;
4075  
4076      vk_buffer d_D = extra->buffer_gpu.lock();
4077  
4078      // Workaround for tiny tensor inputs on ROPE
4079      if (use_src1 && y_sz > d_D->size) {
4080          y_sz = VK_WHOLE_SIZE;
4081      }
4082  
4083      GGML_ASSERT(d_D != nullptr);
4084      uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
4085      GGML_ASSERT(d_buf_offset == extra->offset + dst->view_offs || op == GGML_OP_CPY);  // NOLINT -- the rounding must be a no-op except for CPY
4086      if (!src0_uma) {
4087          d_X = extra_src0->buffer_gpu.lock();
4088          x_buf_offset = extra_src0->offset + src0->view_offs;
4089          GGML_ASSERT(d_X != nullptr);
4090      }
4091      if (use_src1 && !src1_uma) {
4092          d_Y = extra_src1->buffer_gpu.lock();
4093          y_buf_offset = extra_src1->offset + src1->view_offs;
4094          GGML_ASSERT(d_Y != nullptr);
4095      }
4096      if (use_src2 && !src2_uma) {
4097          d_Z = extra_src2->buffer_gpu.lock();
4098          z_buf_offset = extra_src2->offset + src2->view_offs;
4099          GGML_ASSERT(d_Z != nullptr);
4100      }
4101  
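          // Ops that handle incontiguous tensors bind each tensor's full byte range; if a range
          // would run past the end of its buffer, bind the rest of the buffer via VK_WHOLE_SIZE instead.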
4102      if (op_supports_incontiguous) {
4103          x_sz = ggml_nbytes(src0);
4104          y_sz = use_src1 ? ggml_nbytes(src1) : 0;
4105          z_sz = use_src2 ? ggml_nbytes(src2) : 0;
4106          d_sz = ggml_nbytes(dst);
4107  
4108          if (x_buf_offset + x_sz >= d_X->size) {
4109              x_sz = VK_WHOLE_SIZE;
4110          }
4111          if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
4112              y_sz = VK_WHOLE_SIZE;
4113          }
4114          if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
4115              z_sz = VK_WHOLE_SIZE;
4116          }
4117          if (d_buf_offset + d_sz >= d_D->size) {
4118              d_sz = VK_WHOLE_SIZE;
4119          }
4120      }
4121  
4122      std::array<uint32_t, 3> elements;
4123  
4124      // Single dispatch when the op handles incontiguous tensors itself or all inputs are contiguous
4125      if (op_supports_incontiguous || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
4126          ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);
4127  
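              // Pick a per-op dispatch grid: reductions get one entry per row, ROPE/DIAG_MASK_INF
              // a row-by-column grid, and element-wise ops one entry per element of src0.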
4128          switch (dst->op) {
4129          case GGML_OP_NORM:
4130          case GGML_OP_RMS_NORM:
4131          case GGML_OP_SOFT_MAX:
4132          case GGML_OP_SUM_ROWS:
4133              elements = { (uint32_t)ggml_nrows(src0), 1, 1 };
4134              break;
4135          case GGML_OP_DIAG_MASK_INF:
4136          case GGML_OP_ROPE:
4137              elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
4138              break;
4139          case GGML_OP_GET_ROWS:
4140              elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
4141              break;
4142          case GGML_OP_ARGSORT:
4143              elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
4144              break;
4145          default:
4146              elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
4147              break;
4148          }
4149  
4150          if (!op_supports_incontiguous) {
4151              if (x_sz != VK_WHOLE_SIZE) {
4152                  x_sz *= ne02 * ne03;
4153              }
4154              if (use_src1 && y_sz != VK_WHOLE_SIZE) {
4155                  y_sz *= ne12 * ne13;
4156              }
4157              if (use_src2 && z_sz != VK_WHOLE_SIZE) {
4158                  z_sz *= ne22 * ne23;
4159              }
4160              if (d_sz != VK_WHOLE_SIZE) {
4161                  d_sz *= ned2 * ned3;
4162              }
4163          }
4164  
4165          if (op == GGML_OP_SOFT_MAX) {
4166              // Empty src1 is possible in soft_max, but the shader needs a buffer
4167              vk_subbuffer subbuf_y;
4168              if (use_src1) {
4169                  subbuf_y = { d_Y, y_buf_offset, y_sz };
4170              } else {
4171                  subbuf_y = { d_X, 0, d_X->size };
4172              }
4173  
4174              ggml_vk_sync_buffers(subctx);
4175              ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4176          } else if (op == GGML_OP_ROPE) {
4177              // Empty src2 is possible in rope, but the shader needs a buffer
4178              vk_subbuffer subbuf_z;
4179              if (use_src2) {
4180                  subbuf_z = { d_Z, z_buf_offset, z_sz };
4181              } else {
4182                  subbuf_z = { d_X, 0, d_X->size };
4183              }
4184  
4185              ggml_vk_sync_buffers(subctx);
4186              ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4187          } else if (use_src2) {
4188              ggml_vk_sync_buffers(subctx);
4189              ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4190          } else if (use_src1) {
4191              ggml_vk_sync_buffers(subctx);
4192              ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4193          } else {
4194              ggml_vk_sync_buffers(subctx);
4195              ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4196          }
4197      } else {
4198          GGML_ASSERT(op != GGML_OP_SOFT_MAX);
4199          GGML_ASSERT(op != GGML_OP_ARGSORT);
4200          GGML_ASSERT(!use_src2);
4201  
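              // Fallback for incontiguous inputs: dispatch once per (i02, i03) plane, wrapping the
              // src1 plane index modulo ne12/ne13 to implement broadcasting.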
4202          ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);
4203  
4204          switch (dst->op) {
4205          case GGML_OP_NORM:
4206          case GGML_OP_RMS_NORM:
4207              elements = { (uint32_t)ne01, 1, 1 };
4208              break;
4209          case GGML_OP_DIAG_MASK_INF:
4210          case GGML_OP_ROPE:
4211              elements = { (uint32_t)ne01, (uint32_t)ne00, 1 };
4212              break;
4213          case GGML_OP_GET_ROWS:
4214              elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
4215              break;
4216          default:
4217              elements = { (uint32_t)ne0, 1, 1 };
4218              break;
4219          }
4220  
4221          for (uint64_t i03 = 0; i03 < ne03; i03++) {
4222              for (uint64_t i02 = 0; i02 < ne02; i02++) {
4223                  const uint32_t it_idx0 = (i03 * ne02 + i02);
4224                  const uint32_t it_idx1 = use_src1 ? ((i03 % ne13) * ne12 + (i02 % ne12)) : 0;
4225                  const uint32_t x_offset = x_sz * it_idx0;
4226                  const uint32_t y_offset = y_sz * it_idx1;
4227                  const uint32_t d_offset = d_sz * it_idx0;
4228  
4229                  if (use_src1) {
4230                      ggml_vk_sync_buffers(subctx);
4231                      ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
4232                  } else {
4233                      ggml_vk_sync_buffers(subctx);
4234                      ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
4235                  }
4236              }
4237          }
4238      }
4239  }
4240  
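      // The wrappers below pack the push constants for ggml_vk_op_f32: tensor shapes are passed
      // through as-is, while byte strides (nb) are converted to element strides by dividing by the type size.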
4241  static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4242      ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
4243  }
4244  
4245  static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4246      const uint32_t src0_type_size = ggml_type_size(src0->type);
4247      const uint32_t src1_type_size = ggml_type_size(src1->type);
4248      const uint32_t dst_type_size = ggml_type_size(dst->type);
4249  
4250      ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
4251          (uint32_t)ggml_nelements(src0),
4252          (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4253          (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
4254          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
4255          0,
4256          0.0f, 0.0f,
4257      });
4258  }
4259  
4260  static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4261      const uint32_t src0_type_size = ggml_type_size(src0->type);
4262      const uint32_t src1_type_size = ggml_type_size(src1->type);
4263      const uint32_t dst_type_size = ggml_type_size(dst->type);
4264  
4265      ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ADD, {
4266          (uint32_t)ggml_nelements(src0),
4267          (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4268          (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
4269          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
4270          0,
4271          0.0f, 0.0f,
4272      });
4273  }
4274  
4275  static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4276      const uint32_t src0_type_size = ggml_type_size(src0->type);
4277      const uint32_t src1_type_size = ggml_type_size(src1->type);
4278      const uint32_t dst_type_size = ggml_type_size(dst->type);
4279  
4280      ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_MUL, {
4281          (uint32_t)ggml_nelements(src0),
4282          (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4283          (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
4284          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
4285          0,
4286          0.0f, 0.0f,
4287      });
4288  }
4289  
4290  static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4291      const uint32_t src0_type_size = ggml_type_size(src0->type);
4292      const uint32_t src1_type_size = ggml_type_size(src1->type);
4293      const uint32_t dst_type_size = ggml_type_size(dst->type);
4294  
4295      ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_DIV, {
4296          (uint32_t)ggml_nelements(src0),
4297          (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4298          (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
4299          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
4300          0,
4301          0.0f, 0.0f,
4302      });
4303  }
4304  
4305  static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4306      float * op_params = (float *)dst->op_params;
4307      const uint32_t src0_type_size = ggml_type_size(src0->type);
4308      const uint32_t dst_type_size = ggml_type_size(dst->type);
4309  
4310      ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, {
4311          (uint32_t)ggml_nelements(src0),
4312          (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4313          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
4314          0,
4315          op_params[0], 0.0f
4316      });
4317  }
4318  
4319  static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4320      const uint32_t src0_type_size = ggml_type_size(src0->type);
4321      const uint32_t dst_type_size = ggml_type_size(dst->type);
4322  
4323      ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, {
4324          (uint32_t)ggml_nelements(src0),
4325          (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4326          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
4327          0,
4328          0.0f, 0.0f,
4329      });
4330  }
4331  
4332  static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4333      float * op_params = (float *)dst->op_params;
4334      const uint32_t src0_type_size = ggml_type_size(src0->type);
4335      const uint32_t dst_type_size = ggml_type_size(dst->type);
4336  
4337      ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, {
4338          (uint32_t)ggml_nelements(src0),
4339          (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4340          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
4341          0,
4342          op_params[0], op_params[1],
4343      });
4344  }
4345  
4346  static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4347      ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
4348      const uint32_t src0_type_size = ggml_type_size(src0->type);
4349      const uint32_t dst_type_size = ggml_type_size(dst->type);
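          // ggml_vk_op_f32 aligns the destination offset down to minStorageBufferOffsetAlignment;
          // pass the remainder (in elements) so the shader can add it back on.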
4350      const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
4351  
4352      ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
4353          (uint32_t)ggml_nelements(src0),
4354          (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4355          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
4356          d_offset,
4357          0.0f, 0.0f,
4358      });
4359  }
4360  
4361  static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4362      float * op_params = (float *)dst->op_params;
4363  
4364      ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
4365  }
4366  
4367  static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4368      float * op_params = (float *)dst->op_params;
4369      ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
4370  }
4371  
4372  static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4373      ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
4374  }
4375  
4376  static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4377      int32_t * op_params = (int32_t *)dst->op_params;
4378      ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
4379  }
4380  
4381  static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4382      float * op_params = (float *)dst->op_params;
4383  
4384      float scale = op_params[0];
4385      float max_bias = op_params[1];
4386  
4387      const uint32_t ncols =   (uint32_t)src0->ne[0];
4388      const uint32_t nrows_x = (uint32_t)ggml_nrows(src0);
4389      const uint32_t nrows_y = (uint32_t)src0->ne[1];
4390  
4391      const uint32_t n_head_kv   = nrows_x/nrows_y;
4392      const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
4393  
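          // ALiBi slope bases: the first n_head_log2 heads use powers of m0, the remaining heads
          // interleaved powers of m1, matching the CPU soft_max implementation.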
4394      const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
4395      const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
4396  
4397      ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
4398          ncols,
4399          src1 != nullptr ? nrows_y : (uint32_t)0,
4400          scale, max_bias,
4401          m0, m1,
4402          n_head_log2,
4403      });
4404  }
4405  
4406  static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
4407      const int n_dims        = ((int32_t *) dst->op_params)[1];
4408      // const int mode          = ((int32_t *) dst->op_params)[2];
4409      // const int n_ctx         = ((int32_t *) dst->op_params)[3];
4410      const int n_ctx_orig    = ((int32_t *) dst->op_params)[4];
4411      const float freq_base   = ((float *)   dst->op_params)[5];
4412      const float freq_scale  = ((float *)   dst->op_params)[6];
4413      const float ext_factor  = ((float *)   dst->op_params)[7];
4414      const float attn_factor = ((float *)   dst->op_params)[8];
4415      const float beta_fast   = ((float *)   dst->op_params)[9];
4416      const float beta_slow   = ((float *)   dst->op_params)[10];
4417  
4418      float corr_dims[2];
4419      ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
4420  
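          // Standard RoPE frequency ladder: dimension pair i rotates at freq_base^(-2i/n_dims),
          // so each successive pair scales the angle by a constant factor of theta_scale.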
4421      const float theta_scale = powf(freq_base, -2.0f/n_dims);
4422  
4423      ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
4424          (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
4425          freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
4426          src2 != nullptr,
4427      });
4428  }
4429  
4430  static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4431      int32_t * op_params = (int32_t *)dst->op_params;
4432  
4433      const uint32_t ncols = (uint32_t)src0->ne[0];
4434  
4435      uint32_t ncols_pad = 1;
4436      while (ncols_pad < ncols) {
4437          ncols_pad *= 2;
4438      }
4439  
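          // The shader sorts a power-of-two padded row in shared memory (bitonic-style) with one
          // workgroup per row, which caps the supported row width at 1024 columns.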
4440      GGML_ASSERT(ncols_pad <= 1024);
4441  
4442      ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
4443          ncols,
4444          ncols_pad,
4445          op_params[0],
4446      });
4447  }
4448  
4449  static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4450      ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f });
4451  }
4452  
4453  #ifdef GGML_VULKAN_RUN_TESTS
4454  static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
4455      if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
4456          return;
4457      }
4458      i0 = std::max(i0, 5);
4459      i1 = std::max(i1, 5);
4460      i2 = std::max(i2, 0);
4461      fprintf(stderr, "         ");
4462      for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
4463          fprintf(stderr, "%7d ", idx1);
4464      }
4465      fprintf(stderr, "\n");
4466      for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) {
4467          fprintf(stderr, "%7d: ", idx0);
4468          for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
4469              if (idx0 >= 0 && idx0 < ne0 && idx1 >= 0 && idx1 < ne1) {
4470                  float val;
4471                  if (type == GGML_TYPE_F32) {
4472                      val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0);
4473                  } else if (type == GGML_TYPE_F16) {
4474                      val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
4475                  } else {
4476                      GGML_ASSERT(false);
4477                  }
4478                  fprintf(stderr, "% 7.2f ", val);
4479              } else {
4480                  fprintf(stderr, "        ");
4481              }
4482          }
4483          fprintf(stderr, "\n");
4484      }
4485  }
4486  
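      // Test harness: runs num_it matmuls of the selected pipeline variant on the GPU, then checks
      // the result against a single-threaded ggml CPU reference graph and reports timing and average error.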
4487  template <typename X_TYPE, typename Y_TYPE>
4488  static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) {
4489      VK_LOG_DEBUG("ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")");
4490      const size_t x_ne = m * k * batch;
4491      const size_t y_ne = k * n * batch;
4492      const size_t d_ne = m * n * batch;
4493  
4494      vk_pipeline p;
4495      std::string shname;
4496      if (shader_size == 0) {
4497          if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4498              p = ctx->device->pipeline_matmul_f32->a_s;
4499              shname = "F32_ALIGNED_S";
4500          } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4501              p = ctx->device->pipeline_matmul_f32_f16->a_s;
4502              shname = "F32_F16_ALIGNED_S";
4503          } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4504              p = ctx->device->pipeline_matmul_f16_f32->a_s;
4505              shname = "F16_F32_ALIGNED_S";
4506          } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4507              p = ctx->device->pipeline_matmul_f16->a_s;
4508              shname = "F16_ALIGNED_S";
4509          } else {
4510              GGML_ASSERT(false);
4511          }
4512      } else if (shader_size == 1) {
4513          if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4514              p = ctx->device->pipeline_matmul_f32->a_m;
4515              shname = "F32_ALIGNED_M";
4516          } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4517              p = ctx->device->pipeline_matmul_f32_f16->a_m;
4518              shname = "F32_F16_ALIGNED_M";
4519          } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4520              p = ctx->device->pipeline_matmul_f16_f32->a_m;
4521              shname = "F16_F32_ALIGNED_M";
4522          } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4523              p = ctx->device->pipeline_matmul_f16->a_m;
4524              shname = "F16_ALIGNED_M";
4525          } else {
4526              GGML_ASSERT(false);
4527          }
4528      } else if (shader_size == 2) {
4529          if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4530              p = ctx->device->pipeline_matmul_f32->a_l;
4531              shname = "F32_ALIGNED_L";
4532          } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4533              p = ctx->device->pipeline_matmul_f32_f16->a_l;
4534              shname = "F32_F16_ALIGNED_L";
4535          } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4536              p = ctx->device->pipeline_matmul_f16_f32->a_l;
4537              shname = "F16_F32_ALIGNED_L";
4538          } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4539              p = ctx->device->pipeline_matmul_f16->a_l;
4540              shname = "F16_ALIGNED_L";
4541          } else {
4542              GGML_ASSERT(false);
4543          }
4544      } else {
4545          GGML_ASSERT(false);
4546      }
4547  
4548      const size_t kpad = ggml_vk_align_size(k, p->align);
4549  
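          // The "aligned" pipeline variants require k to be a multiple of the pipeline's alignment;
          // if it is not, fall back to the generic (unaligned) variants below.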
4550      if (k != kpad) {
4551          if (shader_size == 0) {
4552              if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4553                  p = ctx->device->pipeline_matmul_f32->s;
4554                  shname = "F32_S";
4555              } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4556                  p = ctx->device->pipeline_matmul_f32_f16->s;
4557                  shname = "F32_F16_S";
4558              } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4559                  p = ctx->device->pipeline_matmul_f16_f32->s;
4560                  shname = "F16_F32_S";
4561              } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4562                  p = ctx->device->pipeline_matmul_f16->s;
4563                  shname = "F16_S";
4564              }
4565          } else if (shader_size == 1) {
4566              if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4567                  p = ctx->device->pipeline_matmul_f32->m;
4568                  shname = "F32_M";
4569              } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4570                  p = ctx->device->pipeline_matmul_f32_f16->m;
4571                  shname = "F32_F16_M";
4572              } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4573                  p = ctx->device->pipeline_matmul_f16_f32->m;
4574                  shname = "F16_F32_M";
4575              } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4576                  p = ctx->device->pipeline_matmul_f16->m;
4577                  shname = "F16_M";
4578              }
4579          } else if (shader_size == 2) {
4580              if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4581                  p = ctx->device->pipeline_matmul_f32->l;
4582                  shname = "F32_L";
4583              } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4584                  p = ctx->device->pipeline_matmul_f32_f16->l;
4585                  shname = "F32_F16_L";
4586              } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4587                  p = ctx->device->pipeline_matmul_f16_f32->l;
4588                  shname = "F16_F32_L";
4589              } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4590                  p = ctx->device->pipeline_matmul_f16->l;
4591                  shname = "F16_L";
4592              }
4593          }
4594      }
4595  
4596      ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
4597      if (split_k > 1) {
4598          ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
4599  
4600          if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
4601              // Resize buffer
4602              if (ctx->prealloc_split_k != nullptr) {
4603                  ggml_vk_destroy_buffer(ctx->prealloc_split_k);
4604              }
4605              ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
4606          }
4607      }
4608  
4609      vk_buffer d_X = ggml_vk_create_buffer_check(ctx, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
4610      vk_buffer d_Y = ggml_vk_create_buffer_check(ctx, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
4611      vk_buffer d_D = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
4612  
4613      X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne);
4614      Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne);
4615      float* d = (float *) malloc(sizeof(float) * d_ne);
4616  
4617      for (size_t i = 0; i < x_ne; i++) {
4618          if (std::is_same<float, X_TYPE>()) {
4619              x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
4620          } else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
4621              x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
4622          } else {
4623              GGML_ASSERT(false);
4624          }
4625      }
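          // Fill Y with an identity-like pattern (random init left commented out) so the expected
          // product is easy to inspect when a mismatch is printed.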
4626      for (size_t i = 0; i < y_ne; i++) {
4627          if (std::is_same<float, Y_TYPE>()) {
4628              // y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
4629              y[i] = (i % k == i / k) ? 1.0f : 0.0f;
4630          } else if (std::is_same<ggml_fp16_t, Y_TYPE>()) {
4631              // y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
4632              y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f);
4633          } else {
4634              GGML_ASSERT(false);
4635          }
4636      }
4637  
4638      ggml_vk_buffer_write(ctx, d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
4639      ggml_vk_buffer_write(ctx, d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
4640  
4641      vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
4642      for (size_t i = 0; i < num_it; i++) {
4643          ggml_vk_ctx_begin(ctx, subctx);
4644          ggml_vk_matmul(
4645              ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
4646              m, n, k,
4647              k, k, m, k*m, k*n, m*n,
4648              split_k, batch, batch, batch, 1, 1
4649          );
4650          ggml_vk_ctx_end(subctx);
4651      }
4652  
4653      auto begin = std::chrono::high_resolution_clock::now();
4654      ggml_vk_submit(subctx, ctx->fence);
4655      VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences");
4656      ctx->device->device.resetFences({ ctx->fence });
4657  
4658      auto end = std::chrono::high_resolution_clock::now();
4659      double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
4660  
4661      // copy dst to host
4662      ggml_vk_buffer_read(ctx, d_D, 0, d, sizeof(float) * d_ne);
4663  
4664      float * d_chk = (float *) malloc(sizeof(float) * d_ne);
4665  
4666      ggml_init_params iparams = {
4667          /*.mem_size   =*/ 1024*1024*1024,
4668          /*.mem_buffer =*/ NULL,
4669          /*.no_alloc   =*/ true,
4670      };
4671  
4672      ggml_context * ggml_ctx = ggml_init(iparams);
4673  
4674      ggml_type src0_type;
4675      ggml_type src1_type;
4676  
4677      if (std::is_same<float, X_TYPE>()) {
4678          src0_type = GGML_TYPE_F32;
4679      } else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
4680          src0_type = GGML_TYPE_F16;
4681      } else {
4682          GGML_ASSERT(false);
4683      }
4684      if (std::is_same<float, Y_TYPE>()) {
4685          src1_type = GGML_TYPE_F32;
4686      } else if (std::is_same<ggml_fp16_t, Y_TYPE>()) {
4687          src1_type = GGML_TYPE_F16;
4688      } else {
4689          GGML_ASSERT(false);
4690      }
4691  
4692      ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch);
4693      ggml_tensor * src1_ggml = ggml_new_tensor_3d(ggml_ctx, src1_type, k, n, batch);
4694      ggml_tensor * tensor_ggml = ggml_mul_mat(ggml_ctx, src0_ggml, src1_ggml);
4695  
4696      src0_ggml->data = x;
4697      src1_ggml->data = y;
4698      tensor_ggml->data = d_chk;
4699  
4700      ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
4701      ggml_build_forward_expand(cgraph, tensor_ggml);
4702  
4703      ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
4704  
4705      ggml_free(ggml_ctx);
4706  
4707      double avg_err = 0.0;
4708      int first_err_n = -1;
4709      int first_err_m = -1;
4710      int first_err_b = -1;
4711  
4712      for (size_t i = 0; i < m*n*batch; i++) {
4713          double err = std::fabs(d[i] - d_chk[i]);
4714          avg_err += err;
4715  
4716          if (err > 0.05f && first_err_n == -1) {
4717              first_err_b = i / (m * n);
4718              first_err_n = (i % (m * n)) / m;
4719              first_err_m = (i % (m * n)) % m;
4720          }
4721      }
4722  
4723      avg_err /= m * n * batch;
4724  
4725      std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms avg_err=" << avg_err << std::endl;
4726  
4727      if (avg_err > 0.1) {
4728          std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
4729          std::cerr << "Actual result: " << std::endl << std::endl;
4730          ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
4731          std::cerr << std::endl;
4732          ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n + 15, first_err_b);
4733          std::cerr << "Expected result: " << std::endl << std::endl;
4734          ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
4735  
4736          if (split_k > 1) {
4737              float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
4738              ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
4739  
4740              // Print every split_k partial result
4741              for (int sk = 0; sk < split_k; sk++) {
4742                  std::cerr << "d_buf" << sk << ": " << std::endl << std::endl;
4743                  ggml_vk_print_matrix_area(split_k_buf + (size_t)sk * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
4744              }
4751  
4752              free(split_k_buf);
4753          }
4754      }
4755  
4756      free(d_chk);
4757  
4758      ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
4759      ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue);
4760  
4761      ggml_vk_destroy_buffer(d_X);
4762      ggml_vk_destroy_buffer(d_Y);
4763      ggml_vk_destroy_buffer(d_D);
4764  
4765      ggml_pipeline_cleanup(p);
4766      ggml_pipeline_cleanup(ctx->device->pipeline_matmul_split_k_reduce);
4767  
4768      free(x);
4769      free(y);
4770      free(d);
4771  }
4772  
4773  static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
4774      if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
4775          return;
4776      }
4777      i0 = std::max(i0, 5);
4778      i1 = std::max(i1, 5);
4779      i2 = std::max(i2, 0);
4780      i3 = std::max(i3, 0);
4781      fprintf(stderr, "         ");
4782      for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
4783          fprintf(stderr, "%7d ", idx1);
4784      }
4785      fprintf(stderr, "\n");
4786      for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) {
4787          fprintf(stderr, "%7d: ", idx0);
4788          for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
4789              if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) {
4790                  float val;
4791                  if (tensor->type == GGML_TYPE_F32) {
4792                      val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
4793                  } else if (tensor->type == GGML_TYPE_F16) {
4794                      val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
4795                  } else {
4796                      GGML_ASSERT(false);
4797                  }
4798                  fprintf(stderr, "% 7.2f ", val);
4799              } else {
4800                  fprintf(stderr, "        ");
4801              }
4802          }
4803          fprintf(stderr, "\n");
4804      }
4805  }
4806  
4807  static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_t ne1, size_t ne2, size_t ne3) {
4808      const size_t ne = ne0 * ne1 * ne2 * ne3;
4809  
4810      ggml_init_params iparams = {
4811          /*.mem_size   =*/ 1024*1024*1024,
4812          /*.mem_buffer =*/ NULL,
4813          /*.no_alloc   =*/ true,
4814      };
4815  
4816      ggml_context * ggml_ctx = ggml_init(iparams);
4817  
4818      ggml_tensor * tensor = ggml_new_tensor_4d(ggml_ctx, GGML_TYPE_F32, ne0, ne2, ne1, ne3);  // NOLINT
4819      ggml_tensor * result_tensor = ggml_new_tensor_4d(ggml_ctx, GGML_TYPE_F32, ne0, ne1, ne2, ne3);
4820  
4821      float * data = (float *) ggml_vk_host_malloc(ctx, ggml_nbytes(tensor));
4822      tensor->data = data;
4823  
4824      float * result_data = (float *) malloc(ggml_nbytes(tensor));
4825      result_tensor->data = result_data;
4826  
4827      // Permute: swap the strides of dims 1 and 2 so the tensor becomes a non-contiguous view
4828      {
4829          size_t tmp = tensor->nb[2];
4830          tensor->nb[2] = tensor->nb[1];
4831          tensor->nb[1] = tmp;
4832  
4833          tensor->ne[2] = ne2;
4834          tensor->ne[1] = ne1;
4835      }
4836  
4837      for (size_t i = 0; i < ne; i++) {
4838          data[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
4839      }
4840  
4841      vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
4842      ggml_vk_ctx_begin(ctx, subctx);
4843  
4844      vk_buffer buffer = ggml_vk_create_buffer_check(ctx, ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal);
4845  
4846      ggml_vk_h2d_tensor_2d(ctx, subctx, buffer, 0, tensor, 0, 0, ggml_nrows(tensor));
4847  
4848      ggml_vk_ctx_end(subctx);
4849      ggml_vk_submit(subctx, ctx->fence);
4850      VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_h2d_nc waitForFences");
4851      ctx->device->device.resetFences({ ctx->fence });
4852  
4853      ggml_vk_buffer_read(ctx, buffer, 0, result_data, ggml_nbytes(tensor));
4854  
4855      double avg_err = 0.0;
4856      int first_err_i0 = -1;
4857      int first_err_i1 = -1;
4858      int first_err_i2 = -1;
4859      int first_err_i3 = -1;
4860  
4861      for (size_t i3 = 0; i3 < ne3; i3++) {
4862          for (size_t i2 = 0; i2 < ne2; i2++) {
4863              for (size_t i1 = 0; i1 < ne1; i1++) {
4864                  for (size_t i0 = 0; i0 < ne0; i0++) {
4865                      float correct = *(float *) ((char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
4866                      float result = *(float *) ((char *) result_data + i3*ne2*ne1*ne0*sizeof(float) + i2*ne1*ne0*sizeof(float) + i1*ne0*sizeof(float) + i0*sizeof(float));
4867                      double err = std::fabs(result - correct);
4868  
4869                      avg_err += err;
4870  
4871                      if (err > 0.05f && first_err_i0 == -1) {
4872                          first_err_i0 = i0;
4873                          first_err_i1 = i1;
4874                          first_err_i2 = i2;
4875                          first_err_i3 = i3;
4876                      }
4877                  }
4878              }
4879          }
4880      }
4881  
4882      avg_err /= ne;
4883  
4884      std::cerr << "TEST nc copy ne0=" << ne0 << " ne1=" << ne1 << " ne2=" << ne2 << " ne3=" << ne3 << " avg_err=" << avg_err << std::endl;
4885  
4886      if (avg_err > 0.1) {
4887          std::cerr << "i0 = " << first_err_i0 << " i1 = " << first_err_i1 << " i2 = " << first_err_i2 << " i3 = " << first_err_i3 << std::endl;
4888          std::cerr << "Actual result: " << std::endl << std::endl;
4889          ggml_vk_print_tensor_area(result_tensor, first_err_i0, first_err_i1, first_err_i2, first_err_i3);
4890          std::cerr << "Expected result: " << std::endl << std::endl;
4891          ggml_vk_print_tensor_area(tensor, first_err_i0, first_err_i1, first_err_i2, first_err_i3);
4892      }
4893  
4894      ggml_free(ggml_ctx);
4895  
4896      ggml_vk_destroy_buffer(buffer);
4897  
4898      ggml_vk_host_free(ctx, data);
4899      free(result_data);
4900  }
4901  
4902  static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) {
4903      VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ", " << pinned << ")");
4904      // Check transfers are correct
4905      vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
4906  
4907      float * x;
4908      float * y;
4909      if (pinned) {
4910          x = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
4911          y = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
4912      } else {
4913          x = (float *) malloc(sizeof(float) * ne);
4914          y = (float *) malloc(sizeof(float) * ne);
4915      }
4916  
4917      for (size_t i = 0; i < ne; i++) {
4918          x[i] = rand() / (float)RAND_MAX;
4919      }
4920  
4921      vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
4922      ggml_vk_ctx_begin(ctx, subctx);
4923  
4924      auto begin = std::chrono::high_resolution_clock::now();
4925  
4926      ggml_vk_buffer_write_async(ctx, subctx, buffer, 0, x, sizeof(float) * ne);
4927  
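          // Writes from unpinned memory are recorded as deferred host-side copies; flush them into
          // the staging buffer before submitting so the GPU reads valid data.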
4928      for (auto& cpy : subctx->in_memcpys) {
4929          memcpy(cpy.dst, cpy.src, cpy.n);
4930      }
4931      subctx->in_memcpys.clear();
4932  
4933      ggml_vk_ctx_end(subctx);
4934      ggml_vk_submit(subctx, ctx->fence);
4935      VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
4936      ctx->device->device.resetFences({ ctx->fence });
4937  
4938      auto end = std::chrono::high_resolution_clock::now();
4939  
4940      double ms_to_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
4941  
4942      ggml_vk_ctx_begin(ctx, subctx);
4943  
4944      begin = std::chrono::high_resolution_clock::now();
4945  
4946      ggml_vk_buffer_read_async(ctx, subctx, buffer, 0, y, sizeof(float) * ne);
4947  
4948      ggml_vk_ctx_end(subctx);
4949      ggml_vk_submit(subctx, ctx->fence);
4950      VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
4951      ctx->device->device.resetFences({ ctx->fence });
4952  
4953      for (auto& cpy : subctx->out_memcpys) {
4954          memcpy(cpy.dst, cpy.src, cpy.n);
4955      }
4956      subctx->out_memcpys.clear();
4957  
4958      end = std::chrono::high_resolution_clock::now();
4959  
4960      double ms_from_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
4961  
4962      double avg_err = 0.0;
4963      for (size_t i = 0; i < ne; i++) {
4964          avg_err += std::fabs(x[i] - y[i]);
4965      }
4966  
4967      double kb = ne * sizeof(float) / 1024.0;
4968  
4969      std::cerr << "TEST TRANSFER " << kb << " KB to_gpu " << ms_to_gpu << "ms (" << kb / ms_to_gpu * 1000.0 / 1024.0 << " MB/s) from_gpu " << ms_from_gpu << "ms (" << kb / ms_from_gpu * 1000.0 / 1024.0 << " MB/s) avg_err=" << avg_err / ne << std::endl;
4970  
4971      ggml_vk_destroy_buffer(buffer);
4972  
4973      if (pinned) {
4974          ggml_vk_host_free(ctx, x);
4975          ggml_vk_host_free(ctx, y);
4976      } else {
4977          free(x);
4978          free(y);
4979      }
4980  }
4981  
4982  static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
4983      ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
4984  }
4985  
4986  static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
4987      VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ", " << ggml_type_name(quant) << ")");
4988      const size_t x_sz = sizeof(float) * ne;
4989      const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne;
4990      const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
4991      float * x = (float *) malloc(x_sz);
4992      void * qx = malloc(qx_sz);
4993      vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
4994      vk_buffer x_buf = ggml_vk_create_buffer_check(ctx, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
4995      ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16);
4996  
4997      for (size_t i = 0; i < ne; i++) {
4998          x[i] = rand() / (float)RAND_MAX;
4999      }
5000  
5001      vk_pipeline p = ctx->device->pipeline_dequant[quant];
5002  
5003      ggml_vk_quantize_data(x, qx, ne, quant);
5004  
5005      ggml_pipeline_allocate_descriptor_sets(ctx, p, 1);
5006  
5007      ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz);
5008  
5009      vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
5010      ggml_vk_ctx_begin(ctx, subctx);
5011      const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
5012      ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)ne, 1, 1 });
5013      ggml_vk_ctx_end(subctx);
5014  
5015      auto begin = std::chrono::high_resolution_clock::now();
5016  
5017      ggml_vk_submit(subctx, ctx->fence);
5018      VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
5019      ctx->device->device.resetFences({ ctx->fence });
5020  
5021      auto end = std::chrono::high_resolution_clock::now();
5022  
5023      double ms_dequant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
5024      ggml_vk_buffer_read(ctx, x_buf, 0, x_chk, x_sz_f16);
5025  
5026      int first_err = -1;
5027  
5028      double avg_err = 0.0;
5029      for (size_t i = 0; i < ne; i++) {
5030          double error = std::fabs(x[i] - ggml_fp16_to_fp32(x_chk[i]));
5031          avg_err += error;
5032  
5033          if (first_err < 0 && error > 0.05) {
5034              first_err = i;
5035          }
5036      }
5037  
5038      avg_err /= ne;
5039  
5040      std::cerr << "TEST DEQUANT " << ggml_type_name(quant) << " time=" << ms_dequant << "ms avg_err=" << avg_err << std::endl;
5041  
5042      if (avg_err > 0.1) {
5043          std::cerr << "first_error = " << first_err << std::endl;
5044          std::cerr << "Actual result: " << std::endl << std::endl;
5045          for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) {
5046              std::cerr << ggml_fp16_to_fp32(x_chk[i]) << ", ";
5047          }
5048          std::cerr << std::endl << "Expected result: " << std::endl << std::endl;
5049          for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) {
5050              std::cerr << x[i] << ", ";
5051          }
5052          std::cerr << std::endl;
5053      }
5054  
5055      ggml_vk_destroy_buffer(x_buf);
5056      ggml_vk_destroy_buffer(qx_buf);
5057  
5058      free(x);
5059      free(qx);
5060      free(x_chk);
5061  }
5062  
5063  static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant) {
5064      VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ", " << ggml_type_name(quant) << ")");
5065      const size_t x_ne = m * k * batch;
5066      const size_t y_ne = k * n * batch;
5067      const size_t d_ne = m * n * batch;
5068  
5069      vk_pipeline p;
5070      std::string shname;
5071      if (shader_size == 0) {
5072          p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->a_s;
5073          shname = std::string(ggml_type_name(quant)) + "_ALIGNED_S";
5074      } else if (shader_size == 1) {
5075          p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->a_m;
5076          shname = std::string(ggml_type_name(quant)) + "_ALIGNED_M";
5077      } else if (shader_size == 2) {
5078          p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->a_l;
5079          shname = std::string(ggml_type_name(quant)) + "_ALIGNED_L";
5080      } else {
5081          GGML_ASSERT(false);
5082      }
5083  
5084      const size_t kpad = ggml_vk_align_size(k, p->align);
5085  
5086      if (k != kpad) {
5087          if (shader_size == 0) {
5088              p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->s;
5089              shname = std::string(ggml_type_name(quant)) + "_S";
5090          } else if (shader_size == 1) {
5091              p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->m;
5092              shname = std::string(ggml_type_name(quant)) + "_M";
5093          } else if (shader_size == 2) {
5094              p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->l;
5095              shname = std::string(ggml_type_name(quant)) + "_L";
5096          } else {
5097              GGML_ASSERT(false);
5098          }
5099      }
5100  
5101      const size_t x_sz = sizeof(float) * x_ne;
5102      const size_t y_sz = sizeof(float) * y_ne;
5103      const size_t qx_sz = x_ne * ggml_type_size(quant)/ggml_blck_size(quant);
5104      const size_t d_sz = sizeof(float) * d_ne;
5105      float * x = (float *) malloc(x_sz);
5106      float * y = (float *) malloc(y_sz);
5107      void * qx = malloc(qx_sz);
5108      vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
5109      vk_buffer y_buf = ggml_vk_create_buffer_check(ctx, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
5110      vk_buffer d_buf = ggml_vk_create_buffer_check(ctx, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
5111      float * d = (float *) malloc(d_sz);
5112      float * d_chk = (float *) malloc(d_sz);
5113  
5114      for (size_t i = 0; i < x_ne; i++) {
5115          x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
5116      }
5117  
5118      ggml_vk_quantize_data(x, qx, x_ne, quant);
5119  
5120      for (size_t i = 0; i < y_ne; i++) {
5121          // y[i] = rand() / (float)RAND_MAX;
5122          y[i] = (i % k == i / k) ? 1.0f : 0.0f;
5123      }
5124  
5125      ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
5126      if (split_k > 1) {
5127          ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
5128  
5129          if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
5130              // Resize buffer
5131              if (ctx->prealloc_split_k != nullptr) {
5132                  ggml_vk_destroy_buffer(ctx->prealloc_split_k);
5133              }
5134              ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
5135          }
5136      }
5137  
5138      ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz);
5139      ggml_vk_buffer_write(ctx, y_buf, 0, y, y_sz);
5140  
5141      vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
5142      for (size_t i = 0; i < num_it; i++) {
5143          ggml_vk_ctx_begin(ctx, subctx);
5144          ggml_vk_matmul(
5145              ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
5146              m, n, k,
5147              k, k, m, k*m, k*n, m*n,
5148              split_k, batch, batch, batch, 1, 1
5149          );
5150          ggml_vk_ctx_end(subctx);
5151      }
5152  
5153      auto begin = std::chrono::high_resolution_clock::now();
5154  
5155      ggml_vk_submit(subctx, ctx->fence);
5156      VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant_matmul waitForFences");
5157      ctx->device->device.resetFences({ ctx->fence });
5158  
5159      auto end = std::chrono::high_resolution_clock::now();
5160  
5161      double time_ms = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
5162      ggml_vk_buffer_read(ctx, d_buf, 0, d, d_sz);
5163  
    ggml_init_params iparams = {
        /*.mem_size   =*/ 1024*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };

    ggml_context * ggml_ctx = ggml_init(iparams);

    ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, quant, k, m, batch);
    ggml_tensor * src1_ggml = ggml_new_tensor_3d(ggml_ctx, GGML_TYPE_F32, k, n, batch);
    ggml_tensor * tensor_ggml = ggml_mul_mat(ggml_ctx, src0_ggml, src1_ggml);

    src0_ggml->data = qx;
    src1_ggml->data = y;
    tensor_ggml->data = d_chk;

    ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
    ggml_build_forward_expand(cgraph, tensor_ggml);

    ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);

    ggml_free(ggml_ctx);

    double avg_err = 0.0;
    int first_err_n = -1;
    int first_err_m = -1;
    int first_err_b = -1;

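    // d is laid out as batch slices of m x n results with the m dimension contiguous,
    // so a flat index i decomposes as b = i / (m * n), n = (i % (m * n)) / m,
    // m = (i % (m * n)) % m.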
    for (size_t i = 0; i < m*n*batch; i++) {
        double err = std::fabs(d[i] - d_chk[i]);
        avg_err += err;

        if ((err > 0.05f || std::isnan(err)) && first_err_n == -1) {
            first_err_b = i / (m * n);
            first_err_n = (i % (m * n)) / m;
            first_err_m = (i % (m * n)) % m;
        }
    }

    avg_err /= m * n * batch; // the error sum above runs over all m * n * batch elements

    std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl;

    if (avg_err > 0.01 || std::isnan(avg_err)) {
        std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
        std::cerr << "Actual result: " << std::endl << std::endl;
        ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
        std::cerr << std::endl;
        std::cerr << "Expected result: " << std::endl << std::endl;
        ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);

        if (split_k > 1) {
            float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
            ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);

            // Dump every partial result buffer instead of hardcoding split_k == 4
            for (size_t sk = 0; sk < split_k; sk++) {
                std::cerr << "d_buf" << sk << ": " << std::endl << std::endl;
                ggml_vk_print_matrix_area(split_k_buf + sk * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
            }

            free(split_k_buf);
        }
    }

    ggml_vk_destroy_buffer(qx_buf);
    ggml_vk_destroy_buffer(y_buf);
    ggml_vk_destroy_buffer(d_buf);

    free(x);
    free(qx);
    free(y);
    free(d);
    free(d_chk);
}
#endif

static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
    VK_LOG_DEBUG("ggml_vk_tensor_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))");
    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
    extra->reset();
    tensor->extra = extra;
    return extra;
}

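// First graph pass: walk each node and record the worst-case sizes of the temporary
// buffers (dequantized x/y, split_k accumulator, host staging) that running it will
// need, so ggml_vk_preallocate_buffers() can allocate them once up front.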
static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
    VK_LOG_DEBUG("ggml_vk_preallocate_buffers_graph(" << node << ")");
    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;

    if (extra == nullptr) {
        return;
    }

    ggml_tensor * src0 = node->src[0];
    ggml_tensor * src1 = node->src[1];

    const bool use_src0 = src0 != nullptr;
    const int64_t ne00 = use_src0 ? src0->ne[0] : 0;
    const int64_t ne01 = use_src0 ? src0->ne[1] : 0;
    const int64_t ne02 = use_src0 ? src0->ne[2] : 0;
    const int64_t ne03 = use_src0 ? src0->ne[3] : 0;
    const bool use_src1 = src1 != nullptr && node->op != GGML_OP_CPY && node->op != GGML_OP_CONT && node->op != GGML_OP_DUP;
    const int64_t ne10 = use_src1 ? src1->ne[0] : 0;
    const int64_t ne11 = use_src1 ? src1->ne[1] : 0;
    const int64_t ne12 = use_src1 ? src1->ne[2] : 0;
    const int64_t ne13 = use_src1 ? src1->ne[3] : 0;
    const int64_t ne20 = node->ne[0];
    const int64_t ne21 = node->ne[1];
    const int64_t ne22 = node->ne[2];
    const int64_t ne23 = node->ne[3];

    const ggml_type src0_type = (use_src0 && src0->type == GGML_TYPE_F32) ? src0->type : GGML_TYPE_F16;
    const ggml_type src1_type = (use_src1 && src1->type == GGML_TYPE_F32) ? src1->type : GGML_TYPE_F16;

    const bool x_non_contig = use_src0 && !ggml_vk_dim01_contiguous(src0);
    const bool y_non_contig = use_src1 && !ggml_vk_dim01_contiguous(src1);

    const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig;

    bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;

    const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig);
    const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);

    int split_k;
    if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
        split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
    } else {
        split_k = 1;
    }
    const uint32_t x_ne = ne00 * ne01;
    const uint32_t y_ne = ne10 * ne11;
    const uint32_t d_ne = ne20 * ne21;

    // ggml_type_size() gives the element size; sizeof(src0_type) would be the size of the enum itself
    const uint64_t x_sz = (use_src0 && qx_needs_dequant) ? ggml_vk_align_size(ggml_type_size(src0_type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
    const uint64_t y_sz = (use_src1 && qy_needs_dequant) ? ggml_vk_align_size(ggml_type_size(src1_type) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
    uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
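    // The factor 4 below assumes ggml_vk_guess_split_k() never returns more than 4
    // partial result buffers; adjust it if that heuristic ever changes.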
    const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0;

    if (extra->buffer_gpu.expired()) {
        // Workaround for CPU backend BLAS matmul calls
        extra->buffer_gpu = ggml_vk_create_buffer_temp(ctx, d_sz);
    }

    switch (node->op) {
    case GGML_OP_REPEAT:
    case GGML_OP_GET_ROWS:
    case GGML_OP_RESHAPE:
    case GGML_OP_VIEW:
    case GGML_OP_PERMUTE:
    case GGML_OP_TRANSPOSE:
    case GGML_OP_ADD:
    case GGML_OP_SCALE:
    case GGML_OP_SQR:
    case GGML_OP_CLAMP:
    case GGML_OP_CPY:
    case GGML_OP_CONT:
    case GGML_OP_DUP:
    case GGML_OP_MUL:
    case GGML_OP_DIV:
    case GGML_OP_NORM:
    case GGML_OP_RMS_NORM:
    case GGML_OP_DIAG_MASK_INF:
    case GGML_OP_SOFT_MAX:
    case GGML_OP_ROPE:
    case GGML_OP_ARGSORT:
    case GGML_OP_SUM_ROWS:
        break;
    case GGML_OP_UNARY:
        switch (ggml_get_unary_op(node)) {
        case GGML_UNARY_OP_SILU:
        case GGML_UNARY_OP_GELU:
        case GGML_UNARY_OP_RELU:
            break;
        default:
            return;
        }
        break;
    case GGML_OP_MUL_MAT:
    case GGML_OP_MUL_MAT_ID:
        if (ctx->prealloc_size_x < x_sz) {
            ctx->prealloc_size_x = x_sz;
        }
        if (ctx->prealloc_size_y < y_sz) {
            ctx->prealloc_size_y = y_sz;
        }
        if (ctx->prealloc_size_split_k < split_k_size) {
            ctx->prealloc_size_split_k = split_k_size;
        }
        if (ctx->staging_size < x_sz + y_sz) {
            ctx->staging_size = x_sz + y_sz;
        }
        break;
    default:
        return;
    }
}

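// Second pass: grow the persistent scratch buffers to the sizes recorded above.
// With GGML_VULKAN_RUN_TESTS defined this instead runs the self-test suite and aborts.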
static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
#if defined(GGML_VULKAN_RUN_TESTS)
    ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
    ggml_vk_test_transfer(ctx, 8192 * 1000, false);
    ggml_vk_test_transfer(ctx, 8192 * 1000, true);

    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0);
    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1);
    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_0);
    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_1);
    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q8_0);
    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q2_K);
    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q3_K);
    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K);
    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K);
    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K);

    ggml_vk_test_matmul<ggml_fp16_t, ggml_fp16_t>(ctx, 512, 512, 100, 32, 100, 1, 2);

    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 0);
    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 1);
    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 2);
    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0);
    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1);
    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q8_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);

    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);

    std::cerr << std::endl;

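    // m, n, k triplets; the larger shapes appear to match typical LLaMA layer
    // dimensions (4096/11008 hidden/FFN sizes, 32000 vocabulary).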
    const std::vector<size_t> vals {
        8, 8, 8,
        100, 46, 576,
        623, 111, 128,
        100, 46, 558,
        512, 1, 256,
        128, 110, 622,
        511, 511, 127,
        511, 511, 7,
        511, 511, 17,
        49, 49, 128,
        128, 49, 49,
        4096, 49, 4096,
        11008, 49, 4096,
        4096, 49, 11008,
        32000, 49, 4096,
        512, 512, 128,
        128, 512, 512,
        4096, 512, 4096,
        11008, 512, 4096,
        4096, 512, 11008,
        32000, 512, 4096,
    };
    const size_t num_it = 1;
    for (size_t i = 0; i < vals.size(); i += 3) {
        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0);
        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1);
        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2);
        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
        std::cerr << std::endl;
    }

    GGML_ASSERT(false);
#endif

    if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")");
        // Resize buffer
        if (ctx->prealloc_x != nullptr) {
            ggml_vk_destroy_buffer(ctx->prealloc_x);
        }
        ctx->prealloc_x = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_x);
    }
    if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) {
        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(y_size: " << ctx->prealloc_size_y << ")");
        // Resize buffer
        if (ctx->prealloc_y != nullptr) {
            ggml_vk_destroy_buffer(ctx->prealloc_y);
        }
        ctx->prealloc_y = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_y);
    }
    if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) {
        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")");
        // Resize buffer
        if (ctx->prealloc_split_k != nullptr) {
            ggml_vk_destroy_buffer(ctx->prealloc_split_k);
        }
        ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_split_k);
    }
    if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) {
        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(staging_size: " << ctx->staging_size << ")");
        // Resize buffer
        if (ctx->staging != nullptr) {
            ggml_vk_destroy_buffer(ctx->staging);
        }
        ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
    }
}

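// Record the command(s) for one graph node into the current compute context.
// last_node closes the context and marks the node whose completion the execution
// pass must wait on before reading results back.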
static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;

    if (ggml_is_empty(node) || extra == nullptr) {
        return;
    }

    VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
    ctx->semaphore_idx = 0;
    ctx->staging_offset = 0;

    const ggml_tensor * src0 = node->src[0];
    const ggml_tensor * src1 = node->src[1];
    const ggml_tensor * src2 = node->src[2];

    switch (node->op) {
    // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
    case GGML_OP_RESHAPE:
    case GGML_OP_VIEW:
    case GGML_OP_PERMUTE:
    case GGML_OP_TRANSPOSE:
    case GGML_OP_NONE:
        return;
    case GGML_OP_UNARY:
        switch (ggml_get_unary_op(node)) {
        case GGML_UNARY_OP_SILU:
        case GGML_UNARY_OP_GELU:
        case GGML_UNARY_OP_RELU:
            break;
        default:
            return;
        }
        break;
    case GGML_OP_REPEAT:
    case GGML_OP_GET_ROWS:
    case GGML_OP_ADD:
    case GGML_OP_MUL:
    case GGML_OP_DIV:
    case GGML_OP_SCALE:
    case GGML_OP_SQR:
    case GGML_OP_CLAMP:
    case GGML_OP_CPY:
    case GGML_OP_CONT:
    case GGML_OP_DUP:
    case GGML_OP_NORM:
    case GGML_OP_RMS_NORM:
    case GGML_OP_DIAG_MASK_INF:
    case GGML_OP_SOFT_MAX:
    case GGML_OP_ROPE:
    case GGML_OP_MUL_MAT:
    case GGML_OP_MUL_MAT_ID:
    case GGML_OP_ARGSORT:
    case GGML_OP_SUM_ROWS:
        break;
    default:
        std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
        GGML_ASSERT(false);
        return;
    }

    if (ctx->compute_ctx == nullptr) {
        ctx->compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
        ggml_vk_ctx_begin(ctx, ctx->compute_ctx);
    }

    switch (node->op) {
    case GGML_OP_REPEAT:
        ggml_vk_repeat(ctx, ctx->compute_ctx, src0, src1, node);

        break;
    case GGML_OP_GET_ROWS:
        ggml_vk_get_rows(ctx, ctx->compute_ctx, src0, src1, node);

        break;
    case GGML_OP_ADD:
        ggml_vk_add(ctx, ctx->compute_ctx, src0, src1, node);

        break;
    case GGML_OP_MUL:
        ggml_vk_mul(ctx, ctx->compute_ctx, src0, src1, node);

        break;
    case GGML_OP_DIV:
        ggml_vk_div(ctx, ctx->compute_ctx, src0, src1, node);

        break;
    case GGML_OP_SCALE:
        ggml_vk_scale(ctx, ctx->compute_ctx, src0, node);

        break;
    case GGML_OP_SQR:
        ggml_vk_sqr(ctx, ctx->compute_ctx, src0, node);

        break;
    case GGML_OP_CLAMP:
        ggml_vk_clamp(ctx, ctx->compute_ctx, src0, node);

        break;
    case GGML_OP_CPY:
    case GGML_OP_CONT:
    case GGML_OP_DUP:
        ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);

        break;
    case GGML_OP_NORM:
        ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);

        break;
    case GGML_OP_RMS_NORM:
        ggml_vk_rms_norm(ctx, ctx->compute_ctx, src0, node);

        break;
    case GGML_OP_UNARY:
        switch (ggml_get_unary_op(node)) {
        case GGML_UNARY_OP_SILU:
        case GGML_UNARY_OP_GELU:
        case GGML_UNARY_OP_RELU:
            ggml_vk_unary(ctx, ctx->compute_ctx, src0, node);
            break;
        default:
            return;
        }
        break;
    case GGML_OP_DIAG_MASK_INF:
        ggml_vk_diag_mask_inf(ctx, ctx->compute_ctx, src0, node);

        break;
    case GGML_OP_SOFT_MAX:
        ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node);

        break;
    case GGML_OP_ROPE:
        ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node);

        break;
    case GGML_OP_ARGSORT:
        ggml_vk_argsort(ctx, ctx->compute_ctx, src0, node);

        break;
    case GGML_OP_SUM_ROWS:
        ggml_vk_sum_rows(ctx, ctx->compute_ctx, src0, node);

        break;
    case GGML_OP_MUL_MAT:
        ggml_vk_mul_mat(ctx, ctx->compute_ctx, src0, src1, node);

        break;
    case GGML_OP_MUL_MAT_ID:
        ggml_vk_mul_mat_id(ctx, ctx->compute_ctx, src0, src1, src2, node);

        break;
    default:
        return;
    }

    extra->ctx_idx = ctx->compute_ctx->idx;

#ifdef GGML_VULKAN_CHECK_RESULTS
    // Force context reset on each node so that each tensor ends up in its own context
    // and can be run and compared to its CPU equivalent separately
    last_node = true;
#endif

    if (last_node) {
        ggml_vk_ctx_end(ctx->compute_ctx);
        ctx->compute_ctx->exit_tensor = node;
        ctx->compute_ctx = nullptr;
    }
}

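// Execute the context a node belongs to: submit it on first encounter and, once the
// exit tensor is reached, wait for the fence and run the queued staging copies.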
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
    ggml_tensor_extra_gpu * extra = nullptr;

    switch (tensor->op) {
    case GGML_OP_ADD:
    case GGML_OP_GET_ROWS:
    case GGML_OP_MUL:
    case GGML_OP_DIV:
    case GGML_OP_SCALE:
    case GGML_OP_SQR:
    case GGML_OP_CLAMP:
    case GGML_OP_CPY:
    case GGML_OP_CONT:
    case GGML_OP_DUP:
    case GGML_OP_NORM:
    case GGML_OP_RMS_NORM:
    case GGML_OP_DIAG_MASK_INF:
    case GGML_OP_SOFT_MAX:
    case GGML_OP_ROPE:
    case GGML_OP_RESHAPE:
    case GGML_OP_VIEW:
    case GGML_OP_PERMUTE:
    case GGML_OP_TRANSPOSE:
    case GGML_OP_NONE:
    case GGML_OP_ARGSORT:
    case GGML_OP_SUM_ROWS:
        extra = (ggml_tensor_extra_gpu *) tensor->extra;

        break;
    case GGML_OP_UNARY:
        switch (ggml_get_unary_op(tensor)) {
        case GGML_UNARY_OP_SILU:
        case GGML_UNARY_OP_GELU:
        case GGML_UNARY_OP_RELU:
            extra = (ggml_tensor_extra_gpu *) tensor->extra;
            break;
        default:
            return false;
        }
        break;
    case GGML_OP_MUL_MAT:
    case GGML_OP_MUL_MAT_ID:
        extra = (ggml_tensor_extra_gpu *) tensor->extra;

        break;
    default:
        return false;
    }

    if (extra == nullptr) {
        return false;
    }

    if (params->ith != 0) {
        return true;
    }
    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
        return true;
    }

    VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");

#ifdef GGML_VULKAN_CHECK_RESULTS
    ggml_vk_check_results_0(ctx, params, tensor);
#endif

    vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];

    // Only run if ctx hasn't been submitted yet
    if (!subctx.seqs.empty()) {
        // Do staging buffer copies
        for (auto& cpy : subctx.in_memcpys) {
            memcpy(cpy.dst, cpy.src, cpy.n);
        }

        ggml_vk_submit(&subctx, ctx->fence);
    }

    if (tensor == subctx.exit_tensor) {
        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
        ctx->device->device.resetFences({ ctx->fence });

        // Do staging buffer copies
        for (auto& cpy : subctx.out_memcpys) {
            memcpy(cpy.dst, cpy.src, cpy.n);
        }
        subctx.in_memcpys.clear();
        subctx.out_memcpys.clear();
    }

    return true;
}

// Clean up after graph processing is done
static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
    VK_LOG_DEBUG("ggml_vk_graph_cleanup()");
    for (auto& buffer : ctx->gc.temp_buffers) {
        ggml_vk_pool_free(ctx, buffer);
    }
    ctx->gc.temp_buffers.clear();

    for (auto& pipeline : ctx->device->pipelines) {
        if (pipeline.expired()) {
            continue;
        }

        vk_pipeline pl = pipeline.lock();
        ggml_pipeline_cleanup(pl);
    }

    ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue);
    ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);

    for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
        ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
    }
    ctx->gc.semaphores.clear();

    for (size_t i = 0; i < ctx->gc.tl_semaphores.size(); i++) {
        ctx->device->device.destroySemaphore({ ctx->gc.tl_semaphores[i].s });
    }
    ctx->gc.tl_semaphores.clear();
    ctx->semaphore_idx = 0;

    ctx->event_idx = 0;

    for (auto& event : ctx->gc.events) {
        ctx->device->device.resetEvent(event);
    }

    ctx->staging_offset = 0;

    ctx->compute_ctx = nullptr;
    ctx->transfer_ctx = nullptr;
    ctx->gc.contexts.clear();
}

// Clean up on backend free
static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
    VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->idx << ")");
    ggml_vk_graph_cleanup(ctx);

    ggml_vk_destroy_buffer(ctx->prealloc_x);
    ggml_vk_destroy_buffer(ctx->prealloc_y);
    ggml_vk_destroy_buffer(ctx->prealloc_split_k);
    ggml_vk_destroy_buffer(ctx->staging);
    ggml_vk_destroy_buffer(ctx->sync_staging);

    for (auto& buffer : ctx->buffer_pool) {
        ggml_vk_destroy_buffer(buffer);
    }

    ctx->prealloc_size_x = 0;
    ctx->prealloc_size_y = 0;
    ctx->prealloc_size_split_k = 0;
    ctx->staging_size = 0;

    for (auto& event : ctx->gc.events) {
        ctx->device->device.destroyEvent(event);
    }
    ctx->gc.events.clear();

    ctx->device->device.destroyFence(ctx->fence);
}

GGML_CALL static int ggml_vk_get_device_count() {
    ggml_vk_instance_init();

    return vk_instance.device_indices.size();
}

GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
    ggml_vk_instance_init();

    std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();

    vk::PhysicalDeviceProperties props;
    devices[device].getProperties(&props);

    snprintf(description, description_size, "%s", props.deviceName.data());
}

// backend interface

#define UNUSED GGML_UNUSED

// device backend

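// Tensor "data" pointers handed out by this buffer type are not real host pointers:
// they are byte offsets into the device buffer, encoded relative to this dummy
// non-null base address.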
static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000;  // NOLINT

struct ggml_backend_vk_buffer_context {
    ggml_backend_vk_context * ctx;
    vk_buffer dev_buffer;
    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
    size_t temp_tensor_extra_index = 0;
    std::string name;

    ggml_backend_vk_buffer_context(ggml_backend_vk_context * ctx, vk_buffer&& dev_buffer, std::string& name) :
        ctx(ctx),
        dev_buffer(dev_buffer),
        name(name) {
    }

    ~ggml_backend_vk_buffer_context() {
        ggml_vk_destroy_buffer(dev_buffer);
        if (temp_tensor_extras != nullptr) {
            delete[] temp_tensor_extras;
        }
    }

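    // Hands out extras from a fixed ring of GGML_VK_MAX_NODES entries; slots are
    // reused round-robin, so an extra is only valid for the current graph.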
    ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
        if (temp_tensor_extras == nullptr) {
            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_VK_MAX_NODES];
        }

        size_t alloc_index = temp_tensor_extra_index;
        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_VK_MAX_NODES;
        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
        extra->reset();

        return extra;
    }
};

GGML_CALL static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
    return ctx->name.c_str();
}

GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
    return buffer->iface.get_name == ggml_backend_vk_buffer_get_name;
}

GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()");
    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
    ggml_vk_destroy_buffer(ctx->dev_buffer);
    delete ctx;
}

GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
    return vk_ptr_base;

    UNUSED(buffer);
}

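// Views reuse the extra of their source tensor; standalone tensors get a fresh extra
// holding the device buffer and the byte offset recovered from the fake base pointer.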
GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

    if (tensor->view_src != nullptr) {
        GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
        GGML_ASSERT(tensor->view_src->extra != nullptr);
        tensor->extra = tensor->view_src->extra;
    } else {
        ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
        extra->buffer_gpu = ctx->dev_buffer;
        extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
        tensor->extra = extra;
    }
}

GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

    vk_buffer buf = extra->buffer_gpu.lock();

    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
}

GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

    vk_buffer buf = extra->buffer_gpu.lock();

    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
}

GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
    if (ggml_backend_buffer_is_vk(src->buffer)) {
        ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
        ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

        vk_buffer src_buf = src_extra->buffer_gpu.lock();
        vk_buffer dst_buf = dst_extra->buffer_gpu.lock();

        ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));

        return true;
    }
    return false;

    UNUSED(buffer);
}

GGML_CALL static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

    ggml_vk_buffer_memset(ctx->ctx, ctx->dev_buffer, 0, value, buffer->size);
}

static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
    /* .get_name        = */ ggml_backend_vk_buffer_get_name,
    /* .free_buffer     = */ ggml_backend_vk_buffer_free_buffer,
    /* .get_base        = */ ggml_backend_vk_buffer_get_base,
    /* .init_tensor     = */ ggml_backend_vk_buffer_init_tensor,
    /* .set_tensor      = */ ggml_backend_vk_buffer_set_tensor,
    /* .get_tensor      = */ ggml_backend_vk_buffer_get_tensor,
    /* .cpy_tensor      = */ ggml_backend_vk_buffer_cpy_tensor,
    /* .clear           = */ ggml_backend_vk_buffer_clear,
    /* .reset           = */ NULL,
};

// vk buffer type
struct ggml_backend_vk_buffer_type_context {
    std::string name;
    ggml_backend_vk_context * ctx;
};

GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) {
    ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *)buft->context;

    return ctx->name.c_str();
}

GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")");
    ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;

    vk_buffer dev_buffer = nullptr;
    try {
        dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
    } catch (const vk::SystemError& e) {
        return nullptr;
    }

    ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);

    return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size);
}

GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
    return ctx->ctx->device->properties.limits.minStorageBufferOffsetAlignment;
}

GGML_CALL static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
    ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
    return ctx->ctx->device->max_memory_allocation_size;
}

GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
    return ggml_nbytes(tensor);

    UNUSED(buft);
}

static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
    /* .get_name         = */ ggml_backend_vk_buffer_type_name,
    /* .alloc_buffer     = */ ggml_backend_vk_buffer_type_alloc_buffer,
    /* .get_alignment    = */ ggml_backend_vk_buffer_type_get_alignment,
    /* .get_max_size     = */ ggml_backend_vk_buffer_type_get_max_size,
    /* .get_alloc_size   = */ ggml_backend_vk_buffer_type_get_alloc_size,
    /* .is_host          = */ NULL,
};

GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
    ggml_vk_instance_init();

    VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")");

    GGML_ASSERT(dev_num < vk_instance.device_indices.size());

    ggml_backend_vk_init(dev_num);

    return &vk_instance.buffer_types[dev_num];
}

// host buffer type

GGML_CALL static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
    return GGML_VK_NAME "_Host";

    UNUSED(buft);
}

GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) {
    return GGML_VK_NAME "_Host";

    UNUSED(buffer);
}

GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
    ggml_vk_host_free(&vk_instance.contexts[0], buffer->context);
}

GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")");
    size += 32;  // Behave like the CPU buffer type
    void * ptr = nullptr;
    try {
        ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size);
    } catch (vk::SystemError& e) {
        std::cerr << "ggml_vulkan: Failed to allocate pinned memory." << std::endl;
        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
        // fallback to cpu buffer
        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
    }

    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
    buffer->buft = buft;
    buffer->iface.get_name = ggml_backend_vk_host_buffer_name;
    buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer;

    return buffer;
}

GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return vk_instance.contexts[0].device->properties.limits.minMemoryMapAlignment;

    UNUSED(buft);
}

GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
    static struct ggml_backend_buffer_type ggml_backend_vk_buffer_type_host = {
        /* .iface    = */ {
            /* .get_name         = */ ggml_backend_vk_host_buffer_type_name,
            /* .alloc_buffer     = */ ggml_backend_vk_host_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_vk_host_buffer_type_get_alignment,
            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
        },
        /* .context  = */ nullptr,
    };

    if (!vk_instance.contexts[0].initialized) {
        // Fall back to CPU
        return ggml_backend_cpu_buffer_type();
    }

    return &ggml_backend_vk_buffer_type_host;
}

// backend

GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) {
    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

    return ctx->name.c_str();
}

GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
    VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")");

    size_t idx = ctx->idx;

    ggml_vk_cleanup(ctx);

    ctx->device.reset();
    ctx->initialized = false;

    vk_instance.initialized[idx] = false;
    vk_instance.backends[idx] = nullptr;
    memset(&vk_instance.buffer_types[idx], 0, sizeof(ggml_backend_buffer_type));
    delete backend;
}

GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) {
    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

    GGML_ASSERT(ctx->initialized);

    return ggml_backend_vk_buffer_type(ctx->idx);
}

GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
    GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");

    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

    if (ctx->transfer_ctx == nullptr) {
        // Initialize new transfer context
        ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
        ggml_vk_ctx_begin(ctx, ctx->transfer_ctx);
    }

    vk_buffer buf = extra->buffer_gpu.lock();

    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
}

GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
    GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");

    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

    if (ctx->transfer_ctx == nullptr) {
        // Initialize new transfer context
        ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
        ggml_vk_ctx_begin(ctx, ctx->transfer_ctx);
    }

    vk_buffer buf = extra->buffer_gpu.lock();

    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
}

GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
    VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
    if ((dst->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
        ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
        ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

        if (ctx->transfer_ctx == nullptr) {
            // Initialize new transfer context
            ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
            ggml_vk_ctx_begin(ctx, ctx->transfer_ctx);
        }

        vk_buffer src_buf = src_extra->buffer_gpu.lock();
        vk_buffer dst_buf = dst_extra->buffer_gpu.lock();

        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
        return true;
    }

    return false;
}

GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
    VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
    if (ctx->transfer_ctx == nullptr) {
        return;
    }

    ggml_vk_ctx_end(ctx->transfer_ctx);

    for (auto& cpy : ctx->transfer_ctx->in_memcpys) {
        memcpy(cpy.dst, cpy.src, cpy.n);
    }

    ggml_vk_submit(ctx->transfer_ctx, ctx->fence);
    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
    ctx->device->device.resetFences({ ctx->fence });

    for (auto& cpy : ctx->transfer_ctx->out_memcpys) {
        memcpy(cpy.dst, cpy.src, cpy.n);
    }

    ctx->transfer_ctx = nullptr;
}

static bool ggml_vk_is_empty(ggml_tensor * node) {
    return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
}

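// Graph execution happens in three passes: size the scratch buffers, record command
// buffers per node, then submit each context and wait on its exit tensor.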
GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
    VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_vk_preallocate_buffers_graph(ctx, cgraph->nodes[i]);
    }
    ggml_vk_preallocate_buffers(ctx);

    int last_node = cgraph->n_nodes - 1;

    // If the last op in the cgraph isn't run on the GPU, the final command buffer never gets closed properly, so back up to the last node the GPU actually executes
    while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) {
        last_node -= 1;
    }

    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_vk_build_graph(ctx, cgraph->nodes[i], i == last_node);
    }

    ggml_compute_params params = {};
    params.type = GGML_TASK_TYPE_COMPUTE;
    params.ith = 0;
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];

        if (ggml_vk_is_empty(node)) {
            continue;
        }

        bool ok = ggml_vk_compute_forward(ctx, &params, node);
        if (!ok) {
            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
        }
#ifdef GGML_VULKAN_CHECK_RESULTS
        else {
            ggml_vk_check_results_1(ctx, &params, node);
        }
#endif
        GGML_ASSERT(ok);
    }

    ggml_vk_graph_cleanup(ctx);

    return GGML_STATUS_SUCCESS;

    UNUSED(backend);
}

GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
    // ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;

    switch (op->op) {
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_RELU:
                    return ggml_is_contiguous(op->src[0]);
                default:
                    return false;
            }
            break;
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
            {
                switch (op->src[0]->type) {
                    case GGML_TYPE_F32:
                    case GGML_TYPE_F16:
                    case GGML_TYPE_Q4_0:
                    case GGML_TYPE_Q4_1:
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_Q8_0:
                    case GGML_TYPE_Q2_K:
                    case GGML_TYPE_Q3_K:
                    case GGML_TYPE_Q4_K:
                    case GGML_TYPE_Q5_K:
                    case GGML_TYPE_Q6_K:
                        break;
                    default:
                        return false;
                }
                struct ggml_tensor * a;
                struct ggml_tensor * b;
                if (op->op == GGML_OP_MUL_MAT) {
                    a = op->src[0];
                    b = op->src[1];
                } else {
                    a = op->src[2];
                    b = op->src[1];
                }
                if (a->ne[3] != b->ne[3]) {
                    return false;
                }
                return true;
            } break;
        case GGML_OP_GET_ROWS:
            {
                switch (op->src[0]->type) {
                    case GGML_TYPE_F32:
                    case GGML_TYPE_F16:
                    case GGML_TYPE_Q4_0:
                    case GGML_TYPE_Q4_1:
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_Q8_0:
                        return true;
                    default:
                        return false;
                }
            } break;
        case GGML_OP_CPY:
        case GGML_OP_DUP:
            {
                ggml_type src0_type = op->src[0]->type;
                ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
                    return true;
                }
                return false;
            } break;
        // case GGML_OP_REPEAT:
        //     {
        //         ggml_type src0_type = op->src[0]->type;
        //         return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
        //     } break;
        case GGML_OP_ROPE:
            return ggml_is_contiguous(op->src[0]);
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_NORM:
        case GGML_OP_ADD:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_RMS_NORM:
        case GGML_OP_SCALE:
        case GGML_OP_SQR:
        case GGML_OP_CLAMP:
        case GGML_OP_CONT:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_ARGSORT:
        case GGML_OP_SUM_ROWS:
            return true;
        default:
            return false;
    }

    UNUSED(backend);
}

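// Heuristic: only offload ops whose batch dimension is large enough to amortize the
// host<->device transfer cost.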
GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
    const int min_batch_size = 32;

    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);

    UNUSED(backend);
}

GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
        return false;
    }

    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

    return buft_ctx->ctx->idx == ctx->idx;
}

// TODO: enable async and synchronize
static ggml_backend_i ggml_backend_vk_interface = {
    /* .get_name                = */ ggml_backend_vk_name,
    /* .free                    = */ ggml_backend_vk_free,
    /* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
    /* .set_tensor_async        = */ NULL,  // ggml_backend_vk_set_tensor_async,
    /* .get_tensor_async        = */ NULL,  // ggml_backend_vk_get_tensor_async,
    /* .cpy_tensor_async        = */ NULL,  // ggml_backend_vk_cpy_tensor_async,
    /* .synchronize             = */ NULL,  // ggml_backend_vk_synchronize,
    /* .graph_plan_create       = */ NULL,
    /* .graph_plan_free         = */ NULL,
    /* .graph_plan_update       = */ NULL,
    /* .graph_plan_compute      = */ NULL,
    /* .graph_compute           = */ ggml_backend_vk_graph_compute,
    /* .supports_op             = */ ggml_backend_vk_supports_op,
    /* .supports_buft           = */ ggml_backend_vk_supports_buft,
    /* .offload_op              = */ ggml_backend_vk_offload_op,
    /* .event_new               = */ NULL,
    /* .event_free              = */ NULL,
    /* .event_record            = */ NULL,
    /* .event_wait              = */ NULL,
    /* .event_synchronize       = */ NULL,
};

static ggml_guid_t ggml_backend_vk_guid() {
    static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
    return &guid;
}

6493  GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
6494      if (vk_instance.initialized[dev_num]) {
6495          return vk_instance.backends[dev_num];
6496      }
6497      VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
6498  
6499      ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
6500      ggml_vk_init(ctx, dev_num);
6501      ctx->name = GGML_VK_NAME + std::to_string(dev_num);
6502      vk_instance.buffer_types[dev_num] = {
6503          /* .iface    = */ ggml_backend_vk_buffer_type_interface,
6504          /* .context  = */ new ggml_backend_vk_buffer_type_context{ ctx->name, ctx },
6505      };
6506      vk_instance.initialized[dev_num] = true;
6507  
6508      ggml_backend_t vk_backend = new ggml_backend {
6509          /* .guid      = */ ggml_backend_vk_guid(),
6510          /* .interface = */ ggml_backend_vk_interface,
6511          /* .context   = */ &vk_instance.contexts[ctx->idx],
6512      };
6513  
6514      vk_instance.backends[dev_num] = vk_backend;
6515  
6516      return vk_backend;
6517  }
6518  
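// Usage sketch (illustrative only, not part of this file): create the backend
// for device 0 and query its memory through the public helpers in this section.
//
//     ggml_backend_t backend = ggml_backend_vk_init(0);
//     if (ggml_backend_is_vk(backend)) {
//         size_t free, total;
//         ggml_backend_vk_get_device_memory(0, &free, &total);
//     }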
GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) {
    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
}

GGML_CALL int ggml_backend_vk_get_device_count() {
    return ggml_vk_get_device_count();
}

GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
    ggml_vk_get_device_description(device, description, description_size);
}

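// Reports the first device-local heap's size as both total and free memory:
// core Vulkan has no portable free-memory query (that would need an extension
// such as VK_EXT_memory_budget), so "free" is only an upper bound.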
GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
    GGML_ASSERT(device < (int) vk_instance.device_indices.size());

    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];

    vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();

    for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
        if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
            *total = heap.size;
            *free = heap.size;
            break;
        }
    }
}

// backend registry
GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, void * user_data) {
    ggml_backend_t vk_backend = ggml_backend_vk_init((int) (intptr_t) user_data);
    return vk_backend;

    UNUSED(params);
}

extern "C" GGML_CALL int ggml_backend_vk_reg_devices();

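// Register one backend per detected Vulkan device with the ggml backend
// registry; names are formed from GGML_VK_NAME plus the device index.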
GGML_CALL int ggml_backend_vk_reg_devices() {
    ggml_vk_instance_init();

    for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
        char name[128];
        snprintf(name, sizeof(name), "%s%zu", GGML_VK_NAME, i);
        ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i);  // NOLINT
    }
    return vk_instance.device_indices.size();
}

// Extension availability
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
#ifdef GGML_VULKAN_VALIDATE
    // Check for the validation features extension needed to configure the validation layer
    for (const auto& properties : instance_extensions) {
        if (strcmp("VK_EXT_validation_features", properties.extensionName) == 0) {
            return true;
        }
    }
    std::cerr << "ggml_vulkan: WARNING: Instance extension VK_EXT_validation_features not found." << std::endl;
#endif
    return false;

    UNUSED(instance_extensions);
}
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
#ifdef __APPLE__
    // Check for the portability enumeration extension for MoltenVK support
    for (const auto& properties : instance_extensions) {
        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
            return true;
        }
    }
    std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
#endif
    return false;

    UNUSED(instance_extensions);
}

// checks

#ifdef GGML_VULKAN_CHECK_RESULTS
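// Debug helper: print the tree of ops feeding `tensor`, indented by depth
// (capped at 10 levels), marking whether each tensor has a GPU extra.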
static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<const ggml_tensor *>& done, int level = 0) {
    if (std::find(done.begin(), done.end(), tensor) != done.end() || level > 10) {
        return;
    }
    for (int j = 0; j < level; j++) {
        std::cerr << " ";
    }
    std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl;

    done.push_back(tensor);

    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (tensor->src[i] != nullptr) {
            ggml_vk_print_graph_origin(tensor->src[i], done, level + 1);
        }
    }
}

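// Debug helper: print a 10x10 window of the (i2, i3) slice around element
// (i0, i1); the window is clamped so it never starts below index 0, and
// out-of-range cells are left blank.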
static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
    if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16 && tensor->type != GGML_TYPE_I32) {
        return;
    }
    i0 = std::max(i0, 5);
    i1 = std::max(i1, 5);
    i2 = std::max(i2, 0);
    i3 = std::max(i3, 0);
    fprintf(stderr, "         ");
    for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
        fprintf(stderr, "%7d ", idx1);
    }
    fprintf(stderr, "\n");
    for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) {
        fprintf(stderr, "%7d: ", idx0);
        for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
            if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) {
                float val;
                if (tensor->type == GGML_TYPE_F32) {
                    val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
                } else if (tensor->type == GGML_TYPE_F16) {
                    val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
                } else if (tensor->type == GGML_TYPE_I32) {
                    val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
                } else {
                    GGML_ASSERT(false);
                }
                fprintf(stderr, "% 7.2f ", val);
            } else {
                fprintf(stderr, "        ");
            }
        }
        fprintf(stderr, "\n");
    }
}

static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
    void * tensor_data = tensor->data;

    if (ggml_backend_buffer_is_vk(tensor->buffer)) {
        const size_t tensor_size = ggml_nbytes(tensor);
        tensor_data = malloc(tensor_size);

        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

        vk_buffer buffer_gpu = extra->buffer_gpu.lock();
        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
    }

    std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
    std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
    if (tensor->src[0] != nullptr) {
        std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
    }
    if (tensor->src[1] != nullptr) {
        std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
    }
    std::cerr << std::endl << "Result (i2=0):" << std::endl;
    ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
    std::cerr << std::endl;
    std::cerr << std::endl << "Result (i2=1):" << std::endl;
    ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0);
    std::cerr << std::endl;
    std::vector<const ggml_tensor *> done;
    ggml_vk_print_graph_origin(tensor, done);

    if (ggml_backend_buffer_is_vk(tensor->buffer)) {
        free(tensor_data);
    }
}

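// CPU reference result of the most recently checked op, produced by
// ggml_vk_check_results_0 and consumed by ggml_vk_check_results_1.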
void * comp_result;
size_t comp_size;
size_t comp_nb[GGML_MAX_DIMS];
size_t check_counter = 0;
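// Recompute the op on the CPU: clone the source tensors (downloading them
// from the GPU where needed), rebuild the equivalent op in a scratch ggml
// context, run it on the CPU and stash the result for the later comparison.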
static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor) {
    if (params->ith != 0) {
        return;
    }
    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
        return;
    }

    check_counter++;
    if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
        return;
    }

    VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")");

    ggml_tensor * src0 = tensor->src[0];
    ggml_tensor * src1 = tensor->src[1];
    ggml_tensor * src2 = tensor->src[2];

    struct ggml_init_params iparams = {
        /*.mem_size   =*/ 1024*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ggml_ctx = ggml_init(iparams);

    struct ggml_tensor * src0_clone = nullptr;
    struct ggml_tensor * src1_clone = nullptr;
    struct ggml_tensor * src2_clone = nullptr;
    struct ggml_tensor * tensor_clone = nullptr;

    size_t src0_size;
    size_t src1_size;
    size_t src2_size;

    void * src0_buffer = nullptr;
    void * src1_buffer = nullptr;
    void * src2_buffer = nullptr;

    if (src0 != nullptr) {
        src0_clone = ggml_dup_tensor(ggml_ctx, src0);

        src0_size = ggml_nbytes(src0);

        src0_buffer = malloc(src0_size);
        src0_clone->data = src0_buffer;
        if (ggml_backend_buffer_is_host(src0->buffer)) {
            memcpy(src0_clone->data, src0->data, src0_size);
            memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
        } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
            uint64_t offset = extra->offset + src0->view_offs;
            if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
                for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                    for (int i2 = 0; i2 < src0->ne[2]; i2++) {
                        const int idx = i3*src0->ne[2] + i2;
                        ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]);
                    }
                }

                src0_clone->nb[0] = src0->nb[0];
                src0_clone->nb[1] = src0->nb[1];
                for (int i = 2; i < GGML_MAX_DIMS; i++) {
                    src0_clone->nb[i] = src0_clone->nb[i - 1]*src0_clone->ne[i - 1];
                }
            } else {
                if (offset + src0_size >= buffer_gpu->size) {
                    src0_size = buffer_gpu->size - offset;
                }
                ggml_vk_buffer_read(ctx, buffer_gpu, offset, src0_clone->data, src0_size);
                memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
            }
        } else {
            GGML_ASSERT(false);
        }

        if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
            ggml_vk_print_tensor(ctx, src0, "src0");
        }
    }
    if (src1 != nullptr) {
        src1_clone = ggml_dup_tensor(ggml_ctx, src1);

        src1_size = ggml_nbytes(src1);

        src1_buffer = malloc(src1_size);
        src1_clone->data = src1_buffer;
        if (ggml_backend_buffer_is_host(src1->buffer)) {
            memcpy(src1_clone->data, src1->data, src1_size);
            memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
        } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
            uint64_t offset = extra->offset + src1->view_offs;
            if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
                for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                    for (int i2 = 0; i2 < src1->ne[2]; i2++) {
                        const int idx = i3*src1->ne[2] + i2;
                        ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]);
                    }
                }

                src1_clone->nb[0] = src1->nb[0];
                src1_clone->nb[1] = src1->nb[1];
                for (int i = 2; i < GGML_MAX_DIMS; i++) {
                    src1_clone->nb[i] = src1_clone->nb[i - 1]*src1_clone->ne[i - 1];
                }
            } else {
                if (offset + src1_size >= buffer_gpu->size) {
                    src1_size = buffer_gpu->size - offset;
                }
                ggml_vk_buffer_read(ctx, buffer_gpu, offset, src1_clone->data, src1_size);
                memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
            }
        } else {
            GGML_ASSERT(false);
        }

        if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
            ggml_vk_print_tensor(ctx, src1, "src1");
            std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
            std::cerr << "src1_clone=" << src1_clone << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
            if (src1->src[0] != nullptr) {
                std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
            }
            if (src1->src[1] != nullptr) {
                std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
            }
            std::cerr << std::endl << "Result (i2=0):" << std::endl;
            ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
            std::cerr << std::endl;
            std::cerr << std::endl << "Result (i2=1):" << std::endl;
            ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 1, 0);
            std::cerr << std::endl;
            std::vector<const ggml_tensor *> done;
            ggml_vk_print_graph_origin(src1_clone, done);
        }
    }
    if (src2 != nullptr) {
        src2_clone = ggml_dup_tensor(ggml_ctx, src2);

        src2_size = ggml_nbytes(src2);

        src2_buffer = malloc(src2_size);
        src2_clone->data = src2_buffer;
        if (ggml_backend_buffer_is_host(src2->buffer)) {
            memcpy(src2_clone->data, src2->data, src2_size);
            memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
        } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
            uint64_t offset = extra->offset + src2->view_offs;
            if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
                for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                    for (int i2 = 0; i2 < src2->ne[2]; i2++) {
                        const int idx = i3*src2->ne[2] + i2;
                        ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
                    }
                }

                src2_clone->nb[0] = src2->nb[0];
                src2_clone->nb[1] = src2->nb[1];
                for (int i = 2; i < GGML_MAX_DIMS; i++) {
                    src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1];
                }
            } else {
                if (offset + src2_size >= buffer_gpu->size) {
                    src2_size = buffer_gpu->size - offset;
                }
                ggml_vk_buffer_read(ctx, buffer_gpu, offset, src2_clone->data, src2_size);
                memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
            }
        } else {
            GGML_ASSERT(false);
        }

        if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
            ggml_vk_print_tensor(ctx, src2, "src2");
            std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
            std::cerr << "src2_clone=" << src2_clone << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
            if (src2->src[0] != nullptr) {
                std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
            }
            if (src2->src[1] != nullptr) {
                std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
            }
            std::cerr << std::endl << "Result (i2=0):" << std::endl;
            ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
            std::cerr << std::endl;
            std::cerr << std::endl << "Result (i2=1):" << std::endl;
            ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 1, 0);
            std::cerr << std::endl;
            std::vector<const ggml_tensor *> done;
            ggml_vk_print_graph_origin(src2_clone, done);
        }
    }

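    // Rebuild the equivalent op on the CPU from the recorded op type and op_params.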
    if (tensor->op == GGML_OP_MUL_MAT) {
        tensor_clone = ggml_mul_mat(ggml_ctx, src0_clone, src1_clone);
    } else if (tensor->op == GGML_OP_MUL_MAT_ID) {
        tensor_clone = ggml_mul_mat_id(ggml_ctx, src0_clone, src1_clone, src2_clone);
    } else if (tensor->op == GGML_OP_MUL) {
        tensor_clone = ggml_mul(ggml_ctx, src0_clone, src1_clone);
    } else if (tensor->op == GGML_OP_DIV) {
        tensor_clone = ggml_div(ggml_ctx, src0_clone, src1_clone);
    } else if (tensor->op == GGML_OP_SCALE) {
        tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]);
    } else if (tensor->op == GGML_OP_SQR) {
        tensor_clone = ggml_sqr(ggml_ctx, src0_clone);
    } else if (tensor->op == GGML_OP_CLAMP) {
        tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
    } else if (tensor->op == GGML_OP_ADD) {
        tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone);
    } else if (tensor->op == GGML_OP_NORM) {
        tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
    } else if (tensor->op == GGML_OP_RMS_NORM) {
        tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
    } else if (tensor->op == GGML_OP_SOFT_MAX) {
        if (src1 != nullptr) {
            tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
        } else {
            tensor_clone = ggml_soft_max(ggml_ctx, src0_clone);
        }
    } else if (tensor->op == GGML_OP_DIAG_MASK_INF) {
        tensor_clone = ggml_diag_mask_inf(ggml_ctx, src0_clone, *(int *)tensor->op_params);
    } else if (tensor->op == GGML_OP_ROPE) {
        const int n_dims      = ((int32_t *) tensor->op_params)[1];
        const int mode        = ((int32_t *) tensor->op_params)[2];
        //const int n_ctx_ggml       = ((int32_t *) tensor->op_params)[3];
        const int n_ctx_orig_ggml  = ((int32_t *) tensor->op_params)[4];
        float freq_base       = ((float *)   tensor->op_params)[5];
        float freq_scale      = ((float *)   tensor->op_params)[6];
        float ext_factor      = ((float *)   tensor->op_params)[7];
        float attn_factor     = ((float *)   tensor->op_params)[8];
        float beta_fast       = ((float *)   tensor->op_params)[9];
        float beta_slow       = ((float *)   tensor->op_params)[10];
        tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
    } else if (tensor->op == GGML_OP_UNARY) {
        switch (ggml_get_unary_op(tensor)) {
        case GGML_UNARY_OP_SILU:
            tensor_clone = ggml_silu(ggml_ctx, src0_clone);
            break;
        case GGML_UNARY_OP_GELU:
            tensor_clone = ggml_gelu(ggml_ctx, src0_clone);
            break;
        case GGML_UNARY_OP_RELU:
            tensor_clone = ggml_relu(ggml_ctx, src0_clone);
            break;
        default:
            std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
            GGML_ASSERT(false);
        }
    } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
        if (src1 == nullptr) {
            tensor_clone = ggml_dup(ggml_ctx, src0_clone);
            tensor_clone->type = tensor->type;
        } else {
            tensor_clone = ggml_cpy(ggml_ctx, src0_clone, src1_clone);
        }
    } else if (tensor->op == GGML_OP_CONT) {
        tensor_clone = ggml_cont_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
    } else if (tensor->op == GGML_OP_RESHAPE) {
        tensor_clone = ggml_reshape_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
    } else if (tensor->op == GGML_OP_VIEW) {
        tensor_clone = ggml_view_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]);
    } else if (tensor->op == GGML_OP_PERMUTE) {
        int32_t * params = (int32_t *)tensor->op_params;
        tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]);
    } else if (tensor->op == GGML_OP_TRANSPOSE) {
        tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
    } else if (tensor->op == GGML_OP_GET_ROWS) {
        tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone);
    } else if (tensor->op == GGML_OP_ARGSORT) {
        tensor_clone = ggml_argsort(ggml_ctx, src0_clone, (ggml_sort_order) *(int *)tensor->op_params);
    } else if (tensor->op == GGML_OP_SUM_ROWS) {
        tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone);
    } else {
        std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
        GGML_ASSERT(false);
    }

    ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
    ggml_build_forward_expand(cgraph, tensor_clone);

    ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);

    if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
        ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
    }

    comp_size = ggml_nbytes(tensor_clone);

    comp_result = malloc(comp_size);
    memcpy(comp_result, tensor_clone->data, comp_size);
    memcpy(comp_nb, tensor_clone->nb, sizeof(size_t) * GGML_MAX_DIMS);

    if (src0 != nullptr) {
        free(src0_buffer);
    }
    if (src1 != nullptr) {
        free(src1_buffer);
    }
    if (src2 != nullptr) {
        free(src2_buffer);
    }

    ggml_free(ggml_ctx);
}

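// Compare the GPU result against the CPU reference computed in
// ggml_vk_check_results_0: abort on NaN/Inf mismatches and whenever the mean
// absolute error exceeds 0.05.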
static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor) {
    if (params->ith != 0) {
        return;
    }
    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
        return;
    }
    if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
        return;
    }

    VK_LOG_DEBUG("ggml_vk_check_results_1(" << tensor->name << ")");

    ggml_tensor * src0 = tensor->src[0];
    ggml_tensor * src1 = tensor->src[1];
    ggml_tensor * src2 = tensor->src[2];

    void * tensor_data = tensor->data;

    if (ggml_backend_buffer_is_vk(tensor->buffer)) {
        size_t tensor_size = ggml_nbytes(tensor);
        tensor_data = malloc(tensor_size);

        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

        vk_buffer buffer_gpu = extra->buffer_gpu.lock();
        if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
            tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
        }

        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
    }

    float first_error_result = -1.0f;
    float first_error_correct = -1.0f;
    std::array<int, 4> first_error = { -1, -1, -1, -1 };
    double avg_err = 0.0;
    size_t counter = 0;

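    // Walk every element, comparing the GPU result against the CPU reference;
    // NaN/Inf mismatches abort immediately, finite differences feed avg_err.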
    for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
        for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
            for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
                for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
                    const bool buffer_size_fit = i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0] < comp_size;
                    float correct = 0.0f;
                    float result = 0.0f;

                    if (buffer_size_fit) {
                        if (tensor->type == GGML_TYPE_F32) {
                            correct = *(float *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
                            result  = *(float *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
                        } else if (tensor->type == GGML_TYPE_F16) {
                            correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
                            result  = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
                        } else if (tensor->type == GGML_TYPE_I32) {
                            correct = *(int32_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
                            result  = *(int32_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
                        } else {
                            std::cerr << "Results check not implemented for type " << ggml_type_name(tensor->type) << std::endl;
                        }
                    } else {
                        std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
                        GGML_ASSERT(false);
                    }

                    if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
                        std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
                        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
                        if (src0 != nullptr) {
                            std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
                        }
                        if (src1 != nullptr) {
                            std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
                        }
                        if (src2 != nullptr) {
                            std::cerr << "src2=" << src2 << " src2->name=" << src2->name << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
                        }
                        std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
                        std::cerr << std::endl << "Result:" << std::endl;
                        ggml_vk_print_tensor_area(tensor, tensor_data, i0, i1, i2, i3);
                        std::cerr << std::endl << "Correct:" << std::endl;
                        ggml_vk_print_tensor_area(tensor, comp_result, i0, i1, i2, i3);
                        std::cerr << std::endl;
                        std::vector<const ggml_tensor *> done;
                        ggml_vk_print_graph_origin(tensor, done);
                        GGML_ASSERT(false);
                    }
                    if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) {
                        first_error[0] = i0;
                        first_error[1] = i1;
                        first_error[2] = i2;
                        first_error[3] = i3;
                        first_error_result = result;
                        first_error_correct = correct;
                    }

                    // Skip Inf/NaN values when accumulating avg_err to keep the average
                    // finite; if both values are NaN the error counts as zero.
                    if (!std::isinf(correct) && !std::isinf(result) && !std::isnan(correct) && !std::isnan(result)) {
                        avg_err += std::fabs(correct - result);
                    }
                    counter++;
                }
            }
        }
    }

    avg_err /= counter;

    if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
        std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
        if (src0 != nullptr) {
            std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
        }
        if (src1 != nullptr) {
            std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
        }
        if (src2 != nullptr) {
            std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
        }
        std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
        std::cerr << std::endl << "Result (i2=0):" << std::endl;
        ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
        std::cerr << std::endl << "Correct (i2=0):" << std::endl;
        ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 0, 0);
        std::cerr << std::endl;
        std::cerr << std::endl << "Result (i2=1):" << std::endl;
        ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0);
        std::cerr << std::endl << "Correct (i2=1):" << std::endl;
        ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 1, 0);
        std::cerr << std::endl;
        std::vector<const ggml_tensor *> done;
        ggml_vk_print_graph_origin(tensor, done);
    }

    if (avg_err > 0.05 || std::isnan(avg_err)) {
        std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
        if (src0 != nullptr) {
            std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
        }
        if (src1 != nullptr) {
            std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
        }
        if (src2 != nullptr) {
            std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
        }
        std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
        std::cerr << std::endl << "Result:" << std::endl;
        ggml_vk_print_tensor_area(tensor, tensor_data, first_error[0], first_error[1], first_error[2], first_error[3]);
        std::cerr << std::endl << "Correct:" << std::endl;
        ggml_vk_print_tensor_area(tensor, comp_result, first_error[0], first_error[1], first_error[2], first_error[3]);
        std::cerr << std::endl;
        std::vector<const ggml_tensor *> done;
        ggml_vk_print_graph_origin(tensor, done);
        GGML_ASSERT(false);
    } else {
        std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
    }

    free(comp_result);
    comp_result = nullptr;
    comp_size = 0;

    if (ggml_backend_buffer_is_vk(tensor->buffer)) {
        free(tensor_data);
    }
}
#endif