// ggml-vulkan.cpp
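// Vulkan compute backend for ggml. Compute pipelines are built from the
// pre-compiled SPIR-V blobs in ggml-vulkan-shaders.hpp; this file provides the
// device, memory, and command-submission plumbing around them.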
#include "ggml-vulkan.h"
#include <vulkan/vulkan_core.h>
#ifdef GGML_VULKAN_RUN_TESTS
#include <chrono>
#endif

#include <vulkan/vulkan.hpp>

#include <algorithm>
#include <cmath>
#include <iomanip>
#include <iostream>
#include <tuple>
#include <vector>
#include <sstream>
#include <utility>
#include <memory>
#include <limits>
#include <map>

#include "ggml.h"
#include "ggml-backend-impl.h"

#include "ggml-vulkan-shaders.hpp"

#define VK_API_VERSION VK_API_VERSION_1_2

#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))

#define VK_VENDOR_ID_AMD 0x1002
#define VK_VENDOR_ID_APPLE 0x106b
#define VK_VENDOR_ID_INTEL 0x8086
#define VK_VENDOR_ID_NVIDIA 0x10de

#define VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN 0
#define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1
#define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2

#define VK_NUM_TYPES 16

#define GGML_VK_MAX_NODES 8192

#define MAX_VK_BUFFERS 256

#ifndef K_QUANTS_PER_ITERATION
#define K_QUANTS_PER_ITERATION 1
#else
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
#endif

#define VK_CHECK(err, msg)                                          \
    do {                                                            \
        vk::Result err_ = (err);                                    \
        if (err_ != vk::Result::eSuccess) {                         \
            fprintf(stderr, "ggml_vulkan: %s error %s at %s:%d\n",  \
                #err, to_string(err_).c_str(), __FILE__, __LINE__); \
            exit(1);                                                \
        }                                                           \
    } while (0)

#ifdef GGML_VULKAN_DEBUG
#define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
#else
#define VK_LOG_DEBUG(msg) ((void) 0)
#endif // GGML_VULKAN_DEBUG

struct ggml_backend_vk_context;

struct vk_queue {
    uint32_t queue_family_index;
    vk::Queue queue;
    vk::CommandPool pool;
    uint32_t cmd_buffer_idx;
    std::vector<vk::CommandBuffer> cmd_buffers;

    vk::PipelineStageFlags stage_flags;
};

struct vk_pipeline_struct {
    std::string name;
    vk::ShaderModule shader_module;
    vk::DescriptorSetLayout dsl;
    std::vector<vk::DescriptorPool> descriptor_pools;
    std::vector<vk::DescriptorSet> descriptor_sets;
    uint32_t descriptor_set_idx;
    vk::PipelineLayout layout;
    vk::Pipeline pipeline;
    uint32_t push_constant_size;
    uint32_t parameter_count;
    std::array<uint32_t, 3> wg_denoms;
    uint32_t align;
};

typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
typedef std::weak_ptr<vk_pipeline_struct> vk_pipeline_ref;

static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline);

struct vk_matmul_pipeline_struct {
    vk_pipeline l, m, s;
    vk_pipeline a_l, a_m, a_s;
};

typedef std::shared_ptr<vk_matmul_pipeline_struct> vk_matmul_pipeline;

struct vk_device {
    vk::PhysicalDevice physical_device;
    vk::PhysicalDeviceProperties properties;
    std::string name;
    uint64_t max_memory_allocation_size;
    bool fp16;
    vk::Device device;
    uint32_t vendor_id;
    vk_queue compute_queue;
    vk_queue transfer_queue;
    bool single_queue;
    uint32_t descriptor_set_mode;
    uint32_t subgroup_size;
    bool uma;

    bool initialized;
    size_t idx;

    vk_matmul_pipeline pipeline_matmul_f32;
    vk_matmul_pipeline pipeline_matmul_f32_f16;
    vk_matmul_pipeline pipeline_matmul_f16;
    vk_matmul_pipeline pipeline_matmul_f16_f32;
    vk_pipeline pipeline_matmul_split_k_reduce;

    vk_matmul_pipeline pipeline_dequant_mul_mat_mat[VK_NUM_TYPES];

    vk_matmul_pipeline pipeline_matmul_id_f32;
    vk_matmul_pipeline pipeline_matmul_id_f16;
    vk_matmul_pipeline pipeline_matmul_id_f16_f32;

    vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[VK_NUM_TYPES];

    vk_pipeline pipeline_dequant[VK_NUM_TYPES];
    vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[VK_NUM_TYPES];
    vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[VK_NUM_TYPES];
    vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[VK_NUM_TYPES];

    vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
    vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
    vk_pipeline pipeline_get_rows[VK_NUM_TYPES];
    vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES];
    vk_pipeline pipeline_mul_f32;
    vk_pipeline pipeline_div_f32;
    vk_pipeline pipeline_add_f32;
    vk_pipeline pipeline_scale_f32;
    vk_pipeline pipeline_sqr_f32;
    vk_pipeline pipeline_clamp_f32;
    vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
    vk_pipeline pipeline_norm_f32;
    vk_pipeline pipeline_rms_norm_f32;
    vk_pipeline pipeline_gelu_f32;
    vk_pipeline pipeline_silu_f32;
    vk_pipeline pipeline_relu_f32;
    vk_pipeline pipeline_diag_mask_inf_f32;
    vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
    vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
    vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
    vk_pipeline pipeline_argsort_f32;
    vk_pipeline pipeline_sum_rows_f32;

    std::vector<vk_pipeline_ref> pipelines;

    ~vk_device() {
        VK_LOG_DEBUG("destroy device " << name);
        device.destroyCommandPool(compute_queue.pool);
        if (!single_queue) {
            device.destroyCommandPool(transfer_queue.pool);
        }

        for (auto& pipeline : pipelines) {
            if (pipeline.expired()) {
                continue;
            }

            vk_pipeline pl = pipeline.lock();
            ggml_vk_destroy_pipeline(device, pl);
        }
        pipelines.clear();

        device.destroy();
    }
};

struct vk_buffer_struct {
    vk::Buffer buffer;
    vk::DeviceMemory device_memory;
    vk::MemoryPropertyFlags memory_property_flags;
    void * ptr;
    size_t size = 0;

    ggml_backend_vk_context * ctx;

    std::shared_ptr<vk_device> device;

    ~vk_buffer_struct() {
        if (size == 0) {
            return;
        }
        VK_LOG_DEBUG("~vk_buffer_struct(" << buffer << ", " << size << ")");

        device->device.freeMemory(device_memory);
        device->device.destroyBuffer(buffer);
    }
};

typedef std::shared_ptr<vk_buffer_struct> vk_buffer;
typedef std::weak_ptr<vk_buffer_struct> vk_buffer_ref;

struct vk_subbuffer {
    vk_buffer buffer;
    uint64_t offset;
    uint64_t size;
};

struct vk_semaphore {
    vk::Semaphore s;
    uint64_t value;
};

struct vk_submission {
    vk::CommandBuffer buffer;
    std::vector<vk_semaphore> wait_semaphores;
    std::vector<vk_semaphore> signal_semaphores;
};

typedef std::vector<vk_submission> vk_sequence;
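// Push constant blocks for the compute shaders. Field order and types must
// match the corresponding layout(push_constant) block on the shader side.
// All of these fit within the 128 bytes of push constant space that the
// Vulkan spec guarantees (maxPushConstantsSize >= 128); the largest,
// vk_op_binary_push_constants, is 28 * 4 = 112 bytes.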
struct vk_mat_mat_push_constants {
    uint32_t M; uint32_t N; uint32_t K;
    uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
    uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
    uint32_t k_split;
    uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
};
struct vk_mat_vec_push_constants {
    uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
    uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
    uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
};

struct vk_mat_mat_id_push_constants {
    uint32_t M; uint32_t N; uint32_t K;
    uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
    uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
    uint32_t nei0; uint32_t nei1; uint32_t nbi1; uint32_t ne11;
};
struct vk_mat_vec_id_push_constants {
    uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
    uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
    uint32_t nei0; uint32_t ne11;
};

struct vk_op_push_constants {
    uint32_t KX;
    uint32_t KY;
    float param1;
    float param2;
};

struct vk_op_unary_push_constants {
    uint32_t ne;
    uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
    uint32_t d_offset;
    float param1; float param2;
};

struct vk_op_binary_push_constants {
    uint32_t ne;
    uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
    uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
    uint32_t d_offset;
    float param1; float param2;
};

struct vk_op_diag_mask_push_constants {
    uint32_t ncols;
    uint32_t rows_per_channel;
    int32_t n_past;
};

struct vk_op_rope_push_constants {
    uint32_t ncols;
    uint32_t n_dims;
    float freq_scale;
    uint32_t p_delta_rows;
    float freq_base;
    float ext_factor;
    float attn_factor;
    float corr_dims[2];
    float theta_scale;
    uint32_t has_ff;
};

struct vk_op_soft_max_push_constants {
    uint32_t KX;
    uint32_t KY;
    float scale;
    float max_bias;
    float m0;
    float m1;
    uint32_t n_head_log2;
};

struct vk_op_argsort_push_constants {
    uint32_t ncols;
    uint32_t ncols_pad;
    int32_t order;
};

// Allow pre-recording command buffers
struct vk_staging_memcpy {
    vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}

    void * dst;
    const void * src;
    size_t n;
};

struct vk_context {
    size_t idx;

    vk_submission * s;
    std::vector<vk_sequence> seqs;

    ggml_tensor * exit_tensor;

    std::vector<vk_staging_memcpy> in_memcpys;
    std::vector<vk_staging_memcpy> out_memcpys;

    vk_queue * q;
};

struct ggml_tensor_extra_gpu {
    size_t ctx_idx;

    vk_buffer_ref buffer_gpu;
    uint64_t offset;

    void reset() {
        ctx_idx = 0;
        buffer_gpu.reset();
        offset = 0;
    }
};

struct ggml_vk_garbage_collector {
    std::vector<vk_semaphore> tl_semaphores;
    std::vector<vk_semaphore> semaphores;
    std::vector<vk::Event> events;
    std::vector<vk_buffer> temp_buffers;
    std::vector<vk_context> contexts;
};

#if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG)
#include <mutex>

#define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl

static std::string format_size(size_t size) {
    const size_t kib = 1024;
    const size_t mib = kib * 1024;
    const size_t gib = mib * 1024;

    std::ostringstream oss;
    oss << std::fixed << std::setprecision(2);

    if (size >= gib) {
        oss << static_cast<double>(size) / gib << " GiB";
    } else if (size >= mib) {
        oss << static_cast<double>(size) / mib << " MiB";
    } else if (size >= kib) {
        oss << static_cast<double>(size) / kib << " KiB";
    } else {
        oss << size << " B";
    }

    return oss.str();
}

static std::mutex log_mutex;

class vk_memory_logger {
public:
    vk_memory_logger(): total_device(0), total_host(0) {}
    void log_allocation(vk_buffer_ref buf_ref, size_t size);
    void log_deallocation(vk_buffer_ref buf_ref);

private:
    std::map<vk::Buffer, size_t> allocations; // Track allocations
    size_t total_device;
    size_t total_host;
};
#else
#define VK_LOG_MEMORY(msg) ((void) 0)
#endif // GGML_VULKAN_MEMORY_DEBUG

struct ggml_backend_vk_context {
    std::string name;

    std::shared_ptr<vk_device> device;

    size_t semaphore_idx, event_idx;
    ggml_vk_garbage_collector gc;
    std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
    size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
    vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
    vk::Fence fence;
    vk_buffer staging;
    size_t staging_size;
    size_t staging_offset;
    vk_buffer sync_staging;

    vk_buffer buffer_pool[MAX_VK_BUFFERS];

    vk_context * compute_ctx;
    vk_context * transfer_ctx;

    bool initialized;

    size_t idx;

#ifdef GGML_VULKAN_MEMORY_DEBUG
    vk_memory_logger memory_logger;
#endif
};

#ifdef GGML_VULKAN_MEMORY_DEBUG
void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
    std::lock_guard<std::mutex> guard(log_mutex);
    vk_buffer buf = buf_ref.lock();
    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
    const std::string type = device ? "device" : "host";
    allocations[buf->buffer] = size;
    total_device += device ? size : 0;
    total_host += device ? 0 : size;
    VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
}

void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
    if (buf_ref.expired() || buf_ref.lock()->size == 0) {
        return;
    }

    std::lock_guard<std::mutex> guard(log_mutex);
    vk_buffer buf = buf_ref.lock();
    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
    std::string type = device ? "device" : "host";
    auto it = allocations.find(buf->buffer);
    if (it != allocations.end()) {
        // it->second is only valid inside this branch, so the totals are
        // updated here rather than before the lookup check
        total_device -= device ? it->second : 0;
        total_host -= device ? 0 : it->second;
        VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
        allocations.erase(it);
    } else {
        VK_LOG_MEMORY("ERROR VULKAN" << buf->ctx->idx << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer);
    }
}
#endif // GGML_VULKAN_MEMORY_DEBUG

struct vk_instance_t {
    vk::Instance instance;

    std::vector<size_t> device_indices;

    ggml_backend_t backends[GGML_VK_MAX_DEVICES];
    ggml_backend_vk_context contexts[GGML_VK_MAX_DEVICES];
    ggml_backend_buffer_type buffer_types[GGML_VK_MAX_DEVICES];
    bool initialized[GGML_VK_MAX_DEVICES];
};
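// Devices are cached in function-local weak_ptrs below: all backends created
// for the same device index share one vk_device, and the underlying
// vk::Device is torn down (via ~vk_device) once the last shared_ptr owner
// releases it.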
static std::shared_ptr<vk_device> ggml_vk_get_device(size_t idx) {
    VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
    static std::weak_ptr<vk_device> devices[GGML_VK_MAX_DEVICES];

    if (devices[idx].expired()) {
        VK_LOG_DEBUG("Initializing new vk_device");
        std::shared_ptr<vk_device> device = std::make_shared<vk_device>();
        device->initialized = false;
        devices[idx] = device;
        return device;
    }

    return devices[idx].lock();
}

#ifdef GGML_VULKAN_CHECK_RESULTS
static size_t vk_skip_checks;
static size_t vk_output_tensor;

static void ggml_vk_print_tensor(ggml_backend * ctx, const ggml_tensor * tensor, const char * name);
static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor);
static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor);
#endif

typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);

static bool vk_instance_initialized = false;
static vk_instance_t vk_instance;

GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);

static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
    VK_LOG_DEBUG("ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
    GGML_ASSERT(parameter_count > 0);
    GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT

    pipeline = std::make_shared<vk_pipeline_struct>();
    pipeline->name = name;
    pipeline->parameter_count = parameter_count;
    pipeline->push_constant_size = push_constant_size;
    pipeline->wg_denoms = wg_denoms;
    pipeline->align = align;

    vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
    pipeline->shader_module = ctx->device->device.createShaderModule(shader_module_create_info);

    std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
    std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
    for (uint32_t i = 0; i < parameter_count; i++) {
        dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
        dsl_binding_flags.push_back({});
    }

    vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };

    vk::PushConstantRange pcr(
        vk::ShaderStageFlagBits::eCompute,
        0,
        pipeline->push_constant_size
    );

    vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
        {},
        dsl_binding);
    descriptor_set_layout_create_info.setPNext(&dslbfci);
    pipeline->dsl = ctx->device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);

    // Check if device supports multiple descriptors per pool
    if (ctx->device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN) {
        const uint32_t alloc_count = 2;

        // Try allocating multiple sets from one pool
        // This fails on AMD for some reason, so add a fall back to allocating one pool per set
        vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
        vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, alloc_count, descriptor_pool_size);
        vk::DescriptorPool pool = ctx->device->device.createDescriptorPool(descriptor_pool_create_info);

        std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
        for (uint32_t i = 0; i < alloc_count; i++) {
            layouts[i] = pipeline->dsl;
        }
        try {
            vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pool, alloc_count, layouts.data());
            std::vector<vk::DescriptorSet> sets = ctx->device->device.allocateDescriptorSets(descriptor_set_alloc_info);
            // The probe succeeded, so one pool can serve multiple sets;
            // without recording this, the MULTI branch below is never taken
            ctx->device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI;
        } catch(vk::OutOfPoolMemoryError const&) {
            ctx->device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE;
        }

        ctx->device->device.destroyDescriptorPool(pool);
    }

    if (ctx->device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) {
        vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
        vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 128, descriptor_pool_size);
        pipeline->descriptor_pools.push_back(ctx->device->device.createDescriptorPool(descriptor_pool_create_info));
    }

    pipeline->descriptor_set_idx = 0;

    vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline->dsl, pcr);
    pipeline->layout = ctx->device->device.createPipelineLayout(pipeline_layout_create_info);

    std::vector<vk::SpecializationMapEntry> specialization_entries(specialization_constants.size());

    for (size_t i = 0; i < specialization_constants.size(); i++) {
        specialization_entries[i].constantID = i;
        specialization_entries[i].offset = i * sizeof(uint32_t);
        specialization_entries[i].size = sizeof(uint32_t);
    }

    vk::SpecializationInfo specialization_info(
        specialization_entries.size(),
        specialization_entries.data(),
        specialization_constants.size() * sizeof(uint32_t),
        specialization_constants.data()
    );

    vk::PipelineShaderStageCreateInfo pipeline_shader_create_info(
        vk::PipelineShaderStageCreateFlags(),
        vk::ShaderStageFlagBits::eCompute,
        pipeline->shader_module,
        entrypoint.c_str(),
        &specialization_info);
    vk::ComputePipelineCreateInfo compute_pipeline_create_info(
        vk::PipelineCreateFlags(),
        pipeline_shader_create_info,
        pipeline->layout);
    pipeline->pipeline = ctx->device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;

    ctx->device->pipelines.push_back(pipeline);
}
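// Specialization constants are mapped 1:1 by index in ggml_vk_create_pipeline
// above: element i of the vector becomes constant_id = i in the SPIR-V module
// (see the SpecializationMapEntry loop). The warptile lists used for the
// matmul pipelines below rely on this positional ordering.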
<< ")"); 612 for (auto& pool : pipeline->descriptor_pools) { 613 device.destroyDescriptorPool(pool); 614 } 615 pipeline->descriptor_pools.clear(); 616 pipeline->descriptor_sets.clear(); 617 pipeline->descriptor_set_idx = 0; 618 619 device.destroyDescriptorSetLayout(pipeline->dsl); 620 621 device.destroyPipelineLayout(pipeline->layout); 622 623 device.destroyShaderModule(pipeline->shader_module); 624 625 device.destroyPipeline(pipeline->pipeline); 626 } 627 628 static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) { 629 VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")"); 630 if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) { 631 // Enough descriptors are available 632 return; 633 } 634 635 if (ctx->device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) { 636 const uint32_t alloc_count = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size(); 637 638 std::vector<vk::DescriptorSetLayout> layouts(alloc_count); 639 for (uint32_t i = 0; i < alloc_count; i++) { 640 layouts[i] = pipeline->dsl; 641 } 642 vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[0], alloc_count, layouts.data()); 643 std::vector<vk::DescriptorSet> sets = ctx->device->device.allocateDescriptorSets(descriptor_set_alloc_info); 644 pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end()); 645 } else { 646 for (uint32_t i = pipeline->descriptor_sets.size(); i < pipeline->descriptor_set_idx + n; i++) { 647 vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count); 648 vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 1, descriptor_pool_size); 649 pipeline->descriptor_pools.push_back(ctx->device->device.createDescriptorPool(descriptor_pool_create_info)); 650 651 vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[i], 1, &pipeline->dsl); 652 std::vector<vk::DescriptorSet> sets = ctx->device->device.allocateDescriptorSets(descriptor_set_alloc_info); 653 pipeline->descriptor_sets.push_back(sets[0]); 654 } 655 } 656 } 657 658 static void ggml_pipeline_cleanup(vk_pipeline& pipeline) { 659 VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")"); 660 pipeline->descriptor_set_idx = 0; 661 } 662 663 static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) { 664 VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()"); 665 if (q.cmd_buffers.size() > q.cmd_buffer_idx) { 666 // Reuse command buffer 667 return q.cmd_buffers[q.cmd_buffer_idx++]; 668 } 669 670 vk::CommandBufferAllocateInfo command_buffer_alloc_info( 671 q.pool, 672 vk::CommandBufferLevel::ePrimary, 673 1); 674 const std::vector<vk::CommandBuffer> cmd_buffers = ctx->device->device.allocateCommandBuffers(command_buffer_alloc_info); 675 auto buf = cmd_buffers.front(); 676 677 q.cmd_buffers.push_back(buf); 678 q.cmd_buffer_idx++; 679 680 return buf; 681 } 682 683 static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) { 684 VK_LOG_DEBUG("ggml_vk_create_submission()"); 685 vk_submission s; 686 s.buffer = ggml_vk_create_cmd_buffer(ctx, q); 687 s.wait_semaphores = std::move(wait_semaphores); 688 s.signal_semaphores = std::move(signal_semaphores); 689 return s; 690 } 691 692 static void ggml_vk_submit(vk_context * ctx, 
static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
    VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")");
    if (ctx->seqs.empty()) {
        return;
    }

    std::vector<std::vector<uint64_t>> tl_wait_vals;
    std::vector<std::vector<uint64_t>> tl_signal_vals;
    std::vector<std::vector<vk::Semaphore>> tl_wait_semaphores;
    std::vector<std::vector<vk::Semaphore>> tl_signal_semaphores;
    std::vector<vk::TimelineSemaphoreSubmitInfo> tl_submit_infos;
    std::vector<vk::SubmitInfo> submit_infos;
    int idx = -1;
    std::vector<std::vector<vk::PipelineStageFlags>> stage_flags;

    size_t reserve = 0;

    for (const auto& sequence : ctx->seqs) {
        reserve += sequence.size();
    }

    // Pre-reserve vectors to prevent reallocation, which invalidates pointers
    tl_wait_semaphores.reserve(reserve);
    tl_wait_vals.reserve(reserve);
    tl_signal_semaphores.reserve(reserve);
    tl_signal_vals.reserve(reserve);
    tl_submit_infos.reserve(reserve);
    submit_infos.reserve(reserve);
    stage_flags.reserve(reserve);

    for (const auto& sequence : ctx->seqs) {
        for (const auto& submission : sequence) {
            stage_flags.push_back({});
            idx++;
            tl_wait_vals.push_back({});
            tl_wait_semaphores.push_back({});
            tl_signal_vals.push_back({});
            tl_signal_semaphores.push_back({});
            for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
                stage_flags[idx].push_back(ctx->q->stage_flags);
                tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
                tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
            }
            for (size_t i = 0; i < submission.signal_semaphores.size(); i++) {
                tl_signal_vals[idx].push_back(submission.signal_semaphores[i].value);
                tl_signal_semaphores[idx].push_back(submission.signal_semaphores[i].s);
            }
            tl_submit_infos.push_back({
                (uint32_t) submission.wait_semaphores.size(),
                tl_wait_vals[idx].data(),
                (uint32_t) submission.signal_semaphores.size(),
                tl_signal_vals[idx].data(),
            });
            tl_submit_infos[idx].sType = vk::StructureType::eTimelineSemaphoreSubmitInfo;
            tl_submit_infos[idx].pNext = nullptr;
            vk::SubmitInfo si{
                (uint32_t) submission.wait_semaphores.size(),
                tl_wait_semaphores[idx].data(),
                stage_flags[idx].data(),
                1,
                &submission.buffer,
                (uint32_t) submission.signal_semaphores.size(),
                tl_signal_semaphores[idx].data(),
            };
            si.setPNext(&tl_submit_infos[idx]);
            submit_infos.push_back(si);
        }
    }

    ctx->q->queue.submit(submit_infos, fence);

    ctx->seqs.clear();
}
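// The wait/signal values above travel through vk::TimelineSemaphoreSubmitInfo,
// which Vulkan 1.2 requires for timeline semaphores; for binary semaphores the
// values are ignored. Host-side wait sketch (illustrative, not used in this file):
//
//   vk::SemaphoreWaitInfo wait_info({}, 1, &sem.s, &sem.value);
//   device.waitSemaphores(wait_info, UINT64_MAX);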
static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
    VK_LOG_DEBUG("ggml_vk_find_queue_family_index()");
    const uint32_t qfsize = queue_family_props.size();

    // Try with avoid preferences first
    for (uint32_t i = 0; i < qfsize; i++) {
        if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required && !(queue_family_props[i].queueFlags & avoid)) {
            return i;
        }
    }

    // Fall back to only required
    for (size_t i = 0; i < qfsize; i++) {
        if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required) {
            return i;
        }
    }

    // Fall back to reusing compute queue
    for (size_t i = 0; i < qfsize; i++) {
        if (queue_family_props[i].queueCount >= min_num_queues && queue_family_props[i].queueFlags & required) {
            return i;
        }
    }

    // Fall back to ignoring min_num_queues
    for (size_t i = 0; i < qfsize; i++) {
        if (queue_family_props[i].queueFlags & required) {
            return i;
        }
    }

    // All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations.
    // Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional.
    if (compute_index >= 0) {
        return compute_index;
    }

    std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;

    for (auto& q_family : queue_family_props) {
        std::cerr << "Queue number: " + std::to_string(q_family.queueCount) << " flags: " + to_string(q_family.queueFlags) << std::endl;
    }
    abort();
}

static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags) {
    VK_LOG_DEBUG("ggml_vk_create_queue()");
    q.queue_family_index = queue_family_index;

    vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
    q.pool = ctx->device->device.createCommandPool(command_pool_create_info_compute);

    q.cmd_buffer_idx = 0;

    q.queue = ctx->device->device.getQueue(queue_family_index, queue_index);

    q.stage_flags = stage_flags;
}

static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
    VK_LOG_DEBUG("ggml_vk_create_context()");
    ctx->gc.contexts.emplace_back();
    vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1];
    // emplace_back() default-constructs the vectors; initialize the raw
    // pointer members explicitly (vk_context holds std::vectors, so it must
    // not be zeroed with memset)
    result->s = nullptr;
    result->exit_tensor = nullptr;
    result->idx = ctx->gc.contexts.size() - 1;
    result->q = &q;
    return result;
}

static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
    VK_LOG_DEBUG("ggml_vk_create_binary_semaphore()");
    vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
    vk::SemaphoreCreateInfo ci{};
    ci.setPNext(&tci);
    vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci);
    ctx->gc.semaphores.push_back({ semaphore, 0 });
    return &ctx->gc.semaphores[ctx->gc.semaphores.size() - 1];
}

static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
    VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
    if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) {
        vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
        vk::SemaphoreCreateInfo ci{};
        ci.setPNext(&tci);
        vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci);
        ctx->gc.tl_semaphores.push_back({ semaphore, 0 });
    }
    return &ctx->gc.tl_semaphores[ctx->semaphore_idx++];
}

static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
    if (ctx->event_idx >= ctx->gc.events.size()) {
        ctx->gc.events.push_back(ctx->device->device.createEvent({}));
    }
    return ctx->gc.events[ctx->event_idx++];
}
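// The timeline semaphores and events above are recycled: semaphore_idx and
// event_idx act as high-water marks into the garbage collector's vectors, so
// resetting the counters reuses the existing Vulkan objects instead of
// recreating them.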
static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
    VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
    // Requires command buffers to be done

    ctx->device->device.resetCommandPool(q.pool);
    q.cmd_buffer_idx = 0;
}

static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
    for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
        vk::MemoryType memory_type = mem_props->memoryTypes[i];
        if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
            (flags & memory_type.propertyFlags) == flags &&
            mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
            return i;
        }
    }
    return UINT32_MAX;
}
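// find_properties above: bit i of VkMemoryRequirements::memoryTypeBits is set
// iff memory type i is usable for the resource. A type is accepted when it is
// usable, its property flags contain all requested flags, and its heap can fit
// the allocation. For example, requesting eHostVisible | eHostCoherent skips
// purely device-local memory types on discrete GPUs.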
static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
    VK_LOG_DEBUG("ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")");
    vk_buffer buf = std::make_shared<vk_buffer_struct>();

    if (size == 0) {
        buf->size = 0;
        return buf;
    }

    buf->size = size;
    vk::BufferCreateInfo buffer_create_info{
        vk::BufferCreateFlags(),
        size,
        vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst,
        vk::SharingMode::eExclusive,
        0,
        nullptr,
    };

    buf->buffer = ctx->device->device.createBuffer(buffer_create_info);

    vk::MemoryRequirements mem_req = ctx->device->device.getBufferMemoryRequirements(buf->buffer);

    vk::PhysicalDeviceMemoryProperties mem_props = ctx->device->physical_device.getMemoryProperties();

    uint32_t memory_type_index = UINT32_MAX;

    memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
    buf->memory_property_flags = req_flags;

    if (memory_type_index == UINT32_MAX && fallback_flags) {
        memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
        buf->memory_property_flags = fallback_flags;
    }

    if (memory_type_index == UINT32_MAX) {
        ctx->device->device.destroyBuffer(buf->buffer);
        buf->size = 0;
        throw vk::OutOfDeviceMemoryError("No suitable memory type found");
    }

    try {
        buf->device_memory = ctx->device->device.allocateMemory({ mem_req.size, memory_type_index });
    } catch (const vk::SystemError&) {
        // Out of Host/Device memory, clean up buffer and rethrow the original
        // exception without slicing it
        ctx->device->device.destroyBuffer(buf->buffer);
        buf->size = 0;
        throw;
    }
    buf->ptr = nullptr;

    if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
        buf->ptr = ctx->device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
    }

    ctx->device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);

    buf->ctx = ctx;

    buf->device = ctx->device;

#ifdef GGML_VULKAN_MEMORY_DEBUG
    ctx->memory_logger.log_allocation(buf, size);
#endif

    return buf;
}

static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
    try {
        return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags);
    } catch (const vk::SystemError& e) {
        std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
        throw;
    }
}

static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
    vk_buffer buf;
    try {
        if (ctx->device->uma) {
            // Fall back to host memory type
            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
        } else {
            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
        }
    } catch (const vk::SystemError& e) {
        std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
        throw;
    }

    return buf;
}

static void ggml_vk_destroy_buffer(vk_buffer& buf) {
    if (buf == nullptr) {
        return;
    }

#ifdef GGML_VULKAN_MEMORY_DEBUG
    buf->ctx->memory_logger.log_deallocation(buf);
#endif

    buf.reset();
}

static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
    return { buf, 0, VK_WHOLE_SIZE };
}

static void ggml_vk_sync_buffers(vk_context * ctx) {
    VK_LOG_DEBUG("ggml_vk_sync_buffers()");
    const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };

    ctx->s->buffer.pipelineBarrier(
        ctx->q->stage_flags,
        ctx->q->stage_flags,
        {},
        mem_barriers,
        {},
        {}
    );
}

static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& events) {
    VK_LOG_DEBUG("ggml_vk_wait_events()");
    if (events.empty()) {
        return;
    }

    ctx->s->buffer.waitEvents(
        events,
        ctx->q->stage_flags,
        ctx->q->stage_flags,
        {},
        {},
        {}
    );
}

static bool ggml_vk_build_shader(ggml_type type) {
    switch(type) {
    case GGML_TYPE_F16:
    case GGML_TYPE_Q4_0:
    case GGML_TYPE_Q4_1:
    case GGML_TYPE_Q5_0:
    case GGML_TYPE_Q5_1:
    case GGML_TYPE_Q8_0:
    case GGML_TYPE_Q2_K:
    case GGML_TYPE_Q3_K:
    case GGML_TYPE_Q4_K:
    case GGML_TYPE_Q5_K:
    case GGML_TYPE_Q6_K:
        return true;
    default:
        return false;
    }
}

static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
    VK_LOG_DEBUG("ggml_vk_load_shaders(" << ctx->name << ")");

    const std::shared_ptr<vk_device> device = ctx->device;

    // mulmat
    std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
    std::initializer_list<uint32_t> warptile_m = { 128,  64,  64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
    std::initializer_list<uint32_t> warptile_s = { device->subgroup_size,  32,  32, 16, 32, 32, 2, 2, 2, device->subgroup_size };

    std::initializer_list<uint32_t> warptile_mmq_l = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
    std::initializer_list<uint32_t> warptile_mmq_m = { 128,  64,  64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
    std::initializer_list<uint32_t> warptile_mmq_s = { device->subgroup_size,  32,  32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
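    // Each warptile list supplies the ten specialization constants of one
    // matmul shader variant, consumed positionally (constant_id 0..9); the
    // exact tiling semantics live on the shader side. The *_mmq lists follow
    // the same scheme for the quantized matmul shaders.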
    std::array<uint32_t, 3> l_wg_denoms = {128, 128, 1 };
    std::array<uint32_t, 3> m_wg_denoms = { 64,  64, 1 };
    std::array<uint32_t, 3> s_wg_denoms = { 32,  32, 1 };

    uint32_t l_align = 128;
    uint32_t m_align =  64;
    uint32_t s_align =  32;

    ctx->device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_matmul_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_matmul_f16 = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();

    ctx->device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_matmul_id_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_matmul_id_f16 = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
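    // Every matmul pipeline comes in three tile sizes (l/m/s) plus aligned
    // variants (a_l/a_m/a_s) that require inputs matching the pipeline's
    // align value; the dispatch code picks a variant based on the matrix
    // shapes. The branch below compiles the shader set that uses fp16
    // arithmetic when the device supports it.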
    if (device->fp16) {
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_l, "matmul_f16_aligned_l", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_m, "matmul_f16_aligned_m", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_s, "matmul_f16_aligned_s", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->l, "matmul_f16_f32_l", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->m, "matmul_f16_f32_m", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->s, "matmul_f16_f32_s", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->l, "matmul_q4_0_f32_l", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->m, "matmul_q4_0_f32_m", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->s, "matmul_q4_0_f32_s", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_l, "matmul_q4_0_f32_aligned_l", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_m, "matmul_q4_0_f32_aligned_m", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_s, "matmul_q4_0_f32_aligned_s", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->l, "matmul_q4_1_f32_l", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->m, "matmul_q4_1_f32_m", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->s, "matmul_q4_1_f32_s", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_l, "matmul_q4_1_f32_aligned_l", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_m, "matmul_q4_1_f32_aligned_m", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_s, "matmul_q4_1_f32_aligned_s", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->l, "matmul_q5_0_f32_l", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->m, "matmul_q5_0_f32_m", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->s, "matmul_q5_0_f32_s", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_l, "matmul_q5_0_f32_aligned_l", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_m, "matmul_q5_0_f32_aligned_m", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_s, "matmul_q5_0_f32_aligned_s", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->l, "matmul_q5_1_f32_l", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->m, "matmul_q5_1_f32_m", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->s, "matmul_q5_1_f32_s", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_l, "matmul_q5_1_f32_aligned_l", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
"matmul_q5_1_f32_aligned_m", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1152 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_s, "matmul_q5_1_f32_aligned_s", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1153 1154 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->l, "matmul_q8_0_f32_l", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1155 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->m, "matmul_q8_0_f32_m", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1156 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->s, "matmul_q8_0_f32_s", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1157 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1158 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1159 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1160 1161 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1162 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1163 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1164 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1165 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1166 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1167 
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1184 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1185 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1186 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1187 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1188 1189 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1190 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1191 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1192 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1193 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1194 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1195 1196 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1); 1197 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1); 1198 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1); 1199 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, 
sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align); 1200 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align); 1201 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align); 1202 1203 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1); 1204 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1); 1205 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->s, "matmul_id_f16_s", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1); 1206 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_l, "matmul_id_f16_aligned_l", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align); 1207 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_m, "matmul_id_f16_aligned_m", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align); 1208 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_s, "matmul_id_f16_aligned_s", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align); 1209 1210 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->l, "matmul_id_f16_f32_l", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1); 1211 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->m, "matmul_id_f16_f32_m", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1); 1212 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->s, "matmul_id_f16_f32_s", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1); 1213 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_l, "matmul_id_f16_f32_aligned_l", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align); 1214 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_m, "matmul_id_f16_f32_aligned_m", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align); 1215 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_s, "matmul_id_f16_f32_aligned_s", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align); 1216 1217 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->l, "matmul_id_q4_0_f32_l", matmul_id_q4_0_f32_len, 
matmul_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1218 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->m, "matmul_id_q4_0_f32_m", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1219 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->s, "matmul_id_q4_0_f32_s", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1220 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_l, "matmul_id_q4_0_f32_aligned_l", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1221 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_m, "matmul_id_q4_0_f32_aligned_m", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1222 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_s, "matmul_id_q4_0_f32_aligned_s", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1223 1224 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->l, "matmul_id_q4_1_f32_l", matmul_id_q4_1_f32_len, matmul_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1225 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->m, "matmul_id_q4_1_f32_m", matmul_id_q4_1_f32_len, matmul_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1226 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->s, "matmul_id_q4_1_f32_s", matmul_id_q4_1_f32_len, matmul_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1227 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_l, "matmul_id_q4_1_f32_aligned_l", matmul_id_q4_1_f32_aligned_len, matmul_id_q4_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1228 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_m, "matmul_id_q4_1_f32_aligned_m", matmul_id_q4_1_f32_aligned_len, matmul_id_q4_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1229 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_s, "matmul_id_q4_1_f32_aligned_s", matmul_id_q4_1_f32_aligned_len, matmul_id_q4_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1230 1231 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->l, "matmul_id_q5_0_f32_l", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1232 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->m, "matmul_id_q5_0_f32_m", matmul_id_q5_0_f32_len, 
matmul_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1233 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->s, "matmul_id_q5_0_f32_s", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1234 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_l, "matmul_id_q5_0_f32_aligned_l", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1235 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_m, "matmul_id_q5_0_f32_aligned_m", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1236 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_s, "matmul_id_q5_0_f32_aligned_s", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1237 1238 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->l, "matmul_id_q5_1_f32_l", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1239 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->m, "matmul_id_q5_1_f32_m", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1240 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->s, "matmul_id_q5_1_f32_s", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1241 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_l, "matmul_id_q5_1_f32_aligned_l", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1242 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_m, "matmul_id_q5_1_f32_aligned_m", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1243 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_s, "matmul_id_q5_1_f32_aligned_s", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1244 1245 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->l, "matmul_id_q8_0_f32_l", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1246 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->m, "matmul_id_q8_0_f32_m", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1247 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->s, "matmul_id_q8_0_f32_s", matmul_id_q8_0_f32_len, 
matmul_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1248 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_l, "matmul_id_q8_0_f32_aligned_l", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1249 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_m, "matmul_id_q8_0_f32_aligned_m", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1250 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_s, "matmul_id_q8_0_f32_aligned_s", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1251 1252 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->l, "matmul_id_q2_k_f32_l", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1253 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->m, "matmul_id_q2_k_f32_m", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1254 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->s, "matmul_id_q2_k_f32_s", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1255 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_l, "matmul_id_q2_k_f32_aligned_l", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1256 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_m, "matmul_id_q2_k_f32_aligned_m", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1257 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_s, "matmul_id_q2_k_f32_aligned_s", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1258 1259 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->l, "matmul_id_q3_k_f32_l", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1260 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->m, "matmul_id_q3_k_f32_m", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1261 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->s, "matmul_id_q3_k_f32_s", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1262 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_l, "matmul_id_q3_k_f32_aligned_l", 
matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1263 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_m, "matmul_id_q3_k_f32_aligned_m", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1264 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_s, "matmul_id_q3_k_f32_aligned_s", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1265 1266 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->l, "matmul_id_q4_k_f32_l", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1267 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->m, "matmul_id_q4_k_f32_m", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1268 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->s, "matmul_id_q4_k_f32_s", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1269 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_l, "matmul_id_q4_k_f32_aligned_l", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1270 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_m, "matmul_id_q4_k_f32_aligned_m", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1271 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_s, "matmul_id_q4_k_f32_aligned_s", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1272 1273 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->l, "matmul_id_q5_k_f32_l", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1274 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->m, "matmul_id_q5_k_f32_m", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1275 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->s, "matmul_id_q5_k_f32_s", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1276 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_l, "matmul_id_q5_k_f32_aligned_l", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1277 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_m, 
"matmul_id_q5_k_f32_aligned_m", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1278 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_s, "matmul_id_q5_k_f32_aligned_s", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1279 1280 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->l, "matmul_id_q6_k_f32_l", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1281 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->m, "matmul_id_q6_k_f32_m", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1282 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->s, "matmul_id_q6_k_f32_s", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1283 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1284 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1285 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1286 } else { 1287 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); 1288 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); 1289 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); 1290 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); 1291 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); 1292 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); 1293 1294 ggml_vk_create_pipeline(ctx, 
ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); 1295 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); 1296 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); 1297 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); 1298 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); 1299 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); 1300 1301 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); 1302 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); 1303 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); 1304 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_l, "matmul_f16_aligned_l", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); 1305 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_m, "matmul_f16_aligned_m", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); 1306 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_s, "matmul_f16_aligned_s", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); 1307 1308 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->l, "matmul_f16_f32_l", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); 1309 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->m, "matmul_f16_f32_m", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); 1310 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->s, "matmul_f16_f32_s", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); 1311 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_fp32_len, 
matmul_f16_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); 1312 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); 1313 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); 1314 1315 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->l, "matmul_q4_0_f32_l", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1316 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->m, "matmul_q4_0_f32_m", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1317 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->s, "matmul_q4_0_f32_s", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1318 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_l, "matmul_q4_0_f32_aligned_l", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1319 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_m, "matmul_q4_0_f32_aligned_m", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1320 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_s, "matmul_q4_0_f32_aligned_s", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1321 1322 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->l, "matmul_q4_1_f32_l", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1323 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->m, "matmul_q4_1_f32_m", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1324 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->s, "matmul_q4_1_f32_s", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1325 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_l, "matmul_q4_1_f32_aligned_l", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1326 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_m, "matmul_q4_1_f32_aligned_m", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, 
warptile_mmq_m, m_align); 1327 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_s, "matmul_q4_1_f32_aligned_s", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1328 1329 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->l, "matmul_q5_0_f32_l", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1330 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->m, "matmul_q5_0_f32_m", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1331 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->s, "matmul_q5_0_f32_s", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1332 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_l, "matmul_q5_0_f32_aligned_l", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1333 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_m, "matmul_q5_0_f32_aligned_m", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1334 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_s, "matmul_q5_0_f32_aligned_s", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1335 1336 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->l, "matmul_q5_1_f32_l", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1337 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->m, "matmul_q5_1_f32_m", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1338 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->s, "matmul_q5_1_f32_s", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1339 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_l, "matmul_q5_1_f32_aligned_l", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1340 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_m, "matmul_q5_1_f32_aligned_m", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1341 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_s, "matmul_q5_1_f32_aligned_s", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1342 1343 
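
        // q8_0 and k-quant fp32 fallback matmul pipelines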
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->l, "matmul_q8_0_f32_l", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1344 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->m, "matmul_q8_0_f32_m", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1345 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->s, "matmul_q8_0_f32_s", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1346 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1347 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1348 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1349 1350 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1351 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1352 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1353 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1354 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1355 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1356 1357 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1358 ggml_vk_create_pipeline(ctx, 
ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1359 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1360 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1361 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1362 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1363 1364 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1365 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1366 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1367 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1368 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1369 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1370 1371 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1372 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1373 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, 
"matmul_q5_k_f32_s", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1374 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1375 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1376 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1377 1378 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1379 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1380 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1381 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1382 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1383 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1384 1385 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1); 1386 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1); 1387 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1); 1388 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 
l_align); 1389 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align); 1390 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align); 1391 1392 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1); 1393 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1); 1394 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->s, "matmul_id_f16_s", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1); 1395 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_l, "matmul_id_f16_aligned_l", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align); 1396 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_m, "matmul_id_f16_aligned_m", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align); 1397 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_s, "matmul_id_f16_aligned_s", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align); 1398 1399 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->l, "matmul_id_f16_f32_l", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1); 1400 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->m, "matmul_id_f16_f32_m", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1); 1401 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->s, "matmul_id_f16_f32_s", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1); 1402 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_l, "matmul_id_f16_f32_aligned_l", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align); 1403 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_m, "matmul_id_f16_f32_aligned_m", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align); 1404 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_s, "matmul_id_f16_f32_aligned_s", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align); 1405 1406 ggml_vk_create_pipeline(ctx, 
ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->l, "matmul_id_q4_0_f32_l", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1407 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->m, "matmul_id_q4_0_f32_m", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1408 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->s, "matmul_id_q4_0_f32_s", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1409 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_l, "matmul_id_q4_0_f32_aligned_l", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1410 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_m, "matmul_id_q4_0_f32_aligned_m", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1411 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_s, "matmul_id_q4_0_f32_aligned_s", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1412 1413 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->l, "matmul_id_q4_1_f32_l", matmul_id_q4_1_f32_fp32_len, matmul_id_q4_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1414 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->m, "matmul_id_q4_1_f32_m", matmul_id_q4_1_f32_fp32_len, matmul_id_q4_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1415 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->s, "matmul_id_q4_1_f32_s", matmul_id_q4_1_f32_fp32_len, matmul_id_q4_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1416 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_l, "matmul_id_q4_1_f32_aligned_l", matmul_id_q4_1_f32_aligned_fp32_len, matmul_id_q4_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1417 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_m, "matmul_id_q4_1_f32_aligned_m", matmul_id_q4_1_f32_aligned_fp32_len, matmul_id_q4_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1418 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_s, "matmul_id_q4_1_f32_aligned_s", matmul_id_q4_1_f32_aligned_fp32_len, matmul_id_q4_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1419 1420 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->l, "matmul_id_q5_0_f32_l", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, 
"main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1421 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->m, "matmul_id_q5_0_f32_m", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1422 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->s, "matmul_id_q5_0_f32_s", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1423 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_l, "matmul_id_q5_0_f32_aligned_l", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1424 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_m, "matmul_id_q5_0_f32_aligned_m", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1425 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_s, "matmul_id_q5_0_f32_aligned_s", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1426 1427 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->l, "matmul_id_q5_1_f32_l", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1428 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->m, "matmul_id_q5_1_f32_m", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1429 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->s, "matmul_id_q5_1_f32_s", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1430 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_l, "matmul_id_q5_1_f32_aligned_l", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1431 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_m, "matmul_id_q5_1_f32_aligned_m", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1432 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_s, "matmul_id_q5_1_f32_aligned_s", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1433 1434 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->l, "matmul_id_q8_0_f32_l", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1435 ggml_vk_create_pipeline(ctx, 
ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->m, "matmul_id_q8_0_f32_m", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1436 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->s, "matmul_id_q8_0_f32_s", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1437 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_l, "matmul_id_q8_0_f32_aligned_l", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1438 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_m, "matmul_id_q8_0_f32_aligned_m", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1439 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_s, "matmul_id_q8_0_f32_aligned_s", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1440 1441 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->l, "matmul_id_q2_k_f32_l", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1442 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->m, "matmul_id_q2_k_f32_m", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1443 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->s, "matmul_id_q2_k_f32_s", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1444 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_l, "matmul_id_q2_k_f32_aligned_l", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1445 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_m, "matmul_id_q2_k_f32_aligned_m", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1446 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_s, "matmul_id_q2_k_f32_aligned_s", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1447 1448 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->l, "matmul_id_q3_k_f32_l", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1449 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->m, "matmul_id_q3_k_f32_m", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, 
"main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1450 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->s, "matmul_id_q3_k_f32_s", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1451 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_l, "matmul_id_q3_k_f32_aligned_l", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1452 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_m, "matmul_id_q3_k_f32_aligned_m", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1453 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_s, "matmul_id_q3_k_f32_aligned_s", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1454 1455 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->l, "matmul_id_q4_k_f32_l", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1456 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->m, "matmul_id_q4_k_f32_m", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1457 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->s, "matmul_id_q4_k_f32_s", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1458 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_l, "matmul_id_q4_k_f32_aligned_l", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1459 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_m, "matmul_id_q4_k_f32_aligned_m", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1460 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_s, "matmul_id_q4_k_f32_aligned_s", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); 1461 1462 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->l, "matmul_id_q5_k_f32_l", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); 1463 ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->m, "matmul_id_q5_k_f32_m", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); 1464 ggml_vk_create_pipeline(ctx, 
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_l, "matmul_id_q5_k_f32_aligned_l", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_m, "matmul_id_q5_k_f32_aligned_m", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_s, "matmul_id_q5_k_f32_aligned_s", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->l, "matmul_id_q6_k_f32_l", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->m, "matmul_id_q6_k_f32_m", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->s, "matmul_id_q6_k_f32_s", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
    }

    // mul mat vec
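    // One f32-accumulating matrix-vector pipeline per source type; the shader
    // workgroup size is taken from the device subgroup size and passed in as a
    // specialization constant.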
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

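    // The same shaders, but with an f16 src1 (b) input instead of f32.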
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

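    // Indirect (mul_mat_id) matrix-vector variants: the fourth descriptor binding
    // carries the ids buffer that selects which matrix each row is multiplied with.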
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

    // dequant shaders
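    // These expand quantized blocks to f16 (the f32 variant is a plain f32 -> f16
    // conversion); the per-type workgroup denominators account for the different
    // block sizes of the quantization formats.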
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_0], "dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_1], "dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_k", dequant_q2_k_len, dequant_q2_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_k", dequant_q3_k_len, dequant_q3_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);

    // get_rows
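    // Row-gather pipelines; the second set below emits f32 output, as the _f32
    // suffix on the array and shader names indicates.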
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
}

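// Prints name, driver, UMA/fp16 support and subgroup size for one of the
// selected devices. All properties are gathered through a single
// getProperties2() call using a pNext chain.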
static void ggml_vk_print_gpu_info(size_t idx) {
    GGML_ASSERT(idx < vk_instance.device_indices.size());
    size_t dev_num = vk_instance.device_indices[idx];
    VK_LOG_DEBUG("ggml_vk_print_gpu_info(" << dev_num << ")");
    GGML_ASSERT(vk_instance.initialized);

    std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();

    if (dev_num >= devices.size()) {
        std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl;
        throw std::runtime_error("Device not found");
    }

    vk::PhysicalDevice physical_device = devices[dev_num];
    std::vector<vk::ExtensionProperties> ext_props = physical_device.enumerateDeviceExtensionProperties();

    vk::PhysicalDeviceProperties2 props2;
    vk::PhysicalDeviceMaintenance3Properties props3;
    vk::PhysicalDeviceSubgroupProperties subgroup_props;
    vk::PhysicalDeviceDriverProperties driver_props;
    props2.pNext = &props3;
    props3.pNext = &subgroup_props;
    subgroup_props.pNext = &driver_props;
    physical_device.getProperties2(&props2);

    const size_t subgroup_size = subgroup_props.subgroupSize;
    const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;

    bool fp16_storage = false;
    bool fp16_compute = false;

    for (auto properties : ext_props) {
        if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
            fp16_storage = true;
        } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
            fp16_compute = true;
        }
    }

    const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16");
    bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr;

    bool fp16 = !force_disable_f16 && fp16_storage && fp16_compute;

    vk::PhysicalDeviceFeatures device_features = physical_device.getFeatures();

    VkPhysicalDeviceFeatures2 device_features2;
    device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
    device_features2.pNext = nullptr;
    device_features2.features = (VkPhysicalDeviceFeatures)device_features;

    VkPhysicalDeviceVulkan11Features vk11_features;
    vk11_features.pNext = nullptr;
    vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
    device_features2.pNext = &vk11_features;

    VkPhysicalDeviceVulkan12Features vk12_features;
    vk12_features.pNext = nullptr;
    vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
    vk11_features.pNext = &vk12_features;

    vkGetPhysicalDeviceFeatures2(physical_device, &device_features2);

    fp16 = fp16 && vk12_features.shaderFloat16;

    std::string device_name = props2.properties.deviceName.data();
    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;

    if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
        std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
    }
}

static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);

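// Creates the global Vulkan instance, enabling validation and portability
// enumeration when available, then builds the list of visible devices, either
// from GGML_VK_VISIBLE_DEVICES or by auto-selecting dedicated GPUs.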
void ggml_vk_instance_init() {
    if (vk_instance_initialized) {
        return;
    }
    VK_LOG_DEBUG("ggml_vk_instance_init()");

    vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };

    const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
    const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
#ifdef __APPLE__
    const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
#endif

    std::vector<const char*> layers;

    if (validation_ext) {
        layers.push_back("VK_LAYER_KHRONOS_validation");
    }
    std::vector<const char*> extensions;
    if (validation_ext) {
        extensions.push_back("VK_EXT_validation_features");
    }
#ifdef __APPLE__
    if (portability_enumeration_ext) {
        extensions.push_back("VK_KHR_portability_enumeration");
    }
#endif
    vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
#ifdef __APPLE__
    if (portability_enumeration_ext) {
        instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
    }
#endif

    std::vector<vk::ValidationFeatureEnableEXT> features_enable;
    vk::ValidationFeaturesEXT validation_features;

    if (validation_ext) {
        features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
        validation_features = {
            features_enable,
            {},
        };
        validation_features.setPNext(nullptr);
        instance_create_info.setPNext(&validation_features);

        std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
    }
    vk_instance.instance = vk::createInstance(instance_create_info);

    memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);

    size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();

    // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
    char * devices_env = getenv("GGML_VK_VISIBLE_DEVICES");
    if (devices_env != nullptr) {
        std::string devices(devices_env);
        std::replace(devices.begin(), devices.end(), ',', ' ');

        std::stringstream ss(devices);
        size_t tmp;
        while (ss >> tmp) {
            if (tmp >= num_available_devices) {
                std::cerr << "ggml_vulkan: Invalid device index " << tmp << " in GGML_VK_VISIBLE_DEVICES." << std::endl;
                throw std::runtime_error("Invalid Vulkan device index");
            }
            vk_instance.device_indices.push_back(tmp);
        }
    } else {
        std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();

        // Make sure at least one device exists
        if (devices.empty()) {
            std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
            GGML_ASSERT(false);
        }

        // Default to using all dedicated GPUs
        for (size_t i = 0; i < devices.size(); i++) {
            vk::PhysicalDeviceProperties props = devices[i].getProperties();

            if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
                // Check if there are two physical devices corresponding to the same GPU
                auto old_device = std::find_if(
                    vk_instance.device_indices.begin(),
                    vk_instance.device_indices.end(),
                    [&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; }
                );
                if (old_device == vk_instance.device_indices.end()) {
                    vk_instance.device_indices.push_back(i);
                } else {
                    // There can be two physical devices corresponding to the same GPU if there are two different drivers.
                    // This can cause errors when splitting layers across the devices, so keep only one of them.
                    VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same device id");

                    vk::PhysicalDeviceProperties2 old_prop;
                    vk::PhysicalDeviceDriverProperties old_driver;
                    old_prop.pNext = &old_driver;
                    devices[*old_device].getProperties2(&old_prop);

                    vk::PhysicalDeviceProperties2 new_prop;
                    vk::PhysicalDeviceDriverProperties new_driver;
                    new_prop.pNext = &new_driver;
                    devices[i].getProperties2(&new_prop);

                    std::map<vk::DriverId, int> driver_priorities {};
                    int old_priority = std::numeric_limits<int>::max();
                    int new_priority = std::numeric_limits<int>::max();

                    // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver ids
                    // Smaller number -> higher priority
                    switch (old_prop.properties.vendorID) {
                        case VK_VENDOR_ID_AMD:
                            driver_priorities[vk::DriverId::eMesaRadv] = 1;
                            driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
                            driver_priorities[vk::DriverId::eAmdProprietary] = 3;
                            break;
                        case VK_VENDOR_ID_INTEL:
                            driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
                            driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
                            break;
                        case VK_VENDOR_ID_NVIDIA:
                            driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
                            driver_priorities[vk::DriverId::eMesaNvk] = 2;
#endif
                            break;
                    }

                    if (driver_priorities.count(old_driver.driverID)) {
                        old_priority = driver_priorities[old_driver.driverID];
                    }
                    if (driver_priorities.count(new_driver.driverID)) {
                        new_priority = driver_priorities[new_driver.driverID];
                    }

                    if (new_priority < old_priority) {
                        auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
                        vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
                        vk_instance.device_indices.push_back(i);

                        VK_LOG_DEBUG("Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName);
                    } else {
                        VK_LOG_DEBUG("Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName);
                    }
                }
            }
        }

        // If no dedicated GPUs found, fall back to GPU 0
        if (vk_instance.device_indices.empty()) {
            vk_instance.device_indices.push_back(0);
        }
    }

    std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl;

    for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
        ggml_vk_print_gpu_info(i);
    }

    vk_instance_initialized = true;
}

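// Per-backend-context device initialization: picks the physical device for the
// given index, creates the logical device with the required extensions and
// features, sets up the compute/transfer queues and loads all shader pipelines.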
<< " Vulkan devices:" << std::endl; 1826 1827 for (size_t i = 0; i < vk_instance.device_indices.size(); i++) { 1828 ggml_vk_print_gpu_info(i); 1829 } 1830 1831 vk_instance_initialized = true; 1832 } 1833 1834 static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { 1835 GGML_ASSERT(idx < vk_instance.device_indices.size()); 1836 size_t dev_num = vk_instance.device_indices[idx]; 1837 VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << dev_num << ")"); 1838 ggml_vk_instance_init(); 1839 1840 std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices(); 1841 1842 if (dev_num >= devices.size()) { 1843 std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl; 1844 throw std::runtime_error("Device not found"); 1845 } 1846 1847 ctx->device = ggml_vk_get_device(idx); 1848 if (!ctx->device->initialized) { 1849 ctx->device->physical_device = devices[dev_num]; 1850 const std::vector<vk::ExtensionProperties> ext_props = ctx->device->physical_device.enumerateDeviceExtensionProperties(); 1851 1852 bool maintenance4_support = false; 1853 1854 // Check if maintenance4 is supported 1855 for (const auto& properties : ext_props) { 1856 if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { 1857 maintenance4_support = true; 1858 } 1859 } 1860 1861 vk::PhysicalDeviceProperties2 props2; 1862 vk::PhysicalDeviceMaintenance3Properties props3; 1863 vk::PhysicalDeviceMaintenance4Properties props4; 1864 vk::PhysicalDeviceSubgroupProperties subgroup_props; 1865 props2.pNext = &props3; 1866 props3.pNext = &subgroup_props; 1867 if (maintenance4_support) { 1868 subgroup_props.pNext = &props4; 1869 } 1870 ctx->device->physical_device.getProperties2(&props2); 1871 ctx->device->properties = props2.properties; 1872 1873 const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE"); 1874 1875 if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) { 1876 ctx->device->max_memory_allocation_size = std::stoi(GGML_VK_FORCE_MAX_ALLOCATION_SIZE); 1877 } else if (maintenance4_support) { 1878 ctx->device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize); 1879 } else { 1880 ctx->device->max_memory_allocation_size = props3.maxMemoryAllocationSize; 1881 } 1882 1883 ctx->device->vendor_id = ctx->device->properties.vendorID; 1884 ctx->device->subgroup_size = subgroup_props.subgroupSize; 1885 ctx->device->uma = ctx->device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; 1886 1887 bool fp16_storage = false; 1888 bool fp16_compute = false; 1889 1890 for (const auto& properties : ext_props) { 1891 if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { 1892 fp16_storage = true; 1893 } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { 1894 fp16_compute = true; 1895 } 1896 } 1897 1898 const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16"); 1899 const bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr; 1900 1901 ctx->device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute; 1902 1903 std::vector<vk::QueueFamilyProperties> queue_family_props = ctx->device->physical_device.getQueueFamilyProperties(); 1904 1905 // Try to find a non-graphics compute queue and transfer-focused queues 1906 const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1); 1907 const uint32_t transfer_queue_family_index = 

        const float priorities[] = { 1.0f, 1.0f };
        ctx->device->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1;

        std::vector<vk::DeviceQueueCreateInfo> device_queue_create_infos;
        if (compute_queue_family_index != transfer_queue_family_index) {
            device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities});
            device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1, priorities + 1});
        } else if (!ctx->device->single_queue) {
            device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 2, priorities});
        } else {
            device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities});
        }
        vk::DeviceCreateInfo device_create_info;
        std::vector<const char *> device_extensions;
        vk::PhysicalDeviceFeatures device_features = ctx->device->physical_device.getFeatures();

        VkPhysicalDeviceFeatures2 device_features2;
        device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
        device_features2.pNext = nullptr;
        device_features2.features = (VkPhysicalDeviceFeatures)device_features;

        VkPhysicalDeviceVulkan11Features vk11_features;
        vk11_features.pNext = nullptr;
        vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
        device_features2.pNext = &vk11_features;

        VkPhysicalDeviceVulkan12Features vk12_features;
        vk12_features.pNext = nullptr;
        vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
        vk11_features.pNext = &vk12_features;

        vkGetPhysicalDeviceFeatures2(ctx->device->physical_device, &device_features2);

        ctx->device->fp16 = ctx->device->fp16 && vk12_features.shaderFloat16;

        if (!vk11_features.storageBuffer16BitAccess) {
            std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl;
            throw std::runtime_error("Unsupported device");
        }

        device_extensions.push_back("VK_KHR_16bit_storage");

#ifdef GGML_VULKAN_VALIDATE
        device_extensions.push_back("VK_KHR_shader_non_semantic_info");
#endif

        if (ctx->device->fp16) {
            device_extensions.push_back("VK_KHR_shader_float16_int8");
        }
        ctx->device->name = ctx->device->properties.deviceName.data();

        device_create_info = {
            vk::DeviceCreateFlags(),
            device_queue_create_infos,
            {},
            device_extensions
        };
        device_create_info.setPNext(&device_features2);
        ctx->device->device = ctx->device->physical_device.createDevice(device_create_info);

        ctx->device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN;

        // Queues
        ggml_vk_create_queue(ctx, ctx->device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer });

        // Shaders
        ggml_vk_load_shaders(ctx);

        if (!ctx->device->single_queue) {
            const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
            ggml_vk_create_queue(ctx, ctx->device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer });
        } else {
            // TODO: Use pointer or reference to avoid copy
            ctx->device->transfer_queue = ctx->device->compute_queue;
        }

        ctx->device->idx = dev_num;
        ctx->device->initialized = true;
    } else if (ctx->device->idx != dev_num) {
        std::cerr << "ggml_vulkan: Device " << ctx->device->name << " already initialized with index " << ctx->device->idx << ", but trying to reinitialize with index " << dev_num << std::endl;
        throw std::runtime_error("Device already initialized");
    }

    ctx->fence = ctx->device->device.createFence({});

    ctx->compute_ctx = nullptr;
    ctx->transfer_ctx = nullptr;

    ctx->initialized = true;

    ctx->idx = idx;

#ifdef GGML_VULKAN_CHECK_RESULTS
    const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
    vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
    const char* output_tensor = getenv("GGML_VULKAN_OUTPUT_TENSOR");
    vk_output_tensor = (output_tensor == NULL ? 0 : atoi(output_tensor));
#endif
}

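// Pipeline lookup helpers. Each maps a ggml tensor type (or type pair) to the
// pipeline created in ggml_vk_load_shaders, returning nullptr for unsupported
// types so callers can reject the op or fall back.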
static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
    VK_LOG_DEBUG("ggml_vk_get_to_fp16()");
    switch (type) {
        case GGML_TYPE_F32:
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
            break;
        default:
            return nullptr;
    }

    return ctx->device->pipeline_dequant[type];
}

static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()");
    if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
        return ctx->device->pipeline_matmul_f32;
    }
    if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
        return ctx->device->pipeline_matmul_f32_f16;
    }
    if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
        return ctx->device->pipeline_matmul_f16_f32;
    }
    if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
        return ctx->device->pipeline_matmul_f16;
    }

    GGML_ASSERT(src1_type == GGML_TYPE_F32);

    switch (src0_type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
            break;
        default:
            return nullptr;
    }

    return ctx->device->pipeline_dequant_mul_mat_mat[src0_type];
}

static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
    VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
    GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);

    switch (a_type) {
        case GGML_TYPE_F32:
        case GGML_TYPE_F16:
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
            break;
        default:
            return nullptr;
    }

    return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[a_type] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[a_type];
}

static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()");
    if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
        return ctx->device->pipeline_matmul_id_f32;
    }
    if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
        return ctx->device->pipeline_matmul_id_f16_f32;
    }
    if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
        return ctx->device->pipeline_matmul_id_f16;
    }

    GGML_ASSERT(src1_type == GGML_TYPE_F32);

    switch (src0_type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
            break;
        default:
            return nullptr;
    }

    return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type];
}

static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
    VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec_id()");
    GGML_ASSERT(b_type == GGML_TYPE_F32);

    switch (a_type) {
        case GGML_TYPE_F32:
        case GGML_TYPE_F16:
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
            break;
        default:
            return nullptr;
    }

    return ctx->device->pipeline_dequant_mul_mat_vec_id_f32[a_type];
}

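// Simple device buffer pool: up to MAX_VK_BUFFERS buffers are kept around and
// handed out on a best-fit basis to avoid repeatedly allocating and freeing
// temporary buffers.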
static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
    VK_LOG_DEBUG("ggml_vk_pool_malloc(" << size << ")");
    VK_LOG_MEMORY("ggml_vk_pool_malloc");

    int best_i = -1;
    size_t best_size = std::numeric_limits<size_t>::max(); // smallest unused buffer that fits our needs
    int worst_i = -1;
    size_t worst_size = 0; // largest unused buffer seen so far
    for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
        vk_buffer &b = ctx->buffer_pool[i];
        if (b != nullptr && b->size >= size && b->size < best_size) {
            best_i = i;
            best_size = b->size;
        }
        if (b != nullptr && b->size > worst_size) {
            worst_i = i;
            worst_size = b->size;
        }
    }
    if (best_i != -1) {
        // found the smallest buffer that fits our needs
        vk_buffer b = ctx->buffer_pool[best_i];
        ctx->buffer_pool[best_i].reset();
        return b;
    }
    if (worst_i != -1) {
        // no buffer that fits our needs, destroy the largest one to free memory before allocating a new buffer
        vk_buffer& b = ctx->buffer_pool[worst_i];
        ggml_vk_destroy_buffer(b);
    }

    return ggml_vk_create_buffer_device(ctx, size);
}

static void ggml_vk_pool_free(ggml_backend_vk_context * ctx, vk_buffer& buffer) {
    VK_LOG_DEBUG("ggml_vk_pool_free(" << buffer->size << ")");
    for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
        vk_buffer& b = ctx->buffer_pool[i];
        if (b == nullptr) {
            b = buffer;
            return;
        }
    }
    std::cerr << "ggml_vulkan: WARNING: vk buffer pool full, increase MAX_VK_BUFFERS" << std::endl;
    ggml_vk_destroy_buffer(buffer);
}

// Returns an available temporary buffer; it may only be used temporarily, as it will be reused
static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_t size) {
    // Try to find existing temp buffer with enough capacity
    for (auto& buffer : ctx->gc.temp_buffers) {
        if (buffer->size >= size) {
            return buffer;
        }
    }

    VK_LOG_MEMORY("ggml_vk_create_buffer_temp(" << size << ")");

    // Otherwise create new buffer
    vk_buffer buf = ggml_vk_pool_malloc(ctx, size);
    ctx->gc.temp_buffers.push_back(buf);

    return buf;
}

static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
    VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
    vk_buffer buf = ggml_vk_create_buffer(ctx, size,
        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);

    if (!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
            size/1024.0/1024.0);
        ctx->device->device.freeMemory(buf->device_memory);
        ctx->device->device.destroyBuffer(buf->buffer);
        return nullptr;
    }

    ctx->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf));

    return buf->ptr;
}

static void ggml_vk_host_free(ggml_backend_vk_context * ctx, void* ptr) {
    if (ptr == nullptr) {
        return;
    }
    VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")");
    vk_buffer buf;
    size_t index;
    for (size_t i = 0; i < ctx->pinned_memory.size(); i++) {
        const uint8_t* addr = (const uint8_t*) std::get<0>(ctx->pinned_memory[i]);
        const uint8_t* endr = addr + std::get<1>(ctx->pinned_memory[i]);
        if (ptr >= addr && ptr < endr) {
            buf = std::get<2>(ctx->pinned_memory[i]);
            index = i;
            break;
        }
    }
    if (buf == nullptr) {
        fprintf(stderr, "WARNING: failed to free pinned memory: memory not in map\n");
        return;
    }

    ggml_vk_destroy_buffer(buf);

    ctx->pinned_memory.erase(ctx->pinned_memory.begin() + index);
}

static void ggml_vk_host_get(ggml_backend_vk_context * ctx, const void * ptr, vk_buffer& buf, size_t& buf_offset) {
    buf = nullptr;
    buf_offset = 0;
    for (size_t i = 0; i < ctx->pinned_memory.size(); i++) {
        const uint8_t* addr = (const uint8_t*) std::get<0>(ctx->pinned_memory[i]);
        const uint8_t* endr = addr + std::get<1>(ctx->pinned_memory[i]);
        if (ptr >= addr && ptr < endr) {
            buf = std::get<2>(ctx->pinned_memory[i]);
            buf_offset = ((const uint8_t *)ptr) - addr;
            break;
        }
    }
}

static vk_submission ggml_vk_begin_submission(ggml_backend_vk_context * ctx, vk_queue& q, bool one_time = true) {
    vk_submission s;
    s.buffer = ggml_vk_create_cmd_buffer(ctx, q);
    if (one_time) {
        s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
    } else {
        s.buffer.begin({ vk::CommandBufferUsageFlags{} });
    }

    return s;
}

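// Records one compute dispatch: updates a pre-allocated descriptor set with the
// given buffer bindings, pushes the constants and dispatches a grid sized by
// dividing the element counts by the pipeline's workgroup denominators.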
static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, std::vector<vk_subbuffer>&& buffers, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
    const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
    const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
    const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
    VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
    for (auto& buffer : buffers) {
        std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
    }
    std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
    std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
    std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
    GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
    GGML_ASSERT(buffers.size() == pipeline->parameter_count);
    vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
    for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
        descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size});
    }
    for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
        write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]});
    }

    ctx->device->device.updateDescriptorSets(write_descriptor_sets, {});

    subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
    subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
    subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
                                pipeline->layout,
                                0,
                                { descriptor_set },
                                {});
    subctx->s->buffer.dispatch(wg0, wg1, wg2);
}

static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
    s.buffer.end();

    s.wait_semaphores = std::move(wait_semaphores);
    s.signal_semaphores = std::move(signal_semaphores);
}

static void ggml_vk_ctx_end(vk_context * ctx) {
    VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
    if (ctx->s == nullptr) {
        return;
    }

    ctx->s->buffer.end();
    ctx->s = nullptr;
}

static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx) {
    VK_LOG_DEBUG("ggml_vk_ctx_begin(" << ctx << ")");
    if (subctx->s != nullptr) {
        ggml_vk_ctx_end(subctx);
    }

    subctx->seqs.push_back({ ggml_vk_begin_submission(ctx, *subctx->q) });
    subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
}

static size_t ggml_vk_align_size(size_t width, size_t align) {
    VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")");
    return CEIL_DIV(width, align) * align;
}

static void deferred_memcpy(void * dst, const void * src, size_t size, std::vector<vk_staging_memcpy>* memcpys = nullptr) {
    if (memcpys == nullptr) {
        memcpy(dst, src, size);
    } else {
        memcpys->emplace_back(dst, src, size);
    }
}

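// Host-to-device transfers go through a staging buffer unless the source is
// pinned host memory. The synchronous staging buffer below is grown on demand,
// and copies into staging memory can be deferred via deferred_memcpy (collected
// in the context's in_memcpys list) and executed later, before submission.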
static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
    VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
    GGML_ASSERT(!ggml_is_contiguous(tensor));
    // Buffer is already mapped
    if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
        std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
        GGML_ASSERT(false);
    }
    // Check if src is pinned memory
    vk_buffer buf;
    size_t buf_offset;
    ggml_vk_host_get(ctx, tensor->data, buf, buf_offset);

    const uint64_t ne0 = tensor->ne[0];
    const uint64_t ne1 = tensor->ne[1];
    const uint64_t ne2 = tensor->ne[2];
    const uint64_t ne3 = tensor->ne[3];
    const uint64_t nb0 = tensor->nb[0];
    const uint64_t nb1 = tensor->nb[1];
    const uint64_t nb2 = tensor->nb[2];
    const uint64_t nb3 = tensor->nb[3];
    const ggml_type type = tensor->type;
    const uint64_t ts = ggml_type_size(type);
    const uint64_t bs = ggml_blck_size(type);

    const uint64_t dstnb0 = ts;
    const uint64_t dstnb1 = dstnb0*(ne0/bs);
    const uint64_t dstnb2 = dstnb1*ne1;
    const uint64_t dstnb3 = dstnb2*ne2;

    const uint64_t ne = ggml_nelements(tensor);

    if (buf != nullptr) {
        // Memory is pinned, use as staging buffer
        std::vector<vk::BufferCopy> slices;

        for (uint64_t i3 = 0; i3 < ne3; i3++) {
            for (uint64_t i2 = 0; i2 < ne2; i2++) {
                // Find longest contiguous slice
                if (ne1*nb1 == dstnb2) {
                    slices.push_back({ buf_offset + i3*nb3 + i2*nb2, offset + i3*dstnb3 + i2*dstnb2, dstnb2 });
                } else {
                    for (uint64_t i1 = 0; i1 < ne1; i1++) {
                        if (ne0*nb0/bs == dstnb1) {
                            slices.push_back({ buf_offset + i3*nb3 + i2*nb2 + i1*nb1, offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, dstnb1 });
                        } else {
                            const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
                            const uint64_t d_off = offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
                            for (uint64_t i0 = 0; i0 < ne0; i0++) {
                                slices.push_back({ s_off + i0*nb0, d_off + i0*dstnb0, dstnb0 });
                            }
                        }
                    }
                }
            }
        }

        ggml_vk_sync_buffers(subctx);
        subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
        return;
    }

    // Staging buffer required
    vk_buffer staging = ctx->staging;
    size_t staging_offset = ctx->staging_offset;
    const size_t copy_size = ts*ne/bs;
    if (ctx->staging == nullptr || ctx->staging->size < ctx->staging_offset + copy_size) {
        if (sync_staging) {
            // Create temporary larger buffer
            ggml_vk_ensure_sync_staging_buffer(ctx, copy_size);

            staging = ctx->sync_staging;
            staging_offset = 0;
        } else {
            GGML_ASSERT(false);
        }
    }

    VkBufferCopy buf_copy{ staging_offset, offset, copy_size };

    ggml_vk_sync_buffers(subctx);
    vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy);

    for (uint64_t i3 = 0; i3 < ne3; i3++) {
        for (uint64_t i2 = 0; i2 < ne2; i2++) {
            // Find longest contiguous slice
            if (ne1*nb1 == dstnb2) {
                deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys);
            } else {
                for (uint64_t i1 = 0; i1 < ne1; i1++) {
                    if (ne0*nb0/bs == dstnb1) {
                        deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys);
                    } else {
                        const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
                        const uint64_t d_off = staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
                        for (uint64_t i0 = 0; i0 < ne0; i0++) {
                            deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &subctx->in_memcpys);
                        }
                    }
                }
            }
        }
    }
}
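// Worked example for the stride math above (values illustrative, not from the
// code): an F32 tensor with ne = {4, 3, 2, 1} whose rows are padded to
// nb1 = 32 bytes has ts = 4 and bs = 1, so the packed destination strides are
// dstnb0 = 4, dstnb1 = 16, dstnb2 = 48, dstnb3 = 96. Since ne1*nb1 = 96 != 48
// a whole plane is not contiguous, but ne0*nb0/bs = 16 == dstnb1, so the
// transfer is emitted as one 16-byte slice per row, 6 slices in total.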
static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
    VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
    // Make sure ctx owns the buffer
    GGML_ASSERT(dst->ctx == ctx);

    // Buffer is already mapped
    if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
        std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
        GGML_ASSERT(false);
    }
    // Check if src is pinned memory
    vk_buffer buf = nullptr;
    size_t buf_offset;
    ggml_vk_host_get(ctx, src, buf, buf_offset);

    if (buf != nullptr) {
        // Memory is pinned, use as staging buffer
        std::vector<vk::BufferCopy> slices(1);
        if (width == spitch) {
            // Only do single write if stride is equal
            slices[0].srcOffset = buf_offset;
            slices[0].dstOffset = offset;
            slices[0].size = width * height;
        } else {
            slices.resize(height);
            for (size_t i = 0; i < height; i++) {
                slices[i].srcOffset = buf_offset + i * spitch;
                slices[i].dstOffset = offset + i * width;
                slices[i].size = width;
            }
        }

        ggml_vk_sync_buffers(subctx);
        subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
        return;
    }
    VK_LOG_DEBUG("STAGING");

    // Staging buffer required
    vk_buffer staging = ctx->staging;
    size_t staging_offset = ctx->staging_offset;
    const size_t copy_size = width*height;
    if (ctx->staging == nullptr || ctx->staging->size < ctx->staging_offset + copy_size) {
        if (sync_staging) {
            ggml_vk_ensure_sync_staging_buffer(ctx, copy_size);

            staging = ctx->sync_staging;
            staging_offset = 0;
        } else {
            GGML_ASSERT(false);
        }
    }

    VkBufferCopy buf_copy = {
        staging_offset,
        offset,
        copy_size};

    ggml_vk_sync_buffers(subctx);
    vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy);

    if (width == spitch) {
        deferred_memcpy((uint8_t *)staging->ptr + staging_offset, src, width * height, &subctx->in_memcpys);
    } else {
        for (size_t i = 0; i < height; i++) {
            deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys);
        }
    }
}

static void ggml_vk_buffer_write_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
    VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
    return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, size, size, 1, sync_staging);
}
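// The 2D write treats `src` like a pitched image: `spitch` is the byte stride
// between consecutive source rows and `width` is the payload per row, so the
// destination ends up densely packed. An illustrative call (names hypothetical):
//
//     // copy 4 rows of 256 bytes out of a host array padded to 512 bytes/row
//     ggml_vk_buffer_write_2d_async(ctx, subctx, dst, 0, host_ptr, 512, 256, 4, true);
//
// With width != spitch this becomes 4 copy slices (or 4 deferred memcpys on
// the staging path); with width == spitch it collapses into one
// width*height transfer.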
")"); 2546 return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, size, size, 1, sync_staging); 2547 } 2548 2549 static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) { 2550 VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")"); 2551 // Buffer is already mapped 2552 if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { 2553 GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); 2554 2555 for (size_t i = 0; i < height; i++) { 2556 memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width); 2557 } 2558 } else { 2559 vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); 2560 ggml_vk_ctx_begin(ctx, subctx); 2561 ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, spitch, width, height, true); 2562 ggml_vk_ctx_end(subctx); 2563 2564 for (auto& cpy : subctx->in_memcpys) { 2565 memcpy(cpy.dst, cpy.src, cpy.n); 2566 } 2567 2568 ggml_vk_submit(subctx, ctx->fence); 2569 VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences"); 2570 ctx->device->device.resetFences({ ctx->fence }); 2571 } 2572 } 2573 2574 static void ggml_vk_buffer_write(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t size) { 2575 VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")"); 2576 ggml_vk_buffer_write_2d(ctx, dst, offset, src, 0, size, 1); 2577 } 2578 2579 static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) { 2580 VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")"); 2581 GGML_ASSERT(width > 0); 2582 GGML_ASSERT(height > 0); 2583 GGML_ASSERT(src != nullptr); 2584 // Make sure ctx owns the buffer 2585 GGML_ASSERT(src->ctx == ctx); 2586 2587 // Check if dst is pinned memory 2588 vk_buffer buf = nullptr; 2589 size_t buf_offset; 2590 ggml_vk_host_get(ctx, dst, buf, buf_offset); 2591 2592 std::vector<vk::BufferCopy> slices(1); 2593 if (width == spitch && width == dpitch) { 2594 // Only do single write if stride is equal 2595 slices[0].srcOffset = offset; 2596 slices[0].dstOffset = buf_offset; 2597 slices[0].size = width * height; 2598 } else { 2599 slices.resize(height); 2600 for (size_t i = 0; i < height; i++) { 2601 slices[i].srcOffset = offset + i * spitch; 2602 slices[i].dstOffset = buf_offset + i * dpitch; 2603 slices[i].size = width; 2604 } 2605 } 2606 2607 if (buf != nullptr) { 2608 // Memory is pinned, use as staging buffer 2609 ggml_vk_sync_buffers(subctx); 2610 subctx->s->buffer.copyBuffer(src->buffer, buf->buffer, slices); 2611 2612 return; 2613 } 2614 VK_LOG_DEBUG("STAGING"); 2615 2616 // Fall back to staging buffer 2617 vk_buffer staging = ctx->staging; 2618 const size_t copy_size = dpitch * height; 2619 if (ctx->staging == nullptr || ctx->staging->size < ctx->staging_offset + copy_size) { 2620 if (sync_staging) { 2621 // Create temporary larger buffer 2622 ggml_vk_ensure_sync_staging_buffer(ctx, copy_size); 2623 2624 staging = ctx->sync_staging; 2625 } else { 2626 GGML_ASSERT(false); 2627 } 2628 } 2629 2630 ggml_vk_sync_buffers(subctx); 2631 subctx->s->buffer.copyBuffer(src->buffer, staging->buffer, slices); 2632 2633 deferred_memcpy(dst, 
static void ggml_vk_buffer_read_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t size, bool sync_staging = false) {
    return ggml_vk_buffer_read_2d_async(ctx, subctx, src, offset, dst, size, size, size, 1, sync_staging);
}

static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, size_t offset, void * dst, size_t size) {
    VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")");
    if (src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
        GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);

        memcpy(dst, (uint8_t *) src->ptr + offset, size);
    } else {
        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
        ggml_vk_ctx_begin(ctx, subctx);
        ggml_vk_buffer_read_async(ctx, subctx, src, offset, dst, size, true);
        ggml_vk_ctx_end(subctx);

        ggml_vk_submit(subctx, ctx->fence);
        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
        ctx->device->device.resetFences({ ctx->fence });

        for (auto& cpy : subctx->out_memcpys) {
            memcpy(cpy.dst, cpy.src, cpy.n);
        }
    }
}

static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
    VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
    // Make sure both buffers are on same ctx
    GGML_ASSERT(src->ctx == dst->ctx);

    VkBufferCopy bc{ src_offset, dst_offset, size };

    vkCmdCopyBuffer(ctx->s->buffer, src->buffer, dst->buffer, 1, &bc);
}

static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
    if (src->ctx == dst->ctx) {
        VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
        // Copy within the device
        ggml_backend_vk_context * ctx = src->ctx;

        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
        ggml_vk_ctx_begin(ctx, subctx);
        ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
        ggml_vk_ctx_end(subctx);
        ggml_vk_submit(subctx, ctx->fence);
        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
        ctx->device->device.resetFences({ ctx->fence });
    } else {
        VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
        // Copy device to device
        ggml_backend_vk_context * src_ctx = src->ctx;
        ggml_backend_vk_context * dst_ctx = dst->ctx;

        ggml_vk_ensure_sync_staging_buffer(src_ctx, size);
        ggml_vk_ensure_sync_staging_buffer(dst_ctx, size);

        // Copy to src staging buffer
        ggml_vk_buffer_copy(src_ctx->sync_staging, 0, src, src_offset, size);
        // memcpy to dst staging buffer
        memcpy(dst_ctx->sync_staging->ptr, src_ctx->sync_staging->ptr, size);
        // Copy to dst buffer
        ggml_vk_buffer_copy(dst, dst_offset, dst_ctx->sync_staging, 0, size);
    }
}
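// There is no direct GPU-to-GPU path here, so a cross-device copy above takes
// three hops: device -> source staging buffer, host memcpy between the two
// mapped staging buffers, staging -> destination device. Copying 1 MiB between
// two GPUs therefore costs two fully synchronous device transfers plus one
// 1 MiB host memcpy, which is why the single-device branch is preferred
// whenever both buffers share a context.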
static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
    VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
    // Make sure ctx owns the buffer
    GGML_ASSERT(dst->ctx == ctx);

    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
    ggml_vk_ctx_begin(ctx, subctx);
    subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
    ggml_vk_ctx_end(subctx);

    ggml_vk_submit(subctx, ctx->fence);
    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_memset waitForFences");
    ctx->device->device.resetFences({ ctx->fence });
}

static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) {
    VK_LOG_DEBUG("ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")");
    const uint64_t ne0 = src->ne[0];
    const uint64_t ne1 = src->ne[1];
    const uint64_t nb0 = src->nb[0];
    const uint64_t nb1 = src->nb[1];
    const uint64_t nb2 = src->nb[2];
    const uint64_t nb3 = src->nb[3];
    const enum ggml_type type = src->type;
    const size_t ts = ggml_type_size(type);
    const size_t bs = ggml_blck_size(type);
    const size_t row_length = ts*ne0/bs;

    const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
    if (nb0 == ts && nb1 == row_length) {
        return ggml_vk_buffer_write_async(ctx, subctx, dst, offset, x, i1*nb1);
    }
    if (nb0 == ts && (i1 == ne1 || !ggml_is_permuted(src))) {
        return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, x, nb1, row_length, i1);
    }

    GGML_ASSERT(i3 == 0);
    GGML_ASSERT(i2 == 0);
    GGML_ASSERT(i1 == (uint64_t) ggml_nrows(src));

    return ggml_vk_buffer_write_nc_async(ctx, subctx, dst, offset, src);
}

static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, const ggml_tensor * dst) {
    VK_LOG_DEBUG("ggml_vk_d2h_tensor_2d()");
    const uint64_t ne0 = dst->ne[0];
    const uint64_t ne1 = dst->ne[1];
    const uint64_t ne2 = dst->ne[2];
    const uint64_t ne3 = dst->ne[3];
    const uint64_t nb0 = dst->nb[0];
    const uint64_t nb1 = dst->nb[1];
    // const uint64_t nb2 = dst->nb[2];
    // const uint64_t nb3 = dst->nb[3];
    const enum ggml_type type = dst->type;
    const size_t ts = ggml_type_size(type);
    const size_t bs = ggml_blck_size(type);
    const size_t row_length = ts*ne0/bs;

    if (ggml_is_contiguous(dst)) {
        return ggml_vk_buffer_read_async(ctx, subctx, src, offset, dst->data, ne1*nb1*ne2*ne3);
    }
    if (nb0 == ts) {
        return ggml_vk_buffer_read_2d_async(ctx, subctx, src, offset, dst->data, nb1, nb1, row_length, ne1*ne2*ne3);
    }
    GGML_ASSERT(false);
}

static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
    VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
    // if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
    //     return 4;
    // }

    return 1;

    GGML_UNUSED(m); GGML_UNUSED(n); GGML_UNUSED(k);
}
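// Background for the (currently disabled) heuristic above: split_k > 1 would
// let several workgroup slices each accumulate a partial result over roughly
// k/split_k elements, with pipeline_matmul_split_k_reduce summing the partials
// afterwards. That trades extra memory (split_k copies of the output) for more
// parallelism when m and n are small but k is large, e.g. m = n = 64,
// k = 4096, split_k = 4 gives four partial 64x64 outputs over k = 1024 each,
// followed by one reduction pass. The hardcoded `return 1` keeps split-k off;
// ggml_vk_matmul() below still implements both paths.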
static vk_pipeline ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
    if (m <= 32 || n <= 32) {
        return aligned ? mmp->a_s : mmp->s;
    }
    return aligned ? mmp->a_m : mmp->m;

    GGML_UNUSED(ctx);
}

static vk_pipeline ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, bool aligned) {
    return aligned ? mmp->a_m : mmp->m;

    GGML_UNUSED(ctx);
}

static vk_pipeline ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, bool aligned) {
    return aligned ? mmp->a_s : mmp->s;

    GGML_UNUSED(ctx);
}

static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")");
    switch (ctx->device->vendor_id) {
    case VK_VENDOR_ID_AMD:
        return ggml_vk_guess_matmul_pipeline_amd(ctx, mmp, m, n, aligned);
    case VK_VENDOR_ID_APPLE:
        return ggml_vk_guess_matmul_pipeline_apple(ctx, mmp, aligned);
    case VK_VENDOR_ID_INTEL:
        return ggml_vk_guess_matmul_pipeline_intel(ctx, mmp, aligned);
    default:
        break;
    }

    if (m <= 32 || n <= 32) {
        return aligned ? mmp->a_s : mmp->s;
    }
    if (m <= 64 || n <= 64) {
        return aligned ? mmp->a_m : mmp->m;
    }
    return aligned ? mmp->a_l : mmp->l;
}

static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) {
    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")");
    return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align;
}
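// Worked example of the selection above (values illustrative): on a device
// that is not AMD, Apple, or Intel, m = 512 and n = 48 fall through to the
// medium shader (n <= 64), and the `aligned` variant is picked by the caller
// only when k already matches that pipeline's `align`. The alignment check
// lives in the mat-mul entry points further down:
// kpad = ggml_vk_align_size(ne10, align) and
// aligned = (ne10 == kpad && ne01 > 8 && ne11 > 8).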
static void ggml_vk_matmul(
        ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline,
        vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer,
        uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
        uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
        uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3) {
    VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")");
    ggml_vk_sync_buffers(subctx);
    if (split_k == 1) {
        const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3 };
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, sizeof(vk_mat_mat_push_constants), &pc, { m, n, batch });
        return;
    }

    GGML_ASSERT(batch_stride_d == m * n);

    const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3 };
    // Make sure enough workgroups get assigned for split k to work
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, sizeof(vk_mat_mat_push_constants), &pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
    ggml_vk_sync_buffers(subctx);
    const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 });
}

static void ggml_vk_matmul_id(
        ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline,
        vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids,
        uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
        uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
        uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11) {
    VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
        "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " <<
        "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " <<
        "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")");
    ggml_vk_sync_buffers(subctx);
    const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
                                              nei0, nei1, nbi1, ne11 };
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, sizeof(vk_mat_mat_id_push_constants), &pc, { m, nei1, n_as });
}
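// Dispatch math for the split-k path above, with illustrative numbers:
// m = 100, wg_denoms[0] = 64, split_k = 2. The x extent handed to
// ggml_vk_dispatch_pipeline is (CEIL_DIV(100, 64) * 64) * 2 = 256, which the
// dispatcher divides by wg_denoms[0] again, yielding 4 workgroups: exactly the
// non-split count (2) times split_k, so each k-slice gets a full set of
// m-tiles. The follow-up reduction then runs once per output element,
// m * n * batch invocations in total.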
static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
    return
        tensor->nb[0] == ggml_type_size(tensor->type) &&
        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}

static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) {
    if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
        return ctx->device->pipeline_cpy_f32_f32;
    }
    if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
        return ctx->device->pipeline_cpy_f32_f16;
    }
    if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
        return ctx->device->pipeline_cpy_f16_f16;
    }

    std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
    GGML_ASSERT(false);
}

static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
    VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
    std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
    const int tensor_type_size = ggml_type_size(tensor->type);

    const uint32_t ne = ggml_nelements(tensor);

    const vk_op_unary_push_constants pc = {
        (uint32_t)ne,
        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size,
        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], 1, (uint32_t)tensor->ne[0], (uint32_t)(tensor->ne[0] * tensor->ne[1]), (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]),
        0,
        0.0f, 0.0f,
    };
    ggml_vk_sync_buffers(subctx);
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 });
}
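// The push-constant block above describes the copy as a generic unary op:
// first the element count, then the source shape and strides (in elements,
// hence the division by tensor_type_size), then the destination shape with
// freshly computed contiguous strides (1, ne0, ne0*ne1, ne0*ne1*ne2). As an
// illustrative example, a permuted F32 tensor with ne = {8, 4, 1, 1} and
// nb = {16, 4, 128, 128} is read with element strides {4, 1, 32, 32} and
// written with {1, 8, 32, 32}.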
static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
    GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
    GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT

    const uint64_t ne00 = src0->ne[0];
    const uint64_t ne01 = src0->ne[1];
    const uint64_t ne02 = src0->ne[2];
    const uint64_t ne03 = src0->ne[3];

    const uint64_t ne10 = src1->ne[0];
    const uint64_t ne11 = src1->ne[1];
    const uint64_t ne12 = src1->ne[2];
    const uint64_t ne13 = src1->ne[3];

    const uint64_t ne20 = dst->ne[0];
    const uint64_t ne21 = dst->ne[1];

    const uint64_t r2 = ne12 / ne02;
    const uint64_t r3 = ne13 / ne03;

    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;

    vk_buffer d_Qx;
    size_t qx_buf_offset = 0;
    vk_buffer d_Qy;
    size_t qy_buf_offset = 0;

    bool src0_uma = false;
    bool src1_uma = false;

    if (ctx->device->uma) {
        ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset);
        ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
        src0_uma = d_Qx != nullptr;
        src1_uma = d_Qy != nullptr;
    }

    const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
    const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);

    const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;

    vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type);

    const bool qx_needs_dequant = mmp == nullptr || x_non_contig;
    const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;

    if (mmp == nullptr) {
        // Fall back to dequant + f16 mulmat
        mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16);
    }

    // Not implemented
    GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT

    const int x_ne = ne01 * ne00;
    const int y_ne = ne11 * ne10;
    const int d_ne = ne11 * ne01;

    const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11));
    const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8;

    const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);

    vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned);
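    // Illustrative alignment decision: with ne10 = 4096, ne01 = 4096,
    // ne11 = 512 and a pipeline align of 32, kpad == ne10 and the aligned
    // shader variant is used. With ne10 = 4100 (kpad = 4128), or with a tiny
    // batch (ne01 <= 8 or ne11 <= 8), the generic variant runs instead,
    // presumably because the aligned kernels can skip bounds handling along k.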
    const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
    const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
    const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne;
    const uint64_t y_sz = y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
    const uint64_t d_sz = sizeof(float) * d_ne;

    vk_buffer d_D = extra->buffer_gpu.lock();
    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
    GGML_ASSERT(d_D != nullptr);
    GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
    vk_buffer d_X;
    uint64_t x_buf_offset = 0;
    vk_buffer d_Y;
    uint64_t y_buf_offset = 0;
    if (!src0_uma) {
        d_Qx = extra_src0->buffer_gpu.lock();
        qx_buf_offset = extra_src0->offset + src0->view_offs;
        GGML_ASSERT(d_Qx != nullptr);
    }
    if (!src1_uma) {
        d_Qy = extra_src1->buffer_gpu.lock();
        qy_buf_offset = extra_src1->offset + src1->view_offs;
        GGML_ASSERT(d_Qy != nullptr);
    }
    if (qx_needs_dequant) {
        d_X = ctx->prealloc_x;
        GGML_ASSERT(d_X->size >= x_sz * ne02 * ne03);
    } else {
        d_X = d_Qx;
        x_buf_offset = qx_buf_offset;
        GGML_ASSERT(qx_sz == x_sz);
    }
    if (qy_needs_dequant) {
        d_Y = ctx->prealloc_y;
        GGML_ASSERT(d_Y->size >= y_sz * ne02 * ne03);
    } else {
        d_Y = d_Qy;
        y_buf_offset = qy_buf_offset;
        GGML_ASSERT(qy_sz == y_sz);
    }

    vk_pipeline to_fp16_vk_0 = nullptr;
    vk_pipeline to_fp16_vk_1 = nullptr;

    if (x_non_contig) {
        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
    } else {
        to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
    }
    if (y_non_contig) {
        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
    } else {
        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
    }
    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT

    // Allocate descriptor sets
    ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);
    if (qx_needs_dequant) {
        ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_0, 1);
    }
    if (qy_needs_dequant) {
        ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, 1);
    }
    if (split_k > 1) {
        ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1);
    }

    if (x_non_contig) {
        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
    } else if (qx_needs_dequant) {
        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
        ggml_vk_sync_buffers(subctx);
        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
    }
    if (y_non_contig) {
        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
    }

    uint32_t stride_batch_x = ne00*ne01;
    uint32_t stride_batch_y = ne10*ne11;

    if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
        stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
    }

    if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
        stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
    }
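    // Batch broadcasting in the call below follows the usual ggml convention:
    // r2 = ne12/ne02 and r3 = ne13/ne03, so e.g. ne02 = 8 weight matrices
    // serving ne12 = 32 src1 batches give r2 = 4, and each weight matrix is
    // reused for 4 consecutive batches. The total batch count handed to the
    // shader is ne12*ne13.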
    // compute
    ggml_vk_matmul(
        ctx, subctx, pipeline,
        { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 },
        { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k },
        ne01, ne11, ne10,
        ne10, ne10, ne01, stride_batch_x, stride_batch_y, ne20*ne21,
        split_k, ne12*ne13, ne02, ne12, r2, r3
    ); // NOLINT
}
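// Condensed, illustrative view of the call above for a plain contiguous case
// with no dequantization and no split-k (m, n, k, batches are hypothetical
// shorthand for ne01, ne11, ne10, ne12*ne13): a is m x k, b is n x k, d is
// n x m per batch, the per-matrix strides equal the row length k (m for d),
// and the batch strides are full matrix sizes:
//
//     ggml_vk_matmul(ctx, subctx, pipeline,
//                    { d_X, 0, x_sz * batches }, { d_Y, 0, y_sz * batches },
//                    { d_D, 0, d_sz * batches }, { ctx->prealloc_split_k, 0, 0 },
//                    m, n, k, k, k, m, k*m, k*n, m*n,
//                    1, batches, ne02, ne12, r2, r3);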
static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
    GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
    GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT

    const uint64_t ne00 = src0->ne[0];
    const uint64_t ne01 = src0->ne[1];
    const uint64_t ne02 = src0->ne[2];
    const uint64_t ne03 = src0->ne[3];

    const uint64_t ne10 = src1->ne[0];
    const uint64_t ne11 = src1->ne[1];
    const uint64_t ne12 = src1->ne[2];
    const uint64_t ne13 = src1->ne[3];

    GGML_ASSERT(ne11 == 1);

    const uint64_t ne20 = dst->ne[0];
    const uint64_t ne21 = dst->ne[1];
    const uint64_t ne22 = dst->ne[2];
    const uint64_t ne23 = dst->ne[3];

    const uint64_t r2 = ne12 / ne02;
    const uint64_t r3 = ne13 / ne03;

    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;

    vk_buffer d_Qx;
    size_t qx_buf_offset = 0;
    vk_buffer d_Qy;
    size_t qy_buf_offset = 0;

    bool src0_uma = false;
    bool src1_uma = false;

    if (ctx->device->uma) {
        ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset);
        ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
        src0_uma = d_Qx != nullptr;
        src1_uma = d_Qy != nullptr;
    }

    const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
    const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);

    const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;

    const bool qx_needs_dequant = x_non_contig;
    const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig;

    // Not implemented
    GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT

    const uint64_t x_ne = ne01 * ne00;
    const uint64_t y_ne = ne11 * ne10;
    const uint64_t d_ne = ne11 * ne01;

    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
    const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
    const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz;
    const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
    const uint64_t d_sz = sizeof(float) * d_ne;

    vk_buffer d_D = extra->buffer_gpu.lock();
    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
    GGML_ASSERT(d_D != nullptr);
    vk_buffer d_X;
    uint64_t x_buf_offset = 0;
    vk_buffer d_Y;
    uint64_t y_buf_offset = 0;
    if (!src0_uma) {
        d_Qx = extra_src0->buffer_gpu.lock();
        qx_buf_offset = extra_src0->offset + src0->view_offs;
        GGML_ASSERT(d_Qx != nullptr);
    }
    if (!src1_uma) {
        d_Qy = extra_src1->buffer_gpu.lock();
        qy_buf_offset = extra_src1->offset + src1->view_offs;
        GGML_ASSERT(d_Qy != nullptr);
    }
    if (qx_needs_dequant) {
        d_X = ctx->prealloc_x;
    } else {
        d_X = d_Qx;
        x_buf_offset = qx_buf_offset;
        GGML_ASSERT(qx_sz == x_sz);
    }
    if (qy_needs_dequant) {
        d_Y = ctx->prealloc_y;
    } else {
        d_Y = d_Qy;
        y_buf_offset = qy_buf_offset;
        GGML_ASSERT(qy_sz == y_sz);
    }

    vk_pipeline to_fp16_vk_0 = nullptr;
    vk_pipeline to_fp16_vk_1 = nullptr;
    if (x_non_contig) {
        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
    }
    if (y_non_contig) {
        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
    } else {
        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
    }
    vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type);
    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
    GGML_ASSERT(dmmv != nullptr);
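    // Unlike the general mat-mul path, this vector path never dequantizes
    // src0 into a temporary buffer just because it is quantized:
    // qx_needs_dequant is set only for non-contiguous src0, and the
    // dequantize_mul_mat_vec pipeline selected above reads the quantized
    // blocks directly while accumulating the dot product. Only src1 may still
    // need a conversion pass, when it is neither F16 nor F32 or not
    // contiguous.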
    // Allocate descriptor sets
    if (qx_needs_dequant) {
        ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_0, 1);
    }
    if (qy_needs_dequant) {
        ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13);
    }
    ggml_pipeline_allocate_descriptor_sets(ctx, dmmv, ne12 * ne13);

    if (x_non_contig) {
        GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
    }
    if (y_non_contig) {
        GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
    }

    uint32_t stride_batch_x = ne00*ne01;
    uint32_t stride_batch_y = ne10*ne11;

    if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
        stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
    }

    if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
        stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
    }

    // compute
    const vk_mat_vec_push_constants pc = {
        (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
        stride_batch_x, stride_batch_y, (uint32_t)(ne20*ne21),
        (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
    };
    ggml_vk_sync_buffers(subctx);
    ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} }, sizeof(vk_mat_vec_push_constants), &pc, { (uint32_t)ne01, (uint32_t)(ne12 * ne13), 1});
}
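// Dispatch shape for the GEMV above: { ne01, ne12*ne13, 1 }, i.e. one
// workgroup per output row and one per batch. For an illustrative
// 4096 x 4096 weight matrix against a single token (ne11 == 1) with no
// batching, that is 4096 workgroups, each reducing one row's dot product.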
static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    const uint64_t ne00 = src0->ne[0];
    const uint64_t ne01 = src0->ne[1];
    const uint64_t ne02 = src0->ne[2];
    // const uint64_t ne03 = src0->ne[3];

    const uint64_t ne10 = src1->ne[0];
    const uint64_t ne11 = src1->ne[1];
    const uint64_t ne12 = src1->ne[2];
    // const uint64_t ne13 = src1->ne[3];

    GGML_ASSERT(ne11 == 1);

    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;

    vk_buffer d_Qy;
    size_t qy_buf_offset = 0;

    bool src1_uma = false;

    if (ctx->device->uma) {
        ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
        src1_uma = d_Qy != nullptr;
    }

    const uint64_t x_ne = ne00 * ne01 * ne02;
    const uint64_t y_ne = ne10 * ne11 * ne12;
    const uint64_t d_ne = ne01 * ne11 * ne12;

    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
    const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
    const uint64_t d_sz = sizeof(float) * d_ne;

    vk_buffer d_D = extra->buffer_gpu.lock();
    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
    GGML_ASSERT(d_D != nullptr);
    vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
    GGML_ASSERT(d_Qx != nullptr);
    if (!src1_uma) {
        d_Qy = extra_src1->buffer_gpu.lock();
        qy_buf_offset = extra_src1->offset + src1->view_offs;
        GGML_ASSERT(d_Qy != nullptr);
    }

    // Allocate descriptor sets
    ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, 1);

    const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
    const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;

    const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
    const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

    // compute
    const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
    ggml_vk_sync_buffers(subctx);
    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
}
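// Worked example of the offset split above: with
// minStorageBufferOffsetAlignment = 64 and qy_buf_offset = 100, the descriptor
// binds at qy_buffer_offset = 64 (the largest aligned offset not above 100)
// and the shader adds the remaining qy_shader_offset = 36 bytes itself, passed
// through the push constants as an element count
// (36 / ggml_type_size(src1->type)). This keeps the descriptor offset legal
// while still addressing the exact tensor start.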
static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
    GGML_ASSERT(!ggml_is_transposed(src0));
    GGML_ASSERT(!ggml_is_transposed(src1));
    GGML_ASSERT(!ggml_is_permuted(src0));
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    const uint64_t ne00 = src0->ne[0];
    const uint64_t ne01 = src0->ne[1];
    const uint64_t ne02 = src0->ne[2];
    // const uint64_t ne03 = src0->ne[3];

    const uint64_t nb01 = src0->nb[1];
    const uint64_t nb02 = src0->nb[2];

    // const uint64_t ne10 = src1->ne[0];
    const uint64_t ne11 = src1->ne[1];
    const uint64_t ne12 = src1->ne[2];
    // const uint64_t ne13 = src1->ne[3];

    GGML_ASSERT(ne11 == 1);

    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;

    vk_buffer d_Qy = nullptr;
    size_t qy_buf_offset = 0;

    bool src1_uma = false;

    if (ctx->device->uma) {
        ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
        src1_uma = d_Qy != nullptr;
    }

    const uint64_t d_ne = ne01 * ne11 * ne12;

    const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
    const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t);

    const uint64_t qx_sz = ggml_nbytes(src0);
    const uint64_t qy_sz = ggml_nbytes(src1);
    const uint64_t d_sz = sizeof(float) * d_ne;

    vk_buffer d_D = extra->buffer_gpu.lock();
    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
    GGML_ASSERT(d_D != nullptr);
    vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
    GGML_ASSERT(d_Qx != nullptr);
    if (!src1_uma) {
        d_Qy = extra_src1->buffer_gpu.lock();
        qy_buf_offset = extra_src1->offset + src1->view_offs;
        GGML_ASSERT(d_Qy != nullptr);
    }

    // Allocate descriptor sets
    ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);

    const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
    const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;

    const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
    const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

    // compute
    const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
    ggml_vk_sync_buffers(subctx);
    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
}
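// ggml_vk_mul_mat() below picks between four implementations, roughly:
//   - p021 fast path: F16 src0, both operands permuted, single token (dst ne1 == 1)
//   - nc fast path:   F16 src0, non-contiguous but not transposed src1, single token
//   - dmmv GEMV:      F32, F16, or quantized src0 with a single token
//   - general path:   everything else (full mat-mul, with dequant passes as needed)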
<< dst << ")"); 3383 if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) { 3384 ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst); 3385 } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) { 3386 ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst); 3387 } else if (dst->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) { 3388 ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst); 3389 } else { 3390 ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst); 3391 } 3392 } 3393 3394 static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { 3395 VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; 3396 std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; 3397 std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; 3398 std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); 3399 GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT 3400 GGML_ASSERT(ids->type == GGML_TYPE_I32); 3401 3402 const uint64_t ne00 = src0->ne[0]; 3403 const uint64_t ne01 = src0->ne[1]; 3404 const uint64_t ne02 = src0->ne[2]; 3405 const uint64_t ne03 = src0->ne[3]; 3406 3407 const uint64_t ne10 = src1->ne[0]; 3408 const uint64_t ne11 = src1->ne[1]; 3409 const uint64_t ne12 = src1->ne[2]; 3410 const uint64_t ne13 = src1->ne[3]; 3411 3412 const uint64_t nei0 = ids->ne[0]; 3413 const uint64_t nei1 = ids->ne[1]; 3414 GGML_ASSERT(nei0 * nei1 <= 2048); 3415 3416 const uint32_t nbi1 = ids->nb[1]; 3417 const uint32_t nbi2 = ids->nb[2]; 3418 3419 const uint64_t ne20 = dst->ne[0]; 3420 const uint64_t ne21 = dst->ne[1]; 3421 const uint64_t ne22 = dst->ne[2]; 3422 const uint64_t ne23 = dst->ne[3]; 3423 3424 const uint64_t n_as = ne02; 3425 3426 GGML_ASSERT(n_as <= 8); 3427 3428 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; 3429 ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; 3430 ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; 3431 ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra; 3432 3433 vk_buffer d_Qx; 3434 size_t qx_buf_offset = 0; 3435 vk_buffer d_Qy; 3436 size_t qy_buf_offset = 0; 3437 vk_buffer d_ids; 3438 size_t ids_buf_offset = 0; 3439 3440 bool src0_uma = false; 3441 bool 
    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
    ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra;

    vk_buffer d_Qx;
    size_t qx_buf_offset = 0;
    vk_buffer d_Qy;
    size_t qy_buf_offset = 0;
    vk_buffer d_ids;
    size_t ids_buf_offset = 0;

    bool src0_uma = false;
    bool src1_uma = false;
    bool ids_uma = false;

    if (ctx->device->uma) {
        ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset);
        ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
        ggml_vk_host_get(ctx, ids->data, d_ids, ids_buf_offset);
        src0_uma = d_Qx != nullptr;
        src1_uma = d_Qy != nullptr;
        ids_uma = d_ids != nullptr;
    }

    const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
    const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);

    const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;

    vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type);

    const bool qx_needs_dequant = mmp == nullptr || x_non_contig;
    const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;

    if (mmp == nullptr) {
        GGML_ASSERT(false);
    }

    // Not implemented
    GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT

    const uint64_t x_ne = ne01 * ne00;
    const uint64_t y_ne = ne11 * ne10;
    const uint64_t d_ne = ne21 * ne20;

    const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, nei1));
    const bool aligned = ne10 == kpad && ne01 > 8 && nei1 > 8;

    vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, nei1, aligned);

    const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
    const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
    const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne;
    const uint64_t y_sz = y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
    const uint64_t ids_sz = nbi2;
    const uint64_t d_sz = sizeof(float) * d_ne;

    vk_buffer d_D = extra->buffer_gpu.lock();
    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
    GGML_ASSERT(d_D != nullptr);
    vk_buffer d_X;
    uint64_t x_buf_offset = 0;
    vk_buffer d_Y;
    uint64_t y_buf_offset = 0;
    if (!src0_uma) {
        d_Qx = extra_src0->buffer_gpu.lock();
        qx_buf_offset = extra_src0->offset + src0->view_offs;
        GGML_ASSERT(d_Qx != nullptr);
    }
    if (!src1_uma) {
        d_Qy = extra_src1->buffer_gpu.lock();
        qy_buf_offset = extra_src1->offset + src1->view_offs;
        GGML_ASSERT(d_Qy != nullptr);
    }
    if (!ids_uma) {
        d_ids = extra_ids->buffer_gpu.lock();
        ids_buf_offset = extra_ids->offset + ids->view_offs;
        GGML_ASSERT(d_ids != nullptr);
    }
    if (qx_needs_dequant) {
        d_X = ctx->prealloc_x;
        GGML_ASSERT(d_X->size >= x_sz * ne02 * ne03);
    } else {
        d_X = d_Qx;
        x_buf_offset = qx_buf_offset;
        GGML_ASSERT(qx_sz == x_sz);
    }
    if (qy_needs_dequant) {
        d_Y = ctx->prealloc_y;
        GGML_ASSERT(d_Y->size >= y_sz * ne02 * ne03);
    } else {
        d_Y = d_Qy;
        y_buf_offset = qy_buf_offset;
        GGML_ASSERT(qy_sz == y_sz);
    }

    vk_pipeline to_fp16_vk_0 = nullptr;
    vk_pipeline to_fp16_vk_1 = nullptr;

    if (x_non_contig) {
        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
    } else {
        to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
    }
    if (y_non_contig) {
        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
    } else {
        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
    }
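    // Note that pipeline selection in the id path keys on nei1 (the number of
    // routing entries per expert lookup) rather than ne11: after routing, nei1
    // is the effective "n" of the matrix multiplication, so e.g. nei1 <= 8
    // forces the unaligned shader just as a tiny batch would in the regular
    // path.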
    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT

    // Allocate descriptor sets
    ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);
    if (qx_needs_dequant) {
        ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_0, 1);
    }
    if (qy_needs_dequant) {
        ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, 1);
    }

    if (x_non_contig) {
        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
    } else if (qx_needs_dequant) {
        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
        ggml_vk_sync_buffers(subctx);
        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
    }
    if (y_non_contig) {
        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
    }

    uint32_t stride_batch_x = ne00*ne01;
    uint32_t stride_batch_y = ne10*ne11;

    if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
        stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
    }

    if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
        stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
    }

    // compute
    ggml_vk_matmul_id(
        ctx, subctx, pipeline,
        { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 },
        { d_D, d_buf_offset, d_sz * ne22 * ne23 }, { d_ids, ids_buf_offset, ids_sz },
        ne01, ne21, ne10, ne10, ne10, ne01,
        stride_batch_x, stride_batch_y, ne20*ne21,
        n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11
    ); // NOLINT
}
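// In the ggml_vk_matmul_id() call above, nbi1 is converted from bytes to
// elements (nbi1 / ggml_type_size(ids->type)) because the shader indexes the
// I32 ids buffer per element; ne11 is passed alongside, presumably so the
// kernel can map routing entries back to rows of the original src1 batch.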
// NOLINT 3589 GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT 3590 GGML_ASSERT(ids->type == GGML_TYPE_I32); 3591 3592 const uint64_t ne00 = src0->ne[0]; 3593 const uint64_t ne01 = src0->ne[1]; 3594 const uint64_t ne02 = src0->ne[2]; 3595 const uint64_t ne03 = src0->ne[3]; 3596 3597 const uint64_t ne10 = src1->ne[0]; 3598 const uint64_t ne11 = src1->ne[1]; 3599 const uint64_t ne12 = src1->ne[2]; 3600 const uint64_t ne13 = src1->ne[3]; 3601 3602 const uint64_t nei0 = ids->ne[0]; 3603 const uint64_t nei1 = ids->ne[1]; 3604 3605 const uint64_t nbi2 = ids->nb[2]; 3606 3607 GGML_ASSERT(nei1 == 1); 3608 3609 const uint64_t ne20 = dst->ne[0]; 3610 const uint64_t ne21 = dst->ne[1]; 3611 const uint64_t ne22 = dst->ne[2]; 3612 const uint64_t ne23 = dst->ne[3]; 3613 3614 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; 3615 ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; 3616 ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; 3617 ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra; 3618 3619 vk_buffer d_Qx; 3620 size_t qx_buf_offset = 0; 3621 vk_buffer d_Qy; 3622 size_t qy_buf_offset = 0; 3623 vk_buffer d_ids; 3624 size_t ids_buf_offset = 0; 3625 3626 bool src0_uma = false; 3627 bool src1_uma = false; 3628 bool ids_uma = false; 3629 3630 if (ctx->device->uma) { 3631 ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset); 3632 ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset); 3633 ggml_vk_host_get(ctx, ids->data, d_ids, ids_buf_offset); 3634 src0_uma = d_Qx != nullptr; 3635 src1_uma = d_Qy != nullptr; 3636 ids_uma = d_ids != nullptr; 3637 } 3638 3639 const bool x_non_contig = !ggml_vk_dim01_contiguous(src0); 3640 const bool y_non_contig = !ggml_vk_dim01_contiguous(src1); 3641 3642 const bool f16_f32_kernel = src1->type == GGML_TYPE_F32; 3643 3644 const bool qx_needs_dequant = x_non_contig; 3645 const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig; 3646 3647 // Not implemented 3648 GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT 3649 3650 const uint64_t x_ne = ne01 * ne00; 3651 const uint64_t y_ne = ne11 * ne10; 3652 const uint64_t d_ne = ne21 * ne20; 3653 3654 const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment); 3655 const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); 3656 const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz; 3657 const uint64_t y_sz = f16_f32_kernel ? 
sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne; 3658 const uint64_t ids_sz = nbi2; 3659 const uint64_t d_sz = sizeof(float) * d_ne; 3660 3661 vk_buffer d_D = extra->buffer_gpu.lock(); 3662 const uint64_t d_buf_offset = extra->offset + dst->view_offs; 3663 GGML_ASSERT(d_D != nullptr); 3664 vk_buffer d_X; 3665 uint64_t x_buf_offset = 0; 3666 vk_buffer d_Y; 3667 uint64_t y_buf_offset = 0; 3668 if(!src0_uma) { 3669 d_Qx = extra_src0->buffer_gpu.lock(); 3670 qx_buf_offset = extra_src0->offset + src0->view_offs; 3671 GGML_ASSERT(d_Qx != nullptr); 3672 } 3673 if(!src1_uma) { 3674 d_Qy = extra_src1->buffer_gpu.lock(); 3675 qy_buf_offset = extra_src1->offset + src1->view_offs; 3676 GGML_ASSERT(d_Qy != nullptr); 3677 } 3678 if(!ids_uma) { 3679 d_ids = extra_ids->buffer_gpu.lock(); 3680 ids_buf_offset = extra_ids->offset + ids->view_offs; 3681 GGML_ASSERT(d_ids != nullptr); 3682 } 3683 if (qx_needs_dequant) { 3684 d_X = ctx->prealloc_x; 3685 } else { 3686 d_X = d_Qx; 3687 x_buf_offset = qx_buf_offset; 3688 GGML_ASSERT(qx_sz == x_sz); 3689 } 3690 if (qy_needs_dequant) { 3691 d_Y = ctx->prealloc_y; 3692 } else { 3693 d_Y = d_Qy; 3694 y_buf_offset = qy_buf_offset; 3695 GGML_ASSERT(qy_sz == y_sz); 3696 } 3697 3698 vk_pipeline to_fp16_vk_0 = nullptr; 3699 vk_pipeline to_fp16_vk_1 = nullptr; 3700 if (x_non_contig) { 3701 to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type); 3702 } 3703 if (y_non_contig) { 3704 to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type); 3705 } else { 3706 to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); 3707 } 3708 vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec_id(ctx, src0->type, src1->type); 3709 GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT 3710 GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT 3711 GGML_ASSERT(dmmv != nullptr); 3712 3713 // Allocate descriptor sets 3714 if (qx_needs_dequant) { 3715 ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_0, 1); 3716 } 3717 if (qy_needs_dequant) { 3718 ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, y_non_contig ? 
1 : ne12 * ne13); 3719 } 3720 ggml_pipeline_allocate_descriptor_sets(ctx, dmmv, ne12 * ne13); 3721 3722 if (x_non_contig) { 3723 GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); 3724 ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); 3725 } 3726 if (y_non_contig) { 3727 GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); 3728 ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); 3729 } 3730 3731 uint32_t stride_batch_y = ne10*ne11; 3732 3733 if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) { 3734 stride_batch_y = src1->nb[0] / ggml_type_size(src1->type); 3735 } 3736 3737 // compute 3738 const vk_mat_vec_id_push_constants pc = { 3739 (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01, 3740 (uint32_t)x_ne, stride_batch_y, (uint32_t)(ne20*ne21), 3741 (uint32_t)nei0, (uint32_t)ne11, 3742 }; 3743 ggml_vk_sync_buffers(subctx); 3744 ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, 3745 { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23}, { d_ids, ids_buf_offset, ids_sz } }, 3746 sizeof(vk_mat_vec_id_push_constants), &pc, { (uint32_t)ne01, (uint32_t)nei0, 1 }); 3747 } 3748 3749 static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { 3750 VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")"); 3751 if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) { 3752 ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst); 3753 } else { 3754 ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst); 3755 } 3756 } 3757 3758 static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { 3759 // guaranteed to be an integer due to the check in ggml_can_repeat 3760 const uint64_t ne0 = dst->ne[0]; 3761 const uint64_t ne1 = dst->ne[1]; 3762 const uint64_t ne2 = dst->ne[2]; 3763 const uint64_t ne3 = dst->ne[3]; 3764 3765 const uint64_t ne00 = src0->ne[0]; 3766 const uint64_t ne01 = src0->ne[1]; 3767 const uint64_t ne02 = src0->ne[2]; 3768 const uint64_t ne03 = src0->ne[3]; 3769 3770 const uint64_t nb0 = dst->nb[0]; 3771 const uint64_t nb1 = dst->nb[1]; 3772 const uint64_t nb2 = dst->nb[2]; 3773 const uint64_t nb3 = dst->nb[3]; 3774 3775 const uint64_t nb00 = src0->nb[0]; 3776 const uint64_t nb01 = src0->nb[1]; 3777 const uint64_t nb02 = src0->nb[2]; 3778 const uint64_t nb03 = src0->nb[3]; 3779 3780 const uint64_t nr0 = ne0/ne00; 3781 const uint64_t nr1 = ne1/ne01; 3782 const uint64_t nr2 = ne2/ne02; 3783 const uint64_t nr3 = ne3/ne03; 3784 3785 // TODO: support for transposed / permuted tensors 3786 GGML_ASSERT(nb0 == sizeof(float)); 3787 GGML_ASSERT(nb00 == sizeof(float)); 3788 3789 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; 3790 ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; 3791 3792 const vk_buffer src_buf = extra_src0->buffer_gpu.lock(); 3793 const uint64_t src_offset = extra_src0->offset + src0->view_offs; 3794 vk_buffer dst_buf = extra->buffer_gpu.lock(); 3795 const uint64_t dst_offset = 
extra->offset + dst->view_offs;

    std::vector<vk::BufferCopy> copies;

    for (uint64_t i3 = 0; i3 < nr3; i3++) {
        for (uint64_t k3 = 0; k3 < ne03; k3++) {
            for (uint64_t i2 = 0; i2 < nr2; i2++) {
                for (uint64_t k2 = 0; k2 < ne02; k2++) {
                    for (uint64_t i1 = 0; i1 < nr1; i1++) {
                        for (uint64_t k1 = 0; k1 < ne01; k1++) {
                            for (uint64_t i0 = 0; i0 < nr0; i0++) {
                                // vk::BufferCopy is { srcOffset, dstOffset, size }: read the source block
                                // at (k1, k2, k3) with the src0 strides nb01..nb03, and write it to each
                                // repeat position (i0..i3) in dst with the dst strides nb1..nb3.
                                copies.push_back({
                                    src_offset + (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01,
                                    dst_offset + (i3*ne03 + k3)*nb3  + (i2*ne02 + k2)*nb2  + (i1*ne01 + k1)*nb1  + (i0*ne00)*nb0,
                                    ne00*nb0,
                                });
                            }
                        }
                    }
                }
            }
        }
    }

    ggml_vk_sync_buffers(subctx);
    subctx->s->buffer.copyBuffer(src_buf->buffer, dst_buf->buffer, copies);

    GGML_UNUSED(ctx);
    GGML_UNUSED(src1);
}


static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
    switch (op) {
    case GGML_OP_ADD:
        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_add_f32;
        }
        return nullptr;
    case GGML_OP_GET_ROWS:
        GGML_ASSERT(src1->type == GGML_TYPE_I32);
        if (dst->type == GGML_TYPE_F16) {
            return ctx->device->pipeline_get_rows[src0->type];
        }
        if (dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_get_rows_f32[src0->type];
        }
        return nullptr;
    case GGML_OP_MUL:
        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_mul_f32;
        }
        return nullptr;
    case GGML_OP_DIV:
        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_div_f32;
        }
        return nullptr;
    case GGML_OP_SCALE:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_scale_f32;
        }
        return nullptr;
    case GGML_OP_SQR:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_sqr_f32;
        }
        return nullptr;
    case GGML_OP_CLAMP:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_clamp_f32;
        }
        return nullptr;
    case GGML_OP_CPY:
    case GGML_OP_CONT:
    case GGML_OP_DUP:
        return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type);
    case GGML_OP_NORM:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_norm_f32;
        }
        return nullptr;
    case GGML_OP_RMS_NORM:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_rms_norm_f32;
        }
        return nullptr;
    case GGML_OP_UNARY:
        switch (ggml_get_unary_op(dst)) {
        case GGML_UNARY_OP_SILU:
            if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
                return ctx->device->pipeline_silu_f32;
            }
            break;
        case GGML_UNARY_OP_GELU:
            if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
                return ctx->device->pipeline_gelu_f32;
            }
            break;
        case GGML_UNARY_OP_RELU:
            if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
                return ctx->device->pipeline_relu_f32;
            }
            break;
        default:
            break;
        }
        return nullptr;
    case GGML_OP_DIAG_MASK_INF:
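        // Only an f32 variant exists, like the element-wise ops above; a nullptr return
        // makes ggml_vk_op_f32 fall back to ggml_vk_op_get_func(), which aborts with
        // "Missing op" when no buffer-copy fallback is registered either.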
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { 3905 return ctx->device->pipeline_diag_mask_inf_f32; 3906 } 3907 return nullptr; 3908 case GGML_OP_SOFT_MAX: 3909 GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); 3910 3911 if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) { 3912 return ctx->device->pipeline_soft_max_f32; 3913 } 3914 if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { 3915 return ctx->device->pipeline_soft_max_f32_f16; 3916 } 3917 return nullptr; 3918 case GGML_OP_ROPE: 3919 { 3920 const int mode = ((const int32_t *) dst->op_params)[2]; 3921 const bool is_neox = mode & 2; 3922 3923 if (is_neox) { 3924 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { 3925 return ctx->device->pipeline_rope_neox_f32; 3926 } 3927 if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { 3928 return ctx->device->pipeline_rope_neox_f16; 3929 } 3930 } else { 3931 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { 3932 return ctx->device->pipeline_rope_norm_f32; 3933 } 3934 if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { 3935 return ctx->device->pipeline_rope_norm_f16; 3936 } 3937 } 3938 return nullptr; 3939 } 3940 case GGML_OP_ARGSORT: 3941 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_I32) { 3942 return ctx->device->pipeline_argsort_f32; 3943 } 3944 return nullptr; 3945 case GGML_OP_SUM_ROWS: 3946 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { 3947 return ctx->device->pipeline_sum_rows_f32; 3948 } 3949 return nullptr; 3950 default: 3951 return nullptr; 3952 } 3953 3954 GGML_UNUSED(src2); 3955 } 3956 3957 static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) { 3958 switch(op) { 3959 case GGML_OP_REPEAT: 3960 return ggml_vk_op_repeat; 3961 default: 3962 return nullptr; 3963 } 3964 } 3965 3966 static bool ggml_vk_op_supports_incontiguous(ggml_op op) { 3967 switch (op) { 3968 case GGML_OP_CPY: 3969 case GGML_OP_GET_ROWS: 3970 case GGML_OP_ADD: 3971 case GGML_OP_MUL: 3972 case GGML_OP_DIV: 3973 case GGML_OP_SCALE: 3974 case GGML_OP_SQR: 3975 case GGML_OP_CLAMP: 3976 return true; 3977 default: 3978 return false; 3979 } 3980 } 3981 3982 template<typename PC> 3983 static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) { 3984 VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; 3985 if (src1 != nullptr) { 3986 std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; 3987 } 3988 if (src2 != nullptr) { 3989 std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3]; 3990 } 3991 std::cerr << "), (" << 
dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")"); 3992 GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT 3993 GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT 3994 GGML_ASSERT(dst->extra != nullptr); 3995 const uint64_t ne00 = src0->ne[0]; 3996 const uint64_t ne01 = src0->ne[1]; 3997 const uint64_t ne02 = src0->ne[2]; 3998 const uint64_t ne03 = src0->ne[3]; 3999 const uint64_t ne0 = ne00 * ne01; 4000 4001 const bool use_src1 = src1 != nullptr; 4002 const uint64_t ne10 = use_src1 ? src1->ne[0] : 0; 4003 const uint64_t ne11 = use_src1 ? src1->ne[1] : 0; 4004 const uint64_t ne12 = use_src1 ? src1->ne[2] : 0; 4005 const uint64_t ne13 = use_src1 ? src1->ne[3] : 0; 4006 const uint64_t ne1 = ne10 * ne11; 4007 // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0; 4008 4009 const bool use_src2 = src2 != nullptr; 4010 const uint64_t ne20 = use_src2 ? src2->ne[0] : 0; 4011 const uint64_t ne21 = use_src2 ? src2->ne[1] : 0; 4012 const uint64_t ne22 = use_src2 ? src2->ne[2] : 0; 4013 const uint64_t ne23 = use_src2 ? src2->ne[3] : 0; 4014 const uint64_t ne2 = ne20 * ne21; 4015 4016 const uint64_t ned0 = dst->ne[0]; 4017 const uint64_t ned1 = dst->ne[1]; 4018 const uint64_t ned2 = dst->ne[2]; 4019 const uint64_t ned3 = dst->ne[3]; 4020 const uint64_t ned = ned0 * ned1; 4021 4022 vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op); 4023 ggml_vk_func_t op_func; 4024 4025 if (pipeline == nullptr) { 4026 op_func = ggml_vk_op_get_func(op); 4027 if (op_func == nullptr) { 4028 std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(op) << " for " << ggml_type_name(src0->type); 4029 if (src1 != nullptr) { 4030 std::cerr << " and " << ggml_type_name(src1->type); 4031 } 4032 std::cerr << " to " << ggml_type_name(dst->type) << std::endl; 4033 GGML_ASSERT(false); 4034 } 4035 4036 op_func(ctx, subctx, src0, src1, dst); 4037 return; 4038 } 4039 4040 const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op); 4041 4042 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; 4043 ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; 4044 ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; 4045 ggml_tensor_extra_gpu * extra_src2 = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr; 4046 4047 vk_buffer d_X = nullptr; 4048 size_t x_buf_offset = 0; 4049 vk_buffer d_Y = nullptr; 4050 size_t y_buf_offset = 0; 4051 vk_buffer d_Z = nullptr; 4052 size_t z_buf_offset = 0; 4053 4054 bool src0_uma = false; 4055 bool src1_uma = false; 4056 bool src2_uma = false; 4057 4058 if (ctx->device->uma) { 4059 ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset); 4060 src0_uma = d_X != nullptr; 4061 if (use_src1) { 4062 ggml_vk_host_get(ctx, src1->data, d_Y, y_buf_offset); 4063 src1_uma = d_Y != nullptr; 4064 } 4065 if (use_src2) { 4066 ggml_vk_host_get(ctx, src2->data, d_Z, z_buf_offset); 4067 src2_uma = d_Z != nullptr; 4068 } 4069 } 4070 4071 uint64_t x_sz = ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0; 4072 uint64_t y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 : 0; 4073 uint64_t z_sz = use_src2 ? 
ggml_type_size(src2->type) * ne2 : 0; 4074 uint64_t d_sz = ggml_type_size(dst->type) * ned; 4075 4076 vk_buffer d_D = extra->buffer_gpu.lock(); 4077 4078 // Workaround for tiny tensor inputs on ROPE 4079 if (use_src1 && y_sz > d_D->size) { 4080 y_sz = VK_WHOLE_SIZE; 4081 } 4082 4083 GGML_ASSERT(d_D != nullptr); 4084 uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; 4085 GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT 4086 if(!src0_uma) { 4087 d_X = extra_src0->buffer_gpu.lock(); 4088 x_buf_offset = extra_src0->offset + src0->view_offs; 4089 GGML_ASSERT(d_X != nullptr); 4090 } 4091 if (use_src1 && !src1_uma) { 4092 d_Y = extra_src1->buffer_gpu.lock(); 4093 y_buf_offset = extra_src1->offset + src1->view_offs; 4094 GGML_ASSERT(d_Y != nullptr); 4095 } 4096 if (use_src2 && !src2_uma) { 4097 d_Z = extra_src2->buffer_gpu.lock(); 4098 z_buf_offset = extra_src2->offset + src2->view_offs; 4099 GGML_ASSERT(d_Z != nullptr); 4100 } 4101 4102 if (op_supports_incontiguous) { 4103 x_sz = ggml_nbytes(src0); 4104 y_sz = use_src1 ? ggml_nbytes(src1) : 0; 4105 z_sz = use_src2 ? ggml_nbytes(src2) : 0; 4106 d_sz = ggml_nbytes(dst); 4107 4108 if (x_buf_offset + x_sz >= d_X->size) { 4109 x_sz = VK_WHOLE_SIZE; 4110 } 4111 if (use_src1 && y_buf_offset + y_sz >= d_Y->size) { 4112 y_sz = VK_WHOLE_SIZE; 4113 } 4114 if (use_src2 && z_buf_offset + z_sz >= d_Z->size) { 4115 z_sz = VK_WHOLE_SIZE; 4116 } 4117 if (d_buf_offset + d_sz >= d_D->size) { 4118 d_sz = VK_WHOLE_SIZE; 4119 } 4120 } 4121 4122 std::array<uint32_t, 3> elements; 4123 4124 // Single call if dimension 2 is contiguous 4125 if (op_supports_incontiguous || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) { 4126 ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1); 4127 4128 switch (dst->op) { 4129 case GGML_OP_NORM: 4130 case GGML_OP_RMS_NORM: 4131 case GGML_OP_SOFT_MAX: 4132 case GGML_OP_SUM_ROWS: 4133 elements = { (uint32_t)ggml_nrows(src0), 1, 1 }; 4134 break; 4135 case GGML_OP_DIAG_MASK_INF: 4136 case GGML_OP_ROPE: 4137 elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 }; 4138 break; 4139 case GGML_OP_GET_ROWS: 4140 elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) }; 4141 break; 4142 case GGML_OP_ARGSORT: 4143 elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 }; 4144 break; 4145 default: 4146 elements = { (uint32_t)ggml_nelements(src0), 1, 1 }; 4147 break; 4148 } 4149 4150 if (!op_supports_incontiguous) { 4151 if (x_sz != VK_WHOLE_SIZE) { 4152 x_sz *= ne02 * ne03; 4153 } 4154 if (use_src1 && y_sz != VK_WHOLE_SIZE) { 4155 y_sz *= ne12 * ne13; 4156 } 4157 if (use_src2 && z_sz != VK_WHOLE_SIZE) { 4158 z_sz *= ne22 * ne23; 4159 } 4160 if (d_sz != VK_WHOLE_SIZE) { 4161 d_sz *= ned2 * ned3; 4162 } 4163 } 4164 4165 if (op == GGML_OP_SOFT_MAX) { 4166 // Empty src1 is possible in soft_max, but the shader needs a buffer 4167 vk_subbuffer subbuf_y; 4168 if (use_src1) { 4169 subbuf_y = { d_Y, y_buf_offset, y_sz }; 4170 } else { 4171 subbuf_y = { d_X, 0, d_X->size }; 4172 } 4173 4174 ggml_vk_sync_buffers(subctx); 4175 ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); 4176 } else if (op == GGML_OP_ROPE) { 4177 // Empty src2 is possible in rope, but the shader needs a buffer 4178 vk_subbuffer subbuf_z; 4179 if (use_src2) { 4180 subbuf_z = 
{ d_Z, z_buf_offset, z_sz }; 4181 } else { 4182 subbuf_z = { d_X, 0, d_X->size }; 4183 } 4184 4185 ggml_vk_sync_buffers(subctx); 4186 ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); 4187 } else if (use_src2) { 4188 ggml_vk_sync_buffers(subctx); 4189 ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); 4190 } else if (use_src1) { 4191 ggml_vk_sync_buffers(subctx); 4192 ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); 4193 } else { 4194 ggml_vk_sync_buffers(subctx); 4195 ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); 4196 } 4197 } else { 4198 GGML_ASSERT(op != GGML_OP_SOFT_MAX); 4199 GGML_ASSERT(op != GGML_OP_ARGSORT); 4200 GGML_ASSERT(!use_src2); 4201 4202 ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03); 4203 4204 switch (dst->op) { 4205 case GGML_OP_NORM: 4206 case GGML_OP_RMS_NORM: 4207 elements = { (uint32_t)ne01, 1, 1 }; 4208 break; 4209 case GGML_OP_DIAG_MASK_INF: 4210 case GGML_OP_ROPE: 4211 elements = { (uint32_t)ne01, (uint32_t)ne00, 1 }; 4212 break; 4213 case GGML_OP_GET_ROWS: 4214 elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) }; 4215 break; 4216 default: 4217 elements = { (uint32_t)ne0, 1, 1 }; 4218 break; 4219 } 4220 4221 for (uint64_t i03 = 0; i03 < ne03; i03++) { 4222 for (uint64_t i02 = 0; i02 < ne02; i02++) { 4223 const uint32_t it_idx0 = (i03 * ne02 + i02); 4224 const uint32_t it_idx1 = use_src1 ? 
((i03 % ne13) * ne12 + (i02 % ne12)) : 0; 4225 const uint32_t x_offset = x_sz * it_idx0; 4226 const uint32_t y_offset = y_sz * it_idx1; 4227 const uint32_t d_offset = d_sz * it_idx0; 4228 4229 if (use_src1) { 4230 ggml_vk_sync_buffers(subctx); 4231 ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); 4232 } else { 4233 ggml_vk_sync_buffers(subctx); 4234 ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); 4235 } 4236 } 4237 } 4238 } 4239 } 4240 4241 static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { 4242 ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); 4243 } 4244 4245 static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { 4246 const uint32_t src0_type_size = ggml_type_size(src0->type); 4247 const uint32_t src1_type_size = ggml_type_size(src1->type); 4248 const uint32_t dst_type_size = ggml_type_size(dst->type); 4249 4250 ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, { 4251 (uint32_t)ggml_nelements(src0), 4252 (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, 4253 (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, 4254 (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 4255 0, 4256 0.0f, 0.0f, 4257 }); 4258 } 4259 4260 static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { 4261 const uint32_t src0_type_size = ggml_type_size(src0->type); 4262 const uint32_t src1_type_size = ggml_type_size(src1->type); 4263 const uint32_t dst_type_size = ggml_type_size(dst->type); 4264 4265 ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ADD, { 4266 (uint32_t)ggml_nelements(src0), 4267 (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, 4268 (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, 4269 (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) 
dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 4270 0, 4271 0.0f, 0.0f, 4272 }); 4273 } 4274 4275 static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { 4276 const uint32_t src0_type_size = ggml_type_size(src0->type); 4277 const uint32_t src1_type_size = ggml_type_size(src1->type); 4278 const uint32_t dst_type_size = ggml_type_size(dst->type); 4279 4280 ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_MUL, { 4281 (uint32_t)ggml_nelements(src0), 4282 (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, 4283 (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, 4284 (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 4285 0, 4286 0.0f, 0.0f, 4287 }); 4288 } 4289 4290 static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { 4291 const uint32_t src0_type_size = ggml_type_size(src0->type); 4292 const uint32_t src1_type_size = ggml_type_size(src1->type); 4293 const uint32_t dst_type_size = ggml_type_size(dst->type); 4294 4295 ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_DIV, { 4296 (uint32_t)ggml_nelements(src0), 4297 (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, 4298 (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, 4299 (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 4300 0, 4301 0.0f, 0.0f, 4302 }); 4303 } 4304 4305 static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { 4306 float * op_params = (float *)dst->op_params; 4307 const uint32_t src0_type_size = ggml_type_size(src0->type); 4308 const uint32_t dst_type_size = ggml_type_size(dst->type); 4309 4310 ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, { 4311 (uint32_t)ggml_nelements(src0), 4312 (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, 4313 (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / 
dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 4314 0, 4315 op_params[0], 0.0f 4316 }); 4317 } 4318 4319 static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { 4320 const uint32_t src0_type_size = ggml_type_size(src0->type); 4321 const uint32_t dst_type_size = ggml_type_size(dst->type); 4322 4323 ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, { 4324 (uint32_t)ggml_nelements(src0), 4325 (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, 4326 (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 4327 0, 4328 0.0f, 0.0f, 4329 }); 4330 } 4331 4332 static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { 4333 float * op_params = (float *)dst->op_params; 4334 const uint32_t src0_type_size = ggml_type_size(src0->type); 4335 const uint32_t dst_type_size = ggml_type_size(dst->type); 4336 4337 ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, { 4338 (uint32_t)ggml_nelements(src0), 4339 (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, 4340 (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 4341 0, 4342 op_params[0], op_params[1], 4343 }); 4344 } 4345 4346 static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { 4347 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; 4348 const uint32_t src0_type_size = ggml_type_size(src0->type); 4349 const uint32_t dst_type_size = ggml_type_size(dst->type); 4350 const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; 4351 4352 ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, { 4353 (uint32_t)ggml_nelements(src0), 4354 (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, 4355 (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 4356 d_offset, 4357 0.0f, 0.0f, 4358 }); 4359 } 4360 4361 static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { 4362 float * op_params = (float *)dst->op_params; 4363 4364 
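    // Commentary (inferred from the generic f32 path): the push constants follow the
    // { KX, KY, param1, param2 } layout, so ne00/ne01 give the row size and row count,
    // and op_params[0] carries the epsilon that ggml stores for GGML_OP_NORM.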
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); 4365 } 4366 4367 static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { 4368 float * op_params = (float *)dst->op_params; 4369 ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); 4370 } 4371 4372 static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { 4373 ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); 4374 } 4375 4376 static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { 4377 int32_t * op_params = (int32_t *)dst->op_params; 4378 ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }); 4379 } 4380 4381 static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { 4382 float * op_params = (float *)dst->op_params; 4383 4384 float scale = op_params[0]; 4385 float max_bias = op_params[1]; 4386 4387 const uint32_t ncols = (uint32_t)src0->ne[0]; 4388 const uint32_t nrows_x = (uint32_t)ggml_nrows(src0); 4389 const uint32_t nrows_y = (uint32_t)src0->ne[1]; 4390 4391 const uint32_t n_head_kv = nrows_x/nrows_y; 4392 const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); 4393 4394 const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); 4395 const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); 4396 4397 ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, { 4398 ncols, 4399 src1 != nullptr ? 
nrows_y : (uint32_t)0, 4400 scale, max_bias, 4401 m0, m1, 4402 n_head_log2, 4403 }); 4404 } 4405 4406 static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { 4407 const int n_dims = ((int32_t *) dst->op_params)[1]; 4408 // const int mode = ((int32_t *) dst->op_params)[2]; 4409 // const int n_ctx = ((int32_t *) dst->op_params)[3]; 4410 const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; 4411 const float freq_base = ((float *) dst->op_params)[5]; 4412 const float freq_scale = ((float *) dst->op_params)[6]; 4413 const float ext_factor = ((float *) dst->op_params)[7]; 4414 const float attn_factor = ((float *) dst->op_params)[8]; 4415 const float beta_fast = ((float *) dst->op_params)[9]; 4416 const float beta_slow = ((float *) dst->op_params)[10]; 4417 4418 float corr_dims[2]; 4419 ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); 4420 4421 const float theta_scale = powf(freq_base, -2.0f/n_dims); 4422 4423 ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, { 4424 (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], 4425 freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale, 4426 src2 != nullptr, 4427 }); 4428 } 4429 4430 static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { 4431 int32_t * op_params = (int32_t *)dst->op_params; 4432 4433 uint32_t ncols = src0->ne[0]; 4434 4435 uint32_t ncols_pad = 1; 4436 while (ncols_pad < ncols) { 4437 ncols_pad *= 2; 4438 } 4439 4440 GGML_ASSERT(ncols_pad <= 1024); 4441 4442 ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { 4443 ncols, 4444 ncols_pad, 4445 op_params[0], 4446 }); 4447 } 4448 4449 static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { 4450 ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f }); 4451 } 4452 4453 #ifdef GGML_VULKAN_RUN_TESTS 4454 static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) { 4455 if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) { 4456 return; 4457 } 4458 i0 = std::max(i0, 5); 4459 i1 = std::max(i1, 5); 4460 i2 = std::max(i2, 0); 4461 fprintf(stderr, " "); 4462 for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { 4463 fprintf(stderr, "%7d ", idx1); 4464 } 4465 fprintf(stderr, "\n"); 4466 for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) { 4467 fprintf(stderr, "%7d: ", idx0); 4468 for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { 4469 if (idx0 >= 0 && idx0 < ne0 && idx1 >= 0 && idx1 < ne1) { 4470 float val; 4471 if (type == GGML_TYPE_F32) { 4472 val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0); 4473 } else if (type == GGML_TYPE_F16) { 4474 val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0)); 4475 } else { 4476 GGML_ASSERT(false); 4477 } 4478 fprintf(stderr, "% 7.2f ", val); 4479 } else { 4480 fprintf(stderr, " "); 4481 } 4482 } 4483 fprintf(stderr, "\n"); 4484 } 4485 } 4486 4487 template <typename X_TYPE, typename Y_TYPE> 4488 static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) { 4489 
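    // Standalone correctness/benchmark harness: shader_size picks the tile variant
    // (0 = S, 1 = M, 2 = L), starting from the "aligned" pipelines; if k is not a
    // multiple of the chosen pipeline's alignment, the unaligned variant is swapped in
    // below. Y is filled with an identity-like pattern (the random init is left
    // commented out), which makes mismatches easy to localize in the printed areas.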
VK_LOG_DEBUG("ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")"); 4490 const size_t x_ne = m * k * batch; 4491 const size_t y_ne = k * n * batch; 4492 const size_t d_ne = m * n * batch; 4493 4494 vk_pipeline p; 4495 std::string shname; 4496 if (shader_size == 0) { 4497 if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) { 4498 p = ctx->device->pipeline_matmul_f32->a_s; 4499 shname = "F32_ALIGNED_S"; 4500 } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) { 4501 p = ctx->device->pipeline_matmul_f32_f16->a_s; 4502 shname = "F32_F16_ALIGNED_S"; 4503 } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) { 4504 p = ctx->device->pipeline_matmul_f16_f32->a_s; 4505 shname = "F16_F32_ALIGNED_S"; 4506 } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) { 4507 p = ctx->device->pipeline_matmul_f16->a_s; 4508 shname = "F16_ALIGNED_S"; 4509 } else { 4510 GGML_ASSERT(false); 4511 } 4512 } else if (shader_size == 1) { 4513 if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) { 4514 p = ctx->device->pipeline_matmul_f32->a_m; 4515 shname = "F32_ALIGNED_M"; 4516 } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) { 4517 p = ctx->device->pipeline_matmul_f32_f16->a_m; 4518 shname = "F32_F16_ALIGNED_M"; 4519 } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) { 4520 p = ctx->device->pipeline_matmul_f16_f32->a_m; 4521 shname = "F16_F32_ALIGNED_M"; 4522 } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) { 4523 p = ctx->device->pipeline_matmul_f16->a_m; 4524 shname = "F16_ALIGNED_M"; 4525 } else { 4526 GGML_ASSERT(false); 4527 } 4528 } else if (shader_size == 2) { 4529 if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) { 4530 p = ctx->device->pipeline_matmul_f32->a_l; 4531 shname = "F32_ALIGNED_L"; 4532 } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) { 4533 p = ctx->device->pipeline_matmul_f32_f16->a_l; 4534 shname = "F32_F16_ALIGNED_L"; 4535 } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) { 4536 p = ctx->device->pipeline_matmul_f16_f32->a_l; 4537 shname = "F16_F32_ALIGNED_L"; 4538 } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) { 4539 p = ctx->device->pipeline_matmul_f16->a_l; 4540 shname = "F16_ALIGNED_L"; 4541 } else { 4542 GGML_ASSERT(false); 4543 } 4544 } else { 4545 GGML_ASSERT(0); 4546 } 4547 4548 const size_t kpad = ggml_vk_align_size(k, p->align); 4549 4550 if (k != kpad) { 4551 if (shader_size == 0) { 4552 if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) { 4553 p = ctx->device->pipeline_matmul_f32->s; 4554 shname = "F32_S"; 4555 } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) { 4556 p = ctx->device->pipeline_matmul_f32_f16->s; 4557 shname = "F32_F16_S"; 4558 } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) { 4559 p = ctx->device->pipeline_matmul_f16_f32->s; 4560 shname = "F16_F32_S"; 4561 } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) { 4562 p = ctx->device->pipeline_matmul_f16->s; 4563 shname = "F16_S"; 4564 } 4565 } else if (shader_size == 1) { 4566 if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) { 4567 p = ctx->device->pipeline_matmul_f32->m; 
4568 shname = "F32_M"; 4569 } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) { 4570 p = ctx->device->pipeline_matmul_f32_f16->m; 4571 shname = "F32_F16_M"; 4572 } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) { 4573 p = ctx->device->pipeline_matmul_f16_f32->m; 4574 shname = "F16_F32_M"; 4575 } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) { 4576 p = ctx->device->pipeline_matmul_f16->m; 4577 shname = "F16_M"; 4578 } 4579 } else if (shader_size == 2) { 4580 if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) { 4581 p = ctx->device->pipeline_matmul_f32->l; 4582 shname = "F32_L"; 4583 } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) { 4584 p = ctx->device->pipeline_matmul_f32_f16->l; 4585 shname = "F32_F16_L"; 4586 } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) { 4587 p = ctx->device->pipeline_matmul_f16_f32->l; 4588 shname = "F16_F32_L"; 4589 } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) { 4590 p = ctx->device->pipeline_matmul_f16->l; 4591 shname = "F16_L"; 4592 } 4593 } 4594 } 4595 4596 ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it); 4597 if (split_k > 1) { 4598 ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it); 4599 4600 if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { 4601 // Resize buffer 4602 if (ctx->prealloc_split_k != nullptr) { 4603 ggml_vk_destroy_buffer(ctx->prealloc_split_k); 4604 } 4605 ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal); 4606 } 4607 } 4608 4609 vk_buffer d_X = ggml_vk_create_buffer_check(ctx, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); 4610 vk_buffer d_Y = ggml_vk_create_buffer_check(ctx, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); 4611 vk_buffer d_D = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); 4612 4613 X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne); 4614 Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne); 4615 float* d = (float *) malloc(sizeof(float) * d_ne); 4616 4617 for (size_t i = 0; i < x_ne; i++) { 4618 if (std::is_same<float, X_TYPE>()) { 4619 x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; 4620 } else if (std::is_same<ggml_fp16_t, X_TYPE>()) { 4621 x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); 4622 } else { 4623 GGML_ASSERT(false); 4624 } 4625 } 4626 for (size_t i = 0; i < y_ne; i++) { 4627 if (std::is_same<float, Y_TYPE>()) { 4628 // y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; 4629 y[i] = (i % k == i / k) ? 1.0f : 0.0f; 4630 } else if (std::is_same<ggml_fp16_t, Y_TYPE>()) { 4631 // y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); 4632 y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 
1.0f : 0.0f); 4633 } else { 4634 GGML_ASSERT(false); 4635 } 4636 } 4637 4638 ggml_vk_buffer_write(ctx, d_X, 0, x, sizeof(X_TYPE) * k * m * batch); 4639 ggml_vk_buffer_write(ctx, d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch); 4640 4641 vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); 4642 for (size_t i = 0; i < num_it; i++) { 4643 ggml_vk_ctx_begin(ctx, subctx); 4644 ggml_vk_matmul( 4645 ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k), 4646 m, n, k, 4647 k, k, m, k*m, k*n, m*n, 4648 split_k, batch, batch, batch, 1, 1 4649 ); 4650 ggml_vk_ctx_end(subctx); 4651 } 4652 4653 auto begin = std::chrono::high_resolution_clock::now(); 4654 ggml_vk_submit(subctx, ctx->fence); 4655 VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences"); 4656 ctx->device->device.resetFences({ ctx->fence }); 4657 4658 auto end = std::chrono::high_resolution_clock::now(); 4659 double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0; 4660 4661 // copy dst to host 4662 ggml_vk_buffer_read(ctx, d_D, 0, d, sizeof(float) * d_ne); 4663 4664 float * d_chk = (float *) malloc(sizeof(float) * d_ne); 4665 4666 ggml_init_params iparams = { 4667 /*.mem_size =*/ 1024*1024*1024, 4668 /*.mem_buffer =*/ NULL, 4669 /*.no_alloc =*/ true, 4670 }; 4671 4672 ggml_context * ggml_ctx = ggml_init(iparams); 4673 4674 ggml_type src0_type; 4675 ggml_type src1_type; 4676 4677 if (std::is_same<float, X_TYPE>()) { 4678 src0_type = GGML_TYPE_F32; 4679 } else if (std::is_same<ggml_fp16_t, X_TYPE>()) { 4680 src0_type = GGML_TYPE_F16; 4681 } else { 4682 GGML_ASSERT(false); 4683 } 4684 if (std::is_same<float, Y_TYPE>()) { 4685 src1_type = GGML_TYPE_F32; 4686 } else if (std::is_same<ggml_fp16_t, Y_TYPE>()) { 4687 src1_type = GGML_TYPE_F16; 4688 } else { 4689 GGML_ASSERT(false); 4690 } 4691 4692 ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch); 4693 ggml_tensor * src1_ggml = ggml_new_tensor_3d(ggml_ctx, src1_type, k, n, batch); 4694 ggml_tensor * tensor_ggml = ggml_mul_mat(ggml_ctx, src0_ggml, src1_ggml); 4695 4696 src0_ggml->data = x; 4697 src1_ggml->data = y; 4698 tensor_ggml->data = d_chk; 4699 4700 ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); 4701 ggml_build_forward_expand(cgraph, tensor_ggml); 4702 4703 ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1); 4704 4705 ggml_free(ggml_ctx); 4706 4707 double avg_err = 0.0; 4708 int first_err_n = -1; 4709 int first_err_m = -1; 4710 int first_err_b = -1; 4711 4712 for (size_t i = 0; i < m*n*batch; i++) { 4713 double err = std::fabs(d[i] - d_chk[i]); 4714 avg_err += err; 4715 4716 if (err > 0.05f && first_err_n == -1) { 4717 first_err_b = i / (m * n); 4718 first_err_n = (i % (m * n)) / m; 4719 first_err_m = (i % (m * n)) % m; 4720 } 4721 } 4722 4723 avg_err /= m * n; 4724 4725 std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms avg_err=" << avg_err << std::endl; 4726 4727 if (avg_err > 0.1) { 4728 std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl; 4729 std::cerr << "Actual result: " << std::endl << std::endl; 4730 ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); 4731 std::cerr << std::endl; 4732 ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n + 15, 
first_err_b); 4733 std::cerr << "Expected result: " << std::endl << std::endl; 4734 ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); 4735 4736 if (split_k > 1) { 4737 float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k); 4738 ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k); 4739 4740 std::cerr << "d_buf0: " << std::endl << std::endl; 4741 ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); 4742 4743 std::cerr << "d_buf1: " << std::endl << std::endl; 4744 ggml_vk_print_matrix_area(split_k_buf + d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); 4745 4746 std::cerr << "d_buf2: " << std::endl << std::endl; 4747 ggml_vk_print_matrix_area(split_k_buf + 2 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); 4748 4749 std::cerr << "d_buf3: " << std::endl << std::endl; 4750 ggml_vk_print_matrix_area(split_k_buf + 3 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); 4751 4752 free(split_k_buf); 4753 } 4754 } 4755 4756 free(d_chk); 4757 4758 ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue); 4759 ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue); 4760 4761 ggml_vk_destroy_buffer(d_X); 4762 ggml_vk_destroy_buffer(d_Y); 4763 ggml_vk_destroy_buffer(d_D); 4764 4765 ggml_pipeline_cleanup(p); 4766 ggml_pipeline_cleanup(ctx->device->pipeline_matmul_split_k_reduce); 4767 4768 free(x); 4769 free(y); 4770 free(d); 4771 } 4772 4773 static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1, int i2, int i3) { 4774 if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) { 4775 return; 4776 } 4777 i0 = std::max(i0, 5); 4778 i1 = std::max(i1, 5); 4779 i2 = std::max(i2, 0); 4780 i3 = std::max(i3, 0); 4781 fprintf(stderr, " "); 4782 for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { 4783 fprintf(stderr, "%7d ", idx1); 4784 } 4785 fprintf(stderr, "\n"); 4786 for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) { 4787 fprintf(stderr, "%7d: ", idx0); 4788 for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { 4789 if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) { 4790 float val; 4791 if (tensor->type == GGML_TYPE_F32) { 4792 val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]); 4793 } else if (tensor->type == GGML_TYPE_F16) { 4794 val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0])); 4795 } else { 4796 GGML_ASSERT(false); 4797 } 4798 fprintf(stderr, "% 7.2f ", val); 4799 } else { 4800 fprintf(stderr, " "); 4801 } 4802 } 4803 fprintf(stderr, "\n"); 4804 } 4805 } 4806 4807 static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_t ne1, size_t ne2, size_t ne3) { 4808 const size_t ne = ne0 * ne1 * ne2 * ne3; 4809 4810 ggml_init_params iparams = { 4811 /*.mem_size =*/ 1024*1024*1024, 4812 /*.mem_buffer =*/ NULL, 4813 /*.no_alloc =*/ true, 4814 }; 4815 4816 ggml_context * ggml_ctx = ggml_init(iparams); 4817 4818 ggml_tensor * tensor = ggml_new_tensor_4d(ggml_ctx, GGML_TYPE_F32, ne0, ne2, ne1, ne3); // NOLINT 4819 ggml_tensor * result_tensor = ggml_new_tensor_4d(ggml_ctx, GGML_TYPE_F32, ne0, ne1, ne2, ne3); 4820 4821 float * data = (float *) ggml_vk_host_malloc(ctx, ggml_nbytes(tensor)); 4822 tensor->data = data; 4823 
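    // "nc" = non-contiguous: the tensor is allocated as (ne0, ne2, ne1, ne3) and the
    // block below swaps nb[1]/nb[2] and restores the logical ne order, emulating a
    // permuted view. The test then checks that ggml_vk_h2d_tensor_2d still uploads it
    // row by row into a contiguous device buffer.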
4824 float * result_data = (float *) malloc(ggml_nbytes(tensor)); 4825 result_tensor->data = result_data; 4826 4827 // Permute 4828 { 4829 size_t tmp = tensor->nb[2]; 4830 tensor->nb[2] = tensor->nb[1]; 4831 tensor->nb[1] = tmp; 4832 4833 tensor->ne[2] = ne2; 4834 tensor->ne[1] = ne1; 4835 } 4836 4837 for (size_t i = 0; i < ne; i++) { 4838 data[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; 4839 } 4840 4841 vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); 4842 ggml_vk_ctx_begin(ctx, subctx); 4843 4844 vk_buffer buffer = ggml_vk_create_buffer_check(ctx, ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal); 4845 4846 ggml_vk_h2d_tensor_2d(ctx, subctx, buffer, 0, tensor, 0, 0, ggml_nrows(tensor)); 4847 4848 ggml_vk_ctx_end(subctx); 4849 ggml_vk_submit(subctx, ctx->fence); 4850 VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_h2d_nc waitForFences"); 4851 ctx->device->device.resetFences({ ctx->fence }); 4852 4853 ggml_vk_buffer_read(ctx, buffer, 0, result_data, ggml_nbytes(tensor)); 4854 4855 double avg_err = 0.0; 4856 int first_err_i0 = -1; 4857 int first_err_i1 = -1; 4858 int first_err_i2 = -1; 4859 int first_err_i3 = -1; 4860 4861 for (size_t i3 = 0; i3 < ne3; i3++) { 4862 for (size_t i2 = 0; i2 < ne2; i2++) { 4863 for (size_t i1 = 0; i1 < ne1; i1++) { 4864 for (size_t i0 = 0; i0 < ne0; i0++) { 4865 float correct = *(float *) ((char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]); 4866 float result = *(float *) ((char *) result_data + i3*ne2*ne1*ne0*sizeof(float) + i2*ne1*ne0*sizeof(float) + i1*ne0*sizeof(float) + i0*sizeof(float)); 4867 double err = std::fabs(result - correct); 4868 4869 avg_err += err; 4870 4871 if (err > 0.05f && first_err_i0 == -1) { 4872 first_err_i0 = i0; 4873 first_err_i1 = i1; 4874 first_err_i2 = i2; 4875 first_err_i3 = i3; 4876 } 4877 } 4878 } 4879 } 4880 } 4881 4882 avg_err /= ne; 4883 4884 std::cerr << "TEST nc copy ne0=" << ne0 << " ne1=" << ne1 << " ne2=" << ne2 << " ne3=" << ne3 << " avg_err=" << avg_err << std::endl; 4885 4886 if (avg_err > 0.1) { 4887 std::cerr << "i0 = " << first_err_i0 << " i1 = " << first_err_i1 << " i2 = " << first_err_i2 << " i3 = " << first_err_i3 << std::endl; 4888 std::cerr << "Actual result: " << std::endl << std::endl; 4889 ggml_vk_print_tensor_area(result_tensor, first_err_i0, first_err_i1, first_err_i2, first_err_i3); 4890 std::cerr << "Expected result: " << std::endl << std::endl; 4891 ggml_vk_print_tensor_area(tensor, first_err_i0, first_err_i1, first_err_i2, first_err_i3); 4892 } 4893 4894 ggml_free(ggml_ctx); 4895 4896 ggml_vk_destroy_buffer(buffer); 4897 4898 ggml_vk_host_free(ctx, data); 4899 free(result_data); 4900 } 4901 4902 static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) { 4903 VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ")"); 4904 // Check transfers are correct 4905 vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal); 4906 4907 float * x; 4908 float * y; 4909 if (pinned) { 4910 x = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne); 4911 y = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne); 4912 } else { 4913 x = (float *) malloc(sizeof(float) * ne); 4914 y = (float *) malloc(sizeof(float) * ne); 4915 } 4916 4917 for (size_t i = 0; i < ne; i++) { 4918 x[i] = rand() / (float)RAND_MAX; 4919 } 4920 4921 vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); 4922 
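    // The to_gpu timing below deliberately includes any pending host-side staging
    // copies that ggml_vk_buffer_write_async queues in subctx->in_memcpys (used when
    // the source pointer is not pinned); they are flushed manually before submit so
    // the measured interval covers the full transfer.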
ggml_vk_ctx_begin(ctx, subctx); 4923 4924 auto begin = std::chrono::high_resolution_clock::now(); 4925 4926 ggml_vk_buffer_write_async(ctx, subctx, buffer, 0, x, sizeof(float) * ne); 4927 4928 for (auto& cpy : subctx->in_memcpys) { 4929 memcpy(cpy.dst, cpy.src, cpy.n); 4930 } 4931 subctx->in_memcpys.clear(); 4932 4933 ggml_vk_ctx_end(subctx); 4934 ggml_vk_submit(subctx, ctx->fence); 4935 VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences"); 4936 ctx->device->device.resetFences({ ctx->fence }); 4937 4938 auto end = std::chrono::high_resolution_clock::now(); 4939 4940 double ms_to_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0; 4941 4942 ggml_vk_ctx_begin(ctx, subctx); 4943 4944 begin = std::chrono::high_resolution_clock::now(); 4945 4946 ggml_vk_buffer_read_async(ctx, subctx, buffer, 0, y, sizeof(float) * ne); 4947 4948 ggml_vk_ctx_end(subctx); 4949 ggml_vk_submit(subctx, ctx->fence); 4950 VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences"); 4951 ctx->device->device.resetFences({ ctx->fence }); 4952 4953 for (auto& cpy : subctx->out_memcpys) { 4954 memcpy(cpy.dst, cpy.src, cpy.n); 4955 } 4956 subctx->out_memcpys.clear(); 4957 4958 end = std::chrono::high_resolution_clock::now(); 4959 4960 double ms_from_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0; 4961 4962 double avg_err = 0.0; 4963 for (size_t i = 0; i < ne; i++) { 4964 avg_err += std::fabs(x[i] - y[i]); 4965 } 4966 4967 double kb = ne * sizeof(float) / 1024.0; 4968 4969 std::cerr << "TEST TRANSFER " << kb << " KB to_gpu " << ms_to_gpu << "ms (" << kb / ms_to_gpu * 1000.0 / 1024.0 << " MB/s) from_gpu " << ms_from_gpu << "ms (" << kb / ms_from_gpu * 1000.0 / 1024.0 << " MB/s) avg_err=" << avg_err / ne << std::endl; 4970 4971 ggml_vk_destroy_buffer(buffer); 4972 4973 if (pinned) { 4974 ggml_vk_host_free(ctx, x); 4975 ggml_vk_host_free(ctx, y); 4976 } else { 4977 free(x); 4978 free(y); 4979 } 4980 } 4981 4982 static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) { 4983 ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr); 4984 } 4985 4986 static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) { 4987 VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")"); 4988 const size_t x_sz = sizeof(float) * ne; 4989 const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne; 4990 const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant); 4991 float * x = (float *) malloc(x_sz); 4992 void * qx = malloc(qx_sz); 4993 vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal); 4994 vk_buffer x_buf = ggml_vk_create_buffer_check(ctx, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal); 4995 ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16); 4996 4997 for (size_t i = 0; i < ne; i++) { 4998 x[i] = rand() / (float)RAND_MAX; 4999 } 5000 5001 vk_pipeline p = ctx->device->pipeline_dequant[quant]; 5002 5003 ggml_vk_quantize_data(x, qx, ne, quant); 5004 5005 ggml_pipeline_allocate_descriptor_sets(ctx, p, 1); 5006 5007 ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz); 5008 5009 vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); 5010 ggml_vk_ctx_begin(ctx, subctx); 5011 const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne }; 5012 ggml_vk_dispatch_pipeline(ctx, subctx, p, { 
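// Bindings for the dequant dispatch: binding 0 is the quantized input,
// binding 1 the fp16 output. The push constants built above carry the
// element counts, and { ne, 1, 1 } is the dispatch grid.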
{ qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); 5013 ggml_vk_ctx_end(subctx); 5014 5015 auto begin = std::chrono::high_resolution_clock::now(); 5016 5017 ggml_vk_submit(subctx, ctx->fence); 5018 VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences"); 5019 ctx->device->device.resetFences({ ctx->fence }); 5020 5021 auto end = std::chrono::high_resolution_clock::now(); 5022 5023 double ms_dequant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0; 5024 ggml_vk_buffer_read(ctx, x_buf, 0, x_chk, x_sz_f16); 5025 5026 int first_err = -1; 5027 5028 double avg_err = 0.0; 5029 for (size_t i = 0; i < ne; i++) { 5030 double error = std::fabs(x[i] - ggml_fp16_to_fp32(x_chk[i])); 5031 avg_err += error; 5032 5033 if (first_err < 0 && error > 0.05) { 5034 first_err = i; 5035 } 5036 } 5037 5038 avg_err /= ne; 5039 5040 std::cerr << "TEST DEQUANT " << ggml_type_name(quant) << " time=" << ms_dequant << "ms avg_err=" << avg_err << std::endl; 5041 5042 if (avg_err > 0.1) { 5043 std::cerr << "first_error = " << first_err << std::endl; 5044 std::cerr << "Actual result: " << std::endl << std::endl; 5045 for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) { 5046 std::cerr << ggml_fp16_to_fp32(x_chk[i]) << ", "; 5047 } 5048 std::cerr << std::endl << "Expected result: " << std::endl << std::endl; 5049 for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) { 5050 std::cerr << x[i] << ", "; 5051 } 5052 std::cerr << std::endl; 5053 } 5054 5055 ggml_vk_destroy_buffer(x_buf); 5056 ggml_vk_destroy_buffer(qx_buf); 5057 5058 free(x); 5059 free(qx); 5060 free(x_chk); 5061 } 5062 5063 static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant) { 5064 VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")"); 5065 const size_t x_ne = m * k * batch; 5066 const size_t y_ne = k * n * batch; 5067 const size_t d_ne = m * n * batch; 5068 5069 vk_pipeline p; 5070 std::string shname; 5071 if (shader_size == 0) { 5072 p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->a_s; 5073 shname = std::string(ggml_type_name(quant)) + "_ALIGNED_S"; 5074 } else if (shader_size == 1) { 5075 p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->a_m; 5076 shname = std::string(ggml_type_name(quant)) + "_ALIGNED_M"; 5077 } else if (shader_size == 2) { 5078 p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->a_l; 5079 shname = std::string(ggml_type_name(quant)) + "_ALIGNED_L"; 5080 } else { 5081 GGML_ASSERT(0); 5082 } 5083 5084 const size_t kpad = ggml_vk_align_size(k, p->align); 5085 5086 if (k != kpad) { 5087 if (shader_size == 0) { 5088 p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->s; 5089 shname = std::string(ggml_type_name(quant)) + "_S"; 5090 } else if (shader_size == 1) { 5091 p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->m; 5092 shname = std::string(ggml_type_name(quant)) + "_M"; 5093 } else if (shader_size == 2) { 5094 p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->l; 5095 shname = std::string(ggml_type_name(quant)) + "_L"; 5096 } else { 5097 GGML_ASSERT(0); 5098 } 5099 } 5100 5101 const size_t x_sz = sizeof(float) * x_ne; 5102 const size_t y_sz = sizeof(float) * y_ne; 5103 
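// Quantized buffer size: the elements are stored in blocks of
// ggml_blck_size(quant) values taking ggml_type_size(quant) bytes each.
// E.g. for GGML_TYPE_Q4_0 a block holds 32 values in 18 bytes (an fp16
// scale plus 16 bytes of packed 4-bit weights), so qx_sz = x_ne / 32 * 18.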
const size_t qx_sz = x_ne * ggml_type_size(quant)/ggml_blck_size(quant);
5104 const size_t d_sz = sizeof(float) * d_ne;
5105 float * x = (float *) malloc(x_sz);
5106 float * y = (float *) malloc(y_sz);
5107 void * qx = malloc(qx_sz);
5108 vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
5109 vk_buffer y_buf = ggml_vk_create_buffer_check(ctx, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
5110 vk_buffer d_buf = ggml_vk_create_buffer_check(ctx, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
5111 float * d = (float *) malloc(d_sz);
5112 float * d_chk = (float *) malloc(d_sz);
5113
5114 for (size_t i = 0; i < x_ne; i++) {
5115 x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
5116 }
5117
5118 ggml_vk_quantize_data(x, qx, x_ne, quant);
5119
5120 for (size_t i = 0; i < y_ne; i++) {
5121 // y[i] = rand() / (float)RAND_MAX;
// Use the identity matrix instead of random data, so each output element
// depends on exactly one dequantized x value and mismatches are easy to localize.
5122 y[i] = (i % k == i / k) ? 1.0f : 0.0f;
5123 }
5124
5125 ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
5126 if (split_k > 1) {
5127 ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
5128
5129 if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
5130 // Resize buffer
5131 if (ctx->prealloc_split_k != nullptr) {
5132 ggml_vk_destroy_buffer(ctx->prealloc_split_k);
5133 }
5134 ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
5135 }
5136 }
5137
5138 ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz);
5139 ggml_vk_buffer_write(ctx, y_buf, 0, y, y_sz);
5140
5141 vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
5142 for (size_t i = 0; i < num_it; i++) {
5143 ggml_vk_ctx_begin(ctx, subctx);
5144 ggml_vk_matmul(
5145 ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
5146 m, n, k,
5147 k, k, m, k*m, k*n, m*n,
5148 split_k, batch, batch, batch, 1, 1
5149 );
5150 ggml_vk_ctx_end(subctx);
5151 }
5152
5153 auto begin = std::chrono::high_resolution_clock::now();
5154
5155 ggml_vk_submit(subctx, ctx->fence);
5156 VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant_matmul waitForFences");
5157 ctx->device->device.resetFences({ ctx->fence });
5158
5159 auto end = std::chrono::high_resolution_clock::now();
5160
5161 double time_ms = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
5162 ggml_vk_buffer_read(ctx, d_buf, 0, d, d_sz);
5163
5164 ggml_init_params iparams = {
5165 /*.mem_size =*/ 1024*1024*1024,
5166 /*.mem_buffer =*/ NULL,
5167 /*.no_alloc =*/ true,
5168 };
5169
5170 ggml_context * ggml_ctx = ggml_init(iparams);
5171
5172 ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, quant, k, m, batch);
5173 ggml_tensor * src1_ggml = ggml_new_tensor_3d(ggml_ctx, GGML_TYPE_F32, k, n, batch);
5174 ggml_tensor * tensor_ggml = ggml_mul_mat(ggml_ctx, src0_ggml, src1_ggml);
5175
5176 src0_ggml->data = qx;
5177 src1_ggml->data = y;
5178 tensor_ggml->data = d_chk;
5179
5180 ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
5181 ggml_build_forward_expand(cgraph, tensor_ggml);
5182
5183 ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
5184
5185 ggml_free(ggml_ctx);
5186
5187 double avg_err = 0.0;
5188 int first_err_n = -1;
5189 int first_err_m = -1;
5190 int first_err_b = -1;
5191
5192 for (size_t i = 0; i < m*n*batch; i++) {
5193 double err = std::fabs(d[i] - d_chk[i]);
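// d is laid out with m fastest, then n, then batch, so the flat index i
// decomposes as b = i / (m*n), n = (i % (m*n)) / m, m = i % m, which is
// how the first mismatch below is reported.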
5194 avg_err += err;
5195
5196 if ((err > 0.05f || std::isnan(err)) && first_err_n == -1) {
5197 first_err_b = i / (m * n);
5198 first_err_n = (i % (m * n)) / m;
5199 first_err_m = (i % (m * n)) % m;
5200 }
5201 }
5202
5203 avg_err /= m * n * batch;
5204
5205 std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl;
5206
5207 if (avg_err > 0.01 || std::isnan(avg_err)) {
5208 std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
5209 std::cerr << "Actual result: " << std::endl << std::endl;
5210 ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
5211 std::cerr << std::endl;
5212 std::cerr << "Expected result: " << std::endl << std::endl;
5213 ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
5214
5215 if (split_k > 1) {
5216 float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
5217 ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
5218
5219 std::cerr << "d_buf0: " << std::endl << std::endl;
5220 ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
5221
5222 std::cerr << "d_buf1: " << std::endl << std::endl;
5223 ggml_vk_print_matrix_area(split_k_buf + d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
5224
5225 std::cerr << "d_buf2: " << std::endl << std::endl;
5226 ggml_vk_print_matrix_area(split_k_buf + 2 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
5227
5228 std::cerr << "d_buf3: " << std::endl << std::endl;
5229 ggml_vk_print_matrix_area(split_k_buf + 3 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
5230
5231 free(split_k_buf);
5232 }
5233 }
5234
5235 ggml_vk_destroy_buffer(qx_buf);
5236 ggml_vk_destroy_buffer(y_buf);
5237 ggml_vk_destroy_buffer(d_buf);
5238
5239 free(x);
5240 free(qx);
5241 free(y);
5242 free(d);
5243 free(d_chk);
5244 }
5245 #endif
5246
5247 static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
5248 VK_LOG_DEBUG("ggml_vk_tensor_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))");
5249 ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
5250 extra->reset();
5251 tensor->extra = extra;
5252 return extra;
5253 }
5254
5255 static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node) {
5256 VK_LOG_DEBUG("ggml_vk_preallocate_buffers_graph(" << node << ")");
5257 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
5258
5259 if (extra == nullptr) {
5260 return;
5261 }
5262
5263 ggml_tensor * src0 = node->src[0];
5264 ggml_tensor * src1 = node->src[1];
5265
5266 const bool use_src0 = src0 != nullptr;
5267 const int64_t ne00 = use_src0 ? src0->ne[0] : 0;
5268 const int64_t ne01 = use_src0 ? src0->ne[1] : 0;
5269 const int64_t ne02 = use_src0 ? src0->ne[2] : 0;
5270 const int64_t ne03 = use_src0 ? src0->ne[3] : 0;
5271 const bool use_src1 = src1 != nullptr && node->op != GGML_OP_CPY && node->op != GGML_OP_CONT && node->op != GGML_OP_DUP;
5272 const int64_t ne10 = use_src1 ? src1->ne[0] : 0;
5273 const int64_t ne11 = use_src1 ? src1->ne[1] : 0;
5274 const int64_t ne12 = use_src1 ? src1->ne[2] : 0;
5275 const int64_t ne13 = use_src1 ?
src1->ne[3] : 0;
5276 const int64_t ne20 = node->ne[0];
5277 const int64_t ne21 = node->ne[1];
5278 const int64_t ne22 = node->ne[2];
5279 const int64_t ne23 = node->ne[3];
5280
5281 const ggml_type src0_type = (use_src0 && src0->type == GGML_TYPE_F32) ? src0->type : GGML_TYPE_F16;
5282 const ggml_type src1_type = (use_src1 && src1->type == GGML_TYPE_F32) ? src1->type : GGML_TYPE_F16;
5283
5284 const bool x_non_contig = use_src0 && !ggml_vk_dim01_contiguous(src0);
5285 const bool y_non_contig = use_src1 && !ggml_vk_dim01_contiguous(src1);
5286
5287 const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig;
5288
5289 bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
5290
5291 const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig);
5292 const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);
5293
5294 int split_k;
5295 if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
5296 split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
5297 } else {
5298 split_k = 1;
5299 }
5300 const uint32_t x_ne = ne00 * ne01;
5301 const uint32_t y_ne = ne10 * ne11;
5302 const uint32_t d_ne = ne20 * ne21;
5303
// Worst-case sizes of the temporary dequantized copies of src0/src1: one
// aligned x_ne/y_ne matrix of src0_type/src1_type elements per ne2*ne3 slice.
5304 const uint64_t x_sz = (use_src0 && qx_needs_dequant) ? ggml_vk_align_size(ggml_type_size(src0_type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
5305 const uint64_t y_sz = (use_src1 && qy_needs_dequant) ? ggml_vk_align_size(ggml_type_size(src1_type) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
5306 uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
5307 const uint64_t split_k_size = split_k > 1 ?
d_sz * 4 : 0; 5308 5309 if (extra->buffer_gpu.expired()) { 5310 // Workaround for CPU backend BLAS matmul calls 5311 extra->buffer_gpu = ggml_vk_create_buffer_temp(ctx, d_sz); 5312 } 5313 5314 switch (node->op) { 5315 case GGML_OP_REPEAT: 5316 case GGML_OP_GET_ROWS: 5317 case GGML_OP_RESHAPE: 5318 case GGML_OP_VIEW: 5319 case GGML_OP_PERMUTE: 5320 case GGML_OP_TRANSPOSE: 5321 case GGML_OP_ADD: 5322 case GGML_OP_SCALE: 5323 case GGML_OP_SQR: 5324 case GGML_OP_CLAMP: 5325 case GGML_OP_CPY: 5326 case GGML_OP_CONT: 5327 case GGML_OP_DUP: 5328 case GGML_OP_MUL: 5329 case GGML_OP_DIV: 5330 case GGML_OP_NORM: 5331 case GGML_OP_RMS_NORM: 5332 case GGML_OP_DIAG_MASK_INF: 5333 case GGML_OP_SOFT_MAX: 5334 case GGML_OP_ROPE: 5335 case GGML_OP_ARGSORT: 5336 case GGML_OP_SUM_ROWS: 5337 break; 5338 case GGML_OP_UNARY: 5339 switch (ggml_get_unary_op(node)) { 5340 case GGML_UNARY_OP_SILU: 5341 case GGML_UNARY_OP_GELU: 5342 case GGML_UNARY_OP_RELU: 5343 break; 5344 default: 5345 return; 5346 } 5347 break; 5348 case GGML_OP_MUL_MAT: 5349 case GGML_OP_MUL_MAT_ID: 5350 if (ctx->prealloc_size_x < x_sz) { 5351 ctx->prealloc_size_x = x_sz; 5352 } 5353 if (ctx->prealloc_size_y < y_sz) { 5354 ctx->prealloc_size_y = y_sz; 5355 } 5356 if (ctx->prealloc_size_split_k < split_k_size) { 5357 ctx->prealloc_size_split_k = split_k_size; 5358 } 5359 if (ctx->staging_size < x_sz + y_sz) { 5360 ctx->staging_size = x_sz + y_sz; 5361 } 5362 break; 5363 default: 5364 return; 5365 } 5366 } 5367 5368 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { 5369 #if defined(GGML_VULKAN_RUN_TESTS) 5370 ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, 5371 vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, 5372 vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); 5373 ggml_vk_test_transfer(ctx, 8192 * 1000, false); 5374 ggml_vk_test_transfer(ctx, 8192 * 1000, true); 5375 5376 ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32); 5377 ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0); 5378 ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1); 5379 ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_0); 5380 ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_1); 5381 ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q8_0); 5382 ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q2_K); 5383 ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q3_K); 5384 ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K); 5385 ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K); 5386 ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K); 5387 5388 ggml_vk_test_matmul<ggml_fp16_t, ggml_fp16_t>(ctx, 512, 512, 100, 32, 100, 1, 2); 5389 5390 ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 0); 5391 ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 1); 5392 ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 2); 5393 ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0); 5394 ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1); 5395 ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2); 5396 5397 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0); 5398 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0); 5399 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0); 5400 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0); 5401 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, 
GGML_TYPE_Q4_0); 5402 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0); 5403 5404 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1); 5405 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1); 5406 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1); 5407 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1); 5408 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1); 5409 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1); 5410 5411 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0); 5412 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0); 5413 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0); 5414 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0); 5415 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0); 5416 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0); 5417 5418 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1); 5419 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1); 5420 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1); 5421 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1); 5422 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1); 5423 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1); 5424 5425 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0); 5426 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q8_0); 5427 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0); 5428 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0); 5429 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0); 5430 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0); 5431 5432 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K); 5433 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K); 5434 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K); 5435 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K); 5436 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K); 5437 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K); 5438 5439 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K); 5440 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K); 5441 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K); 5442 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K); 5443 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K); 5444 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K); 5445 5446 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K); 5447 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K); 5448 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K); 5449 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K); 5450 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 
100, 4, 1, GGML_TYPE_Q4_K); 5451 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K); 5452 5453 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K); 5454 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K); 5455 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K); 5456 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K); 5457 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K); 5458 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K); 5459 5460 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K); 5461 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K); 5462 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K); 5463 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K); 5464 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K); 5465 ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K); 5466 5467 std::cerr << std::endl; 5468 5469 const std::vector<size_t> vals { 5470 8, 8, 8, 5471 100, 46, 576, 5472 623, 111, 128, 5473 100, 46, 558, 5474 512, 1, 256, 5475 128, 110, 622, 5476 511, 511, 127, 5477 511, 511, 7, 5478 511, 511, 17, 5479 49, 49, 128, 5480 128, 49, 49, 5481 4096, 49, 4096, 5482 11008, 49, 4096, 5483 4096, 49, 11008, 5484 32000, 49, 4096, 5485 512, 512, 128, 5486 128, 512, 512, 5487 4096, 512, 4096, 5488 11008, 512, 4096, 5489 4096, 512, 11008, 5490 32000, 512, 4096, 5491 }; 5492 const size_t num_it = 1; 5493 for (size_t i = 0; i < vals.size(); i += 3) { 5494 ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0); 5495 ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1); 5496 ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2); 5497 ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0); 5498 ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1); 5499 ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2); 5500 std::cerr << std::endl; 5501 } 5502 5503 GGML_ASSERT(false); 5504 #endif 5505 5506 if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) { 5507 VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")"); 5508 // Resize buffer 5509 if (ctx->prealloc_x != nullptr) { 5510 ggml_vk_destroy_buffer(ctx->prealloc_x); 5511 } 5512 ctx->prealloc_x = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_x); 5513 } 5514 if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) { 5515 VK_LOG_MEMORY("ggml_vk_preallocate_buffers(y_size: " << ctx->prealloc_size_y << ")"); 5516 // Resize buffer 5517 if (ctx->prealloc_y != nullptr) { 5518 ggml_vk_destroy_buffer(ctx->prealloc_y); 5519 } 5520 ctx->prealloc_y = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_y); 5521 } 5522 if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) { 5523 VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")"); 5524 // Resize buffer 5525 if (ctx->prealloc_split_k != nullptr) { 5526 
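// Vulkan buffer objects cannot grow in place, so resizing scratch space means
// destroying the old buffer and creating a larger one; the previous contents
// are discarded, which is fine for these temporaries.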
ggml_vk_destroy_buffer(ctx->prealloc_split_k); 5527 } 5528 ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_split_k); 5529 } 5530 if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) { 5531 VK_LOG_MEMORY("ggml_vk_preallocate_buffers(staging_size: " << ctx->staging_size << ")"); 5532 // Resize buffer 5533 if (ctx->staging != nullptr) { 5534 ggml_vk_destroy_buffer(ctx->staging); 5535 } 5536 ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, 5537 vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, 5538 vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); 5539 } 5540 } 5541 5542 static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){ 5543 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra; 5544 5545 if (ggml_is_empty(node) || extra == nullptr) { 5546 return; 5547 } 5548 5549 VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")"); 5550 ctx->semaphore_idx = 0; 5551 ctx->staging_offset = 0; 5552 5553 const ggml_tensor * src0 = node->src[0]; 5554 const ggml_tensor * src1 = node->src[1]; 5555 const ggml_tensor * src2 = node->src[2]; 5556 5557 switch (node->op) { 5558 // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor 5559 case GGML_OP_RESHAPE: 5560 case GGML_OP_VIEW: 5561 case GGML_OP_PERMUTE: 5562 case GGML_OP_TRANSPOSE: 5563 case GGML_OP_NONE: 5564 return; 5565 case GGML_OP_UNARY: 5566 switch (ggml_get_unary_op(node)) { 5567 case GGML_UNARY_OP_SILU: 5568 case GGML_UNARY_OP_GELU: 5569 case GGML_UNARY_OP_RELU: 5570 break; 5571 default: 5572 return; 5573 } 5574 break; 5575 case GGML_OP_REPEAT: 5576 case GGML_OP_GET_ROWS: 5577 case GGML_OP_ADD: 5578 case GGML_OP_MUL: 5579 case GGML_OP_DIV: 5580 case GGML_OP_SCALE: 5581 case GGML_OP_SQR: 5582 case GGML_OP_CLAMP: 5583 case GGML_OP_CPY: 5584 case GGML_OP_CONT: 5585 case GGML_OP_DUP: 5586 case GGML_OP_NORM: 5587 case GGML_OP_RMS_NORM: 5588 case GGML_OP_DIAG_MASK_INF: 5589 case GGML_OP_SOFT_MAX: 5590 case GGML_OP_ROPE: 5591 case GGML_OP_MUL_MAT: 5592 case GGML_OP_MUL_MAT_ID: 5593 case GGML_OP_ARGSORT: 5594 case GGML_OP_SUM_ROWS: 5595 break; 5596 default: 5597 std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl; 5598 GGML_ASSERT(false); 5599 return; 5600 } 5601 5602 if (ctx->compute_ctx == nullptr) { 5603 ctx->compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); 5604 ggml_vk_ctx_begin(ctx, ctx->compute_ctx); 5605 } 5606 5607 switch (node->op) { 5608 case GGML_OP_REPEAT: 5609 ggml_vk_repeat(ctx, ctx->compute_ctx, src0, src1, node); 5610 5611 break; 5612 case GGML_OP_GET_ROWS: 5613 ggml_vk_get_rows(ctx, ctx->compute_ctx, src0, src1, node); 5614 5615 break; 5616 case GGML_OP_ADD: 5617 ggml_vk_add(ctx, ctx->compute_ctx, src0, src1, node); 5618 5619 break; 5620 case GGML_OP_MUL: 5621 ggml_vk_mul(ctx, ctx->compute_ctx, src0, src1, node); 5622 5623 break; 5624 case GGML_OP_DIV: 5625 ggml_vk_div(ctx, ctx->compute_ctx, src0, src1, node); 5626 5627 break; 5628 case GGML_OP_SCALE: 5629 ggml_vk_scale(ctx, ctx->compute_ctx, src0, node); 5630 5631 break; 5632 case GGML_OP_SQR: 5633 ggml_vk_sqr(ctx, ctx->compute_ctx, src0, node); 5634 5635 break; 5636 case GGML_OP_CLAMP: 5637 ggml_vk_clamp(ctx, ctx->compute_ctx, src0, node); 5638 5639 break; 5640 case GGML_OP_CPY: 5641 case GGML_OP_CONT: 5642 case 
GGML_OP_DUP: 5643 ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node); 5644 5645 break; 5646 case GGML_OP_NORM: 5647 ggml_vk_norm(ctx, ctx->compute_ctx, src0, node); 5648 5649 break; 5650 case GGML_OP_RMS_NORM: 5651 ggml_vk_rms_norm(ctx, ctx->compute_ctx, src0, node); 5652 5653 break; 5654 case GGML_OP_UNARY: 5655 switch (ggml_get_unary_op(node)) { 5656 case GGML_UNARY_OP_SILU: 5657 case GGML_UNARY_OP_GELU: 5658 case GGML_UNARY_OP_RELU: 5659 ggml_vk_unary(ctx, ctx->compute_ctx, src0, node); 5660 break; 5661 default: 5662 return; 5663 } 5664 break; 5665 case GGML_OP_DIAG_MASK_INF: 5666 ggml_vk_diag_mask_inf(ctx, ctx->compute_ctx, src0, node); 5667 5668 break; 5669 case GGML_OP_SOFT_MAX: 5670 ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node); 5671 5672 break; 5673 case GGML_OP_ROPE: 5674 ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node); 5675 5676 break; 5677 case GGML_OP_ARGSORT: 5678 ggml_vk_argsort(ctx, ctx->compute_ctx, src0, node); 5679 5680 break; 5681 case GGML_OP_SUM_ROWS: 5682 ggml_vk_sum_rows(ctx, ctx->compute_ctx, src0, node); 5683 5684 break; 5685 case GGML_OP_MUL_MAT: 5686 ggml_vk_mul_mat(ctx, ctx->compute_ctx, src0, src1, node); 5687 5688 break; 5689 case GGML_OP_MUL_MAT_ID: 5690 ggml_vk_mul_mat_id(ctx, ctx->compute_ctx, src0, src1, src2, node); 5691 5692 break; 5693 default: 5694 return; 5695 } 5696 5697 extra->ctx_idx = ctx->compute_ctx->idx; 5698 5699 #ifdef GGML_VULKAN_CHECK_RESULTS 5700 // Force context reset on each node so that each tensor ends up in its own context 5701 // and can be run and compared to its CPU equivalent separately 5702 last_node = true; 5703 #endif 5704 5705 if (last_node) { 5706 ggml_vk_ctx_end(ctx->compute_ctx); 5707 ctx->compute_ctx->exit_tensor = node; 5708 ctx->compute_ctx = nullptr; 5709 } 5710 } 5711 5712 static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){ 5713 ggml_tensor_extra_gpu * extra = nullptr; 5714 5715 switch (tensor->op) { 5716 case GGML_OP_ADD: 5717 case GGML_OP_GET_ROWS: 5718 case GGML_OP_MUL: 5719 case GGML_OP_DIV: 5720 case GGML_OP_SCALE: 5721 case GGML_OP_SQR: 5722 case GGML_OP_CLAMP: 5723 case GGML_OP_CPY: 5724 case GGML_OP_CONT: 5725 case GGML_OP_DUP: 5726 case GGML_OP_NORM: 5727 case GGML_OP_RMS_NORM: 5728 case GGML_OP_DIAG_MASK_INF: 5729 case GGML_OP_SOFT_MAX: 5730 case GGML_OP_ROPE: 5731 case GGML_OP_RESHAPE: 5732 case GGML_OP_VIEW: 5733 case GGML_OP_PERMUTE: 5734 case GGML_OP_TRANSPOSE: 5735 case GGML_OP_NONE: 5736 case GGML_OP_ARGSORT: 5737 case GGML_OP_SUM_ROWS: 5738 extra = (ggml_tensor_extra_gpu *) tensor->extra; 5739 5740 break; 5741 case GGML_OP_UNARY: 5742 switch (ggml_get_unary_op(tensor)) { 5743 case GGML_UNARY_OP_SILU: 5744 case GGML_UNARY_OP_GELU: 5745 case GGML_UNARY_OP_RELU: 5746 extra = (ggml_tensor_extra_gpu *) tensor->extra; 5747 break; 5748 default: 5749 return false; 5750 } 5751 break; 5752 case GGML_OP_MUL_MAT: 5753 case GGML_OP_MUL_MAT_ID: 5754 extra = (ggml_tensor_extra_gpu *) tensor->extra; 5755 5756 break; 5757 default: 5758 return false; 5759 } 5760 5761 if (extra == nullptr) { 5762 return false; 5763 } 5764 5765 if (params->ith != 0) { 5766 return true; 5767 } 5768 if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { 5769 return true; 5770 } 5771 5772 VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", 
ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")"); 5773 5774 #ifdef GGML_VULKAN_CHECK_RESULTS 5775 ggml_vk_check_results_0(ctx, params, tensor); 5776 #endif 5777 5778 vk_context& subctx = ctx->gc.contexts[extra->ctx_idx]; 5779 5780 // Only run if ctx hasn't been submitted yet 5781 if (!subctx.seqs.empty()) { 5782 // Do staging buffer copies 5783 for (auto& cpy : subctx.in_memcpys) { 5784 memcpy(cpy.dst, cpy.src, cpy.n); 5785 } 5786 5787 ggml_vk_submit(&subctx, ctx->fence); 5788 } 5789 5790 if (tensor == subctx.exit_tensor) { 5791 VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); 5792 ctx->device->device.resetFences({ ctx->fence }); 5793 5794 // Do staging buffer copies 5795 for (auto& cpy : subctx.out_memcpys) { 5796 memcpy(cpy.dst, cpy.src, cpy.n); 5797 } 5798 subctx.in_memcpys.clear(); 5799 subctx.out_memcpys.clear(); 5800 } 5801 5802 return true; 5803 } 5804 5805 // Clean up after graph processing is done 5806 static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { 5807 VK_LOG_DEBUG("ggml_vk_graph_cleanup()"); 5808 for (auto& buffer : ctx->gc.temp_buffers) { 5809 ggml_vk_pool_free(ctx, buffer); 5810 } 5811 ctx->gc.temp_buffers.clear(); 5812 5813 for (auto& pipeline : ctx->device->pipelines) { 5814 if (pipeline.expired()) { 5815 continue; 5816 } 5817 5818 vk_pipeline pl = pipeline.lock(); 5819 ggml_pipeline_cleanup(pl); 5820 } 5821 5822 ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue); 5823 ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue); 5824 5825 for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) { 5826 ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s }); 5827 } 5828 ctx->gc.semaphores.clear(); 5829 5830 for (size_t i = 0; i < ctx->gc.tl_semaphores.size(); i++) { 5831 ctx->device->device.destroySemaphore({ ctx->gc.tl_semaphores[i].s }); 5832 } 5833 ctx->gc.tl_semaphores.clear(); 5834 ctx->semaphore_idx = 0; 5835 5836 ctx->event_idx = 0; 5837 5838 for (auto& event : ctx->gc.events) { 5839 ctx->device->device.resetEvent(event); 5840 } 5841 5842 ctx->staging_offset = 0; 5843 5844 ctx->compute_ctx = nullptr; 5845 ctx->transfer_ctx = nullptr; 5846 ctx->gc.contexts.clear(); 5847 } 5848 5849 // Clean up on backend free 5850 static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { 5851 VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->idx << ")"); 5852 ggml_vk_graph_cleanup(ctx); 5853 5854 ggml_vk_destroy_buffer(ctx->prealloc_x); 5855 ggml_vk_destroy_buffer(ctx->prealloc_y); 5856 ggml_vk_destroy_buffer(ctx->prealloc_split_k); 5857 ggml_vk_destroy_buffer(ctx->staging); 5858 ggml_vk_destroy_buffer(ctx->sync_staging); 5859 5860 for (auto& buffer : ctx->buffer_pool) { 5861 ggml_vk_destroy_buffer(buffer); 5862 } 5863 5864 ctx->prealloc_size_x = 0; 5865 ctx->prealloc_size_y = 0; 5866 ctx->prealloc_size_split_k = 0; 5867 ctx->staging_size = 0; 5868 5869 for (auto& event : ctx->gc.events) { 5870 ctx->device->device.destroyEvent(event); 5871 } 5872 ctx->gc.events.clear(); 5873 5874 ctx->device->device.destroyFence(ctx->fence); 5875 } 5876 5877 GGML_CALL static int ggml_vk_get_device_count() { 5878 ggml_vk_instance_init(); 5879 5880 return vk_instance.device_indices.size(); 5881 } 5882 5883 GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) { 5884 
ggml_vk_instance_init(); 5885 5886 std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices(); 5887 5888 vk::PhysicalDeviceProperties props; 5889 devices[device].getProperties(&props); 5890 5891 snprintf(description, description_size, "%s", props.deviceName.data()); 5892 } 5893 5894 // backend interface 5895 5896 #define UNUSED GGML_UNUSED 5897 5898 // device backend 5899 5900 static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT 5901 5902 struct ggml_backend_vk_buffer_context { 5903 ggml_backend_vk_context * ctx; 5904 vk_buffer dev_buffer; 5905 ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; 5906 size_t temp_tensor_extra_index = 0; 5907 std::string name; 5908 5909 ggml_backend_vk_buffer_context(ggml_backend_vk_context * ctx, vk_buffer&& dev_buffer, std::string& name) : 5910 ctx(ctx), 5911 dev_buffer(dev_buffer), 5912 name(name) { 5913 } 5914 5915 ~ggml_backend_vk_buffer_context() { 5916 ggml_vk_destroy_buffer(dev_buffer); 5917 if (temp_tensor_extras != nullptr) { 5918 delete[] temp_tensor_extras; 5919 } 5920 } 5921 5922 ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() { 5923 if (temp_tensor_extras == nullptr) { 5924 temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_VK_MAX_NODES]; 5925 } 5926 5927 size_t alloc_index = temp_tensor_extra_index; 5928 temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_VK_MAX_NODES; 5929 ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index]; 5930 extra->reset(); 5931 5932 return extra; 5933 } 5934 }; 5935 5936 GGML_CALL static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) { 5937 ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; 5938 return ctx->name.c_str(); 5939 } 5940 5941 GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { 5942 return buffer->iface.get_name == ggml_backend_vk_buffer_get_name; 5943 } 5944 5945 GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { 5946 VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()"); 5947 ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; 5948 ggml_vk_destroy_buffer(ctx->dev_buffer); 5949 delete ctx; 5950 } 5951 5952 GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { 5953 return vk_ptr_base; 5954 5955 UNUSED(buffer); 5956 } 5957 5958 GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { 5959 VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")"); 5960 ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; 5961 5962 if (tensor->view_src != nullptr) { 5963 GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); 5964 GGML_ASSERT(tensor->view_src->extra != nullptr); 5965 tensor->extra = tensor->view_src->extra; 5966 } else { 5967 ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra(); 5968 extra->buffer_gpu = ctx->dev_buffer; 5969 extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base; 5970 tensor->extra = extra; 5971 } 5972 } 5973 5974 GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { 5975 VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); 5976 
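// tensor->data does not point at real memory for this backend: get_base()
// returns the fake vk_ptr_base, so init_tensor() stored the tensor's byte
// offset into the backing vk_buffer in extra->offset. The device address of
// this access is therefore extra->offset + tensor->view_offs + offset.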
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; 5977 5978 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; 5979 5980 vk_buffer buf = extra->buffer_gpu.lock(); 5981 5982 ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size); 5983 } 5984 5985 GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { 5986 VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); 5987 ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; 5988 5989 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; 5990 5991 vk_buffer buf = extra->buffer_gpu.lock(); 5992 5993 ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size); 5994 } 5995 5996 GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { 5997 if (ggml_backend_buffer_is_vk(src->buffer)) { 5998 ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; 5999 ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; 6000 6001 vk_buffer src_buf = src_extra->buffer_gpu.lock(); 6002 vk_buffer dst_buf = dst_extra->buffer_gpu.lock(); 6003 6004 ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src)); 6005 6006 return true; 6007 } 6008 return false; 6009 6010 UNUSED(buffer); 6011 } 6012 6013 GGML_CALL static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { 6014 ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; 6015 6016 ggml_vk_buffer_memset(ctx->ctx, ctx->dev_buffer, 0, value, buffer->size); 6017 } 6018 6019 static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { 6020 /* .get_name = */ ggml_backend_vk_buffer_get_name, 6021 /* .free_buffer = */ ggml_backend_vk_buffer_free_buffer, 6022 /* .get_base = */ ggml_backend_vk_buffer_get_base, 6023 /* .init_tensor = */ ggml_backend_vk_buffer_init_tensor, 6024 /* .set_tensor = */ ggml_backend_vk_buffer_set_tensor, 6025 /* .get_tensor = */ ggml_backend_vk_buffer_get_tensor, 6026 /* .cpy_tensor = */ ggml_backend_vk_buffer_cpy_tensor, 6027 /* .clear = */ ggml_backend_vk_buffer_clear, 6028 /* .reset = */ NULL, 6029 }; 6030 6031 // vk buffer type 6032 struct ggml_backend_vk_buffer_type_context { 6033 std::string name; 6034 ggml_backend_vk_context * ctx; 6035 }; 6036 6037 GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) { 6038 ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *)buft->context; 6039 6040 return ctx->name.c_str(); 6041 } 6042 6043 GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { 6044 VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")"); 6045 ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; 6046 6047 vk_buffer dev_buffer = nullptr; 6048 try { 6049 dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size); 6050 } catch (const vk::SystemError& e) { 6051 return nullptr; 6052 } 6053 6054 ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, 
std::move(dev_buffer), ctx->name); 6055 6056 return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size); 6057 } 6058 6059 GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { 6060 ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; 6061 return ctx->ctx->device->properties.limits.minStorageBufferOffsetAlignment; 6062 } 6063 6064 GGML_CALL static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { 6065 ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; 6066 return ctx->ctx->device->max_memory_allocation_size; 6067 } 6068 6069 GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { 6070 return ggml_nbytes(tensor); 6071 6072 UNUSED(buft); 6073 } 6074 6075 static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { 6076 /* .get_name = */ ggml_backend_vk_buffer_type_name, 6077 /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer, 6078 /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment, 6079 /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size, 6080 /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size, 6081 /* .is_host = */ NULL, 6082 }; 6083 6084 GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { 6085 ggml_vk_instance_init(); 6086 6087 VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")"); 6088 6089 GGML_ASSERT(dev_num < vk_instance.device_indices.size()); 6090 6091 ggml_backend_vk_init(dev_num); 6092 6093 return &vk_instance.buffer_types[dev_num]; 6094 } 6095 6096 // host buffer type 6097 6098 GGML_CALL static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) { 6099 return GGML_VK_NAME "_Host"; 6100 6101 UNUSED(buft); 6102 } 6103 6104 GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) { 6105 return GGML_VK_NAME "_Host"; 6106 6107 UNUSED(buffer); 6108 } 6109 6110 GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { 6111 VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); 6112 ggml_vk_host_free(&vk_instance.contexts[0], buffer->context); 6113 } 6114 6115 GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { 6116 VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")"); 6117 size += 32; // Behave like the CPU buffer type 6118 void * ptr = nullptr; 6119 try { 6120 ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size); 6121 } catch (vk::SystemError& e) { 6122 std::cerr << "ggml_vulkan: Failed to allocate pinned memory." 
<< std::endl; 6123 std::cerr << "ggml_vulkan: " << e.what() << std::endl; 6124 // fallback to cpu buffer 6125 return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); 6126 } 6127 6128 ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); 6129 buffer->buft = buft; 6130 buffer->iface.get_name = ggml_backend_vk_host_buffer_name; 6131 buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer; 6132 6133 return buffer; 6134 } 6135 6136 GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { 6137 return vk_instance.contexts[0].device->properties.limits.minMemoryMapAlignment; 6138 6139 UNUSED(buft); 6140 } 6141 6142 GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { 6143 static struct ggml_backend_buffer_type ggml_backend_vk_buffer_type_host = { 6144 /* .iface = */ { 6145 /* .get_name = */ ggml_backend_vk_host_buffer_type_name, 6146 /* .alloc_buffer = */ ggml_backend_vk_host_buffer_type_alloc_buffer, 6147 /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment, 6148 /* .get_max_size = */ NULL, // defaults to SIZE_MAX 6149 /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, 6150 /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, 6151 }, 6152 /* .context = */ nullptr, 6153 }; 6154 6155 if (!vk_instance.contexts[0].initialized) { 6156 // Fall back to CPU 6157 return ggml_backend_cpu_buffer_type(); 6158 } 6159 6160 return &ggml_backend_vk_buffer_type_host; 6161 } 6162 6163 // backend 6164 6165 GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) { 6166 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; 6167 6168 return ctx->name.c_str(); 6169 } 6170 6171 GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) { 6172 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; 6173 VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")"); 6174 6175 size_t idx = ctx->idx; 6176 6177 ggml_vk_cleanup(ctx); 6178 6179 ctx->device.reset(); 6180 ctx->initialized = false; 6181 6182 vk_instance.initialized[idx] = false; 6183 vk_instance.backends[idx] = nullptr; 6184 memset(&vk_instance.buffer_types[idx], 0, sizeof(ggml_backend_buffer_type)); 6185 delete backend; 6186 } 6187 6188 GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) { 6189 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; 6190 6191 GGML_ASSERT(ctx->initialized); 6192 6193 return ggml_backend_vk_buffer_type(ctx->idx); 6194 } 6195 6196 GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { 6197 VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")"); 6198 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; 6199 GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); 6200 6201 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; 6202 6203 if (ctx->transfer_ctx == nullptr) { 6204 // Initialize new transfer context 6205 ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); 6206 ggml_vk_ctx_begin(ctx, ctx->transfer_ctx); 6207 } 6208 6209 vk_buffer buf = extra->buffer_gpu.lock(); 6210 6211 ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, 
extra->offset + tensor->view_offs + offset, data, size); 6212 } 6213 6214 GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { 6215 VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")"); 6216 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; 6217 GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); 6218 6219 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; 6220 6221 if (ctx->transfer_ctx == nullptr) { 6222 // Initialize new transfer context 6223 ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); 6224 ggml_vk_ctx_begin(ctx, ctx->transfer_ctx); 6225 } 6226 6227 vk_buffer buf = extra->buffer_gpu.lock(); 6228 6229 ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size); 6230 } 6231 6232 GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { 6233 VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()"); 6234 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; 6235 if ((dst->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) { 6236 ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; 6237 ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; 6238 6239 if (ctx->transfer_ctx == nullptr) { 6240 // Initialize new transfer context 6241 ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); 6242 ggml_vk_ctx_begin(ctx, ctx->transfer_ctx); 6243 } 6244 6245 vk_buffer src_buf = src_extra->buffer_gpu.lock(); 6246 vk_buffer dst_buf = dst_extra->buffer_gpu.lock(); 6247 6248 ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src)); 6249 return true; 6250 } 6251 6252 return false; 6253 } 6254 6255 GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) { 6256 VK_LOG_DEBUG("ggml_backend_vk_synchronize()"); 6257 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; 6258 if(ctx->transfer_ctx == nullptr) { 6259 return; 6260 } 6261 6262 ggml_vk_ctx_end(ctx->transfer_ctx); 6263 6264 for (auto& cpy : ctx->transfer_ctx->in_memcpys) { 6265 memcpy(cpy.dst, cpy.src, cpy.n); 6266 } 6267 6268 ggml_vk_submit(ctx->transfer_ctx, ctx->fence); 6269 VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences"); 6270 ctx->device->device.resetFences({ ctx->fence }); 6271 6272 for (auto& cpy : ctx->transfer_ctx->out_memcpys) { 6273 memcpy(cpy.dst, cpy.src, cpy.n); 6274 } 6275 6276 ctx->transfer_ctx = nullptr; 6277 } 6278 6279 static bool ggml_vk_is_empty(ggml_tensor * node) { 6280 return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; 6281 } 6282 6283 GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { 6284 VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); 6285 ggml_backend_vk_context * ctx = (ggml_backend_vk_context 
*)backend->context;
6286
6287 for (int i = 0; i < cgraph->n_nodes; i++) {
6288 ggml_vk_preallocate_buffers_graph(ctx, cgraph->nodes[i]);
6289 }
6290 ggml_vk_preallocate_buffers(ctx);
6291
6292 int last_node = cgraph->n_nodes - 1;
6293
6294 // If the last ops in the cgraph are no-ops for this backend, the command buffer wouldn't get closed properly, so walk back to the last real node
6295 while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) {
6296 last_node -= 1;
6297 }
6298
6299 for (int i = 0; i < cgraph->n_nodes; i++) {
6300 ggml_vk_build_graph(ctx, cgraph->nodes[i], i == last_node);
6301 }
6302
6303 ggml_compute_params params = {};
6304 params.type = GGML_TASK_TYPE_COMPUTE;
6305 params.ith = 0;
6306 for (int i = 0; i < cgraph->n_nodes; i++) {
6307 ggml_tensor * node = cgraph->nodes[i];
6308
6309 if (ggml_vk_is_empty(node)) {
6310 continue;
6311 }
6312
6313 bool ok = ggml_vk_compute_forward(ctx, &params, node);
6314 if (!ok) {
6315 fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
6316 }
6317 #ifdef GGML_VULKAN_CHECK_RESULTS
6318 else {
6319 ggml_vk_check_results_1(ctx, &params, node);
6320 }
6321 #endif
6322 GGML_ASSERT(ok);
6323 }
6324
6325 ggml_vk_graph_cleanup(ctx);
6326
6327 return GGML_STATUS_SUCCESS;
6328
6329 UNUSED(backend);
6330 }
6331
6332 GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
6333 // ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;
6334
6335 switch (op->op) {
6336 case GGML_OP_UNARY:
6337 switch (ggml_get_unary_op(op)) {
6338 case GGML_UNARY_OP_GELU:
6339 case GGML_UNARY_OP_SILU:
6340 case GGML_UNARY_OP_RELU:
6341 return ggml_is_contiguous(op->src[0]);
6342 default:
6343 return false;
6344 }
6345 break;
6346 case GGML_OP_MUL_MAT:
6347 case GGML_OP_MUL_MAT_ID:
6348 {
6349 switch (op->src[0]->type) {
6350 case GGML_TYPE_F32:
6351 case GGML_TYPE_F16:
6352 case GGML_TYPE_Q4_0:
6353 case GGML_TYPE_Q4_1:
6354 case GGML_TYPE_Q5_0:
6355 case GGML_TYPE_Q5_1:
6356 case GGML_TYPE_Q8_0:
6357 case GGML_TYPE_Q2_K:
6358 case GGML_TYPE_Q3_K:
6359 case GGML_TYPE_Q4_K:
6360 case GGML_TYPE_Q5_K:
6361 case GGML_TYPE_Q6_K:
6362 break;
6363 default:
6364 return false;
6365 }
6366 struct ggml_tensor * a;
6367 struct ggml_tensor * b;
6368 if (op->op == GGML_OP_MUL_MAT) {
6369 a = op->src[0];
6370 b = op->src[1];
6371 } else {
6372 a = op->src[2];
6373 b = op->src[1];
6374 }
6375 if (a->ne[3] != b->ne[3]) {
6376 return false;
6377 }
6378 return true;
6379 } break;
6380 case GGML_OP_GET_ROWS:
6381 {
6382 switch (op->src[0]->type) {
6383 case GGML_TYPE_F32:
6384 case GGML_TYPE_F16:
6385 case GGML_TYPE_Q4_0:
6386 case GGML_TYPE_Q4_1:
6387 case GGML_TYPE_Q5_0:
6388 case GGML_TYPE_Q5_1:
6389 case GGML_TYPE_Q8_0:
6390 return true;
6391 default:
6392 return false;
6393 }
6394 } break;
6395 case GGML_OP_CPY:
6396 case GGML_OP_DUP:
6397 {
6398 ggml_type src0_type = op->src[0]->type;
6399 ggml_type src1_type = op->src[1] != nullptr ?
GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
    // ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;

    switch (op->op) {
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_RELU:
                    return ggml_is_contiguous(op->src[0]);
                default:
                    return false;
            }
            break;
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
            {
                switch (op->src[0]->type) {
                    case GGML_TYPE_F32:
                    case GGML_TYPE_F16:
                    case GGML_TYPE_Q4_0:
                    case GGML_TYPE_Q4_1:
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_Q8_0:
                    case GGML_TYPE_Q2_K:
                    case GGML_TYPE_Q3_K:
                    case GGML_TYPE_Q4_K:
                    case GGML_TYPE_Q5_K:
                    case GGML_TYPE_Q6_K:
                        break;
                    default:
                        return false;
                }
                struct ggml_tensor * a;
                struct ggml_tensor * b;
                if (op->op == GGML_OP_MUL_MAT) {
                    a = op->src[0];
                    b = op->src[1];
                } else {
                    a = op->src[2];
                    b = op->src[1];
                }
                if (a->ne[3] != b->ne[3]) {
                    return false;
                }
                return true;
            } break;
        case GGML_OP_GET_ROWS:
            {
                switch (op->src[0]->type) {
                    case GGML_TYPE_F32:
                    case GGML_TYPE_F16:
                    case GGML_TYPE_Q4_0:
                    case GGML_TYPE_Q4_1:
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_Q8_0:
                        return true;
                    default:
                        return false;
                }
            } break;
        case GGML_OP_CPY:
        case GGML_OP_DUP:
            {
                ggml_type src0_type = op->src[0]->type;
                ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
                    return true;
                }
                return false;
            } break;
        // case GGML_OP_REPEAT:
        //     {
        //         ggml_type src0_type = op->src[0]->type;
        //         return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
        //     } break;
        case GGML_OP_ROPE:
            return ggml_is_contiguous(op->src[0]);
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_NORM:
        case GGML_OP_ADD:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_RMS_NORM:
        case GGML_OP_SCALE:
        case GGML_OP_SQR:
        case GGML_OP_CLAMP:
        case GGML_OP_CONT:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_ARGSORT:
        case GGML_OP_SUM_ROWS:
            return true;
        default:
            return false;
    }

    UNUSED(backend);
}

GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
    const int min_batch_size = 32;

    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);

    UNUSED(backend);
}

GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
        return false;
    }

    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

    return buft_ctx->ctx->idx == ctx->idx;
}

// TODO: enable async and synchronize
static ggml_backend_i ggml_backend_vk_interface = {
    /* .get_name                = */ ggml_backend_vk_name,
    /* .free                    = */ ggml_backend_vk_free,
    /* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
    /* .set_tensor_async        = */ NULL,  // ggml_backend_vk_set_tensor_async,
    /* .get_tensor_async        = */ NULL,  // ggml_backend_vk_get_tensor_async,
    /* .cpy_tensor_async        = */ NULL,  // ggml_backend_vk_cpy_tensor_async,
    /* .synchronize             = */ NULL,  // ggml_backend_vk_synchronize,
    /* .graph_plan_create       = */ NULL,
    /* .graph_plan_free         = */ NULL,
    /* .graph_plan_update       = */ NULL,
    /* .graph_plan_compute      = */ NULL,
    /* .graph_compute           = */ ggml_backend_vk_graph_compute,
    /* .supports_op             = */ ggml_backend_vk_supports_op,
    /* .supports_buft           = */ ggml_backend_vk_supports_buft,
    /* .offload_op              = */ ggml_backend_vk_offload_op,
    /* .event_new               = */ NULL,
    /* .event_free              = */ NULL,
    /* .event_record            = */ NULL,
    /* .event_wait              = */ NULL,
    /* .event_synchronize       = */ NULL,
};

static ggml_guid_t ggml_backend_vk_guid() {
    static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
    return &guid;
}
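// The GUID identifies this backend implementation; ggml_guid_matches() compares
// it byte-for-byte. Illustrative sketch of a caller-side check (assumes the
// public API declared in ggml-vulkan.h):
//
//   if (ggml_backend_is_vk(backend)) {
//       size_t free, total;
//       ggml_backend_vk_get_device_memory(0, &free, &total);
//   }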
GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
    if (vk_instance.initialized[dev_num]) {
        return vk_instance.backends[dev_num];
    }
    VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");

    ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
    ggml_vk_init(ctx, dev_num);
    ctx->name = GGML_VK_NAME + std::to_string(dev_num);
    vk_instance.buffer_types[dev_num] = {
        /* .iface   = */ ggml_backend_vk_buffer_type_interface,
        /* .context = */ new ggml_backend_vk_buffer_type_context{ ctx->name, ctx },
    };
    vk_instance.initialized[dev_num] = true;

    ggml_backend_t vk_backend = new ggml_backend {
        /* .guid      = */ ggml_backend_vk_guid(),
        /* .interface = */ ggml_backend_vk_interface,
        /* .context   = */ &vk_instance.contexts[ctx->idx],
    };

    vk_instance.backends[dev_num] = vk_backend;

    return vk_backend;
}

GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) {
    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
}

GGML_CALL int ggml_backend_vk_get_device_count() {
    return ggml_vk_get_device_count();
}

GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
    ggml_vk_get_device_description(device, description, description_size);
}

GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
    GGML_ASSERT(device < (int) vk_instance.device_indices.size());

    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];

    vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();

    for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
        if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
            *total = heap.size;
            *free = heap.size;
            break;
        }
    }
}

// backend registry
GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, void * user_data) {
    ggml_backend_t vk_backend = ggml_backend_vk_init((int) (intptr_t) user_data);
    return vk_backend;

    UNUSED(params);
}

extern "C" GGML_CALL int ggml_backend_vk_reg_devices();

GGML_CALL int ggml_backend_vk_reg_devices() {
    ggml_vk_instance_init();

    for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
        char name[128];
        snprintf(name, sizeof(name), "%s%zu", GGML_VK_NAME, i);  // %zu: i is a size_t
        ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i);  // NOLINT
    }
    return (int) vk_instance.device_indices.size();
}
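// Example (illustrative): enumerating the registered Vulkan devices from
// application code using the public API defined above.
//
//   int n_dev = ggml_backend_vk_get_device_count();
//   for (int d = 0; d < n_dev; d++) {
//       char desc[256];
//       ggml_backend_vk_get_device_description(d, desc, sizeof(desc));
//       fprintf(stderr, "Vulkan device %d: %s\n", d, desc);
//   }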
// Extension availability
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
#ifdef GGML_VULKAN_VALIDATE
    // Check for the validation features extension
    for (const auto& properties : instance_extensions) {
        if (strcmp("VK_EXT_validation_features", properties.extensionName) == 0) {
            return true;
        }
    }
    std::cerr << "ggml_vulkan: WARNING: Instance extension VK_EXT_validation_features not found." << std::endl;
#endif
    return false;

    UNUSED(instance_extensions);
}

static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
#ifdef __APPLE__
    // Check for portability enumeration extension for MoltenVK support
    for (const auto& properties : instance_extensions) {
        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
            return true;
        }
    }
    std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
#endif
    return false;

    UNUSED(instance_extensions);
}

// checks

#ifdef GGML_VULKAN_CHECK_RESULTS
static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<const ggml_tensor *>& done, int level = 0) {
    if (std::find(done.begin(), done.end(), tensor) != done.end() || level > 10) {
        return;
    }
    for (int j = 0; j < level; j++) {
        std::cerr << " ";
    }
    std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl;

    done.push_back(tensor);

    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (tensor->src[i] != nullptr) {
            ggml_vk_print_graph_origin(tensor->src[i], done, level + 1);
        }
    }
}

static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
    if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16 && tensor->type != GGML_TYPE_I32) {
        return;
    }
    i0 = std::max(i0, 5);
    i1 = std::max(i1, 5);
    i2 = std::max(i2, 0);
    i3 = std::max(i3, 0);
    fprintf(stderr, "         ");
    for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
        fprintf(stderr, "%7d ", idx1);
    }
    fprintf(stderr, "\n");
    for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) {
        fprintf(stderr, "%7d: ", idx0);
        for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
            if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) {
                float val;
                if (tensor->type == GGML_TYPE_F32) {
                    val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
                } else if (tensor->type == GGML_TYPE_F16) {
                    val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
                } else if (tensor->type == GGML_TYPE_I32) {
                    val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
                } else {
                    GGML_ASSERT(false);
                }
                fprintf(stderr, "% 7.2f ", val);
            } else {
                fprintf(stderr, "        ");
            }
        }
        fprintf(stderr, "\n");
    }
}
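// Note: ggml_vk_print_tensor_area() dumps a 10x10 window of the (i2, i3) slice
// around element (i0, i1); the window start is clamped so it never goes below
// index 0, and positions outside the tensor bounds are printed as blanks.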
static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
    void * tensor_data = tensor->data;

    if (ggml_backend_buffer_is_vk(tensor->buffer)) {
        const size_t tensor_size = ggml_nbytes(tensor);
        tensor_data = malloc(tensor_size);

        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

        vk_buffer buffer_gpu = extra->buffer_gpu.lock();
        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
    }

    std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
    std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
    if (tensor->src[0] != nullptr) {
        std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
    }
    if (tensor->src[1] != nullptr) {
        std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
    }
    std::cerr << std::endl << "Result (i2 = 0):" << std::endl;
    ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
    std::cerr << std::endl;
    std::cerr << std::endl << "Result (i2 = 1):" << std::endl;
    ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0);
    std::cerr << std::endl;
    std::vector<const ggml_tensor *> done;
    ggml_vk_print_graph_origin(tensor, done);

    if (ggml_backend_buffer_is_vk(tensor->buffer)) {
        free(tensor_data);
    }
}

void * comp_result;
size_t comp_size;
size_t comp_nb[GGML_MAX_DIMS];
size_t check_counter = 0;
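// Result checking runs in two phases. ggml_vk_check_results_0() is called before
// a node executes on the GPU: it clones the node and its sources into a
// throwaway CPU ggml context, computes a reference result there and stashes it
// in comp_result/comp_nb. After the GPU op has run, ggml_vk_check_results_1()
// reads the Vulkan result back and compares it element-wise against that
// reference.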
static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor) {
    if (params->ith != 0) {
        return;
    }
    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
        return;
    }

    check_counter++;
    if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
        return;
    }

    VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")");

    ggml_tensor * src0 = tensor->src[0];
    ggml_tensor * src1 = tensor->src[1];
    ggml_tensor * src2 = tensor->src[2];

    struct ggml_init_params iparams = {
        /*.mem_size   =*/ 1024*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ggml_ctx = ggml_init(iparams);

    struct ggml_tensor * src0_clone = nullptr;
    struct ggml_tensor * src1_clone = nullptr;
    struct ggml_tensor * src2_clone = nullptr;
    struct ggml_tensor * tensor_clone = nullptr;

    size_t src0_size;
    size_t src1_size;
    size_t src2_size;

    void * src0_buffer = nullptr;
    void * src1_buffer = nullptr;
    void * src2_buffer = nullptr;

    if (src0 != nullptr) {
        src0_clone = ggml_dup_tensor(ggml_ctx, src0);

        src0_size = ggml_nbytes(src0);

        src0_buffer = malloc(src0_size);
        src0_clone->data = src0_buffer;
        if (ggml_backend_buffer_is_host(src0->buffer)) {
            memcpy(src0_clone->data, src0->data, src0_size);
            memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
        } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
            uint64_t offset = extra->offset + src0->view_offs;
            if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
                // Copy one (i1, i0) plane at a time and recompute contiguous strides for dims >= 2
                for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                    for (int i2 = 0; i2 < src0->ne[2]; i2++) {
                        const int idx = i3*src0->ne[2] + i2;
                        ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]);
                    }
                }

                src0_clone->nb[0] = src0->nb[0];
                src0_clone->nb[1] = src0->nb[1];
                for (int i = 2; i < GGML_MAX_DIMS; i++) {
                    src0_clone->nb[i] = src0_clone->nb[i - 1]*src0_clone->ne[i - 1];
                }
            } else {
                if (offset + src0_size >= buffer_gpu->size) {
                    src0_size = buffer_gpu->size - offset;
                }
                ggml_vk_buffer_read(ctx, buffer_gpu, offset, src0_clone->data, src0_size);
                memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
            }
        } else {
            GGML_ASSERT(false);
        }

        if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
            ggml_vk_print_tensor(ctx, src0, "src0");
        }
    }
    if (src1 != nullptr) {
        src1_clone = ggml_dup_tensor(ggml_ctx, src1);

        src1_size = ggml_nbytes(src1);

        src1_buffer = malloc(src1_size);
        src1_clone->data = src1_buffer;
        if (ggml_backend_buffer_is_host(src1->buffer)) {
            memcpy(src1_clone->data, src1->data, src1_size);
            memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
        } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
            uint64_t offset = extra->offset + src1->view_offs;
            if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
                for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                    for (int i2 = 0; i2 < src1->ne[2]; i2++) {
                        const int idx = i3*src1->ne[2] + i2;
                        ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]);
                    }
                }

                src1_clone->nb[0] = src1->nb[0];
                src1_clone->nb[1] = src1->nb[1];
                for (int i = 2; i < GGML_MAX_DIMS; i++) {
                    src1_clone->nb[i] = src1_clone->nb[i - 1]*src1_clone->ne[i - 1];
                }
            } else {
                if (offset + src1_size >= buffer_gpu->size) {
                    src1_size = buffer_gpu->size - offset;
                }
                ggml_vk_buffer_read(ctx, buffer_gpu, offset, src1_clone->data, src1_size);
                memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
            }
        } else {
            GGML_ASSERT(false);
        }

        if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
            ggml_vk_print_tensor(ctx, src1, "src1");
            std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
            std::cerr << "src1_clone=" << src1_clone << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
            if (src1->src[0] != nullptr) {
                std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
            }
            if (src1->src[1] != nullptr) {
                std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
            }
            std::cerr << std::endl << "Result (i2 = 0):" << std::endl;
            ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
            std::cerr << std::endl;
            std::cerr << std::endl << "Result (i2 = 1):" << std::endl;
            ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 1, 0);
            std::cerr << std::endl;
            std::vector<const ggml_tensor *> done;
            ggml_vk_print_graph_origin(src1_clone, done);
        }
    }
    if (src2 != nullptr) {
        src2_clone = ggml_dup_tensor(ggml_ctx, src2);

        src2_size = ggml_nbytes(src2);

        src2_buffer = malloc(src2_size);
        src2_clone->data = src2_buffer;
        if (ggml_backend_buffer_is_host(src2->buffer)) {
            memcpy(src2_clone->data, src2->data, src2_size);
            memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
        } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
            uint64_t offset = extra->offset + src2->view_offs;
            if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
                for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                    for (int i2 = 0; i2 < src2->ne[2]; i2++) {
                        const int idx = i3*src2->ne[2] + i2;
                        ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
                    }
                }

                src2_clone->nb[0] = src2->nb[0];
                src2_clone->nb[1] = src2->nb[1];
                for (int i = 2; i < GGML_MAX_DIMS; i++) {
                    src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1];
                }
            } else {
                if (offset + src2_size >= buffer_gpu->size) {
                    src2_size = buffer_gpu->size - offset;
                }
                ggml_vk_buffer_read(ctx, buffer_gpu, offset, src2_clone->data, src2_size);
                memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
            }
        } else {
            GGML_ASSERT(false);
        }

        if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
            ggml_vk_print_tensor(ctx, src2, "src2");
            std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
            std::cerr << "src2_clone=" << src2_clone << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
            if (src2->src[0] != nullptr) {
                std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
            }
            if (src2->src[1] != nullptr) {
                std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
            }
            std::cerr << std::endl << "Result (i2 = 0):" << std::endl;
            ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
            std::cerr << std::endl;
            std::cerr << std::endl << "Result (i2 = 1):" << std::endl;
            ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 1, 0);
            std::cerr << std::endl;
            std::vector<const ggml_tensor *> done;
            ggml_vk_print_graph_origin(src2_clone, done);
        }
    }
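    // Rebuild the node against the CPU clones so the default ggml backend can
    // compute a reference result: each op supported by this backend is mapped
    // back to the ggml_* builder that created it.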
    if (tensor->op == GGML_OP_MUL_MAT) {
        tensor_clone = ggml_mul_mat(ggml_ctx, src0_clone, src1_clone);
    } else if (tensor->op == GGML_OP_MUL_MAT_ID) {
        tensor_clone = ggml_mul_mat_id(ggml_ctx, src0_clone, src1_clone, src2_clone);
    } else if (tensor->op == GGML_OP_MUL) {
        tensor_clone = ggml_mul(ggml_ctx, src0_clone, src1_clone);
    } else if (tensor->op == GGML_OP_DIV) {
        tensor_clone = ggml_div(ggml_ctx, src0_clone, src1_clone);
    } else if (tensor->op == GGML_OP_SCALE) {
        tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]);
    } else if (tensor->op == GGML_OP_SQR) {
        tensor_clone = ggml_sqr(ggml_ctx, src0_clone);
    } else if (tensor->op == GGML_OP_CLAMP) {
        tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
    } else if (tensor->op == GGML_OP_ADD) {
        tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone);
    } else if (tensor->op == GGML_OP_NORM) {
        tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
    } else if (tensor->op == GGML_OP_RMS_NORM) {
        tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
    } else if (tensor->op == GGML_OP_SOFT_MAX) {
        if (src1 != nullptr) {
            tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
        } else {
            tensor_clone = ggml_soft_max(ggml_ctx, src0_clone);
        }
    } else if (tensor->op == GGML_OP_DIAG_MASK_INF) {
        tensor_clone = ggml_diag_mask_inf(ggml_ctx, src0_clone, *(int *)tensor->op_params);
    } else if (tensor->op == GGML_OP_ROPE) {
        const int n_dims          = ((int32_t *) tensor->op_params)[1];
        const int mode            = ((int32_t *) tensor->op_params)[2];
        //const int n_ctx_ggml    = ((int32_t *) tensor->op_params)[3];
        const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4];
        float freq_base   = ((float *) tensor->op_params)[5];
        float freq_scale  = ((float *) tensor->op_params)[6];
        float ext_factor  = ((float *) tensor->op_params)[7];
        float attn_factor = ((float *) tensor->op_params)[8];
        float beta_fast   = ((float *) tensor->op_params)[9];
        float beta_slow   = ((float *) tensor->op_params)[10];
        tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
    } else if (tensor->op == GGML_OP_UNARY) {
        switch (ggml_get_unary_op(tensor)) {
            case GGML_UNARY_OP_SILU:
                tensor_clone = ggml_silu(ggml_ctx, src0_clone);
                break;
            case GGML_UNARY_OP_GELU:
                tensor_clone = ggml_gelu(ggml_ctx, src0_clone);
                break;
            case GGML_UNARY_OP_RELU:
                tensor_clone = ggml_relu(ggml_ctx, src0_clone);
                break;
            default:
                std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
                GGML_ASSERT(false);
        }
    } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
        if (src1 == nullptr) {
            tensor_clone = ggml_dup(ggml_ctx, src0_clone);
            tensor_clone->type = tensor->type;
        } else {
            tensor_clone = ggml_cpy(ggml_ctx, src0_clone, src1_clone);
        }
    } else if (tensor->op == GGML_OP_CONT) {
        tensor_clone = ggml_cont_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
    } else if (tensor->op == GGML_OP_RESHAPE) {
        tensor_clone = ggml_reshape_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
    } else if (tensor->op == GGML_OP_VIEW) {
        tensor_clone = ggml_view_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]);
    } else if (tensor->op == GGML_OP_PERMUTE) {
        int32_t * params = (int32_t *)tensor->op_params;
        tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]);
    } else if (tensor->op == GGML_OP_TRANSPOSE) {
        tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
    } else if (tensor->op == GGML_OP_GET_ROWS) {
        tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone);
    } else if (tensor->op == GGML_OP_ARGSORT) {
        tensor_clone = ggml_argsort(ggml_ctx, src0_clone, (ggml_sort_order) *(int *)tensor->op_params);
    } else if (tensor->op == GGML_OP_SUM_ROWS) {
        tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone);
    } else {
        std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
        GGML_ASSERT(false);
    }

    ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
    ggml_build_forward_expand(cgraph, tensor_clone);

    ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);

    if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
        ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
    }

    comp_size = ggml_nbytes(tensor_clone);

    comp_result = malloc(comp_size);
    memcpy(comp_result, tensor_clone->data, comp_size);
    memcpy(comp_nb, tensor_clone->nb, sizeof(size_t) * GGML_MAX_DIMS);

    if (src0 != nullptr) {
        free(src0_buffer);
    }
    if (src1 != nullptr) {
        free(src1_buffer);
    }
    if (src2 != nullptr) {
        free(src2_buffer);
    }

    ggml_free(ggml_ctx);
}
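// ggml_vk_check_results_1() applies the same filtering as phase 0 above:
// vk_skip_checks and vk_output_tensor (set elsewhere in this file, typically
// from the GGML_VULKAN_SKIP_CHECKS and GGML_VULKAN_OUTPUT_TENSOR environment
// variables) control which checks run and which tensor gets a full debug dump.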
static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor) {
    if (params->ith != 0) {
        return;
    }
    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
        return;
    }
    if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
        return;
    }

    VK_LOG_DEBUG("ggml_vk_check_results_1(" << tensor->name << ")");

    ggml_tensor * src0 = tensor->src[0];
    ggml_tensor * src1 = tensor->src[1];
    ggml_tensor * src2 = tensor->src[2];

    void * tensor_data = tensor->data;

    if (ggml_backend_buffer_is_vk(tensor->buffer)) {
        size_t tensor_size = ggml_nbytes(tensor);
        tensor_data = malloc(tensor_size);

        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

        vk_buffer buffer_gpu = extra->buffer_gpu.lock();
        if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
            tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
        }

        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
    }

    float first_error_result = -1.0f;
    float first_error_correct = -1.0f;
    std::array<int, 4> first_error = { -1, -1, -1, -1 };
    double avg_err = 0.0;
    size_t counter = 0;

    for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
        for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
            for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
                for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
                    const bool buffer_size_fit = i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0] < comp_size;
                    float correct = 0.0f;
                    float result = 0.0f;

                    if (buffer_size_fit) {
                        if (tensor->type == GGML_TYPE_F32) {
                            correct = *(float *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
                            result  = *(float *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
                        } else if (tensor->type == GGML_TYPE_F16) {
                            correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
                            result  = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
                        } else if (tensor->type == GGML_TYPE_I32) {
                            correct = *(int32_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
                            result  = *(int32_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
                        } else {
                            std::cerr << "Results check not implemented for type " << ggml_type_name(tensor->type) << std::endl;
                        }
                    } else {
                        std::cerr << "Error: reference result buffer too small for " << ggml_op_name(tensor->op) << " at i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << std::endl;
                        GGML_ASSERT(false);
                    }
tensor->nb[3] << " offset=" << tensor->view_offs << std::endl; 7076 if (src0 != nullptr) { 7077 std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl; 7078 } 7079 if (src1 != nullptr) { 7080 std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl; 7081 } 7082 if (src2 != nullptr) { 7083 std::cerr << "src2=" << src2 << " src2->name=" << src2->name << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl; 7084 } 7085 std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl; 7086 std::cerr << std::endl << "Result:" << std::endl; 7087 ggml_vk_print_tensor_area(tensor, tensor_data, i0, i1, i2, i3); 7088 std::cerr << std::endl << "Correct:" << std::endl; 7089 ggml_vk_print_tensor_area(tensor, comp_result, i0, i1, i2, i3); 7090 std::cerr << std::endl; 7091 std::vector<const ggml_tensor *> done; 7092 ggml_vk_print_graph_origin(tensor, done); 7093 GGML_ASSERT(false); 7094 } 7095 if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) { 7096 first_error[0] = i0; 7097 first_error[1] = i1; 7098 first_error[2] = i2; 7099 first_error[3] = i3; 7100 first_error_result = result; 7101 first_error_correct = correct; 7102 } 7103 7104 // Special case, value is infinite, avoid NaN result in avg_err 7105 // NaN also appears in results, if both are nan error is 0 7106 if (!std::isinf(correct) && !std::isinf(result) && !std::isnan(correct) && !std::isnan(result)) { 7107 avg_err += std::fabs(correct - result); 7108 } 7109 counter++; 7110 } 7111 } 7112 } 7113 } 7114 7115 avg_err /= counter; 7116 7117 if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { 7118 std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl; 7119 std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl; 7120 if (src0 != nullptr) { 7121 std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] 
<< " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl; 7122 } 7123 if (src1 != nullptr) { 7124 std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl; 7125 } 7126 if (src2 != nullptr) { 7127 std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl; 7128 } 7129 std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl; 7130 std::cerr << std::endl << "Result:" << std::endl; 7131 ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0); 7132 std::cerr << std::endl << "Correct:" << std::endl; 7133 ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 0, 0); 7134 std::cerr << std::endl; 7135 std::cerr << std::endl << "Result:" << std::endl; 7136 ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0); 7137 std::cerr << std::endl << "Correct:" << std::endl; 7138 ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 1, 0); 7139 std::cerr << std::endl; 7140 std::vector<const ggml_tensor *> done; 7141 ggml_vk_print_graph_origin(tensor, done); 7142 } 7143 7144 if (avg_err > 0.05 || std::isnan(avg_err)) { 7145 std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl; 7146 std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl; 7147 if (src0 != nullptr) { 7148 std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl; 7149 } 7150 if (src1 != nullptr) { 7151 std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl; 7152 } 7153 if (src2 != nullptr) { 7154 std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl; 7155 } 7156 
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl; 7157 std::cerr << std::endl << "Result:" << std::endl; 7158 ggml_vk_print_tensor_area(tensor, tensor_data, first_error[0], first_error[1], first_error[2], first_error[3]); 7159 std::cerr << std::endl << "Correct:" << std::endl; 7160 ggml_vk_print_tensor_area(tensor, comp_result, first_error[0], first_error[1], first_error[2], first_error[3]); 7161 std::cerr << std::endl; 7162 std::vector<const ggml_tensor *> done; 7163 ggml_vk_print_graph_origin(tensor, done); 7164 GGML_ASSERT(false); 7165 } else { 7166 std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl; 7167 } 7168 7169 free(comp_result); 7170 comp_result = nullptr; 7171 comp_size = 0; 7172 7173 if (ggml_backend_buffer_is_vk(tensor->buffer)) { 7174 free(tensor_data); 7175 } 7176 } 7177 #endif