#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-backend-impl.h"
#include "ggml-kompute.h"

// These are generated at build time by cmake custom command
#include "shaderop_scale.h"
#include "shaderop_scale_8.h"
#include "shaderop_add.h"
#include "shaderop_addrow.h"
#include "shaderop_mul.h"
#include "shaderop_silu.h"
#include "shaderop_relu.h"
#include "shaderop_gelu.h"
#include "shaderop_softmax.h"
#include "shaderop_norm.h"
#include "shaderop_rmsnorm.h"
#include "shaderop_diagmask.h"
#include "shaderop_mul_mat_f16.h"
#include "shaderop_mul_mat_q8_0.h"
#include "shaderop_mul_mat_q4_0.h"
#include "shaderop_mul_mat_q4_1.h"
#include "shaderop_mul_mat_q6_k.h"
#include "shaderop_mul_mat_mat_f32.h"
#include "shaderop_getrows_f32.h"
#include "shaderop_getrows_f16.h"
#include "shaderop_getrows_q4_0.h"
#include "shaderop_getrows_q4_1.h"
#include "shaderop_getrows_q6_k.h"
#include "shaderop_rope_f16.h"
#include "shaderop_rope_f32.h"
#include "shaderop_cpy_f16_f16.h"
#include "shaderop_cpy_f16_f32.h"
#include "shaderop_cpy_f32_f16.h"
#include "shaderop_cpy_f32_f32.h"

#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include <kompute/Kompute.hpp>
#include <vulkan/vulkan.hpp>

#ifdef __linux__
#include <cstdlib> // for setenv
#endif

#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define QK_NL 16

typedef ggml_fp16_t half;

static std::string ggml_kompute_format_name(int device) {
    return "Kompute" + std::to_string(device);
}

struct ggml_kompute_context {
    int device;
    std::string name;
    std::shared_ptr<vk::DescriptorPool> pool;

    ggml_kompute_context(int device)
        : device(device), name(ggml_kompute_format_name(device)) {}
};

// FIXME: It would be good to consolidate the kompute manager and the kompute context into one object
// and consolidate the init functions and simplify object lifetime management. As it currently stands,
// we *have* to have the kompute manager no matter what for device discovery, but the kompute context
// is only created when a device is set and vulkan is explicitly turned on.
static ggml_kompute_context *s_kompute_context = nullptr;

class kompute_manager {
    kp::Manager *s_mgr = nullptr;

public:
    kp::Manager *operator()() {
        if (s_mgr && !s_mgr->hasInstance()) {
            destroy();
        }
        if (!s_mgr) {
            s_mgr = new kp::Manager;
        }
        return s_mgr;
    }

    void destroy() {
        delete s_mgr;
        s_mgr = nullptr;
    }
};

static kompute_manager komputeManager;
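// Illustrative usage, not part of the backend API: komputeManager is an accessor object rather
// than a bare global, so every call site goes through operator(), which lazily (re)creates the
// kp::Manager when there is none or when its Vulkan instance has gone away:
//
//     auto devices = komputeManager()->listDevices(); // first call constructs the manager
//     komputeManager.destroy();                       // explicit teardown; the next call re-creates it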
struct ggml_vk_memory {
    void *data = nullptr;
    size_t size = 0;
    vk::DeviceMemory *primaryMemory = nullptr;
    vk::Buffer *primaryBuffer = nullptr;
    vk::DeviceMemory *stagingMemory = nullptr;
    vk::Buffer *stagingBuffer = nullptr;
};

#ifdef __linux__
__attribute__((constructor))
static void enable_sam() {
    setenv("RADV_PERFTEST", "sam", false);
}
#endif

static bool ggml_vk_checkPhysicalDeviceFeatures(vk::PhysicalDevice physical_device) {
    vk::PhysicalDeviceFeatures availableFeatures;
    physical_device.getFeatures(&availableFeatures);

    if (!availableFeatures.shaderInt16)
        return false;

    vk::PhysicalDeviceVulkan11Features availableFeatures11;
    vk::PhysicalDeviceVulkan12Features availableFeatures12;

    availableFeatures11.pNext = &availableFeatures12;
    availableFeatures12.pNext = nullptr;

    vk::PhysicalDeviceFeatures2 features2;
    features2.pNext = &availableFeatures11;

    physical_device.getFeatures2(&features2);

    if (!availableFeatures11.uniformAndStorageBuffer16BitAccess ||
        !availableFeatures11.storageBuffer16BitAccess) {
        return false;
    }

    if (!availableFeatures12.storageBuffer8BitAccess ||
        !availableFeatures12.uniformAndStorageBuffer8BitAccess ||
        !availableFeatures12.shaderFloat16 ||
        !availableFeatures12.shaderInt8) {
        return false;
    }

    return true;
}

static const char * ggml_vk_getVendorName(uint32_t vendorID) {
    switch (vendorID) {
        case 0x10DE:
            return "nvidia";
        case 0x1002:
            return "amd";
        case 0x8086:
            return "intel";
        default:
            return "unknown";
    }
}

static std::vector<ggml_vk_device> ggml_vk_available_devices_internal(size_t memoryRequired) {
    std::vector<ggml_vk_device> results;
    if (!komputeManager()->hasVulkan() || !komputeManager()->hasInstance())
        return results;

    std::vector<vk::PhysicalDevice> physical_devices;
    try {
        physical_devices = komputeManager()->listDevices();
    } catch (vk::SystemError & err) {
        std::cerr << __func__ << ": ignoring Vulkan exception: " << err.what() << "\n";
        return results;
    }

    uint32_t deviceCount = physical_devices.size();
    if (deviceCount == 0)
        return results;

    std::unordered_map<std::string, size_t> count_by_name;

    for (uint32_t i = 0; i < deviceCount; i++) {
        const auto & physical_device = physical_devices[i];

        VkPhysicalDeviceProperties dev_props = physical_device.getProperties();
        VkPhysicalDeviceMemoryProperties memoryProperties = physical_device.getMemoryProperties();
        const uint32_t major = VK_VERSION_MAJOR(dev_props.apiVersion);
        const uint32_t minor = VK_VERSION_MINOR(dev_props.apiVersion);
        if (major < 1 || minor < 2)
            continue;

        if (!ggml_vk_checkPhysicalDeviceFeatures(physical_device))
            continue;

        size_t heapSize = 0;
        for (uint32_t j = 0; j < memoryProperties.memoryHeapCount; ++j) {
            VkMemoryHeap heap = memoryProperties.memoryHeaps[j];
            if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
                heapSize = heap.size;
                break;
            }
        }

        if (heapSize < memoryRequired)
            continue;

        auto ext_props = physical_device.enumerateDeviceExtensionProperties();
        bool has_maintenance4 = false;

        // Check if maintenance4 is supported
        for (const auto & properties : ext_props) {
            if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
                has_maintenance4 = true;
            }
        }

        vk::PhysicalDeviceSubgroupProperties subgroup_props;
        vk::PhysicalDeviceProperties2 dev_props2;
        vk::PhysicalDeviceMaintenance3Properties dev_props3;
        vk::PhysicalDeviceMaintenance4Properties dev_props4;
        dev_props2.pNext = &dev_props3;
        dev_props3.pNext = &subgroup_props;
        if (has_maintenance4) {
            subgroup_props.pNext = &dev_props4;
        }
        physical_device.getProperties2(&dev_props2);

        if (subgroup_props.subgroupSize < 32)
            continue;

        ggml_vk_device d;
        d.index = i;
        d.type = dev_props.deviceType;
        d.heapSize = heapSize;
        d.vendor = strdup(ggml_vk_getVendorName(dev_props.vendorID));
        d.subgroupSize = subgroup_props.subgroupSize;
        d.bufferAlignment = dev_props.limits.minStorageBufferOffsetAlignment;

        if (has_maintenance4) {
            d.maxAlloc = std::min(dev_props3.maxMemoryAllocationSize, dev_props4.maxBufferSize);
        } else {
            d.maxAlloc = dev_props3.maxMemoryAllocationSize;
        }

        std::string name(dev_props.deviceName);
        size_t n_idx = ++count_by_name[name];
        if (n_idx > 1) {
            name += " (" + std::to_string(n_idx) + ")";
        }
        d.name = strdup(name.c_str());

        results.push_back(d);
    }

    std::stable_sort(results.begin(), results.end(),
        [](const ggml_vk_device& lhs, const ggml_vk_device& rhs) -> bool {
            if (lhs.type != rhs.type) {
                if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return true;
                if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return false;

                if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return true;
                if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return false;
            }
            return lhs.heapSize < rhs.heapSize;
        }
    );

    return results;
}

// public API returns a C-style array
ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count) {
    auto devices = ggml_vk_available_devices_internal(memoryRequired);
    *count = devices.size();
    if (devices.empty()) {
        return nullptr;
    }

    size_t nbytes = sizeof (ggml_vk_device) * (devices.size());
    auto * arr = static_cast<ggml_vk_device *>(malloc(nbytes));
    memcpy(arr, devices.data(), nbytes);
    return arr;
}
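// Illustrative caller-side sketch (not part of this file): the array above is malloc'd, so the
// caller owns it and releases it with free(); the per-device name/vendor strings are strdup'd
// separately:
//
//     size_t count = 0;
//     ggml_vk_device * devs = ggml_vk_available_devices(0 /* no minimum VRAM */, &count);
//     for (size_t i = 0; i < count; ++i) {
//         printf("%s (%s), heap %zu bytes\n", devs[i].name, devs[i].vendor, (size_t) devs[i].heapSize);
//     }
//     free(devs);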
static void ggml_vk_filterByVendor(std::vector<ggml_vk_device>& devices, const std::string& targetVendor) {
    devices.erase(
        std::remove_if(devices.begin(), devices.end(),
            [&targetVendor](const ggml_vk_device& device) {
                return device.vendor != targetVendor;
            }),
        devices.end()
    );
}

static void ggml_vk_filterByName(std::vector<ggml_vk_device>& devices, const std::string& targetName) {
    devices.erase(
        std::remove_if(devices.begin(), devices.end(),
            [&targetName](const ggml_vk_device& device) {
                return device.name != targetName;
            }),
        devices.end()
    );
}

static bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const std::string & name) {
    if (name.empty())
        return false;

    auto devices = ggml_vk_available_devices_internal(memoryRequired);
    if (name == "amd" || name == "nvidia" || name == "intel") {
        ggml_vk_filterByVendor(devices, name);
    } else if (name != "gpu") {
        ggml_vk_filterByName(devices, name);
    }

    if (devices.empty())
        return false;

    *device = devices.front();
    return true;
}

bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const char * name) {
    return ggml_vk_get_device(device, memoryRequired, std::string(name));
}

bool ggml_vk_has_vulkan() {
    return komputeManager()->hasVulkan();
}

bool ggml_vk_has_device() {
    return komputeManager()->hasDevice();
}

ggml_vk_device ggml_vk_current_device() {
    if (!komputeManager()->hasDevice())
        return ggml_vk_device();

    auto devices = ggml_vk_available_devices_internal(0);
    ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName.data());
    GGML_ASSERT(!devices.empty());
    return devices.front();
}

static
void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) {
    std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
        vk::DescriptorPoolSize(
            vk::DescriptorType::eStorageBuffer,
            3 * size // Descriptor count is number of possible tensors to pass into an algorithm
        )
    };

    vk::DescriptorPoolCreateInfo descriptorPoolInfo(
        vk::DescriptorPoolCreateFlags(),
        size, // Max sets
        static_cast<uint32_t>(descriptorPoolSizes.size()),
        descriptorPoolSizes.data());

    ctx->pool = std::make_shared<vk::DescriptorPool>();
    vk::Result r = komputeManager()->device()->createDescriptorPool(
        &descriptorPoolInfo, nullptr, ctx->pool.get());
    if (r != vk::Result::eSuccess)
        std::cerr << "Error allocating descriptor pool" << vk::to_string(r);
}

static
void ggml_vk_free_descriptor_pool(struct ggml_kompute_context * ctx) {
    if (ctx->pool) {
        komputeManager()->device()->destroy(
            *ctx->pool,
            (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        ctx->pool = nullptr;
    }
}
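// Sizing note (derived from the code above, not new behavior): each algorithm recorded by this
// backend binds at most three storage buffers (inA, inB, out), so a pool sized for `size`
// dispatches reserves `size` descriptor sets and 3 * size eStorageBuffer descriptors; e.g. a
// 512-node graph asks for 512 sets and 1536 descriptors, and two-tensor ops simply leave part
// of that headroom unused.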
static
vk::Buffer *ggml_vk_allocate_buffer(size_t size) {
    vk::BufferCreateInfo bufferCreateInfo;
    bufferCreateInfo.size = size;
    bufferCreateInfo.usage = vk::BufferUsageFlagBits::eStorageBuffer |
                             vk::BufferUsageFlagBits::eTransferSrc |
                             vk::BufferUsageFlagBits::eTransferDst;
    bufferCreateInfo.sharingMode = vk::SharingMode::eExclusive;

    vk::Buffer *vkBuffer = new vk::Buffer;
    vk::Result r = komputeManager()->device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer);
    if (r != vk::Result::eSuccess)
        std::cerr << "Error allocating buffer " << vk::to_string(r) << std::endl;
    return vkBuffer;
}

static
vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, vk::MemoryRequirements requirements, bool *isHostVisible) {

    uint32_t memoryTypeIndex = -1;
    bool memoryTypeIndexFound = false;
    vk::PhysicalDeviceMemoryProperties memoryProperties = komputeManager()->physicalDevice()->getMemoryProperties();
    for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) {
        const vk::MemoryType &memoryType = memoryProperties.memoryTypes[i];
        const vk::MemoryHeap &memoryHeap = memoryProperties.memoryHeaps[memoryType.heapIndex];
        if (memoryHeap.size < size) {
            continue;
        }

        if (requirements.memoryTypeBits & (1 << i)) {
            if (((memoryProperties.memoryTypes[i]).propertyFlags &
                 flags) == flags) {
                memoryTypeIndex = i;
                memoryTypeIndexFound = true;
                if (isHostVisible && (memoryProperties.memoryTypes[i].propertyFlags & vk::MemoryPropertyFlagBits::eHostVisible)) {
                    *isHostVisible = true;
                }
                break;
            }
        }
    }
    if (!memoryTypeIndexFound) {
        throw std::runtime_error(
            "Memory type index for buffer creation not found");
    }

    vk::MemoryAllocateInfo allocInfo;
    allocInfo.allocationSize = size;
    allocInfo.memoryTypeIndex = memoryTypeIndex;
    vk::DeviceMemory *vkDeviceMemory = new vk::DeviceMemory;
    vk::Result r = komputeManager()->device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory);
    if (r != vk::Result::eSuccess) {
        std::cerr << "Error allocating memory " << vk::to_string(r) << std::endl;
        throw std::runtime_error("Error allocating vulkan memory.");
    }
    return vkDeviceMemory;
}

static size_t ggml_vk_aligned_offset(ggml_backend_buffer_t buffer, size_t offset) {
    size_t minStorageBufferOffsetAlignment = ggml_backend_buffer_get_alignment(buffer);

    // If offset is already aligned, return it directly
    if (offset % minStorageBufferOffsetAlignment == 0) {
        return offset;
    }

    // Otherwise, return the largest multiple of minStorageBufferOffsetAlignment less than offset
    return (offset / minStorageBufferOffsetAlignment) * minStorageBufferOffsetAlignment;
}
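// Worked example (illustrative): with a 256-byte storage-buffer alignment, an offset of 300 is
// rounded down to 256; the remaining 44 bytes are re-applied inside the shader through the
// *Off push constants (see ggml_vk_get_tensor below, which returns that delta via its
// alignedOffset out-parameter). An already-aligned offset such as 512 is returned unchanged.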
static ggml_vk_memory ggml_vk_allocate(size_t size) {
    ggml_vk_memory memory;
    bool isHostVisible = false;
    {
        memory.primaryBuffer = ggml_vk_allocate_buffer(size);
        vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.primaryBuffer);
        vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eDeviceLocal;
        memory.primaryMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
        komputeManager()->device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0);
        if (isHostVisible) {
            vk::Result r = komputeManager()->device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
            if (r != vk::Result::eSuccess)
                std::cerr << "Error mapping memory" << vk::to_string(r);
        }
    }

    if (!isHostVisible) {
        memory.stagingBuffer = ggml_vk_allocate_buffer(size);
        vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.stagingBuffer);
        vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eHostVisible |
                                                      vk::MemoryPropertyFlagBits::eHostCoherent |
                                                      vk::MemoryPropertyFlagBits::eHostCached;
        memory.stagingMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
        komputeManager()->device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0);
        vk::Result r = komputeManager()->device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
        if (r != vk::Result::eSuccess)
            std::cerr << "Error mapping memory" << vk::to_string(r);
    }

    memory.size = size;
    return memory;
}

static void ggml_vk_free_memory(ggml_vk_memory &memory)
{
    komputeManager()->device()->destroy(
        *memory.primaryBuffer,
        (vk::Optional<const vk::AllocationCallbacks>)nullptr);
    if (memory.stagingBuffer) {
        komputeManager()->device()->destroy(
            *memory.stagingBuffer,
            (vk::Optional<const vk::AllocationCallbacks>)nullptr);
    }
    komputeManager()->device()->freeMemory(
        *memory.primaryMemory,
        (vk::Optional<const vk::AllocationCallbacks>)nullptr);
    if (memory.stagingMemory) {
        komputeManager()->device()->freeMemory(
            *memory.stagingMemory,
            (vk::Optional<const vk::AllocationCallbacks>)nullptr);
    }
}

static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft);

static
ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & offset) {
    ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;

    // compatibility with ggml-backend
    GGML_ASSERT(buffer && buffer->buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name);

    ggml_vk_memory * buf_ctx = static_cast<ggml_vk_memory *>(buffer->context);

    const intptr_t ioffs = intptr_t(t->data) - intptr_t(buf_ctx->data);

    GGML_ASSERT(ioffs >= 0 && ioffs + int64_t(ggml_nbytes(t)) <= int64_t(buffer->size));

    offset = uint64_t(ioffs);
    return buf_ctx;
}

static
const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(const struct ggml_tensor * t, uint32_t * alignedOffset = nullptr) {
    uint64_t originalOffset = 0;
    auto * res = ggml_vk_find_tensor(t, originalOffset);
    if (!res) {
        static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
        return nullTensor;
    }

    // Create a tensor whose memory will be composed of our buffers at the correct offset
    const size_t nelements = ggml_nelements(t);
    size_t nbytes = ggml_nbytes(t);

    size_t vulkanOffset = ggml_vk_aligned_offset(t->buffer, originalOffset);
    if (alignedOffset) {
        *alignedOffset = originalOffset - vulkanOffset;
        nbytes += *alignedOffset;
    }

    return komputeManager()->tensor(
        t->data,
        nelements,
        nbytes, kp::Tensor::TensorDataTypes::eFloat,
        res->primaryMemory, res->primaryBuffer,
        res->stagingMemory, res->stagingBuffer,
        vulkanOffset);
}

static std::vector<uint32_t> getSpirvShader(const unsigned char* rawData, size_t size) {
    if (size % sizeof(uint32_t) != 0) {
        throw std::runtime_error("Invalid size: must be divisible by sizeof(uint32_t)");
    }

    const uint32_t* data_ptr = reinterpret_cast<const uint32_t*>(rawData);
    size_t count = size / sizeof(uint32_t);
    return std::vector<uint32_t>(data_ptr, data_ptr + count);
}

inline static
uint32_t safe_divide(uint32_t a, uint32_t b) {
    if (b <= 1) {
        return a;
    }
    if ((a % b) != 0) {
        fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
        GGML_ASSERT(!"safe_divide result would've had remainder");
    }
    return a / b;
}
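// Usage note (illustrative): the *Off arguments handed to the op recorders below are byte
// offsets into the backing buffer, while the shaders index typed arrays, so safe_divide()
// converts bytes to element indices and aborts on misalignment instead of silently corrupting
// data: safe_divide(256, 4) == 64, whereas safe_divide(258, 4) trips the assertion.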
static void ggml_vk_add(
    kp::Sequence& seq,
    const std::shared_ptr<kp::Tensor>& inA,
    const std::shared_ptr<kp::Tensor>& inB,
    const std::shared_ptr<kp::Tensor>& out,
    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
    int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
    int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03,
    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
    int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13,
    int32_t ne0,
    int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3
) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_add_comp_spv,
        kp::shader_data::op_add_comp_spv_len);

    struct PushConstants {
        uint32_t inAOff, inBOff, outOff;
        int32_t ne00;
        int32_t nb00, nb01, nb02, nb03;
        int32_t ne10, ne11, ne12, ne13;
        int32_t nb10, nb11, nb12, nb13;
        int32_t ne0;
        int32_t nb0, nb1, nb2, nb3;
    } const pushConsts {
        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
        ne00,
        nb00, nb01, nb02, nb03,
        ne10, ne11, ne12, ne13,
        nb10, nb11, nb12, nb13,
        ne0,
        nb0, nb1, nb2, nb3
    };

    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
    if (!komputeManager()->hasAlgorithm(__func__)) {
        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
    } else {
        s_algo = komputeManager()->getAlgorithm(__func__);
        s_algo->setTensors({inA, inB, out});
        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
        s_algo->setPushConstants<PushConstants>({pushConsts});
        s_algo->updateDescriptors(s_kompute_context->pool.get());
    }
    seq.record<kp::OpAlgoDispatch>(s_algo);
}
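// Pattern note (sketch, applies to every op recorder below): the compiled pipeline is cached in
// the kp::Manager under a stable name, __func__ here and a type-suffixed name (e.g.
// "ggml_vk_xxlu_silu") where one recorder serves several shaders. The first call builds the
// algorithm from SPIR-V; later calls fetch it with getAlgorithm() and only rebind the tensors,
// workgroup size and push constants before recording another dispatch.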
static void ggml_vk_addrow(kp::Sequence& seq,
                           const std::shared_ptr<kp::Tensor>& inA,
                           const std::shared_ptr<kp::Tensor>& inB,
                           const std::shared_ptr<kp::Tensor>& out,
                           uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
                           uint32_t size, uint32_t row = 0) {

    const static auto spirv = getSpirvShader(kp::shader_data::op_addrow_comp_spv,
        kp::shader_data::op_addrow_comp_spv_len);

    struct PushConstants {
        uint32_t inAOff, inBOff, outOff;
        uint32_t row;
    } const pushConsts {
        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
        row
    };

    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
    if (!komputeManager()->hasAlgorithm(__func__))
        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
    else {
        s_algo = komputeManager()->getAlgorithm(__func__);
        s_algo->setTensors({inA, inB, out});
        s_algo->setWorkgroup({size});
        s_algo->setPushConstants<PushConstants>({pushConsts});
        s_algo->updateDescriptors(s_kompute_context->pool.get());
    }
    seq.record<kp::OpAlgoDispatch>(s_algo);
}

static void ggml_vk_mul(
    kp::Sequence& seq,
    const std::shared_ptr<kp::Tensor>& inA,
    const std::shared_ptr<kp::Tensor>& inB,
    const std::shared_ptr<kp::Tensor>& out,
    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
    int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
    int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03,
    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
    int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13,
    int32_t ne0,
    int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3
) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_comp_spv,
        kp::shader_data::op_mul_comp_spv_len);

    struct PushConstants {
        uint32_t inAOff, inBOff, outOff;
        int32_t ne00;
        int32_t nb00, nb01, nb02, nb03;
        int32_t ne10, ne11, ne12, ne13;
        int32_t nb10, nb11, nb12, nb13;
        int32_t ne0;
        int32_t nb0, nb1, nb2, nb3;
    } const pushConsts {
        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
        ne00,
        nb00, nb01, nb02, nb03,
        ne10, ne11, ne12, ne13,
        nb10, nb11, nb12, nb13,
        ne0,
        nb0, nb1, nb2, nb3
    };

    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
    if (!komputeManager()->hasAlgorithm(__func__)) {
        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
    } else {
        s_algo = komputeManager()->getAlgorithm(__func__);
        s_algo->setTensors({inA, inB, out});
        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
        s_algo->setPushConstants<PushConstants>({pushConsts});
        s_algo->updateDescriptors(s_kompute_context->pool.get());
    }
    seq.record<kp::OpAlgoDispatch>(s_algo);
}

static void ggml_vk_scale(kp::Sequence& seq,
                          const std::shared_ptr<kp::Tensor>& in,
                          const std::shared_ptr<kp::Tensor>& out,
                          uint32_t inOff, uint32_t outOff,
                          uint32_t size, float scale) {
    const static auto spirv_1 = getSpirvShader(
        kp::shader_data::op_scale_comp_spv, kp::shader_data::op_scale_comp_spv_len
    );
    const static auto spirv_8 = getSpirvShader(
        kp::shader_data::op_scale_8_comp_spv, kp::shader_data::op_scale_8_comp_spv_len
    );

    struct PushConstants {
        uint32_t inOff, outOff;
        float scale;
    } const pushConsts {
        safe_divide(inOff, 4), safe_divide(outOff, 4),
        scale
    };

    const auto * spirv = &spirv_1;
    std::string name(__func__);
    if (size % 8 == 0) {
        size /= 8;
        name += "_8";
        spirv = &spirv_8;
    }

    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
    if (!komputeManager()->hasAlgorithm(name)) {
        s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, *spirv, {size}, {}, {pushConsts});
    } else {
        s_algo = komputeManager()->getAlgorithm(name);
        s_algo->setTensors({in, out});
        s_algo->setWorkgroup({size});
        s_algo->setPushConstants<PushConstants>({pushConsts});
        s_algo->updateDescriptors(s_kompute_context->pool.get());
    }
    seq.record<kp::OpAlgoDispatch>(s_algo);
}

static void ggml_vk_xxlu(
    const std::vector<uint32_t>& spirv, const char * suffix, kp::Sequence& seq,
    const std::shared_ptr<kp::Tensor>& in,
    const std::shared_ptr<kp::Tensor>& out,
    uint32_t inOff, uint32_t outOff,
    uint32_t size
) {
    struct PushConstants {
        uint32_t inOff, outOff;
    } const pushConsts {
        safe_divide(inOff, 4), safe_divide(outOff, 4),
    };

    auto name = std::string(__func__) + "_" + suffix;
    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
    if (!komputeManager()->hasAlgorithm(name)) {
        s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
    } else {
        s_algo = komputeManager()->getAlgorithm(name);
        s_algo->setTensors({in, out});
        s_algo->setWorkgroup({size});
        s_algo->setPushConstants<PushConstants>({pushConsts});
        s_algo->updateDescriptors(s_kompute_context->pool.get());
    }
    seq.record<kp::OpAlgoDispatch>(s_algo);
}

template <typename... Args>
static void ggml_vk_silu(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_silu_comp_spv,
        kp::shader_data::op_silu_comp_spv_len);

    ggml_vk_xxlu(spirv, "silu", std::forward<Args>(args)...);
}

template <typename... Args>
static void ggml_vk_relu(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_relu_comp_spv,
        kp::shader_data::op_relu_comp_spv_len);

    ggml_vk_xxlu(spirv, "relu", std::forward<Args>(args)...);
}

template <typename... Args>
static void ggml_vk_gelu(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_gelu_comp_spv,
        kp::shader_data::op_gelu_comp_spv_len);

    ggml_vk_xxlu(spirv, "gelu", std::forward<Args>(args)...);
}
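// Expansion sketch (illustrative): the variadic forwarders above only choose a SPIR-V blob and a
// cache-key suffix and perfect-forward everything else, so
//
//     ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, n/4);
//
// behaves like
//
//     ggml_vk_xxlu(spirv_for_op_silu, "silu", seq, id_src0, id_dst, off_src0, off_dst, n/4);
//
// where spirv_for_op_silu stands for the shader loaded from kp::shader_data::op_silu_comp_spv,
// and the algorithm is cached under the name "ggml_vk_xxlu_silu".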
static void ggml_vk_soft_max(
    kp::Sequence& seq,
    const std::shared_ptr<kp::Tensor>& inA,
    const std::shared_ptr<kp::Tensor>& inB,
    const std::shared_ptr<kp::Tensor>& out,
    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
    int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03,
    float scale
) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv,
        kp::shader_data::op_softmax_comp_spv_len);

    struct PushConstants {
        uint32_t inAOff, inBOff, outOff;
        int32_t ne00, ne01, ne02;
        float scale;
        int32_t mask;
    } pushConsts {
        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
        ne00, ne01, ne02,
        scale,
        bool(inB)
    };

    auto & inB_ = inB ? inB : inA;

    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
    if (!komputeManager()->hasAlgorithm(__func__)) {
        // FIXME: The softmax kernel needs to be fixed to use the subgroupsize which can vary by device
        const uint32_t local_x = 32;
        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB_, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
    } else {
        s_algo = komputeManager()->getAlgorithm(__func__);
        s_algo->setTensors({inA, inB_, out});
        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
        s_algo->setPushConstants<PushConstants>({pushConsts});
        s_algo->updateDescriptors(s_kompute_context->pool.get());
    }
    seq.record<kp::OpAlgoDispatch>(s_algo);
}

static void ggml_vk_norm_(
    const std::vector<uint32_t>& spirv, const char * suffix, kp::Sequence& seq,
    const std::shared_ptr<kp::Tensor>& in,
    const std::shared_ptr<kp::Tensor>& out,
    uint32_t inOff, uint32_t outOff,
    int32_t ne00, int32_t nb01,
    int32_t nrows, float epsilon
) {
    GGML_ASSERT(nb01%sizeof(float) == 0);
    GGML_ASSERT(ne00%sizeof(float) == 0);

    struct PushConstants {
        uint32_t inOff, outOff;
        uint32_t ne00, nb01;
        float eps;
    } pushConsts {
        safe_divide(inOff, 4), safe_divide(outOff, 4),
        (uint32_t)ne00, (uint32_t)nb01, epsilon
    };

    auto name = std::string(__func__) + "_" + suffix;
    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
    if (!komputeManager()->hasAlgorithm(name)) {
        s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
    } else {
        s_algo = komputeManager()->getAlgorithm(name);
        s_algo->setTensors({in, out});
        s_algo->setWorkgroup({(uint32_t)nrows});
        s_algo->setPushConstants<PushConstants>({pushConsts});
        s_algo->updateDescriptors(s_kompute_context->pool.get());
    }
    seq.record<kp::OpAlgoDispatch>(s_algo);
}

template <typename... Args>
static void ggml_vk_norm(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_norm_comp_spv,
        kp::shader_data::op_norm_comp_spv_len);

    ggml_vk_norm_(spirv, "norm", std::forward<Args>(args)...);
}

template <typename... Args>
static void ggml_vk_rms_norm(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_rmsnorm_comp_spv,
        kp::shader_data::op_rmsnorm_comp_spv_len);

    ggml_vk_norm_(spirv, "rms", std::forward<Args>(args)...);
}
static void ggml_vk_diag_mask_inf(kp::Sequence& seq,
                                  const std::shared_ptr<kp::Tensor>& in,
                                  const std::shared_ptr<kp::Tensor>& out,
                                  uint32_t inOff, uint32_t outOff,
                                  uint32_t n_past,
                                  int32_t ne00, int32_t ne01, int32_t ne02) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_diagmask_comp_spv,
        kp::shader_data::op_diagmask_comp_spv_len);

    struct PushConstants {
        uint32_t inOff, outOff;
        uint32_t n_past;
        int32_t ne00, ne01;
    } pushConsts {
        safe_divide(inOff, 4), safe_divide(outOff, 4),
        n_past,
        ne00, ne01
    };

    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
    if (!komputeManager()->hasAlgorithm(__func__))
        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts});
    else {
        s_algo = komputeManager()->getAlgorithm(__func__);
        s_algo->setTensors({in, out});
        s_algo->setWorkgroup({unsigned(ne00), unsigned(ne01), unsigned(ne02)});
        s_algo->setPushConstants<PushConstants>({pushConsts});
        s_algo->updateDescriptors(s_kompute_context->pool.get());
    }
    seq.record<kp::OpAlgoDispatch>(s_algo);
}

static void ggml_vk_mul_mat_f16(
    kp::Sequence& seq,
    const std::shared_ptr<kp::Tensor>& inA,
    const std::shared_ptr<kp::Tensor>& inB,
    const std::shared_ptr<kp::Tensor>& out,
    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
    int32_t ne00, int32_t ne01, int32_t ne02,
    uint32_t nb00, uint32_t nb01, uint32_t nb02,
    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
    uint32_t nb10, uint32_t nb11, uint32_t nb12,
    int32_t ne0, int32_t ne1,
    uint32_t r2, uint32_t r3
) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_f16_comp_spv,
        kp::shader_data::op_mul_mat_f16_comp_spv_len);

    struct PushConstants {
        uint32_t inAOff, inBOff, outOff;
        int32_t ne00, ne01, ne02;
        uint32_t nb00, nb01, nb02;
        int32_t ne10, ne11, ne12;
        uint32_t nb10, nb11, nb12;
        int32_t ne0, ne1;
        uint32_t r2, r3;
    } pushConsts {
        safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
        ne00, ne01, ne02,
        nb00, nb01, nb02,
        ne10, ne11, ne12,
        nb10, nb11, nb12,
        ne0, ne1,
        r2, r3
    };

    const unsigned ny = unsigned((ne11 + 4 - 1)/4);

    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
    if (!komputeManager()->hasAlgorithm(__func__)) {
        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), ny, unsigned(ne12*ne13)}, {local_x}, {pushConsts});
    } else {
        s_algo = komputeManager()->getAlgorithm(__func__);
        s_algo->setTensors({inA, inB, out});
        s_algo->setWorkgroup({unsigned(ne01), ny, unsigned(ne12*ne13)});
        s_algo->setPushConstants<PushConstants>({pushConsts});
        s_algo->updateDescriptors(s_kompute_context->pool.get());
    }
    seq.record<kp::OpAlgoDispatch>(s_algo);
}
static void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
                                    const std::shared_ptr<kp::Tensor>& inA,
                                    const std::shared_ptr<kp::Tensor>& inB,
                                    const std::shared_ptr<kp::Tensor>& out,
                                    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
                                    int32_t ne00, int32_t ne01, int32_t ne02,
                                    uint32_t nb01, uint32_t nb02,
                                    int32_t ne11, int32_t ne12,
                                    uint32_t nb11, uint32_t nb12,
                                    uint32_t nb1, uint32_t nb2) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_f32_comp_spv,
        kp::shader_data::op_mul_mat_mat_f32_comp_spv_len);

    struct PushConstants {
        uint32_t inAOff, inBOff, outOff;
        int32_t ne00, ne01, ne02, ne11, ne12;
        uint32_t nb01, nb02;
        uint32_t nb11, nb12;
        uint32_t nb1, nb2;
    } pushConsts {
        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
        ne00, ne01, ne02, ne11, ne12,
        nb01, nb02, nb11, nb12,
        nb1, nb2
    };

    const uint32_t local_x = ggml_vk_current_device().subgroupSize;
    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
    if (!komputeManager()->hasAlgorithm(__func__)) {
        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(),
            {inA, inB, out}, spirv,
            {unsigned(ne01),
             unsigned(ne11),
             unsigned(std::max(ne12, ne02))
            },
            {local_x},
            {pushConsts});
    } else {
        s_algo = komputeManager()->getAlgorithm(__func__);
        s_algo->setTensors({inA, inB, out});
        s_algo->setWorkgroup({unsigned(ne01),
                              unsigned(ne11),
                              unsigned(std::max(ne12, ne02)),
                             });
        s_algo->setPushConstants<PushConstants>({pushConsts});
        s_algo->updateDescriptors(s_kompute_context->pool.get());
    }
    seq.record<kp::OpAlgoDispatch>(s_algo);
}

static void ggml_vk_mul_mat_impl(
    const std::vector<uint32_t>& spirv, const char * suffix, uint32_t block_size, kp::Sequence& seq,
    const std::shared_ptr<kp::Tensor>& inA,
    const std::shared_ptr<kp::Tensor>& inB,
    const std::shared_ptr<kp::Tensor>& out,
    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
    int32_t ne00, int32_t ne01, int32_t ne02,
    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
    int32_t ne0, int32_t ne1,
    uint32_t r2, uint32_t r3
) {
    struct PushConstants {
        uint32_t inAOff, inBOff, outOff;
        int32_t ne00, ne01, ne02;
        int32_t ne10, ne12;
        int32_t ne0, ne1;
        uint32_t r2, r3;
    } pushConsts {
        safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
        ne00, ne01, ne02,
        ne10, ne12,
        ne0, ne1,
        r2, r3
    };

    auto name = std::string(__func__) + "_" + suffix;
    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
    if (!komputeManager()->hasAlgorithm(name)) {
        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
    } else {
        s_algo = komputeManager()->getAlgorithm(name);
        s_algo->setTensors({inA, inB, out});
        s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)});
        s_algo->setPushConstants<PushConstants>({pushConsts});
        s_algo->updateDescriptors(s_kompute_context->pool.get());
    }
    seq.record<kp::OpAlgoDispatch>(s_algo);
}
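// Dispatch-size sketch (restating the grid above): the quantized mat-vec kernels cover 8 rows of
// src0 per workgroup, hence the X dimension ceil(ne01 / 8) = (ne01 + 7) / 8. For example, a
// 4096 x 4096 weight multiplied by a 32-token batch with ne12 * ne13 == 1 launches a
// 512 x 32 x 1 grid, each workgroup running subgroupSize * 2 invocations (64 on a 32-wide
// subgroup).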
template <typename... Args>
static void ggml_vk_mul_mat_q4_0(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_0_comp_spv,
        kp::shader_data::op_mul_mat_q4_0_comp_spv_len);

    ggml_vk_mul_mat_impl(spirv, "q4_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
}

template <typename... Args>
static void ggml_vk_mul_mat_q4_1(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv,
        kp::shader_data::op_mul_mat_q4_1_comp_spv_len);

    ggml_vk_mul_mat_impl(spirv, "q4_1", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
}

template <typename... Args>
static void ggml_vk_mul_mat_q8_0(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q8_0_comp_spv,
        kp::shader_data::op_mul_mat_q8_0_comp_spv_len);

    ggml_vk_mul_mat_impl(spirv, "q8_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
}

static void ggml_vk_mul_mat_q6_k(
    kp::Sequence& seq,
    const std::shared_ptr<kp::Tensor>& inA,
    const std::shared_ptr<kp::Tensor>& inB,
    const std::shared_ptr<kp::Tensor>& out,
    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
    int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
    int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02
) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
        kp::shader_data::op_mul_mat_q6_k_comp_spv_len);

    struct PushConstants {
        uint32_t inAOff, inBOff, outOff;
        int32_t ne00, ne10, ne0, ne1, ne01, gqa;
    } pushConsts {
        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
        ne00, ne10, ne0, ne1, ne01, ne12/ne02
    };

    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
    if (!komputeManager()->hasAlgorithm(__func__)) {
        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
    } else {
        s_algo = komputeManager()->getAlgorithm(__func__);
        s_algo->setTensors({inA, inB, out});
        s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)});
        s_algo->setPushConstants<PushConstants>({pushConsts});
        s_algo->updateDescriptors(s_kompute_context->pool.get());
    }
    seq.record<kp::OpAlgoDispatch>(s_algo);
}
static void ggml_vk_get_rows(
    const std::vector<uint32_t>& spirv,
    const char * suffix,
    unsigned element_size, unsigned qk,
    kp::Sequence& seq,
    const std::shared_ptr<kp::Tensor>& inA,
    const std::shared_ptr<kp::Tensor>& inB,
    const std::shared_ptr<kp::Tensor>& out,
    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
    int32_t ne00, int32_t nb01, int32_t nb1,
    uint32_t size
) {
    GGML_ASSERT(nb01%element_size == 0);
    GGML_ASSERT(nb1%sizeof(float) == 0);
    if (qk) GGML_ASSERT(ne00%qk == 0);

    struct PushConstants {
        uint32_t inAOff, inBOff, outOff;
        int32_t ne00, nb01, nb1;
    } pushConsts {
        safe_divide(inAOff, element_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
        ne00, nb01, nb1
    };

    auto name = std::string(__func__) + "_" + suffix;
    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
    if (!komputeManager()->hasAlgorithm(name)) {
        s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
    } else {
        s_algo = komputeManager()->getAlgorithm(name);
        s_algo->setTensors({inA, inB, out});
        s_algo->setWorkgroup({size});
        s_algo->setPushConstants<PushConstants>({pushConsts});
        s_algo->updateDescriptors(s_kompute_context->pool.get());
    }
    seq.record<kp::OpAlgoDispatch>(s_algo);
}

template <typename... Args>
static void ggml_vk_get_rows_f32(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f32_comp_spv,
        kp::shader_data::op_getrows_f32_comp_spv_len);

    ggml_vk_get_rows(spirv, "f32", sizeof(float), 0, std::forward<Args>(args)...);
}

template <typename... Args>
static void ggml_vk_get_rows_f16(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
        kp::shader_data::op_getrows_f16_comp_spv_len);

    ggml_vk_get_rows(spirv, "f16", sizeof(half), 0, std::forward<Args>(args)...);
}

template <typename... Args>
static void ggml_vk_get_rows_q4_0(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_0_comp_spv,
        kp::shader_data::op_getrows_q4_0_comp_spv_len);

    ggml_vk_get_rows(spirv, "q4_0", 1/*We access blocks unaligned*/, QK4_0, std::forward<Args>(args)...);
}

template <typename... Args>
static void ggml_vk_get_rows_q4_1(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_1_comp_spv,
        kp::shader_data::op_getrows_q4_1_comp_spv_len);

    ggml_vk_get_rows(spirv, "q4_1", 1/*We access blocks unaligned*/, QK4_1, std::forward<Args>(args)...);
}

template <typename... Args>
static void ggml_vk_get_rows_q6_k(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv,
        kp::shader_data::op_getrows_q6_k_comp_spv_len);
    ggml_vk_get_rows(spirv, "q6_k", 1/*We access blocks unaligned*/, QK_NL, std::forward<Args>(args)...);
}
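// Parameter note (restating the forwarders above): element_size is the divisor applied to the
// src0 byte offset (4 for f32, 2 for f16, 1 for the quantized formats, whose blocks are
// addressed byte-wise), and qk is the block length that the row length ne00 must be a multiple
// of (QK4_0 = QK4_1 = 32, QK_NL = 16; 0 skips the check on the float paths). A Q4_0 row with
// ne00 = 4096 passes the assert since 4096 % 32 == 0.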
static void ggml_vk_rope(
    kp::Sequence& seq,
    const std::shared_ptr<kp::Tensor>& inA,
    const std::shared_ptr<kp::Tensor>& inB,
    const std::shared_ptr<kp::Tensor>& out,
    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
    ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig,
    float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
    int32_t ne01, int32_t ne02, int32_t ne03,
    uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
    int32_t ne0,
    uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3
) {
    GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32);

    static const auto spirv_f16 = getSpirvShader(
        kp::shader_data::op_rope_f16_comp_spv, kp::shader_data::op_rope_f16_comp_spv_len
    );
    static const auto spirv_f32 = getSpirvShader(
        kp::shader_data::op_rope_f32_comp_spv, kp::shader_data::op_rope_f32_comp_spv_len
    );

    int type_size = src0t == GGML_TYPE_F16 ? 2 : 4;

    GGML_ASSERT(nb03 % type_size == 0);
    GGML_ASSERT(nb02 % type_size == 0);
    GGML_ASSERT(nb01 % type_size == 0);
    GGML_ASSERT(nb00 % type_size == 0);
    GGML_ASSERT(nb3 % type_size == 0);
    GGML_ASSERT(nb2 % type_size == 0);
    GGML_ASSERT(nb1 % type_size == 0);
    GGML_ASSERT(nb0 % type_size == 0);

    struct PushConstants {
        uint32_t inAOff, inBOff, outOff;
        int32_t n_dims, mode, n_ctx_orig;
        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
        uint32_t nb00, nb01, nb02, nb03;
        int32_t ne0;
        uint32_t nb0, nb1, nb2, nb3;
    } pushConsts {
        safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size),
        n_dims, mode, n_ctx_orig,
        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
        nb00, nb01, nb02, nb03,
        ne0,
        nb0, nb1, nb2, nb3
    };

    auto name = std::string(__func__) + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
    if (!komputeManager()->hasAlgorithm(name)) {
        s_algo = komputeManager()->algorithm<float, PushConstants>(
            name, s_kompute_context->pool.get(), {inA, inB, out},
            src0t == GGML_TYPE_F16 ? spirv_f16 : spirv_f32,
            {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}
        );
    } else {
        s_algo = komputeManager()->getAlgorithm(name);
        s_algo->setTensors({inA, inB, out});
        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
        s_algo->setPushConstants<PushConstants>({pushConsts});
        s_algo->updateDescriptors(s_kompute_context->pool.get());
    }
    seq.record<kp::OpAlgoDispatch>(s_algo);
}

static void ggml_vk_cpy(
    const std::vector<uint32_t>& spirv,
    uint32_t in_element_size, uint32_t out_element_size,
    kp::Sequence& seq,
    const std::shared_ptr<kp::Tensor>& in,
    const std::shared_ptr<kp::Tensor>& out,
    uint32_t inOff, uint32_t outOff,
    int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
    uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
    int32_t ne0, int32_t ne1, int32_t ne2,
    uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3
) {
    struct PushConstants {
        uint32_t inOff, outOff;
        int32_t ne00, ne01, ne02;
        uint32_t nb00, nb01, nb02, nb03;
        int32_t ne0, ne1, ne2;
        uint32_t nb0, nb1, nb2, nb3;
    } pushConsts {
        safe_divide(inOff, in_element_size), safe_divide(outOff, out_element_size),
        ne00, ne01, ne02,
        nb00, nb01, nb02, nb03,
        ne0, ne1, ne2,
        nb0, nb1, nb2, nb3
    };

    std::string name = std::string(__func__)
        + "_i_" + std::to_string(in_element_size)
        + "_o_" + std::to_string(out_element_size);
    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
    if (!komputeManager()->hasAlgorithm(name))
        s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
    else {
        s_algo = komputeManager()->getAlgorithm(name);
        s_algo->setTensors({in, out});
        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
        s_algo->setPushConstants<PushConstants>({pushConsts});
        s_algo->updateDescriptors(s_kompute_context->pool.get());
    }
    seq.record<kp::OpAlgoDispatch>(s_algo);
}

template <typename... Args>
static void ggml_vk_cpy_f32_f16(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f16_comp_spv,
        kp::shader_data::op_cpy_f32_f16_comp_spv_len);
    ggml_vk_cpy(spirv, 4, 2, std::forward<Args>(args)...);
}

template <typename... Args>
static void ggml_vk_cpy_f32_f32(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f32_comp_spv,
        kp::shader_data::op_cpy_f32_f32_comp_spv_len);
    ggml_vk_cpy(spirv, 4, 4, std::forward<Args>(args)...);
}

template <typename... Args>
static void ggml_vk_cpy_f16_f16(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f16_comp_spv,
        kp::shader_data::op_cpy_f16_f16_comp_spv_len);
    ggml_vk_cpy(spirv, 2, 2, std::forward<Args>(args)...);
}

template <typename... Args>
static void ggml_vk_cpy_f16_f32(Args&&... args) {
    const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f32_comp_spv,
        kp::shader_data::op_cpy_f16_f32_comp_spv_len);
    ggml_vk_cpy(spirv, 2, 4, std::forward<Args>(args)...);
}
static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
    switch (op->type) {
        case GGML_TYPE_F16:
        case GGML_TYPE_F32:
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
            break;
        default:
            return false;
    }

    switch (op->op) {
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_SILU:
                    return ggml_is_contiguous(op->src[0]);
                default:
                    ;
            }
            break;
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_PERMUTE:
        case GGML_OP_ADD:
        case GGML_OP_MUL:
        case GGML_OP_SCALE:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_RMS_NORM:
        case GGML_OP_NORM:
        case GGML_OP_ROPE:
            return true;
        case GGML_OP_DUP:
        case GGML_OP_CPY:
        case GGML_OP_CONT:
            switch (op->src[0]->type) {
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
                    break;
                default:
                    return false;
            }
            switch (op->type) {
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
                    break;
                default:
                    return false;
            }
            return true;
        case GGML_OP_DIAG_MASK_INF:
            return op->ne[3] == 1;
        case GGML_OP_GET_ROWS:
            switch (op->src[0]->type) {
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
                case GGML_TYPE_Q4_0:
                case GGML_TYPE_Q4_1:
                case GGML_TYPE_Q6_K:
                    return op->ne[2] == 1 && op->ne[3] == 1;
                default:
                    ;
            }
            return false;
        case GGML_OP_MUL_MAT:
            if (op->src[1]->type != GGML_TYPE_F32 || ggml_is_transposed(op->src[0]) || ggml_is_transposed(op->src[1]))
                return false;

            switch (op->src[0]->type) {
                case GGML_TYPE_F32:
                case GGML_TYPE_Q6_K:
                    return op->ne[3] == 1;
                case GGML_TYPE_F16:
                case GGML_TYPE_Q8_0:
                case GGML_TYPE_Q4_0:
                case GGML_TYPE_Q4_1:
                    return true;
                default:
                    ;
            }
        default:
            ;
    }
    return false;
}
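// Scheduling sketch for ggml_vk_graph_compute below (restating its arithmetic): the graph is
// split across n_seq = 8 kp::Sequence objects, each taking a contiguous slice of
// ceil(n_nodes / n_seq) nodes; a 100-node graph therefore gives seven slices of 13 nodes and a
// final slice of 9. Every sequence is submitted with evalAsync() and the host waits on all of
// them with evalAwait() at the end, so command buffers are built and executed without a
// per-node round trip.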
static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
    const int n_seq = 8;

    // FIXME: Figure out if we can somehow optimize the size of the pool... right now we're setting
    // it to the size of the graph, but I think it can be made smaller?
    ggml_vk_allocate_descriptor_pool(ctx, gf->n_nodes);

    std::vector<std::shared_ptr<kp::Sequence>> sequences(n_seq);

    for (auto& sequence : sequences) {
        sequence = komputeManager()->sequence();
    }
    for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) {
        const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq;

        auto& seq = *sequences[seq_idx];

        const int node_start = (seq_idx + 0) * n_nodes_per_seq;
        const int node_end   = std::min((seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq, gf->n_nodes);

        bool any_commands_recorded = false;

        for (int i = node_start; i < node_end; ++i) {
            struct ggml_tensor * src0 = gf->nodes[i]->src[0];
            struct ggml_tensor * src1 = gf->nodes[i]->src[1];
            struct ggml_tensor * src2 = gf->nodes[i]->src[2]; GGML_UNUSED(src2);
            struct ggml_tensor * dst = gf->nodes[i];
            GGML_ASSERT(dst->data != nullptr);

            if (ggml_is_empty(dst)) {
                continue;
            }

            switch (dst->op) {
                case GGML_OP_NONE:
                case GGML_OP_RESHAPE:
                case GGML_OP_VIEW:
                case GGML_OP_TRANSPOSE:
                case GGML_OP_PERMUTE:
                    continue; // noop -> next node
                default:
                    break;
            }

            any_commands_recorded = true;

            if (!ggml_vk_supports_op(dst)) {
                fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
                GGML_ASSERT(!"unsupported op");
            }

            const int32_t ne00 = src0 ? src0->ne[0] : 0;
            const int32_t ne01 = src0 ? src0->ne[1] : 0;
            const int32_t ne02 = src0 ? src0->ne[2] : 0;
            const int32_t ne03 = src0 ? src0->ne[3] : 0;

            const uint32_t nb00 = src0 ? src0->nb[0] : 0;
            const uint32_t nb01 = src0 ? src0->nb[1] : 0;
            const uint32_t nb02 = src0 ? src0->nb[2] : 0;
            const uint32_t nb03 = src0 ? src0->nb[3] : 0;

            const int32_t ne10 = src1 ? src1->ne[0] : 0;
            const int32_t ne11 = src1 ? src1->ne[1] : 0;
            const int32_t ne12 = src1 ? src1->ne[2] : 0;
            const int32_t ne13 = src1 ? src1->ne[3] : 0;

            const uint32_t nb10 = src1 ? src1->nb[0] : 0;
            const uint32_t nb11 = src1 ? src1->nb[1] : 0;
            const uint32_t nb12 = src1 ? src1->nb[2] : 0;
            const uint32_t nb13 = src1 ? src1->nb[3] : 0;

            const int32_t ne0 = dst ? dst->ne[0] : 0;
            const int32_t ne1 = dst ? dst->ne[1] : 0;
            const int32_t ne2 = dst ? dst->ne[2] : 0;
            // const int32_t ne3 = dst ? dst->ne[3] : 0;

            const uint32_t nb0 = dst ? dst->nb[0] : 0;
            const uint32_t nb1 = dst ? dst->nb[1] : 0;
            const uint32_t nb2 = dst ? dst->nb[2] : 0;
            const uint32_t nb3 = dst ? dst->nb[3] : 0;

            const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
            const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
            const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;

            const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
            uint32_t off_src0 = 0;
            uint32_t off_src1 = 0;
            uint32_t off_dst  = 0;
            const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
            const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
            const std::shared_ptr<kp::Tensor>& id_dst  = dst  ? ggml_vk_get_tensor(dst, &off_dst)   : nullTensor;
            switch (dst->op) {
                case GGML_OP_ADD:
                    {
                        if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
                            // src1 is a row
                            ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00);
                        } else {
                            ggml_vk_add(
                                seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
                                ne00, ne01, ne02, ne03,
                                nb00, nb01, nb02, nb03,
                                ne10, ne11, ne12, ne13,
                                nb10, nb11, nb12, nb13,
                                ne0,
                                nb0, nb1, nb2, nb3
                            );
                        }
                    } break;
                case GGML_OP_MUL:
                    {
                        ggml_vk_mul(
                            seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
                            ne00, ne01, ne02, ne03,
                            nb00, nb01, nb02, nb03,
                            ne10, ne11, ne12, ne13,
                            nb10, nb11, nb12, nb13,
                            ne0,
                            nb0, nb1, nb2, nb3
                        );
                    } break;
                case GGML_OP_SCALE:
                    {
                        float scale; memcpy(&scale, dst->op_params, sizeof(float));

                        ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale);
                    } break;
                case GGML_OP_UNARY:
                    {
                        int64_t n = ggml_nelements(dst);
                        GGML_ASSERT(n % 4 == 0);
                        switch (ggml_get_unary_op(gf->nodes[i])) {
                            case GGML_UNARY_OP_SILU:
                                {
                                    ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, n/4);
                                } break;
                            case GGML_UNARY_OP_RELU:
                                {
                                    ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, n/4);
                                } break;
                            case GGML_UNARY_OP_GELU:
                                {
                                    GGML_ASSERT(n % 8 == 0);
                                    ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, n/8);
                                } break;
                            default:
                                {
                                    fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
                                    GGML_ASSERT(false);
                                }
                        }
                    } break;
                case GGML_OP_SOFT_MAX:
                    {
                        float scale;
                        float max_bias;

                        memcpy(&scale,    (float *)dst->op_params + 0, sizeof(float));
                        memcpy(&max_bias, (float *)dst->op_params + 1, sizeof(float));

#pragma message("TODO: add ggml_vk_soft_max() F16 src1 support")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
                        GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);

#pragma message("TODO: add ALiBi support")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
                        GGML_ASSERT(max_bias == 0.0f);

                        ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
                    } break;
                case GGML_OP_DIAG_MASK_INF:
                    {
                        const int n_past = ((int32_t *)(dst->op_params))[0];
                        ggml_vk_diag_mask_inf(seq, id_src0, id_dst, off_src0, off_dst, n_past, ne00, ne01, ne02);
                    } break;
                case GGML_OP_NORM:
                    {
                        float eps;
                        memcpy(&eps, dst->op_params, sizeof(float));
                        ggml_vk_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);
                    } break;
                case GGML_OP_RMS_NORM:
                    {
                        GGML_ASSERT(ne00 % 4 == 0);

                        float eps;
                        memcpy(&eps, dst->op_params, sizeof(float));
                        ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);
                    } break;
src1t); 1618 goto not_implemented; 1619 } 1620 1621 if (ggml_is_transposed(src0) || 1622 ggml_is_transposed(src1)) { 1623 fprintf(stderr, "%s: %s: matmul on tranposed tensor not supported: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); 1624 goto not_implemented; 1625 } 1626 1627 switch (src0t) { 1628 case GGML_TYPE_F32: 1629 ggml_vk_mul_mat_mat_f32( 1630 seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, 1631 ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, nb1, nb2 1632 ); 1633 break; 1634 case GGML_TYPE_F16: 1635 ggml_vk_mul_mat_f16( 1636 seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, 1637 ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, ne13, nb10, nb11, nb12, 1638 ne0, ne1, r2, r3 1639 ); 1640 break; 1641 case GGML_TYPE_Q8_0: 1642 ggml_vk_mul_mat_q8_0( 1643 seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, 1644 ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3 1645 ); 1646 break; 1647 case GGML_TYPE_Q4_0: 1648 ggml_vk_mul_mat_q4_0( 1649 seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, 1650 ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3 1651 ); 1652 break; 1653 case GGML_TYPE_Q4_1: 1654 ggml_vk_mul_mat_q4_1( 1655 seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, 1656 ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3 1657 ); 1658 break; 1659 case GGML_TYPE_Q6_K: 1660 ggml_vk_mul_mat_q6_k( 1661 seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, 1662 ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02 1663 ); 1664 break; 1665 default: { 1666 fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); 1667 goto not_implemented; 1668 } 1669 } 1670 1671 } break; 1672 case GGML_OP_GET_ROWS: 1673 { 1674 if (src0t == GGML_TYPE_F32) { 1675 ggml_vk_get_rows_f32(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); 1676 } else if (src0t == GGML_TYPE_F16) { 1677 ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); 1678 } else if (src0t == GGML_TYPE_Q4_0) { 1679 ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); 1680 } else if (src0t == GGML_TYPE_Q4_1) { 1681 ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); 1682 } else if (src0t == GGML_TYPE_Q6_K) { 1683 ggml_vk_get_rows_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); 1684 } else { 1685 fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t); 1686 goto not_implemented; 1687 } 1688 } break; 1689 case GGML_OP_ROPE: 1690 { 1691 #pragma message("TODO: implement phi3 frequency factors support") 1692 #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225") 1693 GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet"); 1694 1695 #pragma message("TODO: update rope NORM mode to match NEOX mode") 1696 #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634") 1697 1698 GGML_ASSERT(ne10 == ne02); 1699 GGML_ASSERT(src0t == dstt); 1700 // const int n_past = ((int32_t *) dst->op_params)[0]; 1701 const int n_dims = ((int32_t *) dst->op_params)[1]; 1702 const int mode = ((int32_t *) dst->op_params)[2]; 1703 // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan 1704 const int n_ctx_orig = ((int32_t *) 
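                // GGML_OP_ROPE reads its parameters out of dst->op_params, as the memcpys
                // below show: int32 slots [1] n_dims, [2] mode and [4] n_ctx_orig, and floats
                // packed at slots [5..10] (freq_base, freq_scale, ext_factor, attn_factor,
                // beta_fast, beta_slow). Slot [0] (n_past) and slot [3] (n_ctx, GLM RoPE)
                // are not used by this backend.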
                case GGML_OP_ROPE:
                    {
                        #pragma message("TODO: implement phi3 frequency factors support")
                        #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
                        GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");

                        #pragma message("TODO: update rope NORM mode to match NEOX mode")
                        #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")

                        GGML_ASSERT(ne10 == ne02);
                        GGML_ASSERT(src0t == dstt);
                        // const int n_past = ((int32_t *) dst->op_params)[0];
                        const int n_dims     = ((int32_t *) dst->op_params)[1];
                        const int mode       = ((int32_t *) dst->op_params)[2];
                        // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan
                        const int n_ctx_orig = ((int32_t *) dst->op_params)[4];

                        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
                        memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
                        memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
                        memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
                        memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
                        memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
                        memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
                        ggml_vk_rope(
                            seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_ctx_orig,
                            freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
                            ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
                        );
                    } break;
                case GGML_OP_DUP:
                case GGML_OP_CPY:
                case GGML_OP_CONT:
                    {
                        switch (src0t) {
                            case GGML_TYPE_F32:
                                {
                                    switch (dstt) {
                                        case GGML_TYPE_F16: ggml_vk_cpy_f32_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
                                        case GGML_TYPE_F32: ggml_vk_cpy_f32_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
                                        default: goto not_implemented;
                                    }
                                } break;
                            case GGML_TYPE_F16:
                                {
                                    switch (dstt) {
                                        case GGML_TYPE_F16: ggml_vk_cpy_f16_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
                                        case GGML_TYPE_F32: ggml_vk_cpy_f16_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
                                        default: goto not_implemented;
                                    }
                                } break;
                            default: goto not_implemented;
                        }
                    } break;
                default: goto not_implemented;
            }
            continue;
            not_implemented: {}
            fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
            //GGML_ASSERT(false);
        }

        // Evaluate sequence
        if (any_commands_recorded) {
            seq.evalAsync();
        }
    }

    // Wait for all sequences to finish
    for (auto& sequence : sequences) {
        if (sequence->isRunning())
            sequence->evalAwait();
    }

    ggml_vk_free_descriptor_pool(ctx);
}

template<>
kp::Tensor::TensorDataTypes
kp::TensorT<half>::dataType()
{
    return TensorDataTypes::eFloat;
}

template<>
kp::Tensor::TensorDataTypes
kp::TensorT<uint8_t>::dataType()
{
    return TensorDataTypes::eUnsignedInt;
}

////////////////////////////////////////////////////////////////////////////////

// backend interface

struct ggml_backend_kompute_buffer_type_context {
    int device;
    int device_ref = 0;
    uint64_t buffer_alignment;
    uint64_t max_alloc;
    std::string name;

    ggml_backend_kompute_buffer_type_context(int device, uint64_t buffer_alignment, uint64_t max_alloc)
        : device(device), buffer_alignment(buffer_alignment), max_alloc(max_alloc), name(ggml_kompute_format_name(device)) {}
};
"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage", 1802 "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info" 1803 } 1804 ); 1805 } 1806 1807 assert(ggml_vk_has_device()); 1808 ctx->device_ref++; 1809 } 1810 1811 static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) { 1812 auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context); 1813 1814 assert(ctx->device_ref > 0); 1815 1816 ctx->device_ref--; 1817 1818 if (!ctx->device_ref) { 1819 komputeManager.destroy(); 1820 } 1821 } 1822 1823 static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) { 1824 auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buffer->buft->context); 1825 return ctx->name.c_str(); 1826 } 1827 1828 static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) { 1829 auto * memory = (ggml_vk_memory *)buffer->context; 1830 if (ggml_vk_has_device()) { 1831 ggml_vk_free_memory(*memory); 1832 } 1833 delete memory; 1834 } 1835 1836 static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) { 1837 return ((ggml_vk_memory *)buffer->context)->data; 1838 } 1839 1840 static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { 1841 GGML_UNUSED(buffer); 1842 1843 const auto res = ggml_vk_get_tensor(tensor); 1844 GGML_ASSERT(res); 1845 1846 memcpy((char *)tensor->data + offset, data, size); 1847 1848 komputeManager()->sequence()->eval<kp::OpTensorSyncDevice>({res}); 1849 } 1850 1851 static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { 1852 GGML_UNUSED(buffer); 1853 1854 const auto res = ggml_vk_get_tensor(tensor); 1855 GGML_ASSERT(res); 1856 1857 komputeManager()->sequence()->eval<kp::OpTensorSyncLocal>({res}); 1858 1859 memcpy(data, (const char *)tensor->data + offset, size); 1860 } 1861 1862 static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { 1863 auto * memory = (ggml_vk_memory *)buffer->context; 1864 memset(memory->data, value, buffer->size); 1865 1866 if (memory->stagingBuffer) 1867 komputeManager()->sequence()->eval<kp::OpBufferSyncDevice>(memory->primaryBuffer, memory->stagingBuffer, memory->size); 1868 } 1869 1870 static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = { 1871 /* .get_name = */ ggml_backend_kompute_buffer_get_name, 1872 /* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer, 1873 /* .get_base = */ ggml_backend_kompute_buffer_get_base, 1874 /* .init_tensor = */ NULL, 1875 /* .set_tensor = */ ggml_backend_kompute_buffer_set_tensor, 1876 /* .get_tensor = */ ggml_backend_kompute_buffer_get_tensor, 1877 /* .cpy_tensor = */ NULL, 1878 /* .clear = */ ggml_backend_kompute_buffer_clear, 1879 /* .reset = */ NULL, 1880 }; 1881 1882 // default buffer type 1883 1884 static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) { 1885 auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context); 1886 return ctx->name.c_str(); 1887 } 1888 1889 static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { 1890 ggml_backend_kompute_device_ref(buft); 1891 auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size)); 1892 return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size); 1893 } 1894 1895 static size_t 
// default buffer type

static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
    return ctx->name.c_str();
}

static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    ggml_backend_kompute_device_ref(buft);
    auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size));
    return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size);
}

static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
    return ctx->buffer_alignment;
}

static size_t ggml_backend_kompute_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
    return ctx->max_alloc;
}

static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
    /* .get_name       = */ ggml_backend_kompute_buffer_type_get_name,
    /* .alloc_buffer   = */ ggml_backend_kompute_buffer_type_alloc_buffer,
    /* .get_alignment  = */ ggml_backend_kompute_buffer_type_get_alignment,
    /* .get_max_size   = */ ggml_backend_kompute_buffer_type_get_max_size,
    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
    /* .is_host        = */ NULL,
};

ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
    static std::vector<ggml_backend_buffer_type> bufts = []() {
        std::vector<ggml_backend_buffer_type> vec;
        auto devices = ggml_vk_available_devices_internal(0);
        vec.reserve(devices.size());

        for (const auto & dev : devices) {
            vec.push_back({
                /* .iface   = */ ggml_backend_kompute_buffer_type_interface,
                /* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment, dev.maxAlloc)
            });
        }
        return vec;
    }();

    auto it = std::find_if(bufts.begin(), bufts.end(), [device](const ggml_backend_buffer_type & t) {
        return device == static_cast<ggml_backend_kompute_buffer_type_context *>(t.context)->device;
    });
    return it < bufts.end() ? &*it : nullptr;
}
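// Hypothetical usage sketch (not part of the backend): a caller would normally
// obtain this buffer type and let the generic ggml-backend allocator create a
// device buffer from it, e.g.:
//
//     ggml_backend_buffer_type_t buft = ggml_backend_kompute_buffer_type(0 /* device */);
//     ggml_backend_buffer_t      buf  = ggml_backend_buft_alloc_buffer(buft, 16 * 1024 * 1024);
//     // ... allocate tensors inside buf ...
//     ggml_backend_buffer_free(buf);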
// backend

static const char * ggml_backend_kompute_name(ggml_backend_t backend) {
    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
    return ctx->name.c_str();
}

static void ggml_backend_kompute_free(ggml_backend_t backend) {
    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);

    assert(ctx == s_kompute_context);
    s_kompute_context = nullptr;
    if (ctx != nullptr) {
        delete ctx;
    }

    delete backend;
}

static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
    return ggml_backend_kompute_buffer_type(ctx->device);
}

static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
    ggml_vk_graph_compute(ctx, cgraph);
    return GGML_STATUS_SUCCESS;
}

static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    GGML_UNUSED(backend);
    return ggml_vk_supports_op(op);
}

static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(backend);
    return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
}

static struct ggml_backend_i kompute_backend_i = {
    /* .get_name                = */ ggml_backend_kompute_name,
    /* .free                    = */ ggml_backend_kompute_free,
    /* .get_default_buffer_type = */ ggml_backend_kompute_get_default_buffer_type,
    /* .set_tensor_async        = */ NULL,
    /* .get_tensor_async        = */ NULL,
    /* .cpy_tensor_async        = */ NULL,
    /* .synchronize             = */ NULL,
    /* .graph_plan_create       = */ NULL,
    /* .graph_plan_free         = */ NULL,
    /* .graph_plan_update       = */ NULL,
    /* .graph_plan_compute      = */ NULL,
    /* .graph_compute           = */ ggml_backend_kompute_graph_compute,
    /* .supports_op             = */ ggml_backend_kompute_supports_op,
    /* .supports_buft           = */ ggml_backend_kompute_supports_buft,
    /* .offload_op              = */ NULL,
    /* .event_new               = */ NULL,
    /* .event_free              = */ NULL,
    /* .event_record            = */ NULL,
    /* .event_wait              = */ NULL,
    /* .event_synchronize       = */ NULL,
};

static ggml_guid_t ggml_backend_kompute_guid() {
    static ggml_guid guid = { 0x7b, 0x57, 0xdc, 0xaf, 0xde, 0x12, 0x1d, 0x49, 0xfb, 0x35, 0xfa, 0x9b, 0x18, 0x31, 0x1d, 0xca };
    return &guid;
}

ggml_backend_t ggml_backend_kompute_init(int device) {
    GGML_ASSERT(s_kompute_context == nullptr);
    s_kompute_context = new ggml_kompute_context(device);

    ggml_backend_t kompute_backend = new ggml_backend {
        /* .guid      = */ ggml_backend_kompute_guid(),
        /* .interface = */ kompute_backend_i,
        /* .context   = */ s_kompute_context,
    };

    return kompute_backend;
}

bool ggml_backend_is_kompute(ggml_backend_t backend) {
    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid());
}

static ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
    GGML_UNUSED(params);
    return ggml_backend_kompute_init(intptr_t(user_data));
}

extern "C" int ggml_backend_kompute_reg_devices();

int ggml_backend_kompute_reg_devices() {
    auto devices = ggml_vk_available_devices_internal(0);
    for (const auto & device : devices) {
        ggml_backend_register(
            ggml_kompute_format_name(device.index).c_str(),
            ggml_backend_reg_kompute_init,
            ggml_backend_kompute_buffer_type(device.index),
            reinterpret_cast<void *>(intptr_t(device.index))
        );
    }
    return devices.size();
}
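// Hypothetical usage sketch (not part of the backend): a typical consumer
// creates the backend for one of the registered devices and later releases it
// through the generic ggml-backend API, e.g.:
//
//     ggml_backend_t backend = ggml_backend_kompute_init(0 /* device */);
//     GGML_ASSERT(ggml_backend_is_kompute(backend));
//     // ... build a ggml_cgraph and run it with ggml_backend_graph_compute(backend, graph) ...
//     ggml_backend_free(backend);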