/ ggml-alloc.c
ggml-alloc.c
   1  #include "ggml-alloc.h"
   2  #include "ggml-backend-impl.h"
   3  #include "ggml.h"
   4  #include "ggml-impl.h"
   5  #include <assert.h>
   6  #include <limits.h>
   7  #include <stdarg.h>
   8  #include <stdio.h>
   9  #include <stdlib.h>
  10  #include <string.h>
  11  
  12  #define MAX(a, b) ((a) > (b) ? (a) : (b))
  13  #define MAX_FREE_BLOCKS 256
  14  
  15  //#define GGML_ALLOCATOR_DEBUG
  16  
  17  //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
  18  #define AT_PRINTF(...)
  19  
  20  
  21  static bool ggml_is_view(const struct ggml_tensor * t) {
  22      return t->view_src != NULL;
  23  }
  24  
  25  static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
  26      if (a->type != b->type) {
  27          return false;
  28      }
  29      for (int i = 0; i < GGML_MAX_DIMS; i++) {
  30          if (a->ne[i] != b->ne[i]) {
  31              return false;
  32          }
  33          if (a->nb[i] != b->nb[i]) {
  34              return false;
  35          }
  36      }
  37      return true;
  38  }
  39  
  40  static bool ggml_op_can_inplace(enum ggml_op op) {
  41      switch (op) {
  42          case GGML_OP_SCALE:
  43          case GGML_OP_DIAG_MASK_ZERO:
  44          case GGML_OP_DIAG_MASK_INF:
  45          case GGML_OP_ADD:
  46          case GGML_OP_ADD1:
  47          case GGML_OP_SUB:
  48          case GGML_OP_MUL:
  49          case GGML_OP_DIV:
  50          case GGML_OP_SQR:
  51          case GGML_OP_SQRT:
  52          case GGML_OP_LOG:
  53          case GGML_OP_UNARY:
  54          case GGML_OP_ROPE:
  55          case GGML_OP_RMS_NORM:
  56          case GGML_OP_SOFT_MAX:
  57              return true;
  58  
  59          default:
  60              return false;
  61      }
  62  }
  63  
  64  static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
  65      assert(alignment && !(alignment & (alignment - 1))); // power of 2
  66      size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
  67      return offset + align;
  68  }
  69  
  70  // tallocr
  71  
  72  struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
  73      void * base = ggml_backend_buffer_get_base(buffer);
  74      size_t align = ggml_backend_buffer_get_alignment(buffer);
  75  
  76      assert(align && !(align & (align - 1))); // power of 2
  77  
  78      struct ggml_tallocr talloc = (struct ggml_tallocr) {
  79          /*.buffer    = */ buffer,
  80          /*.base      = */ base,
  81          /*.alignment = */ align,
  82          /*.offset    = */ aligned_offset(base, 0, align),
  83      };
  84      return talloc;
  85  }
  86  
  87  void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
  88      size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
  89      size = GGML_PAD(size, talloc->alignment);
  90  
  91      if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
  92          fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
  93                  __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
  94          GGML_ASSERT(!"not enough space in the buffer");
  95          return;
  96      }
  97  
  98      void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
  99      talloc->offset += size;
 100  
 101      assert(((uintptr_t)addr % talloc->alignment) == 0);
 102  
 103      ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
 104  }
 105  
 106  // dynamic tensor allocator
 107  
 108  struct free_block {
 109      size_t offset;
 110      size_t size;
 111  };
 112  
 113  struct ggml_dyn_tallocr {
 114      size_t alignment;
 115      int n_free_blocks;
 116      struct free_block free_blocks[MAX_FREE_BLOCKS];
 117      size_t max_size;
 118  
 119  #ifdef GGML_ALLOCATOR_DEBUG
 120      struct {
 121          const struct ggml_tensor * tensor;
 122          size_t offset;
 123      } allocated_tensors[1024];
 124  #endif
 125  };
 126  
 127  #ifdef GGML_ALLOCATOR_DEBUG
 128  static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
 129      for (int i = 0; i < 1024; i++) {
 130          if (alloc->allocated_tensors[i].tensor == NULL) {
 131              alloc->allocated_tensors[i].tensor = tensor;
 132              alloc->allocated_tensors[i].offset = offset;
 133              return;
 134          }
 135      }
 136      GGML_ASSERT(!"out of allocated_tensors");
 137  }
 138  static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
 139      for (int i = 0; i < 1024; i++) {
 140          if (alloc->allocated_tensors[i].offset == offset) {
 141              alloc->allocated_tensors[i].tensor = NULL;
 142              return;
 143          }
 144      }
 145      fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
 146      GGML_ASSERT(!"tensor not found");
 147  }
 148  #endif
 149  
 150  static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
 151      size = aligned_offset(NULL, size, alloc->alignment);
 152  
 153      AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
 154  
 155      size_t max_avail = 0;
 156  
 157      // find the best fitting free block besides the last block
 158      int best_fit_block = -1;
 159      size_t best_fit_size = SIZE_MAX;
 160      for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
 161          struct free_block * block = &alloc->free_blocks[i];
 162          max_avail = MAX(max_avail, block->size);
 163          if (block->size >= size && block->size <= best_fit_size) {
 164              best_fit_block = i;
 165              best_fit_size = block->size;
 166          }
 167      }
 168  
 169      if (best_fit_block == -1) {
 170          // the last block is our last resort
 171          struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
 172          max_avail = MAX(max_avail, block->size);
 173          if (block->size >= size) {
 174              best_fit_block = alloc->n_free_blocks - 1;
 175          } else {
 176              // this should never happen
 177              fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
 178                      __func__, size, max_avail);
 179              GGML_ASSERT(!"not enough space in the buffer");
 180              GGML_UNREACHABLE();
 181          }
 182      }
 183  
 184      struct free_block * block = &alloc->free_blocks[best_fit_block];
 185      size_t offset = block->offset;
 186      block->offset = offset + size;
 187      block->size -= size;
 188      if (block->size == 0) {
 189          // remove block if empty
 190          alloc->n_free_blocks--;
 191          for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
 192              alloc->free_blocks[j] = alloc->free_blocks[j+1];
 193          }
 194      }
 195  
 196      AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
 197  
 198  #ifdef GGML_ALLOCATOR_DEBUG
 199      add_allocated_tensor(alloc, offset, tensor);
 200      size_t cur_max = offset + size;
 201      if (cur_max > alloc->max_size) {
 202          // sort allocated_tensors by offset
 203          for (int i = 0; i < 1024; i++) {
 204              for (int j = i + 1; j < 1024; j++) {
 205                  if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
 206                      const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
 207                      size_t tmp_offset = alloc->allocated_tensors[i].offset;
 208                      alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
 209                      alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
 210                      alloc->allocated_tensors[j].tensor = tmp_tensor;
 211                      alloc->allocated_tensors[j].offset = tmp_offset;
 212                  }
 213              }
 214          }
 215          fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
 216          for (int i = 0; i < 1024; i++) {
 217              if (alloc->allocated_tensors[i].tensor) {
 218                  fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
 219                      alloc->allocated_tensors[i].offset,
 220                      alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
 221                      ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
 222              }
 223          }
 224          fprintf(stderr, "\n");
 225      }
 226  #endif
 227  
 228      alloc->max_size = MAX(alloc->max_size, offset + size);
 229  
 230      return offset;
 231  
 232      GGML_UNUSED(tensor);
 233  }
 234  
 235  // this is a very naive implementation, but for our case the number of free blocks should be very small
 236  static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
 237      size = aligned_offset(NULL, size, alloc->alignment);
 238  
 239      AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
 240  
 241  #ifdef GGML_ALLOCATOR_DEBUG
 242      remove_allocated_tensor(alloc, offset, tensor);
 243  #endif
 244  
 245      // see if we can merge with an existing block
 246      for (int i = 0; i < alloc->n_free_blocks; i++) {
 247          struct free_block * block = &alloc->free_blocks[i];
 248          // check if ptr is at the end of the block
 249          if (block->offset + block->size == offset) {
 250              block->size += size;
 251              // check if we can merge with the next block
 252              if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
 253                  block->size += alloc->free_blocks[i+1].size;
 254                  alloc->n_free_blocks--;
 255                  for (int j = i+1; j < alloc->n_free_blocks; j++) {
 256                      alloc->free_blocks[j] = alloc->free_blocks[j+1];
 257                  }
 258              }
 259              return;
 260          }
 261          // check if ptr is at the beginning of the block
 262          if (offset + size == block->offset) {
 263              block->offset = offset;
 264              block->size += size;
 265              // check if we can merge with the previous block
 266              if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
 267                  alloc->free_blocks[i-1].size += block->size;
 268                  alloc->n_free_blocks--;
 269                  for (int j = i; j < alloc->n_free_blocks; j++) {
 270                      alloc->free_blocks[j] = alloc->free_blocks[j+1];
 271                  }
 272              }
 273              return;
 274          }
 275      }
 276      // otherwise, add a new block
 277      GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
 278      // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
 279      int insert_pos = 0;
 280      while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
 281          insert_pos++;
 282      }
 283      // shift all blocks from insert_pos onward to make room for the new block
 284      for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
 285          alloc->free_blocks[i] = alloc->free_blocks[i-1];
 286      }
 287      // insert the new block
 288      alloc->free_blocks[insert_pos].offset = offset;
 289      alloc->free_blocks[insert_pos].size = size;
 290      alloc->n_free_blocks++;
 291  
 292      GGML_UNUSED(tensor);
 293  }
 294  
 295  static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
 296      alloc->n_free_blocks = 1;
 297      alloc->free_blocks[0].offset = 0;
 298      alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
 299      alloc->max_size = 0;
 300  }
 301  
 302  static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
 303      struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
 304  
 305      *alloc = (struct ggml_dyn_tallocr) {
 306          /*.alignment     = */ alignment,
 307          /*.n_free_blocks = */ 0,
 308          /*.free_blocks   = */ {{0}},
 309          /*.max_size      = */ 0,
 310  #ifdef GGML_ALLOCATOR_DEBUG
 311          /*.allocated_tensors = */ {{0}},
 312  #endif
 313      };
 314  
 315      ggml_dyn_tallocr_reset(alloc);
 316  
 317      return alloc;
 318  }
 319  
 320  static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
 321      free(alloc);
 322  }
 323  
 324  static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
 325      return alloc->max_size;
 326  }
 327  
 328  
 329  /////////////////////////////////////
 330  
 331  // graph allocator
 332  
 333  struct hash_node {
 334      int n_children;
 335      int n_views;
 336      int buffer_id;
 337      size_t offset; // offset within the buffer
 338      bool allocated;
 339  };
 340  
 341  struct tensor_alloc {
 342      int buffer_id;
 343      size_t offset;
 344      size_t size_max; // 0 = pre-allocated, unused, or view
 345  };
 346  
 347  struct leaf_alloc {
 348      int buffer_id;
 349      struct tensor_alloc leaf;
 350  };
 351  
 352  struct node_alloc {
 353      struct tensor_alloc dst;
 354      struct tensor_alloc src[GGML_MAX_SRC];
 355  };
 356  
 357  struct ggml_gallocr {
 358      ggml_backend_buffer_type_t * bufts; // [n_buffers]
 359      ggml_backend_buffer_t * buffers; // [n_buffers]
 360      struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
 361      int n_buffers;
 362  
 363      struct ggml_hash_set hash_set;
 364      struct hash_node * hash_values; // [hash_set.size]
 365  
 366      struct node_alloc * node_allocs; // [n_nodes]
 367      int n_nodes;
 368  
 369      struct leaf_alloc * leaf_allocs; // [n_leafs]
 370      int n_leafs;
 371  };
 372  
 373  ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
 374      ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
 375      GGML_ASSERT(galloc != NULL);
 376  
 377      galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
 378      GGML_ASSERT(galloc->bufts != NULL);
 379  
 380      galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
 381      GGML_ASSERT(galloc->buffers != NULL);
 382  
 383      galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
 384      GGML_ASSERT(galloc->buf_tallocs != NULL);
 385  
 386      for (int i = 0; i < n_bufs; i++) {
 387          galloc->bufts[i] = bufts[i];
 388          galloc->buffers[i] = NULL;
 389  
 390          // check if the same buffer type is used multiple times and reuse the same allocator
 391          for (int j = 0; j < i; j++) {
 392              if (bufts[i] == bufts[j]) {
 393                  galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
 394                  break;
 395              }
 396          }
 397  
 398          if (galloc->buf_tallocs[i] == NULL) {
 399              size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
 400              galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
 401          }
 402      }
 403      galloc->n_buffers = n_bufs;
 404  
 405      return galloc;
 406  }
 407  
 408  ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
 409      return ggml_gallocr_new_n(&buft, 1);
 410  }
 411  
 412  void ggml_gallocr_free(ggml_gallocr_t galloc) {
 413      if (galloc == NULL) {
 414          return;
 415      }
 416  
 417      for (int i = 0; i < galloc->n_buffers; i++) {
 418          if (galloc->buffers != NULL) {
 419              // skip if already freed
 420              bool freed = false;
 421              for (int j = 0; j < i; j++) {
 422                  if (galloc->buffers[j] == galloc->buffers[i]) {
 423                      freed = true;
 424                      break;
 425                  }
 426              }
 427              if (!freed) {
 428                  ggml_backend_buffer_free(galloc->buffers[i]);
 429              }
 430          }
 431          if (galloc->buf_tallocs != NULL) {
 432              // skip if already freed
 433              bool freed = false;
 434              for (int j = 0; j < i; j++) {
 435                  if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
 436                      freed = true;
 437                      break;
 438                  }
 439              }
 440              if (!freed) {
 441                  ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
 442              }
 443          }
 444      }
 445  
 446      free(galloc->hash_set.keys);
 447      free(galloc->hash_values);
 448      free(galloc->bufts);
 449      free(galloc->buffers);
 450      free(galloc->buf_tallocs);
 451      free(galloc->node_allocs);
 452      free(galloc->leaf_allocs);
 453      free(galloc);
 454  }
 455  
 456  typedef struct ggml_gallocr * ggml_gallocr_t;
 457  
 458  static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
 459      size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
 460      return &galloc->hash_values[i];
 461  }
 462  
 463  static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
 464      return ggml_gallocr_hash_get(galloc, t)->allocated;
 465  }
 466  
 467  static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
 468      struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
 469      hn->buffer_id = buffer_id;
 470      hn->offset = offset;
 471      hn->allocated = true;
 472  }
 473  
 474  static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
 475      return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
 476  }
 477  
 478  static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
 479      struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
 480  
 481      if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
 482          hn->allocated = true;
 483          assert(hn->offset == 0);
 484  
 485          // try to reuse a parent's buffer (inplace)
 486          if (ggml_op_can_inplace(node->op)) {
 487              for (int i = 0; i < GGML_MAX_SRC; i++) {
 488                  struct ggml_tensor * parent = node->src[i];
 489                  if (parent == NULL) {
 490                      continue;
 491                  }
 492  
 493                  // if the node's data is external, then we cannot re-use it
 494                  if (!ggml_gallocr_is_own(galloc, parent)) {
 495                      AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
 496                      continue;
 497                  }
 498  
 499                  // outputs cannot be reused
 500                  if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
 501                      AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
 502                      continue;
 503                  }
 504  
 505                  if (!ggml_are_same_layout(node, parent)) {
 506                      AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
 507                      continue;
 508                  }
 509  
 510                  struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
 511                  if (p_hn->n_children == 1 && p_hn->n_views == 0) {
 512                      if (ggml_is_view(parent)) {
 513                          struct ggml_tensor * view_src = parent->view_src;
 514                          struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
 515                          if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
 516                              AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
 517                              assert(view_src_hn->offset == p_hn->offset);
 518                              hn->buffer_id = p_hn->buffer_id;
 519                              hn->offset = p_hn->offset;
 520                              p_hn->allocated = false; // avoid freeing the parent
 521                              view_src_hn->allocated = false;
 522                              return;
 523                          }
 524                      } else {
 525                          AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
 526                          hn->buffer_id = p_hn->buffer_id;
 527                          hn->offset = p_hn->offset;
 528                          p_hn->allocated = false; // avoid freeing the parent
 529                          return;
 530                      }
 531                  }
 532              }
 533          }
 534          // allocate tensor from the buffer
 535          struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
 536          ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
 537          size_t size = ggml_backend_buft_get_alloc_size(buft, node);
 538          size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
 539          hn->buffer_id = buffer_id;
 540          hn->offset = offset;
 541          return;
 542      }
 543  }
 544  
 545  static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
 546      // graph outputs are never freed
 547      if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
 548          AT_PRINTF("not freeing output %s\n", node->name);
 549          return;
 550      }
 551  
 552      struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
 553      size_t offset = hn->offset;
 554      int buffer_id = hn->buffer_id;
 555      struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
 556      ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
 557      size_t size = ggml_backend_buft_get_alloc_size(buft, node);
 558      ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
 559      hn->allocated = false;
 560  }
 561  
 562  static int get_node_buffer_id(const int * node_buffer_ids, int i) {
 563      return node_buffer_ids ? node_buffer_ids[i] : 0;
 564  }
 565  
 566  static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
 567      // clear hash tables
 568      memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
 569      memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
 570  
 571      // allocate leafs
 572      // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
 573      for (int i = 0; i < graph->n_leafs; i++) {
 574          struct ggml_tensor * leaf = graph->leafs[i];
 575          ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
 576      }
 577  
 578      // count number of children and views
 579      // allocate other graph inputs and leafs first to avoid overwriting them
 580      for (int i = 0; i < graph->n_nodes; i++) {
 581          struct ggml_tensor * node = graph->nodes[i];
 582  
 583          // TODO: better way to add external dependencies
 584          // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
 585          // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
 586          // itself is never used and should not be considered a dependency
 587          if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
 588              struct ggml_tensor * view_src = node->view_src;
 589              ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
 590          }
 591  
 592          if (node->flags & GGML_TENSOR_FLAG_INPUT) {
 593              ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
 594          }
 595  
 596          for (int j = 0; j < GGML_MAX_SRC; j++) {
 597              struct ggml_tensor * src = node->src[j];
 598              if (src == NULL) {
 599                  continue;
 600              }
 601  
 602              ggml_gallocr_hash_get(galloc, src)->n_children += 1;
 603  
 604              // allocate explicit inputs
 605              if (src->flags & GGML_TENSOR_FLAG_INPUT) {
 606                  ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
 607              }
 608          }
 609      }
 610  
 611      // allocate tensors
 612      for (int i = 0; i < graph->n_nodes; i++) {
 613          struct ggml_tensor * node = graph->nodes[i];
 614          int buffer_id = get_node_buffer_id(node_buffer_ids, i);
 615  
 616          // allocate parents (only leafs need to be allocated at this point)
 617          for (int j = 0; j < GGML_MAX_SRC; j++) {
 618              struct ggml_tensor * parent = node->src[j];
 619              if (parent == NULL) {
 620                  continue;
 621              }
 622              ggml_gallocr_allocate_node(galloc, parent, buffer_id);
 623          }
 624  
 625          // allocate node
 626          ggml_gallocr_allocate_node(galloc, node, buffer_id);
 627  
 628          AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
 629          for (int j = 0; j < GGML_MAX_SRC; j++) {
 630              struct ggml_tensor * parent = node->src[j];
 631              if (parent == NULL) {
 632                  continue;
 633              }
 634              AT_PRINTF("%s", parent->name);
 635              if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
 636                  AT_PRINTF(", ");
 637              }
 638          }
 639          AT_PRINTF("\n");
 640  
 641          // update parents
 642          for (int j = 0; j < GGML_MAX_SRC; j++) {
 643              struct ggml_tensor * parent = node->src[j];
 644              if (parent == NULL) {
 645                  continue;
 646              }
 647              struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
 648              p_hn->n_children -= 1;
 649  
 650              AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
 651                  parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
 652  
 653              if (p_hn->n_children == 0 && p_hn->n_views == 0) {
 654                  if (ggml_is_view(parent)) {
 655                      struct ggml_tensor * view_src = parent->view_src;
 656                      struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
 657                      view_src_hn->n_views -= 1;
 658                      AT_PRINTF("view_src %s: %d children, %d views\n",
 659                          view_src->name, view_src_hn->n_children, view_src_hn->n_views);
 660                      if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
 661                          ggml_gallocr_free_node(galloc, view_src);
 662                      }
 663                  }
 664                  else if (p_hn->allocated) {
 665                      ggml_gallocr_free_node(galloc, parent);
 666                  }
 667              }
 668              AT_PRINTF("\n");
 669          }
 670      }
 671  }
 672  
 673  bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
 674      size_t hash_size = graph->visited_hash_table.size;
 675  
 676      // initialize hash table
 677      if (galloc->hash_set.size < hash_size) {
 678          free(galloc->hash_set.keys);
 679          free(galloc->hash_values);
 680          galloc->hash_set.size = hash_size;
 681          galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
 682          galloc->hash_values   = calloc(hash_size, sizeof(struct hash_node));
 683          GGML_ASSERT(galloc->hash_set.keys != NULL);
 684          GGML_ASSERT(galloc->hash_values != NULL);
 685      } else {
 686          // reset hash table
 687          memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
 688          memset(galloc->hash_values,   0, sizeof(struct hash_node) * galloc->hash_set.size);
 689      }
 690  
 691      // reset allocators
 692      for (int i = 0; i < galloc->n_buffers; i++) {
 693          ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
 694      }
 695  
 696      // allocate in hash table
 697      ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
 698  
 699      // set the node_allocs from the hash table
 700      if (galloc->n_nodes < graph->n_nodes) {
 701          free(galloc->node_allocs);
 702          galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
 703          GGML_ASSERT(galloc->node_allocs != NULL);
 704      }
 705      galloc->n_nodes = graph->n_nodes;
 706      for (int i = 0; i < graph->n_nodes; i++) {
 707          struct ggml_tensor * node = graph->nodes[i];
 708          struct node_alloc * node_alloc = &galloc->node_allocs[i];
 709          if (node->view_src || node->data) {
 710              node_alloc->dst.buffer_id = -1;
 711              node_alloc->dst.offset = SIZE_MAX;
 712              node_alloc->dst.size_max = 0;
 713          } else {
 714              struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
 715              node_alloc->dst.buffer_id = hn->buffer_id;
 716              node_alloc->dst.offset    = hn->offset;
 717              node_alloc->dst.size_max  = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
 718          }
 719          for (int j = 0; j < GGML_MAX_SRC; j++) {
 720              struct ggml_tensor * src = node->src[j];
 721              if (!src || src->view_src || src->data) {
 722                  node_alloc->src[j].buffer_id = -1;
 723                  node_alloc->src[j].offset = SIZE_MAX;
 724                  node_alloc->src[j].size_max = 0;
 725              } else {
 726                  struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
 727                  node_alloc->src[j].buffer_id = hn->buffer_id;
 728                  node_alloc->src[j].offset   = hn->offset;
 729                  node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
 730              }
 731          }
 732      }
 733      if (galloc->n_leafs < graph->n_leafs) {
 734          free(galloc->leaf_allocs);
 735          galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
 736          GGML_ASSERT(galloc->leaf_allocs != NULL);
 737      }
 738      galloc->n_leafs = graph->n_leafs;
 739      for (int i = 0; i < graph->n_leafs; i++) {
 740          struct ggml_tensor * leaf = graph->leafs[i];
 741          struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
 742          galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
 743          if (leaf->view_src || leaf->data) {
 744              galloc->leaf_allocs[i].leaf.buffer_id = -1;
 745              galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
 746              galloc->leaf_allocs[i].leaf.size_max = 0;
 747          } else {
 748              galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
 749              galloc->leaf_allocs[i].leaf.offset = hn->offset;
 750              galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
 751          }
 752      }
 753  
 754      // reallocate buffers if needed
 755      for (int i = 0; i < galloc->n_buffers; i++) {
 756          // if the buffer type is used multiple times, we reuse the same buffer
 757          for (int j = 0; j < i; j++) {
 758              if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
 759                  galloc->buffers[i] = galloc->buffers[j];
 760                  break;
 761              }
 762          }
 763  
 764          size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
 765          size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 766  
 767          // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
 768          if (new_size > cur_size || galloc->buffers[i] == NULL) {
 769  #ifndef NDEBUG
 770              fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 771  #endif
 772  
 773              ggml_backend_buffer_free(galloc->buffers[i]);
 774              galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
 775              if (galloc->buffers[i] == NULL) {
 776                  fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
 777                  return false;
 778              }
 779          }
 780      }
 781  
 782      return true;
 783  }
 784  
 785  bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
 786      return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 787  }
 788  
 789  static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
 790      int buffer_id = tensor_alloc->buffer_id;
 791      assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 792  
 793      if (tensor->view_src != NULL) {
 794          if (tensor->buffer == NULL) {
 795              assert(tensor_alloc->offset == SIZE_MAX);
 796              if (tensor->view_src->buffer == NULL) {
 797                  // this tensor was allocated without ggml-backend
 798                  return;
 799              }
 800              ggml_backend_view_init(tensor);
 801          }
 802      } else {
 803          if (tensor->data == NULL) {
 804              assert(tensor_alloc->offset != SIZE_MAX);
 805              assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 806              void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
 807              void * addr = (char *)base + tensor_alloc->offset;
 808              ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
 809          } else {
 810              if (tensor->buffer == NULL) {
 811                  // this tensor was allocated without ggml-backend
 812                  return;
 813              }
 814          }
 815      }
 816  }
 817  
 818  static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
 819      ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
 820      size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
 821      return talloc->size_max >= node_size;
 822  }
 823  
 824  static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
 825      if (galloc->n_nodes != graph->n_nodes) {
 826  #ifndef NDEBUG
 827          fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
 828  #endif
 829          return true;
 830      }
 831  
 832      if (galloc->n_leafs != graph->n_leafs) {
 833  #ifndef NDEBUG
 834          fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
 835  #endif
 836          return true;
 837      }
 838  
 839      for (int i = 0; i < graph->n_nodes; i++) {
 840          struct ggml_tensor * node = graph->nodes[i];
 841          struct node_alloc * node_alloc = &galloc->node_allocs[i];
 842  
 843          if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 844  #ifndef NDEBUG
 845              fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
 846  #endif
 847              return true;
 848          }
 849  
 850          for (int j = 0; j < GGML_MAX_SRC; j++) {
 851              struct ggml_tensor * src = node->src[j];
 852              if (src == NULL) {
 853                  continue;
 854              }
 855              if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 856  #ifndef NDEBUG
 857                  fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 858  #endif
 859                  return true;
 860              }
 861          }
 862      }
 863  
 864      return false;
 865  }
 866  
 867  bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
 868      if (ggml_gallocr_needs_realloc(galloc, graph)) {
 869          if (galloc->n_buffers == 1) {
 870  #ifndef NDEBUG
 871              fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
 872  #endif
 873              if (!ggml_gallocr_reserve(galloc, graph)) {
 874                  return false;
 875              }
 876          } else {
 877  #ifndef NDEBUG
 878              fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
 879  #endif
 880              return false;
 881          }
 882      }
 883  
 884      // reset buffers
 885      for (int i = 0; i < galloc->n_buffers; i++) {
 886          if (galloc->buffers[i] != NULL) {
 887              ggml_backend_buffer_reset(galloc->buffers[i]);
 888          }
 889      }
 890  
 891      // allocate the graph tensors from the previous assignments
 892      // leafs
 893      for (int i = 0; i < graph->n_leafs; i++) {
 894          struct ggml_tensor * leaf = graph->leafs[i];
 895          struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
 896          ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
 897      }
 898      // nodes
 899      for (int i = 0; i < graph->n_nodes; i++) {
 900          struct ggml_tensor * node = graph->nodes[i];
 901          struct node_alloc * node_alloc = &galloc->node_allocs[i];
 902          for (int j = 0; j < GGML_MAX_SRC; j++) {
 903              struct ggml_tensor * src = node->src[j];
 904              if (src == NULL) {
 905                  continue;
 906              }
 907              ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
 908          }
 909          ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
 910      }
 911  
 912      return true;
 913  }
 914  
 915  size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
 916      GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
 917  
 918      if (galloc->buffers[buffer_id] == NULL) {
 919          return 0;
 920      }
 921  
 922      for (int i = 0; i < buffer_id; i++) {
 923          if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
 924              // this buffer is the same as a previous one due to the same buffer type being used multiple times
 925              // only return the buffer size the first time it appears to avoid double counting
 926              return 0;
 927          }
 928      }
 929  
 930      return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 931  }
 932  
 933  // utils
 934  
 935  static bool alloc_tensor_range(struct ggml_context * ctx,
 936          struct ggml_tensor * first, struct ggml_tensor * last,
 937          ggml_backend_buffer_type_t buft, size_t size,
 938          ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
 939      ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
 940      if (buffer == NULL) {
 941  #ifndef NDEBUG
 942          fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 943  #endif
 944          for (size_t i = 0; i < *n_buffers; i++) {
 945              ggml_backend_buffer_free((*buffers)[i]);
 946          }
 947          free(*buffers);
 948          return false;
 949      }
 950  
 951      struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
 952  
 953      for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
 954          if (t->data == NULL) {
 955              if (t->view_src == NULL) {
 956                  ggml_tallocr_alloc(&tallocr, t);
 957              } else if (t->buffer == NULL) {
 958                  ggml_backend_view_init(t);
 959              }
 960          } else {
 961              if (t->view_src != NULL && t->buffer == NULL) {
 962                  // view of a pre-allocated tensor
 963                  ggml_backend_view_init(t);
 964              }
 965          }
 966      }
 967  
 968      *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
 969      (*buffers)[(*n_buffers)++] = buffer;
 970  
 971      return true;
 972  }
 973  
 974  ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
 975      GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
 976  
 977      size_t alignment = ggml_backend_buft_get_alignment(buft);
 978      size_t max_size = ggml_backend_buft_get_max_size(buft);
 979  
 980      ggml_backend_buffer_t * buffers = NULL;
 981      size_t n_buffers = 0;
 982  
 983      size_t cur_buf_size = 0;
 984      struct ggml_tensor * first = ggml_get_first_tensor(ctx);
 985      for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
 986          size_t this_size = 0;
 987          if (t->data == NULL && t->view_src == NULL) {
 988              this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
 989          }
 990  
 991          if (this_size > max_size) {
 992              fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
 993                      __func__, t->name,
 994                      ggml_backend_buft_name(buft),
 995                      this_size, max_size);
 996              for (size_t i = 0; i < n_buffers; i++) {
 997                  ggml_backend_buffer_free(buffers[i]);
 998              }
 999              free(buffers);
1000              return NULL;
1001          }
1002  
1003          if ((cur_buf_size + this_size) > max_size) {
1004              // allocate tensors in the current buffer
1005              if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
1006                  return NULL;
1007              }
1008              first = t;
1009              cur_buf_size = this_size;
1010          } else {
1011              cur_buf_size += this_size;
1012          }
1013      }
1014  
1015      // allocate remaining tensors
1016      if (cur_buf_size > 0) {
1017          if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
1018              return NULL;
1019          }
1020      }
1021  
1022      if (n_buffers == 0) {
1023  #ifndef NDEBUG
1024          fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
1025  #endif
1026          return NULL;
1027      }
1028  
1029      ggml_backend_buffer_t buffer;
1030      if (n_buffers == 1) {
1031          buffer = buffers[0];
1032      } else {
1033          buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
1034      }
1035      free(buffers);
1036      return buffer;
1037  }
1038  
1039  ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
1040      return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
1041  }