#include "ggml-alloc.h"
#include "ggml-backend-impl.h"
#include "ggml.h"
#include "ggml-impl.h"
#include <assert.h>
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MAX_FREE_BLOCKS 256

//#define GGML_ALLOCATOR_DEBUG

//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
#define AT_PRINTF(...)


static bool ggml_is_view(const struct ggml_tensor * t) {
    return t->view_src != NULL;
}

// ops for which this function returns true must not use restrict pointers in their backend implementations
bool ggml_op_can_inplace(enum ggml_op op) {
    switch (op) {
        case GGML_OP_FILL:
        case GGML_OP_SCALE:
        case GGML_OP_DIAG_MASK_ZERO:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_ADD:
        case GGML_OP_ADD_ID:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_LOG:
        case GGML_OP_UNARY:
        case GGML_OP_ROPE:
        case GGML_OP_ROPE_BACK:
        case GGML_OP_SILU_BACK:
        case GGML_OP_RMS_NORM:
        case GGML_OP_RMS_NORM_BACK:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_SOFT_MAX_BACK:
            return true;

        default:
            return false;
    }
}

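// returns the smallest offset >= `offset` such that (buffer + offset) is aligned to `alignment`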
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
    assert(alignment && !(alignment & (alignment - 1))); // power of 2
    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
    return offset + align;
}

// tallocr

struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
    void * base = ggml_backend_buffer_get_base(buffer);
    size_t align = ggml_backend_buffer_get_alignment(buffer);

    assert(align && !(align & (align - 1))); // power of 2

    struct ggml_tallocr talloc = (struct ggml_tallocr) {
        /*.buffer    = */ buffer,
        /*.base      = */ base,
        /*.alignment = */ align,
        /*.offset    = */ aligned_offset(base, 0, align),
    };
    return talloc;
}

enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
    size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
    size = GGML_PAD(size, talloc->alignment);

    if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
        GGML_ABORT("not enough space in the buffer");
    }

    void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
    talloc->offset += size;

    assert(((uintptr_t)addr % talloc->alignment) == 0);

    return ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
}

// dynamic tensor allocator

#define GGML_VBUFFER_MAX_CHUNKS 16

// relative memory address within an allocation that can be split into multiple buffers (chunks)
struct buffer_address {
    int    chunk;  // index of a backend buffer
    size_t offset; // local memory offset within the buffer
};

static const struct buffer_address GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX };

static bool ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) {
    return a.chunk != b.chunk ? a.chunk < b.chunk : a.offset < b.offset;
}

struct free_block {
    size_t offset;
    size_t size;
};

struct tallocr_chunk {
    struct free_block free_blocks[MAX_FREE_BLOCKS];
    int               n_free_blocks;
    size_t            max_size;
};

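// dynamic allocator state: a set of chunks, each with its own sorted free-block list;
// the maximum size reached by each chunk is tracked so that matching backend buffers can be allocated later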
struct ggml_dyn_tallocr {
    size_t alignment;
    size_t max_chunk_size;
    struct tallocr_chunk * chunks[GGML_VBUFFER_MAX_CHUNKS];
    int n_chunks;

#ifdef GGML_ALLOCATOR_DEBUG
    struct {
        const struct ggml_tensor * tensor;
        struct buffer_address addr;
    } allocated_tensors[1024];
#endif
};

static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
    GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
    int insert_pos = 0;
    while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) {
        insert_pos++;
    }
    // shift all blocks from insert_pos onward to make room for the new block
    for (int i = chunk->n_free_blocks; i > insert_pos; i--) {
        chunk->free_blocks[i] = chunk->free_blocks[i-1];
    }
    // insert the new block
    chunk->free_blocks[insert_pos].offset = offset;
    chunk->free_blocks[insert_pos].size = size;
    chunk->n_free_blocks++;
}

static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
    // shift all elements after idx by 1 to the left, overwriting the element at idx
    for (int i = idx; i < chunk->n_free_blocks - 1; i++) {
        chunk->free_blocks[i] = chunk->free_blocks[i+1];
    }
    chunk->n_free_blocks--;
}

static int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) {
    if (alloc->n_chunks >= GGML_VBUFFER_MAX_CHUNKS) {
        return -1;
    }
    struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk));
    chunk->n_free_blocks = 1;
    chunk->free_blocks[0].offset = 0;
    // available space in a chunk is limited to max_chunk_size, but can be higher if:
    // 1. a single tensor exceeds the maximum, and cannot fit any other way
    // 2. we are running out of chunks
    // backends will either manage to allocate the larger size, or report an error.
    chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size);
    if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
        chunk->free_blocks[0].size = SIZE_MAX/2;
    }
    alloc->chunks[alloc->n_chunks] = chunk;
    alloc->n_chunks++;
    return alloc->n_chunks - 1;
}

#ifdef GGML_ALLOCATOR_DEBUG
static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (alloc->allocated_tensors[i].tensor == NULL) {
            alloc->allocated_tensors[i].tensor = tensor;
            alloc->allocated_tensors[i].addr = addr;
            return;
        }
    }
    GGML_ABORT("out of allocated_tensors");
}
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) {
            alloc->allocated_tensors[i].tensor = NULL;
            return;
        }
    }
    GGML_ABORT("tried to free tensor %s, but it was not found\n", tensor->name);
}
#endif

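// allocate `size` bytes using a best-fit search over the free blocks of all chunks;
// falls back to growing the last block of a chunk, and creates a new chunk if nothing fits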
static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
    size = aligned_offset(NULL, size, alloc->alignment);

    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);

    int best_fit_chunk = -1;
    int best_fit_block = -1;
    size_t max_avail = 0;

    // find the best fitting free block besides the last block, within any chunk
    for (int c = 0; c < alloc->n_chunks; ++c) {
        struct tallocr_chunk * chunk = alloc->chunks[c];
        size_t best_fit_size = SIZE_MAX;
        for (int i = 0; i < chunk->n_free_blocks - 1; i++) {
            struct free_block * block = &chunk->free_blocks[i];
            max_avail = MAX(max_avail, block->size);
            if (block->size >= size && block->size <= best_fit_size) {
                best_fit_chunk = c;
                best_fit_block = i;
                best_fit_size = block->size;
            }
        }
    }

    if (best_fit_block == -1) {
        // no suitable block found, try the last block (this may grow a chunk's size)
        int64_t best_reuse = INT64_MIN;
        for (int c = 0; c < alloc->n_chunks; ++c) {
            struct tallocr_chunk * chunk = alloc->chunks[c];
            if (chunk->n_free_blocks > 0) {
                struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
                max_avail = MAX(max_avail, block->size);
                int64_t reuse_factor = chunk->max_size - block->offset - size;
                // reuse_factor < 0 : amount of extra memory that needs to be allocated
                // reuse_factor = 0 : allocated free space exactly matches tensor size
                // reuse_factor > 0 : superfluous memory that will remain unused
                bool better_reuse = best_reuse < 0 && reuse_factor > best_reuse;
                bool better_fit = reuse_factor >= 0 && reuse_factor < best_reuse;
                if (block->size >= size && (better_reuse || better_fit)) {
                    best_fit_chunk = c;
                    best_fit_block = chunk->n_free_blocks - 1;
                    best_reuse = reuse_factor;
                }
            }
        }
    }

    if (best_fit_block == -1) {
        // none of the existing chunks have enough space left
        best_fit_chunk = ggml_dyn_tallocr_new_chunk(alloc, size);
        best_fit_block = 0;
    }
    if (best_fit_chunk == -1) {
        // since the last chunk always has virtually endless memory, this should never happen
        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                __func__, size, max_avail);
        GGML_ABORT("graph allocation: failed to reserve memory");
    }

    struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk];
    struct free_block * block = &chunk->free_blocks[best_fit_block];
    struct buffer_address addr = {.chunk = best_fit_chunk, .offset = block->offset };
    block->offset += size;
    block->size -= size;
    if (block->size == 0) {
        // remove block if empty
        ggml_dyn_tallocr_remove_block(chunk, best_fit_block);
    }

    AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);

#ifdef GGML_ALLOCATOR_DEBUG
    add_allocated_tensor(alloc, addr, tensor);
    size_t cur_max = addr.offset + size;
    if (cur_max > chunk->max_size) {
        // sort allocated_tensors by chunk/offset
        for (int i = 0; i < 1024; i++) {
            for (int j = i + 1; j < 1024; j++) {
                if (ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) {
                    const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
                    struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr;
                    alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
                    alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr;
                    alloc->allocated_tensors[j].tensor = tmp_tensor;
                    alloc->allocated_tensors[j].addr = tmp_addr;
                }
            }
        }
        GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0);
        for (int i = 0; i < 1024; i++) {
            if (alloc->allocated_tensors[i].tensor) {
                GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
                    alloc->allocated_tensors[i].addr.chunk,
                    alloc->allocated_tensors[i].addr.offset,
                    alloc->allocated_tensors[i].addr.offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
                    ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
            }
        }
        GGML_LOG_DEBUG("\n");
    }
#endif

    chunk->max_size = MAX(chunk->max_size, addr.offset + size);

    return addr;

    GGML_UNUSED(tensor);
}

// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) {
    size = aligned_offset(NULL, size, alloc->alignment);

    struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];

    // see if we can merge with an existing block
    for (int i = 0; i < chunk->n_free_blocks; i++) {
        struct free_block * block = &chunk->free_blocks[i];
        // check if ptr is at the end of the block
        if (block->offset + block->size == addr.offset) {
            block->size += size;
            // check if we can merge with the next block
            if (i < chunk->n_free_blocks - 1) {
                struct free_block * next = &chunk->free_blocks[i+1];
                if (block->offset + block->size == next->offset) {
                    block->size += next->size;
                    ggml_dyn_tallocr_remove_block(chunk, i+1);
                }
            }
            return;
        }
        // check if ptr is at the beginning of the block
        if (addr.offset + size == block->offset) {
            block->offset = addr.offset;
            block->size += size;
            // check if we can merge with the previous block
            if (i > 0) {
                struct free_block * prev = &chunk->free_blocks[i-1];
                if (prev->offset + prev->size == block->offset) {
                    prev->size += block->size;
                    ggml_dyn_tallocr_remove_block(chunk, i);
                }
            }
            return;
        }
    }
    // otherwise, add a new block
    ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
}

static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) {
        free(alloc->chunks[i]);
        alloc->chunks[i] = NULL;
    }
    alloc->n_chunks = 0;

#ifdef GGML_ALLOCATOR_DEBUG
    for (int i = 0; i < 1024; i++) {
        alloc->allocated_tensors[i].tensor = NULL;
    }
#endif
}

static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) {
    struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));

    *alloc = (struct ggml_dyn_tallocr) {
        /*.alignment      = */ alignment,
        /*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
        /*.chunks         = */ {NULL},
        /*.n_chunks       = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG
        /*.allocated_tensors = */ {{0}},
#endif
    };

    ggml_dyn_tallocr_reset(alloc);

    return alloc;
}

static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
    for (int i = 0; i < alloc->n_chunks; ++i) {
        free(alloc->chunks[i]);
    }
    free(alloc);
}

static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc, int chunk) {
    return chunk < alloc->n_chunks ? alloc->chunks[chunk]->max_size : 0;
}


// virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)

struct vbuffer {
    ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS];
};

static void ggml_vbuffer_free(struct vbuffer * buf) {
    if (buf == NULL) {
        return;
    }
    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; ++i) {
        ggml_backend_buffer_free(buf->chunks[i]);
    }
    free(buf);
}

static size_t ggml_vbuffer_chunk_size(struct vbuffer * buf, int chunk) {
    return buf->chunks[chunk] ? ggml_backend_buffer_get_size(buf->chunks[chunk]) : 0;
}

static size_t ggml_vbuffer_size(struct vbuffer * buf) {
    size_t size = 0;
    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
        size += ggml_backend_buffer_get_size(buf->chunks[i]);
    }
    return size;
}

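// allocate one backend buffer per chunk recorded by the dynamic allocator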
static struct vbuffer * ggml_vbuffer_alloc(ggml_backend_buffer_type_t buft, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
    struct vbuffer * buf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer));
    if (buf == NULL) {
        return NULL;
    }

    for (int n = 0; n < talloc->n_chunks; n++) {
        size_t chunk_size = talloc->chunks[n]->max_size;
        buf->chunks[n] = ggml_backend_buft_alloc_buffer(buft, chunk_size);
        if (buf->chunks[n] == NULL) {
            ggml_vbuffer_free(buf);
            return NULL;
        }
        ggml_backend_buffer_set_usage(buf->chunks[n], usage);
    }
    return buf;
}

static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, struct buffer_address buf_addr) {
    void * base = ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]);
    void * addr = (char *)base + buf_addr.offset;
    ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr);
}

static void ggml_vbuffer_reset(struct vbuffer * buf) {
    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
        ggml_backend_buffer_reset(buf->chunks[i]);
    }
}


/////////////////////////////////////

// graph allocator

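// per-tensor bookkeeping used while planning the graph allocation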
struct hash_node {
    int n_children;
    int n_views;
    int buffer_id;
    struct buffer_address addr;
    bool allocated;
};

struct tensor_alloc {
    int buffer_id;
    struct buffer_address addr;
    size_t size_max; // 0 = pre-allocated, unused, or view
};

struct leaf_alloc {
    struct tensor_alloc leaf;
};

struct node_alloc {
    struct tensor_alloc dst;
    struct tensor_alloc src[GGML_MAX_SRC];
};

struct ggml_gallocr {
    ggml_backend_buffer_type_t * bufts; // [n_buffers]
    struct vbuffer ** buffers; // [n_buffers]
    struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
    int n_buffers;

    struct ggml_hash_set hash_set;
    struct hash_node * hash_values; // [hash_set.size]

    struct node_alloc * node_allocs; // [n_nodes]
    int n_nodes;

    struct leaf_alloc * leaf_allocs; // [n_leafs]
    int n_leafs;
};

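// Typical usage (sketch, names of the graphs are illustrative): create the allocator for one or
// more buffer types, reserve once for the worst-case graph, then allocate each graph before evaluation:
//
//   ggml_gallocr_t galloc = ggml_gallocr_new(buft);
//   ggml_gallocr_reserve(galloc, worst_case_graph); // measures the graph and allocates the backend buffers
//   ggml_gallocr_alloc_graph(galloc, graph);        // assigns tensor addresses from the reserved plan
//   ...
//   ggml_gallocr_free(galloc);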
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
    GGML_ASSERT(galloc != NULL);

    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
    GGML_ASSERT(galloc->bufts != NULL);

    galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
    GGML_ASSERT(galloc->buffers != NULL);

    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
    GGML_ASSERT(galloc->buf_tallocs != NULL);

    for (int i = 0; i < n_bufs; i++) {
        galloc->bufts[i] = bufts[i];
        galloc->buffers[i] = NULL;

        // check if the same buffer type is used multiple times and reuse the same allocator
        for (int j = 0; j < i; j++) {
            if (bufts[i] == bufts[j]) {
                galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
                break;
            }
        }

        if (galloc->buf_tallocs[i] == NULL) {
            size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
            size_t max_size = ggml_backend_buft_get_max_size(bufts[i]);
            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment, max_size);
        }
    }
    galloc->n_buffers = n_bufs;

    return galloc;
}

ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
    return ggml_gallocr_new_n(&buft, 1);
}

void ggml_gallocr_free(ggml_gallocr_t galloc) {
    if (galloc == NULL) {
        return;
    }

    for (int i = 0; i < galloc->n_buffers; i++) {
        if (galloc->buffers != NULL) {
            // skip if already freed
            bool freed = false;
            for (int j = 0; j < i; j++) {
                if (galloc->buffers[j] == galloc->buffers[i]) {
                    freed = true;
                    break;
                }
            }
            if (!freed) {
                ggml_vbuffer_free(galloc->buffers[i]);
            }
        }
        if (galloc->buf_tallocs != NULL) {
            // skip if already freed
            bool freed = false;
            for (int j = 0; j < i; j++) {
                if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
                    freed = true;
                    break;
                }
            }
            if (!freed) {
                ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
            }
        }
    }

    ggml_hash_set_free(&galloc->hash_set);
    free(galloc->hash_values);
    free(galloc->bufts);
    free(galloc->buffers);
    free(galloc->buf_tallocs);
    free(galloc->node_allocs);
    free(galloc->leaf_allocs);
    free(galloc);
}

typedef struct ggml_gallocr * ggml_gallocr_t;

static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
    return &galloc->hash_values[i];
}

static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    return ggml_gallocr_hash_get(galloc, t)->allocated;
}

static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    return t->data != NULL // tensor data already set externally
        || t->buffer       // tensor on external buffer (but not yet allocated)
        || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
}

// free the extra space at the end if the new tensor is smaller
static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_tensor * node, struct ggml_tensor * parent) {
    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
    struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);

    size_t parent_size = ggml_backend_buft_get_alloc_size(galloc->bufts[p_hn->buffer_id], parent);
    size_t node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);

    GGML_ASSERT(parent_size >= node_size);

    // note: we want the chunks to remain aligned after the freeing
    struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
    parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment);
    node_size = aligned_offset(NULL, node_size, p_alloc->alignment);

    if (parent_size > node_size) {
        struct buffer_address p_addr = p_hn->addr;
        p_addr.offset += node_size;
        size_t extra_size = parent_size - node_size;
        AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
        ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size);
    }
}

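// assign an address to `node`, either by reusing the buffer of one of its parents (inplace)
// or by allocating new space from the dynamic allocator of `buffer_id`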
static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
    GGML_ASSERT(buffer_id >= 0);
    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);

    if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
        hn->allocated = true;
        assert(hn->addr.offset == 0);

        // try to reuse a parent's buffer (inplace)
        if (ggml_op_can_inplace(node->op)) {
            for (int i = 0; i < GGML_MAX_SRC; i++) {
                struct ggml_tensor * parent = node->src[i];
                if (parent == NULL) {
                    continue;
                }

                // if the parent's data is external, then we cannot re-use it
                if (!ggml_gallocr_is_own(galloc, parent)) {
                    AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
                    continue;
                }

                // outputs cannot be reused
                if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
                    AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
                    continue;
                }

                if (!ggml_are_same_layout(node, parent)) {
                    AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
                    continue;
                }

                struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
                if (p_hn->n_children == 1 && p_hn->n_views == 0) {
                    if (ggml_is_view(parent)) {
                        struct ggml_tensor * view_src = parent->view_src;
                        struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
                        if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                            AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
                            assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset);
                            hn->buffer_id = p_hn->buffer_id;
                            hn->addr = p_hn->addr;
                            p_hn->allocated = false; // avoid freeing the parent
                            view_src_hn->allocated = false;
                            ggml_gallocr_free_extra_space(galloc, node, view_src);
                            return;
                        }
                    } else {
                        AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                        hn->buffer_id = p_hn->buffer_id;
                        hn->addr = p_hn->addr;
                        p_hn->allocated = false; // avoid freeing the parent
                        ggml_gallocr_free_extra_space(galloc, node, parent);
                        return;
                    }
                }
            }
        }
        // allocate tensor from the buffer
        struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
        ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
        size_t size = ggml_backend_buft_get_alloc_size(buft, node);
        hn->buffer_id = buffer_id;
        hn->addr = ggml_dyn_tallocr_alloc(alloc, size, node);
    }
}

static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
    // graph outputs are never freed
    if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
        AT_PRINTF("not freeing output %s\n", node->name);
        return;
    }

    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
    int buffer_id = hn->buffer_id;
    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
    size_t size = ggml_backend_buft_get_alloc_size(buft, node);

    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
            __func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks);
#ifdef GGML_ALLOCATOR_DEBUG
    remove_allocated_tensor(alloc, hn->addr, node);
#endif

    ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size);
    hn->allocated = false;
}

static int get_node_buffer_id(const int * node_buffer_ids, int i) {
    return node_buffer_ids ? node_buffer_ids[i] : 0;
}

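// plan the allocation of the whole graph: walk the nodes in execution order, allocating each
// tensor when it is first needed and freeing it once its last consumer has been processed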
static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
    // clear hash tables
    ggml_hash_set_reset(&galloc->hash_set);
    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);

    // allocate leafs
    // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
    }

    // count number of children and views
    // allocate other graph inputs and leafs first to avoid overwriting them
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];

        // TODO: better way to add external dependencies
        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
        // itself is never used and should not be considered a dependency
        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
            struct ggml_tensor * view_src = node->view_src;
            ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
        }

        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
        }

        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }

            ggml_gallocr_hash_get(galloc, src)->n_children += 1;

            // allocate explicit inputs
            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
            }
        }
    }

    // allocate tensors
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        int buffer_id = get_node_buffer_id(node_buffer_ids, i);

        // allocate parents (only leafs need to be allocated at this point)
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                continue;
            }
            ggml_gallocr_allocate_node(galloc, parent, buffer_id);
        }

        // allocate node
        ggml_gallocr_allocate_node(galloc, node, buffer_id);

        AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                continue;
            }
            AT_PRINTF("%s", parent->name);
            if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
                AT_PRINTF(", ");
            }
        }
        AT_PRINTF("\n");

        // update parents
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                continue;
            }
            struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
            p_hn->n_children -= 1;

            AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
                parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);

            if (p_hn->n_children == 0 && p_hn->n_views == 0) {
                if (ggml_is_view(parent)) {
                    struct ggml_tensor * view_src = parent->view_src;
                    struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
                    view_src_hn->n_views -= 1;
                    AT_PRINTF("view_src %s: %d children, %d views\n",
                        view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                    if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
                        ggml_gallocr_free_node(galloc, view_src);
                    }
                }
                else if (p_hn->allocated) {
                    ggml_gallocr_free_node(galloc, parent);
                }
            }
            AT_PRINTF("\n");
        }
    }
}

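// plan the graph allocation (measurement pass) and record the per-node and per-leaf assignments;
// unless no_alloc is set, (re)allocate the backend buffers so that the plan fits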
static bool ggml_gallocr_reserve_n_impl(
        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) {
    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
    // add 25% margin to avoid hash collisions
    min_hash_size += min_hash_size / 4;

    // initialize hash table
    if (galloc->hash_set.size < min_hash_size) {
        ggml_hash_set_free(&galloc->hash_set);
        galloc->hash_set = ggml_hash_set_new(min_hash_size);
        GGML_ASSERT(galloc->hash_set.keys != NULL);

        free(galloc->hash_values);
        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
        GGML_ASSERT(galloc->hash_values != NULL);
    }

    // reset allocators
    for (int i = 0; i < galloc->n_buffers; i++) {
        ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
    }

    // allocate in hash table
    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);

    // set the node_allocs from the hash table
    if (galloc->n_nodes < graph->n_nodes) {
        free(galloc->node_allocs);
        galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
        GGML_ASSERT(galloc->node_allocs != NULL);
    }
    galloc->n_nodes = graph->n_nodes;
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
        if (node->view_src || node->data) {
            node_alloc->dst.buffer_id = -1;
            node_alloc->dst.addr = GGML_BUFFER_ADDRESS_INVALID;
            node_alloc->dst.size_max = 0;
        } else {
            struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
            node_alloc->dst.buffer_id = hn->buffer_id;
            node_alloc->dst.addr = hn->addr;
            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (!src || src->view_src || src->data) {
                node_alloc->src[j].buffer_id = -1;
                node_alloc->src[j].addr = GGML_BUFFER_ADDRESS_INVALID;
                node_alloc->src[j].size_max = 0;
            } else {
                struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
                node_alloc->src[j].buffer_id = hn->buffer_id;
                node_alloc->src[j].addr = hn->addr;
                node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
            }
        }
    }
    if (galloc->n_leafs < graph->n_leafs) {
        free(galloc->leaf_allocs);
        galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
        GGML_ASSERT(galloc->leaf_allocs != NULL);
    }
    galloc->n_leafs = graph->n_leafs;
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
        if (leaf->view_src || leaf->data) {
            galloc->leaf_allocs[i].leaf.buffer_id = -1;
            galloc->leaf_allocs[i].leaf.addr = GGML_BUFFER_ADDRESS_INVALID;
            galloc->leaf_allocs[i].leaf.size_max = 0;
        } else {
            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
            galloc->leaf_allocs[i].leaf.addr = hn->addr;
            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
        }
    }

    // reallocate buffers if needed
    for (int i = 0; i < galloc->n_buffers; i++) {
        // if the buffer type is used multiple times, we reuse the same buffer
        for (int j = 0; j < i; j++) {
            if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
                galloc->buffers[i] = galloc->buffers[j];
                break;
            }
        }

        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
        bool realloc = galloc->buffers[i] == NULL;
        size_t new_size = 0;
        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
            size_t cur_chunk_size = galloc->buffers[i] ? ggml_vbuffer_chunk_size(galloc->buffers[i], c) : 0;
            size_t new_chunk_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i], c);
            new_size += new_chunk_size;
            if (new_chunk_size > cur_chunk_size) {
                realloc = true;
            }
        }
        if (realloc) {
#ifndef NDEBUG
            {
                size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
                if (cur_size > 0) {
                    GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
                        __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
                }
            }
#endif
            ggml_vbuffer_free(galloc->buffers[i]);
            if (no_alloc) {
                galloc->buffers[i] = NULL;
            } else {
                galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
                if (galloc->buffers[i] == NULL) {
                    GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                    return false;
                }
            }
        }
    }

    return true;
}

void ggml_gallocr_reserve_n_size(
        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) {
    GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true));
    for (int i = 0; i < galloc->n_buffers; i++) {
        sizes[i] = 0;
        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
            sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size;
        }
    }
}

bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
    return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false);
}

bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
}

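// apply a recorded tensor assignment: initialize views and bind allocated tensors to their reserved addresses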
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
    int buffer_id = tensor_alloc->buffer_id;
    assert(tensor->data || tensor->view_src || ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);

    if (tensor->view_src != NULL) {
        if (tensor->buffer == NULL) {
            assert(tensor_alloc->addr.offset == SIZE_MAX);
            if (tensor->view_src->buffer == NULL) {
                // this tensor was allocated without ggml-backend
                return;
            }
            ggml_backend_view_init(tensor);
        }
    } else {
        if (tensor->data == NULL) {
            assert(tensor_alloc->addr.offset != SIZE_MAX);
            assert(ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
            ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr);
        } else {
            if (tensor->buffer == NULL) {
                // this tensor was allocated without ggml-backend
                return;
            }
        }
    }
}

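// returns true if the assignment recorded at reserve time is still large enough for `node`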
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
    size_t node_size = 0;
    if (!node->data && !node->view_src) {
        // If we previously had data but don't now then reallocate
        if (talloc->buffer_id < 0) {
            return false;
        }
        node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
    }
    return talloc->size_max >= node_size;
}

static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    if (galloc->n_nodes != graph->n_nodes) {
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
#endif
        return true;
    }

    if (galloc->n_leafs != graph->n_leafs) {
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
#endif
        return true;
    }

    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];

        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
#ifndef NDEBUG
            GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
#endif
            return true;
        }

        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
#ifndef NDEBUG
                GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
#endif
                return true;
            }
        }
    }

    return false;
}

bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    if (ggml_gallocr_needs_realloc(galloc, graph)) {
        if (galloc->n_buffers == 1) {
#ifndef NDEBUG
            GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
#endif
            if (!ggml_gallocr_reserve(galloc, graph)) {
                return false;
            }
        } else {
#ifndef NDEBUG
            GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
#endif
            return false;
        }
    }

    // reset buffers
    for (int i = 0; i < galloc->n_buffers; i++) {
        if (galloc->buffers[i] != NULL) {
            ggml_vbuffer_reset(galloc->buffers[i]);
        }
    }

    // allocate the graph tensors from the previous assignments
    // leafs
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
    }
    // nodes
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
        }
        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
    }

    return true;
}

size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);

    if (galloc->buffers[buffer_id] == NULL) {
        return 0;
    }

    for (int i = 0; i < buffer_id; i++) {
        if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
            // this buffer is the same as a previous one due to the same buffer type being used multiple times
            // only return the buffer size the first time it appears to avoid double counting
            return 0;
        }
    }

    return ggml_vbuffer_size(galloc->buffers[buffer_id]);
}

// utils

static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
    for (size_t i = 0; i < *n_buffers; i++) {
        ggml_backend_buffer_free((*buffers)[i]);
    }
    free(*buffers);
}

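// allocate a backend buffer of `size` and place all tensors in [first, last) into it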
static bool alloc_tensor_range(struct ggml_context * ctx,
        struct ggml_tensor * first, struct ggml_tensor * last,
        ggml_backend_buffer_type_t buft, size_t size,
        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {

    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
    if (buffer == NULL) {
        GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
        free_buffers(buffers, n_buffers);
        return false;
    }

    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
    (*buffers)[(*n_buffers)++] = buffer;

    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);

    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
        enum ggml_status status = GGML_STATUS_SUCCESS;
        if (t->data == NULL) {
            if (t->view_src == NULL) {
                status = ggml_tallocr_alloc(&tallocr, t);
            } else if (t->buffer == NULL) {
                status = ggml_backend_view_init(t);
            }
        } else {
            if (t->view_src != NULL && t->buffer == NULL) {
                // view of a pre-allocated tensor
                status = ggml_backend_view_init(t);
            }
        }
        if (status != GGML_STATUS_SUCCESS) {
            GGML_LOG_ERROR("%s: failed to initialize tensor %s\n", __func__, t->name);
            free_buffers(buffers, n_buffers);
            return false;
        }
    }

    return true;
}

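// allocate all unallocated tensors of a context, splitting them into multiple buffers if the total
// size exceeds the maximum buffer size of the buffer type; with no_alloc only the total size is computed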
static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl(
        struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) {
    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

    size_t alignment = ggml_backend_buft_get_alignment(buft);
    size_t max_size = ggml_backend_buft_get_max_size(buft);

    ggml_backend_buffer_t * buffers = NULL;
    size_t n_buffers = 0;
    *nbytes_total = 0;

    size_t cur_buf_size = 0;
    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        size_t this_size = 0;
        if (t->data == NULL && t->view_src == NULL) {
            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
        }

        if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
            // allocate tensors in the current buffer
            if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                return NULL;
            }
            first = t;
            *nbytes_total += cur_buf_size;
            cur_buf_size = this_size;
        } else {
            cur_buf_size += this_size;
        }
    }

    // allocate remaining tensors
    if (cur_buf_size > 0) {
        *nbytes_total += cur_buf_size;
        if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
            return NULL;
        }
    }

    if (no_alloc) {
        return NULL;
    }

    if (n_buffers == 0) {
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
#endif
        GGML_ASSERT(!buffers);
        return NULL;
    }

    ggml_backend_buffer_t buffer;
    if (n_buffers == 1) {
        buffer = buffers[0];
    } else {
        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
    }
    if (buffers) {
        free(buffers); // can be NULL if context is empty or no_alloc
    }
    return buffer;
}

size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
    size_t nbytes_total = 0;
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true);
    GGML_ASSERT(!buf);
    return nbytes_total;
}

ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
    size_t nbytes_total = 0;
    return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
}

ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
}