slaren committed
Commit 5cffd6f · unverified · 1 Parent(s): 469988b

ggml-alloc : v3 (ggml/727)


* ggml-alloc v3

ggml-ci

* fix ci

ggml-ci

* whisper : check for backend buffer allocation failures

* whisper : avoid leaks when initialization fails

* cleanup

ggml-ci

* style fixes

ggml-ci

Files changed (7)
  1. ggml-alloc.c +563 -490
  2. ggml-alloc.h +39 -65
  3. ggml-backend.c +225 -258
  4. ggml-backend.h +5 -10
  5. ggml.c +19 -9
  6. ggml.h +15 -3
  7. whisper.cpp +175 -225
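
The changes below replace the legacy ggml_allocr wrapper and the measure allocators with a new graph allocator, ggml_gallocr, which is created from one or more backend buffer types, reserves its backend buffers from a worst-case graph, and then assigns tensor offsets on every allocation. As a rough, non-authoritative sketch of how a caller might use the new API (build_graph() and the CPU backend choice are placeholders, not part of this commit):

#include <stdio.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// hedged usage sketch for the v3 graph allocator introduced in this commit
static void example_graph_alloc(struct ggml_context * ctx) {
    ggml_backend_t backend = ggml_backend_cpu_init();

    // one dynamic allocator per buffer type; here a single CPU buffer type
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));

    // reserve the compute buffer from a worst-case graph, then assign tensor
    // addresses before evaluation; with a single buffer the allocator
    // reallocates automatically if a later graph needs more memory
    struct ggml_cgraph * gf = build_graph(ctx); // hypothetical graph builder
    if (!ggml_gallocr_reserve(galloc, gf) || !ggml_gallocr_alloc_graph(galloc, gf)) {
        fprintf(stderr, "failed to allocate the compute buffer\n");
    } else {
        fprintf(stderr, "compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
    }

    ggml_gallocr_free(galloc);
    ggml_backend_free(backend);
}
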
ggml-alloc.c CHANGED
@@ -17,6 +17,50 @@
17
  //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
18
  #define AT_PRINTF(...)
19
20
  // TODO: GGML_PAD ?
21
  static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
22
  assert(alignment && !(alignment & (alignment - 1))); // power of 2
@@ -24,66 +68,102 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
24
  return offset + align;
25
  }
26
27
  struct free_block {
28
- void * addr;
29
  size_t size;
30
  };
31
 
32
- struct ggml_tallocr {
33
- struct ggml_backend_buffer * buffer;
34
- bool buffer_owned;
35
- void * base;
36
  size_t alignment;
37
-
38
  int n_free_blocks;
39
  struct free_block free_blocks[MAX_FREE_BLOCKS];
40
-
41
  size_t max_size;
42
 
43
- bool measure;
44
-
45
  #ifdef GGML_ALLOCATOR_DEBUG
46
- struct ggml_tensor * allocated_tensors[1024];
 
 
 
47
  #endif
48
  };
49
 
50
  #ifdef GGML_ALLOCATOR_DEBUG
51
- static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
52
  for (int i = 0; i < 1024; i++) {
53
- if (alloc->allocated_tensors[i] == NULL) {
54
- alloc->allocated_tensors[i] = tensor;
 
55
  return;
56
  }
57
  }
58
  GGML_ASSERT(!"out of allocated_tensors");
59
  }
60
- static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
61
  for (int i = 0; i < 1024; i++) {
62
- if (alloc->allocated_tensors[i] == tensor ||
63
- (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
64
- alloc->allocated_tensors[i] = NULL;
65
  return;
66
  }
67
  }
68
- printf("tried to free tensor %s not found\n", tensor->name);
69
  GGML_ASSERT(!"tensor not found");
70
  }
71
  #endif
72
 
73
- // check if a tensor is allocated by this buffer
74
- static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
75
- return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
76
- }
77
-
78
- static bool ggml_is_view(struct ggml_tensor * t) {
79
- return t->view_src != NULL;
80
- }
81
-
82
- void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
83
- GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
84
- GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
85
-
86
- size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
87
  size = aligned_offset(NULL, size, alloc->alignment);
88
 
89
  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -109,16 +189,17 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
109
  if (block->size >= size) {
110
  best_fit_block = alloc->n_free_blocks - 1;
111
  } else {
112
- fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
113
- __func__, tensor->name, size, max_avail);
 
114
  GGML_ASSERT(!"not enough space in the buffer");
115
- return;
116
  }
117
  }
118
 
119
  struct free_block * block = &alloc->free_blocks[best_fit_block];
120
- void * addr = block->addr;
121
- block->addr = (char*)block->addr + size;
122
  block->size -= size;
123
  if (block->size == 0) {
124
  // remove block if empty
@@ -128,59 +209,63 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
128
  }
129
  }
130
 
131
- AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
132
-
133
- tensor->data = addr;
134
- tensor->buffer = alloc->buffer;
135
- if (!alloc->measure) {
136
- ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
137
- }
138
 
139
  #ifdef GGML_ALLOCATOR_DEBUG
140
- add_allocated_tensor(alloc, tensor);
141
- size_t cur_max = (char*)addr - (char*)alloc->base + size;
142
  if (cur_max > alloc->max_size) {
143
- printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  for (int i = 0; i < 1024; i++) {
145
- if (alloc->allocated_tensors[i]) {
146
- printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
 
 
 
147
  }
148
  }
149
- printf("\n");
150
  }
151
  #endif
152
 
153
- alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size);
154
- }
155
 
156
- // this is a very naive implementation, but for our case the number of free blocks should be very small
157
- static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
158
- if (ggml_tallocr_is_own(alloc, tensor) == false) {
159
- // the tensor was not allocated in this buffer
160
- // this can happen because the graph allocator will try to free weights and other tensors from different buffers
161
- // the easiest way to deal with this is just to ignore it
162
- // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
163
- return;
164
- }
165
 
166
- void * ptr = tensor->data;
 
167
 
168
- size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
 
169
  size = aligned_offset(NULL, size, alloc->alignment);
170
- AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
 
171
 
172
  #ifdef GGML_ALLOCATOR_DEBUG
173
- remove_allocated_tensor(alloc, tensor);
174
  #endif
175
 
176
  // see if we can merge with an existing block
177
  for (int i = 0; i < alloc->n_free_blocks; i++) {
178
  struct free_block * block = &alloc->free_blocks[i];
179
  // check if ptr is at the end of the block
180
- if ((char*)block->addr + block->size == ptr) {
181
  block->size += size;
182
  // check if we can merge with the next block
183
- if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
184
  block->size += alloc->free_blocks[i+1].size;
185
  alloc->n_free_blocks--;
186
  for (int j = i+1; j < alloc->n_free_blocks; j++) {
@@ -190,11 +275,11 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
190
  return;
191
  }
192
  // check if ptr is at the beginning of the block
193
- if ((char*)ptr + size == block->addr) {
194
- block->addr = ptr;
195
  block->size += size;
196
  // check if we can merge with the previous block
197
- if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
198
  alloc->free_blocks[i-1].size += block->size;
199
  alloc->n_free_blocks--;
200
  for (int j = i; j < alloc->n_free_blocks; j++) {
@@ -208,7 +293,7 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
208
  GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
209
  // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
210
  int insert_pos = 0;
211
- while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
212
  insert_pos++;
213
  }
214
  // shift all blocks from insert_pos onward to make room for the new block
@@ -216,337 +301,271 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
216
  alloc->free_blocks[i] = alloc->free_blocks[i-1];
217
  }
218
  // insert the new block
219
- alloc->free_blocks[insert_pos].addr = ptr;
220
  alloc->free_blocks[insert_pos].size = size;
221
  alloc->n_free_blocks++;
 
 
222
  }
223
 
224
- void ggml_tallocr_reset(ggml_tallocr_t alloc) {
225
  alloc->n_free_blocks = 1;
226
- size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment);
227
- alloc->free_blocks[0].addr = (char *)alloc->base + align_offset;
228
-
229
- if (alloc->measure) {
230
- alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
231
- } else {
232
- alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
233
- ggml_backend_buffer_reset(alloc->buffer);
234
- }
235
  }
236
 
237
- ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
238
- struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);
239
-
240
- ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
241
 
242
- *alloc = (struct ggml_tallocr) {
243
- /*.buffer = */ buffer,
244
- /*.buffer_owned = */ true,
245
- /*.base = */ ggml_backend_buffer_get_base(buffer),
246
  /*.alignment = */ alignment,
247
  /*.n_free_blocks = */ 0,
248
  /*.free_blocks = */ {{0}},
249
  /*.max_size = */ 0,
250
- /*.measure = */ false,
251
  #ifdef GGML_ALLOCATOR_DEBUG
252
- /*.allocated_tensors = */ {0},
253
  #endif
254
  };
255
 
256
- ggml_tallocr_reset(alloc);
257
-
258
- return alloc;
259
- }
260
-
261
- ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
262
- ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment);
263
- alloc->measure = true;
264
 
265
  return alloc;
266
  }
267
 
268
- ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
269
- // create a backend buffer to get the correct tensor allocation sizes
270
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
271
-
272
- // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
273
- ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
274
- alloc->buffer_owned = true;
275
- alloc->measure = true;
276
- ggml_tallocr_reset(alloc);
277
- return alloc;
278
- }
279
-
280
- ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
281
- return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
282
- }
283
-
284
- ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
285
- // create a backend buffer to get the correct tensor allocation sizes
286
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
287
- ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
288
- alloc->buffer_owned = true;
289
- return alloc;
290
- }
291
-
292
- ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
293
- return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
294
- }
295
-
296
- ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
297
- ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
298
-
299
- *alloc = (struct ggml_tallocr) {
300
- /*.buffer = */ buffer,
301
- /*.buffer_owned = */ false,
302
- /*.base = */ ggml_backend_buffer_get_base(buffer),
303
- /*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
304
- /*.n_free_blocks = */ 0,
305
- /*.free_blocks = */ {{0}},
306
- /*.max_size = */ 0,
307
- /*.measure = */ false,
308
- #ifdef GGML_ALLOCATOR_DEBUG
309
- /*.allocated_tensors = */ {0},
310
- #endif
311
- };
312
-
313
- ggml_tallocr_reset(alloc);
314
-
315
- return alloc;
316
- }
317
-
318
- struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
319
- return alloc->buffer;
320
- }
321
-
322
- void ggml_tallocr_free(ggml_tallocr_t alloc) {
323
- if (alloc == NULL) {
324
- return;
325
- }
326
-
327
- if (alloc->buffer_owned) {
328
- ggml_backend_buffer_free(alloc->buffer);
329
- }
330
  free(alloc);
331
  }
332
 
333
- bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
334
- return alloc->measure;
335
  }
336
 
337
- size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
338
- // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
339
- // to avoid this, we add a 10% margin to the buffer size
340
- return alloc->max_size + alloc->max_size/10;
341
- }
342
 
343
  // graph allocator
344
 
345
  struct hash_node {
346
  int n_children;
347
  int n_views;
348
  };
349
 
350
  struct ggml_gallocr {
351
- ggml_tallocr_t talloc;
 
 
 
 
352
  struct ggml_hash_set hash_set;
353
- struct hash_node * hash_values;
354
- size_t hash_values_size;
355
- ggml_tallocr_t * hash_allocs;
356
- int * parse_seq;
357
- int parse_seq_len;
358
  };
359
 
360
- ggml_gallocr_t ggml_gallocr_new(void) {
361
- ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr));
362
-
363
- *galloc = (struct ggml_gallocr) {
364
- /*.talloc = */ NULL,
365
- /*.hash_set = */ {0},
366
- /*.hash_values = */ NULL,
367
- /*.hash_values_size = */ 0,
368
- /*.hash_allocs = */ NULL,
369
- /*.parse_seq = */ NULL,
370
- /*.parse_seq_len = */ 0,
371
- };
 
 
 
 
 
 
 
 
372
 
373
  return galloc;
374
  }
375
 
 
 
 
 
376
  void ggml_gallocr_free(ggml_gallocr_t galloc) {
377
  if (galloc == NULL) {
378
  return;
379
  }
380
 
381
- if (galloc->hash_set.keys != NULL) {
382
- free(galloc->hash_set.keys);
383
- }
384
- if (galloc->hash_values != NULL) {
385
- free(galloc->hash_values);
386
- }
387
- if (galloc->hash_allocs != NULL) {
388
- free(galloc->hash_allocs);
389
- }
390
- if (galloc->parse_seq != NULL) {
391
- free(galloc->parse_seq);
392
  }
 
 
 
 
 
 
 
393
  free(galloc);
394
  }
395
 
396
- void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) {
397
- free(galloc->parse_seq);
398
- galloc->parse_seq = malloc(sizeof(int) * n);
399
 
400
- for (int i = 0; i < n; i++) {
401
- galloc->parse_seq[i] = list[i];
402
- }
403
- galloc->parse_seq_len = n;
404
- }
405
-
406
- static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
407
  size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
408
  return &galloc->hash_values[i];
409
  }
410
 
411
- static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
412
- if (a->type != b->type) {
413
- return false;
414
- }
415
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
416
- if (a->ne[i] != b->ne[i]) {
417
- return false;
418
- }
419
- if (a->nb[i] != b->nb[i]) {
420
- return false;
421
- }
422
- }
423
- return true;
424
  }
425
 
426
- static bool ggml_op_can_inplace(enum ggml_op op) {
427
- switch (op) {
428
- case GGML_OP_SCALE:
429
- case GGML_OP_DIAG_MASK_ZERO:
430
- case GGML_OP_DIAG_MASK_INF:
431
- case GGML_OP_ADD:
432
- case GGML_OP_ADD1:
433
- case GGML_OP_SUB:
434
- case GGML_OP_MUL:
435
- case GGML_OP_DIV:
436
- case GGML_OP_SQR:
437
- case GGML_OP_SQRT:
438
- case GGML_OP_LOG:
439
- case GGML_OP_UNARY:
440
- case GGML_OP_ROPE:
441
- case GGML_OP_RMS_NORM:
442
- case GGML_OP_SOFT_MAX:
443
- return true;
444
-
445
- default:
446
- return false;
447
- }
448
  }
449
 
450
- static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * node) {
451
- if (galloc->talloc != NULL) {
452
- return galloc->talloc;
453
- }
454
-
455
- return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
456
  }
457
 
458
- static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
459
- ggml_tallocr_t alloc = node_tallocr(galloc, view);
460
-
461
- GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
462
- if (update_backend) {
463
- view->backend = view->view_src->backend;
464
- }
465
- // views are initialized in the alloc buffer rather than the view_src buffer
466
- view->buffer = alloc->buffer;
467
- view->data = (char *)view->view_src->data + view->view_offs;
468
 
469
- assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
 
 
470
 
471
- if (!alloc->measure) {
472
- ggml_backend_buffer_init_tensor(alloc->buffer, view);
473
- }
474
- }
 
 
 
475
 
476
- static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
477
- ggml_tallocr_t alloc = node_tallocr(galloc, node);
 
 
 
478
 
479
- if (node->data == NULL) {
480
- if (ggml_is_view(node)) {
481
- init_view(galloc, node, true);
482
- } else {
483
- // see if we can reuse a parent's buffer (inplace)
484
- if (ggml_op_can_inplace(node->op)) {
485
- for (int i = 0; i < GGML_MAX_SRC; i++) {
486
- struct ggml_tensor * parent = node->src[i];
487
- if (parent == NULL) {
488
- break;
489
- }
490
 
491
- // if the node's data is external, then we cannot re-use it
492
- if (ggml_tallocr_is_own(alloc, parent) == false) {
493
- AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
494
- continue;
495
- }
496
 
497
- struct hash_node * p_hn = hash_get(galloc, parent);
498
- if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
499
- if (ggml_is_view(parent)) {
500
- struct ggml_tensor * view_src = parent->view_src;
501
- struct hash_node * view_src_hn = hash_get(galloc, view_src);
502
- if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
503
- // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
504
- // the parent's data that it will need later (same layout requirement). the problem is that then
505
- // we cannot free the tensor because the original address of the allocation is lost.
506
- // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
507
- // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
508
- AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
509
- node->view_src = view_src;
510
- view_src_hn->n_views += 1;
511
- init_view(galloc, node, false);
512
- return;
513
- }
514
- } else {
515
- AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
516
- node->view_src = parent;
517
- p_hn->n_views += 1;
518
- init_view(galloc, node, false);
519
  return;
520
  }
 
 
 
 
 
 
521
  }
522
  }
523
  }
524
- ggml_tallocr_alloc(alloc, node);
525
  }
 
 
 
 
 
 
 
 
526
  }
527
  }
528
 
529
- static void free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
530
- ggml_tallocr_t alloc = node_tallocr(galloc, node);
 
 
 
 
531
 
532
- ggml_tallocr_free_tensor(alloc, node);
 
 
 
 
 
 
533
  }
534
 
535
- static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) {
536
- const int * parse_seq = galloc->parse_seq;
537
- int parse_seq_len = galloc->parse_seq_len;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
538
 
539
  // count number of children and views
540
- for (int i = 0; i < gf->n_nodes; i++) {
541
- struct ggml_tensor * node = gf->nodes[i];
542
 
543
  if (ggml_is_view(node)) {
544
  struct ggml_tensor * view_src = node->view_src;
545
- hash_get(galloc, view_src)->n_views += 1;
546
- if (node->buffer == NULL && node->data != NULL) {
547
- // view of a pre-allocated tensor, didn't call init_view() yet
548
- init_view(galloc, node, true);
549
- }
550
  }
551
 
552
  for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -554,227 +573,283 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
554
  if (parent == NULL) {
555
  break;
556
  }
557
- hash_get(galloc, parent)->n_children += 1;
558
- if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
559
- init_view(galloc, parent, true);
560
- }
561
  }
562
  }
563
 
564
  // allocate tensors
565
- // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
566
- int last_barrier_pos = 0;
567
- int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes;
568
-
569
- for (int ind = 0; ind < n_nodes; ind++) {
570
- // allocate a node if there is no parse_seq or this is not a barrier
571
- if (parse_seq_len == 0 || parse_seq[ind] != -1) {
572
- int i = parse_seq_len ? parse_seq[ind] : ind;
573
- struct ggml_tensor * node = gf->nodes[i];
574
-
575
- // allocate parents (leafs)
576
- for (int j = 0; j < GGML_MAX_SRC; j++) {
577
- struct ggml_tensor * parent = node->src[j];
578
- if (parent == NULL) {
579
- break;
580
- }
581
- allocate_node(galloc, parent);
582
  }
 
 
583
 
584
- // allocate node
585
- allocate_node(galloc, node);
586
 
587
- AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
588
- for (int j = 0; j < GGML_MAX_SRC; j++) {
589
- struct ggml_tensor * parent = node->src[j];
590
- if (parent == NULL) {
591
- break;
592
- }
593
- AT_PRINTF("%s", parent->name);
594
- if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
595
- AT_PRINTF(", ");
596
- }
597
  }
598
- AT_PRINTF("\n");
599
  }
 
600
 
601
  // update parents
602
- // update immediately if there is no parse_seq
603
- // update only at barriers if there is parse_seq
604
- if ((parse_seq_len == 0) || parse_seq[ind] == -1) {
605
- int update_start = parse_seq_len ? last_barrier_pos : ind;
606
- int update_end = parse_seq_len ? ind : ind + 1;
607
- for (int i = update_start; i < update_end; i++) {
608
- int node_i = parse_seq_len ? parse_seq[i] : i;
609
- struct ggml_tensor * node = gf->nodes[node_i];
610
-
611
- for (int j = 0; j < GGML_MAX_SRC; j++) {
612
- struct ggml_tensor * parent = node->src[j];
613
- if (parent == NULL) {
614
- break;
615
- }
616
- struct hash_node * p_hn = hash_get(galloc, parent);
617
- p_hn->n_children -= 1;
618
-
619
- //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
620
-
621
- if (p_hn->n_children == 0 && p_hn->n_views == 0) {
622
- if (ggml_is_view(parent)) {
623
- struct ggml_tensor * view_src = parent->view_src;
624
- struct hash_node * view_src_hn = hash_get(galloc, view_src);
625
- view_src_hn->n_views -= 1;
626
- AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
627
- if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) {
628
- free_node(galloc, view_src);
629
- }
630
- }
631
- else {
632
- free_node(galloc, parent);
633
- }
634
  }
635
  }
 
 
 
636
  }
637
  AT_PRINTF("\n");
638
- if (parse_seq_len) {
639
- last_barrier_pos = ind + 1;
640
- }
641
  }
642
  }
643
  }
644
 
645
- size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph) {
646
  size_t hash_size = graph->visited_hash_table.size;
647
 
648
- // check if the hash table is initialized and large enough
649
  if (galloc->hash_set.size < hash_size) {
650
- if (galloc->hash_set.keys != NULL) {
651
- free(galloc->hash_set.keys);
652
- }
653
- if (galloc->hash_values != NULL) {
654
- free(galloc->hash_values);
655
- }
656
- galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
657
  galloc->hash_set.size = hash_size;
658
- galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
 
 
 
 
 
 
 
659
  }
660
 
661
- // reset hash table
662
- memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * hash_size);
663
- memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
664
-
665
- galloc->talloc = talloc;
666
- ggml_tallocr_alloc_graph_impl(galloc, graph);
667
- galloc->talloc = NULL;
668
-
669
- size_t max_size = ggml_tallocr_max_size(talloc);
670
-
671
- return max_size;
672
- }
673
-
674
- void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
675
- const size_t hash_size = hash_set.size;
676
-
677
- GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
678
 
679
- galloc->talloc = NULL;
 
680
 
681
- // alloc hash_values if needed
682
- if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
683
- free(galloc->hash_values);
684
- galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
685
- galloc->hash_values_size = hash_size;
686
  }
687
-
688
- // free hash_set.keys if needed
689
- if (galloc->hash_set.keys != NULL) {
690
- free(galloc->hash_set.keys);
691
  }
692
- galloc->hash_set = hash_set;
693
 
694
- // reset hash values
695
- memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
 
 
696
 
697
- galloc->hash_allocs = hash_node_talloc;
698
-
699
- ggml_tallocr_alloc_graph_impl(galloc, graph);
 
 
 
 
 
 
 
 
 
700
 
701
- // remove unowned resources
702
- galloc->hash_set.keys = NULL;
703
- galloc->hash_allocs = NULL;
704
  }
705
 
706
- // legacy API wrapper
707
-
708
- struct ggml_allocr {
709
- ggml_tallocr_t talloc;
710
- ggml_gallocr_t galloc;
711
- };
712
-
713
- static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
714
- ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
715
- *alloc = (struct ggml_allocr) {
716
- /*.talloc = */ talloc,
717
- /*.galloc = */ ggml_gallocr_new(),
718
- };
719
- return alloc;
720
  }
721
 
722
- ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment) {
723
- return ggml_allocr_new_impl(ggml_tallocr_new(data, size, alignment));
724
- }
725
 
726
- ggml_allocr_t ggml_allocr_new_measure(size_t alignment) {
727
- return ggml_allocr_new_impl(ggml_tallocr_new_measure(alignment));
728
- }
729
 
730
- ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
731
- return ggml_allocr_new_impl(ggml_tallocr_new_from_buffer(buffer));
 
 
 
 
 
 
 
 
732
  }
733
 
734
- ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size) {
735
- return ggml_allocr_new_impl(ggml_tallocr_new_from_backend(backend, size));
 
 
736
  }
737
 
738
- ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend) {
739
- return ggml_allocr_new_impl(ggml_tallocr_new_measure_from_backend(backend));
740
- }
 
 
 
 
741
 
742
- struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc) {
743
- return ggml_tallocr_get_buffer(alloc->talloc);
744
- }
745
 
746
- void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
747
- ggml_gallocr_set_parse_seq(alloc->galloc, list, n);
748
- }
 
 
 
749
 
750
- void ggml_allocr_free(ggml_allocr_t alloc) {
751
- if (alloc == NULL) {
752
- return;
 
 
 
 
 
 
 
 
 
753
  }
754
 
755
- ggml_gallocr_free(alloc->galloc);
756
- ggml_tallocr_free(alloc->talloc);
757
- free(alloc);
758
  }
759
 
760
- bool ggml_allocr_is_measure(ggml_allocr_t alloc) {
761
- return ggml_tallocr_is_measure(alloc->talloc);
762
- }
763
 
764
- void ggml_allocr_reset(ggml_allocr_t alloc) {
765
- ggml_tallocr_reset(alloc->talloc);
766
- }
 
 
 
 
767
 
768
- void ggml_allocr_alloc(ggml_allocr_t alloc, struct ggml_tensor * tensor) {
769
- ggml_tallocr_alloc(alloc->talloc, tensor);
770
- }
 
 
 
 
 
 
 
 
 
 
771
 
772
- size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
773
- return ggml_tallocr_max_size(alloc->talloc);
774
  }
775
 
776
- size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
777
- return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
 
 
 
 
 
778
  }
779
 
780
  // utils
@@ -795,17 +870,17 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
795
  return false;
796
  }
797
 
798
- ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
799
 
800
  for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
801
  if (t->data == NULL) {
802
  if (t->view_src == NULL) {
803
  ggml_tallocr_alloc(tallocr, t);
804
- } else {
805
  ggml_backend_view_init(buffer, t);
806
  }
807
  } else {
808
- if (t->view_src != NULL) {
809
  // view of a pre-allocated tensor
810
  ggml_backend_view_init(buffer, t);
811
  }
@@ -838,7 +913,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
838
  }
839
 
840
  if (this_size > max_size) {
841
- // tensor is too large to fit in a single buffer
842
  fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
843
  __func__, t->name,
844
  ggml_backend_buft_name(buft),
@@ -870,7 +944,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
870
  }
871
 
872
  if (n_buffers == 0) {
873
- // all the tensors in the context are already allocated
874
  #ifndef NDEBUG
875
  fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
876
  #endif
 
17
  //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
18
  #define AT_PRINTF(...)
19
 
20
+
21
+ static bool ggml_is_view(const struct ggml_tensor * t) {
22
+ return t->view_src != NULL;
23
+ }
24
+
25
+ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
26
+ if (a->type != b->type) {
27
+ return false;
28
+ }
29
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
30
+ if (a->ne[i] != b->ne[i]) {
31
+ return false;
32
+ }
33
+ if (a->nb[i] != b->nb[i]) {
34
+ return false;
35
+ }
36
+ }
37
+ return true;
38
+ }
39
+
40
+ static bool ggml_op_can_inplace(enum ggml_op op) {
41
+ switch (op) {
42
+ case GGML_OP_SCALE:
43
+ case GGML_OP_DIAG_MASK_ZERO:
44
+ case GGML_OP_DIAG_MASK_INF:
45
+ case GGML_OP_ADD:
46
+ case GGML_OP_ADD1:
47
+ case GGML_OP_SUB:
48
+ case GGML_OP_MUL:
49
+ case GGML_OP_DIV:
50
+ case GGML_OP_SQR:
51
+ case GGML_OP_SQRT:
52
+ case GGML_OP_LOG:
53
+ case GGML_OP_UNARY:
54
+ case GGML_OP_ROPE:
55
+ case GGML_OP_RMS_NORM:
56
+ case GGML_OP_SOFT_MAX:
57
+ return true;
58
+
59
+ default:
60
+ return false;
61
+ }
62
+ }
63
+
64
  // TODO: GGML_PAD ?
65
  static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
66
  assert(alignment && !(alignment & (alignment - 1))); // power of 2
 
68
  return offset + align;
69
  }
70
 
71
+ // tallocr
72
+ struct ggml_tallocr {
73
+ ggml_backend_buffer_t buffer;
74
+ void * base;
75
+ size_t alignment;
76
+ size_t offset;
77
+ };
78
+
79
+ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
80
+ ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
81
+ if (talloc == NULL) {
82
+ return NULL;
83
+ }
84
+
85
+ void * base = ggml_backend_buffer_get_base(buffer);
86
+ size_t align = ggml_backend_buffer_get_alignment(buffer);
87
+
88
+ assert(align && !(align & (align - 1))); // power of 2
89
+
90
+ *talloc = (struct ggml_tallocr) {
91
+ /*.buffer = */ buffer,
92
+ /*.base = */ base,
93
+ /*.alignment = */ align,
94
+ /*.offset = */ aligned_offset(base, 0, align),
95
+ };
96
+ return talloc;
97
+ }
98
+
99
+ void ggml_tallocr_free(ggml_tallocr_t talloc) {
100
+ free(talloc);
101
+ }
102
+
103
+ void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
104
+ size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
105
+ size = GGML_PAD(size, talloc->alignment);
106
+
107
+ if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
108
+ fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
109
+ __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
110
+ GGML_ASSERT(!"not enough space in the buffer");
111
+ return;
112
+ }
113
+
114
+ void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
115
+ talloc->offset += size;
116
+
117
+ assert(((uintptr_t)addr % talloc->alignment) == 0);
118
+
119
+ ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
120
+ }
121
+
122
+ // dynamic tensor allocator
123
+
124
  struct free_block {
125
+ size_t offset;
126
  size_t size;
127
  };
128
 
129
+ struct ggml_dyn_tallocr {
 
 
 
130
  size_t alignment;
 
131
  int n_free_blocks;
132
  struct free_block free_blocks[MAX_FREE_BLOCKS];
 
133
  size_t max_size;
134
 
 
 
135
  #ifdef GGML_ALLOCATOR_DEBUG
136
+ struct {
137
+ const struct ggml_tensor * tensor;
138
+ size_t offset;
139
+ } allocated_tensors[1024];
140
  #endif
141
  };
142
 
143
  #ifdef GGML_ALLOCATOR_DEBUG
144
+ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
145
  for (int i = 0; i < 1024; i++) {
146
+ if (alloc->allocated_tensors[i].tensor == NULL) {
147
+ alloc->allocated_tensors[i].tensor = tensor;
148
+ alloc->allocated_tensors[i].offset = offset;
149
  return;
150
  }
151
  }
152
  GGML_ASSERT(!"out of allocated_tensors");
153
  }
154
+ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
155
  for (int i = 0; i < 1024; i++) {
156
+ if (alloc->allocated_tensors[i].offset == offset) {
157
+ alloc->allocated_tensors[i].tensor = NULL;
 
158
  return;
159
  }
160
  }
161
+ fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
162
  GGML_ASSERT(!"tensor not found");
163
  }
164
  #endif
165
 
166
+ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  size = aligned_offset(NULL, size, alloc->alignment);
168
 
169
  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
 
189
  if (block->size >= size) {
190
  best_fit_block = alloc->n_free_blocks - 1;
191
  } else {
192
+ // this should never happen
193
+ fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
194
+ __func__, size, max_avail);
195
  GGML_ASSERT(!"not enough space in the buffer");
196
+ GGML_UNREACHABLE();
197
  }
198
  }
199
 
200
  struct free_block * block = &alloc->free_blocks[best_fit_block];
201
+ size_t offset = block->offset;
202
+ block->offset = offset + size;
203
  block->size -= size;
204
  if (block->size == 0) {
205
  // remove block if empty
 
209
  }
210
  }
211
 
212
+ AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
 
 
 
 
 
 
213
 
214
  #ifdef GGML_ALLOCATOR_DEBUG
215
+ add_allocated_tensor(alloc, offset, tensor);
216
+ size_t cur_max = offset + size;
217
  if (cur_max > alloc->max_size) {
218
+ // sort allocated_tensors by offset
219
+ for (int i = 0; i < 1024; i++) {
220
+ for (int j = i + 1; j < 1024; j++) {
221
+ if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
222
+ const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
223
+ size_t tmp_offset = alloc->allocated_tensors[i].offset;
224
+ alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
225
+ alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
226
+ alloc->allocated_tensors[j].tensor = tmp_tensor;
227
+ alloc->allocated_tensors[j].offset = tmp_offset;
228
+ }
229
+ }
230
+ }
231
+ fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
232
  for (int i = 0; i < 1024; i++) {
233
+ if (alloc->allocated_tensors[i].tensor) {
234
+ fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
235
+ alloc->allocated_tensors[i].offset,
236
+ alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
237
+ ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
238
  }
239
  }
240
+ fprintf(stderr, "\n");
241
  }
242
  #endif
243
 
244
+ alloc->max_size = MAX(alloc->max_size, offset + size);
 
245
 
246
+ return offset;
 
 
 
 
 
 
 
 
247
 
248
+ GGML_UNUSED(tensor);
249
+ }
250
 
251
+ // this is a very naive implementation, but for our case the number of free blocks should be very small
252
+ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
253
  size = aligned_offset(NULL, size, alloc->alignment);
254
+
255
+ AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
256
 
257
  #ifdef GGML_ALLOCATOR_DEBUG
258
+ remove_allocated_tensor(alloc, offset, tensor);
259
  #endif
260
 
261
  // see if we can merge with an existing block
262
  for (int i = 0; i < alloc->n_free_blocks; i++) {
263
  struct free_block * block = &alloc->free_blocks[i];
264
  // check if ptr is at the end of the block
265
+ if (block->offset + block->size == offset) {
266
  block->size += size;
267
  // check if we can merge with the next block
268
+ if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
269
  block->size += alloc->free_blocks[i+1].size;
270
  alloc->n_free_blocks--;
271
  for (int j = i+1; j < alloc->n_free_blocks; j++) {
 
275
  return;
276
  }
277
  // check if ptr is at the beginning of the block
278
+ if (offset + size == block->offset) {
279
+ block->offset = offset;
280
  block->size += size;
281
  // check if we can merge with the previous block
282
+ if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
283
  alloc->free_blocks[i-1].size += block->size;
284
  alloc->n_free_blocks--;
285
  for (int j = i; j < alloc->n_free_blocks; j++) {
 
293
  GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
294
  // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
295
  int insert_pos = 0;
296
+ while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
297
  insert_pos++;
298
  }
299
  // shift all blocks from insert_pos onward to make room for the new block
 
301
  alloc->free_blocks[i] = alloc->free_blocks[i-1];
302
  }
303
  // insert the new block
304
+ alloc->free_blocks[insert_pos].offset = offset;
305
  alloc->free_blocks[insert_pos].size = size;
306
  alloc->n_free_blocks++;
307
+
308
+ GGML_UNUSED(tensor);
309
  }
310
 
311
+ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
312
  alloc->n_free_blocks = 1;
313
+ alloc->free_blocks[0].offset = 0;
314
+ alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
315
+ alloc->max_size = 0;
 
 
 
 
 
 
316
  }
317
 
318
+ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
319
+ struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
 
 
320
 
321
+ *alloc = (struct ggml_dyn_tallocr) {
 
 
 
322
  /*.alignment = */ alignment,
323
  /*.n_free_blocks = */ 0,
324
  /*.free_blocks = */ {{0}},
325
  /*.max_size = */ 0,
 
326
  #ifdef GGML_ALLOCATOR_DEBUG
327
+ /*.allocated_tensors = */ {{0}},
328
  #endif
329
  };
330
 
331
+ ggml_dyn_tallocr_reset(alloc);
 
 
 
 
 
 
 
332
 
333
  return alloc;
334
  }
335
 
336
+ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
337
  free(alloc);
338
  }
339
 
340
+ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
341
+ return alloc->max_size;
342
  }
343
 
344
+
345
+ /////////////////////////////////////
 
 
 
346
 
347
  // graph allocator
348
 
349
  struct hash_node {
350
  int n_children;
351
  int n_views;
352
+ int buffer_id;
353
+ size_t offset; // offset within the buffer
354
+ bool allocated;
355
+ };
356
+
357
+ //
358
+ struct tensor_alloc {
359
+ size_t offset;
360
+ size_t size_max; // 0 = pre-allocated, unused, or view
361
+ };
362
+
363
+ struct node_alloc {
364
+ int buffer_id;
365
+ struct tensor_alloc dst;
366
+ struct tensor_alloc src[GGML_MAX_SRC];
367
  };
368
 
369
  struct ggml_gallocr {
370
+ ggml_backend_buffer_type_t * bufts; // [n_buffers]
371
+ ggml_backend_buffer_t * buffers; // [n_buffers]
372
+ struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
373
+ int n_buffers;
374
+
375
  struct ggml_hash_set hash_set;
376
+ struct hash_node * hash_values; // [hash_set.size]
377
+
378
+ struct node_alloc * node_allocs; // [n_nodes]
379
+ int n_nodes;
 
380
  };
381
 
382
+ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
383
+ ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
384
+ GGML_ASSERT(galloc != NULL);
385
+
386
+ galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
387
+ GGML_ASSERT(galloc->bufts != NULL);
388
+
389
+ galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
390
+ GGML_ASSERT(galloc->buffers != NULL);
391
+
392
+ galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
393
+ GGML_ASSERT(galloc->buf_tallocs != NULL);
394
+
395
+ for (int i = 0; i < n_bufs; i++) {
396
+ galloc->bufts[i] = bufts[i];
397
+ galloc->buffers[i] = NULL;
398
+ size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
399
+ galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
400
+ }
401
+ galloc->n_buffers = n_bufs;
402
 
403
  return galloc;
404
  }
405
 
406
+ ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
407
+ return ggml_gallocr_new_n(&buft, 1);
408
+ }
409
+
410
  void ggml_gallocr_free(ggml_gallocr_t galloc) {
411
  if (galloc == NULL) {
412
  return;
413
  }
414
 
415
+ for (int i = 0; i < galloc->n_buffers; i++) {
416
+ if (galloc->buffers != NULL) {
417
+ ggml_backend_buffer_free(galloc->buffers[i]);
418
+ }
419
+ if (galloc->buf_tallocs != NULL) {
420
+ ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
421
+ }
 
 
 
 
422
  }
423
+
424
+ free(galloc->hash_set.keys);
425
+ free(galloc->hash_values);
426
+ free(galloc->bufts);
427
+ free(galloc->buffers);
428
+ free(galloc->buf_tallocs);
429
+ free(galloc->node_allocs);
430
  free(galloc);
431
  }
432
 
433
+ typedef struct ggml_gallocr * ggml_gallocr_t;
 
 
434
 
435
+ static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
 
 
 
 
 
 
436
  size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
437
  return &galloc->hash_values[i];
438
  }
439
 
440
+ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
441
+ return ggml_gallocr_hash_get(galloc, t)->allocated;
 
 
 
 
 
 
 
 
 
 
 
442
  }
443
 
444
+ static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
445
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
446
+ hn->buffer_id = buffer_id;
447
+ hn->offset = offset;
448
+ hn->allocated = true;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  }
450
 
451
+ static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
452
+ return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
 
 
 
 
453
  }
454
 
455
+ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
456
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
 
 
 
 
 
 
 
 
457
 
458
+ if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
459
+ hn->allocated = true;
460
+ assert(hn->offset == 0);
461
 
462
+ // try to reuse a parent's buffer (inplace)
463
+ if (ggml_op_can_inplace(node->op)) {
464
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
465
+ struct ggml_tensor * parent = node->src[i];
466
+ if (parent == NULL) {
467
+ break;
468
+ }
469
 
470
+ // if the node's data is external, then we cannot re-use it
471
+ if (!ggml_gallocr_is_own(galloc, parent)) {
472
+ AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
473
+ continue;
474
+ }
475
 
476
+ // outputs cannot be reused
477
+ if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
478
+ AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
479
+ continue;
480
+ }
 
 
 
 
 
 
481
 
482
+ if (!ggml_are_same_layout(node, parent)) {
483
+ AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
484
+ continue;
485
+ }
 
486
 
487
+ struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
488
+ if (p_hn->n_children == 1 && p_hn->n_views == 0) {
489
+ if (ggml_is_view(parent)) {
490
+ struct ggml_tensor * view_src = parent->view_src;
491
+ struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
492
+ if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
493
+ AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
494
+ assert(view_src_hn->offset == p_hn->offset);
495
+ hn->buffer_id = p_hn->buffer_id;
496
+ hn->offset = p_hn->offset;
497
+ p_hn->allocated = false; // avoid freeing the parent
498
+ view_src_hn->allocated = false;
 
 
 
 
 
 
 
 
 
 
499
  return;
500
  }
501
+ } else {
502
+ AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
503
+ hn->buffer_id = p_hn->buffer_id;
504
+ hn->offset = p_hn->offset;
505
+ p_hn->allocated = false; // avoid freeing the parent
506
+ return;
507
  }
508
  }
509
  }
 
510
  }
511
+ // allocate tensor from the buffer
512
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
513
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
514
+ size_t size = ggml_backend_buft_get_alloc_size(buft, node);
515
+ size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
516
+ hn->buffer_id = buffer_id;
517
+ hn->offset = offset;
518
+ return;
519
  }
520
  }
521
 
522
+ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
523
+ // graph outputs are never freed
524
+ if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
525
+ AT_PRINTF("not freeing output %s\n", node->name);
526
+ return;
527
+ }
528
 
529
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
530
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
531
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
532
+ size_t offset = hn->offset;
533
+ size_t size = ggml_backend_buft_get_alloc_size(buft, node);
534
+ ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
535
+ hn->allocated = false;
536
  }
537
 
538
+ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
539
+ return node_buffer_ids ? node_buffer_ids[i] : 0;
540
+ }
541
+
542
+ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
543
+ // clear hash tables
544
+ memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
545
+ memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
546
+
547
+ // allocate all graph inputs first to avoid overwriting them
548
+ for (int i = 0; i < graph->n_nodes; i++) {
549
+ if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
550
+ ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
551
+ }
552
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
553
+ if (graph->nodes[i]->src[j] == NULL) {
554
+ break;
555
+ }
556
+ if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
557
+ ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
558
+ }
559
+ }
560
+ }
561
 
562
  // count number of children and views
563
+ for (int i = 0; i < graph->n_nodes; i++) {
564
+ struct ggml_tensor * node = graph->nodes[i];
565
 
566
  if (ggml_is_view(node)) {
567
  struct ggml_tensor * view_src = node->view_src;
568
+ ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
 
 
 
 
569
  }
570
 
571
  for (int j = 0; j < GGML_MAX_SRC; j++) {
 
573
  if (parent == NULL) {
574
  break;
575
  }
576
+ ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
 
 
 
577
  }
578
  }
579
 
580
  // allocate tensors
581
+ for (int i = 0; i < graph->n_nodes; i++) {
582
+ struct ggml_tensor * node = graph->nodes[i];
583
+ int buffer_id = get_node_buffer_id(node_buffer_ids, i);
584
+
585
+ // allocate parents (only leafs need to be allocated at this point)
586
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
587
+ struct ggml_tensor * parent = node->src[j];
588
+ if (parent == NULL) {
589
+ break;
 
 
 
 
 
 
 
 
590
  }
591
+ ggml_gallocr_allocate_node(galloc, parent, buffer_id);
592
+ }
593
 
594
+ // allocate node
595
+ ggml_gallocr_allocate_node(galloc, node, buffer_id);
596
 
597
+ AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
598
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
599
+ struct ggml_tensor * parent = node->src[j];
600
+ if (parent == NULL) {
601
+ break;
602
+ }
603
+ AT_PRINTF("%s", parent->name);
604
+ if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
605
+ AT_PRINTF(", ");
 
606
  }
 
607
  }
608
+ AT_PRINTF("\n");
609
 
610
  // update parents
611
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
612
+ struct ggml_tensor * parent = node->src[j];
613
+ if (parent == NULL) {
614
+ break;
615
+ }
616
+ struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
617
+ p_hn->n_children -= 1;
618
+
619
+ AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
620
+ parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
621
+
622
+ if (p_hn->n_children == 0 && p_hn->n_views == 0) {
623
+ if (ggml_is_view(parent)) {
624
+ struct ggml_tensor * view_src = parent->view_src;
625
+ struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
626
+ view_src_hn->n_views -= 1;
627
+ AT_PRINTF("view_src %s: %d children, %d views\n",
628
+ view_src->name, view_src_hn->n_children, view_src_hn->n_views);
629
+ if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
630
+ ggml_gallocr_free_node(galloc, view_src, buffer_id);
 
 
 
 
 
 
 
 
 
 
 
 
631
  }
632
  }
633
+ else if (p_hn->allocated) {
634
+ ggml_gallocr_free_node(galloc, parent, buffer_id);
635
+ }
636
  }
637
  AT_PRINTF("\n");
 
 
 
638
  }
639
  }
640
  }
641
 
642
+ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
643
  size_t hash_size = graph->visited_hash_table.size;
644
 
645
+ // initialize hash table
646
  if (galloc->hash_set.size < hash_size) {
647
+ free(galloc->hash_set.keys);
648
+ free(galloc->hash_values);
 
 
 
 
 
649
  galloc->hash_set.size = hash_size;
650
+ galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
651
+ galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
652
+ GGML_ASSERT(galloc->hash_set.keys != NULL);
653
+ GGML_ASSERT(galloc->hash_values != NULL);
654
+ } else {
655
+ // reset hash table
656
+ memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
657
+ memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
658
  }
659
 
660
+ // reset allocators
661
+ for (int i = 0; i < galloc->n_buffers; i++) {
662
+ ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
663
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
664
 
665
+ // allocate in hash table
666
+ ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
667
 
668
+ // set the node_allocs from the hash table
669
+ if (galloc->n_nodes < graph->n_nodes) {
670
+ free(galloc->node_allocs);
671
+ galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
672
+ GGML_ASSERT(galloc->node_allocs != NULL);
673
  }
674
+ galloc->n_nodes = graph->n_nodes;
675
+ for (int i = 0; i < graph->n_nodes; i++) {
676
+ struct ggml_tensor * node = graph->nodes[i];
677
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
678
+ node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
679
+ if (node->view_src || node->data) {
680
+ node_alloc->dst.offset = SIZE_MAX;
681
+ node_alloc->dst.size_max = 0;
682
+ } else {
683
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
684
+ node_alloc->dst.offset = hn->offset;
685
+ node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
686
+ }
687
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
688
+ struct ggml_tensor * src = node->src[j];
689
+ if (!src || src->view_src || src->data) {
690
+ node_alloc->src[j].offset = SIZE_MAX;
691
+ node_alloc->src[j].size_max = 0;
692
+ } else {
693
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
694
+ node_alloc->src[j].offset = hn->offset;
695
+ node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
696
+ }
697
+ }
698
  }
 
699
 
700
+ // reallocate buffers if needed
701
+ for (int i = 0; i < galloc->n_buffers; i++) {
702
+ size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
703
+ size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
704
 
705
+ if (new_size > cur_size) {
706
+ #ifndef NDEBUG
707
+ fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
708
+ #endif
709
+ ggml_backend_buffer_free(galloc->buffers[i]);
710
+ galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
711
+ if (galloc->buffers[i] == NULL) {
712
+ fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
713
+ return false;
714
+ }
715
+ }
716
+ }
717
 
718
+ return true;
 
 
719
  }
720
 
721
+ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
722
+ return ggml_gallocr_reserve_n(galloc, graph, NULL);
 
 
 
 
 
 
 
 
 
 
 
 
723
  }
724
 
725
+ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
726
+ assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
 
727
 
728
+ if (node->view_src != NULL) {
729
+ if (node->buffer == NULL) {
730
+ assert(tensor_alloc->offset == SIZE_MAX);
731
+ if (node->view_src->buffer == NULL) {
732
+ // this tensor was allocated without ggml-backend
733
+ return;
734
+ }
735
+ ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
736
+ }
737
+ } else {
738
+ if (node->data == NULL) {
739
+ assert(tensor_alloc->offset != SIZE_MAX);
740
+ assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
741
+ void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
742
+ void * addr = (char *)base + tensor_alloc->offset;
743
+ ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
744
+ } else {
745
+ if (node->buffer == NULL) {
746
+ // this tensor was allocated without ggml-backend
747
+ return;
748
+ }
749
 
750
+ #ifndef NDEBUG
751
+ size_t offset =
752
+ (char *)node->data -
753
+ (char *)ggml_backend_buffer_get_base(node->buffer);
754
+ size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
755
+ assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
756
+ assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
757
+ #endif
758
+ }
759
+ }
760
  }
761
 
762
+ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
763
+ ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
764
+ size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
765
+ return talloc->size_max >= node_size;
766
  }
767
 
768
+ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
769
+ if (galloc->n_nodes != graph->n_nodes) {
770
+ #ifndef NDEBUG
771
+ fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
772
+ #endif
773
+ return true;
774
+ }
775
 
776
+ for (int i = 0; i < graph->n_nodes; i++) {
777
+ struct ggml_tensor * node = graph->nodes[i];
778
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
779
 
780
+ if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
781
+ #ifndef NDEBUG
782
+ fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
783
+ #endif
784
+ return true;
785
+ }
786
 
787
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
788
+ struct ggml_tensor * src = node->src[j];
789
+ if (src == NULL) {
790
+ break;
791
+ }
792
+ if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
793
+ #ifndef NDEBUG
794
+ fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
795
+ #endif
796
+ return true;
797
+ }
798
+ }
799
  }
800
 
801
+ return false;
 
 
802
  }
803
 
804
+ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
805
+ if (ggml_gallocr_needs_realloc(galloc, graph)) {
806
+ if (galloc->n_buffers == 1) {
807
+ #ifndef NDEBUG
808
+ fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
809
+ #endif
810
+ if (!ggml_gallocr_reserve(galloc, graph)) {
811
+ return false;
812
+ }
813
+ } else {
814
+ #ifndef NDEBUG
815
+ fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
816
+ #endif
817
+ return false;
818
+ }
819
+ }
820
 
821
+ // reset buffers
822
+ for (int i = 0; i < galloc->n_buffers; i++) {
823
+ // zero size buffers are not allocated
824
+ if (galloc->buffers[i] != NULL) {
825
+ ggml_backend_buffer_reset(galloc->buffers[i]);
826
+ }
827
+ }
828
 
829
+ // allocate the graph tensors from the previous assignments
830
+ for (int i = 0; i < graph->n_nodes; i++) {
831
+ struct ggml_tensor * node = graph->nodes[i];
832
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
833
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
834
+ struct ggml_tensor * src = node->src[j];
835
+ if (src == NULL) {
836
+ break;
837
+ }
838
+ ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
839
+ }
840
+ ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
841
+ }
842
 
843
+ return true;
 
844
  }
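The two branches above leave the caller with a single failure mode to handle: with one buffer the allocator re-reserves on its own, with multiple buffers it only reports the problem. A minimal caller-side sketch, assuming a hypothetical build_graph() helper and an application-provided node_buffer_ids array (neither is part of this commit):

    struct ggml_cgraph * graph = build_graph(user_ctx);            // placeholder graph builder
    if (!ggml_gallocr_alloc_graph(galloc, graph)) {
        // multi-buffer allocators do not re-reserve automatically, so do it explicitly
        if (!ggml_gallocr_reserve_n(galloc, graph, node_buffer_ids) ||
            !ggml_gallocr_alloc_graph(galloc, graph)) {
            fprintf(stderr, "compute graph allocation failed\n");
        }
    }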
845
 
846
+ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
847
+ GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
848
+
849
+ if (galloc->buffers[buffer_id] == NULL) {
850
+ return 0;
851
+ }
852
+ return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
853
  }
854
 
855
  // utils
 
870
  return false;
871
  }
872
 
873
+ struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);
874
 
875
  for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
876
  if (t->data == NULL) {
877
  if (t->view_src == NULL) {
878
  ggml_tallocr_alloc(tallocr, t);
879
+ } else if (t->buffer == NULL) {
880
  ggml_backend_view_init(buffer, t);
881
  }
882
  } else {
883
+ if (t->view_src != NULL && t->buffer == NULL) {
884
  // view of a pre-allocated tensor
885
  ggml_backend_view_init(buffer, t);
886
  }
 
913
  }
914
 
915
  if (this_size > max_size) {
 
916
  fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
917
  __func__, t->name,
918
  ggml_backend_buft_name(buft),
 
944
  }
945
 
946
  if (n_buffers == 0) {
 
947
  #ifndef NDEBUG
948
  fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
949
  #endif
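The utility above is normally driven from a no_alloc context; a minimal sketch, assuming a single F32 tensor and the CPU buffer type (sizes and names are placeholders, not part of this commit):

    struct ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead() * 8,   // metadata only, no tensor data
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true,                         // data is allocated by ggml-backend below
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor  * w   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
    if (buf == NULL) {
        fprintf(stderr, "failed to allocate tensors\n");  // allocation can fail, as checked above
    }
    // w->data now points into buf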
ggml-alloc.h CHANGED
@@ -6,88 +6,62 @@
6
  extern "C" {
7
  #endif
8
 
9
- struct ggml_backend;
10
- struct ggml_backend_buffer;
11
- struct ggml_backend_buffer_type;
12
 
13
- //
14
- // Legacy API
15
- //
16
-
17
- typedef struct ggml_allocr * ggml_allocr_t;
18
-
19
- // initialize allocator for use with CPU backend only
20
- GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment);
21
- GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment);
22
-
23
- // initialize allocator for use with ggml-backend
24
- GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
25
- GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
26
- GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend);
27
-
28
- GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc);
29
-
30
- // tell the allocator to parse nodes following the order described in the list
31
- // you should call this if your graph are optimized to execute out-of-order
32
- GGML_API void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n);
33
-
34
- GGML_API void ggml_allocr_free (ggml_allocr_t alloc);
35
- GGML_API bool ggml_allocr_is_measure (ggml_allocr_t alloc);
36
- GGML_API void ggml_allocr_reset (ggml_allocr_t alloc);
37
- GGML_API void ggml_allocr_alloc (ggml_allocr_t alloc, struct ggml_tensor * tensor);
38
- GGML_API size_t ggml_allocr_max_size (ggml_allocr_t alloc);
39
-
40
- GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);
41
 
42
- //
43
- // ggml-backend v2 API
44
- //
45
 
46
- // Separate tensor and graph allocator objects
47
- // This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
48
- // The original API is kept as a wrapper around the new API
 
49
 
50
- // Tensor allocator
51
- typedef struct ggml_tallocr * ggml_tallocr_t;
52
 
53
- GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
54
- GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
55
- GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
56
- GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
57
- GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
58
- GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
59
- GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
60
 
61
- GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
62
 
63
- GGML_API void ggml_tallocr_free (ggml_tallocr_t talloc);
64
- GGML_API bool ggml_tallocr_is_measure (ggml_tallocr_t talloc);
65
- GGML_API void ggml_tallocr_reset (ggml_tallocr_t talloc);
66
- GGML_API void ggml_tallocr_alloc (ggml_tallocr_t talloc, struct ggml_tensor * tensor);
67
- GGML_API size_t ggml_tallocr_max_size (ggml_tallocr_t talloc);
68
 
 
 
 
69
 
70
- // Graph allocator
71
  typedef struct ggml_gallocr * ggml_gallocr_t;
72
 
73
- GGML_API ggml_gallocr_t ggml_gallocr_new(void);
74
- GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 
75
 
76
- GGML_API void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n);
77
- GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph);
 
 
 
 
78
 
79
- // Allocate tensors from the allocators given by the hash table
80
- GGML_API void ggml_gallocr_alloc_graph_n(
81
- ggml_gallocr_t galloc,
82
- struct ggml_cgraph * graph,
83
- struct ggml_hash_set hash_set,
84
- ggml_tallocr_t * hash_node_talloc);
85
 
 
86
 
87
  // Utils
88
  // Create a buffer and allocate all the tensors in a ggml_context
89
- GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
90
- GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
91
 
92
  #ifdef __cplusplus
93
  }
 
6
  extern "C" {
7
  #endif
8
 
9
+ typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
10
+ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
11
+ typedef struct ggml_backend * ggml_backend_t;
12
 
13
+ // Tensor allocator
14
+ typedef struct ggml_tallocr * ggml_tallocr_t;
 
15
 
16
+ GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
17
+ GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc);
18
+ GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
19
 
20
+ // Graph allocator
21
+ /*
22
+ Example usage:
23
+ ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
24
 
25
+ // optional: create a worst-case graph and reserve the buffers to avoid reallocations
26
+ ggml_gallocr_reserve(galloc, build_graph(max_batch));
27
 
28
+ // allocate the graph
29
+ struct ggml_cgraph * graph = build_graph(batch);
30
+ ggml_gallocr_alloc_graph(galloc, graph);
 
 
 
 
31
 
32
+ printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
33
 
34
+ // evaluate the graph
35
+ ggml_backend_graph_compute(backend, graph);
36
+ */
 
 
37
 
38
+ // special tensor flags for use with the graph allocator:
39
+ // ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
40
+ // ggml_set_output(): output tensors are never freed and never overwritten
41
 
 
42
  typedef struct ggml_gallocr * ggml_gallocr_t;
43
 
44
+ GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
45
+ GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
46
+ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
47
 
48
+ // pre-allocate buffers from a measure graph - does not allocate or modify the graph
49
+ // call with a worst-case graph to avoid buffer reallocations
50
+ // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
51
+ // returns false if the buffer allocation failed
52
+ GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
53
+ GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
54
 
55
+ // automatic reallocation if the topology changes when using a single buffer
56
+ // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
57
+ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
 
 
 
58
 
59
+ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
60
 
61
  // Utils
62
  // Create a buffer and allocate all the tensors in a ggml_context
63
+ GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
64
+ GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
65
 
66
  #ifdef __cplusplus
67
  }
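Read end to end, the new header boils down to the flow below; build_graph(), max_batch, batch and backend are placeholders, and error handling is trimmed to the minimum, so treat this as a sketch rather than a complete program:

    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

    // optional: reserve once with a worst-case graph so later calls never grow the buffer
    if (!ggml_gallocr_reserve(galloc, build_graph(max_batch))) {
        fprintf(stderr, "reserve failed\n");
    }

    // tag inputs/outputs with ggml_set_input()/ggml_set_output() inside build_graph()
    // so they get the stable, non-reused addresses described above
    struct ggml_cgraph * graph = build_graph(batch);
    ggml_gallocr_alloc_graph(galloc, graph);

    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));

    ggml_backend_graph_compute(backend, graph);
    ggml_gallocr_free(galloc);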
ggml-backend.c CHANGED
@@ -475,6 +475,8 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
475
 
476
  // backend CPU
477
 
 
 
478
  GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
479
  return "CPU";
480
 
@@ -482,7 +484,14 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t
482
  }
483
 
484
  GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
485
- return (void *)buffer->context;
 
 
 
 
 
 
 
486
  }
487
 
488
  GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -540,8 +549,6 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
540
  /* .reset = */ NULL,
541
  };
542
 
543
- static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
544
-
545
  GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
546
  return "CPU";
547
 
@@ -550,9 +557,11 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend
550
 
551
  GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
552
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
553
- void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
554
-
555
- GGML_ASSERT(data != NULL && "failed to allocate buffer");
 
 
556
 
557
  return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
558
  }
@@ -766,6 +775,9 @@ static struct ggml_backend_i cpu_backend_i = {
766
 
767
  ggml_backend_t ggml_backend_cpu_init(void) {
768
  struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
 
 
 
769
 
770
  ctx->n_threads = GGML_DEFAULT_N_THREADS;
771
  ctx->work_data = NULL;
@@ -774,6 +786,10 @@ ggml_backend_t ggml_backend_cpu_init(void) {
774
  ctx->abort_callback_data = NULL;
775
 
776
  ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
 
 
 
 
777
 
778
  *cpu_backend = (struct ggml_backend) {
779
  /* .interface = */ cpu_backend_i,
@@ -865,6 +881,8 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_back
865
  ctx->n_buffers = n_buffers;
866
  ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
867
 
 
 
868
  size_t total_size = 0;
869
  for (size_t i = 0; i < n_buffers; i++) {
870
  ctx->buffers[i] = buffers[i];
@@ -886,6 +904,18 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer,
886
  }
887
  }
888
 
 
 
 
 
 
 
 
 
 
 
 
 
889
 
890
  // scheduler
891
 
@@ -894,7 +924,7 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer,
894
  #define GGML_MAX_SPLIT_INPUTS 16
895
 
896
  struct ggml_backend_sched_split {
897
- ggml_tallocr_t tallocr;
898
  int i_start;
899
  int i_end;
900
  struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
@@ -909,15 +939,17 @@ struct ggml_backend_sched {
909
  int n_backends;
910
  ggml_backend_t backends[GGML_MAX_BACKENDS];
911
  ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
912
- ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
913
 
914
  ggml_gallocr_t galloc;
915
 
916
  // hash keys of the nodes in the graph
917
  struct ggml_hash_set hash_set;
918
- // hash values (arrays of [hash_set.size])
919
- ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend)
920
- struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
 
 
 
921
 
922
  // copy of the graph with modified inputs
923
  struct ggml_cgraph * graph;
@@ -927,77 +959,46 @@ struct ggml_backend_sched {
927
 
928
  struct ggml_context * ctx;
929
 
 
 
 
930
  // align context_buffer to GGML_MEM_ALIGN
931
  #ifdef _MSC_VER
932
  __declspec(align(GGML_MEM_ALIGN))
933
  #else
934
  __attribute__((aligned(GGML_MEM_ALIGN)))
935
  #endif
936
- char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
937
-
938
- ggml_backend_sched_eval_callback callback_eval;
939
- void * callback_eval_user_data;
940
  };
941
 
942
  #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
943
- #define node_allocr(node) sched->node_talloc[hash_id(node)]
944
-
945
- static bool ggml_is_view_op(enum ggml_op op) {
946
- return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
947
- }
948
 
949
- // returns the priority of the backend, lower is better
950
- static int sched_backend_prio(ggml_backend_sched_t sched, ggml_backend_t backend) {
951
  for (int i = 0; i < sched->n_backends; i++) {
952
  if (sched->backends[i] == backend) {
953
  return i;
954
  }
955
  }
956
- return INT_MAX;
957
  }
958
 
959
- static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
960
- for (int i = 0; i < sched->n_backends; i++) {
961
- if (sched->tallocs[i] == allocr) {
962
- return i;
963
- }
964
- }
965
- return INT_MAX;
966
- }
967
-
968
- static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
969
  if (buffer == NULL) {
970
- return NULL;
971
- }
972
-
973
- // check if this is already allocate in a allocr buffer (from user manual allocations)
974
- for (int i = 0; i < sched->n_backends; i++) {
975
- if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
976
- return sched->tallocs[i];
977
- }
978
  }
979
 
980
  // find highest prio backend that supports the buffer type
981
  for (int i = 0; i < sched->n_backends; i++) {
982
  if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
983
- return sched->tallocs[i];
984
  }
985
  }
986
  GGML_ASSERT(false && "tensor buffer type not supported by any backend");
987
  }
988
 
989
- static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
990
- if (allocr == NULL) {
991
- return NULL;
992
- }
993
- for (int i = 0; i < sched->n_backends; i++) {
994
- if (sched->tallocs[i] == allocr) {
995
- return sched->backends[i];
996
- }
997
- }
998
- GGML_UNREACHABLE();
999
- }
1000
-
1001
  #if 0
1002
  static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
1003
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
@@ -1008,37 +1009,39 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_I
1008
  #endif
1009
 
1010
  // returns the backend that should be used for the node based on the current locations
1011
- static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
 
 
1012
  // assign pre-allocated nodes to their backend
1013
  // dst
1014
- ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
1015
- if (cur_allocr != NULL) {
1016
  SET_CAUSE(node, "1.dst");
1017
- return cur_allocr;
1018
  }
1019
  // view_src
1020
- if (node->view_src != NULL) {
1021
- cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
1022
- if (cur_allocr != NULL) {
1023
  SET_CAUSE(node, "1.vsrc");
1024
- return cur_allocr;
1025
  }
1026
  }
1027
  // assign nodes that use weights to the backend of the weights
1028
  for (int i = 0; i < GGML_MAX_SRC; i++) {
1029
- const struct ggml_tensor * src = node->src[i];
1030
  if (src == NULL) {
1031
  break;
1032
  }
1033
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1034
- ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
1035
  // operations with weights are always run on the same backend as the weights
1036
  SET_CAUSE(node, "1.wgt%d", i);
1037
- return src_allocr;
1038
  }
1039
  }
1040
 
1041
- return NULL;
1042
  }
1043
 
1044
  static char * fmt_size(size_t size) {
@@ -1051,11 +1054,11 @@ static char * fmt_size(size_t size) {
1051
  return buffer;
1052
  }
1053
 
1054
- static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1055
  int cur_split = 0;
1056
  for (int i = 0; i < graph->n_nodes; i++) {
1057
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
1058
- ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr);
1059
  fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
1060
  sched->splits[cur_split].n_inputs);
1061
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
@@ -1069,17 +1072,15 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
1069
  if (ggml_is_view_op(node->op)) {
1070
  continue;
1071
  }
1072
- ggml_tallocr_t node_allocr = node_allocr(node);
1073
- ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
1074
  fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
1075
- fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
1076
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1077
  struct ggml_tensor * src = node->src[j];
1078
  if (src == NULL) {
1079
  break;
1080
  }
1081
- ggml_tallocr_t src_allocr = node_allocr(src);
1082
- ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
1083
  fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1084
  fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1085
  }
@@ -1087,23 +1088,13 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
1087
  }
1088
  }
1089
 
1090
- // creates a copy of the tensor with the same memory layout
1091
- static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
1092
- struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
1093
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
1094
- dup->nb[i] = tensor->nb[i];
1095
- }
1096
- return dup;
1097
- }
1098
-
1099
-
1100
  //#define DEBUG_PASS1
1101
  //#define DEBUG_PASS2
1102
  //#define DEBUG_PASS3
1103
  //#define DEBUG_PASS4
1104
 
1105
  // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1106
- static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1107
  // reset splits
1108
  sched->n_splits = 0;
1109
  sched->is_reset = false;
@@ -1125,28 +1116,28 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1125
  // pass 1: assign backends to ops with pre-allocated inputs
1126
  for (int i = 0; i < graph->n_leafs; i++) {
1127
  struct ggml_tensor * leaf = graph->leafs[i];
1128
- if (node_allocr(leaf) != NULL) {
1129
  // do not overwrite user assignments
1130
  continue;
1131
  }
1132
- node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
1133
  }
1134
 
1135
  for (int i = 0; i < graph->n_nodes; i++) {
1136
  struct ggml_tensor * node = graph->nodes[i];
1137
- if (node_allocr(node) != NULL) {
1138
  // do not overwrite user assignments
1139
  continue;
1140
  }
1141
- node_allocr(node) = sched_allocr_from_cur(sched, node);
1142
  // src
1143
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1144
  struct ggml_tensor * src = node->src[j];
1145
  if (src == NULL) {
1146
  break;
1147
  }
1148
- if (node_allocr(src) == NULL) {
1149
- node_allocr(src) = sched_allocr_from_cur(sched, src);
1150
  }
1151
  }
1152
  }
@@ -1161,22 +1152,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1161
 
1162
  // pass 2.1 expand gpu up
1163
  {
1164
- ggml_tallocr_t cur_allocr = NULL;
1165
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
1166
  struct ggml_tensor * node = graph->nodes[i];
1167
  if (ggml_is_view_op(node->op)) {
1168
  continue;
1169
  }
1170
- ggml_tallocr_t node_allocr = node_allocr(node);
1171
- if (node_allocr != NULL) {
1172
- if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
1173
  // skip cpu (lowest prio backend)
1174
- cur_allocr = NULL;
1175
  } else {
1176
- cur_allocr = node_allocr;
1177
  }
1178
  } else {
1179
- node_allocr(node) = cur_allocr;
1180
  SET_CAUSE(node, "2.1");
1181
  }
1182
  }
@@ -1184,22 +1175,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1184
 
1185
  // pass 2.2 expand gpu down
1186
  {
1187
- ggml_tallocr_t cur_allocr = NULL;
1188
  for (int i = 0; i < graph->n_nodes; i++) {
1189
  struct ggml_tensor * node = graph->nodes[i];
1190
  if (ggml_is_view_op(node->op)) {
1191
  continue;
1192
  }
1193
- ggml_tallocr_t node_allocr = node_allocr(node);
1194
- if (node_allocr != NULL) {
1195
- if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
1196
  // skip cpu (lowest prio backend)
1197
- cur_allocr = NULL;
1198
  } else {
1199
- cur_allocr = node_allocr;
1200
  }
1201
  } else {
1202
- node_allocr(node) = cur_allocr;
1203
  SET_CAUSE(node, "2.2");
1204
  }
1205
  }
@@ -1207,17 +1198,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1207
 
1208
  // pass 2.3 expand rest up
1209
  {
1210
- ggml_tallocr_t cur_allocr = NULL;
1211
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
1212
  struct ggml_tensor * node = graph->nodes[i];
1213
  if (ggml_is_view_op(node->op)) {
1214
  continue;
1215
  }
1216
- ggml_tallocr_t node_allocr = node_allocr(node);
1217
- if (node_allocr != NULL) {
1218
- cur_allocr = node_allocr;
1219
  } else {
1220
- node_allocr(node) = cur_allocr;
1221
  SET_CAUSE(node, "2.3");
1222
  }
1223
  }
@@ -1225,17 +1216,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1225
 
1226
  // pass 2.4 expand rest down
1227
  {
1228
- ggml_tallocr_t cur_allocr = NULL;
1229
  for (int i = 0; i < graph->n_nodes; i++) {
1230
  struct ggml_tensor * node = graph->nodes[i];
1231
  if (ggml_is_view_op(node->op)) {
1232
  continue;
1233
  }
1234
- ggml_tallocr_t node_allocr = node_allocr(node);
1235
- if (node_allocr != NULL) {
1236
- cur_allocr = node_allocr;
1237
  } else {
1238
- node_allocr(node) = cur_allocr;
1239
  SET_CAUSE(node, "2.4");
1240
  }
1241
  }
@@ -1247,9 +1238,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1247
  // pass 3: assign backends to remaining src from dst and view_src
1248
  for (int i = 0; i < graph->n_nodes; i++) {
1249
  struct ggml_tensor * node = graph->nodes[i];
1250
- ggml_tallocr_t cur_allocr = node_allocr(node);
1251
- if (node->view_src != NULL && cur_allocr == NULL) {
1252
- cur_allocr = node_allocr(node) = node_allocr(node->view_src);
1253
  SET_CAUSE(node, "3.vsrc");
1254
  }
1255
  for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1257,14 +1248,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1257
  if (src == NULL) {
1258
  break;
1259
  }
1260
- ggml_tallocr_t src_allocr = node_allocr(src);
1261
- if (src_allocr == NULL) {
1262
  if (src->view_src != NULL) {
1263
  // views are always on the same backend as the source
1264
- node_allocr(src) = node_allocr(src->view_src);
1265
  SET_CAUSE(src, "3.vsrc");
1266
  } else {
1267
- node_allocr(src) = cur_allocr;
1268
  SET_CAUSE(src, "3.cur");
1269
  }
1270
  }
@@ -1281,15 +1272,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1281
  for (int i = 0; i < graph->n_nodes; i++) {
1282
  struct ggml_tensor * node = graph->nodes[i];
1283
  if (!ggml_is_view_op(node->op)) {
1284
- sched->splits[0].tallocr = node_allocr(node);
1285
  break;
1286
  }
1287
  }
1288
  sched->splits[0].i_start = 0;
1289
  sched->splits[0].n_inputs = 0;
1290
  memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
1291
- ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
1292
- size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1293
  for (int i = 0; i < graph->n_nodes; i++) {
1294
  struct ggml_tensor * node = graph->nodes[i];
1295
 
@@ -1297,19 +1287,18 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1297
  continue;
1298
  }
1299
 
1300
- ggml_tallocr_t node_allocr = node_allocr(node);
1301
 
1302
- GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
1303
 
1304
- if (node_allocr != cur_allocr) {
1305
  sched->splits[cur_split].i_end = i;
1306
  cur_split++;
1307
  GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
1308
- sched->splits[cur_split].tallocr = node_allocr;
1309
  sched->splits[cur_split].i_start = i;
1310
  sched->splits[cur_split].n_inputs = 0;
1311
- cur_allocr = node_allocr;
1312
- cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1313
  }
1314
 
1315
  // find inputs that are not on the same backend
@@ -1318,43 +1307,25 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1318
  if (src == NULL) {
1319
  break;
1320
  }
1321
- ggml_tallocr_t src_allocr = node_allocr(src);
1322
- GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
1323
- if (src_allocr != node_allocr) {
1324
  // create a copy of the input in the split's backend
1325
  size_t id = hash_id(src);
1326
- if (sched->node_copies[id][cur_backend_id] == NULL) {
1327
- ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
1328
  struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1329
  ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
1330
 
1331
- sched->node_copies[id][cur_backend_id] = tensor_copy;
1332
- node_allocr(tensor_copy) = cur_allocr;
1333
  SET_CAUSE(tensor_copy, "4.cpy");
1334
 
1335
  int n_inputs = sched->splits[cur_split].n_inputs++;
1336
  GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
1337
  sched->splits[cur_split].inputs[n_inputs] = src;
1338
  }
1339
- node->src[j] = sched->node_copies[id][cur_backend_id];
1340
-
1341
- #if 0
1342
- // check if the input is already in the split
1343
- bool found = false;
1344
- for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
1345
- if (sched->splits[cur_split].inputs[k] == src) {
1346
- found = true;
1347
- break;
1348
- }
1349
- }
1350
-
1351
- if (!found) {
1352
- int n_inputs = sched->splits[cur_split].n_inputs++;
1353
- //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
1354
- GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
1355
- sched->splits[cur_split].inputs[n_inputs] = src;
1356
- }
1357
- #endif
1358
  }
1359
  }
1360
  }
@@ -1369,30 +1340,30 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1369
  // sanity check: all sources should have the same backend as the node
1370
  for (int i = 0; i < graph->n_nodes; i++) {
1371
  struct ggml_tensor * node = graph->nodes[i];
1372
- ggml_tallocr_t node_allocr = node_allocr(node);
1373
- if (node_allocr == NULL) {
1374
  fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
1375
  }
1376
- if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
1377
  fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
1378
- node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
1379
- node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
1380
  }
1381
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1382
  struct ggml_tensor * src = node->src[j];
1383
  if (src == NULL) {
1384
  break;
1385
  }
1386
- ggml_tallocr_t src_allocr = node_allocr(src);
1387
- if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now
1388
  fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
1389
- node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
1390
- j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
1391
  }
1392
- if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
1393
  fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
1394
- src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
1395
- src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
1396
  }
1397
  }
1398
  }
@@ -1406,32 +1377,43 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1406
  struct ggml_backend_sched_split * split = &sched->splits[i];
1407
  split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
1408
 
1409
- // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1410
  for (int j = 0; j < split->n_inputs; j++) {
1411
  struct ggml_tensor * input = split->inputs[j];
1412
- struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
 
1413
  // add a dependency to the input source so that it is not freed before the copy is done
1414
- GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
1415
- input_cpy->src[0] = input;
 
 
 
 
1416
  graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1417
  }
1418
 
1419
  for (int j = split->i_start; j < split->i_end; j++) {
 
1420
  graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1421
  }
1422
  }
1423
  sched->graph = graph_copy;
1424
  }
1425
 
1426
- static void sched_alloc_splits(ggml_backend_sched_t sched) {
1427
- ggml_gallocr_alloc_graph_n(
1428
- sched->galloc,
1429
- sched->graph,
1430
- sched->hash_set,
1431
- sched->node_talloc);
1432
  }
1433
 
1434
- static void sched_compute_splits(ggml_backend_sched_t sched) {
1435
  uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
1436
  uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
1437
 
@@ -1439,20 +1421,18 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
1439
 
1440
  for (int i = 0; i < sched->n_splits; i++) {
1441
  struct ggml_backend_sched_split * split = &splits[i];
1442
- ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr);
1443
- int split_backend_id = sched_backend_prio(sched, split_backend);
1444
 
1445
  // copy the input tensors to the split backend
1446
  uint64_t copy_start_us = ggml_time_us();
1447
  for (int j = 0; j < split->n_inputs; j++) {
1448
  struct ggml_tensor * input = split->inputs[j];
1449
- struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
1450
 
1451
  GGML_ASSERT(input->buffer != NULL);
1452
  GGML_ASSERT(input_cpy->buffer != NULL);
1453
 
1454
- // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
1455
- // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
1456
  ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
1457
  }
1458
  //ggml_backend_synchronize(split_backend); // necessary to measure copy time
@@ -1468,7 +1448,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
1468
 
1469
  uint64_t compute_start_us = ggml_time_us();
1470
  if (!sched->callback_eval) {
1471
- ggml_backend_graph_compute(split_backend, &split->graph);
 
 
1472
  //ggml_backend_synchronize(split_backend); // necessary to measure compute time
1473
  } else {
1474
  // similar to ggml_backend_compare_graph_backend
@@ -1488,7 +1470,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
1488
 
1489
  struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
1490
 
1491
- ggml_backend_graph_compute(split_backend, &gv);
 
 
1492
 
1493
  if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
1494
  break;
@@ -1510,19 +1494,8 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
1510
  }
1511
  }
1512
  #endif
1513
- }
1514
-
1515
- static void sched_reset(ggml_backend_sched_t sched) {
1516
- for (int i = 0; i < sched->n_backends; i++) {
1517
- ggml_tallocr_reset(sched->tallocs[i]);
1518
- }
1519
- // reset state for the next run
1520
- size_t hash_size = sched->hash_set.size;
1521
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
1522
- memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
1523
- memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
1524
 
1525
- sched->is_reset = true;
1526
  }
1527
 
1528
  ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
@@ -1532,9 +1505,10 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back
1532
  struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
1533
 
1534
  // initialize hash table
1535
- sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1536
- sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
1537
- sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
 
1538
 
1539
  sched->n_backends = n_backends;
1540
  for (int i = 0; i < n_backends; i++) {
@@ -1542,14 +1516,9 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back
1542
  sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
1543
  }
1544
 
1545
- sched->galloc = ggml_gallocr_new();
1546
 
1547
- // init measure allocs for each backend
1548
- for (int i = 0; i < n_backends; i++) {
1549
- sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
1550
- }
1551
-
1552
- sched_reset(sched);
1553
 
1554
  return sched;
1555
  }
@@ -1558,49 +1527,54 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
1558
  if (sched == NULL) {
1559
  return;
1560
  }
1561
- for (int i = 0; i < sched->n_backends; i++) {
1562
- ggml_tallocr_free(sched->tallocs[i]);
1563
- }
1564
  ggml_gallocr_free(sched->galloc);
1565
  ggml_free(sched->ctx);
1566
  free(sched->hash_set.keys);
1567
- free(sched->node_talloc);
1568
- free(sched->node_copies);
 
1569
  free(sched);
1570
  }
1571
 
1572
- void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1573
- GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once
 
 
 
 
1574
 
1575
- sched_split_graph(sched, measure_graph);
1576
- sched_alloc_splits(sched);
1577
 
1578
- // allocate buffers and reset allocators
1579
- for (int i = 0; i < sched->n_backends; i++) {
1580
- size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
1581
- ggml_tallocr_free(sched->tallocs[i]);
1582
- sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
1583
  }
1584
 
1585
- sched_reset(sched);
 
1586
  }
1587
 
1588
- void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1589
  GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1590
 
1591
  if (!sched->is_reset) {
1592
- sched_reset(sched);
1593
  }
1594
 
1595
- sched_split_graph(sched, graph);
1596
- sched_alloc_splits(sched);
1597
- sched_compute_splits(sched);
1598
- }
1599
 
1600
- void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1601
- sched_reset(sched);
1602
- }
1603
 
 
 
1604
 
1605
  void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
1606
  sched->callback_eval = callback;
@@ -1611,37 +1585,30 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
1611
  return sched->n_splits;
1612
  }
1613
 
1614
- ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
1615
- int backend_index = sched_backend_prio(sched, backend);
1616
- GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1617
- return sched->tallocs[backend_index];
1618
- }
1619
-
1620
- ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
1621
- int backend_index = sched_backend_prio(sched, backend);
1622
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1623
- return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
1624
  }
1625
 
1626
  void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
1627
- int backend_index = sched_backend_prio(sched, backend);
1628
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1629
- node_allocr(node) = sched->tallocs[backend_index];
1630
  }
1631
 
1632
  ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
1633
- ggml_tallocr_t allocr = node_allocr(node);
1634
- if (allocr == NULL) {
1635
  return NULL;
1636
  }
1637
- return get_allocr_backend(sched, allocr);
1638
  }
1639
 
1640
  // utils
1641
 
1642
  void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
1643
  GGML_ASSERT(tensor->buffer == NULL);
1644
- //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
1645
  GGML_ASSERT(tensor->view_src != NULL);
1646
  GGML_ASSERT(tensor->view_src->buffer != NULL);
1647
  GGML_ASSERT(tensor->view_src->data != NULL);
@@ -1665,7 +1632,7 @@ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor
1665
  ggml_backend_buffer_init_tensor(buffer, tensor);
1666
  }
1667
 
1668
- static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
1669
  struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
1670
 
1671
  GGML_ASSERT(src != NULL);
@@ -1678,7 +1645,7 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru
1678
 
1679
  struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
1680
  if (src->view_src != NULL) {
1681
- dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
1682
  dst->view_offs = src->view_offs;
1683
  }
1684
  dst->op = src->op;
@@ -1691,14 +1658,14 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru
1691
  if (s == NULL) {
1692
  break;
1693
  }
1694
- dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
1695
  }
1696
 
1697
  node_copies[id] = dst;
1698
  return dst;
1699
  }
1700
 
1701
- static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
1702
  size_t id = ggml_hash_find(hash_set, src);
1703
  if (node_init[id]) {
1704
  return;
@@ -1707,7 +1674,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
1707
 
1708
  struct ggml_tensor * dst = node_copies[id];
1709
  if (dst->view_src != NULL) {
1710
- graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
1711
  ggml_backend_view_init(dst->view_src->buffer, dst);
1712
  }
1713
  else {
@@ -1720,17 +1687,17 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
1720
  if (s == NULL) {
1721
  break;
1722
  }
1723
- graph_init_tensor(hash_set, node_copies, node_init, s);
1724
  }
1725
  }
1726
 
1727
  struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
1728
  struct ggml_hash_set hash_set = {
1729
  /* .size = */ graph->visited_hash_table.size,
1730
- /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1)
1731
  };
1732
- struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1);
1733
- bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1);
1734
 
1735
  struct ggml_init_params params = {
1736
  /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
@@ -1759,7 +1726,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
1759
  // dup nodes
1760
  for (int i = 0; i < graph->n_nodes; i++) {
1761
  struct ggml_tensor * node = graph->nodes[i];
1762
- graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
1763
  }
1764
 
1765
  // allocate nodes
@@ -1784,7 +1751,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
1784
  // copy data and init views
1785
  for (int i = 0; i < graph->n_nodes; i++) {
1786
  struct ggml_tensor * node = graph->nodes[i];
1787
- graph_init_tensor(hash_set, node_copies, node_init, node);
1788
  }
1789
 
1790
  // build graph copy
 
475
 
476
  // backend CPU
477
 
478
+ static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
479
+
480
  GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
481
  return "CPU";
482
 
 
484
  }
485
 
486
  GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
487
+ uintptr_t data = (uintptr_t)buffer->context;
488
+
489
+ // align the buffer
490
+ if (data % TENSOR_ALIGNMENT != 0) {
491
+ data = GGML_PAD(data, TENSOR_ALIGNMENT);
492
+ }
493
+
494
+ return (void *)data;
495
  }
496
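A quick worked example of the padding above, with illustrative numbers only: for TENSOR_ALIGNMENT = 64, a context pointer of 0x1001 is rounded up by GGML_PAD to the next multiple of 64, which is 0x1040; since ggml_backend_cpu_buffer_type_alloc_buffer adds TENSOR_ALIGNMENT extra bytes to the malloc size, the padded base plus the requested size still fits inside the allocation.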
 
497
  GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 
549
  /* .reset = */ NULL,
550
  };
551
 
 
 
552
  GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
553
  return "CPU";
554
 
 
557
 
558
  GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
559
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
560
+ void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
561
+ if (data == NULL) {
562
+ fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
563
+ return NULL;
564
+ }
565
 
566
  return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
567
  }
 
775
 
776
  ggml_backend_t ggml_backend_cpu_init(void) {
777
  struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
778
+ if (ctx == NULL) {
779
+ return NULL;
780
+ }
781
 
782
  ctx->n_threads = GGML_DEFAULT_N_THREADS;
783
  ctx->work_data = NULL;
 
786
  ctx->abort_callback_data = NULL;
787
 
788
  ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
789
+ if (cpu_backend == NULL) {
790
+ free(ctx);
791
+ return NULL;
792
+ }
793
 
794
  *cpu_backend = (struct ggml_backend) {
795
  /* .interface = */ cpu_backend_i,
 
881
  ctx->n_buffers = n_buffers;
882
  ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
883
 
884
+ GGML_ASSERT(ctx->buffers != NULL);
885
+
886
  size_t total_size = 0;
887
  for (size_t i = 0; i < n_buffers; i++) {
888
  ctx->buffers[i] = buffers[i];
 
904
  }
905
  }
906
 
907
+ // creates a copy of the tensor with the same memory layout
908
+ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
909
+ struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
910
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
911
+ dup->nb[i] = tensor->nb[i];
912
+ }
913
+ return dup;
914
+ }
915
+
916
+ static bool ggml_is_view_op(enum ggml_op op) {
917
+ return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
918
+ }
919
 
920
  // scheduler
921
 
 
924
  #define GGML_MAX_SPLIT_INPUTS 16
925
 
926
  struct ggml_backend_sched_split {
927
+ int backend_id;
928
  int i_start;
929
  int i_end;
930
  struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
 
939
  int n_backends;
940
  ggml_backend_t backends[GGML_MAX_BACKENDS];
941
  ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
 
942
 
943
  ggml_gallocr_t galloc;
944
 
945
  // hash keys of the nodes in the graph
946
  struct ggml_hash_set hash_set;
947
+ // hash values
948
+ int * tensor_backend_id;
949
+ struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS];
950
+
951
+ int * node_backend_ids; // [n_nodes]
952
+ int n_nodes;
953
 
954
  // copy of the graph with modified inputs
955
  struct ggml_cgraph * graph;
 
959
 
960
  struct ggml_context * ctx;
961
 
962
+ ggml_backend_sched_eval_callback callback_eval;
963
+ void * callback_eval_user_data;
964
+
965
  // align context_buffer to GGML_MEM_ALIGN
966
  #ifdef _MSC_VER
967
  __declspec(align(GGML_MEM_ALIGN))
968
  #else
969
  __attribute__((aligned(GGML_MEM_ALIGN)))
970
  #endif
971
+ char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
 
 
 
972
  };
973
 
974
  #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
975
+ #define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)]
976
+ #define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
 
 
 
977
 
978
+ // returns the priority of the backend, lower id is higher priority
979
+ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
980
  for (int i = 0; i < sched->n_backends; i++) {
981
  if (sched->backends[i] == backend) {
982
  return i;
983
  }
984
  }
985
+ return -1;
986
  }
987
 
988
+ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
 
989
  if (buffer == NULL) {
990
+ return -1;
 
991
  }
992
 
993
  // find highest prio backend that supports the buffer type
994
  for (int i = 0; i < sched->n_backends; i++) {
995
  if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
996
+ return i;
997
  }
998
  }
999
  GGML_ASSERT(false && "tensor buffer type not supported by any backend");
1000
  }
1001
 
 
1002
  #if 0
1003
  static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
1004
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 
1009
  #endif
1010
 
1011
  // returns the backend that should be used for the node based on the current locations
1012
+ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
1013
+ // TODO: use supports_op to check if the backend supports the op
1014
+
1015
  // assign pre-allocated nodes to their backend
1016
  // dst
1017
+ int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer);
1018
+ if (cur_backend != -1) {
1019
  SET_CAUSE(node, "1.dst");
1020
+ return cur_backend;
1021
  }
1022
  // view_src
1023
+ if (tensor->view_src != NULL) {
1024
+ cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer);
1025
+ if (cur_backend != -1) {
1026
  SET_CAUSE(node, "1.vsrc");
1027
+ return cur_backend;
1028
  }
1029
  }
1030
  // assign nodes that use weights to the backend of the weights
1031
  for (int i = 0; i < GGML_MAX_SRC; i++) {
1032
+ const struct ggml_tensor * src = tensor->src[i];
1033
  if (src == NULL) {
1034
  break;
1035
  }
1036
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1037
+ int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
1038
  // operations with weights are always run on the same backend as the weights
1039
  SET_CAUSE(node, "1.wgt%d", i);
1040
+ return src_backend;
1041
  }
1042
  }
1043
 
1044
+ return -1;
1045
  }
1046
 
1047
  static char * fmt_size(size_t size) {
 
1054
  return buffer;
1055
  }
1056
 
1057
+ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1058
  int cur_split = 0;
1059
  for (int i = 0; i < graph->n_nodes; i++) {
1060
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
1061
+ ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
1062
  fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
1063
  sched->splits[cur_split].n_inputs);
1064
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
 
1072
  if (ggml_is_view_op(node->op)) {
1073
  continue;
1074
  }
1075
+ ggml_backend_t tensor_backend = tensor_backend(node);
 
1076
  fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
1077
+ fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1078
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1079
  struct ggml_tensor * src = node->src[j];
1080
  if (src == NULL) {
1081
  break;
1082
  }
1083
+ ggml_backend_t src_backend = tensor_backend(src);
 
1084
  fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1085
  fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1086
  }
 
1088
  }
1089
  }
1090
 
 
 
 
 
 
 
 
 
 
 
1091
  //#define DEBUG_PASS1
1092
  //#define DEBUG_PASS2
1093
  //#define DEBUG_PASS3
1094
  //#define DEBUG_PASS4
1095
 
1096
  // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1097
+ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1098
  // reset splits
1099
  sched->n_splits = 0;
1100
  sched->is_reset = false;
 
1116
  // pass 1: assign backends to ops with pre-allocated inputs
1117
  for (int i = 0; i < graph->n_leafs; i++) {
1118
  struct ggml_tensor * leaf = graph->leafs[i];
1119
+ if (tensor_backend_id(leaf) != -1) {
1120
  // do not overwrite user assignments
1121
  continue;
1122
  }
1123
+ tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
1124
  }
1125
 
1126
  for (int i = 0; i < graph->n_nodes; i++) {
1127
  struct ggml_tensor * node = graph->nodes[i];
1128
+ if (tensor_backend_id(node) != -1) {
1129
  // do not overwrite user assignments
1130
  continue;
1131
  }
1132
+ tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
1133
  // src
1134
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1135
  struct ggml_tensor * src = node->src[j];
1136
  if (src == NULL) {
1137
  break;
1138
  }
1139
+ if (tensor_backend_id(src) == -1) {
1140
+ tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
1141
  }
1142
  }
1143
  }
 
1152
 
1153
  // pass 2.1 expand gpu up
1154
  {
1155
+ int cur_backend_id = -1;
1156
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
1157
  struct ggml_tensor * node = graph->nodes[i];
1158
  if (ggml_is_view_op(node->op)) {
1159
  continue;
1160
  }
1161
+ int tensor_backend_id = tensor_backend_id(node);
1162
+ if (tensor_backend_id != -1) {
1163
+ if (tensor_backend_id == sched->n_backends - 1) {
1164
  // skip cpu (lowest prio backend)
1165
+ cur_backend_id = -1;
1166
  } else {
1167
+ cur_backend_id = tensor_backend_id;
1168
  }
1169
  } else {
1170
+ tensor_backend_id(node) = cur_backend_id;
1171
  SET_CAUSE(node, "2.1");
1172
  }
1173
  }
 
1175
 
1176
  // pass 2.2 expand gpu down
1177
  {
1178
+ int cur_backend_id = -1;
1179
  for (int i = 0; i < graph->n_nodes; i++) {
1180
  struct ggml_tensor * node = graph->nodes[i];
1181
  if (ggml_is_view_op(node->op)) {
1182
  continue;
1183
  }
1184
+ int tensor_backend_id = tensor_backend_id(node);
1185
+ if (tensor_backend_id != -1) {
1186
+ if (tensor_backend_id == sched->n_backends - 1) {
1187
  // skip cpu (lowest prio backend)
1188
+ cur_backend_id = -1;
1189
  } else {
1190
+ cur_backend_id = tensor_backend_id;
1191
  }
1192
  } else {
1193
+ tensor_backend_id(node) = cur_backend_id;
1194
  SET_CAUSE(node, "2.2");
1195
  }
1196
  }
 
1198
 
1199
  // pass 2.3 expand rest up
1200
  {
1201
+ int cur_backend_id = -1;
1202
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
1203
  struct ggml_tensor * node = graph->nodes[i];
1204
  if (ggml_is_view_op(node->op)) {
1205
  continue;
1206
  }
1207
+ int tensor_backend_id = tensor_backend_id(node);
1208
+ if (tensor_backend_id != -1) {
1209
+ cur_backend_id = tensor_backend_id;
1210
  } else {
1211
+ tensor_backend_id(node) = cur_backend_id;
1212
  SET_CAUSE(node, "2.3");
1213
  }
1214
  }
 
1216
 
1217
  // pass 2.4 expand rest down
1218
  {
1219
+ int cur_backend_id = -1;
1220
  for (int i = 0; i < graph->n_nodes; i++) {
1221
  struct ggml_tensor * node = graph->nodes[i];
1222
  if (ggml_is_view_op(node->op)) {
1223
  continue;
1224
  }
1225
+ int tensor_backend_id = tensor_backend_id(node);
1226
+ if (tensor_backend_id != -1) {
1227
+ cur_backend_id = tensor_backend_id;
1228
  } else {
1229
+ tensor_backend_id(node) = cur_backend_id;
1230
  SET_CAUSE(node, "2.4");
1231
  }
1232
  }
 
1238
  // pass 3: assign backends to remaining src from dst and view_src
1239
  for (int i = 0; i < graph->n_nodes; i++) {
1240
  struct ggml_tensor * node = graph->nodes[i];
1241
+ int cur_backend_id = tensor_backend_id(node);
1242
+ if (node->view_src != NULL && cur_backend_id == -1) {
1243
+ cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
1244
  SET_CAUSE(node, "3.vsrc");
1245
  }
1246
  for (int j = 0; j < GGML_MAX_SRC; j++) {
 
1248
  if (src == NULL) {
1249
  break;
1250
  }
1251
+ int src_backend_id = tensor_backend_id(src);
1252
+ if (src_backend_id == -1) {
1253
  if (src->view_src != NULL) {
1254
  // views are always on the same backend as the source
1255
+ tensor_backend_id(src) = tensor_backend_id(src->view_src);
1256
  SET_CAUSE(src, "3.vsrc");
1257
  } else {
1258
+ tensor_backend_id(src) = cur_backend_id;
1259
  SET_CAUSE(src, "3.cur");
1260
  }
1261
  }
 
1272
  for (int i = 0; i < graph->n_nodes; i++) {
1273
  struct ggml_tensor * node = graph->nodes[i];
1274
  if (!ggml_is_view_op(node->op)) {
1275
+ sched->splits[0].backend_id = tensor_backend_id(node);
1276
  break;
1277
  }
1278
  }
1279
  sched->splits[0].i_start = 0;
1280
  sched->splits[0].n_inputs = 0;
1281
  memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
1282
+ int cur_backend_id = sched->splits[0].backend_id;
 
1283
  for (int i = 0; i < graph->n_nodes; i++) {
1284
  struct ggml_tensor * node = graph->nodes[i];
1285
 
 
1287
  continue;
1288
  }
1289
 
1290
+ int tensor_backend_id = tensor_backend_id(node);
1291
 
1292
+ GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
1293
 
1294
+ if (tensor_backend_id != cur_backend_id) {
1295
  sched->splits[cur_split].i_end = i;
1296
  cur_split++;
1297
  GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
1298
+ sched->splits[cur_split].backend_id = tensor_backend_id;
1299
  sched->splits[cur_split].i_start = i;
1300
  sched->splits[cur_split].n_inputs = 0;
1301
+ cur_backend_id = tensor_backend_id;
 
1302
  }
1303
 
1304
  // find inputs that are not on the same backend
 
1307
  if (src == NULL) {
1308
  break;
1309
  }
1310
+ int src_backend_id = tensor_backend_id(src);
1311
+ assert(src_backend_id != -1); // all inputs should be assigned by now
1312
+ if (src_backend_id != tensor_backend_id) {
1313
  // create a copy of the input in the split's backend
1314
  size_t id = hash_id(src);
1315
+ if (sched->tensor_copies[id][cur_backend_id] == NULL) {
1316
+ ggml_backend_t backend = sched->backends[cur_backend_id];
1317
  struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1318
  ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
1319
 
1320
+ sched->tensor_copies[id][cur_backend_id] = tensor_copy;
1321
+ tensor_backend_id(tensor_copy) = cur_backend_id;
1322
  SET_CAUSE(tensor_copy, "4.cpy");
1323
 
1324
  int n_inputs = sched->splits[cur_split].n_inputs++;
1325
  GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
1326
  sched->splits[cur_split].inputs[n_inputs] = src;
1327
  }
1328
+ node->src[j] = sched->tensor_copies[id][cur_backend_id];
 
1329
  }
1330
  }
1331
  }
 
1340
  // sanity check: all sources should have the same backend as the node
1341
  for (int i = 0; i < graph->n_nodes; i++) {
1342
  struct ggml_tensor * node = graph->nodes[i];
1343
+ ggml_backend_t tensor_backend = tensor_backend(node);
1344
+ if (tensor_backend == NULL) {
1345
  fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
1346
  }
1347
+ if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) {
1348
  fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
1349
+ node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
1350
+ node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL");
1351
  }
1352
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1353
  struct ggml_tensor * src = node->src[j];
1354
  if (src == NULL) {
1355
  break;
1356
  }
1357
+ ggml_backend_t src_backend = tensor_backend(src);
1358
+ if (src_backend != tensor_backend /* && src_backend != NULL */) {
1359
  fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
1360
+ node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
1361
+ j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
1362
  }
1363
+ if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) {
1364
  fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
1365
+ src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
1366
+ src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL");
1367
  }
1368
  }
1369
  }
 
1377
  struct ggml_backend_sched_split * split = &sched->splits[i];
1378
  split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
1379
 
 
1380
  for (int j = 0; j < split->n_inputs; j++) {
1381
  struct ggml_tensor * input = split->inputs[j];
1382
+ struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
1383
+
1384
  // add a dependency to the input source so that it is not freed before the copy is done
1385
+ struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
1386
+ sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
1387
+ graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1388
+
1389
+ // add a dependency to the input copy so that it is allocated at the start of the split
1390
+ sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
1391
  graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1392
  }
1393
 
1394
  for (int j = split->i_start; j < split->i_end; j++) {
1395
+ sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
1396
  graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1397
  }
1398
  }
1399
  sched->graph = graph_copy;
1400
  }
1401
 
1402
+ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1403
+ // ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
1404
+ if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
1405
+ #ifndef NDEBUG
1406
+ fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n");
1407
+ #endif
1408
+ ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
1409
+ if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
1410
+ fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n");
1411
+ return false;
1412
+ }
1413
+ }
1414
  }
1415
 
1416
+ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
1417
  uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
1418
  uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
1419
 
 
1421
 
1422
  for (int i = 0; i < sched->n_splits; i++) {
1423
  struct ggml_backend_sched_split * split = &splits[i];
1424
+ int split_backend_id = split->backend_id;
1425
+ ggml_backend_t split_backend = sched->backends[split_backend_id];
1426
 
1427
  // copy the input tensors to the split backend
1428
  uint64_t copy_start_us = ggml_time_us();
1429
  for (int j = 0; j < split->n_inputs; j++) {
1430
  struct ggml_tensor * input = split->inputs[j];
1431
+ struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
1432
 
1433
  GGML_ASSERT(input->buffer != NULL);
1434
  GGML_ASSERT(input_cpy->buffer != NULL);
1435
 
 
 
1436
  ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
1437
  }
1438
  //ggml_backend_synchronize(split_backend); // necessary to measure copy time
 
1448
 
1449
  uint64_t compute_start_us = ggml_time_us();
1450
  if (!sched->callback_eval) {
1451
+ if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
1452
+ return false;
1453
+ }
1454
  //ggml_backend_synchronize(split_backend); // necessary to measure compute time
1455
  } else {
1456
  // similar to ggml_backend_compare_graph_backend
 
1470
 
1471
  struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
1472
 
1473
+ if (!ggml_backend_graph_compute(split_backend, &gv)) {
1474
+ return false;
1475
+ }
1476
 
1477
  if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
1478
  break;
 
1494
  }
1495
  }
1496
  #endif
 
1497
 
1498
+ return true;
1499
  }
1500
 
1501
  ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
 
1505
  struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
1506
 
1507
  // initialize hash table
1508
+ sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1509
+ sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
1510
+ sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
1511
+ sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
1512
 
1513
  sched->n_backends = n_backends;
1514
  for (int i = 0; i < n_backends; i++) {
 
1516
  sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
1517
  }
1518
 
1519
+ sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
1520
 
1521
+ ggml_backend_sched_reset(sched);
 
 
 
 
 
1522
 
1523
  return sched;
1524
  }
 
1527
  if (sched == NULL) {
1528
  return;
1529
  }
 
 
 
1530
  ggml_gallocr_free(sched->galloc);
1531
  ggml_free(sched->ctx);
1532
  free(sched->hash_set.keys);
1533
+ free(sched->tensor_backend_id);
1534
+ free(sched->tensor_copies);
1535
+ free(sched->node_backend_ids);
1536
  free(sched);
1537
  }
1538
 
1539
+ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1540
+ // reset state for the next run
1541
+ size_t hash_size = sched->hash_set.size;
1542
+ memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
1543
+ memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
1544
+ memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
1545
 
1546
+ sched->is_reset = true;
1547
+ }
1548
 
1549
+ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1550
+ ggml_backend_sched_split_graph(sched, measure_graph);
1551
+
1552
+ if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) {
1553
+ return false;
1554
  }
1555
 
1556
+ ggml_backend_sched_reset(sched);
1557
+ return true;
1558
  }
1559
 
1560
+ bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1561
  GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1562
 
1563
  if (!sched->is_reset) {
1564
+ ggml_backend_sched_reset(sched);
1565
  }
1566
 
1567
+ ggml_backend_sched_split_graph(sched, graph);
1568
+ if (!ggml_backend_sched_alloc_splits(sched)) {
1569
+ return false;
1570
+ }
1571
 
1572
+ if (!ggml_backend_sched_compute_splits(sched)) {
1573
+ return false;
1574
+ }
1575
 
1576
+ return true;
1577
+ }
1578
 
1579
  void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
1580
  sched->callback_eval = callback;
 
1585
  return sched->n_splits;
1586
  }
1587
 
1588
+ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
1589
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
 
 
 
 
 
 
1590
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1591
+ return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
1592
  }
1593
 
1594
  void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
1595
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
1596
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1597
+ tensor_backend_id(node) = backend_index;
1598
  }
1599
 
1600
  ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
1601
+ int backend_index = tensor_backend_id(node);
1602
+ if (backend_index == -1) {
1603
  return NULL;
1604
  }
1605
+ return sched->backends[backend_index];
1606
  }
1607
 
1608
  // utils
1609
 
1610
  void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
1611
  GGML_ASSERT(tensor->buffer == NULL);
 
1612
  GGML_ASSERT(tensor->view_src != NULL);
1613
  GGML_ASSERT(tensor->view_src->buffer != NULL);
1614
  GGML_ASSERT(tensor->view_src->data != NULL);
 
1632
  ggml_backend_buffer_init_tensor(buffer, tensor);
1633
  }
1634
 
1635
+ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
1636
  struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
1637
 
1638
  GGML_ASSERT(src != NULL);
 
1645
 
1646
  struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
1647
  if (src->view_src != NULL) {
1648
+ dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
1649
  dst->view_offs = src->view_offs;
1650
  }
1651
  dst->op = src->op;
 
1658
  if (s == NULL) {
1659
  break;
1660
  }
1661
+ dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
1662
  }
1663
 
1664
  node_copies[id] = dst;
1665
  return dst;
1666
  }
1667
 
1668
+ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
1669
  size_t id = ggml_hash_find(hash_set, src);
1670
  if (node_init[id]) {
1671
  return;
 
1674
 
1675
  struct ggml_tensor * dst = node_copies[id];
1676
  if (dst->view_src != NULL) {
1677
+ graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
1678
  ggml_backend_view_init(dst->view_src->buffer, dst);
1679
  }
1680
  else {
 
1687
  if (s == NULL) {
1688
  break;
1689
  }
1690
+ graph_copy_init_tensor(hash_set, node_copies, node_init, s);
1691
  }
1692
  }
1693
 
1694
  struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
1695
  struct ggml_hash_set hash_set = {
1696
  /* .size = */ graph->visited_hash_table.size,
1697
+ /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
1698
  };
1699
+ struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
1700
+ bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
1701
 
1702
  struct ggml_init_params params = {
1703
  /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
 
1726
  // dup nodes
1727
  for (int i = 0; i < graph->n_nodes; i++) {
1728
  struct ggml_tensor * node = graph->nodes[i];
1729
+ graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
1730
  }
1731
 
1732
  // allocate nodes
 
1751
  // copy data and init views
1752
  for (int i = 0; i < graph->n_nodes; i++) {
1753
  struct ggml_tensor * node = graph->nodes[i];
1754
+ graph_copy_init_tensor(hash_set, node_copies, node_init, node);
1755
  }
1756
 
1757
  // build graph copy
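
The net effect of these ggml-backend.c changes is that the scheduler now drives ggml-alloc v3 with a reserve/allocate cycle: ggml_backend_sched_alloc_splits first attempts a plain allocation of the split graph and only re-reserves the backend buffers when that fails. A minimal sketch of that retry pattern, assuming only the ggml_gallocr_* signatures used above (the arguments mirror the galloc, graph and node_backend_ids fields of ggml_backend_sched):

    // sketch: allocate a scheduled graph, re-reserving the buffers on failure
    // (same control flow as ggml_backend_sched_alloc_splits in this commit)
    static bool alloc_graph_with_fallback(ggml_gallocr_t galloc, struct ggml_cgraph * graph, int * node_backend_ids) {
        if (ggml_gallocr_alloc_graph(galloc, graph)) {
            return true; // fast path: previously reserved buffers are still large enough
        }
        // the graph grew or changed topology - recompute the buffer sizes and retry once
        if (!ggml_gallocr_reserve_n(galloc, graph, node_backend_ids)) {
            return false;
        }
        return ggml_gallocr_alloc_graph(galloc, graph);
    }
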
ggml-backend.h CHANGED
@@ -130,11 +130,7 @@ extern "C" {
130
 
131
  // in build_graph:
132
  build_graph(...) {
133
- // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
134
- alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
135
- ggml_allocr_alloc(alloc_cpu, tensor);
136
-
137
- // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
138
  struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
139
  ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
140
  }
@@ -164,20 +160,19 @@ extern "C" {
164
  GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
165
  GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
166
  // Initialize backend buffers from a measure graph
167
- GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
168
  // Get the number of splits of the last graph
169
  GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
170
 
171
- GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
172
- GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
173
 
174
  GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
175
  GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
176
 
177
  // Allocate and compute graph on the backend scheduler
178
- GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
179
 
180
- // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
181
  GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
182
 
183
  // Set a callback to be called for each resulting node during graph compute
 
130
 
131
  // in build_graph:
132
  build_graph(...) {
133
+ // manually assign nodes to a backend (optional, should not be needed in most cases)
 
 
 
 
134
  struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
135
  ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
136
  }
 
160
  GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
161
  GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
162
  // Initialize backend buffers from a measure graph
163
+ GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
164
  // Get the number of splits of the last graph
165
  GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
166
 
167
+ GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
 
168
 
169
  GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
170
  GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
171
 
172
  // Allocate and compute graph on the backend scheduler
173
+ GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
174
 
175
+ // Reset all assignments and allocators - must be called before changing the node backends
176
  GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
177
 
178
  // Set a callback to be called for each resulting node during graph compute
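
In short, ggml-backend.h swaps the old measure/allocr entry points for a bool-returning reserve/compute pair and replaces the per-backend tallocr/buffer getters with a buffer-size query. A hedged usage sketch built only from the declarations above; the backends, the measure graph and the per-iteration graph are assumed to be provided by the caller:

    // requires ggml-backend.h and <stdio.h>; error handling abridged
    static bool run_with_sched(ggml_backend_t backend_gpu, ggml_backend_t backend_cpu,
                               struct ggml_cgraph * measure_graph, struct ggml_cgraph * graph, size_t graph_size) {
        ggml_backend_t backends[2] = { backend_gpu, backend_cpu };
        ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, graph_size);

        // reserve worst-case compute buffers once, from a measure graph
        if (!ggml_backend_sched_reserve(sched, measure_graph)) {
            ggml_backend_sched_free(sched);
            return false;
        }
        fprintf(stderr, "compute buffer (gpu) = %.2f MB\n",
                ggml_backend_sched_get_buffer_size(sched, backend_gpu) / 1e6);

        // per iteration: split, allocate and compute - all of which can now fail
        const bool ok = ggml_backend_sched_graph_compute(sched, graph);

        ggml_backend_sched_free(sched);
        return ok;
    }
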
ggml.c CHANGED
@@ -2607,7 +2607,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2607
  /*.nb =*/ { 0, 0, 0, 0 },
2608
  /*.op =*/ GGML_OP_NONE,
2609
  /*.op_params =*/ { 0 },
2610
- /*.is_param =*/ false,
2611
  /*.grad =*/ NULL,
2612
  /*.src =*/ { NULL },
2613
  /*.perf_runs =*/ 0,
@@ -6509,7 +6509,7 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
6509
  void ggml_set_param(
6510
  struct ggml_context * ctx,
6511
  struct ggml_tensor * tensor) {
6512
- tensor->is_param = true;
6513
 
6514
  GGML_ASSERT(tensor->grad == NULL);
6515
  tensor->grad = ggml_dup_tensor(ctx, tensor);
@@ -15311,7 +15311,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
15311
  return NULL;
15312
  }
15313
 
15314
- if (node->is_param) {
15315
  return node;
15316
  }
15317
 
@@ -15345,7 +15345,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
15345
 
15346
  clone->op = node->op;
15347
  clone->grad = node->grad;
15348
- clone->is_param = node->is_param;
15349
  clone->extra = node->extra;
15350
  for (int k = 0; k < GGML_MAX_DIMS; ++k) {
15351
  clone->nb[k] = node->nb[k];
@@ -16377,7 +16377,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
16377
  for (int i = 0; i < gf->n_nodes; i++) {
16378
  struct ggml_tensor * node = gf->nodes[i];
16379
 
16380
- if (node->is_param) {
16381
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16382
  ggml_build_forward_expand(gb, node->grad);
16383
  }
@@ -17862,7 +17862,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17862
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
17863
  i,
17864
  node->ne[0], node->ne[1], node->ne[2],
17865
- ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
17866
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
17867
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
17868
  (double) node->perf_time_us / 1000.0,
@@ -17955,7 +17955,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17955
  continue;
17956
  }
17957
 
17958
- if (node->is_param) {
17959
  snprintf(color, sizeof(color), "yellow");
17960
  } else if (node->grad) {
17961
  if (ggml_graph_find(gf, node)) {
@@ -18129,7 +18129,7 @@ static enum ggml_opt_result ggml_opt_adam(
18129
  int np = 0;
18130
  int64_t nx = 0;
18131
  for (int i = 0; i < gf->n_nodes; ++i) {
18132
- if (gf->nodes[i]->is_param) {
18133
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
18134
 
18135
  GGML_ASSERT(np < GGML_MAX_PARAMS);
@@ -18492,7 +18492,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18492
  int np = 0;
18493
  int nx = 0;
18494
  for (int i = 0; i < gf->n_nodes; ++i) {
18495
- if (gf->nodes[i]->is_param) {
18496
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
18497
 
18498
  GGML_ASSERT(np < GGML_MAX_PARAMS);
@@ -18967,6 +18967,16 @@ enum ggml_opt_result ggml_opt_resume_g(
18967
 
18968
  ////////////////////////////////////////////////////////////////////////////////
18969
 
 
 
 
 
 
 
 
 
 
 
18970
  void ggml_quantize_init(enum ggml_type type) {
18971
  ggml_critical_section_start();
18972
 
 
2607
  /*.nb =*/ { 0, 0, 0, 0 },
2608
  /*.op =*/ GGML_OP_NONE,
2609
  /*.op_params =*/ { 0 },
2610
+ /*.flags =*/ 0,
2611
  /*.grad =*/ NULL,
2612
  /*.src =*/ { NULL },
2613
  /*.perf_runs =*/ 0,
 
6509
  void ggml_set_param(
6510
  struct ggml_context * ctx,
6511
  struct ggml_tensor * tensor) {
6512
+ tensor->flags |= GGML_TENSOR_FLAG_PARAM;
6513
 
6514
  GGML_ASSERT(tensor->grad == NULL);
6515
  tensor->grad = ggml_dup_tensor(ctx, tensor);
 
15311
  return NULL;
15312
  }
15313
 
15314
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
15315
  return node;
15316
  }
15317
 
 
15345
 
15346
  clone->op = node->op;
15347
  clone->grad = node->grad;
15348
+ clone->flags = node->flags;
15349
  clone->extra = node->extra;
15350
  for (int k = 0; k < GGML_MAX_DIMS; ++k) {
15351
  clone->nb[k] = node->nb[k];
 
16377
  for (int i = 0; i < gf->n_nodes; i++) {
16378
  struct ggml_tensor * node = gf->nodes[i];
16379
 
16380
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
16381
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16382
  ggml_build_forward_expand(gb, node->grad);
16383
  }
 
17862
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
17863
  i,
17864
  node->ne[0], node->ne[1], node->ne[2],
17865
+ ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs,
17866
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
17867
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
17868
  (double) node->perf_time_us / 1000.0,
 
17955
  continue;
17956
  }
17957
 
17958
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
17959
  snprintf(color, sizeof(color), "yellow");
17960
  } else if (node->grad) {
17961
  if (ggml_graph_find(gf, node)) {
 
18129
  int np = 0;
18130
  int64_t nx = 0;
18131
  for (int i = 0; i < gf->n_nodes; ++i) {
18132
+ if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
18133
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
18134
 
18135
  GGML_ASSERT(np < GGML_MAX_PARAMS);
 
18492
  int np = 0;
18493
  int nx = 0;
18494
  for (int i = 0; i < gf->n_nodes; ++i) {
18495
+ if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
18496
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
18497
 
18498
  GGML_ASSERT(np < GGML_MAX_PARAMS);
 
18967
 
18968
  ////////////////////////////////////////////////////////////////////////////////
18969
 
18970
+ void ggml_set_input(struct ggml_tensor * tensor) {
18971
+ tensor->flags |= GGML_TENSOR_FLAG_INPUT;
18972
+ }
18973
+
18974
+ void ggml_set_output(struct ggml_tensor * tensor) {
18975
+ tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
18976
+ }
18977
+
18978
+ ////////////////////////////////////////////////////////////////////////////////
18979
+
18980
  void ggml_quantize_init(enum ggml_type type) {
18981
  ggml_critical_section_start();
18982
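
The ggml.c side of the change is mechanical: ggml_set_param now sets a bit in the new flags field and every former is_param check becomes a bit test, which is what lets the input/output markers share the same storage. A small usage sketch, not code from this commit; ctx, n_in and n_out are placeholders:

    // mark a tensor as a trainable parameter and detect it later through the flags bitfield
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, n_out);
    ggml_set_param(ctx, w); // sets GGML_TENSOR_FLAG_PARAM and allocates w->grad

    if (w->flags & GGML_TENSOR_FLAG_PARAM) {
        // the optimizers above (adam, lbfgs) now recognize w as a parameter this way
    }
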
 
ggml.h CHANGED
@@ -505,11 +505,17 @@ extern "C" {
505
 
506
  enum ggml_log_level {
507
  GGML_LOG_LEVEL_ERROR = 2,
508
- GGML_LOG_LEVEL_WARN = 3,
509
- GGML_LOG_LEVEL_INFO = 4,
510
  GGML_LOG_LEVEL_DEBUG = 5
511
  };
512
 
 
 
 
 
 
 
513
  // ggml object
514
  struct ggml_object {
515
  size_t offs;
@@ -543,7 +549,7 @@ extern "C" {
543
  // op params - allocated as int32_t for alignment
544
  int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
545
 
546
- bool is_param;
547
 
548
  struct ggml_tensor * grad;
549
  struct ggml_tensor * src[GGML_MAX_SRC];
@@ -2092,6 +2098,12 @@ extern "C" {
2092
  ggml_opt_callback callback,
2093
  void * callback_data);
2094
 
 
 
 
 
 
 
2095
  //
2096
  // quantization
2097
  //
 
505
 
506
  enum ggml_log_level {
507
  GGML_LOG_LEVEL_ERROR = 2,
508
+ GGML_LOG_LEVEL_WARN = 3,
509
+ GGML_LOG_LEVEL_INFO = 4,
510
  GGML_LOG_LEVEL_DEBUG = 5
511
  };
512
 
513
+ enum ggml_tensor_flag {
514
+ GGML_TENSOR_FLAG_INPUT = 1,
515
+ GGML_TENSOR_FLAG_OUTPUT = 2,
516
+ GGML_TENSOR_FLAG_PARAM = 4,
517
+ };
518
+
519
  // ggml object
520
  struct ggml_object {
521
  size_t offs;
 
549
  // op params - allocated as int32_t for alignment
550
  int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
551
 
552
+ int32_t flags;
553
 
554
  struct ggml_tensor * grad;
555
  struct ggml_tensor * src[GGML_MAX_SRC];
 
2098
  ggml_opt_callback callback,
2099
  void * callback_data);
2100
 
2101
+ //
2102
+ // tensor flags
2103
+ //
2104
+ GGML_API void ggml_set_input(struct ggml_tensor * tensor);
2105
+ GGML_API void ggml_set_output(struct ggml_tensor * tensor);
2106
+
2107
  //
2108
  // quantization
2109
  //
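
The whisper.cpp changes below lean on these new flags to replace the old allocr-driven input handling: graph inputs are named and tagged with ggml_set_input() at build time, then looked up by name and filled only after the graph has been allocated. A condensed sketch of that pattern, assuming a context ctx0, a graph gf, shape placeholders n_ctx/n_mels and a caller-provided host_data buffer:

    // at graph build time: name the input and tag it for the allocator
    struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
    ggml_set_name (mel, "mel");
    ggml_set_input(mel);

    // after ggml_gallocr_alloc_graph(alloc, gf): find the input again and upload the data
    struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "mel");
    ggml_backend_tensor_set(inp, host_data, 0, ggml_nbytes(inp));
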
whisper.cpp CHANGED
@@ -471,52 +471,32 @@ struct whisper_pair {
471
 
472
  // ggml_allocr wrapper for whisper usage
473
  struct whisper_allocr {
474
- ggml_allocr * alloc = nullptr;
475
 
476
  std::vector<uint8_t> meta;
477
-
478
- ggml_backend_buffer_t buffer;
479
  };
480
 
481
  static size_t whisper_allocr_size(struct whisper_allocr & allocr) {
482
- return allocr.meta.size() + ggml_allocr_max_size(allocr.alloc);
483
  }
484
 
485
  // measure the memory usage of a graph and prepare the allocr's internal data buffer
486
- static void whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function<struct ggml_cgraph *()> && get_graph) {
487
  auto & alloc = allocr.alloc;
488
  auto & meta = allocr.meta;
489
 
490
- alloc = ggml_allocr_new_measure_from_backend(backend);
491
 
492
  meta.resize(ggml_tensor_overhead()*WHISPER_MAX_NODES + ggml_graph_overhead());
493
 
494
- ggml_allocr_alloc_graph(alloc, get_graph());
495
- }
496
-
497
- static void whisper_allocr_graph_realloc(struct whisper_allocr & allocr, ggml_backend_t backend) {
498
- if (allocr.alloc == nullptr) {
499
- // this can be null if we use external encoder like CoreML or OpenVINO
500
- return;
501
- }
502
-
503
- auto & alloc = allocr.alloc;
504
- auto & buffer = allocr.buffer;
505
-
506
- size_t size = ggml_allocr_max_size(alloc);
507
-
508
- ggml_allocr_free(alloc);
509
-
510
- buffer = ggml_backend_alloc_buffer(backend, size);
511
- alloc = ggml_allocr_new_from_buffer(buffer);
512
- }
513
-
514
- static void whisper_allocr_free(struct whisper_allocr & allocr) {
515
- if (allocr.alloc) {
516
- ggml_allocr_free(allocr.alloc);
517
- ggml_backend_buffer_free(allocr.buffer);
518
- allocr.alloc = nullptr;
519
  }
 
520
  }
521
 
522
  // medium
@@ -658,9 +638,9 @@ struct whisper_kv_cache {
658
  struct ggml_tensor * k;
659
  struct ggml_tensor * v;
660
 
661
- struct ggml_context * ctx;
662
 
663
- ggml_backend_buffer_t buffer;
664
  };
665
 
666
  struct whisper_model {
@@ -698,10 +678,10 @@ struct whisper_model {
698
  std::vector<whisper_layer_decoder> layers_decoder;
699
 
700
  // ggml context that contains all the meta information about the model tensors
701
- struct ggml_context * ctx;
702
 
703
  // the model backend data is read-only and can be shared between processors
704
- std::vector<struct ggml_backend_buffer *> buffers;
705
 
706
  // tensors
707
  int n_loaded;
@@ -903,36 +883,26 @@ static bool kv_cache_init(
903
  cache.ctx = ggml_init(params);
904
 
905
  if (!cache.ctx) {
906
- WHISPER_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
907
  return false;
908
  }
909
 
910
  cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
911
  cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
912
 
913
- const size_t mem_bytes = ggml_nbytes(cache.k) + ggml_nbytes(cache.v);
914
-
915
- cache.buffer = ggml_backend_alloc_buffer(backend, mem_bytes);
916
-
917
- // allocate the tensors into the backend buffer
918
- {
919
- ggml_allocr * alloc = ggml_allocr_new_from_buffer(cache.buffer);
920
-
921
- ggml_allocr_alloc(alloc, cache.k);
922
- ggml_allocr_alloc(alloc, cache.v);
923
-
924
- ggml_allocr_free(alloc);
925
  }
926
 
927
  return true;
928
  }
929
 
930
  static void kv_cache_free(struct whisper_kv_cache & cache) {
931
- if (cache.ctx) {
932
- ggml_free(cache.ctx);
933
- ggml_backend_buffer_free(cache.buffer);
934
- cache.ctx = nullptr;
935
- }
936
  }
937
 
938
  static bool whisper_kv_cache_find_slot(
@@ -1513,68 +1483,21 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1513
  }
1514
 
1515
  wctx.backend = whisper_backend_init(wctx.params);
1516
-
1517
- // some devices have a limit on the maximum size of single memory buffer
1518
- // for example, iPhones are limited to 1GB per buffer
1519
- // to workaround this, we will allocate multiple buffers of smaller size and will split the tensors with the
1520
- // model weights between them
1521
- //
1522
- // the map_t2b maps tensor names to buffer indices
1523
- // as we iterate over the tensors, we will allocate new buffers when the current one is full
1524
- //
1525
- // finally, we create a separate allocator for each buffer and use it to allocate the tensors
1526
- // we keep the allocators alive until all the tensors are loaded
1527
-
1528
- GGML_ASSERT(model.buffers.empty());
1529
-
1530
- std::map<std::string, int> map_t2b;
1531
-
1532
- {
1533
- size_t size_main = 0;
1534
- size_t size_cur = 0;
1535
-
1536
- static const size_t GB = 1024ull*1024ull*1024ull;
1537
-
1538
- for (const auto & t : model.tensors) {
1539
- const size_t cur = ggml_nbytes(t.second) + ggml_tensor_overhead();
1540
-
1541
- // adding the tensor to the current buffer will exceed the limit, so we need to allocate a new buffer
1542
- if (size_cur + cur > GB) {
1543
- GGML_ASSERT(size_cur > 0 && "A tensor is too large to fit in a single buffer");
1544
-
1545
- model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
1546
-
1547
- size_cur = cur;
1548
- }
1549
-
1550
- map_t2b[t.first] = model.buffers.size();
1551
-
1552
- size_cur += cur;
1553
- size_main += cur;
1554
- }
1555
-
1556
- // allocate the last buffer if needed
1557
- if (size_cur > 0) {
1558
- model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
1559
- }
1560
-
1561
- GGML_ASSERT(model.buffers.size() > 0);
1562
-
1563
- WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB (%d buffers)\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6, (int) model.buffers.size());
1564
- }
1565
-
1566
- std::vector<ggml_allocr *> allocs(model.buffers.size());
1567
- for (size_t i = 0; i < allocs.size(); ++i) {
1568
- allocs[i] = ggml_allocr_new_from_buffer(model.buffers[i]);
1569
  }
1570
 
1571
  // allocate tensors in the backend buffers
1572
- {
1573
- for (const auto & t : model.tensors) {
1574
- ggml_allocr_alloc(allocs[map_t2b[t.first]], t.second);
1575
- }
1576
  }
1577
 
 
 
 
1578
  // load weights
1579
  {
1580
  size_t total_size = 0;
@@ -1636,15 +1559,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1636
  return false;
1637
  }
1638
 
1639
- ggml_backend_t backend = wctx.backend;
1640
 
1641
  //printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());
1642
 
1643
- if ((ggml_backend_is_cpu(backend)
1644
- #ifdef GGML_USE_METAL
1645
- || ggml_backend_is_metal(backend)
1646
- #endif
1647
- )) {
1648
  // for the CPU and Metal backend, we can read directly into the tensor
1649
  loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
1650
  BYTESWAP_TENSOR(tensor);
@@ -1672,10 +1591,6 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1672
  }
1673
  }
1674
 
1675
- for (auto & alloc : allocs) {
1676
- ggml_allocr_free(alloc);
1677
- }
1678
-
1679
  wctx.t_load_us = ggml_time_us() - t_start_us;
1680
 
1681
  return true;
@@ -1704,7 +1619,6 @@ static struct ggml_cgraph * whisper_build_graph_conv(
1704
  whisper_state & wstate,
1705
  const int mel_offset) {
1706
  const auto & model = wctx.model;
1707
- const auto & mel_inp = wstate.mel;
1708
  const auto & hparams = model.hparams;
1709
 
1710
  const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
@@ -1722,31 +1636,9 @@ static struct ggml_cgraph * whisper_build_graph_conv(
1722
 
1723
  ggml_cgraph * gf = ggml_new_graph(ctx0);
1724
 
1725
- ggml_allocr * alloc = wstate.alloc_conv.alloc;
1726
-
1727
  struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
1728
- ggml_allocr_alloc(alloc, mel);
1729
-
1730
- assert(mel->type == GGML_TYPE_F32);
1731
- if (!ggml_allocr_is_measure(alloc)) {
1732
- assert(mel_inp.n_mel == n_mels);
1733
-
1734
- wstate.inp_mel.resize(ggml_nelements(mel));
1735
-
1736
- float * dst = wstate.inp_mel.data();
1737
- memset(dst, 0, ggml_nbytes(mel));
1738
-
1739
- const int i0 = std::min(mel_offset, mel_inp.n_len);
1740
- const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
1741
-
1742
- for (int j = 0; j < mel_inp.n_mel; ++j) {
1743
- for (int i = i0; i < i1; ++i) {
1744
- dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
1745
- }
1746
- }
1747
-
1748
- ggml_backend_tensor_set(mel, wstate.inp_mel.data(), 0, ggml_nelements(mel)*sizeof(float));
1749
- }
1750
 
1751
  struct ggml_tensor * cur = nullptr;
1752
 
@@ -2138,11 +2030,39 @@ static bool whisper_encode_internal(
2138
  {
2139
  auto & alloc = wstate.alloc_conv.alloc;
2140
 
2141
- ggml_allocr_reset(alloc);
2142
-
2143
  ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate, mel_offset);
2144
 
2145
- ggml_allocr_alloc_graph(alloc, gf);
 
2146
 
2147
  if (!whisper_encode_external(wstate)) {
2148
  if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
@@ -2155,11 +2075,12 @@ static bool whisper_encode_internal(
2155
  if (!whisper_encode_external(wstate)) {
2156
  auto & alloc = wstate.alloc_encode.alloc;
2157
 
2158
- ggml_allocr_reset(alloc);
2159
-
2160
  ggml_cgraph * gf = whisper_build_graph_encoder(wctx, wstate);
2161
 
2162
- ggml_allocr_alloc_graph(alloc, gf);
 
 
 
2163
 
2164
  if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
2165
  return false;
@@ -2170,11 +2091,12 @@ static bool whisper_encode_internal(
2170
  {
2171
  auto & alloc = wstate.alloc_cross.alloc;
2172
 
2173
- ggml_allocr_reset(alloc);
2174
-
2175
  ggml_cgraph * gf = whisper_build_graph_cross(wctx, wstate);
2176
 
2177
- ggml_allocr_alloc_graph(alloc, gf);
 
 
 
2178
 
2179
  if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
2180
  return false;
@@ -2190,7 +2112,8 @@ static bool whisper_encode_internal(
2190
  static struct ggml_cgraph * whisper_build_graph_decoder(
2191
  whisper_context & wctx,
2192
  whisper_state & wstate,
2193
- const whisper_batch & batch) {
 
2194
  const auto & model = wctx.model;
2195
  const auto & hparams = model.hparams;
2196
 
@@ -2198,8 +2121,6 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
2198
 
2199
  WHISPER_ASSERT(!!kv_self.ctx);
2200
 
2201
- ggml_allocr * alloc = wstate.alloc_decode.alloc;
2202
-
2203
  const int n_ctx = kv_self.size;
2204
  const int n_state = hparams.n_text_state;
2205
  const int n_head = hparams.n_text_head;
@@ -2208,8 +2129,8 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
2208
  const int n_tokens = batch.n_tokens;
2209
  const int n_audio_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
2210
 
2211
- const int32_t n_kv = ggml_allocr_is_measure(alloc) ? n_ctx : kv_self.n;
2212
- const int32_t kv_head = ggml_allocr_is_measure(alloc) ? n_ctx - n_tokens : kv_self.head;
2213
 
2214
  //WHISPER_LOG_DEBUG("%s: n_past = %d, n_tokens = %d, n_audio_ctx = %d, n_ctx = %d\n", __func__, n_past, n_tokens, n_audio_ctx, n_ctx);
2215
 
@@ -2224,48 +2145,18 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
2224
  ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);
2225
 
2226
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2227
- ggml_allocr_alloc(alloc, embd);
2228
-
2229
- if (!ggml_allocr_is_measure(alloc)) {
2230
- ggml_backend_tensor_set(embd, batch.token, 0, n_tokens*ggml_element_size(embd));
2231
- }
2232
 
2233
  struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2234
- ggml_allocr_alloc(alloc, position);
2235
-
2236
- if (!ggml_allocr_is_measure(alloc)) {
2237
- for (int i = 0; i < n_tokens; ++i) {
2238
- const int32_t val = batch.pos[i];
2239
- ggml_backend_tensor_set(position, &val, i*sizeof(int32_t), sizeof(int32_t));
2240
- }
2241
- }
2242
 
2243
  const float KQscale = pow(float(n_state)/n_head, -0.25);
2244
 
2245
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
2246
- ggml_allocr_alloc(alloc, KQ_mask);
2247
-
2248
- if (!ggml_allocr_is_measure(alloc)) {
2249
- wstate.inp_mask.resize(n_kv*n_tokens);
2250
-
2251
- float * data = wstate.inp_mask.data();
2252
- memset(data, 0, ggml_nbytes(KQ_mask));
2253
-
2254
- for (int h = 0; h < 1; ++h) {
2255
- for (int j = 0; j < n_tokens; ++j) {
2256
- const whisper_pos pos = batch.pos[j];
2257
- const whisper_seq_id seq_id = batch.seq_id[j][0];
2258
-
2259
- for (int i = 0; i < n_kv; ++i) {
2260
- if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
2261
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
2262
- }
2263
- }
2264
- }
2265
- }
2266
-
2267
- ggml_backend_tensor_set(KQ_mask, wstate.inp_mask.data(), 0, ggml_nelements(KQ_mask)*sizeof(float));
2268
- }
2269
 
2270
  // token encoding + position encoding
2271
  struct ggml_tensor * cur =
@@ -2592,11 +2483,53 @@ static bool whisper_decode_internal(
2592
  {
2593
  auto & alloc = wstate.alloc_decode.alloc;
2594
 
2595
- ggml_allocr_reset(alloc);
2596
 
2597
- ggml_cgraph * gf = whisper_build_graph_decoder(wctx, wstate, batch);
 
 
 
2598
 
2599
- ggml_allocr_alloc_graph(alloc, gf);
 
2600
 
2601
  logits = gf->nodes[gf->n_nodes - 1];
2602
 
@@ -3046,6 +2979,11 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
3046
  whisper_state * state = new whisper_state;
3047
 
3048
  state->backend = whisper_backend_init(ctx->params);
 
 
 
 
 
3049
 
3050
  // at this point, we don't know yet how many decoders will be used, so we overallocate 3x ctx
3051
  // in theory, there can be a case where this is not enough, but in practice it should always be enough
@@ -3053,7 +2991,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
3053
 
3054
  if (!kv_cache_init(ctx->model.hparams, state->kv_self, ctx->backend, ctx->itype, factor*ctx->model.hparams.n_text_ctx)) {
3055
  WHISPER_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__);
3056
- delete state;
3057
  return nullptr;
3058
  }
3059
 
@@ -3064,7 +3002,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
3064
 
3065
  if (!kv_cache_init(ctx->model.hparams, state->kv_cross, ctx->backend, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
3066
  WHISPER_LOG_ERROR("%s: kv_cache_init() failed for cross-attention cache\n", __func__);
3067
- delete state;
3068
  return nullptr;
3069
  }
3070
 
@@ -3083,7 +3021,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
3083
  if (!state->ctx_coreml) {
3084
  WHISPER_LOG_ERROR("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
3085
  #ifndef WHISPER_COREML_ALLOW_FALLBACK
3086
- delete state;
3087
  return nullptr;
3088
  #endif
3089
  } else {
@@ -3107,37 +3045,55 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
3107
 
3108
  // conv allocator
3109
  {
3110
- whisper_allocr_graph_init(state->alloc_conv, ctx->backend,
3111
  [&]() {
3112
  return whisper_build_graph_conv(*ctx, *state, 0);
3113
  });
3114
 
 
 
 
 
 
 
3115
  WHISPER_LOG_INFO("%s: compute buffer (conv) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_conv) / 1e6);
3116
  }
3117
 
3118
  // encoder allocator
3119
  if (!whisper_encode_external(*state)) {
3120
- whisper_allocr_graph_init(state->alloc_encode, ctx->backend,
3121
  [&]() {
3122
  return whisper_build_graph_encoder(*ctx, *state);
3123
  });
3124
 
 
 
 
 
 
 
3125
  WHISPER_LOG_INFO("%s: compute buffer (encode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_encode) / 1e6);
3126
  }
3127
 
3128
  // cross allocator
3129
  {
3130
- whisper_allocr_graph_init(state->alloc_cross, ctx->backend,
3131
  [&]() {
3132
  return whisper_build_graph_cross(*ctx, *state);
3133
  });
3134
 
 
 
 
 
 
 
3135
  WHISPER_LOG_INFO("%s: compute buffer (cross) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_cross) / 1e6);
3136
  }
3137
 
3138
  // decoder allocator
3139
  {
3140
- whisper_allocr_graph_init(state->alloc_decode, ctx->backend,
3141
  [&]() {
3142
  const auto & hparams = ctx->model.hparams;
3143
 
@@ -3147,17 +3103,18 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
3147
 
3148
  whisper_batch_prep_legacy(state->batch, nullptr, n_tokens, n_past, 0);
3149
 
3150
- return whisper_build_graph_decoder(*ctx, *state, state->batch);
3151
  });
3152
 
 
 
 
 
 
 
3153
  WHISPER_LOG_INFO("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_decode) / 1e6);
3154
  }
3155
 
3156
- whisper_allocr_graph_realloc(state->alloc_conv, ctx->backend);
3157
- whisper_allocr_graph_realloc(state->alloc_encode, ctx->backend);
3158
- whisper_allocr_graph_realloc(state->alloc_cross, ctx->backend);
3159
- whisper_allocr_graph_realloc(state->alloc_decode, ctx->backend);
3160
-
3161
  return state;
3162
  }
3163
 
@@ -3380,8 +3337,7 @@ struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loa
3380
  return whisper_init_with_params_no_state(loader, whisper_context_default_params());
3381
  }
3382
 
3383
- void whisper_free_state(struct whisper_state * state)
3384
- {
3385
  if (state) {
3386
  kv_cache_free(state->kv_self);
3387
  kv_cache_free(state->kv_cross);
@@ -3402,10 +3358,10 @@ void whisper_free_state(struct whisper_state * state)
3402
 
3403
  whisper_batch_free(state->batch);
3404
 
3405
- whisper_allocr_free(state->alloc_conv);
3406
- whisper_allocr_free(state->alloc_encode);
3407
- whisper_allocr_free(state->alloc_cross);
3408
- whisper_allocr_free(state->alloc_decode);
3409
 
3410
  ggml_backend_free(state->backend);
3411
 
@@ -3415,15 +3371,9 @@ void whisper_free_state(struct whisper_state * state)
3415
 
3416
  void whisper_free(struct whisper_context * ctx) {
3417
  if (ctx) {
3418
- if (ctx->model.ctx) {
3419
- ggml_free(ctx->model.ctx);
3420
- }
3421
 
3422
- for (auto & buffer : ctx->model.buffers) {
3423
- if (buffer) {
3424
- ggml_backend_buffer_free(buffer);
3425
- }
3426
- }
3427
 
3428
  whisper_free_state(ctx->state);
3429
 
 
471
 
472
  // ggml_allocr wrapper for whisper usage
473
  struct whisper_allocr {
474
+ ggml_gallocr_t alloc = nullptr;
475
 
476
  std::vector<uint8_t> meta;
 
 
477
  };
478
 
479
  static size_t whisper_allocr_size(struct whisper_allocr & allocr) {
480
+ return allocr.meta.size() + ggml_gallocr_get_buffer_size(allocr.alloc, 0);
481
  }
482
 
483
  // measure the memory usage of a graph and prepare the allocr's internal data buffer
484
+ static bool whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function<struct ggml_cgraph *()> && get_graph) {
485
  auto & alloc = allocr.alloc;
486
  auto & meta = allocr.meta;
487
 
488
+ alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
489
 
490
  meta.resize(ggml_tensor_overhead()*WHISPER_MAX_NODES + ggml_graph_overhead());
491
 
492
+ // since there are dependencies between the different graphs,
493
+ // we need to allocate them instead of only reserving to get the correct compute buffer size
494
+ if (!ggml_gallocr_alloc_graph(alloc, get_graph())) {
495
+ // failed to allocate the compute buffer
496
+ WHISPER_LOG_ERROR("%s: failed to allocate the compute buffer\n", __func__);
497
+ return false;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
  }
499
+ return true;
500
  }
501
 
502
  // medium
 
638
  struct ggml_tensor * k;
639
  struct ggml_tensor * v;
640
 
641
+ struct ggml_context * ctx = nullptr;
642
 
643
+ ggml_backend_buffer_t buffer = nullptr;
644
  };
645
 
646
  struct whisper_model {
 
678
  std::vector<whisper_layer_decoder> layers_decoder;
679
 
680
  // ggml context that contains all the meta information about the model tensors
681
+ struct ggml_context * ctx = nullptr;
682
 
683
  // the model backend data is read-only and can be shared between processors
684
+ ggml_backend_buffer_t buffer = nullptr;
685
 
686
  // tensors
687
  int n_loaded;
 
883
  cache.ctx = ggml_init(params);
884
 
885
  if (!cache.ctx) {
886
+ WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache context\n", __func__);
887
  return false;
888
  }
889
 
890
  cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
891
  cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
892
 
893
+ cache.buffer = ggml_backend_alloc_ctx_tensors(cache.ctx, backend);
894
+ if (!cache.buffer) {
895
+ WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache\n", __func__);
896
+ return false;
 
 
 
 
 
 
 
 
897
  }
898
 
899
  return true;
900
  }
901
 
902
  static void kv_cache_free(struct whisper_kv_cache & cache) {
903
+ ggml_free(cache.ctx);
904
+ ggml_backend_buffer_free(cache.buffer);
905
+ cache.ctx = nullptr;
 
 
906
  }
907
 
908
  static bool whisper_kv_cache_find_slot(
 
1483
  }
1484
 
1485
  wctx.backend = whisper_backend_init(wctx.params);
1486
+ if (!wctx.backend) {
1487
+ WHISPER_LOG_ERROR("%s: failed to initialize the backend\n", __func__);
1488
+ return false;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1489
  }
1490
 
1491
  // allocate tensors in the backend buffers
1492
+ model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, wctx.backend);
1493
+ if (!model.buffer) {
1494
+ WHISPER_LOG_ERROR("%s: failed to allocate memory for the model\n", __func__);
1495
+ return false;
1496
  }
1497
 
1498
+ size_t size_main = ggml_backend_buffer_get_size(model.buffer);
1499
+ WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6);
1500
+
1501
  // load weights
1502
  {
1503
  size_t total_size = 0;
 
1559
  return false;
1560
  }
1561
 
1562
+ //ggml_backend_t backend = wctx.backend;
1563
 
1564
  //printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());
1565
 
1566
+ if (ggml_backend_buffer_is_host(model.buffer)) {
 
 
 
 
1567
  // for the CPU and Metal backend, we can read directly into the tensor
1568
  loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
1569
  BYTESWAP_TENSOR(tensor);
 
1591
  }
1592
  }
1593
 
 
 
 
 
1594
  wctx.t_load_us = ggml_time_us() - t_start_us;
1595
 
1596
  return true;
 
1619
  whisper_state & wstate,
1620
  const int mel_offset) {
1621
  const auto & model = wctx.model;
 
1622
  const auto & hparams = model.hparams;
1623
 
1624
  const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
 
1636
 
1637
  ggml_cgraph * gf = ggml_new_graph(ctx0);
1638
 
 
 
1639
  struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
1640
+ ggml_set_name(mel, "mel");
1641
+ ggml_set_input(mel);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1642
 
1643
  struct ggml_tensor * cur = nullptr;
1644
 
 
2030
  {
2031
  auto & alloc = wstate.alloc_conv.alloc;
2032
 
 
 
2033
  ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate, mel_offset);
2034
 
2035
+ if (!ggml_gallocr_alloc_graph(alloc, gf)) {
2036
+ // should never happen as we pre-allocate the memory
2037
+ return false;
2038
+ }
2039
+
2040
+ // set the input
2041
+ {
2042
+ const auto & mel_inp = wstate.mel;
2043
+ const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : wctx.model.hparams.n_audio_ctx;
2044
+
2045
+ struct ggml_tensor * mel = ggml_graph_get_tensor(gf, "mel");
2046
+
2047
+ assert(mel->type == GGML_TYPE_F32);
2048
+ assert(mel_inp.n_mel == wctx.model.hparams.n_mels);
2049
+
2050
+ wstate.inp_mel.resize(ggml_nelements(mel));
2051
+
2052
+ float * dst = wstate.inp_mel.data();
2053
+ memset(dst, 0, ggml_nbytes(mel));
2054
+
2055
+ const int i0 = std::min(mel_offset, mel_inp.n_len);
2056
+ const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
2057
+
2058
+ for (int j = 0; j < mel_inp.n_mel; ++j) {
2059
+ for (int i = i0; i < i1; ++i) {
2060
+ dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
2061
+ }
2062
+ }
2063
+
2064
+ ggml_backend_tensor_set(mel, wstate.inp_mel.data(), 0, ggml_nelements(mel)*sizeof(float));
2065
+ }
2066
 
2067
  if (!whisper_encode_external(wstate)) {
2068
  if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
 
2075
  if (!whisper_encode_external(wstate)) {
2076
  auto & alloc = wstate.alloc_encode.alloc;
2077
 
 
 
2078
  ggml_cgraph * gf = whisper_build_graph_encoder(wctx, wstate);
2079
 
2080
+ if (!ggml_gallocr_alloc_graph(alloc, gf)) {
2081
+ // should never happen as we pre-allocate the memory
2082
+ return false;
2083
+ }
2084
 
2085
  if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
2086
  return false;
 
2091
  {
2092
  auto & alloc = wstate.alloc_cross.alloc;
2093
 
 
 
2094
  ggml_cgraph * gf = whisper_build_graph_cross(wctx, wstate);
2095
 
2096
+ if (!ggml_gallocr_alloc_graph(alloc, gf)) {
2097
+ // should never happen as we pre-allocate the memory
2098
+ return false;
2099
+ }
2100
 
2101
  if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
2102
  return false;
 
2112
  static struct ggml_cgraph * whisper_build_graph_decoder(
2113
  whisper_context & wctx,
2114
  whisper_state & wstate,
2115
+ const whisper_batch & batch,
2116
+ bool worst_case) {
2117
  const auto & model = wctx.model;
2118
  const auto & hparams = model.hparams;
2119
 
 
2121
 
2122
  WHISPER_ASSERT(!!kv_self.ctx);
2123
 
 
 
2124
  const int n_ctx = kv_self.size;
2125
  const int n_state = hparams.n_text_state;
2126
  const int n_head = hparams.n_text_head;
 
2129
  const int n_tokens = batch.n_tokens;
2130
  const int n_audio_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
2131
 
2132
+ const int32_t n_kv = worst_case ? n_ctx : kv_self.n;
2133
+ const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head;
2134
 
2135
  //WHISPER_LOG_DEBUG("%s: n_past = %d, n_tokens = %d, n_audio_ctx = %d, n_ctx = %d\n", __func__, n_past, n_tokens, n_audio_ctx, n_ctx);
2136
 
 
2145
  ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);
2146
 
2147
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2148
+ ggml_set_name(embd, "embd");
2149
+ ggml_set_input(embd);
 
 
 
2150
 
2151
  struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2152
+ ggml_set_name(position, "position");
2153
+ ggml_set_input(position);
 
 
 
 
 
 
2154
 
2155
  const float KQscale = pow(float(n_state)/n_head, -0.25);
2156
 
2157
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
2158
+ ggml_set_name(KQ_mask, "KQ_mask");
2159
+ ggml_set_input(KQ_mask);
 
2160
 
2161
  // token encoding + position encoding
2162
  struct ggml_tensor * cur =
 
2483
  {
2484
  auto & alloc = wstate.alloc_decode.alloc;
2485
 
2486
+ ggml_cgraph * gf = whisper_build_graph_decoder(wctx, wstate, batch, false);
2487
 
2488
+ if (!ggml_gallocr_alloc_graph(alloc, gf)) {
2489
+ // should never happen as we pre-allocate the memory
2490
+ return false;
2491
+ }
2492
 
2493
+ // set the inputs
2494
+ {
2495
+ struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
2496
+ ggml_backend_tensor_set(embd, batch.token, 0, n_tokens*ggml_element_size(embd));
2497
+ }
2498
+
2499
+ {
2500
+ struct ggml_tensor * position = ggml_graph_get_tensor(gf, "position");
2501
+ for (int i = 0; i < n_tokens; ++i) {
2502
+ const int32_t val = batch.pos[i];
2503
+ ggml_backend_tensor_set(position, &val, i*sizeof(int32_t), sizeof(int32_t));
2504
+ }
2505
+ }
2506
+
2507
+ {
2508
+ struct ggml_tensor * KQ_mask = ggml_graph_get_tensor(gf, "KQ_mask");
2509
+
2510
+ auto & kv_self = wstate.kv_self;
2511
+ const int32_t n_kv = kv_self.n;
2512
+
2513
+ wstate.inp_mask.resize(n_kv*n_tokens);
2514
+
2515
+ float * data = wstate.inp_mask.data();
2516
+ memset(data, 0, ggml_nbytes(KQ_mask));
2517
+
2518
+ for (int h = 0; h < 1; ++h) {
2519
+ for (int j = 0; j < n_tokens; ++j) {
2520
+ const whisper_pos pos = batch.pos[j];
2521
+ const whisper_seq_id seq_id = batch.seq_id[j][0];
2522
+
2523
+ for (int i = 0; i < n_kv; ++i) {
2524
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
2525
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
2526
+ }
2527
+ }
2528
+ }
2529
+ }
2530
+
2531
+ ggml_backend_tensor_set(KQ_mask, wstate.inp_mask.data(), 0, ggml_nelements(KQ_mask)*sizeof(float));
2532
+ }
2533
 
2534
  logits = gf->nodes[gf->n_nodes - 1];
2535
 
 
2979
  whisper_state * state = new whisper_state;
2980
 
2981
  state->backend = whisper_backend_init(ctx->params);
2982
+ if (!state->backend) {
2983
+ WHISPER_LOG_ERROR("%s: whisper_backend_init() failed\n", __func__);
2984
+ whisper_free_state(state);
2985
+ return nullptr;
2986
+ }
2987
 
2988
  // at this point, we don't know yet how many decoders will be used, so we overallocate 3x ctx
2989
  // in theory, there can be a case where this is not enough, but in practice it should always be enough
 
2991
 
2992
  if (!kv_cache_init(ctx->model.hparams, state->kv_self, ctx->backend, ctx->itype, factor*ctx->model.hparams.n_text_ctx)) {
2993
  WHISPER_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__);
2994
+ whisper_free_state(state);
2995
  return nullptr;
2996
  }
2997
 
 
3002
 
3003
  if (!kv_cache_init(ctx->model.hparams, state->kv_cross, ctx->backend, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
3004
  WHISPER_LOG_ERROR("%s: kv_cache_init() failed for cross-attention cache\n", __func__);
3005
+ whisper_free_state(state);
3006
  return nullptr;
3007
  }
3008
 
 
3021
  if (!state->ctx_coreml) {
3022
  WHISPER_LOG_ERROR("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
3023
  #ifndef WHISPER_COREML_ALLOW_FALLBACK
3024
+ whisper_free_state(state);
3025
  return nullptr;
3026
  #endif
3027
  } else {
 
3045
 
3046
  // conv allocator
3047
  {
3048
+ bool ok = whisper_allocr_graph_init(state->alloc_conv, ctx->backend,
3049
  [&]() {
3050
  return whisper_build_graph_conv(*ctx, *state, 0);
3051
  });
3052
 
3053
+ if (!ok) {
3054
+ WHISPER_LOG_ERROR("%s: failed to init conv allocator\n", __func__);
3055
+ whisper_free_state(state);
3056
+ return nullptr;
3057
+ }
3058
+
3059
  WHISPER_LOG_INFO("%s: compute buffer (conv) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_conv) / 1e6);
3060
  }
3061
 
3062
  // encoder allocator
3063
  if (!whisper_encode_external(*state)) {
3064
+ bool ok = whisper_allocr_graph_init(state->alloc_encode, ctx->backend,
3065
  [&]() {
3066
  return whisper_build_graph_encoder(*ctx, *state);
3067
  });
3068
 
3069
+ if (!ok) {
3070
+ WHISPER_LOG_ERROR("%s: failed to init encoder allocator\n", __func__);
3071
+ whisper_free_state(state);
3072
+ return nullptr;
3073
+ }
3074
+
3075
  WHISPER_LOG_INFO("%s: compute buffer (encode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_encode) / 1e6);
3076
  }
3077
 
3078
  // cross allocator
3079
  {
3080
+ bool ok = whisper_allocr_graph_init(state->alloc_cross, ctx->backend,
3081
  [&]() {
3082
  return whisper_build_graph_cross(*ctx, *state);
3083
  });
3084
 
3085
+ if (!ok) {
3086
+ WHISPER_LOG_ERROR("%s: failed to init cross allocator\n", __func__);
3087
+ whisper_free_state(state);
3088
+ return nullptr;
3089
+ }
3090
+
3091
  WHISPER_LOG_INFO("%s: compute buffer (cross) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_cross) / 1e6);
3092
  }
3093
 
3094
  // decoder allocator
3095
  {
3096
+ bool ok = whisper_allocr_graph_init(state->alloc_decode, ctx->backend,
3097
  [&]() {
3098
  const auto & hparams = ctx->model.hparams;
3099
 
 
3103
 
3104
  whisper_batch_prep_legacy(state->batch, nullptr, n_tokens, n_past, 0);
3105
 
3106
+ return whisper_build_graph_decoder(*ctx, *state, state->batch, true);
3107
  });
3108
 
3109
+ if (!ok) {
3110
+ WHISPER_LOG_ERROR("%s: failed to init decoder allocator\n", __func__);
3111
+ whisper_free_state(state);
3112
+ return nullptr;
3113
+ }
3114
+
3115
  WHISPER_LOG_INFO("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_decode) / 1e6);
3116
  }
3117
 
 
 
 
 
 
3118
  return state;
3119
  }
3120
 
 
3337
  return whisper_init_with_params_no_state(loader, whisper_context_default_params());
3338
  }
3339
 
3340
+ void whisper_free_state(struct whisper_state * state) {
 
3341
  if (state) {
3342
  kv_cache_free(state->kv_self);
3343
  kv_cache_free(state->kv_cross);
 
3358
 
3359
  whisper_batch_free(state->batch);
3360
 
3361
+ ggml_gallocr_free(state->alloc_conv.alloc);
3362
+ ggml_gallocr_free(state->alloc_encode.alloc);
3363
+ ggml_gallocr_free(state->alloc_cross.alloc);
3364
+ ggml_gallocr_free(state->alloc_decode.alloc);
3365
 
3366
  ggml_backend_free(state->backend);
3367
 
 
3371
 
3372
  void whisper_free(struct whisper_context * ctx) {
3373
  if (ctx) {
3374
+ ggml_free(ctx->model.ctx);
 
 
3375
 
3376
+ ggml_backend_buffer_free(ctx->model.buffer);
 
 
 
 
3377
 
3378
  whisper_free_state(ctx->state);
3379