slaren committed
ggml-alloc : allocate all leafs as if they were inputs (ggml/731)
* ggml-alloc : allocate all leafs as if they were inputs
* ensure static leafs are allocated
* gpt-2-backend : remove unnecessary ggml_new_tensor
* update other gpt-2 examples to remove ggml_new_tensor calls in the graph
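
The effect for applications: leafs (tensors with op == GGML_OP_NONE, such as weights created in a no_alloc context) are now pre-allocated by ggml-alloc just like tensors flagged with GGML_TENSOR_FLAG_INPUT, so they are not overwritten by tensor reuse and no longer need to be created with ggml_new_tensor inside the graph-building code. A minimal caller-side sketch of that pattern, assuming the ggml-backend API of this era (not taken from this commit; the graph is illustrative):

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead()*8 + ggml_graph_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true, // data is allocated by ggml-alloc, not by the context
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_input(x); // explicit input: allocated first so it is never overwritten

        // w is a plain leaf (op == GGML_OP_NONE, no input flag); with this change
        // it is allocated as if it were an input as well
        struct ggml_tensor * w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, ggml_mul(ctx, x, w));

        ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
        ggml_gallocr_reserve(galloc, gf);     // measure and create the backend buffer
        ggml_gallocr_alloc_graph(galloc, gf); // bind x, w and the result to offsets

        float xd[4] = {1, 2, 3, 4}, wd[4] = {5, 6, 7, 8};
        ggml_backend_tensor_set(x, xd, 0, sizeof(xd)); // upload after allocation
        ggml_backend_tensor_set(w, wd, 0, sizeof(wd));

        ggml_backend_t cpu = ggml_backend_cpu_init();
        ggml_backend_graph_compute(cpu, gf);

        ggml_backend_free(cpu);
        ggml_gallocr_free(galloc);
        ggml_free(ctx);
        return 0;
    }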
ggml-alloc.c  +66 -36
@@ -377,6 +377,9 @@ struct ggml_gallocr {
 
     struct node_alloc * node_allocs; // [n_nodes]
     int n_nodes;
+
+    struct tensor_alloc * leaf_allocs; // [n_leafs]
+    int n_leafs;
 };
 
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
@@ -427,6 +430,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
     free(galloc->buffers);
     free(galloc->buf_tallocs);
     free(galloc->node_allocs);
+    free(galloc->leaf_allocs);
     free(galloc);
 }
 
@@ -544,22 +548,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
     memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
 
-    // allocate all graph inputs first to avoid overwriting them
-    for (int i = 0; i < graph->n_nodes; i++) {
-        if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
-            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (graph->nodes[i]->src[j] == NULL) {
-                break;
-            }
-            if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
-                ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
-            }
-        }
-    }
-
     // count number of children and views
+    // allocate all graph inputs and leafs first to avoid overwriting them
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
@@ -568,14 +558,37 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
 
+        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
+            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
+        }
+
         for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * parent = node->src[j];
-            if (parent == NULL) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
                 break;
             }
-            ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
+
+            ggml_gallocr_hash_get(galloc, src)->n_children += 1;
+
+            // allocate explicit inputs and leafs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
+            }
         }
     }
+
+    // allocate the remaining leafs that are unused on the graph
+    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+
+        if (hn->n_children == 0) {
+            assert(!hn->allocated);
+            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
+            ggml_gallocr_allocate_node(galloc, leaf, 0);
+        }
+    }
 
     // allocate tensors
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -696,6 +709,18 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             }
        }
    }
+    if (galloc->n_leafs < graph->n_leafs) {
+        free(galloc->leaf_allocs);
+        galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+        GGML_ASSERT(galloc->leaf_allocs != NULL);
+    }
+    galloc->n_leafs = graph->n_leafs;
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+        galloc->leaf_allocs[i].offset = hn->offset;
+        galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+    }
 
     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
@@ -722,8 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
-    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
+    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
 
     if (node->view_src != NULL) {
         if (node->buffer == NULL) {
@@ -732,29 +757,20 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
+            ggml_backend_view_init(galloc->buffers[buffer_id], node);
         }
     } else {
         if (node->data == NULL) {
             assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
-            void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
             void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
         } else {
             if (node->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
            }
-
-#ifndef NDEBUG
-            size_t offset =
-                (char *)node->data -
-                (char *)ggml_backend_buffer_get_base(node->buffer);
-            size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
-            assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
-            assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
-#endif
         }
     }
 }
@@ -773,6 +789,13 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         return true;
     }
 
+    if (galloc->n_leafs != graph->n_leafs) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+#endif
+        return true;
+    }
+
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
@@ -827,6 +850,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     }
 
     // allocate the graph tensors from the previous assignments
+    // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
@@ -835,9 +859,15 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
             if (src == NULL) {
                 break;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+    }
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
    }
 
     return true;
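
For context, a hedged sketch of the reserve/alloc split that the new leaf_allocs bookkeeping supports: ggml_gallocr_reserve records an offset and size_max for every node and, with this commit, every leaf; each later ggml_gallocr_alloc_graph call re-binds a freshly built graph to those recorded assignments, and the new n_leafs check in ggml_gallocr_needs_realloc makes a leaf-count mismatch trigger reallocation instead of a silent mis-binding. Here build_graph is a hypothetical application helper, not ggml API:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    // hypothetical helper: build the per-iteration graph in a fresh no_alloc context
    static struct ggml_cgraph * build_graph(struct ggml_context * ctx, int n) {
        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n);
        ggml_set_input(x);
        struct ggml_tensor * w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n); // leaf
        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, ggml_add(ctx, x, w));
        return gf;
    }

    static void run(ggml_gallocr_t galloc, int n_iter) {
        for (int i = 0; i < n_iter; i++) {
            struct ggml_init_params params = {
                /*.mem_size   =*/ ggml_tensor_overhead()*8 + ggml_graph_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            struct ggml_context * ctx = ggml_init(params);
            struct ggml_cgraph  * gf  = build_graph(ctx, 4);

            // binds every node and (new in this commit) every leaf to the offsets
            // recorded by the last reserve; if the graph changed, gallocr
            // re-reserves (single-buffer case) or fails
            ggml_gallocr_alloc_graph(galloc, gf);

            // ... upload inputs with ggml_backend_tensor_set and compute ...

            ggml_free(ctx); // tensor data lives in the gallocr buffer, not the context
        }
    }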