Commit 061ca37 (parent 480ad4d), committed by slaren

cuda : fix defrag with quantized KV (llama/9319)

Files changed:
- ggml/src/ggml-backend.c   +9 -1
- ggml/src/ggml-cuda.cu     +12 -2
- ggml/src/ggml-cuda/cpy.cu +19 -16
ggml/src/ggml-backend.c CHANGED

@@ -1169,6 +1169,11 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         }
     }

+    if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
+        // since the tensor is pre-allocated, it cannot be moved to another backend
+        GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
+    }
+
     // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
         cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
@@ -1648,7 +1653,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->prev_leaf_backend_ids = tmp;
     }

-    int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    int graph_size = MAX(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
     if (sched->graph.size < graph_size) {
         sched->graph.size = graph_size;
         sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
@@ -1700,6 +1705,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         for (int c = 0; c < sched->n_copies; c++) {
             struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
             sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+            assert(graph_copy->size > graph_copy->n_leafs);
             graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
         }
     }
@@ -1713,6 +1719,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             for (int c = 0; c < sched->n_copies; c++) {
                 struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                 sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                assert(graph_copy->size > graph_copy->n_leafs);
                 graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
             }
         }
@@ -1723,6 +1730,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+        assert(graph_copy->size > graph_copy->n_leafs);
         graph_copy->leafs[graph_copy->n_leafs++] = leaf;
     }
 }
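For context, the graph_size change and the added asserts follow one pattern: reserve the node/leaf arrays for the worst case up front, then assert remaining capacity before every append. A minimal standalone sketch of that pattern, using a hypothetical leaf_list container that is not part of ggml:

#include <assert.h>
#include <stdlib.h>

// hypothetical container mirroring graph_copy->leafs / n_leafs / size
struct leaf_list {
    void ** items;
    int     n;      // number of entries currently stored
    int     size;   // allocated capacity
};

// grow the backing array once, up front, to the worst-case size
static void leaf_list_reserve(struct leaf_list * l, int capacity) {
    if (l->size < capacity) {
        l->size  = capacity;
        l->items = (void **) realloc(l->items, capacity * sizeof(void *));
    }
}

// append with the same guard the patch adds: capacity must exceed the count
static void leaf_list_append(struct leaf_list * l, void * item) {
    assert(l->size > l->n);
    l->items[l->n++] = item;
}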
ggml/src/ggml-cuda.cu CHANGED

@@ -2580,8 +2580,15 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
                 // store a pointer to each copy op CUDA kernel to identify it later
                 void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
-                if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
-                    ggml_cuda_cpy_fn_ptrs.push_back(ptr);
+                if (!ptr) {
+                    use_cuda_graph = false;
+#ifndef NDEBUG
+                    GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
+#endif
+                } else {
+                    if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
+                        ggml_cuda_cpy_fn_ptrs.push_back(ptr);
+                    }
                 }
             }

@@ -2851,6 +2858,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                 if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
                     return true;
                 }
+                if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
+                    return true;
+                }
                 return false;
             } break;
         case GGML_OP_DUP:
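A rough sketch of the control flow introduced in the ggml_backend_cuda_graph_compute hunk above, with a hypothetical register_copy_kernel() standing in for the inline code (the real code uses ggml_cuda_cpy_fn and the ggml_cuda_cpy_fn_ptrs vector). A null kernel pointer now means the copy is a plain memcpy rather than a dedicated kernel, so graph capture is disabled instead of recording a kernel that does not exist:

#include <algorithm>
#include <vector>

// stands in for the function-scope ggml_cuda_cpy_fn_ptrs vector
static std::vector<void *> copy_kernel_ptrs;

// returns false when CUDA graph capture should be disabled
static bool register_copy_kernel(void * ptr) {
    if (!ptr) {
        // no dedicated copy kernel (the same-type contiguous case is a raw
        // cudaMemcpyAsync), so there is nothing to patch in a captured graph
        return false;
    }
    if (std::find(copy_kernel_ptrs.begin(), copy_kernel_ptrs.end(), ptr) == copy_kernel_ptrs.end()) {
        copy_kernel_ptrs.push_back(ptr); // record each distinct kernel once
    }
    return true;
}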
ggml/src/ggml-cuda/cpy.cu CHANGED

@@ -428,7 +428,10 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     char * src0_ddc = (char *) src0->data;
     char * src1_ddc = (char *) src1->data;

-    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+        GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
+        CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
@@ -449,9 +452,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else {
-        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ABORT("fatal error");
     }
 }

@@ -461,29 +463,30 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 }

 void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
-    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-            return (void*) cpy_f32_f16<cpy_1_f32_f32>;
+    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+        return nullptr;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_f32_f16<cpy_1_f32_f32>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-            return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-            return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
+        return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-            return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
+        return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-            return (void*) cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>;
+        return (void*) cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
-            return (void*) cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>;
+        return (void*) cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
-            return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
+        return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
-            return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
+        return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-            return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
-            return (void*) cpy_f32_f16<cpy_1_f16_f32>;
+        return (void*) cpy_f32_f16<cpy_1_f16_f32>;
     } else {
-        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ABORT("fatal error");
     }
 }
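The core of the fix in cpy.cu is the new first branch: when source and destination have the same type (including quantized KV types) and both are contiguous, no conversion kernel exists or is needed, and the data is moved with a raw device-to-device cudaMemcpyAsync on the stream. A standalone illustration of that copy pattern follows; the buffer size and error handling are placeholders, not taken from ggml:

#include <cuda_runtime.h>
#include <stdio.h>

int main() {
    const size_t nbytes = 1 << 20;   // 1 MiB of arbitrary (e.g. quantized) data

    void * src = NULL;
    void * dst = NULL;
    cudaStream_t stream;

    cudaMalloc(&src, nbytes);
    cudaMalloc(&dst, nbytes);
    cudaStreamCreate(&stream);

    // same call the new branch issues via CUDA_CHECK(cudaMemcpyAsync(...))
    cudaError_t err = cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, stream);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpyAsync failed: %s\n", cudaGetErrorString(err));
    }

    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    cudaFree(src);
    cudaFree(dst);
    return 0;
}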