slaren committed
Commit 061ca37 · 1 Parent(s): 480ad4d

cuda : fix defrag with quantized KV (llama/9319)

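In short (a summary distilled from the hunks below, not part of the original commit message): KV-cache defragmentation issues GGML_OP_CPY between tensors of the same quantized type, and the CUDA backend previously only provided conversion kernels for specific pairs (F32 -> F16, F32 -> Q8_0, ...), so such copies aborted. The fix adds a same-type fast path; a minimal sketch of the idea, using the names that appear in the cpy.cu diff below:

    // Sketch of the new fast path: same-type, contiguous tensors need no
    // conversion kernel, so the copy is a raw device-to-device byte copy.
    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
        GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
        CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0),
                                   cudaMemcpyDeviceToDevice, main_stream));
    }

For this case ggml_cuda_cpy_fn() now returns nullptr (there is no kernel pointer to track), and the graph-compute path reacts by disabling CUDA graphs for that evaluation, as the warning added below notes.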
ggml/src/ggml-backend.c CHANGED
@@ -1169,6 +1169,11 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         }
     }
 
+    if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
+        // since the tensor is pre-allocated, it cannot be moved to another backend
+        GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
+    }
+
     // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
         cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
@@ -1648,7 +1653,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->prev_leaf_backend_ids = tmp;
     }
 
-    int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    int graph_size = MAX(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
     if (sched->graph.size < graph_size) {
         sched->graph.size = graph_size;
         sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
@@ -1700,6 +1705,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             for (int c = 0; c < sched->n_copies; c++) {
                 struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                 sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                assert(graph_copy->size > graph_copy->n_leafs);
                 graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
             }
         }
@@ -1713,6 +1719,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 for (int c = 0; c < sched->n_copies; c++) {
                     struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                     sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                    assert(graph_copy->size > graph_copy->n_leafs);
                     graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
                 }
             }
@@ -1723,6 +1730,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+        assert(graph_copy->size > graph_copy->n_leafs);
         graph_copy->leafs[graph_copy->n_leafs++] = leaf;
     }
 }
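For context on the resized scheduler graph: the copied graph now has to hold every leaf plus one copy of each split input per scheduler copy, so the old bound could be too small. A small self-contained illustration with made-up numbers (the real GGML_SCHED_MAX_SPLIT_INPUTS value is defined in ggml-backend.c; 10 here is a placeholder):

    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))
    #define GGML_SCHED_MAX_SPLIT_INPUTS 10   /* placeholder value, for illustration only */

    int main(void) {
        /* hypothetical graph dimensions, not taken from a real model */
        int n_nodes  = 1024;
        int n_leafs  = 1536;  /* leafs can outnumber nodes, hence the MAX() in the new formula */
        int n_splits = 4;
        int n_copies = 2;     /* the scheduler may keep more than one copy of each split input */

        int old_size = n_nodes + n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
        int new_size = MAX(n_nodes, n_leafs) + n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*n_copies;

        printf("old graph_size = %d, new graph_size = %d\n", old_size, new_size);
        /* prints: old graph_size = 1104, new graph_size = 1696 -- here the old bound is even
           smaller than the leaf count alone, which is exactly the situation the new
           assert(graph_copy->size > graph_copy->n_leafs) would catch */
        return 0;
    }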
ggml/src/ggml-cuda.cu CHANGED
@@ -2580,8 +2580,15 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
                 // store a pointer to each copy op CUDA kernel to identify it later
                 void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
-                if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
-                    ggml_cuda_cpy_fn_ptrs.push_back(ptr);
+                if (!ptr) {
+                    use_cuda_graph = false;
+#ifndef NDEBUG
+                    GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
+#endif
+                } else {
+                    if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
+                        ggml_cuda_cpy_fn_ptrs.push_back(ptr);
+                    }
                 }
             }
 
@@ -2851,6 +2858,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                 if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
                     return true;
                 }
+                if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
+                    return true;
+                }
                 return false;
             } break;
         case GGML_OP_DUP:
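The supports_op hunk above is what allows the scheduler to place same-type copies (such as a quantized KV defrag move, e.g. Q8_0 -> Q8_0) on the CUDA backend at all. A standalone sketch of the acceptance rule; cpy_supported is a hypothetical helper written for illustration, not a function in the codebase:

    #include "ggml.h"   // for enum ggml_type

    // Hypothetical helper mirroring the new GGML_OP_CPY check in ggml_backend_cuda_supports_op:
    // in addition to the existing pair-specific conversion checks, any same-type copy between
    // contiguous tensors is now accepted.
    static bool cpy_supported(enum ggml_type src0_type, enum ggml_type src1_type,
                              bool src0_is_contiguous, bool src1_is_contiguous) {
        if (src0_type == src1_type && src0_is_contiguous && src1_is_contiguous) {
            return true;
        }
        // ... existing pair checks (F32 -> F16, F32 -> Q8_0, ...) would go here ...
        return false;
    }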
ggml/src/ggml-cuda/cpy.cu CHANGED
@@ -428,7 +428,10 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     char * src0_ddc = (char *) src0->data;
     char * src1_ddc = (char *) src1->data;
 
-    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+        GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
+        CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
@@ -449,9 +452,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else {
-        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
             ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ABORT("fatal error");
     }
 }
 
@@ -461,29 +463,30 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 }
 
 void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
-    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        return (void*) cpy_f32_f16<cpy_1_f32_f32>;
+    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+        return nullptr;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_f32_f16<cpy_1_f32_f32>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-        return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
+        return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-        return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
+        return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-        return (void*) cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>;
+        return (void*) cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
-        return (void*) cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>;
+        return (void*) cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
-        return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
+        return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
-        return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
+        return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
-        return (void*) cpy_f32_f16<cpy_1_f16_f32>;
+        return (void*) cpy_f32_f16<cpy_1_f16_f32>;
     } else {
-        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
             ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ABORT("fatal error");
     }
 }