Sigbjørn Skjæret committed on
Commit
59c694d
·
1 Parent(s): f585fe7

cuda : fix GGML_CUDA_GRAPHS=OFF (llama/15300)

Browse files

* fix USE_CUDA_GRAPH=OFF

ggml-ci

* check capture status

* completely disable capturing check instead

Files changed (1) hide show
  1. ggml/src/ggml-cuda/mean.cu +7 -1
ggml/src/ggml-cuda/mean.cu CHANGED
@@ -25,9 +25,12 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
25
 
26
  // Special case for reducing vectors
27
  #ifdef GGML_CUDA_USE_CUB
 
28
  cudaStreamCaptureStatus iscapturing;
29
  CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
 
30
  if ((nrows == 1) &&
 
31
  // CUDA_GRAPHS_DISABLED
32
  ((ncols > 65536) &&
33
  ((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
@@ -38,6 +41,9 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
38
  !((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
39
  ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
40
  ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
 
 
 
41
  // Single row - use device-wide reduction
42
  size_t tmp_size = 0;
43
  ggml_cuda_pool & pool = ctx.pool();
@@ -51,7 +57,7 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
51
  divide_by_count<float><<<1, 1, 0, stream>>>(dst_d, ncols);
52
  return;
53
  }
54
- #endif
55
 
56
  const dim3 block_nums(nrows, 1, 1);
57
 
 
25
 
26
  // Special case for reducing vectors
27
  #ifdef GGML_CUDA_USE_CUB
28
+ #ifdef USE_CUDA_GRAPH
29
  cudaStreamCaptureStatus iscapturing;
30
  CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
31
+ #endif // USE_CUDA_GRAPH
32
  if ((nrows == 1) &&
33
+ #ifdef USE_CUDA_GRAPH
34
  // CUDA_GRAPHS_DISABLED
35
  ((ncols > 65536) &&
36
  ((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
 
41
  !((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
42
  ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
43
  ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
44
+ #else
45
+ (ncols > 65536)) {
46
+ #endif // USE_CUDA_GRAPH
47
  // Single row - use device-wide reduction
48
  size_t tmp_size = 0;
49
  ggml_cuda_pool & pool = ctx.pool();
 
57
  divide_by_count<float><<<1, 1, 0, stream>>>(dst_d, ncols);
58
  return;
59
  }
60
+ #endif // GGML_CUDA_USE_CUB
61
 
62
  const dim3 block_nums(nrows, 1, 1);
63