Commit 59c694d · Parent(s): f585fe7
Sigbjørn Skjæret committed

cuda : fix GGML_CUDA_GRAPHS=OFF (llama/15300)

* fix USE_CUDA_GRAPH=OFF

ggml-ci

* check capture status

* completely disable capturing check instead
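The second bullet refers to cudaStreamIsCapturing(), the runtime query that the patch ends up compiling only when USE_CUDA_GRAPH is defined. As a standalone illustration of that query (illustration only, not ggml code; the stream setup and prints here are purely for demonstration), the following program shows the capture status flipping while a graph is being recorded on a stream:

// Standalone demo of cudaStreamIsCapturing(); illustration only, not ggml code.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(expr)                                                   \
    do {                                                                   \
        cudaError_t err_ = (expr);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",                    \
                    cudaGetErrorString(err_), __FILE__, __LINE__);         \
            exit(1);                                                       \
        }                                                                  \
    } while (0)

int main() {
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    cudaStreamCaptureStatus iscapturing;
    CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
    printf("capturing before begin: %s\n",
           iscapturing == cudaStreamCaptureStatusNone ? "no" : "yes");

    // Start recording a CUDA graph on the stream; work submitted to the
    // stream is now captured instead of executed immediately.
    CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));

    CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
    printf("capturing during capture: %s\n",
           iscapturing == cudaStreamCaptureStatusNone ? "no" : "yes");

    cudaGraph_t graph;
    CUDA_CHECK(cudaStreamEndCapture(stream, &graph));
    CUDA_CHECK(cudaGraphDestroy(graph));
    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}

In the diff below, that query only exists in builds with CUDA graphs enabled; with GGML_CUDA_GRAPHS=OFF the check is removed entirely, as the third bullet says.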
ggml/src/ggml-cuda/mean.cu
CHANGED
@@ -25,9 +25,12 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     // Special case for reducing vectors
 #ifdef GGML_CUDA_USE_CUB
+#ifdef USE_CUDA_GRAPH
     cudaStreamCaptureStatus iscapturing;
     CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
+#endif // USE_CUDA_GRAPH
     if ((nrows == 1) &&
+#ifdef USE_CUDA_GRAPH
         // CUDA_GRAPHS_DISABLED
         ((ncols > 65536) &&
         ((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
@@ -38,6 +41,9 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         !((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
         ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
         ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
+#else
+        (ncols > 65536)) {
+#endif // USE_CUDA_GRAPH
         // Single row - use device-wide reduction
         size_t tmp_size = 0;
         ggml_cuda_pool & pool = ctx.pool();
@@ -51,7 +57,7 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         divide_by_count<float><<<1, 1, 0, stream>>>(dst_d, ncols);
         return;
     }
-#endif
+#endif // GGML_CUDA_USE_CUB
 
     const dim3 block_nums(nrows, 1, 1);
 
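For context on the fast path being gated here, the "Single row - use device-wide reduction" branch sums the row and then runs a one-thread divide_by_count kernel, as the context lines above show. A minimal, self-contained sketch of that pattern using CUB (illustrative only, with hypothetical buffer names and a simplified, untemplated divide_by_count; not the actual ggml implementation):

// Illustrative sketch: mean of one row of ncols floats via a device-wide
// CUB reduction followed by a tiny divide-by-count kernel, mirroring the
// shape of the GGML_CUDA_USE_CUB path above. Not the ggml code.
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

static __global__ void divide_by_count(float * dst, int64_t n) {
    *dst /= (float) n;   // single-thread kernel: turn the sum into a mean
}

int main() {
    const int64_t ncols = 1 << 20;
    std::vector<float> host(ncols, 1.0f);   // mean should come out as 1.0

    float *src_d = nullptr, *dst_d = nullptr;
    cudaMalloc(&src_d, ncols * sizeof(float));
    cudaMalloc(&dst_d, sizeof(float));
    cudaMemcpy(src_d, host.data(), ncols * sizeof(float), cudaMemcpyHostToDevice);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // First call with a null workspace only reports the temp-storage size.
    size_t tmp_size = 0;
    cub::DeviceReduce::Sum(nullptr, tmp_size, src_d, dst_d, ncols, stream);

    void * tmp_d = nullptr;
    cudaMalloc(&tmp_d, tmp_size);

    // Device-wide sum of the row, then divide by the element count.
    cub::DeviceReduce::Sum(tmp_d, tmp_size, src_d, dst_d, ncols, stream);
    divide_by_count<<<1, 1, 0, stream>>>(dst_d, ncols);

    float mean = 0.0f;
    cudaMemcpyAsync(&mean, dst_d, sizeof(float), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    printf("mean = %f\n", mean);

    cudaFree(tmp_d);
    cudaFree(src_d);
    cudaFree(dst_d);
    cudaStreamDestroy(stream);
    return 0;
}

The two-step DeviceReduce::Sum call (size query, then the real reduction) appears to be what the tmp_size / ctx.pool() lines in the diff context are serving, with ggml's pool allocator standing in for the plain cudaMalloc used here.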