Commit 143cb70 · 1 Parent(s): 5c44879
David Huang committed

CUDA/HIP: Share the same unified memory allocation logic. (llama/12934)

Replace the compile-time option `GGML_HIP_UMA` with the environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY`. This unifies usage on NVIDIA and AMD GPUs, and allows a single binary to be shared between integrated and dedicated GPUs.
- ggml/CMakeLists.txt +0 -1
- ggml/src/ggml-cuda/ggml-cuda.cu +16 -15
- ggml/src/ggml-cuda/vendors/hip.h +2 -0
- ggml/src/ggml-hip/CMakeLists.txt +0 -4
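
To make the runtime switch described in the commit message concrete, here is a minimal standalone sketch of an env-var-gated allocator. The helper `demo_device_malloc` and the demo program are hypothetical, for illustration only; the real logic is `ggml_cuda_device_malloc` in the diff below.

// Sketch, not ggml code: one binary chooses managed vs. plain device
// memory at run time, keyed off the same environment variable the patch uses.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

static cudaError_t demo_device_malloc(void ** ptr, size_t size) {
    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
        return cudaMallocManaged(ptr, size);  // memory accessible from CPU and GPU
    }
    return cudaMalloc(ptr, size);             // ordinary device memory
}

int main() {
    void * buf = nullptr;
    cudaError_t err = demo_device_malloc(&buf, 1 << 20);  // 1 MiB
    printf("alloc: %s\n", cudaGetErrorString(err));
    if (err == cudaSuccess) {
        cudaFree(buf);
    }
    return 0;
}

Run as `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 ./demo` to take the managed path; with the variable unset, the same binary allocates regular device memory.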
ggml/CMakeLists.txt
CHANGED

@@ -170,7 +170,6 @@ option(GGML_HIP "ggml: use HIP"
 option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
 option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
-option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED

@@ -96,31 +96,32 @@ int ggml_cuda_get_device() {
 
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
     ggml_cuda_set_device(device);
-#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
-    auto res = hipMallocManaged(ptr, size);
-    if (res == hipSuccess) {
-        // if error we "need" to know why...
-        CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
-    }
-    return res;
-#else
-
-#if !defined(GGML_USE_HIP)
     cudaError_t err;
     if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
     {
         err = cudaMallocManaged(ptr, size);
+#if defined(GGML_USE_HIP)
+        if (err == hipSuccess) {
+            CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+        }
+
+        // fall back to cudaMalloc if not supported (e.g. on Windows)
+        if (err == hipErrorNotSupported) {
+            static bool warned_unsupported = false;
+            if (!warned_unsupported) {
+                GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
+                warned_unsupported = true;
+            }
+
+            err = cudaMalloc(ptr, size);
+        }
+#endif // defined(GGML_USE_HIP)
     }
     else
     {
         err = cudaMalloc(ptr, size);
     }
     return err;
-#else
-    return cudaMalloc(ptr, size);
-#endif // !defined(GGML_USE_HIP)
-
-#endif
 }
 
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
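The HIP branch above couples the managed allocation with a hipMemAdviseSetCoarseGrain hint. As a rough standalone illustration of the allocate-then-advise shape (CUDA side; the coarse-grain advice is AMD-specific, so cudaMemAdviseSetPreferredLocation stands in here purely as an analogy, not as the patch's exact behavior):

// Illustration only: advise the driver about a managed buffer right after
// allocating it. cudaMemAdviseSetPreferredLocation is a stand-in for the
// AMD-only hipMemAdviseSetCoarseGrain hint used in the patch.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    void * buf  = nullptr;
    size_t size = 1 << 20;
    int device  = 0;
    if (cudaMallocManaged(&buf, size) == cudaSuccess) {
        // hint that the buffer's pages should prefer to reside on `device`
        cudaError_t adv = cudaMemAdvise(buf, size, cudaMemAdviseSetPreferredLocation, device);
        printf("advise: %s\n", cudaGetErrorString(adv));
        cudaFree(buf);
    }
    return 0;
}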
ggml/src/ggml-cuda/vendors/hip.h
CHANGED

@@ -71,6 +71,8 @@
 #define cudaLaunchHostFunc hipLaunchHostFunc
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMallocManaged hipMallocManaged
+#define cudaMemAdvise hipMemAdvise
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
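The two new defines are what let the shared allocation path in ggml-cuda.cu compile for AMD without per-vendor call sites. A reduced sketch of the mechanism as a single translation unit (the `alloc_shared` helper is hypothetical, shown only to demonstrate the expansion):

// Reduced sketch of the vendor-header trick: under a HIP build the CUDA
// spellings expand to their hip* counterparts, so a single call site in
// shared source targets both runtimes.
#include <cstddef>
#if defined(GGML_USE_HIP)
#include <hip/hip_runtime.h>
#define cudaMallocManaged hipMallocManaged
#define cudaMemAdvise     hipMemAdvise
#else
#include <cuda_runtime.h>
#endif

// Compiles to cudaMallocManaged on NVIDIA builds, hipMallocManaged on AMD.
static inline int alloc_shared(void ** ptr, size_t size) {
    return (int) cudaMallocManaged(ptr, size);
}

Because the mapping lives in one vendor header, the allocation logic in ggml-cuda.cu stays free of per-call `#ifdef`s, which is what makes sharing the unified-memory path possible in the first place.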
ggml/src/ggml-hip/CMakeLists.txt
CHANGED

@@ -89,10 +89,6 @@ endif()
 
 add_compile_definitions(GGML_USE_HIP)
 
-if (GGML_HIP_UMA)
-    add_compile_definitions(GGML_HIP_UMA)
-endif()
-
 if (GGML_CUDA_FORCE_MMQ)
     add_compile_definitions(GGML_CUDA_FORCE_MMQ)
 endif()