David Huang committed on
Commit
143cb70
·
1 Parent(s): 5c44879

CUDA/HIP: Share the same unified memory allocation logic. (llama/12934)

Browse files

Replace compile-time `GGML_HIP_UMA` with environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY`. This unifies the usage on NVIDIA and AMD GPUs, and allows a single binary to be shared between integrated and dedicated GPUs.

ggml/CMakeLists.txt CHANGED
@@ -170,7 +170,6 @@ option(GGML_HIP "ggml: use HIP"
170
  option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
171
  option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
172
  option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
173
- option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
174
  option(GGML_VULKAN "ggml: use Vulkan" OFF)
175
  option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
176
  option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
 
170
  option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
171
  option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
172
  option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
 
173
  option(GGML_VULKAN "ggml: use Vulkan" OFF)
174
  option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
175
  option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -96,31 +96,32 @@ int ggml_cuda_get_device() {
96
 
97
  static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
98
  ggml_cuda_set_device(device);
99
- #if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
100
- auto res = hipMallocManaged(ptr, size);
101
- if (res == hipSuccess) {
102
- // if error we "need" to know why...
103
- CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
104
- }
105
- return res;
106
- #else
107
-
108
- #if !defined(GGML_USE_HIP)
109
  cudaError_t err;
110
  if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
111
  {
112
  err = cudaMallocManaged(ptr, size);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  }
114
  else
115
  {
116
  err = cudaMalloc(ptr, size);
117
  }
118
  return err;
119
- #else
120
- return cudaMalloc(ptr, size);
121
- #endif // !defined(GGML_USE_HIP)
122
-
123
- #endif
124
  }
125
 
126
  #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 
96
 
97
  static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
98
  ggml_cuda_set_device(device);
 
 
 
 
 
 
 
 
 
 
99
  cudaError_t err;
100
  if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
101
  {
102
  err = cudaMallocManaged(ptr, size);
103
+ #if defined(GGML_USE_HIP)
104
+ if (err == hipSuccess) {
105
+ CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
106
+ }
107
+
108
+ // fall back to cudaMalloc if not supported (e.g. on Windows)
109
+ if (err == hipErrorNotSupported) {
110
+ static bool warned_unsupported = false;
111
+ if (!warned_unsupported) {
112
+ GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
113
+ warned_unsupported = true;
114
+ }
115
+
116
+ err = cudaMalloc(ptr, size);
117
+ }
118
+ #endif // defined(GGML_USE_HIP)
119
  }
120
  else
121
  {
122
  err = cudaMalloc(ptr, size);
123
  }
124
  return err;
 
 
 
 
 
125
  }
126
 
127
  #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
ggml/src/ggml-cuda/vendors/hip.h CHANGED
@@ -71,6 +71,8 @@
71
  #define cudaLaunchHostFunc hipLaunchHostFunc
72
  #define cudaMalloc hipMalloc
73
  #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
 
 
74
  #define cudaMemcpy hipMemcpy
75
  #define cudaMemcpyAsync hipMemcpyAsync
76
  #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
 
71
  #define cudaLaunchHostFunc hipLaunchHostFunc
72
  #define cudaMalloc hipMalloc
73
  #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
74
+ #define cudaMallocManaged hipMallocManaged
75
+ #define cudaMemAdvise hipMemAdvise
76
  #define cudaMemcpy hipMemcpy
77
  #define cudaMemcpyAsync hipMemcpyAsync
78
  #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
ggml/src/ggml-hip/CMakeLists.txt CHANGED
@@ -89,10 +89,6 @@ endif()
89
 
90
  add_compile_definitions(GGML_USE_HIP)
91
 
92
- if (GGML_HIP_UMA)
93
- add_compile_definitions(GGML_HIP_UMA)
94
- endif()
95
-
96
  if (GGML_CUDA_FORCE_MMQ)
97
  add_compile_definitions(GGML_CUDA_FORCE_MMQ)
98
  endif()
 
89
 
90
  add_compile_definitions(GGML_USE_HIP)
91
 
 
 
 
 
92
  if (GGML_CUDA_FORCE_MMQ)
93
  add_compile_definitions(GGML_CUDA_FORCE_MMQ)
94
  endif()