jeffbolznv committed
Commit 1b7672d · 1 Parent(s): 5bdb244

vulkan: use smaller combined allocations to avoid fragmentation (llama/11551)

ggml/src/ggml-alloc.c CHANGED
@@ -989,19 +989,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
             this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
         }
 
-        if (this_size > max_size) {
-            GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
-                    __func__, t->name,
-                    ggml_backend_buft_name(buft),
-                    this_size, max_size);
-            for (size_t i = 0; i < n_buffers; i++) {
-                ggml_backend_buffer_free(buffers[i]);
-            }
-            free(buffers);
-            return NULL;
-        }
-
-        if ((cur_buf_size + this_size) > max_size) {
+        if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
             // allocate tensors in the current buffer
             if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                 return NULL;
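Note on this hunk: a tensor larger than the reported max buffer size no longer aborts allocation up front. The group of tensors collected so far is only flushed when it is non-empty, so an oversized tensor ends up in a dedicated buffer and the backend decides whether that single large allocation succeeds. That matters for the Vulkan change below, where the reported max size (the suballocation block size) can be smaller than what the device can actually allocate in one go. Below is a minimal sketch of the grouping loop after the patch; group_tensors, tensor_sizes and flush_buffer are hypothetical stand-ins for the locals and for alloc_tensor_range in ggml_backend_alloc_ctx_tensors_from_buft.

    // Hypothetical, simplified version of the tensor-grouping loop after this patch.
    #include <stddef.h>
    #include <stdio.h>

    static void flush_buffer(size_t bytes) {                // placeholder for alloc_tensor_range(...)
        printf("allocate one backend buffer of %zu bytes\n", bytes);
    }

    static void group_tensors(const size_t * tensor_sizes, size_t n_tensors, size_t max_size) {
        size_t cur_buf_size = 0;
        for (size_t i = 0; i < n_tensors; i++) {
            size_t this_size = tensor_sizes[i];             // padded alloc size of tensor i
            // Close out the current group only if it is non-empty: an oversized
            // tensor no longer errors out, it simply becomes a buffer of its own.
            if (cur_buf_size > 0 && cur_buf_size + this_size > max_size) {
                flush_buffer(cur_buf_size);
                cur_buf_size = 0;
            }
            cur_buf_size += this_size;
        }
        if (cur_buf_size > 0) {
            flush_buffer(cur_buf_size);                     // final group
        }
    }

For example, with max_size = 4 and tensor sizes {3, 3, 6, 2}, the sketch produces buffers of 3, 3, 6 and 2 bytes; before this patch the 6-byte tensor would have hit the removed "too large to fit" error path.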
ggml/src/ggml-vulkan/ggml-vulkan.cpp CHANGED
@@ -156,6 +156,7 @@ struct vk_device_struct {
     vk::PhysicalDeviceProperties properties;
     std::string name;
     uint64_t max_memory_allocation_size;
+    uint64_t suballocation_block_size;
     bool fp16;
     bool pipeline_robustness;
     vk::Device device;
@@ -2269,6 +2270,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
 
     device->physical_device.getProperties2(&props2);
     device->properties = props2.properties;
+    device->vendor_id = device->properties.vendorID;
 
     const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");
 
@@ -2280,7 +2282,20 @@ static vk_device ggml_vk_get_device(size_t idx) {
         device->max_memory_allocation_size = props3.maxMemoryAllocationSize;
     }
 
-    device->vendor_id = device->properties.vendorID;
+    const char* GGML_VK_SUBALLOCATION_BLOCK_SIZE = getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE");
+
+    if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) {
+        device->suballocation_block_size = std::stoul(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
+#if defined(_WIN32)
+    } else if (device->vendor_id == VK_VENDOR_ID_NVIDIA) {
+        // Limit batching of allocations to 1GB by default to avoid fragmentation issues
+        device->suballocation_block_size = 1024*1024*1024;
+#endif
+    } else {
+        device->suballocation_block_size = device->max_memory_allocation_size;
+    }
+    device->suballocation_block_size = std::min(device->suballocation_block_size, device->max_memory_allocation_size);
+
     device->subgroup_size = subgroup_props.subgroupSize;
     device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
     if (sm_builtins) {
@@ -7561,7 +7576,7 @@ static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type
 
 static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    return ctx->device->max_memory_allocation_size;
+    return ctx->device->suballocation_block_size;
 }
 
 static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
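Taken together: ggml_backend_vk_buffer_type_get_max_size now reports the suballocation block size rather than maxMemoryAllocationSize, so the grouping loop in ggml-alloc.c above packs tensors into several smaller device buffers (1 GB each by default on NVIDIA under Windows) instead of filling buffers up to the device's maximum allocation size. The sketch below restates the selection logic as a standalone helper for illustration only; pick_suballocation_block_size is a hypothetical name, and the is_windows flag models the compile-time #if defined(_WIN32) branch at runtime. The real code sets device->suballocation_block_size inline in ggml_vk_get_device.

    // Hypothetical helper mirroring the block-size selection added in this commit.
    #include <algorithm>
    #include <cstdint>
    #include <cstdlib>
    #include <string>

    static uint64_t pick_suballocation_block_size(uint64_t max_alloc, uint32_t vendor_id, bool is_windows) {
        uint64_t block = max_alloc;                          // default: batch up to the device limit
        if (const char * env = std::getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE")) {
            block = std::stoull(env);                        // explicit user override wins
        } else if (is_windows && vendor_id == 0x10DE /* NVIDIA */) {
            block = 1024ull * 1024 * 1024;                   // 1 GB default to limit fragmentation
        }
        return std::min(block, max_alloc);                   // never exceed the reported max allocation size
    }

The GGML_VK_SUBALLOCATION_BLOCK_SIZE environment variable takes precedence over the platform default, and the final clamp keeps the block size from exceeding the device's reported maximum allocation size.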