Commit 1b7672d
Parent(s): 5bdb244
vulkan: use smaller combined allocations to avoid fragmentation (llama/11551)
- ggml/src/ggml-alloc.c: +1 -13
- ggml/src/ggml-vulkan/ggml-vulkan.cpp: +17 -2
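In short: the Vulkan backend now advertises a smaller "suballocation block size" as its maximum buffer size (1 GiB by default on NVIDIA under Windows, the device's maxMemoryAllocationSize otherwise, overridable with the GGML_VK_SUBALLOCATION_BLOCK_SIZE environment variable), so ggml-alloc packs tensors into several smaller combined buffers instead of one very large allocation that is prone to fragmentation. A minimal sketch of how the advertised limit can be inspected through the public ggml-backend API; this is not part of the commit and assumes a build with the Vulkan backend enabled (instance/device initialization details may differ between ggml versions):

    #include <cstdio>

    #include "ggml-backend.h"
    #include "ggml-vulkan.h"

    int main() {
        // Buffer type for Vulkan device 0; after this commit its reported max size
        // is the suballocation block size rather than maxMemoryAllocationSize.
        ggml_backend_buffer_type_t buft = ggml_backend_vk_buffer_type(0);

        printf("%s: max buffer size = %zu bytes\n",
               ggml_backend_buft_name(buft),
               ggml_backend_buft_get_max_size(buft));
        return 0;
    }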
ggml/src/ggml-alloc.c
CHANGED
@@ -989,19 +989,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
             this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
         }
 
-        if (this_size > max_size) {
-            GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
-                    __func__, t->name,
-                    ggml_backend_buft_name(buft),
-                    this_size, max_size);
-            for (size_t i = 0; i < n_buffers; i++) {
-                ggml_backend_buffer_free(buffers[i]);
-            }
-            free(buffers);
-            return NULL;
-        }
-
-        if ((cur_buf_size + this_size) > max_size) {
+        if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
             // allocate tensors in the current buffer
             if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                 return NULL;
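The practical effect of the new condition: a tensor larger than max_size no longer triggers the removed hard error; the loop simply gives it a buffer of its own (the cur_buf_size > 0 check prevents flushing an empty buffer first) and leaves it to the backend's buffer allocation to succeed or fail. A standalone sketch of that grouping rule with hypothetical sizes (the real loop also skips views and pre-allocated tensors):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Greedy grouping as in ggml_backend_alloc_ctx_tensors_from_buft after this change:
    // a new buffer is started only when the current one is non-empty and the next
    // tensor would overflow it, so an oversized tensor ends up in a buffer by itself.
    static std::vector<uint64_t> group_into_buffers(const std::vector<uint64_t> & sizes, uint64_t max_size) {
        std::vector<uint64_t> buffers;
        uint64_t cur = 0;
        for (uint64_t sz : sizes) {
            if (cur > 0 && cur + sz > max_size) {
                buffers.push_back(cur); // close the current buffer
                cur = 0;
            }
            cur += sz;
        }
        if (cur > 0) {
            buffers.push_back(cur);
        }
        return buffers;
    }

    int main() {
        const uint64_t MiB = 1024ull * 1024;
        // hypothetical tensor sizes with a 1 GiB limit:
        // 600 + 600 would overflow, and 1536 MiB exceeds the limit on its own
        for (uint64_t b : group_into_buffers({600*MiB, 600*MiB, 1536*MiB, 100*MiB}, 1024*MiB)) {
            printf("buffer of %llu MiB\n", (unsigned long long) (b / MiB));
        }
        // prints buffers of 600, 600, 1536 and 100 MiB; previously the 1536 MiB
        // tensor would have aborted allocation with the removed error message
        return 0;
    }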
ggml/src/ggml-vulkan/ggml-vulkan.cpp
CHANGED
@@ -156,6 +156,7 @@ struct vk_device_struct {
     vk::PhysicalDeviceProperties properties;
     std::string name;
     uint64_t max_memory_allocation_size;
+    uint64_t suballocation_block_size;
     bool fp16;
     bool pipeline_robustness;
     vk::Device device;
@@ -2269,6 +2270,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
 
     device->physical_device.getProperties2(&props2);
     device->properties = props2.properties;
+    device->vendor_id = device->properties.vendorID;
 
     const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");
 
@@ -2280,7 +2282,20 @@ static vk_device ggml_vk_get_device(size_t idx) {
         device->max_memory_allocation_size = props3.maxMemoryAllocationSize;
     }
 
-    device->vendor_id = device->properties.vendorID;
+    const char* GGML_VK_SUBALLOCATION_BLOCK_SIZE = getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE");
+
+    if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) {
+        device->suballocation_block_size = std::stoul(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
+#if defined(_WIN32)
+    } else if (device->vendor_id == VK_VENDOR_ID_NVIDIA) {
+        // Limit batching of allocations to 1GB by default to avoid fragmentation issues
+        device->suballocation_block_size = 1024*1024*1024;
+#endif
+    } else {
+        device->suballocation_block_size = device->max_memory_allocation_size;
+    }
+    device->suballocation_block_size = std::min(device->suballocation_block_size, device->max_memory_allocation_size);
+
     device->subgroup_size = subgroup_props.subgroupSize;
     device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
     if (sm_builtins) {
@@ -7561,7 +7576,7 @@ static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type
 
 static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    return ctx->device->max_memory_allocation_size;
+    return ctx->device->suballocation_block_size;
 }
 
 static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
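For completeness: GGML_VK_SUBALLOCATION_BLOCK_SIZE is parsed with std::stoul, so it takes a plain byte count, and whatever value is chosen is still clamped to the device's maxMemoryAllocationSize. A small standalone sketch of the same parse-and-clamp step (device limits here are illustrative, not queried from Vulkan, and the NVIDIA-on-Windows 1 GiB default from the diff is omitted):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <string>

    int main() {
        // e.g. run as: GGML_VK_SUBALLOCATION_BLOCK_SIZE=536870912 ./a.out   (512 MiB blocks)
        const uint64_t max_memory_allocation_size = 4ull * 1024 * 1024 * 1024; // illustrative device limit
        uint64_t suballocation_block_size = max_memory_allocation_size;        // default: no extra splitting

        const char * env = getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE");
        if (env != nullptr) {
            suballocation_block_size = std::stoul(env); // byte count, same parse as the backend
        }
        // never advertise more than the device can allocate in a single buffer
        suballocation_block_size = std::min(suballocation_block_size, max_memory_allocation_size);

        printf("effective max buffer size: %llu bytes\n", (unsigned long long) suballocation_block_size);
        return 0;
    }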